[latex3-commits] [git/LaTeX3-latex3-latex2e] utf8andspace: integrate utf8 extension for pdftex supporting label/ref, file names and typeouts (2edd933)

Tue Apr 9 13:44:47 CEST 2019

Repository : https://github.com/latex3/latex2e
On branch  : utf8andspace
Link       : https://github.com/latex3/latex2e/commit/2edd9339ed9a710d45b435ba91545240d0cc9528

>---------------------------------------------------------------

commit 2edd9339ed9a710d45b435ba91545240d0cc9528
Author: Frank Mittelbach <frank.mittelbach at latex-project.org>
Date:   Tue Apr 9 13:44:47 2019 +0200

    integrate utf8 extension for pdftex supporting label/ref, file names and typeouts


>---------------------------------------------------------------

2edd9339ed9a710d45b435ba91545240d0cc9528
 base/changes.txt      |    7 ++
 base/ltfinal.dtx      |   21 +++++-
 base/utf8andspace.tex |   65 +++++++++++--------
 base/utf8ienc.dtx     |  171 ++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 213 insertions(+), 51 deletions(-)

diff --git a/base/changes.txt b/base/changes.txt
index c967669..bd3091d 100644
--- a/base/changes.txt
+++ b/base/changes.txt
@@ -4,6 +4,13 @@ completeness or accuracy and it contains some references to files that
 are not part of the distribution.
 =======================================================================
 
+2019-04-09  Frank Mittelbach  <Frank.Mittelbach at latex-project.org>
+
+	* utf8ienc.dtx: adjustment for extended UTF-8 support in pdftex,
+	supporting UTF-8 in labels and refs and in filenames and typeouts.
+
+	* ltfinal.dtx: adjustment for extended UTF-8 support in pdftex
+
 2019-03-06  Frank Mittelbach  <Frank.Mittelbach at latex-project.org>
 
 	* nfssfont.dtx (section{The code}): Added a default action
diff --git a/base/ltfinal.dtx b/base/ltfinal.dtx
index 5c9f3d3..7f3d192 100644
--- a/base/ltfinal.dtx
+++ b/base/ltfinal.dtx
@@ -1,6 +1,6 @@
 % \iffalse meta-comment
 %
-% Copyright 1993-2018
+% Copyright 1993-2019
 % The LaTeX3 Project and any individual authors listed elsewhere
 % in this file.
 %
@@ -33,7 +33,7 @@
 %<*driver>
 % \fi
 \ProvidesFile{ltfinal.dtx}
-             [2018/08/24 v2.1f LaTeX Kernel (Final Settings)]
+             [2019/04/09 v2.1g LaTeX Kernel (Final Settings)]
 % \iffalse
 \documentclass{ltxdoc}
 \GetFileInfo{ltfinal.dtx}
@@ -664,7 +664,7 @@
 %  \end{macro}
 %
 % \changes{v2.1d}{2018/04/08}{Delay full UTF-8 handling to \cs{everyjob}}
-% \changes{v2.18}{2018/05/11}{Make invalit UTF-8 also safe, for legacy filesystem encodings}
+% \changes{v2.18}{2018/05/11}{Make invalid UTF-8 also safe, for legacy filesystem encodings}
 %    \begin{macrocode}
 \edef\inputencodingname{utf8}%
 \input{utf8.def}
@@ -717,7 +717,22 @@
 %<latexrelease>                 {\UTFviii at invalid}{UTF-8 default}%
 %    \end{macrocode}
 %
+%    The first block of commands was only introduced in 2019, but we
+%    revert all of the Unicode support in one go rather than jumping
+%    to the intermediate version.
 %    \begin{macrocode}
+%<latexrelease>  \let\UTFviii at two@octets at combine\@undefined   % typesetting helpers
+%<latexrelease>  \let\UTFviii at three@octets at combine\@undefined
+%<latexrelease>  \let\UTFviii at four@octets at combine\@undefined
+%<latexrelease>  \let\UTF at two@octets at string\@undefined   % \csname helpers: defined with the \UTF prefix, not \UTFviii
+%<latexrelease>  \let\UTF at three@octets at string\@undefined
+%<latexrelease>  \let\UTF at four@octets at string\@undefined
+%<latexrelease>  \let\UTF at two@octets at noexpand\@undefined   % non-typesetting helpers: also \UTF prefix
+%<latexrelease>  \let\UTF at three@octets at noexpand\@undefined
+%<latexrelease>  \let\UTF at four@octets at noexpand\@undefined
+%    \end{macrocode}
+%
+%    \begin{macrocode}
 %<latexrelease>\@tempcnta=0
 %<latexrelease>\loop
 %<latexrelease>  \catcode\@tempcnta=15
diff --git a/base/utf8andspace.tex b/base/utf8andspace.tex
index 9ed03b4..a53ae62 100644
--- a/base/utf8andspace.tex
+++ b/base/utf8andspace.tex
@@ -36,9 +36,20 @@
 %% Thanks!
 
 
+% This is just a simple-minded way to disable some of the code below if this file is
+% loaded in the kernel directly:
+\newif\ifskipcode            % switch consulted by the \ifskipcode ... \fi wrapper further down
+\ifnum\the\catcode`\@ = 11   % is `@' already a letter (catcode 11) when this file is read?
+  \skipcodetrue              % NOTE(review): the wrapper below RUNS its body when this is true -- confirm intended
+\else
+  \skipcodefalse             % `@' was not a letter on entry
+\fi
+
+
 \makeatletter
 
 
+\ifskipcode  % this part is already integrated ...
 
 % utf8
 %
@@ -86,7 +97,7 @@
 % \long\def\UTFviii at two@octets{%
 %   \ifincsname
 %     \expandafter
-%     \UTF at twostring@octets
+%     \UTF at two@octets at string
 %   \else
 %     \ifx\protect\@typeset at protect
 %     \else
@@ -94,23 +105,23 @@
 %      \UTF at twoharmless@octets
 %     \fi
 %   \fi
-%   \UTFviii at two@octets at do
+%   \UTFviii at two@octets at combine
 % }
 % 
 
 % \ifcsname is tested first because that can be true even if we are
 %  otherwise doing typesetting. If this is the case use \string on the
-%  whole octet sequence. \UTF at twostring@octets not only does this but
-%  also gets rid of \UTFviii at two@octets at do in the input stream by
+%  whole octet sequence. \UTF at two@octets at string not only does this but
+%  also gets rid of \UTFviii at two@octets at combine in the input stream by
 %  picking it up as a first argument and dropping it.
 %
 % If this is not the case and we are doing typesetting (i.e., \protect
-%  is \typeset at protect) then execute \UTFviii at two@octets at do which
+%  is \typeset at protect) then execute \UTFviii at two@octets at combine which
 %  picks up all octets and typesets the character (or generates an
 %  error if it doesn't know how to typeset it).
 %
-% If we are not doing typesetting then we run \UTFviii at two@octets at do
-%  which is like \UTF at twostring@octets but uses \noexpand instead
+% If we are not doing typesetting then we run \UTF at two@octets at noexpand
+%  which is like \UTF at two@octets at string but uses \noexpand instead
 %  of \string. This way the sequence is temporay frozen, eg would
 %  display as is or stays put inside a \protected at edef but if the
 %  result is later reused the starting octet is still active.
@@ -131,6 +142,7 @@
 %  should of course use a \string version of the octet since there is
 %  no point do extra work.
 
+
 \begingroup
 \catcode`\~13
 \catcode`\"12
@@ -178,45 +190,45 @@
 \long\def\UTFviii at two@octets{%
   \ifincsname
     \expandafter
-    \UTF at twostring@octets
+    \UTF at two@octets at string
   \else
     \ifx\protect\@typeset at protect
     \else
       \expandafter\expandafter\expandafter
-     \UTF at twoharmless@octets
+     \UTF at two@octets at noexpand
     \fi
   \fi
-  \UTFviii at two@octets at do
+  \UTFviii at two@octets at combine
 }
 
 
 \long\def\UTFviii at three@octets{%
   \ifincsname
     \expandafter
-    \UTF at threestring@octets
+    \UTF at three@octets at string
   \else
     \ifx\protect\@typeset at protect
     \else
       \expandafter\expandafter\expandafter
-     \UTF at threeharmless@octets
+     \UTF at three@octets at noexpand
     \fi
   \fi
-  \UTFviii at three@octets at do
+  \UTFviii at three@octets at combine
 }
 
 
 \long\def\UTFviii at four@octets{%
   \ifincsname
     \expandafter
-    \UTF at fourstring@octets
+    \UTF at four@octets at string
   \else
     \ifx\protect\@typeset at protect
     \else
       \expandafter\expandafter\expandafter
-     \UTF at fourharmless@octets
+     \UTF at four@octets at noexpand
     \fi
   \fi
-  \UTFviii at four@octets at do
+  \UTFviii at four@octets at combine
 }
 
 
@@ -226,30 +238,30 @@
 %  is faster than having it figure out that by itself that it is in a
 %  csname.
 
-\long\def\UTFviii at two@octets at do#1#2{\expandafter
+\long\def\UTFviii at two@octets at combine#1#2{\expandafter
     \UTFviii at defined\csname u8:\string#1\string#2\endcsname}
-\long\def\UTFviii at three@octets at do#1#2#3{\expandafter
+\long\def\UTFviii at three@octets at combine#1#2#3{\expandafter
     \UTFviii at defined\csname u8:\string#1\string#2\string#3\endcsname}
-\long\def\UTFviii at four@octets at do#1#2#3#4{\expandafter
+\long\def\UTFviii at four@octets at combine#1#2#3#4{\expandafter
     \UTFviii at defined\csname u8:\string#1\string#2\string#3\string#4\endcsname}
 
 
 % These tempoarily prevent the active chars from expanding. (Maybe
 %  using \unexpanded would be faster here?)
 
-\long\def\UTF at twoharmless@octets#1#2{\noexpand#2\noexpand}
-\long\def\UTF at threeharmless@octets#1#2#3{\noexpand#2\noexpand#3\noexpand}
-\long\def\UTF at fourharmless@octets#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
+\long\def\UTF at two@octets at noexpand#1#2{\noexpand#2\noexpand}
+\long\def\UTF at three@octets at noexpand#1#2#3{\noexpand#2\noexpand#3\noexpand}
+\long\def\UTF at four@octets at noexpand#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
 
 % And the same with \string for use in \csname constructions.
 
-\long\def\UTF at twostring@octets#1#2{\string#2\string}
-\long\def\UTF at threestring@octets#1#2#3{\string#2\string#3\string}
-\long\def\UTF at fourstring@octets#1#2#3#4{\string#2\string#3\string#4\string}
+\long\def\UTF at two@octets at string#1#2{\string#2\string}
+\long\def\UTF at three@octets at string#1#2#3{\string#2\string#3\string}
+\long\def\UTF at four@octets at string#1#2#3#4{\string#2\string#3\string#4\string}
 
 
 % The kernel already has saved away definitions for the starting code so
-%  we have to refresh that (until the day this is properly integrated):
+% we have to refresh that (until the day this is properly integrated):
 
 % if used in the kernel we also need this:
 \let\UTFviii at two@octets@@\UTFviii at two@octets
@@ -258,6 +270,7 @@
 
 % Done :-)
 
+\fi % end if ifskipcode
 
 %-------------------------------------------------------------------------
 %
diff --git a/base/utf8ienc.dtx b/base/utf8ienc.dtx
index 42dc0d7..3e586dd 100644
--- a/base/utf8ienc.dtx
+++ b/base/utf8ienc.dtx
@@ -47,7 +47,6 @@
 %</driver>
 % \fi
 %
-%
 % \newpage
 %
 % \section{Introduction}
@@ -237,48 +236,175 @@
 %
 % \subsection{Parsing UTF-8 input}
 %
-% \begin{macro}{\UTFviii at two@octets}
-% \begin{macro}{\UTFviii at three@octets}
-% \begin{macro}{\UTFviii at four@octets}
-% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved error messages}%
 %    A UTF-8 char (that is not actually a 7-bit char, i.e.~a single
 %    octet) is parsed as follows: each starting octet is an active
 %    \TeX{} character token; each of these is defined below to be a
 %    macro with one to three arguments nominally (depending on the
 %    starting octet). It calls one of |\UTFviii at two@octets|,
 %    |\UTFviii at three@octets|, or |\UTFviii at four@octets| which then
-%    actually picks up the argument(s).
+%    actually picks up the remaining octets as the argument(s).
+%
+%    \begin{itemize}
+%  \item When typesetting we pick up the necessary number of additional
+%    octets, check if they form a command that \LaTeX{} knows about
+%    (via \cs{csname} \texttt{u8:}\cs{string}
+%     \verb=#1=\cs{string} \verb=#2...=\cs{endcsname}) and if so use that
+%    for typesetting.  \cs{string} is needed as the octets may (all?) be
+%    active and we want the literal values in the name.
+%
+%  \item If the UTF-8 character is going to be part of a label, then it is
+%    essentially becoming part of some csname and with the
+%    test \cs{ifincsname} we can find this out. If so, we render the whole
+%    sequence of octets harmless by using \cs{string} too when the
+%    starting octet executes (\cs{UTF at ...@octets at string}).
+%
+%  \item Another possible case is that \cs{protect} does \emph{not} have the meaning
+%    of \cs{@typeset at protect}. In that case we may do a \cs{write} or we may do
+%    a \cs{protected at edef} or \ldots{}  In all such cases we want to keep the
+%    sequence of octets unchanged, but we can't use \cs{string} this time, since at
+%    least in the case of \cs{protected at edef} the result may later be
+%    typeset after all (in fact that is quite likely) and so at that
+%    point the starting octet needs to be an active character again
+%    (the others could be stringified). So for this case we use \cs{noexpand}
+%    (\cs{UTF at ...@octets at noexpand}).
+%  \end{itemize}
+%
+% \begin{macro}{\UTFviii at two@octets}
+% \changes{v1.2a}{2018/03/24}{Macros made \cs{long} for improved error messages}%
+%    Putting that all together the code for a start octet of a two
+%    byte sequence would then look like this:
+%    \begin{macrocode}
+\long\def\UTFviii at two@octets{%
+  \ifincsname
+    \expandafter \UTF at two@octets at string
+  \else
+    \ifx \protect\@typeset at protect \else
+      \expandafter\expandafter\expandafter \UTF at two@octets at noexpand
+    \fi
+  \fi
+  \UTFviii at two@octets at combine
+}
+%    \end{macrocode}
+
+%    \cs{ifincsname} is tested first because that can be true even if we
+%    are otherwise doing typesetting. If this is the case we use
+%    \cs{string} on the whole octet
+%    sequence. \cs{UTF at two@octets at string} not only does this but also
+%    gets rid of the command \cs{UTFviii at two@octets at combine} in the input
+%    stream by picking it up as a first argument and dropping it.
+%
+%     If this is not the case and we are doing typesetting (i.e.,
+%    \cs{protect} is \cs{@typeset at protect}), then we execute
+%    \cs{UTFviii at two@octets at combine} which picks up all octets and typesets
+%    the character (or generates an error if it doesn't know how to
+%    typeset it).
+%
+%    However, if we are not doing typesetting, then we execute the
+%    command \cs{UTF at two@octets at noexpand} which works like
+%    \cs{UTF at two@octets at string} but uses \cs{noexpand} instead of
+%    \cs{string}. This way the sequence is temporarily rendered harmless,
+%    e.g., it would display as is or stay put inside a
+%    \cs{protected at edef}. But if the result is later reused the
+%    starting octet is still active and so will be able to construct
+%    the UTF-8 character again.
+% \end{macro}
+%
+%
+% \begin{macro}{\UTFviii at three@octets}
+% \changes{v1.2a}{2018/03/24}{Macros made \cs{long} for improved error messages}%
+% \begin{macro}{\UTFviii at four@octets}
+% \changes{v1.2a}{2018/03/24}{Macros made \cs{long} for improved
+%                             error messages}
+%    The definitions for the other starting octets
+%    are the same except that they pick up more octets after them.
+%    \begin{macrocode}
+\long\def\UTFviii at three@octets{%
+  \ifincsname
+    \expandafter \UTF at three@octets at string
+  \else
+    \ifx \protect\@typeset at protect \else
+      \expandafter\expandafter\expandafter \UTF at three@octets at noexpand
+    \fi
+  \fi
+  \UTFviii at three@octets at combine
+}
+%    \end{macrocode}
+%    \begin{macrocode}
+\long\def\UTFviii at four@octets{%
+  \ifincsname
+    \expandafter \UTF at four@octets at string
+  \else
+    \ifx \protect\@typeset at protect \else
+      \expandafter\expandafter\expandafter \UTF at four@octets at noexpand
+    \fi
+  \fi
+  \UTFviii at four@octets at combine
+}
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
 %
+% \begin{macro}{\UTF at two@octets at noexpand}
+% \begin{macro}{\UTF at three@octets at noexpand}
+% \begin{macro}{\UTF at four@octets at noexpand}
+%    These temporarily prevent the active chars from expanding.
+%    \begin{macrocode}
+\long\def\UTF at two@octets at noexpand#1#2{\noexpand#2\noexpand}
+\long\def\UTF at three@octets at noexpand#1#2#3{\noexpand#2\noexpand#3\noexpand}
+\long\def\UTF at four@octets at noexpand#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+
+% \begin{macro}{\UTF at two@octets at string}
+% \begin{macro}{\UTF at three@octets at string}
+% \begin{macro}{\UTF at four@octets at string}
+%    And the same with \cs{string} for use in \cs{csname} constructions.
+%    \begin{macrocode}
+\long\def\UTF at two@octets at string#1#2{\string#2\string}
+\long\def\UTF at three@octets at string#1#2#3{\string#2\string#3\string}
+\long\def\UTF at four@octets at string#1#2#3#4{\string#2\string#3\string#4\string}
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+
+
+% \begin{macro}{\UTFviii at two@octets at combine}
+% \begin{macro}{\UTFviii at three@octets at combine}
+% \begin{macro}{\UTFviii at four@octets at combine}
 %    From the arguments a control sequence with a name of the form
 %    \verb=u8:#1#2...= is constructed where the |#i| ($i>1$) are the
-%    arguments and |#1| is the starting octet (as a \TeX{} character
+%    arguments and |#1| is the starting octet (as a \TeX{} active character
 %    token).  Since some or even all of these characters are active
-%    (when inputenc is loaded) we need to use |\string| when building
-%    the csname.
+%    we need to use |\string| when building
+%    the \cs{csname}.
 %
-%    The csname thus constructed can of course be undefined but to
+%    The \cs{csname} thus constructed can of course be undefined but to
 %    avoid producing an unhelpful low-level undefined command error we
 %    pass it to |\UTFviii at defined| which is responsible for producing
 %    a more sensible error message (not yet done!!).  If, however, it is
 %    defined we simply execute the thing (which should then expand to
 %    an encoding specific internal \LaTeX{} form).
 %    \begin{macrocode}
-\long\def\UTFviii at two@octets#1#2{\expandafter
-    \UTFviii at defined\csname u8:#1\string#2\endcsname}
+\long\def\UTFviii at two@octets at combine#1#2{\expandafter
+    \UTFviii at defined\csname u8:\string#1\string#2\endcsname}
 %    \end{macrocode}
-% \end{macro}
 %
 %    \begin{macrocode}
-\long\def\UTFviii at three@octets#1#2#3{\expandafter
-    \UTFviii at defined\csname u8:#1\string#2\string#3\endcsname}
+\long\def\UTFviii at three@octets at combine#1#2#3{\expandafter
+    \UTFviii at defined\csname u8:\string#1\string#2\string#3\endcsname}
 %    \end{macrocode}
-% \end{macro}
 %
 %    \begin{macrocode}
-\long\def\UTFviii at four@octets#1#2#3#4{\expandafter
-    \UTFviii at defined\csname u8:#1\string#2\string#3\string#4\endcsname}
+\long\def\UTFviii at four@octets at combine#1#2#3#4{\expandafter
+    \UTFviii at defined\csname u8:\string#1\string#2\string#3\string#4\endcsname}
 %    \end{macrocode}
 % \end{macro}
+% \end{macro}
+% \end{macro}
+%
 %
 % \begin{macro}{\UTFviii at defined}
 %    This tests whether its argument is different from |\relax|: it
@@ -447,11 +573,12 @@
 \UTFviii at loop
 %    \end{macrocode}
 %
-%    Setting up 2-byte UTF-8:
+%    Setting up 2-byte UTF-8: The starting byte is passed as an
+%    active character so that it can be reprocessed later!
 %    \begin{macrocode}
     \count@"C2
     \@tempcnta"E0
-    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at two@octets\string~}}
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at two@octets\noexpand~}}
 \UTFviii at loop
 %    \end{macrocode}
 %
@@ -459,7 +586,7 @@
 %    \begin{macrocode}
     \count@"E0
     \@tempcnta"F0
-    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at three@octets\string~}}
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at three@octets\noexpand~}}
 \UTFviii at loop
 %    \end{macrocode}
 %
@@ -468,7 +595,7 @@
 %    \begin{macrocode}
     \count@"F0
     \@tempcnta"F5
-    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\string~}}
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\noexpand~}}
 \UTFviii at loop
 %    \end{macrocode}
 %