[latex3-commits] [git/LaTeX3-latex3-latex2e] utf8andspace: integrate utf8 extension for pdftex supporting label/ref, file names and typeouts (2edd933)
Frank Mittelbach
frank.mittelbach at latex-project.org
Tue Apr 9 13:44:47 CEST 2019
Repository : https://github.com/latex3/latex2e
On branch : utf8andspace
Link : https://github.com/latex3/latex2e/commit/2edd9339ed9a710d45b435ba91545240d0cc9528
>---------------------------------------------------------------
commit 2edd9339ed9a710d45b435ba91545240d0cc9528
Author: Frank Mittelbach <frank.mittelbach at latex-project.org>
Date: Tue Apr 9 13:44:47 2019 +0200
integrate utf8 extension for pdftex supporting label/ref, file names and typeouts
>---------------------------------------------------------------
2edd9339ed9a710d45b435ba91545240d0cc9528
base/changes.txt | 7 ++
base/ltfinal.dtx | 21 +++++-
base/utf8andspace.tex | 65 +++++++++++--------
base/utf8ienc.dtx | 171 ++++++++++++++++++++++++++++++++++++++++++-------
4 files changed, 213 insertions(+), 51 deletions(-)
diff --git a/base/changes.txt b/base/changes.txt
index c967669..bd3091d 100644
--- a/base/changes.txt
+++ b/base/changes.txt
@@ -4,6 +4,13 @@ completeness or accuracy and it contains some references to files that
are not part of the distribution.
=======================================================================
+2019-04-09 Frank Mittelbach <Frank.Mittelbach at latex-project.org>
+
+ * utf8ienc.dtx: adjustment for extended UTF-8 support in pdftex,
+ supporting UTF-8 in labels and refs and in filenames and typeouts.
+
+ * ltfinal.dtx: adjustment for extended UTF-8 support in pdftex
+
2019-03-06 Frank Mittelbach <Frank.Mittelbach at latex-project.org>
* nfssfont.dtx (section{The code}): Added a default action
diff --git a/base/ltfinal.dtx b/base/ltfinal.dtx
index 5c9f3d3..7f3d192 100644
--- a/base/ltfinal.dtx
+++ b/base/ltfinal.dtx
@@ -1,6 +1,6 @@
% \iffalse meta-comment
%
-% Copyright 1993-2018
+% Copyright 1993-2019
% The LaTeX3 Project and any individual authors listed elsewhere
% in this file.
%
@@ -33,7 +33,7 @@
%<*driver>
% \fi
\ProvidesFile{ltfinal.dtx}
- [2018/08/24 v2.1f LaTeX Kernel (Final Settings)]
+ [2019/04/09 v2.1g LaTeX Kernel (Final Settings)]
% \iffalse
\documentclass{ltxdoc}
\GetFileInfo{ltfinal.dtx}
@@ -664,7 +664,7 @@
% \end{macro}
%
% \changes{v2.1d}{2018/04/08}{Delay full UTF-8 handling to \cs{everyjob}}
-% \changes{v2.18}{2018/05/11}{Make invalit UTF-8 also safe, for legacy filesystem encodings}
+% \changes{v2.18}{2018/05/11}{Make invalid UTF-8 also safe, for legacy filesystem encodings}
% \begin{macrocode}
\edef\inputencodingname{utf8}%
\input{utf8.def}
@@ -717,7 +717,22 @@
%<latexrelease> {\UTFviii at invalid}{UTF-8 default}%
% \end{macrocode}
%
+% The first block of commands was only introduced in 2019, but we
+% revert all of the Unicode support in one go rather than jumping to
+% the intermediate version.
% \begin{macrocode}
+%<latexrelease> \let\UTFviii at two@octets at combine\@undefined
+%<latexrelease> \let\UTFviii at three@octets at combine\@undefined
+%<latexrelease> \let\UTFviii at four@octets at combine\@undefined
+%<latexrelease> \let\UTFviii at two@octets at string\@undefined
+%<latexrelease> \let\UTFviii at three@octets at string\@undefined
+%<latexrelease> \let\UTFviii at four@octets at string\@undefined
+%<latexrelease> \let\UTFviii at two@octets at noexpand\@undefined
+%<latexrelease> \let\UTFviii at three@octets at noexpand\@undefined
+%<latexrelease> \let\UTFviii at four@octets at noexpand\@undefined
+% \end{macrocode}
+%
+% \begin{macrocode}
%<latexrelease>\@tempcnta=0
%<latexrelease>\loop
%<latexrelease> \catcode\@tempcnta=15
diff --git a/base/utf8andspace.tex b/base/utf8andspace.tex
index 9ed03b4..a53ae62 100644
--- a/base/utf8andspace.tex
+++ b/base/utf8andspace.tex
@@ -36,9 +36,20 @@
%% Thanks!
+% this is just a simpleminded way to disable some of the code below if this file is
+% loaded into the kernel directly:
+\newif\ifskipcode
+\ifnum\the\catcode`\@ = 11
+ \skipcodetrue
+\else
+ \skipcodefalse
+\fi
+
+
\makeatletter
+\ifskipcode % this part is already integrated ...
% utf8
%
@@ -86,7 +97,7 @@
% \long\def\UTFviii at two@octets{%
% \ifincsname
% \expandafter
-% \UTF at twostring@octets
+% \UTF at two@octets at string
% \else
% \ifx\protect\@typeset at protect
% \else
@@ -94,23 +105,23 @@
% \UTF at twoharmless@octets
% \fi
% \fi
-% \UTFviii at two@octets at do
+% \UTFviii at two@octets at combine
% }
%
% \ifincsname is tested first because that can be true even if we are
% otherwise doing typesetting. If this is the case use \string on the
-% whole octet sequence. \UTF at twostring@octets not only does this but
-% also gets rid of \UTFviii at two@octets at do in the input stream by
+% whole octet sequence. \UTF at two@octets at string not only does this but
+% also gets rid of \UTFviii at two@octets at combine in the input stream by
% picking it up as a first argument and dropping it.
%
% If this is not the case and we are doing typesetting (i.e., \protect
-% is \typeset at protect) then execute \UTFviii at two@octets at do which
+% is \typeset at protect) then execute \UTFviii at two@octets at combine which
% picks up all octets and typesets the character (or generates an
% error if it doesn't know how to typeset it).
%
-% If we are not doing typesetting then we run \UTFviii at two@octets at do
-% which is like \UTF at twostring@octets but uses \noexpand instead
+% If we are not doing typesetting then we run \UTFviii at two@octets at combine
+% which is like \UTF at two@octets at string but uses \noexpand instead
% of \string. This way the sequence is temporarily frozen, e.g., would
% display as is or stays put inside a \protected at edef but if the
% result is later reused the starting octet is still active.
@@ -131,6 +142,7 @@
% should of course use a \string version of the octet since there is
% no point do extra work.
+
\begingroup
\catcode`\~13
\catcode`\"12
@@ -178,45 +190,45 @@
\long\def\UTFviii at two@octets{%
\ifincsname
\expandafter
- \UTF at twostring@octets
+ \UTF at two@octets at string
\else
\ifx\protect\@typeset at protect
\else
\expandafter\expandafter\expandafter
- \UTF at twoharmless@octets
+ \UTF at two@octets at noexpand
\fi
\fi
- \UTFviii at two@octets at do
+ \UTFviii at two@octets at combine
}
\long\def\UTFviii at three@octets{%
\ifincsname
\expandafter
- \UTF at threestring@octets
+ \UTF at three@octets at string
\else
\ifx\protect\@typeset at protect
\else
\expandafter\expandafter\expandafter
- \UTF at threeharmless@octets
+ \UTF at three@octets at noexpand
\fi
\fi
- \UTFviii at three@octets at do
+ \UTFviii at three@octets at combine
}
\long\def\UTFviii at four@octets{%
\ifincsname
\expandafter
- \UTF at fourstring@octets
+ \UTF at four@octets at string
\else
\ifx\protect\@typeset at protect
\else
\expandafter\expandafter\expandafter
- \UTF at fourharmless@octets
+ \UTF at four@octets at noexpand
\fi
\fi
- \UTFviii at four@octets at do
+ \UTFviii at four@octets at combine
}
@@ -226,30 +238,30 @@
% is faster than having it figure out that by itself that it is in a
% csname.
-\long\def\UTFviii at two@octets at do#1#2{\expandafter
+\long\def\UTFviii at two@octets at combine#1#2{\expandafter
\UTFviii at defined\csname u8:\string#1\string#2\endcsname}
-\long\def\UTFviii at three@octets at do#1#2#3{\expandafter
+\long\def\UTFviii at three@octets at combine#1#2#3{\expandafter
\UTFviii at defined\csname u8:\string#1\string#2\string#3\endcsname}
-\long\def\UTFviii at four@octets at do#1#2#3#4{\expandafter
+\long\def\UTFviii at four@octets at combine#1#2#3#4{\expandafter
\UTFviii at defined\csname u8:\string#1\string#2\string#3\string#4\endcsname}
% These temporarily prevent the active chars from expanding. (Maybe
% using \unexpanded would be faster here?)
-\long\def\UTF at twoharmless@octets#1#2{\noexpand#2\noexpand}
-\long\def\UTF at threeharmless@octets#1#2#3{\noexpand#2\noexpand#3\noexpand}
-\long\def\UTF at fourharmless@octets#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
+\long\def\UTF at two@octets at noexpand#1#2{\noexpand#2\noexpand}
+\long\def\UTF at three@octets at noexpand#1#2#3{\noexpand#2\noexpand#3\noexpand}
+\long\def\UTF at four@octets at noexpand#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
% And the same with \string for use in \csname constructions.
-\long\def\UTF at twostring@octets#1#2{\string#2\string}
-\long\def\UTF at threestring@octets#1#2#3{\string#2\string#3\string}
-\long\def\UTF at fourstring@octets#1#2#3#4{\string#2\string#3\string#4\string}
+\long\def\UTF at two@octets at string#1#2{\string#2\string}
+\long\def\UTF at three@octets at string#1#2#3{\string#2\string#3\string}
+\long\def\UTF at four@octets at string#1#2#3#4{\string#2\string#3\string#4\string}
% The kernel already has saved away definitions for the starting code so
-% we have to refresh that (until the day this is properly integrated):
+% we have to refresh that (until the day this is properly integrated):
% if used in the kernel we also need this:
\let\UTFviii at two@octets@@\UTFviii at two@octets
@@ -258,6 +270,7 @@
% Done :-)
+\fi % end if ifskipcode
%-------------------------------------------------------------------------
%
diff --git a/base/utf8ienc.dtx b/base/utf8ienc.dtx
index 42dc0d7..3e586dd 100644
--- a/base/utf8ienc.dtx
+++ b/base/utf8ienc.dtx
@@ -47,7 +47,6 @@
%</driver>
% \fi
%
-%
% \newpage
%
% \section{Introduction}
@@ -237,48 +236,175 @@
%
% \subsection{Parsing UTF-8 input}
%
-% \begin{macro}{\UTFviii at two@octets}
-% \begin{macro}{\UTFviii at three@octets}
-% \begin{macro}{\UTFviii at four@octets}
-% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved error messages}%
% A UTF-8 char (that is not actually a 7-bit char, i.e.~a single
% octet) is parsed as follows: each starting octet is an active
% \TeX{} character token; each of these is defined below to be a
% macro with one to three arguments nominally (depending on the
% starting octet). It calls one of |\UTFviii at two@octets|,
% |\UTFviii at three@octets|, or |\UTFviii at four@octets| which then
-% actually picks up the argument(s).
+% actually picks up the remaining octets as the argument(s).
+%
+% \begin{itemize}
+% \item When typesetting we pick up the necessary number of additional
+% octets, check if they form a command that \LaTeX{} knows about
+% (via \cs{csname} \texttt{u8:}\cs{string}
+% \verb=#1=\cs{string} \verb=#2...=\cs{endcsname}) and if so use that
+% for typesetting. \cs{string} is needed as the octets may (all?) be
+% active and we want the literal values in the name.
+%
+% \item If the UTF-8 character is going to be part of a label, then it is
+% essentially becoming part of some csname and with the
+% test \cs{ifincsname} we can find this out. If so, we render the whole
+% sequence of octets harmless by using \cs{string} too when the
+% starting octet executes (\cs{UTF at ...@octets at string}).
+%
+% \item Another possible case is that \cs{protect} has \emph{not} the meaning
+% of \cs{typeset at protect}. In that case we may do a \cs{write} or we may do
+% a \cs{protected at edef} or \ldots{} In all such cases we want to keep the
+% sequence of octets unchanged, but we can't use \cs{string} this time, since at
+% least in the case of \cs{protected at edef} the result may later be
+% typeset after all (in fact that is quite likely) and so at that
+% point the starting octet needs to be an active character again
+% (the others could be stringified). So for this case we use \cs{noexpand}
+% (\cs{UTF at ...@octets at noexpand}).
+% \end{itemize}
+%
+% \begin{macro}{\UTFviii at two@octets}
+% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved error messages}%
+% Putting that all together the code for a start octet of a two
+% byte sequence would then look like this:
+% \begin{macrocode}
+\long\def\UTFviii at two@octets{%
+ \ifincsname
+ \expandafter \UTF at two@octets at string
+ \else
+ \ifx \protect\@typeset at protect \else
+ \expandafter\expandafter\expandafter \UTF at two@octets at noexpand
+ \fi
+ \fi
+ \UTFviii at two@octets at combine
+}
+% \end{macrocode}
+
+% \cs{ifincsname} is tested first because that can be true even if we
+% are otherwise doing typesetting. If this is the case we use
+% \cs{string} on the whole octet
+% sequence. \cs{UTF at two@octets at string} not only does this but also
+% gets rid of the command \cs{UTFviii at two@octets at combine} in the input
+% stream by picking it up as a first argument and dropping it.
+%
+% If this is not the case and we are doing typesetting (i.e.,
+% \cs{protect} is \cs{typeset at protect}), then we execute
+% \cs{UTFviii at two@octets at combine} which picks up all octets and typesets
+% the character (or generates an error if it doesn't know how to
+% typeset it).
+%
+% However, if we are not doing typesetting, then we execute the
+% command \cs{UTF at two@octets at noexpand} which works like
+% \cs{UTF at two@octets at string} but uses \cs{noexpand} instead of
+% \cs{string}. This way the sequence is temporarily rendered harmless,
+% e.g., would display as is or stays put inside a
+% \cs{protected at edef}. But if the result is later reused the
+% starting octet is still active and so will be able to construct
+% the UTF-8 character again.
+% \end{macro}
+%
+%
+% \begin{macro}{\UTFviii at three@octets}
+% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved error messages}%
+% \begin{macro}{\UTFviii at four@octets}
+% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved
+% error messages}
+% The definitions for the other starting octets
+% are the same except that they pick up more octets after them.
+% \begin{macrocode}
+\long\def\UTFviii at three@octets{%
+ \ifincsname
+ \expandafter \UTF at three@octets at string
+ \else
+ \ifx \protect\@typeset at protect \else
+ \expandafter\expandafter\expandafter \UTF at three@octets at noexpand
+ \fi
+ \fi
+ \UTFviii at three@octets at combine
+}
+% \end{macrocode}
+% \begin{macrocode}
+\long\def\UTFviii at four@octets{%
+ \ifincsname
+ \expandafter \UTF at four@octets at string
+ \else
+ \ifx \protect\@typeset at protect \else
+ \expandafter\expandafter\expandafter \UTF at four@octets at noexpand
+ \fi
+ \fi
+ \UTFviii at four@octets at combine
+}
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
%
+% \begin{macro}{\UTF at two@octets at noexpand}
+% \begin{macro}{\UTF at three@octets at noexpand}
+% \begin{macro}{\UTF at four@octets at noexpand}
+% These temporarily prevent the active chars from expanding.
+% \begin{macrocode}
+\long\def\UTF at two@octets at noexpand#1#2{\noexpand#2\noexpand}
+\long\def\UTF at three@octets at noexpand#1#2#3{\noexpand#2\noexpand#3\noexpand}
+\long\def\UTF at four@octets at noexpand#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+
+% \begin{macro}{\UTF at two@octets at string}
+% \begin{macro}{\UTF at three@octets at string}
+% \begin{macro}{\UTF at four@octets at string}
+% And the same with \cs{string} for use in \cs{csname} constructions.
+% \begin{macrocode}
+\long\def\UTF at two@octets at string#1#2{\string#2\string}
+\long\def\UTF at three@octets at string#1#2#3{\string#2\string#3\string}
+\long\def\UTF at four@octets at string#1#2#3#4{\string#2\string#3\string#4\string}
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+
+
+% \begin{macro}{\UTFviii at two@octets at combine}
+% \begin{macro}{\UTFviii at three@octets at combine}
+% \begin{macro}{\UTFviii at four@octets at combine}
% From the arguments a control sequence with a name of the form
% \verb=u8:#1#2...= is constructed where the |#i| ($i>1$) are the
-% arguments and |#1| is the starting octet (as a \TeX{} character
+% arguments and |#1| is the starting octet (as a \TeX{} active character
% token). Since some or even all of these characters are active
-% (when inputenc is loaded) we need to use |\string| when building
-% the csname.
+% we need to use |\string| when building
+% the \cs{csname}.
%
-% The csname thus constructed can of course be undefined but to
+% The \cs{csname} thus constructed can of course be undefined but to
% avoid producing an unhelpful low-level undefined command error we
% pass it to |\UTFviii at defined| which is responsible for producing
% a more sensible error message (not yet done!!). If, however, it is
% defined we simply execute the thing (which should then expand to
% an encoding specific internal \LaTeX{} form).
% \begin{macrocode}
-\long\def\UTFviii at two@octets#1#2{\expandafter
- \UTFviii at defined\csname u8:#1\string#2\endcsname}
+\long\def\UTFviii at two@octets at combine#1#2{\expandafter
+ \UTFviii at defined\csname u8:\string#1\string#2\endcsname}
% \end{macrocode}
-% \end{macro}
%
% \begin{macrocode}
-\long\def\UTFviii at three@octets#1#2#3{\expandafter
- \UTFviii at defined\csname u8:#1\string#2\string#3\endcsname}
+\long\def\UTFviii at three@octets at combine#1#2#3{\expandafter
+ \UTFviii at defined\csname u8:\string#1\string#2\string#3\endcsname}
% \end{macrocode}
-% \end{macro}
%
% \begin{macrocode}
-\long\def\UTFviii at four@octets#1#2#3#4{\expandafter
- \UTFviii at defined\csname u8:#1\string#2\string#3\string#4\endcsname}
+\long\def\UTFviii at four@octets at combine#1#2#3#4{\expandafter
+ \UTFviii at defined\csname u8:\string#1\string#2\string#3\string#4\endcsname}
% \end{macrocode}
% \end{macro}
+% \end{macro}
+% \end{macro}
+%
%
% \begin{macro}{\UTFviii at defined}
% This tests whether its argument is different from |\relax|: it
@@ -447,11 +573,12 @@
\UTFviii at loop
% \end{macrocode}
%
-% Setting up 2-byte UTF-8:
+% Setting up 2-byte UTF-8: The starting byte is passed as an
+% active character so that it can be reprocessed later!
% \begin{macrocode}
\count@"C2
\@tempcnta"E0
- \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at two@octets\string~}}
+ \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at two@octets\noexpand~}}
\UTFviii at loop
% \end{macrocode}
%
@@ -459,7 +586,7 @@
% \begin{macrocode}
\count@"E0
\@tempcnta"F0
- \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at three@octets\string~}}
+ \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at three@octets\noexpand~}}
\UTFviii at loop
% \end{macrocode}
%
@@ -468,7 +595,7 @@
% \begin{macrocode}
\count@"F0
\@tempcnta"F5
- \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\string~}}
+ \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\noexpand~}}
\UTFviii at loop
% \end{macrocode}
%
More information about the latex3-commits
mailing list