[latex3-commits] [git/LaTeX3-latex3-latex2e] master: fixes from pr #60 (a9a3b09)
David Carlisle
d.p.carlisle at gmail.com
Fri Sep 28 13:49:09 CEST 2018
Repository : https://github.com/latex3/latex2e
On branch : master
Link : https://github.com/latex3/latex2e/commit/a9a3b09478b06c1fa513bc78284164ffe924b157
>---------------------------------------------------------------
commit a9a3b09478b06c1fa513bc78284164ffe924b157
Author: David Carlisle <d.p.carlisle at gmail.com>
Date: Fri Sep 28 12:49:09 2018 +0100
fixes from pr #60
>---------------------------------------------------------------
a9a3b09478b06c1fa513bc78284164ffe924b157
base/changes.txt | 4 ++
...tlb-ltluatex-001.tlg => github-0060.luatex.tlg} | 0
base/testfiles/github-0060.lvt | 35 +++++++++++++++++
base/testfiles/github-0060.tlg | 40 ++++++++++++++++++++
...{tlb-ltluatex-001.tlg => github-0060.xetex.tlg} | 0
base/utf8ienc.dtx | 23 +++++++++--
6 files changed, 98 insertions(+), 4 deletions(-)
diff --git a/base/changes.txt b/base/changes.txt
index 9c604f4..2173b4b 100644
--- a/base/changes.txt
+++ b/base/changes.txt
@@ -4,6 +4,10 @@ completeness or accuracy and it contains some references to files that
are not part of the distribution.
=======================================================================
+2018-09-28 David Carlisle <David.Carlisle at latex-project.org>
+
+ * utf8ienc.dtx: Fix to handling of 4-octet UTF-8. Mostly from github PR 60
+
2018-09-26 Frank Mittelbach <Frank.Mittelbach at latex-project.org>
* ltmiscen.dtx (subsection{Environments}):
diff --git a/base/testfiles/tlb-ltluatex-001.tlg b/base/testfiles/github-0060.luatex.tlg
similarity index 100%
copy from base/testfiles/tlb-ltluatex-001.tlg
copy to base/testfiles/github-0060.luatex.tlg
diff --git a/base/testfiles/github-0060.lvt b/base/testfiles/github-0060.lvt
new file mode 100644
index 0000000..9a3167d
--- /dev/null
+++ b/base/testfiles/github-0060.lvt
@@ -0,0 +1,35 @@
+\input{test2e}
+% correction for 4-byte UTF8 (GitHub PR 60)
+
+
+\documentclass{article}
+
+\START
+\ifx\Umathchar\undefined\else\expandafter\END\fi
+
+\typeout{^^J===Declare 10FFFF}
+\DeclareUnicodeCharacter{10FFFF}{U+10FFFF\typeout{MAX VALUE!!}}
+\typeout{^^J===Declare 110FFF}
+\DeclareUnicodeCharacter{110FFF}{?\typeout{TOO BIG!!}}
+
+\OMIT
+\begin{document}
+\TIMO
+
+
+\typeout{^^J===10FFFF a}
+[]% U+10FFFF highest valid UTF-8
+
+\typeout{^^J===10FFFF b}
+[^^f4^^8f^^bf^^bf]% U+10FFFF highest valid UTF-8
+
+\typeout{^^J===110FFF}
+[^^f4^^90^^bf^^bf]% trying to be U+110FFF, too high
+
+\typeout{^^J===110FFE}
+[^^f4^^90^^bf^^be]% trying to be U+110FFE, too high, undeclared
+
+\typeout{^^J===bad 2 byte}
+[^^dd^^dd]% mal formed utf8 (second byte not 10....)
+
+\END
diff --git a/base/testfiles/github-0060.tlg b/base/testfiles/github-0060.tlg
new file mode 100644
index 0000000..981f315
--- /dev/null
+++ b/base/testfiles/github-0060.tlg
@@ -0,0 +1,40 @@
+This is a generated file for the LaTeX2e validation system.
+Don't change this file in any respect.
+===Declare 10FFFF
+ defining Unicode char U+10FFFF (decimal 1114111)
+===Declare 110FFF
+ defining Unicode char U+110FFF (decimal 1118207)
+! Package inputenc Error: 110FFF too large for Unicode.
+See the inputenc package documentation for explanation.
+Type H <return> for immediate help.
+ ...
+l. ......odeCharacter{110FFF}{?\typeout{TOO BIG!!}}
+Values between 0 and 10FFFF are permitted
+===10FFFF a
+MAX VALUE!!
+===10FFFF b
+MAX VALUE!!
+===110FFF
+TOO BIG!!
+===110FFE
+! Package inputenc Error: Unicode character ^^f4^^90^^bf^^be (U+110FFE)
+(inputenc) not set up for use with LaTeX.
+See the inputenc package documentation for explanation.
+Type H <return> for immediate help.
+ ...
+l. ...[^^f4^^90^^bf^^be
+ ]% trying to be U+110FFE, too high, undeclared
+You may provide a definition with
+\DeclareUnicodeCharacter
+===bad 2 byte
+! Package inputenc Error: Invalid UTF-8 byte sequence.
+See the inputenc package documentation for explanation.
+Type H <return> for immediate help.
+ ...
+l. ...[^^dd^^dd
+ ]% mal formed utf8 (second byte not 10....)
+The document does not appear to be in UTF-8 encoding.
+Try adding \UseRawInputEncoding as the first line of the file
+or specify an encoding such as \usepackage [latin1]{inputenc}
+in the document preamble.
+Alternatively, save the file in UTF-8 using your editor or another tool
diff --git a/base/testfiles/tlb-ltluatex-001.tlg b/base/testfiles/github-0060.xetex.tlg
similarity index 100%
copy from base/testfiles/tlb-ltluatex-001.tlg
copy to base/testfiles/github-0060.xetex.tlg
diff --git a/base/utf8ienc.dtx b/base/utf8ienc.dtx
index db2d584..39dd704 100644
--- a/base/utf8ienc.dtx
+++ b/base/utf8ienc.dtx
@@ -217,7 +217,7 @@
%<+ts1> \ProvidesFile{ts1enc.dfu}
%<+x2> \ProvidesFile{x2enc.dfu}
%<+all> \ProvidesFile{utf8enc.dfu}
- [2018/07/30 v1.2d UTF-8 support for inputenc]
+ [2018/09/28 v1.2e UTF-8 support for inputenc]
% \end{macrocode}
%
% \begin{macrocode}
@@ -463,9 +463,10 @@
% \end{macrocode}
%
% Setting up 4-byte UTF-8:
+% \changes{v1.2e}{2018/09/28}{Fix "F4 lead byte}%
% \begin{macrocode}
\count@"F0
- \@tempcnta"F4
+ \@tempcnta"F5
\def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\string~}}
\UTFviii at loop
% \end{macrocode}
@@ -473,7 +474,7 @@
% Bytes above F4 are not valid UTF-8 starting bytes as they would encode numbers beyond
% the Unicode range
% \begin{macrocode}
- \count@"F4
+ \count@"F5
\@tempcnta"100
\def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at invalid@err\string~}}
\UTFviii at loop
@@ -620,7 +621,7 @@
% |\parse at XML@charref| work without arguments.
% \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
% \changes{v1.2a}{2018/03/24}{Allow control characters if active}
-% In the case single byte UTF-8 sequences, only allw definition if
+% In the case single byte UTF-8 sequences, only allow definition if
% the character is already active. The definition of |\UTFviii at tmp|
% looks slightly strange but is designed for the sequence of |\expandafter|
% in |\DeclareUnicodeCharacter|.
@@ -650,6 +651,20 @@
\parse at UTFviii@a,%
\parse at UTFviii@b E\UTFviii at three@octets.{,;}%
\else
+% \end{macrocode}
+%
+% Test added here for out of range values, the 4-octet definitions are still set up
+% so that |\DeclareUnicodeCharacter| does something sensible if the user scrolls
+% past this error.
+% \begin{macrocode}
+ \ifnum\count@>"10FFFF\relax
+ \PackageError{inputenc}%
+ {\UTFviii at hexnumber\count@\space too large for Unicode}%
+ {Values between 0 and 10FFFF are permitted}%
+ \fi
+% \end{macrocode}
+%
+% \begin{macrocode}
\parse at UTFviii@a;%
\parse at UTFviii@a,%
\parse at UTFviii@a!%
More information about the latex3-commits
mailing list