[latex3-commits] [git/LaTeX3-latex3-latex2e] master: fixes from pr #60 (a9a3b09)

David Carlisle d.p.carlisle at gmail.com
Fri Sep 28 13:49:09 CEST 2018


Repository : https://github.com/latex3/latex2e
On branch  : master
Link       : https://github.com/latex3/latex2e/commit/a9a3b09478b06c1fa513bc78284164ffe924b157

>---------------------------------------------------------------

commit a9a3b09478b06c1fa513bc78284164ffe924b157
Author: David Carlisle <d.p.carlisle at gmail.com>
Date:   Fri Sep 28 12:49:09 2018 +0100

    fixes from pr #60


>---------------------------------------------------------------

a9a3b09478b06c1fa513bc78284164ffe924b157
 base/changes.txt                                   |    4 ++
 ...tlb-ltluatex-001.tlg => github-0060.luatex.tlg} |    0
 base/testfiles/github-0060.lvt                     |   35 +++++++++++++++++
 base/testfiles/github-0060.tlg                     |   40 ++++++++++++++++++++
 ...{tlb-ltluatex-001.tlg => github-0060.xetex.tlg} |    0
 base/utf8ienc.dtx                                  |   23 +++++++++--
 6 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/base/changes.txt b/base/changes.txt
index 9c604f4..2173b4b 100644
--- a/base/changes.txt
+++ b/base/changes.txt
@@ -4,6 +4,10 @@ completeness or accuracy and it contains some references to files that
 are not part of the distribution.
 =======================================================================
 
+2018-09-28  David Carlisle  <David.Carlisle at latex-project.org>
+
+	* utf8ienc.dtx: Fix to handling of 4-octet UTF-8. Mostly from GitHub PR 60
+
 2018-09-26  Frank Mittelbach  <Frank.Mittelbach at latex-project.org>
 
 	* ltmiscen.dtx (subsection{Environments}):
diff --git a/base/testfiles/tlb-ltluatex-001.tlg b/base/testfiles/github-0060.luatex.tlg
similarity index 100%
copy from base/testfiles/tlb-ltluatex-001.tlg
copy to base/testfiles/github-0060.luatex.tlg
diff --git a/base/testfiles/github-0060.lvt b/base/testfiles/github-0060.lvt
new file mode 100644
index 0000000..9a3167d
--- /dev/null
+++ b/base/testfiles/github-0060.lvt
@@ -0,0 +1,35 @@
+\input{test2e}
+% correction for 4-byte UTF8 (GitHub PR 60)
+
+
+\documentclass{article}
+
+\START
+\ifx\Umathchar\undefined\else\expandafter\END\fi
+
+\typeout{^^J===Declare 10FFFF}
+\DeclareUnicodeCharacter{10FFFF}{U+10FFFF\typeout{MAX VALUE!!}}
+\typeout{^^J===Declare 110FFF}
+\DeclareUnicodeCharacter{110FFF}{?\typeout{TOO BIG!!}}
+
+\OMIT
+\begin{document}
+\TIMO
+
+
+\typeout{^^J===10FFFF a}
+[􏿿]% U+10FFFF highest valid UTF-8
+
+\typeout{^^J===10FFFF b}
+[^^f4^^8f^^bf^^bf]% U+10FFFF highest valid UTF-8
+
+\typeout{^^J===110FFF}
+[^^f4^^90^^bf^^bf]% trying to be U+110FFF, too high
+
+\typeout{^^J===110FFE}
+[^^f4^^90^^bf^^be]% trying to be U+110FFE, too high, undeclared
+
+\typeout{^^J===bad 2 byte}
+[^^dd^^dd]% mal formed utf8 (second byte not 10....)
+
+\END
diff --git a/base/testfiles/github-0060.tlg b/base/testfiles/github-0060.tlg
new file mode 100644
index 0000000..981f315
--- /dev/null
+++ b/base/testfiles/github-0060.tlg
@@ -0,0 +1,40 @@
+This is a generated file for the LaTeX2e validation system.
+Don't change this file in any respect.
+===Declare 10FFFF
+   defining Unicode char U+10FFFF (decimal 1114111)
+===Declare 110FFF
+   defining Unicode char U+110FFF (decimal 1118207)
+! Package inputenc Error: 110FFF too large for Unicode.
+See the inputenc package documentation for explanation.
+Type  H <return>  for immediate help.
+ ...                                              
+l. ......odeCharacter{110FFF}{?\typeout{TOO BIG!!}}
+Values between 0 and 10FFFF are permitted
+===10FFFF a
+MAX VALUE!!
+===10FFFF b
+MAX VALUE!!
+===110FFF
+TOO BIG!!
+===110FFE
+! Package inputenc Error: Unicode character ^^f4^^90^^bf^^be (U+110FFE)
+(inputenc)                not set up for use with LaTeX.
+See the inputenc package documentation for explanation.
+Type  H <return>  for immediate help.
+ ...                                              
+l. ...[^^f4^^90^^bf^^be
+                      ]% trying to be U+110FFE, too high, undeclared
+You may provide a definition with
+\DeclareUnicodeCharacter 
+===bad 2 byte
+! Package inputenc Error: Invalid UTF-8 byte sequence.
+See the inputenc package documentation for explanation.
+Type  H <return>  for immediate help.
+ ...                                              
+l. ...[^^dd^^dd
+              ]% mal formed utf8 (second byte not 10....)
+The document does not appear to be in UTF-8 encoding.
+Try adding \UseRawInputEncoding as the first line of the file
+or specify an encoding such as \usepackage [latin1]{inputenc}
+in the document preamble.
+Alternatively, save the file in UTF-8 using your editor or another tool
diff --git a/base/testfiles/tlb-ltluatex-001.tlg b/base/testfiles/github-0060.xetex.tlg
similarity index 100%
copy from base/testfiles/tlb-ltluatex-001.tlg
copy to base/testfiles/github-0060.xetex.tlg
diff --git a/base/utf8ienc.dtx b/base/utf8ienc.dtx
index db2d584..39dd704 100644
--- a/base/utf8ienc.dtx
+++ b/base/utf8ienc.dtx
@@ -217,7 +217,7 @@
 %<+ts1> \ProvidesFile{ts1enc.dfu}
 %<+x2>  \ProvidesFile{x2enc.dfu}
 %<+all> \ProvidesFile{utf8enc.dfu}
-   [2018/07/30 v1.2d UTF-8 support for inputenc]
+   [2018/09/28 v1.2e UTF-8 support for inputenc]
 %    \end{macrocode}
 %
 %    \begin{macrocode}
@@ -463,9 +463,10 @@
 %    \end{macrocode}
 %
 %    Setting up 4-byte UTF-8:
+% \changes{v1.2e}{2018/09/28}{Fix "F4 lead byte}%
 %    \begin{macrocode}
     \count@"F0
-    \@tempcnta"F4
+    \@tempcnta"F5
     \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\string~}}
 \UTFviii at loop
 %    \end{macrocode}
@@ -473,7 +474,7 @@
 % Bytes above F4 are not valid UTF-8 starting bytes as they would encode numbers beyond
 % the Unicode range
 %    \begin{macrocode}
-    \count@"F4
+    \count@"F5
     \@tempcnta"100
     \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at invalid@err\string~}}
 \UTFviii at loop
@@ -620,7 +621,7 @@
 %    |\parse at XML@charref| work without arguments.
 % \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
 % \changes{v1.2a}{2018/03/24}{Allow control characters if active}
-% In the case single byte UTF-8 sequences, only allw definition if
+% In the case of single byte UTF-8 sequences, only allow definition if
 % the character os already active.  The definition of |\UTFviii at tmp|
 % looks slightly strange but is designed for the sequence of |\expandafter|
 % in |\DeclareUnicodeCharacter|.
@@ -650,6 +651,20 @@
      \parse at UTFviii@a,%
      \parse at UTFviii@b E\UTFviii at three@octets.{,;}%
    \else
+%    \end{macrocode}
+%
+% Test added here for out-of-range values; the 4-octet definitions are still set up
+% so that |\DeclareUnicodeCharacter| does something sensible if the user scrolls
+% past this error.
+%    \begin{macrocode}
+     \ifnum\count@>"10FFFF\relax
+           \PackageError{inputenc}%
+                {\UTFviii at hexnumber\count@\space too large for Unicode}%
+                {Values between 0 and 10FFFF are permitted}%
+      \fi
+%    \end{macrocode}
+%
+%    \begin{macrocode}
      \parse at UTFviii@a;%
      \parse at UTFviii@a,%
      \parse at UTFviii@a!%





More information about the latex3-commits mailing list