[latex3-commits] [l3svn] branch master updated: Add some ideas on input encoding

Fri Nov 13 12:25:56 CET 2015

This is an automated email from the git hooks/post-receive script.

joseph pushed a commit to branch master
in repository l3svn.

The following commit(s) were added to refs/heads/master by this push:
       new  b8adef1   Add some ideas on input encoding
b8adef1 is described below

commit b8adef1f11bb29026701a5ab4e39e8c84b03ea40
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Fri Nov 13 11:22:28 2015 +0000

    Add some ideas on input encoding
    
    The idea here is that in contrast to LaTeX2e we should probably
    start from an assumption of UTF-8 input in all cases. That will
    make loading hyphenation patterns rather easier when we come to
    it. So for the 8-bit (like) engines we can include the appropriate
    conversion code in the format itself.
    
    The upTeX situation is complicated as this might all go horribly
    wrong when \disablecjktoken is not active: need to see some real
    use cases to know.
    
    Whether having a fixed input/font encoding is workable is very much
    an open question but at least for testing this seems a reasonable
    approach (if only to possibly rule things out).
---
 l3kernel/l3final.dtx |  163 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/l3kernel/l3final.dtx b/l3kernel/l3final.dtx
index a28215b..dc0bb87 100644
--- a/l3kernel/l3final.dtx
+++ b/l3kernel/l3final.dtx
@@ -304,6 +304,169 @@
   }
 %    \end{macrocode}
 %
+% \subsection{Input encoding}
+%
+% The letters |a|--|z| and |A|--|Z| will be correct directly from Ini\TeX{}
+% while for Unicode engines (almost) all characters to be treated as letters
+% are defined by the automatic data parsing. Thus the changes here are to 
+% deal with the additional cases.
+%
+% Invalidate the range $0$--$31$ \emph{except} tab (|^^I|), nl (|^^J|),
+% ff (|^^L|), cr (|^^M|), dotless-I (|^^Y|) and dotless-J (|^^Z|).
+%    \begin{macrocode}
+\int_step_inline:nnnn { 0 } { 1 } { `\^^H } % codes 0 up to 8 (^^H)
+  { \char_set_catcode_invalid:n {#1} }
+\char_set_catcode_invalid:n { `\^^K } % sits between nl (^^J) and ff (^^L)
+\int_step_inline:nnnn { `\^^N } { 1 } { `\^^X } % stop before ^^Y and ^^Z
+  { \char_set_catcode_invalid:n {#1} }
+\int_step_inline:nnnn { 27 } { 1 } { 31 } % the rest, after ^^Z (26)
+  { \char_set_catcode_invalid:n {#1} }
+%    \end{macrocode}
+% The same is true for the top of the $7$-bit range.
+%    \begin{macrocode}
+\char_set_catcode_invalid:n { "7F } % code point 127 (DEL)
+%    \end{macrocode}
+%
+% For non-Unicode engines we now need to convert from UTF-$8$ to $8$-bit
+% for pattern reading and document use. The set up here is copied from
+% the file |conv-utf8-ec.tex| maintained by \acro{tug} for hyphenation
+% pattern use. As some of the relevant primitives have been moved, and to
+% allow for self-contained code, that is copied here with minor adjustments.
+% (The primitives have to be available at point of use not just at point of
+% definition so a compatibility layer is hard to arrange here.)
+%    \begin{macrocode}
+\bool_if:nF
+  {
+       \sys_if_engine_luatex_p: % Unicode engine: conversion not needed
+    || \sys_if_engine_xetex_p: % Unicode engine: conversion not needed
+  }
+  {
+%    \end{macrocode}
+% At least for the present, make \upTeX{} behave like \pdfTeX{} so
+% the set up is easier.
+%    \begin{macrocode}
+    \sys_if_engine_uptex:T % only acts when the engine is upTeX
+      { \uptex_disablecjktoken:D }
+%    \end{macrocode}
+% The actual mappings: these are kept low-level for performance reasons.
+%    \begin{macrocode}
+    \cs_new_nopar:cpn { __char_active_C3:N } #1 % #1: byte following ^^c3
+      {
+        \if_meaning:w #1 ^^9f ^^ff \else: % ß - U+00DF - germandbls
+        \if_meaning:w #1 ^^a0 ^^e0 \else: % à - U+00E0 - agrave
+        \if_meaning:w #1 ^^a1 ^^e1 \else: % á - U+00E1 - aacute
+        \if_meaning:w #1 ^^a2 ^^e2 \else: % â - U+00E2 - acircumflex
+        \if_meaning:w #1 ^^a3 ^^e3 \else: % ã - U+00E3 - atilde
+        \if_meaning:w #1 ^^a4 ^^e4 \else: % ä - U+00E4 - adieresis
+        \if_meaning:w #1 ^^a5 ^^e5 \else: % å - U+00E5 - aring
+        \if_meaning:w #1 ^^a6 ^^e6 \else: % æ - U+00E6 - ae
+        \if_meaning:w #1 ^^a7 ^^e7 \else: % ç - U+00E7 - ccedilla
+        \if_meaning:w #1 ^^a8 ^^e8 \else: % è - U+00E8 - egrave
+        \if_meaning:w #1 ^^a9 ^^e9 \else: % é - U+00E9 - eacute
+        \if_meaning:w #1 ^^aa ^^ea \else: % ê - U+00EA - ecircumflex
+        \if_meaning:w #1 ^^ab ^^eb \else: % ë - U+00EB - edieresis
+        \if_meaning:w #1 ^^ac ^^ec \else: % ì - U+00EC - igrave
+        \if_meaning:w #1 ^^ad ^^ed \else: % í - U+00ED - iacute
+        \if_meaning:w #1 ^^ae ^^ee \else: % î - U+00EE - icircumflex
+        \if_meaning:w #1 ^^af ^^ef \else: % ï - U+00EF - idieresis
+        \if_meaning:w #1 ^^b0 ^^f0 \else: % ð - U+00F0 - eth
+        \if_meaning:w #1 ^^b1 ^^f1 \else: % ñ - U+00F1 - ntilde
+        \if_meaning:w #1 ^^b2 ^^f2 \else: % ò - U+00F2 - ograve
+        \if_meaning:w #1 ^^b3 ^^f3 \else: % ó - U+00F3 - oacute
+        \if_meaning:w #1 ^^b4 ^^f4 \else: % ô - U+00F4 - ocircumflex
+        \if_meaning:w #1 ^^b5 ^^f5 \else: % õ - U+00F5 - otilde
+        \if_meaning:w #1 ^^b6 ^^f6 \else: % ö - U+00F6 - odieresis
+        \if_meaning:w #1 ^^b8 ^^f8 \else: % ø - U+00F8 - oslash
+        \if_meaning:w #1 ^^b9 ^^f9 \else: % ù - U+00F9 - ugrave
+        \if_meaning:w #1 ^^ba ^^fa \else: % ú - U+00FA - uacute
+        \if_meaning:w #1 ^^bb ^^fb \else: % û - U+00FB - ucircumflex
+        \if_meaning:w #1 ^^bc ^^fc \else: % ü - U+00FC - udieresis
+        \if_meaning:w #1 ^^bd ^^fd \else: % ý - U+00FD - yacute
+        \if_meaning:w #1 ^^be ^^fe \else: % þ - U+00FE - thorn
+        \if_meaning:w #1 ^^bf ^^b8 \else: % ÿ - U+00FF - ydieresis
+          \__msg_kernel_expandable_error:nn { kernel } { encoding-failure }
+        \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi:
+        \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi:
+      }
+    \cs_new_nopar:cpn { __char_active_C4:N } #1 % #1: byte following ^^c4
+      {
+        \if_meaning:w #1 ^^83 ^^a0 \else: % ă - U+0103 - abreve
+        \if_meaning:w #1 ^^85 ^^a1 \else: % ą - U+0105 - aogonek
+        \if_meaning:w #1 ^^87 ^^a2 \else: % ć - U+0107 - cacute
+        \if_meaning:w #1 ^^8d ^^a3 \else: % č - U+010D - ccaron
+        \if_meaning:w #1 ^^8f ^^a4 \else: % ď - U+010F - dcaron
+        \if_meaning:w #1 ^^91 ^^9e \else: % đ - U+0111 - dcroat
+        \if_meaning:w #1 ^^99 ^^a6 \else: % ę - U+0119 - eogonek
+        \if_meaning:w #1 ^^9b ^^a5 \else: % ě - U+011B - ecaron
+        \if_meaning:w #1 ^^9f ^^a7 \else: % ğ - U+011F - gbreve
+        \if_meaning:w #1 ^^b1 ^^19 \else: % ı - U+0131 - dotlessi
+        \if_meaning:w #1 ^^b3 ^^bc \else: % ĳ - U+0133 - ij
+        \if_meaning:w #1 ^^ba ^^a8 \else: % ĺ - U+013A - lacute
+        \if_meaning:w #1 ^^be ^^a9 \else: % ľ - U+013E - lcaron
+          \__msg_kernel_expandable_error:nn { kernel } { encoding-failure }
+        \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi:
+      }
+    \cs_new_nopar:cpn { __char_active_C5:N } #1 % #1: byte following ^^c5
+      {
+        \if_meaning:w #1 ^^82 ^^aa \else: % ł - U+0142 - lslash
+        \if_meaning:w #1 ^^84 ^^ab \else: % ń - U+0144 - nacute
+        \if_meaning:w #1 ^^88 ^^ac \else: % ň - U+0148 - ncaron
+        \if_meaning:w #1 ^^8b ^^ad \else: % ŋ - U+014B - eng
+        \if_meaning:w #1 ^^91 ^^ae \else: % ő - U+0151 - ohungarumlaut
+        \if_meaning:w #1 ^^93 ^^f7 \else: % œ - U+0153 - oe
+        \if_meaning:w #1 ^^95 ^^af \else: % ŕ - U+0155 - racute
+        \if_meaning:w #1 ^^99 ^^b0 \else: % ř - U+0159 - rcaron
+        \if_meaning:w #1 ^^9b ^^b1 \else: % ś - U+015B - sacute
+        \if_meaning:w #1 ^^9f ^^b3 \else: % ş - U+015F - scedilla
+        \if_meaning:w #1 ^^a1 ^^b2 \else: % š - U+0161 - scaron
+        \if_meaning:w #1 ^^a5 ^^b4 \else: % ť - U+0165 - tcaron
+        \if_meaning:w #1 ^^af ^^b7 \else: % ů - U+016F - uring
+        \if_meaning:w #1 ^^b1 ^^b6 \else: % ű - U+0171 - uhungarumlaut
+        \if_meaning:w #1 ^^ba ^^b9 \else: % ź - U+017A - zacute
+        \if_meaning:w #1 ^^bc ^^bb \else: % ż - U+017C - zdotaccent
+        \if_meaning:w #1 ^^be ^^ba \else: % ž - U+017E - zcaron
+          \__msg_kernel_expandable_error:nn { kernel } { encoding-failure }
+        \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi:
+        \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi:
+      }
+    \cs_new_nopar:cpn { __char_active_C8:N } #1 % #1: byte following ^^c8
+      {
+        \if_meaning:w #1 ^^99 ^^b3 \else: % ș - U+0219 - scommaaccent
+        \if_meaning:w #1 ^^9b ^^b5 \else: % ț - U+021B - tcommaaccent
+        \if_meaning:w #1 ^^b7 ^^1a \else: % ȷ - U+0237 - dotlessj
+          \__msg_kernel_expandable_error:nn { kernel } { encoding-failure }
+        \fi: \fi: \fi:
+      }
+%    \end{macrocode}
+% Install and record the active characters.
+%    \begin{macrocode}
+    \clist_map_inline:nn { C3 , C4 , C5 , C8 } % hex codes of the lead bytes
+      {
+        \char_set_catcode_active:n { "#1 }
+        \char_set_active_eq:nc { "#1 } { __char_active_ #1 :N }
+        \seq_put_right:Nx \l_char_special_seq % store the control symbol
+          { \exp_not:c { \char_generate:nn { "#1 } { 12 } } }
+        \seq_put_right:Nx \l_char_active_seq % catcode 12: only the name matters
+          { \exp_not:c { \char_generate:nn { "#1 } { 12 } } }
+      }
+    \__msg_kernel_new:nnn { kernel } { encoding-failure } % used by the mappers
+      { Unknown~UTF-8~char }
+%    \end{macrocode}
+% All of the chars are lower case so give them the correct \tn{lccode}
+% values.
+%    \begin{macrocode}
+    \clist_map_inline:nn % hex character codes, each listed once
+      {
+        19 , 1A , 9E , A0 , A1 , A2 , A3 , A4 , A5 , A6 , A7 , A8 , A9 ,
+        AA , AB , AC , AD , AE , AF , B0 , B1 , B2 , B3 , B4 , B5 ,
+        B6 , B7 , B8 , B9 , BA , BB , BC , E0 , E1 , E2 , E3 , E4 , E5 ,
+        E6 , E7 , E8 , E9 , EA , EB , EC , ED , EE , EF , F0 , F1 , F2 ,
+        F3 , F4 , F5 , F6 , F7 , F8 , F9 , FA , FB , FC , FD , FE , FF
+      }
+      { \char_set_lccode:nn { "#1 } { "#1 } } % lccode set to the char itself
+  }
+%    \end{macrocode}
+%
 %    \begin{macrocode}
 %</initex>
 %    \end{macrocode}

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.