texlive[63556] Build/source/texk/ptexenc: ptexenc: support guessing input file encodings

commits+takuji at tug.org commits+takuji at tug.org
Sun Jun 12 10:18:09 CEST 2022


Revision: 63556
          http://tug.org/svn/texlive?view=revision&revision=63556
Author:   takuji
Date:     2022-06-12 10:18:08 +0200 (Sun, 12 Jun 2022)
Log Message:
-----------
ptexenc: support guessing input file encodings

Modified Paths:
--------------
    trunk/Build/source/texk/ptexenc/ChangeLog
    trunk/Build/source/texk/ptexenc/c-auto.in
    trunk/Build/source/texk/ptexenc/configure
    trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h
    trunk/Build/source/texk/ptexenc/ptexenc/unicode.h
    trunk/Build/source/texk/ptexenc/ptexenc.c
    trunk/Build/source/texk/ptexenc/unicode.c
    trunk/Build/source/texk/ptexenc/version.ac

Modified: trunk/Build/source/texk/ptexenc/ChangeLog
===================================================================
--- trunk/Build/source/texk/ptexenc/ChangeLog	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ChangeLog	2022-06-12 08:18:08 UTC (rev 63556)
@@ -1,3 +1,11 @@
+2022-06-12  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
+
+	* ptexenc.c, unicode.c, ptexenc/ptexenc.h, ptexenc/unicode.h:
+	Add new functions ptenc_guess_enc() and set_guess_file_enc()
+	to support guessing input file encodings.
+	https://github.com/texjporg/tex-jp-build/issues/142
+	* version.ac: Bump to 1.4.2/dev.
+
 2022-03-21  Karl Berry  <karl at tug.org>
 
 	* TL'22 release.

Modified: trunk/Build/source/texk/ptexenc/c-auto.in
===================================================================
--- trunk/Build/source/texk/ptexenc/c-auto.in	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/c-auto.in	2022-06-12 08:18:08 UTC (rev 63556)
@@ -6,7 +6,7 @@
 #define PTEXENC_C_AUTO_H
 
 /* ptexenc: the version string. */
-#define PTEXENCVERSION "ptexenc version 1.4.1/dev"
+#define PTEXENCVERSION "ptexenc version 1.4.2/dev"
 
 /* Define to 1 if the `closedir' function returns void instead of int. */
 #undef CLOSEDIR_VOID

Modified: trunk/Build/source/texk/ptexenc/configure
===================================================================
--- trunk/Build/source/texk/ptexenc/configure	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/configure	2022-06-12 08:18:08 UTC (rev 63556)
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for ptexenc 1.4.1/dev.
+# Generated by GNU Autoconf 2.71 for ptexenc 1.4.2/dev.
 #
 # Report bugs to <tex-k at tug.org>.
 #
@@ -629,8 +629,8 @@
 # Identity of this package.
 PACKAGE_NAME='ptexenc'
 PACKAGE_TARNAME='ptexenc'
-PACKAGE_VERSION='1.4.1/dev'
-PACKAGE_STRING='ptexenc 1.4.1/dev'
+PACKAGE_VERSION='1.4.2/dev'
+PACKAGE_STRING='ptexenc 1.4.2/dev'
 PACKAGE_BUGREPORT='tex-k at tug.org'
 PACKAGE_URL=''
 
@@ -1375,7 +1375,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ptexenc 1.4.1/dev to adapt to many kinds of systems.
+\`configure' configures ptexenc 1.4.2/dev to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1446,7 +1446,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ptexenc 1.4.1/dev:";;
+     short | recursive ) echo "Configuration of ptexenc 1.4.2/dev:";;
    esac
   cat <<\_ACEOF
 
@@ -1566,7 +1566,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ptexenc configure 1.4.1/dev
+ptexenc configure 1.4.2/dev
 generated by GNU Autoconf 2.71
 
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -2090,7 +2090,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ptexenc $as_me 1.4.1/dev, which was
+It was created by ptexenc $as_me 1.4.2/dev, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   $ $0$ac_configure_args_raw
@@ -2850,10 +2850,10 @@
 
 
 
-PTEXENCVERSION=1.4.1/dev
+PTEXENCVERSION=1.4.2/dev
 
 
-PTEXENC_LT_VERSINFO=5:1:4
+PTEXENC_LT_VERSINFO=5:2:4
 
 
 am__api_version='1.16'
@@ -8418,7 +8418,7 @@
 
 # Define the identity of the package.
  PACKAGE='ptexenc'
- VERSION='1.4.1/dev'
+ VERSION='1.4.2/dev'
 
 
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -14528,7 +14528,7 @@
 Report bugs to <bug-libtool at gnu.org>."
 
 lt_cl_version="\
-ptexenc config.lt 1.4.1/dev
+ptexenc config.lt 1.4.2/dev
 configured by $0, generated by GNU Autoconf 2.71.
 
 Copyright (C) 2011 Free Software Foundation, Inc.
@@ -16255,7 +16255,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ptexenc $as_me 1.4.1/dev, which was
+This file was extended by ptexenc $as_me 1.4.2/dev, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -16323,7 +16323,7 @@
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-ptexenc config.status 1.4.1/dev
+ptexenc config.status 1.4.2/dev
 configured by $0, generated by GNU Autoconf 2.71,
   with options \\"\$ac_cs_config\\"
 

Modified: trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h
===================================================================
--- trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h	2022-06-12 08:18:08 UTC (rev 63556)
@@ -5,6 +5,9 @@
 #ifndef PTEXENC_PTEXENC_H
 #define PTEXENC_PTEXENC_H
 
+#ifdef WIN32
+#include <ptexenc/c-auto.h>
+#endif
 #include <kpathsea/types.h>
 
 #if defined (WIN32) && !defined (__MINGW32__) && !defined (NO_PTENC_DLL)
@@ -22,9 +25,10 @@
 #endif
 
 extern PTENCDLL const char *ptexenc_version_string;
+extern PTENCDLL int infile_enc_auto;
 #if defined(WIN32)
 extern PTENCDLL FILE *Poptr;
-extern PTENCDLL int infile_enc_auto;
+extern PTENCDLL char * ptexenc_nkf(FILE *f);
 #endif
 
 #define KANJI_OPTS "{jis|euc|sjis|utf8}"
@@ -32,6 +36,7 @@
 /* enable/disable UPTEX */
 extern PTENCDLL void enable_UPTEX (boolean enable);
 extern PTENCDLL void set_prior_file_enc(void);
+extern PTENCDLL void set_guess_file_enc(boolean enable);
 extern PTENCDLL void ptenc_ptex_mode(const boolean enable);
 
 /* get/set Kanji encoding by string */

Modified: trunk/Build/source/texk/ptexenc/ptexenc/unicode.h
===================================================================
--- trunk/Build/source/texk/ptexenc/ptexenc/unicode.h	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ptexenc/unicode.h	2022-06-12 08:18:08 UTC (rev 63556)
@@ -26,8 +26,13 @@
 #define BYTE2(x) (((x)>>16) & 0xff)
 #define BYTE3(x) (((x)>> 8) & 0xff)
 #define BYTE4(x) ( (x)      & 0xff)
- 
 
+/* used only for debug */
+#ifdef DEBUG
+extern int UCStoUTF8S(long ucs, unsigned char *s);
+#endif
+
+
 /* UTF-8 -> UCS */
 #define UTF8BtoUCS(a,b)   ((((a)&0x1f) << 6) | ((b)&0x3f))
 #define UTF8CtoUCS(a,b,c) ((((a)&0x0f) <<12) | (((b)&0x3f) << 6) | ((c)&0x3f))

Modified: trunk/Build/source/texk/ptexenc/ptexenc.c
===================================================================
--- trunk/Build/source/texk/ptexenc/ptexenc.c	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ptexenc.c	2022-06-12 08:18:08 UTC (rev 63556)
@@ -41,10 +41,8 @@
 const char *ptexenc_version_string = PTEXENCVERSION;
 #if defined(WIN32)
 FILE *Poptr;
-int infile_enc_auto;
-#else
-static int infile_enc_auto = 1;
 #endif
+int infile_enc_auto = 0;
 
 static int     file_enc = ENC_UNKNOWN;
 static int internal_enc = ENC_UNKNOWN;
@@ -72,6 +70,8 @@
     if (strcasecmp(str, "utf8")   == 0) return ENC_UTF8;
     if (UPTEX_enabled && strcasecmp(str, "uptex")  == 0) return ENC_UPTEX;
 
+    if (strcasecmp(str, "ASCII")== 0)        return file_enc;
+    if (strncasecmp(str, "AMBIGUOUS", 9) == 0) return file_enc;
     if (strcasecmp(str, "BINARY") == 0)      return ENC_JIS;
     if (strcasecmp(str, "ISO-2022-JP") == 0) return ENC_JIS;
     if (strcasecmp(str, "EUC-JP") == 0)      return ENC_EUC;
@@ -146,6 +146,11 @@
     return terminal_enc;
 }
 
+void set_guess_file_enc(boolean enable) /* switch guessing of input file encodings on or off */
+{
+    infile_enc_auto = (int) enable; /* flag is tested by input_line2() when a file is read for the first time */
+}
+
 /* enable ptex mode (use flag 0x100 for Japanese char) */
 void ptenc_ptex_mode (const boolean enable)
 {
@@ -199,7 +204,6 @@
     if (file     != ENC_UNKNOWN) {
         set_file_enc(file);
 #if !defined(WIN32)
-        infile_enc_auto = 0;
         nkf_disable();
 #endif
     }
@@ -753,7 +757,7 @@
         buffer[last++] = i;
         ungetc4(j, fp);
     }
-}        
+}
 
 static void get_sjis(int i, FILE *fp)
 {
@@ -765,7 +769,7 @@
         buffer[last++] = i;
         ungetc4(j, fp);
     }
-}        
+}
 
 static boolean is_tail(long *c, FILE *fp)
 {
@@ -787,7 +791,7 @@
     int i;
     int c[MARK_LEN];
     int bom_u[MARK_LEN] = { 0xEF, 0xBB, 0xBF, 0x7E };
-    int bom_l[MARK_LEN] = { 0xEF, 0xBB, 0xBF, 0 };
+    int bom_l[MARK_LEN] = { 0xEF, 0xBB, 0xBF, 0x01 };
 
     for (i=0; i<MARK_LEN; i++) {
         c[i] = getc4(fp);
@@ -803,6 +807,177 @@
 static int infile_enc[NOFILE]; /* ENC_UNKNOWN (=0): not determined
                                   other: determined */
 
+/* guess file encoding: scan fp, return an xmalloc'ed encoding name, rewind fp */
+/*
+    assumption:
+      No halfwidth katakana in Shift_JIS
+      No SS2 nor SS3 in EUC-JP
+      JIS X 0208 only and no platform dependent characters in Shift_JIS, EUC-JP
+*/
+char *ptenc_guess_enc(FILE *fp)
+{
+    char *enc;
+    int k0, k1, k2, cdb[2], cu8[4], len_utf8;
+    int is_ascii=1, lbyte=0;
+    int maybe_sjis=1, maybe_euc=1, maybe_utf8=1, pos_db=0, pos_utf8=0;
+#ifdef DEBUG
+    int i;
+    unsigned char str0[5];
+#endif /* DEBUG */
+    enc = xmalloc(sizeof(char)*18); /* longest result "AMBIGUOUS(s,e,u)" needs 16 chars + NUL */
+
+    while ((k0 = fgetc(fp)) != EOF && maybe_sjis+maybe_euc+maybe_utf8>1) { /* stop once at most one candidate is left */
+        lbyte++;
+        if (k0==ESC) { /* possible start of an ISO-2022-JP escape sequence */
+            k0 = fgetc(fp);
+            if (k0=='$') {
+                k0 = fgetc(fp);
+                if (k0=='@' || k0=='B') { /* ESC $ @ or ESC $ B seen */
+                    strcpy(enc,"ISO-2022-JP");
+                    goto post_process;
+                }
+            }
+            if (k0>0x7F) { /* 8-bit byte right after ESC (or ESC $) */
+                strcpy(enc,"BINARY");
+                goto post_process;
+            } else if (k0==EOF) {
+                break;
+            }
+            continue;
+        } else if (k0==0x00) { /* NUL byte: give up and report binary data */
+            strcpy(enc,"BINARY");
+            goto post_process;
+        } else if (k0<0x80) { /* 7-bit byte */
+            if (pos_utf8>0) { /* 7-bit byte in the middle of a UTF-8 multibyte sequence */
+                maybe_utf8 = 0;
+                pos_utf8 = 0;
+            }
+            if (pos_db==1) { /* 7-bit byte as second byte of a double-byte char: rules out EUC */
+                maybe_euc = 0;
+                pos_db = 0;
+                if (maybe_sjis) {
+                    cdb[1] = k0;
+                    k1 = JIStoUCS2(SJIStoJIS(HILO(cdb[0],cdb[1])));
+                    if (k1) { /* nonzero result is taken as a valid Shift_JIS character */
+#ifdef DEBUG
+                        i = UCStoUTF8S(k1, str0);
+                        str0[i] = '\0';
+                        fprintf(stderr, "Character for guess encoding: 0x%02X%02X", cdb[0], cdb[1]);
+                        fprintf(stderr, " sjis (%s)\n", str0);
+#endif /* DEBUG */
+                        continue;
+                    }
+                }
+                maybe_sjis = 0;
+            }
+            if (is_ascii && lbyte>10000) {
+                /* guess ASCII if we did not find 8bit chars in head 10000 bytes */
+                strcpy(enc,"ASCII");
+                goto post_process;
+            }
+            continue;
+        }
+        is_ascii = 0; /* from here on k0 >= 0x80 */
+        if (pos_db==0) { /* remember k0 as first byte of a double-byte candidate */
+            cdb[0] = k0;
+            cdb[1] = 0;
+            pos_db = 1;
+        }
+        else if (pos_db==1 && (maybe_sjis || maybe_euc)) { /* second byte: test the pair as Shift_JIS and as EUC */
+            cdb[1] = k0;
+            k1 = JIStoUCS2(SJIStoJIS(HILO(cdb[0],cdb[1])));
+            k2 = JIStoUCS2(EUCtoJIS(HILO(cdb[0],cdb[1])));
+            if (maybe_sjis) {
+                if (!k1) /* zero result: pair is rejected as Shift_JIS */
+                    maybe_sjis = 0;
+            }
+            if (maybe_euc) {
+                if (!k2) /* zero result: pair is rejected as EUC */
+                    maybe_euc = 0;
+            }
+            pos_db = 0;
+#ifdef DEBUG
+            if (maybe_sjis || maybe_euc) {
+                fprintf(stderr, "Character for guess encoding: 0x%02X%02X", cdb[0], cdb[1]);
+                if (maybe_sjis) {
+                    i = UCStoUTF8S(k1, str0);
+                    str0[i] = '\0';
+                    fprintf(stderr, " sjis (%s)", str0);
+                }
+                if (maybe_euc) {
+                    i = UCStoUTF8S(k2, str0);
+                    str0[i] = '\0';
+                    fprintf(stderr, " euc (%s)", str0);
+                }
+                fprintf(stderr, "\n");
+            }
+#endif /* DEBUG */
+        }
+        if (pos_utf8==0) { /* k0 has to be the lead byte of a UTF-8 sequence */
+            len_utf8 = UTF8length(k0);
+            if (len_utf8<2) { /* presumably not a valid multibyte lead byte -- depends on UTF8length() */
+                maybe_utf8 = 0;
+                pos_utf8 = 0;
+                continue;
+            }
+            cu8[0] = k0;
+            pos_utf8 = 1;
+        }
+        else if (pos_utf8>0 && maybe_utf8) {
+            if (k0>0xBF) { /* k0 is 0x80 or above here, so this is not a continuation byte */
+                maybe_utf8 = 0;
+                pos_utf8 = 0;
+                continue;
+            }
+            cu8[pos_utf8] = k0;
+            pos_utf8++;
+            if (pos_utf8==len_utf8) { /* sequence complete: check lead/second byte combination */
+                if ((cu8[0]==0xE0 && cu8[1]<0xA0) ||
+                    (cu8[0]==0xED && cu8[1]>0x9F) ||
+                    (cu8[0]==0xF0 && cu8[1]<0x90)) { /* illegal combination in UTF-8 */
+                    maybe_utf8 = 0;
+                    pos_utf8 = 0;
+                    continue;
+                }
+#ifdef DEBUG
+                for (i=0; i<len_utf8; i++) str0[i] = cu8[i];
+                str0[i] = '\0';
+                fprintf(stderr, "Character for guess encoding: 0x");
+                for (i=0; i<len_utf8; i++) fprintf(stderr, "%02X", cu8[i]);
+                fprintf(stderr, " U+%06lX (%s)\n", UTF8StoUCS(str0), str0);
+#endif /* DEBUG */
+                len_utf8 = 0;
+                pos_utf8 = 0;
+                cu8[0]=cu8[1]=cu8[2]=cu8[3]=0;
+            }
+        }
+    }
+
+    if (is_ascii) /* no byte >= 0x80 was seen */
+        strcpy(enc,"ASCII");
+    else if (maybe_sjis+maybe_euc+maybe_utf8>1) { /* EOF reached with several candidates left */
+        strcpy(enc,"AMBIGUOUS("); /* append s, e, u for the remaining Shift_JIS, EUC-JP, UTF-8 candidates */
+        if (maybe_sjis)
+            enc = strcat(enc,"s");
+        if (maybe_euc)
+            enc = strcat(enc, maybe_sjis ? ",e" : "e");
+        if (maybe_utf8)
+            enc = strcat(enc, ",u"); /* at least one of s/e precedes here, so the comma is always needed */
+        enc = strcat(enc,")");
+    }
+    else if (maybe_sjis)
+        strcpy(enc,"Shift_JIS");
+    else if (maybe_euc)
+        strcpy(enc,"EUC-JP");
+    else if (maybe_utf8)
+        strcpy(enc,"UTF-8");
+    else /* every candidate was ruled out */
+        strcpy(enc,"BINARY");
+  post_process:
+    rewind (fp); /* put the read position back to the start of the file */
+    return enc; /* allocated with xmalloc above; input_line2() frees it */
+}
+
 /* input line with encoding conversion */
 long input_line2(FILE *fp, unsigned char *buff, unsigned char *buff2,
                  long pos, const long buffsize, int *lastchar)
@@ -811,13 +986,35 @@
     static boolean injis = false;
     const int fd = fileno(fp);
 
+    buffer = buff;
+    first = last = pos;
+
     if (infile_enc[fd] == ENC_UNKNOWN) { /* just after opened */
         ungetbuff[fd].size = 0;
-        if (isUTF8Nstream(fp)) infile_enc[fd] = ENC_UTF8;
-        else                   infile_enc[fd] = get_file_enc();
+        if (isUTF8Nstream(fp)) {
+            infile_enc[fd] = ENC_UTF8;
+#ifdef DEBUG
+            fprintf(stderr, "Detect UTF-8 with BOM #%d\n", fd);
+#endif /* DEBUG */
+        }
+        else if (infile_enc_auto && fd != fileno(stdin)) {
+            char *enc;
+            getc4(fp);
+            getc4(fp);
+            getc4(fp);
+            getc4(fp);
+            rewind(fp);
+            enc = ptenc_guess_enc(fp);
+            if (string_to_enc(enc) > 0) {
+                infile_enc[fd] = string_to_enc(enc);
+                fprintf(stderr, "(guessed encoding #%d: %s = %s)", fd, enc, enc_to_string(infile_enc[fd]));
+            } else {
+                infile_enc[fd] = get_file_enc();
+            }
+            if (enc) free(enc);
+        }
+        else infile_enc[fd] = get_file_enc();
     }
-    buffer = buff;
-    first = last = pos;
 
     while (last < buffsize-30 && (i=getc4(fp)) != EOF && i!='\n' && i!='\r') {
         /* 30 is enough large size for one char */

Modified: trunk/Build/source/texk/ptexenc/unicode.c
===================================================================
--- trunk/Build/source/texk/ptexenc/unicode.c	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/unicode.c	2022-06-12 08:18:08 UTC (rev 63556)
@@ -72,7 +72,8 @@
     }
 }
 
-#if 0 /* not used */
+/* used only for debug */
+#ifdef DEBUG
 int UCStoUTF8S(long ucs, unsigned char *s)
 {
     if (ucs < 0x80) {
@@ -96,7 +97,7 @@
     }
     return 0; /* unsupported */
 }
-#endif /* 0 */
+#endif /* DEBUG */
 
 
 long UCStoUTF8(long ucs)

Modified: trunk/Build/source/texk/ptexenc/version.ac
===================================================================
--- trunk/Build/source/texk/ptexenc/version.ac	2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/version.ac	2022-06-12 08:18:08 UTC (rev 63556)
@@ -10,4 +10,4 @@
 dnl see kpathsea/version.ac.
 dnl
 dnl This file is m4-included from configure.ac.
-m4_define([ptexenc_version], [1.4.1/dev])
+m4_define([ptexenc_version], [1.4.2/dev])



More information about the tex-live-commits mailing list.