texlive[63556] Build/source/texk/ptexenc: ptexenc: support guessing input file encodings
commits+takuji at tug.org
commits+takuji at tug.org
Sun Jun 12 10:18:09 CEST 2022
Revision: 63556
http://tug.org/svn/texlive?view=revision&revision=63556
Author: takuji
Date: 2022-06-12 10:18:08 +0200 (Sun, 12 Jun 2022)
Log Message:
-----------
ptexenc: support guessing input file encodings
Modified Paths:
--------------
trunk/Build/source/texk/ptexenc/ChangeLog
trunk/Build/source/texk/ptexenc/c-auto.in
trunk/Build/source/texk/ptexenc/configure
trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h
trunk/Build/source/texk/ptexenc/ptexenc/unicode.h
trunk/Build/source/texk/ptexenc/ptexenc.c
trunk/Build/source/texk/ptexenc/unicode.c
trunk/Build/source/texk/ptexenc/version.ac
Modified: trunk/Build/source/texk/ptexenc/ChangeLog
===================================================================
--- trunk/Build/source/texk/ptexenc/ChangeLog 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ChangeLog 2022-06-12 08:18:08 UTC (rev 63556)
@@ -1,3 +1,11 @@
+2022-06-12 TANAKA Takuji <ttk at t-lab.opal.ne.jp>
+
+ * ptexenc.c, unicode.c, ptexenc/ptexenc.h, ptexenc/unicode.h:
+ Add new functions ptenc_guess_enc() and set_guess_file_enc()
+ to support guessing input file encodings.
+ https://github.com/texjporg/tex-jp-build/issues/142
+ * version.ac: Bump to 1.4.2/dev.
+
2022-03-21 Karl Berry <karl at tug.org>
* TL'22 release.
Modified: trunk/Build/source/texk/ptexenc/c-auto.in
===================================================================
--- trunk/Build/source/texk/ptexenc/c-auto.in 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/c-auto.in 2022-06-12 08:18:08 UTC (rev 63556)
@@ -6,7 +6,7 @@
#define PTEXENC_C_AUTO_H
/* ptexenc: the version string. */
-#define PTEXENCVERSION "ptexenc version 1.4.1/dev"
+#define PTEXENCVERSION "ptexenc version 1.4.2/dev"
/* Define to 1 if the `closedir' function returns void instead of int. */
#undef CLOSEDIR_VOID
Modified: trunk/Build/source/texk/ptexenc/configure
===================================================================
--- trunk/Build/source/texk/ptexenc/configure 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/configure 2022-06-12 08:18:08 UTC (rev 63556)
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for ptexenc 1.4.1/dev.
+# Generated by GNU Autoconf 2.71 for ptexenc 1.4.2/dev.
#
# Report bugs to <tex-k at tug.org>.
#
@@ -629,8 +629,8 @@
# Identity of this package.
PACKAGE_NAME='ptexenc'
PACKAGE_TARNAME='ptexenc'
-PACKAGE_VERSION='1.4.1/dev'
-PACKAGE_STRING='ptexenc 1.4.1/dev'
+PACKAGE_VERSION='1.4.2/dev'
+PACKAGE_STRING='ptexenc 1.4.2/dev'
PACKAGE_BUGREPORT='tex-k at tug.org'
PACKAGE_URL=''
@@ -1375,7 +1375,7 @@
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ptexenc 1.4.1/dev to adapt to many kinds of systems.
+\`configure' configures ptexenc 1.4.2/dev to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1446,7 +1446,7 @@
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ptexenc 1.4.1/dev:";;
+ short | recursive ) echo "Configuration of ptexenc 1.4.2/dev:";;
esac
cat <<\_ACEOF
@@ -1566,7 +1566,7 @@
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ptexenc configure 1.4.1/dev
+ptexenc configure 1.4.2/dev
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -2090,7 +2090,7 @@
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ptexenc $as_me 1.4.1/dev, which was
+It was created by ptexenc $as_me 1.4.2/dev, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -2850,10 +2850,10 @@
-PTEXENCVERSION=1.4.1/dev
+PTEXENCVERSION=1.4.2/dev
-PTEXENC_LT_VERSINFO=5:1:4
+PTEXENC_LT_VERSINFO=5:2:4
am__api_version='1.16'
@@ -8418,7 +8418,7 @@
# Define the identity of the package.
PACKAGE='ptexenc'
- VERSION='1.4.1/dev'
+ VERSION='1.4.2/dev'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -14528,7 +14528,7 @@
Report bugs to <bug-libtool at gnu.org>."
lt_cl_version="\
-ptexenc config.lt 1.4.1/dev
+ptexenc config.lt 1.4.2/dev
configured by $0, generated by GNU Autoconf 2.71.
Copyright (C) 2011 Free Software Foundation, Inc.
@@ -16255,7 +16255,7 @@
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ptexenc $as_me 1.4.1/dev, which was
+This file was extended by ptexenc $as_me 1.4.2/dev, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -16323,7 +16323,7 @@
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
-ptexenc config.status 1.4.1/dev
+ptexenc config.status 1.4.2/dev
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"
Modified: trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h
===================================================================
--- trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ptexenc/ptexenc.h 2022-06-12 08:18:08 UTC (rev 63556)
@@ -5,6 +5,9 @@
#ifndef PTEXENC_PTEXENC_H
#define PTEXENC_PTEXENC_H
+#ifdef WIN32
+#include <ptexenc/c-auto.h>
+#endif
#include <kpathsea/types.h>
#if defined (WIN32) && !defined (__MINGW32__) && !defined (NO_PTENC_DLL)
@@ -22,9 +25,10 @@
#endif
extern PTENCDLL const char *ptexenc_version_string;
+extern PTENCDLL int infile_enc_auto;
#if defined(WIN32)
extern PTENCDLL FILE *Poptr;
-extern PTENCDLL int infile_enc_auto;
+extern PTENCDLL char * ptexenc_nkf(FILE *f);
#endif
#define KANJI_OPTS "{jis|euc|sjis|utf8}"
@@ -32,6 +36,7 @@
/* enable/disable UPTEX */
extern PTENCDLL void enable_UPTEX (boolean enable);
extern PTENCDLL void set_prior_file_enc(void);
+extern PTENCDLL void set_guess_file_enc(boolean enable);
extern PTENCDLL void ptenc_ptex_mode(const boolean enable);
/* get/set Kanji encoding by string */
Modified: trunk/Build/source/texk/ptexenc/ptexenc/unicode.h
===================================================================
--- trunk/Build/source/texk/ptexenc/ptexenc/unicode.h 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ptexenc/unicode.h 2022-06-12 08:18:08 UTC (rev 63556)
@@ -26,8 +26,13 @@
#define BYTE2(x) (((x)>>16) & 0xff)
#define BYTE3(x) (((x)>> 8) & 0xff)
#define BYTE4(x) ( (x) & 0xff)
-
+/* used only for debug */
+#ifdef DEBUG
+extern int UCStoUTF8S(long ucs, unsigned char *s);
+#endif
+
+
/* UTF-8 -> UCS */
#define UTF8BtoUCS(a,b) ((((a)&0x1f) << 6) | ((b)&0x3f))
#define UTF8CtoUCS(a,b,c) ((((a)&0x0f) <<12) | (((b)&0x3f) << 6) | ((c)&0x3f))
Modified: trunk/Build/source/texk/ptexenc/ptexenc.c
===================================================================
--- trunk/Build/source/texk/ptexenc/ptexenc.c 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/ptexenc.c 2022-06-12 08:18:08 UTC (rev 63556)
@@ -41,10 +41,8 @@
const char *ptexenc_version_string = PTEXENCVERSION;
#if defined(WIN32)
FILE *Poptr;
-int infile_enc_auto;
-#else
-static int infile_enc_auto = 1;
#endif
+int infile_enc_auto = 0;
static int file_enc = ENC_UNKNOWN;
static int internal_enc = ENC_UNKNOWN;
@@ -72,6 +70,8 @@
if (strcasecmp(str, "utf8") == 0) return ENC_UTF8;
if (UPTEX_enabled && strcasecmp(str, "uptex") == 0) return ENC_UPTEX;
+ if (strcasecmp(str, "ASCII")== 0) return file_enc;
+ if (strncasecmp(str, "AMBIGUOUS", 9) == 0) return file_enc;
if (strcasecmp(str, "BINARY") == 0) return ENC_JIS;
if (strcasecmp(str, "ISO-2022-JP") == 0) return ENC_JIS;
if (strcasecmp(str, "EUC-JP") == 0) return ENC_EUC;
@@ -146,6 +146,11 @@
return terminal_enc;
}
+void set_guess_file_enc(boolean enable) /* switch guessing of input file encodings on or off */
+{
+ infile_enc_auto = (int) enable; /* flag tested by input_line2() the first time it reads from a file */
+}
+
/* enable ptex mode (use flag 0x100 for Japanese char) */
void ptenc_ptex_mode (const boolean enable)
{
@@ -199,7 +204,6 @@
if (file != ENC_UNKNOWN) {
set_file_enc(file);
#if !defined(WIN32)
- infile_enc_auto = 0;
nkf_disable();
#endif
}
@@ -753,7 +757,7 @@
buffer[last++] = i;
ungetc4(j, fp);
}
-}
+}
static void get_sjis(int i, FILE *fp)
{
@@ -765,7 +769,7 @@
buffer[last++] = i;
ungetc4(j, fp);
}
-}
+}
static boolean is_tail(long *c, FILE *fp)
{
@@ -787,7 +791,7 @@
int i;
int c[MARK_LEN];
int bom_u[MARK_LEN] = { 0xEF, 0xBB, 0xBF, 0x7E };
- int bom_l[MARK_LEN] = { 0xEF, 0xBB, 0xBF, 0 };
+ int bom_l[MARK_LEN] = { 0xEF, 0xBB, 0xBF, 0x01 };
for (i=0; i<MARK_LEN; i++) {
c[i] = getc4(fp);
@@ -803,6 +807,177 @@
static int infile_enc[NOFILE]; /* ENC_UNKNOWN (=0): not determined
other: determined */
+/* Guess the encoding of the stream fp and return its name as a string. */
+/*
+ assumptions:
+ No halfwidth katakana in Shift_JIS
+ No SS2 nor SS3 in EUC-JP
+ JIS X 0208 only and no platform dependent characters in Shift_JIS, EUC-JP
+*/
+char *ptenc_guess_enc(FILE *fp) /* result is allocated with xmalloc(); fp is rewound before returning */
+{
+ char *enc;
+ int k0, k1, k2, cdb[2], cu8[4], len_utf8; /* k0: current byte; k1, k2: cdb[] converted as Shift_JIS, EUC-JP */
+ int is_ascii=1, lbyte=0; /* lbyte: number of loop iterations so far */
+ int maybe_sjis=1, maybe_euc=1, maybe_utf8=1, pos_db=0, pos_utf8=0; /* maybe_*: candidates not yet ruled out; pos_*: bytes held of a pending sequence */
+#ifdef DEBUG
+ int i;
+ unsigned char str0[5];
+#endif /* DEBUG */
+ enc = xmalloc(sizeof(char)*18); /* longest result "AMBIGUOUS(s,e,u)" needs 17 bytes including the NUL */
+
+ while ((k0 = fgetc(fp)) != EOF && maybe_sjis+maybe_euc+maybe_utf8>1) { /* stop at EOF or when at most one candidate is left */
+ lbyte++;
+ if (k0==ESC) {
+ k0 = fgetc(fp);
+ if (k0=='$') {
+ k0 = fgetc(fp);
+ if (k0=='@' || k0=='B') { /* ESC $ @ or ESC $ B */
+ strcpy(enc,"ISO-2022-JP");
+ goto post_process;
+ }
+ }
+ if (k0>0x7F) { /* 8-bit byte inside an escape sequence */
+ strcpy(enc,"BINARY");
+ goto post_process;
+ } else if (k0==EOF) {
+ break;
+ }
+ continue;
+ } else if (k0==0x00) { /* NUL byte */
+ strcpy(enc,"BINARY");
+ goto post_process;
+ } else if (k0<0x80) { /* 7-bit byte */
+ if (pos_utf8>0) { /* 7-bit byte in the middle of a UTF-8 sequence */
+ maybe_utf8 = 0;
+ pos_utf8 = 0;
+ }
+ if (pos_db==1) { /* 7-bit second byte: rules out EUC-JP, may still be Shift_JIS */
+ maybe_euc = 0;
+ pos_db = 0;
+ if (maybe_sjis) {
+ cdb[1] = k0;
+ k1 = JIStoUCS2(SJIStoJIS(HILO(cdb[0],cdb[1])));
+ if (k1) { /* pair converts to a nonzero value: keep Shift_JIS as a candidate */
+#ifdef DEBUG
+ i = UCStoUTF8S(k1, str0);
+ str0[i] = '\0';
+ fprintf(stderr, "Character for guess encoding: 0x%02X%02X", cdb[0], cdb[1]);
+ fprintf(stderr, " sjis (%s)\n", str0);
+#endif /* DEBUG */
+ continue;
+ }
+ }
+ maybe_sjis = 0; /* reached when the pair did not convert (or Shift_JIS was already ruled out) */
+ }
+ if (is_ascii && lbyte>10000) {
+ /* guess ASCII if we did not find 8bit chars in head 10000 bytes */
+ strcpy(enc,"ASCII");
+ goto post_process;
+ }
+ continue;
+ }
+ is_ascii = 0; /* only bytes >= 0x80 reach this point */
+ if (pos_db==0) { /* first byte of a double-byte candidate */
+ cdb[0] = k0;
+ cdb[1] = 0;
+ pos_db = 1;
+ }
+ else if (pos_db==1 && (maybe_sjis || maybe_euc)) { /* second byte: try the pair as Shift_JIS and as EUC-JP */
+ cdb[1] = k0;
+ k1 = JIStoUCS2(SJIStoJIS(HILO(cdb[0],cdb[1])));
+ k2 = JIStoUCS2(EUCtoJIS(HILO(cdb[0],cdb[1])));
+ if (maybe_sjis) {
+ if (!k1) /* zero result is treated as "no such character" */
+ maybe_sjis = 0;
+ }
+ if (maybe_euc) {
+ if (!k2)
+ maybe_euc = 0;
+ }
+ pos_db = 0;
+#ifdef DEBUG
+ if (maybe_sjis || maybe_euc) {
+ fprintf(stderr, "Character for guess encoding: 0x%02X%02X", cdb[0], cdb[1]);
+ if (maybe_sjis) {
+ i = UCStoUTF8S(k1, str0);
+ str0[i] = '\0';
+ fprintf(stderr, " sjis (%s)", str0);
+ }
+ if (maybe_euc) {
+ i = UCStoUTF8S(k2, str0);
+ str0[i] = '\0';
+ fprintf(stderr, " euc (%s)", str0);
+ }
+ fprintf(stderr, "\n");
+ }
+#endif /* DEBUG */
+ }
+ if (pos_utf8==0) { /* the same byte is also examined as the start of a UTF-8 sequence */
+ len_utf8 = UTF8length(k0);
+ if (len_utf8<2) { /* NOTE(review): presumably UTF8length() gives <2 for a byte that cannot start a multi-byte sequence - confirm */
+ maybe_utf8 = 0;
+ pos_utf8 = 0;
+ continue;
+ }
+ cu8[0] = k0;
+ pos_utf8 = 1;
+ }
+ else if (pos_utf8>0 && maybe_utf8) {
+ if (k0>0xBF) { /* k0 >= 0x80 here, so this byte is outside the 0x80..0xBF range */
+ maybe_utf8 = 0;
+ pos_utf8 = 0;
+ continue;
+ }
+ cu8[pos_utf8] = k0;
+ pos_utf8++;
+ if (pos_utf8==len_utf8) { /* all bytes of the sequence collected */
+ if ((cu8[0]==0xE0 && cu8[1]<0xA0) ||
+ (cu8[0]==0xED && cu8[1]>0x9F) ||
+ (cu8[0]==0xF0 && cu8[1]<0x90)) { /* illegal combination in UTF-8 */
+ maybe_utf8 = 0;
+ pos_utf8 = 0;
+ continue;
+ }
+#ifdef DEBUG
+ for (i=0; i<len_utf8; i++) str0[i] = cu8[i];
+ str0[i] = '\0';
+ fprintf(stderr, "Character for guess encoding: 0x");
+ for (i=0; i<len_utf8; i++) fprintf(stderr, "%02X", cu8[i]);
+ fprintf(stderr, " U+%06lX (%s)\n", UTF8StoUCS(str0), str0);
+#endif /* DEBUG */
+ len_utf8 = 0;
+ pos_utf8 = 0;
+ cu8[0]=cu8[1]=cu8[2]=cu8[3]=0;
+ }
+ }
+ }
+
+ if (is_ascii) /* loop finished: name the result from the candidates that survived */
+ strcpy(enc,"ASCII");
+ else if (maybe_sjis+maybe_euc+maybe_utf8>1) {
+ strcpy(enc,"AMBIGUOUS("); /* followed by the letters s, e, u of the remaining candidates */
+ if (maybe_sjis)
+ enc = strcat(enc,"s");
+ if (maybe_euc)
+ enc = strcat(enc, maybe_sjis ? ",e" : "e");
+ if (maybe_utf8)
+ enc = strcat(enc, ",u"); /* with two or more candidates, s or e has already been written */
+ enc = strcat(enc,")");
+ }
+ else if (maybe_sjis)
+ strcpy(enc,"Shift_JIS");
+ else if (maybe_euc)
+ strcpy(enc,"EUC-JP");
+ else if (maybe_utf8)
+ strcpy(enc,"UTF-8");
+ else
+ strcpy(enc,"BINARY");
+ post_process: /* every exit path passes through here */
+ rewind (fp);
+ return enc; /* input_line2() frees this buffer after use */
+}
+
/* input line with encoding conversion */
long input_line2(FILE *fp, unsigned char *buff, unsigned char *buff2,
long pos, const long buffsize, int *lastchar)
@@ -811,13 +986,35 @@
static boolean injis = false;
const int fd = fileno(fp);
+ buffer = buff;
+ first = last = pos;
+
if (infile_enc[fd] == ENC_UNKNOWN) { /* just after opened */
ungetbuff[fd].size = 0;
- if (isUTF8Nstream(fp)) infile_enc[fd] = ENC_UTF8;
- else infile_enc[fd] = get_file_enc();
+ if (isUTF8Nstream(fp)) {
+ infile_enc[fd] = ENC_UTF8;
+#ifdef DEBUG
+ fprintf(stderr, "Detect UTF-8 with BOM #%d\n", fd);
+#endif /* DEBUG */
+ }
+ else if (infile_enc_auto && fd != fileno(stdin)) {
+ char *enc;
+ getc4(fp);
+ getc4(fp);
+ getc4(fp);
+ getc4(fp);
+ rewind(fp);
+ enc = ptenc_guess_enc(fp);
+ if (string_to_enc(enc) > 0) {
+ infile_enc[fd] = string_to_enc(enc);
+ fprintf(stderr, "(guessed encoding #%d: %s = %s)", fd, enc, enc_to_string(infile_enc[fd]));
+ } else {
+ infile_enc[fd] = get_file_enc();
+ }
+ if (enc) free(enc);
+ }
+ else infile_enc[fd] = get_file_enc();
}
- buffer = buff;
- first = last = pos;
while (last < buffsize-30 && (i=getc4(fp)) != EOF && i!='\n' && i!='\r') {
/* 30 is enough large size for one char */
Modified: trunk/Build/source/texk/ptexenc/unicode.c
===================================================================
--- trunk/Build/source/texk/ptexenc/unicode.c 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/unicode.c 2022-06-12 08:18:08 UTC (rev 63556)
@@ -72,7 +72,8 @@
}
}
-#if 0 /* not used */
+/* used only for debug */
+#ifdef DEBUG
int UCStoUTF8S(long ucs, unsigned char *s)
{
if (ucs < 0x80) {
@@ -96,7 +97,7 @@
}
return 0; /* unsupported */
}
-#endif /* 0 */
+#endif /* DEBUG */
long UCStoUTF8(long ucs)
Modified: trunk/Build/source/texk/ptexenc/version.ac
===================================================================
--- trunk/Build/source/texk/ptexenc/version.ac 2022-06-11 23:52:33 UTC (rev 63555)
+++ trunk/Build/source/texk/ptexenc/version.ac 2022-06-12 08:18:08 UTC (rev 63556)
@@ -10,4 +10,4 @@
dnl see kpathsea/version.ac.
dnl
dnl This file is m4-included from configure.ac.
-m4_define([ptexenc_version], [1.4.1/dev])
+m4_define([ptexenc_version], [1.4.2/dev])
More information about the tex-live-commits
mailing list.