texlive[60856] Build/source/texk: upmendex 0.60: enhance procedure of classifying characters

commits+takuji at tug.org commits+takuji at tug.org
Sun Oct 24 07:44:35 CEST 2021


Revision: 60856
          http://tug.org/svn/texlive?view=revision&revision=60856
Author:   takuji
Date:     2021-10-24 07:44:35 +0200 (Sun, 24 Oct 2021)
Log Message:
-----------
upmendex 0.60: enhance procedure of classifying characters

Modified Paths:
--------------
    trunk/Build/source/texk/README
    trunk/Build/source/texk/upmendex/ChangeLog
    trunk/Build/source/texk/upmendex/Makefile.am
    trunk/Build/source/texk/upmendex/Makefile.in
    trunk/Build/source/texk/upmendex/configure
    trunk/Build/source/texk/upmendex/configure.ac
    trunk/Build/source/texk/upmendex/convert.c
    trunk/Build/source/texk/upmendex/exkana.h
    trunk/Build/source/texk/upmendex/fwrite.c
    trunk/Build/source/texk/upmendex/kana.h
    trunk/Build/source/texk/upmendex/mendex.h
    trunk/Build/source/texk/upmendex/sort.c

Removed Paths:
-------------
    trunk/Build/source/texk/upmendex/ktable.h

Modified: trunk/Build/source/texk/README
===================================================================
--- trunk/Build/source/texk/README	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/README	2021-10-24 05:44:35 UTC (rev 60856)
@@ -108,7 +108,7 @@
 
 ttfdump - maintained here, by us, since Taiwan upstream apparently gone.
 
-upmendex 0.59 - by Takuji Tanaka
+upmendex 0.60 - by Takuji Tanaka
   http://www.ctan.org/pkg/upmendex
   https://github.com/t-tk/upmendex-package
 

Modified: trunk/Build/source/texk/upmendex/ChangeLog
===================================================================
--- trunk/Build/source/texk/upmendex/ChangeLog	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/ChangeLog	2021-10-24 05:44:35 UTC (rev 60856)
@@ -1,3 +1,14 @@
+2021-10-24  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
+
+	* version 0.60  Beta version.
+	* configure.ac: Bump version.
+	* Makefile.am, convert.c, fwrite.c, sort.c,
+	{,ex}kana.h, ktable.h, mendex.h:
+	Enhance procedure of classifying characters:
+	Newly classifies by character type of
+	General Category in Unicode Script Property.
+	https://github.com/t-tk/upmendex-package/issues/8
+
 2021-09-19  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
 
 	* fread.c:
@@ -170,7 +181,7 @@
 
 2020-05-01  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
 
-	* main.c, version.h, Makefile.{am.in}: Add bug report addresses
+	* main.c, version.h, Makefile.am: Add bug report addresses
 	on command line help.
 	* convert.c: Tune buffer sizes to avoid tool warnings.
 

Modified: trunk/Build/source/texk/upmendex/Makefile.am
===================================================================
--- trunk/Build/source/texk/upmendex/Makefile.am	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/Makefile.am	2021-10-24 05:44:35 UTC (rev 60856)
@@ -19,7 +19,6 @@
 	kana.h \
 	kp.c \
 	kp.h \
-	ktable.h \
 	main.c \
 	mendex.h \
 	version.h \

Modified: trunk/Build/source/texk/upmendex/Makefile.in
===================================================================
--- trunk/Build/source/texk/upmendex/Makefile.in	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/Makefile.in	2021-10-24 05:44:35 UTC (rev 60856)
@@ -593,7 +593,6 @@
 	kana.h \
 	kp.c \
 	kp.h \
-	ktable.h \
 	main.c \
 	mendex.h \
 	version.h \

Modified: trunk/Build/source/texk/upmendex/configure
===================================================================
--- trunk/Build/source/texk/upmendex/configure	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/configure	2021-10-24 05:44:35 UTC (rev 60856)
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for upmendex (TeX Live) 0.59.
+# Generated by GNU Autoconf 2.71 for upmendex (TeX Live) 0.60.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -626,8 +626,8 @@
 # Identity of this package.
 PACKAGE_NAME='upmendex (TeX Live)'
 PACKAGE_TARNAME='upmendex--tex-live-'
-PACKAGE_VERSION='0.59'
-PACKAGE_STRING='upmendex (TeX Live) 0.59'
+PACKAGE_VERSION='0.60'
+PACKAGE_STRING='upmendex (TeX Live) 0.60'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1390,7 +1390,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures upmendex (TeX Live) 0.59 to adapt to many kinds of systems.
+\`configure' configures upmendex (TeX Live) 0.60 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1462,7 +1462,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of upmendex (TeX Live) 0.59:";;
+     short | recursive ) echo "Configuration of upmendex (TeX Live) 0.60:";;
    esac
   cat <<\_ACEOF
 
@@ -1587,7 +1587,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-upmendex (TeX Live) configure 0.59
+upmendex (TeX Live) configure 0.60
 generated by GNU Autoconf 2.71
 
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -2268,7 +2268,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by upmendex (TeX Live) $as_me 0.59, which was
+It was created by upmendex (TeX Live) $as_me 0.60, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   $ $0$ac_configure_args_raw
@@ -8806,7 +8806,7 @@
 
 # Define the identity of the package.
  PACKAGE='upmendex--tex-live-'
- VERSION='0.59'
+ VERSION='0.60'
 
 
 # Some tools Automake needs.
@@ -18942,7 +18942,7 @@
 Report bugs to <bug-libtool at gnu.org>."
 
 lt_cl_version="\
-upmendex (TeX Live) config.lt 0.59
+upmendex (TeX Live) config.lt 0.60
 configured by $0, generated by GNU Autoconf 2.71.
 
 Copyright (C) 2011 Free Software Foundation, Inc.
@@ -21114,7 +21114,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by upmendex (TeX Live) $as_me 0.59, which was
+This file was extended by upmendex (TeX Live) $as_me 0.60, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -21182,7 +21182,7 @@
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-upmendex (TeX Live) config.status 0.59
+upmendex (TeX Live) config.status 0.60
 configured by $0, generated by GNU Autoconf 2.71,
   with options \\"\$ac_cs_config\\"
 

Modified: trunk/Build/source/texk/upmendex/configure.ac
===================================================================
--- trunk/Build/source/texk/upmendex/configure.ac	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/configure.ac	2021-10-24 05:44:35 UTC (rev 60856)
@@ -8,7 +8,7 @@
 dnl   gives unlimited permission to copy and/or distribute it,
 dnl   with or without modifications, as long as this notice is preserved.
 dnl
-AC_INIT([upmendex (TeX Live)],[0.59])
+AC_INIT([upmendex (TeX Live)],[0.60])
 AC_PREREQ([2.71])
 AC_CONFIG_SRCDIR([main.c])
 AC_CONFIG_AUX_DIR([../../build-aux])

Modified: trunk/Build/source/texk/upmendex/convert.c
===================================================================
--- trunk/Build/source/texk/upmendex/convert.c	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/convert.c	2021-10-24 05:44:35 UTC (rev 60856)
@@ -6,7 +6,6 @@
 #include "qsort.h"
 
 #include "exkana.h"
-#include "ktable.h"
 #include "exvar.h"
 
 #include "kp.h"
@@ -25,8 +24,6 @@
 
 static int dicvalread(const char *filename, struct dictionary *dicval, int line);
 
-#define is_alpha_numeric(a)  (((a)>=ALPHATOP && (a)<=ALPHAEND) || (a)==YENSIGN)
-
 UChar * u_xstrdup (const UChar *string)
 {
 	return u_strcpy(xmalloc((u_strlen(string)+1)*sizeof(UChar)), string);
@@ -210,16 +207,25 @@
 				i++;
 			}
 
-			else if (buff1[i]<0x80) {
+			else if (buff1[i]<0x20 && buff1[i]!='\t') { /* ignore control characters */
+				i++;
+			}
+
+			else if (buff1[i]<0x7F) {
 				buff2[j]=buff1[i];
 				i++;
 				j++;
 			}
 
+			else if (buff1[i]<0xA0) { /* ignore control characters */
+				i++;
+			}
+
 			else if (is_latin(buff3)||is_cyrillic(buff3)||is_greek(buff3)
 				 ||is_jpn_kana(buff3)||is_kor_hngl(buff3)||is_zhuyin(buff3)
-				 ||is_numeric(buff3)||is_devanagari(buff3)||is_thai(buff3)
-					||is_comb_diacritical_mark(buff3)) {
+				 ||is_numeric(buff3)==1||is_type_symbol(buff3)==1
+				 ||is_devanagari(buff3)||is_thai(buff3)
+					||is_type_mark_or_punct(buff3)) {
 				buff2[j]=buff3[0];
 				if (wclen==2) buff2[j+1]=buff3[1];
 				i+=wclen;
@@ -226,61 +232,42 @@
 				j+=wclen;
 			}
 
-			else if (buff1[i]>=0x80) {
-				if (is_alpha_numeric(buff3[0])) {
-/*   alpha-numeric,symbols   */
-					for (k=0;k<u_strlen(symboltable);k++) {
-						if (buff3[0]==symboltable[k]) {
-							buff2[j]=k+0x20;
-							if ((buff2[j]>='a')&&(buff2[j]<='z')) buff2[j]-=32;
-							i++;
-							j+=wclen;
-							break;
-						}
+			else {
+				for (k=0;k<dlines;k++) {
+/*   dictionary table   */
+					if (u_strncmp(dictable[k].dic[0],&buff1[i],u_strlen(dictable[k].dic[0]))==0) {
+						u_strncpy(&buff2[j],dictable[k].dic[1],u_strlen(dictable[k].dic[1]));
+						i+=u_strlen(dictable[k].dic[0]);
+						j+=u_strlen(dictable[k].dic[1]);
+						break;
 					}
-					if (k==u_strlen(symboltable)) {
-						i++;
-						buff2[j++]=buff3[0];
-					}
 				}
-
-				else {
-					for (k=0;k<dlines;k++) {
-/*   dictionary table   */
-						if (u_strncmp(dictable[k].dic[0],&buff1[i],u_strlen(dictable[k].dic[0]))==0) {
-							u_strncpy(&buff2[j],dictable[k].dic[1],u_strlen(dictable[k].dic[1]));
+				if ((k==dlines)&&(elines!=0)) {
+/*   environment dictionary table   */
+					for (k=0;k<elines;k++) {
+						if (u_strncmp(envdic[k].dic[0],&buff1[i],u_strlen(envdic[k].dic[0]))==0) {
+							u_strncpy(&buff2[j],envdic[k].dic[1],u_strlen(envdic[k].dic[1]));
 							i+=u_strlen(dictable[k].dic[0]);
-							j+=u_strlen(dictable[k].dic[1]);
+							j+=u_strlen(envdic[k].dic[1]);
 							break;
 						}
 					}
-					if ((k==dlines)&&(elines!=0)) {
-/*   environment dictionary table   */
-						for (k=0;k<elines;k++) {
-							if (u_strncmp(envdic[k].dic[0],&buff1[i],u_strlen(envdic[k].dic[0]))==0) {
-								u_strncpy(&buff2[j],envdic[k].dic[1],u_strlen(envdic[k].dic[1]));
-								i+=u_strlen(dictable[k].dic[0]);
-								j+=u_strlen(envdic[k].dic[1]);
-								break;
-							}
-						}
-					}
-					if (((k==dlines)&&(elines==0))||((k==elines)&&(elines!=0))) {
-						if (is_hanzi(buff3) || force==1) {
+				}
+				if (((k==dlines)&&(elines==0))||((k==elines)&&(elines!=0))) {
+					if (is_hanzi(buff3) || is_numeric(buff3) || is_type_symbol(buff3) || force==1) {
 /*   forced convert   */
-							buff2[j]=buff3[0];
-							if (wclen==2) buff2[j+1]=buff3[1];
-							i+=wclen;
-							j+=wclen;
-						}
-						else {
-							widechar_to_multibyte(errbuff,BUFFERLEN2,&buff1[i]);
-							snprintf(errbuff2,BUFFERLEN3,"\nError: %s is no entry in dictionary file ",errbuff);
-							fputs(errbuff2,efp);
-							if (efp!=stderr) fputs(errbuff2,stderr);
-							return -1;
-						}
+						buff2[j]=buff3[0];
+						if (wclen==2) buff2[j+1]=buff3[1];
+						i+=wclen;
+						j+=wclen;
 					}
+					else {
+						widechar_to_multibyte(errbuff,BUFFERLEN2,&buff1[i]);
+						snprintf(errbuff2,BUFFERLEN3,"\nError: %s is no entry in dictionary file ",errbuff);
+						fputs(errbuff2,efp);
+						if (efp!=stderr) fputs(errbuff2,stderr);
+						return -1;
+					}
 				}
 			}
 		}

Modified: trunk/Build/source/texk/upmendex/exkana.h
===================================================================
--- trunk/Build/source/texk/upmendex/exkana.h	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/exkana.h	2021-10-24 05:44:35 UTC (rev 60856)
@@ -4,7 +4,6 @@
 extern UChar *aiueo;
 extern UChar kanatable[];
 extern UChar extkanatable[];
-extern UChar circkanatable[];
 
 #define SPACE    0x3000    /* 全角スペース */
 #define ALPHATOP 0xff01    /* ! */
@@ -18,8 +17,19 @@
 #define EXKANAEND  0x31FF  /* ㇿ */
 #define CRKANATOP  0x32D0  /* ㋐ */
 #define CRKANAEND  0x32FE  /* ㋾ */
+#define HANKANATOP 0xFF71  /* ア */
+#define HANKANAEND 0xFF9D  /* ン */
+#define HANKANAWO  0xFF66  /* ヲ */
+#define HANKANATU  0xFF6F  /* ッ */
+#define SQKANATOP  0x3300  /* ㌀ */
+#define SQKANAEND  0x3357  /* ㍗ */
+#define CRLATNTOP  0x24B6  /* Ⓐ */
+#define CRLATNEND  0x24E9  /* ⓩ */
 
 #define is_katakana(a)  ((a)>=KATATOP && (a)<=KATAEND)
 #define is_hiragana(a)  ((a)>=HIRATOP && (a)<=HIRAEND)
 #define is_extkana(a)   ((a)>=EXKANATOP && (a)<=EXKANAEND)
 #define is_circkana(a)  ((a)>=CRKANATOP && (a)<=CRKANAEND)
+#define is_hankana(a)   ((a)>=HANKANATOP && (a)<=HANKANAEND || (a)>=HANKANAWO && (a)<=HANKANATU)
+#define is_sqkana(a)    ((a)>=SQKANATOP && (a)<=SQKANAEND)
+#define is_circlatin(a) ((a)>=CRLATNTOP && (a)<=CRLATNEND)

Modified: trunk/Build/source/texk/upmendex/fwrite.c
===================================================================
--- trunk/Build/source/texk/upmendex/fwrite.c	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/fwrite.c	2021-10-24 05:44:35 UTC (rev 60856)
@@ -18,7 +18,7 @@
 static void index_normalize(UChar *istr, UChar *ini, int *chset);
 static int initial_cmp_char(UChar *ini, UChar ch);
 static int init_hanzi_header(void);
-static const UNormalizer2* unormalizer_NFD;
+static const UNormalizer2 *unormalizer_NFD, *unormalizer_NFKD;
 static int turkish_i;
 
 #define M_NONE      0
@@ -189,7 +189,8 @@
 		fprintf(fp,"%s%d%s",setpage_prefix,pagenum,setpage_suffix);
 	}
 	perr=U_ZERO_ERROR;
-	unormalizer_NFD=unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, &perr);
+	unormalizer_NFD =unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, &perr);
+	unormalizer_NFKD=unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, &perr);
 
 	if (strlen(symhead)==0) {
 		if (lethead_flag>0) {
@@ -427,12 +428,12 @@
 			}
 			else {
 				if (chset_prev!=chset) {
-					if ((CH_LATIN<=chset_prev&&chset_prev<=CH_THAI) || symbol_flag==2)
+					if (is_any_script(chset_prev) || symbol_flag==2)
 						fputs(group_skip,fp);
 					if (lethead_flag!=0 && symbol_flag==2 && chset==CH_NUMERIC) {
 						fprintf(fp,"%s%s%s",lethead_prefix,numhead,lethead_suffix);
 					}
-					if (lethead_flag!=0 && (symbol_flag==1 && (CH_LATIN<=chset_prev&&chset_prev<=CH_THAI) ||
+					if (lethead_flag!=0 && (symbol_flag==1 && is_any_script(chset_prev) ||
 								symbol_flag==2 && chset!=CH_NUMERIC) ) {
 						fprintf(fp,"%s%s%s",lethead_prefix,symhead,lethead_suffix);
 					}
@@ -763,6 +764,13 @@
 	*chset=charset(istr);
 	ini[1]=L'\0';
 
+	if (is_circkana(ch) || is_hankana(ch) || is_sqkana(ch) || is_circlatin(ch)) {  /* ㋐㋑㋒.. アイウ.. ㌀㌁㌂.. */
+		src[0]=ch;  src[1]=0x00;
+		perr=U_ZERO_ERROR;
+		unorm2_normalize(unormalizer_NFKD, src, 1, dest, 8, &perr);
+		if (U_SUCCESS(perr))
+			ch=dest[0];
+	}
 	if (is_hiragana(ch)) {
 		ch+=KATATOP-HIRATOP; /* hiragana -> katakana */
 	}
@@ -774,15 +782,15 @@
 		ini[0]=extkanatable[ch-EXKANATOP];
 		return;
 	}
-	if (is_circkana(ch)) {     /* ㋐㋑㋒㋓㋔ .. ㋻㋼㋽㋾ */
-		ini[0]=circkanatable[ch-CRKANATOP];
-		return;
-	}
-	else if (ch==0x309F) { ini[0]=0x3088; return; }  /* HIRAGANA YORI -> よ */
+	if      (ch==0x309F) { ini[0]=0x3088; return; }  /* HIRAGANA YORI -> よ */
 	else if (ch==0x30FF) { ini[0]=0x3053; return; }  /* KATAKANA KOTO -> こ */
 	else if (is_jpn_kana(istr)==2) {
 		c32=U16_GET_SUPPLEMENTARY(istr[0],istr[1]);
 		switch (c32) {
+			case 0x1F200:                  /* 🈀 */
+				ini[0]=0x307B; break;  /* ほ */
+			case 0x1B000:                  /* 𛀀 */
+				ini[0]=0x3048; break;  /* え */
 			case 0x1B150: case 0x1B164:
 				ini[0]=0x3090; break;  /* ゐ */
 			case 0x1B151: case 0x1B165:
@@ -889,6 +897,10 @@
 		ini[0]=ch;
 		return;
 	}
+	if (ch>=0xFF21&&ch<=0xFF3A || ch>=0xFF41&&ch<=0xFF5A) {
+		/* Fullwidth latin letter */
+		ch-=0xFF21-0x0041;
+	}
 	if (ch==0x049||ch==0x069||ch==0x130||ch==0x131||ch==0x0CE||ch==0x0EE) {
 		/* check dotted/dotless İ,I,i,ı and Î,î for Turkish */
 		strX[0] = 0x131;  strX[1] = 0x5A;  strX[2] = 0x00;  /* ıZ */
@@ -1018,6 +1030,8 @@
 			}
 		}
 	}
+	if (ch==0x0AA) ch=L'A';
+	if (ch==0x0BA) ch=L'O';
 	ini[0]=u_toupper(ch);
 	return;
 }

Modified: trunk/Build/source/texk/upmendex/kana.h
===================================================================
--- trunk/Build/source/texk/upmendex/kana.h	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/kana.h	2021-10-24 05:44:35 UTC (rev 60856)
@@ -192,56 +192,6 @@
 	0x308d, /* ろ */
 	0
 };
-UChar circkanatable[]={
-	0x3042, /* あ */
-	0x3044, /* い */
-	0x3046, /* う */
-	0x3048, /* え */
-	0x304a, /* お */
-	0x304b, /* か */
-	0x304d, /* き */
-	0x304f, /* く */
-	0x3051, /* け */
-	0x3053, /* こ */
-	0x3055, /* さ */
-	0x3057, /* し */
-	0x3059, /* す */
-	0x305b, /* せ */
-	0x305d, /* そ */
-	0x305f, /* た */
-	0x3061, /* ち */
-	0x3064, /* つ */
-	0x3066, /* て */
-	0x3068, /* と */
-	0x306a, /* な */
-	0x306b, /* に */
-	0x306c, /* ぬ */
-	0x306d, /* ね */
-	0x306e, /* の */
-	0x306f, /* は */
-	0x3072, /* ひ */
-	0x3075, /* ふ */
-	0x3078, /* へ */
-	0x307b, /* ほ */
-	0x307e, /* ま */
-	0x307f, /* み */
-	0x3080, /* む */
-	0x3081, /* め */
-	0x3082, /* も */
-	0x3084, /* や */
-	0x3086, /* ゆ */
-	0x3088, /* よ */
-	0x3089, /* ら */
-	0x308a, /* り */
-	0x308b, /* る */
-	0x308c, /* れ */
-	0x308d, /* ろ */
-	0x308f, /* わ */
-	0x3090, /* ゐ */
-	0x3091, /* ゑ */
-	0x3092, /* を */
-	0
-};
 
 
 UChar GANADA[]={

Deleted: trunk/Build/source/texk/upmendex/ktable.h
===================================================================
--- trunk/Build/source/texk/upmendex/ktable.h	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/ktable.h	2021-10-24 05:44:35 UTC (rev 60856)
@@ -1,98 +0,0 @@
-UChar symboltable[]={
-	0x3000, /* スペース */
-	0xff01, /* ! */
-	0xff02, /* ” */
-	0xff03, /* # */
-	0xff04, /* $ */
-	0xff05, /* % */
-	0xff06, /* & */
-	0xff07, /* ’ */
-	0xff08, /* ( */
-	0xff09, /* ) */
-	0xff0a, /* * */
-	0xff0b, /* + */
-	0xff0c, /* , */
-	0xff0d, /* − */
-	0xff0e, /* . */
-	0xff0f, /* / */
-	0xff10, /* 0 */
-	0xff11, /* 1 */
-	0xff12, /* 2 */
-	0xff13, /* 3 */
-	0xff14, /* 4 */
-	0xff15, /* 5 */
-	0xff16, /* 6 */
-	0xff17, /* 7 */
-	0xff18, /* 8 */
-	0xff19, /* 9 */
-	0xff1a, /* : */
-	0xff1b, /* ; */
-	0xff1c, /* < */
-	0xff1d, /* = */
-	0xff1e, /* > */
-	0xff1f, /* ? */
-	0xff20, /* @ */
-	0xff21, /* A */
-	0xff22, /* B */
-	0xff23, /* C */
-	0xff24, /* D */
-	0xff25, /* E */
-	0xff26, /* F */
-	0xff27, /* G */
-	0xff28, /* H */
-	0xff29, /* I */
-	0xff2a, /* J */
-	0xff2b, /* K */
-	0xff2c, /* L */
-	0xff2d, /* M */
-	0xff2e, /* N */
-	0xff2f, /* O */
-	0xff30, /* P */
-	0xff31, /* Q */
-	0xff32, /* R */
-	0xff33, /* S */
-	0xff34, /* T */
-	0xff35, /* U */
-	0xff36, /* V */
-	0xff37, /* W */
-	0xff38, /* X */
-	0xff39, /* Y */
-	0xff3a, /* Z */
-	0xff3b, /* [ */
-	0xffe5, /* ¥ */
-	0xff3d, /* ] */
-	0xff3e, /* ^ */
-	0xff3f, /* _ */
-	0xff40, /* ‘ */
-	0xff41, /* a */
-	0xff42, /* b */
-	0xff43, /* c */
-	0xff44, /* d */
-	0xff45, /* e */
-	0xff46, /* f */
-	0xff47, /* g */
-	0xff48, /* h */
-	0xff49, /* i */
-	0xff4a, /* j */
-	0xff4b, /* k */
-	0xff4c, /* l */
-	0xff4d, /* m */
-	0xff4e, /* n */
-	0xff4f, /* o */
-	0xff50, /* p */
-	0xff51, /* q */
-	0xff52, /* r */
-	0xff53, /* s */
-	0xff54, /* t */
-	0xff55, /* u */
-	0xff56, /* v */
-	0xff57, /* w */
-	0xff58, /* x */
-	0xff59, /* y */
-	0xff5a, /* z */
-	0xff5b, /* { */
-	0xff5c, /* | */
-	0xff5d, /* } */
-	0xff5f, /* 〜 */
-	0
-};

Modified: trunk/Build/source/texk/upmendex/mendex.h
===================================================================
--- trunk/Build/source/texk/upmendex/mendex.h	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/mendex.h	2021-10-24 05:44:35 UTC (rev 60856)
@@ -49,7 +49,6 @@
 /* sort.c */
 void wsort(struct index *ind, int num);
 void pagesort(struct index *ind, int num);
-int is_alphanumeric(UChar *c);
 int is_latin(UChar *c);
 int is_numeric(UChar *c);
 int is_jpn_kana(UChar *c);
@@ -60,7 +59,8 @@
 int is_greek(UChar *c);
 int is_devanagari(UChar *c);
 int is_thai(UChar *c);
-int is_comb_diacritical_mark(UChar *c);
+int is_type_mark_or_punct(UChar *c);
+int is_type_symbol(UChar *c);
 int chkcontinue(struct page *p, int num);
 int ss_comp(UChar *s1, UChar *s2);
 
@@ -75,6 +75,7 @@
 #define CH_THAI         8
 #define CH_SYMBOL   0x100
 #define CH_NUMERIC  0x101
+#define  is_any_script(a)  ((CH_LATIN<=(a) && (a)<=CH_THAI))
 
 /* sort.c */
 int charset(UChar *c);

Modified: trunk/Build/source/texk/upmendex/sort.c
===================================================================
--- trunk/Build/source/texk/upmendex/sort.c	2021-10-24 02:54:02 UTC (rev 60855)
+++ trunk/Build/source/texk/upmendex/sort.c	2021-10-24 05:44:35 UTC (rev 60856)
@@ -290,13 +290,15 @@
 
 static int ordering(UChar *c)
 {
-	if (*c<0x80) {
-		if (is_latin(c)) return ltn;
-		else if (is_numeric(c)) return nmbr;
-		else return sym;
+	if      (*c<0x20)                return sym;  /* control */
+	else if (*c<0x7F) {
+		if      (is_latin(c))    return ltn;
+		else if (is_numeric(c))  return nmbr;
+		else                     return sym;
 	}
+	else if (*c<0xA0)                return sym;  /* control */
 	else {
-		if (is_latin(c)) return ltn;
+		if      (is_latin(c))    return ltn;
 		else if (is_jpn_kana(c)) return kana;
 		else if (is_kor_hngl(c)) return hngl;
 		else if (is_hanzi(c))    return hnz;
@@ -305,20 +307,21 @@
 		else if (is_numeric(c))  return nmbr;
 		else if (is_devanagari(c)) return dvng;
 		else if (is_thai(c))     return thai;
-		else return sym;
+		else                     return sym;
 	}
 }
 
 int charset(UChar *c)
 {
-	if (*c==0x00) return CH_UNKNOWN;
-	else if (*c<0x80) {
-		if (is_latin(c)) return CH_LATIN;
-		else if (is_numeric(c)) return CH_NUMERIC;
-		else return CH_SYMBOL;
+	if      (*c<0x20)                return CH_UNKNOWN;  /* control */
+	else if (*c<0x7F) {
+		if      (is_latin(c))    return CH_LATIN;
+		else if (is_numeric(c))  return CH_NUMERIC;
+		else                     return CH_SYMBOL;
 	}
+	else if (*c<0xA0)                return CH_UNKNOWN;  /* control */
 	else {
-		if (is_latin(c)) return CH_LATIN;
+		if      (is_latin(c))    return CH_LATIN;
 		else if (is_jpn_kana(c)) return CH_KANA;
 		else if (is_kor_hngl(c)) return CH_HANGUL;
 		else if (is_hanzi(c))    return CH_HANZI;
@@ -327,7 +330,7 @@
 		else if (is_numeric(c))  return CH_NUMERIC;
 		else if (is_devanagari(c)) return CH_DEVANAGARI;
 		else if (is_thai(c))     return CH_THAI;
-		else return CH_SYMBOL;
+		else                     return CH_SYMBOL;
 	}
 }
 
@@ -347,11 +350,18 @@
 		else l = k-1;
 		chset_l=charset(&str[l]);
 		chset_k=charset(&str[k]);
+		if (chset0==CH_UNKNOWN && is_any_script(chset_l)) {
+			chset0=chset_l;
+		}
+		if (chset0!=CH_UNKNOWN && is_any_script(chset_k)) {
+#if 0
 		if (chset0==CH_UNKNOWN && chset_l!=CH_SYMBOL && chset_l!=CH_NUMERIC) {
 			chset0=chset_l;
 		}
 		if (chset_k!=CH_SYMBOL && chset_k!=CH_NUMERIC) {
+#endif
 			if (chset0!=chset_k) {
+			  fprintf(stderr,"sort.c: ###DBG1000 %d %d %d len:%d %05x %05x\n", chset0, chset_l, chset_k, k, str[l], str[k]);
 				len=k;
 				return len;
 			}
@@ -396,17 +406,11 @@
 	return -1;
 }
 
-int is_alphanumeric(UChar *c)
-{
-	if (((*c>=L'A')&&(*c<=L'Z'))||((*c>=L'a')&&(*c<=L'z'))||((*c>=L'0')&&(*c<=L'9')))
-		return 1;
-	else return 0;
-}
-
 int is_latin(UChar *c)
 {
 	if (((*c>=L'A')&&(*c<=L'Z'))||((*c>=L'a')&&(*c<=L'z'))) return 1;
-	else if ((*c>=0x00C0)&&(*c<=0x00D6)) return 1; /* Latin-1 Supplement */
+	else if ((*c==0x00AA)||(*c==0x00BA)) return 1; /* Latin-1 Supplement */
+	else if ((*c>=0x00C0)&&(*c<=0x00D6)) return 1;
 	else if ((*c>=0x00D8)&&(*c<=0x00F6)) return 1;
 	else if ((*c>=0x00F8)&&(*c<=0x00FF)) return 1;
 	else if ((*c>=0x0100)&&(*c<=0x024F)) return 1; /* Latin Extended-A,B */
@@ -416,15 +420,38 @@
 	else if ((*c>=0xAB30)&&(*c<=0xAB6F)) return 1; /* Latin Extended-E */
 	else if ((*c>=0x1E00)&&(*c<=0x1EFF)) return 1; /* Latin Extended Additional */
 	else if ((*c>=0xFB00)&&(*c<=0xFB06)) return 1; /* Latin ligatures */
+	else if ((*c>=0xFF21)&&(*c<=0xFF3A)) return 1; /* Fullwidth Latin Capital Letter */
+	else if ((*c>=0xFF41)&&(*c<=0xFF5A)) return 1; /* Fullwidth Latin Small Letter */
+		/* Property of followings is "Common, So (other symbol)", but seem to be treated as Latin by ICU collator */
+	else if ((*c>=0x24B6)&&(*c<=0x24CF)) return 1; /* CIRCLED LATIN CAPITAL LETTER */
+	else if ((*c>=0x24D0)&&(*c<=0x24E9)) return 1; /* CIRCLED LATIN SMALL LETTER */
 	else return 0;
 }
 
 int is_numeric(UChar *c)
 {
+	UChar32 c32;
+
 	if ((*c>=L'0')&&(*c<=L'9')) return 1;
-	else if ((*c>=0x0966)&&(*c<=0x096F)) return 1; /* Devanagari Digit */
-	else if ((*c>=0x0E50)&&(*c<=0x0E59)) return 1; /* Thai Digit */
-	else return 0;
+	else if ((*c>=0xFF10)&&(*c<=0xFF19)) return 1; /* Fullwidth Digit */
+		/* followings do not seem to be treated as numbers by ICU collator though charType is U_OTHER_NUMBER */
+	else if ((*c>=0x3192)&&(*c<=0x3195)) return 0; /* IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK */
+	else if ((*c>=0x3220)&&(*c<=0x3229)) return 0; /* PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN */
+	else if ((*c>=0x3280)&&(*c<=0x3289)) return 0; /* CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN */
+	else if ((*c>=0xA830)&&(*c<=0xA835)) return 0; /* NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS */
+
+	if (is_surrogate_pair(c))
+		c32=U16_GET_SUPPLEMENTARY(*c,*(c+1));
+	else c32=*c;
+
+	switch (u_charType(c32)) {
+	case U_DECIMAL_DIGIT_NUMBER:
+		return 1;
+	case U_OTHER_NUMBER:
+		return 2;
+	default:
+		return 0;
+	}
 }
 
 int is_jpn_kana(UChar *c)
@@ -431,13 +458,19 @@
 {
 	UChar32 c32;
 
-	if      ((*c>=0x3040)&&(*c<=0x30FF)) return 1; /* Hiragana, Katakana */
+	if       (*c==0x30A0)                return 0; /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
+	else if  (*c==0x30FB)                return 0; /* KATAKANA MIDDLE DOT */
+	else if ((*c>=0x3040)&&(*c<=0x30FF)) return 1; /* Hiragana, Katakana */
 	else if ((*c>=0x31F0)&&(*c<=0x31FF)) return 1; /* Katakana Phonetic Extensions */
 	else if ((*c>=0x32D0)&&(*c<=0x32FE)) return 1; /* Circled Katakana */
+	else if ((*c>=0xFF66)&&(*c<=0xFF9F)) return 1; /* Halfwidth Katakana */
+	else if ((*c>=0x3300)&&(*c<=0x3357)) return 1; /* Squared Katakana words */
 
 	if (is_surrogate_pair(c)) {
 		c32=U16_GET_SUPPLEMENTARY(*c,*(c+1));
 		if ((c32>=0x1B130) && (c32<=0x1B16F)) return 2; /* Small Kana Extensions */
+		else if ((c32==0x1B000))              return 2; /* KATAKANA LETTER ARCHAIC E */
+		else if ((c32==0x1F200))              return 2; /* SQUARE HIRAGANA HOKA */
 	}
 	return 0;
 		/* ICU 65 does not seem to support
@@ -503,7 +536,8 @@
 
 int is_devanagari(UChar *c)
 {
-	if      ((*c>=0x0966)&&(*c<=0x096F)) return 0; /* Devanagari Digit */
+	if      ((*c>=0x0964)&&(*c<=0x0965)) return 0; /* Generic punctuation for scripts of India */
+	else if ((*c>=0x0966)&&(*c<=0x096F)) return 0; /* Devanagari Digit */
 	else if ((*c>=0x0900)&&(*c<=0x097F)) return 1; /* Devanagari */
 	else if ((*c>=0xA8E0)&&(*c<=0xA8FF)) return 1; /* Devanagari Extended */
 	else return 0;
@@ -517,16 +551,45 @@
 	else return 0;
 }
 
-int is_comb_diacritical_mark(UChar *c)
+int is_type_mark_or_punct(UChar *c)
 {
-	if      ((*c>=0x02B0)&&(*c<=0x02FF)) return 1; /* Spacing Modifier Letters */
-	else if ((*c>=0x0300)&&(*c<=0x036F)) return 1; /* Combining Diacritical Marks */
-	else if ((*c>=0x1DC0)&&(*c<=0x1DFF)) return 1; /* Combining Diacritical Marks Supplement */
-	else if ((*c>=0x1AB0)&&(*c<=0x1AFF)) return 1; /* Combining Diacritical Marks Extended */
-	else if ((*c>=0x3099)&&(*c<=0x309A)) return 1; /* Combining Kana Voiced Sound Marks */
-	else return 0;
+	UChar32 c32;
+
+	if (is_surrogate_pair(c))
+		c32=U16_GET_SUPPLEMENTARY(*c,*(c+1));
+	else c32=*c;
+
+	switch (u_charType(c32)) {
+	case U_MODIFIER_LETTER:
+	case U_DASH_PUNCTUATION: case U_START_PUNCTUATION: case U_END_PUNCTUATION:
+	case U_CONNECTOR_PUNCTUATION: case U_OTHER_PUNCTUATION:
+	case U_INITIAL_PUNCTUATION: case U_FINAL_PUNCTUATION:
+	case U_NON_SPACING_MARK: case U_ENCLOSING_MARK: case U_COMBINING_SPACING_MARK:
+		return 1;
+	default:
+		return 0;
+	}
 }
 
+int is_type_symbol(UChar *c)
+{
+	UChar32 c32;
+
+	if (is_surrogate_pair(c))
+		c32=U16_GET_SUPPLEMENTARY(*c,*(c+1));
+	else c32=*c;
+
+	switch (u_charType(c32)) {
+	case U_MODIFIER_SYMBOL:
+		return 1;
+	case U_MATH_SYMBOL: case U_CURRENCY_SYMBOL:
+	case U_OTHER_SYMBOL:
+		return 2;
+	default:
+		return 0;
+	}
+}
+
 int chkcontinue(struct page *p, int num)
 {
 	int i,j,cc=0,num1,num2,k1,k2;



More information about the tex-live-commits mailing list.