texlive[60239] Build/source/texk/bibtex-x: bibtexu: clean up usage of ICU functions

commits+takuji at tug.org commits+takuji at tug.org
Sat Aug 14 16:03:26 CEST 2021


Revision: 60239
          http://tug.org/svn/texlive?view=revision&revision=60239
Author:   takuji
Date:     2021-08-14 16:03:26 +0200 (Sat, 14 Aug 2021)
Log Message:
-----------
bibtexu: clean up usage of ICU functions

Modified Paths:
--------------
    trunk/Build/source/texk/bibtex-x/ChangeLog
    trunk/Build/source/texk/bibtex-x/bibtex-2.c
    trunk/Build/source/texk/bibtex-x/bibtex-3.c
    trunk/Build/source/texk/bibtex-x/bibtex.c
    trunk/Build/source/texk/bibtex-x/gblprocs.h
    trunk/Build/source/texk/bibtex-x/gblvars.h

Modified: trunk/Build/source/texk/bibtex-x/ChangeLog
===================================================================
--- trunk/Build/source/texk/bibtex-x/ChangeLog	2021-08-13 23:48:51 UTC (rev 60238)
+++ trunk/Build/source/texk/bibtex-x/ChangeLog	2021-08-14 14:03:26 UTC (rev 60239)
@@ -1,3 +1,11 @@
+2021-08-14  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
+
+	* bibtex.c, bibtex-[23].c, gbl{procs,vars}.h:
+	Use ucol_strcollUTF8() for collation.
+	Use u_strToUTF8WithSub() in icu_fromUChars() and
+	u_strFromUTF8WithSub() in icu_toUChars().
+	Initialize ICU collator only once.
+
 2021-06-21  Karl Berry  <karl at freefriends.org>
 
 	* tests/bibtexu-yannis.test: disable again. Fails on

Modified: trunk/Build/source/texk/bibtex-x/bibtex-2.c
===================================================================
--- trunk/Build/source/texk/bibtex-x/bibtex-2.c	2021-08-13 23:48:51 UTC (rev 60238)
+++ trunk/Build/source/texk/bibtex-x/bibtex-2.c	2021-08-14 14:03:26 UTC (rev 60239)
@@ -2930,18 +2930,13 @@
 			ptr2;
 #ifdef UTF_8
 /*
-We use ICU libs to processing UTF-8. First, we have to transform UTF-8 to 
-Unicode/UChar with the fonction icu_UCHars. Then we use the UCollator 
-in the ICU libs to conparer the Unicode. There is an option "location", 
-we use "-o" to indicate the rule of conpare.             23/sep/2009
+We use ICU Collator ucol_strcollUTF8() to compare the Unicode in UTF-8.
+There is a command-line option "-o", "--location" to set the rule of collation.
 */
   Integer_T lenk1, lenk2;
-  UChar uch1[BUF_SIZE+1], uch2[BUF_SIZE+1];
   UBool u_less;
-  UCollator * ucol1;
-  int32_t ucap = BUF_SIZE+1;
-  int32_t uchlen1, uchlen2;
   UErrorCode err1 = U_ZERO_ERROR;
+  const char *ustr1, *ustr2;
 #else
   Boolean_T		less_than;
   Integer_T		char_ptr;
@@ -2956,59 +2951,25 @@
   ptr1 = (arg1 * num_ent_strs) + sort_key_num;
   ptr2 = (arg2 * num_ent_strs) + sort_key_num;
 #ifdef UTF_8
-  lenk1 = strlen((char *)&ENTRY_STRS(ptr1, 0));
-  lenk2 = strlen((char *)&ENTRY_STRS(ptr2, 0));
+  ustr1 = (const char *)&ENTRY_STRS(ptr1, 0);
+  ustr2 = (const char *)&ENTRY_STRS(ptr2, 0);
+  lenk1 = strlen(ustr1);
+  lenk2 = strlen(ustr2);
 
-/*
-icu_toUChars() seems not working here, using u_strFromUTF8 instead. (04/mar/2019)
-*/
-
-/*
-Use u_strFromUTF8WithSub() with a substitution character 0xfffd,
-instead of u_strFromUTF8(). (05/mar/2019)
-If err1 != U_ZERO_ERROR, the original functions are used. (06/mar/2019)
-*/
-
-/*
-  uchlen1 = icu_toUChars(entry_strs, (ptr1 * (ENT_STR_SIZE+1)), lenk1, uch1, ucap);
-  uchlen2 = icu_toUChars(entry_strs, (ptr2 * (ENT_STR_SIZE+1)), lenk2, uch2, ucap);
-*/
-
-  u_strFromUTF8WithSub(uch1, ucap, &uchlen1, (char *)&ENTRY_STRS(ptr1, 0), lenk1, 0xfffd, NULL, &err1);
-  if (!U_SUCCESS(err1)) {
-    printf("Error in u_strFromUTF8WithSub 1.\n");
+  u_less = ucol_strcollUTF8(u_coll, ustr1, lenk1, ustr2, lenk2, &err1)==UCOL_LESS;
+  if (!U_SUCCESS(err1))
+  BEGIN
+	printf("Error in ucol_strcollUTF8.\n");
 #ifdef TRACE
-    if (Flag_trace)
-      TRACE_PR_LN ("Error in u_strFromUTF8WithSub 1");
+	if (Flag_trace)
+		TRACE_PR_LN ("Error in ucol_strcollUTF8");
 #endif                      			/* TRACE */
-    uchlen1 = icu_toUChars(entry_strs, (ptr1 * (ENT_STR_SIZE+1)), lenk1, uch1, ucap);
-    err1 = U_ZERO_ERROR;
-  }
+  END
 
-  u_strFromUTF8WithSub(uch2, ucap, &uchlen2, (char *)&ENTRY_STRS(ptr2, 0), lenk2, 0xfffd, NULL, &err1);
-  if (!U_SUCCESS(err1)) {
-    printf("Error in u_strFromUTF8WithSub 2.\n");
 #ifdef TRACE
-    if (Flag_trace)
-      TRACE_PR_LN ("Error in u_strFromUTF8WithSub 2");
-#endif                      			/* TRACE */
-    uchlen2 = icu_toUChars(entry_strs, (ptr2 * (ENT_STR_SIZE+1)), lenk2, uch2, ucap);
-    err1 = U_ZERO_ERROR;
-  }
-
-  if(Flag_location)
-    ucol1 = ucol_open(Str_location, &err1);
-  else
-    ucol1 = ucol_open(NULL, &err1);
-  if (!U_SUCCESS(err1))
-    printf("Error in opening a ucol in less_than.\n");
-  u_less = !ucol_greaterOrEqual(ucol1, uch1, uchlen1, uch2, uchlen2);
-#ifdef TRACE
   if (Flag_trace)
     TRACE_PR_LN2 ("... first is smaller than second? -- %s (ICU)", (u_less?"T":"F"));
 #endif                      			/* TRACE */
-
-  ucol_close(ucol1);
   return u_less;
 #else
   char_ptr = 0;
@@ -3100,10 +3061,10 @@
 
 #ifdef UTF_8
 /*
-"lower_case_uni" is the fonction for processing the characters, actually the UTF-8.
+"lower_case_uni" is the function for processing the characters, actually the UTF-8.
 We transform UTF-8 to Unicode, then to low case, then back to UTF-8 for output.
 When we transform the character, the length have been changed. So we have do 
-some job for the length. And the output of this fonction we should be careful 
+some job for the length. And the output of this function we should be careful 
 to the length.                                                   23/sep/2009
 */
 BufPointer_T       lower_case_uni (BufType_T buf, BufPointer_T bf_ptr,
@@ -3156,20 +3117,16 @@
 
 
 /*
-This fonction is for transform UTF-8 to Unicode with ICU libs.		 23/sep/2009
+This function transforms UTF-8 to Unicode with ICU libs. 23/sep/2009
 */
 int32_t icu_toUChars(BufType_T buf, BufPointer_T bf_ptr,BufPointer_T len,UChar * target, int32_t tarcap)
 BEGIN
-	UConverter * ucon1;
 	UErrorCode err1 = U_ZERO_ERROR;
-	ucon1 = ucnv_open(NULL, &err1);
+	int32_t tulen;
+
+	u_strFromUTF8WithSub(target, tarcap, &tulen, (char *)&buf[bf_ptr], len, 0xfffd, NULL, &err1);
 	if (!U_SUCCESS(err1))
 	BEGIN
-		printf("Error in opening a ucnv in icu_toUChars.\n");
-	END
-	ucnv_toUChars(ucon1, target, tarcap, (char *)&buf[bf_ptr], len, &err1);
-	if (!U_SUCCESS(err1))
-	BEGIN
 		printf("Error in icu_toUChars.\n");
 #ifdef TRACE
 		if (Flag_trace)
@@ -3176,22 +3133,17 @@
 			TRACE_PR_LN ("Error in icu_toUChars");
 #endif                      			/* TRACE */
 	END
-	ucnv_close(ucon1);
 	
-	return len;
+	return tulen;
 END
 
 /*
-This fonction is for transform Unicode string to low case. 23/sep/2009
+This function transforms a Unicode string to lower case. 23/sep/2009
 */
 int32_t icu_strToLower(UChar * tarlow, int32_t tlcap, UChar * target, int32_t tarlen)
 BEGIN
 	int32_t tllen;
 	UErrorCode err1 = U_ZERO_ERROR;
-	if (!U_SUCCESS(err1))
-	BEGIN
-		printf("Error in icu_strToLower?\n");
-	END
 	if (Flag_language)
 	{
 		tllen=u_strToLower(tarlow,tlcap, target,tarlen,Str_language,&err1);
@@ -3225,21 +3177,16 @@
 
 
 /*
-This fonction is for transform Unicode to UTF-8. 23/sep/2009
+This function transforms Unicode to UTF-8. 23/sep/2009
 */
 int32_t icu_fromUChars(unsigned char * dest, int32_t destcap, const UChar * src, int32_t srclen)
 BEGIN
-	UConverter * ucon2;
 	UErrorCode err2 = U_ZERO_ERROR;
 	int32_t tblen;
-	ucon2 = ucnv_open(NULL, &err2);
+
+	u_strToUTF8WithSub((char *)dest, destcap, &tblen, src, srclen, 0xfffd, NULL, &err2);
 	if (!U_SUCCESS(err2))
 	BEGIN
-		printf("Error in opening a ucnv in icu_fromUChars.\n");
-	END
-	tblen=ucnv_fromUChars(ucon2, (char *)dest, destcap, src, srclen, &err2);
-	if (!U_SUCCESS(err2))
-	BEGIN
 		printf("Error in icu_fromUChars.\n");
 #ifdef TRACE
 		if (Flag_trace)

Modified: trunk/Build/source/texk/bibtex-x/bibtex-3.c
===================================================================
--- trunk/Build/source/texk/bibtex-x/bibtex-3.c	2021-08-13 23:48:51 UTC (rev 60238)
+++ trunk/Build/source/texk/bibtex-x/bibtex-3.c	2021-08-14 14:03:26 UTC (rev 60239)
@@ -3134,7 +3134,7 @@
 END
 
 /*
-This fonction is for transform Unicode string to up case. 23/sep/2009
+This function transforms a Unicode string to upper case. 23/sep/2009
 */
 
 int32_t icu_strToUpper(UChar * tarup, int32_t tucap, UChar * target, int32_t tarlen)

Modified: trunk/Build/source/texk/bibtex-x/bibtex.c
===================================================================
--- trunk/Build/source/texk/bibtex-x/bibtex.c	2021-08-13 23:48:51 UTC (rev 60238)
+++ trunk/Build/source/texk/bibtex-x/bibtex.c	2021-08-14 14:03:26 UTC (rev 60239)
@@ -282,9 +282,19 @@
         {
             UVersionInfo icuVersion;
             char icu_version[U_MAX_VERSION_STRING_LENGTH] = "";
+            UErrorCode err1 = U_ZERO_ERROR;
             u_getVersion(icuVersion);
             u_versionToString(icuVersion, icu_version);
             FPRINTF (log_file, "Compiled with:   ICU version %s\n", icu_version);
+
+            if (Flag_location)
+                u_coll = ucol_open(Str_location, &err1);
+            else
+                u_coll = ucol_open(NULL, &err1);
+            if (!U_SUCCESS(err1)) {
+                FPRINTF (log_file, "Error in opening ICU collator.\n");
+                exit(FATAL_EXIT_STATUS);
+            }
         }
 #endif
         FPRINTF (log_file, "\n");
@@ -379,6 +389,9 @@
       END
       get_bst_command_and_process ();
     END
+#ifdef UTF_8
+    ucol_close(u_coll);
+#endif
 Bst_Done_Label:
     a_close (bst_file);
 No_Bst_File_Label:

Modified: trunk/Build/source/texk/bibtex-x/gblprocs.h
===================================================================
--- trunk/Build/source/texk/bibtex-x/gblprocs.h	2021-08-13 23:48:51 UTC (rev 60238)
+++ trunk/Build/source/texk/bibtex-x/gblprocs.h	2021-08-14 14:03:26 UTC (rev 60239)
@@ -97,7 +97,6 @@
 #include "unicode/uchar.h"
 #include "unicode/ustdio.h"
 #include "unicode/ustring.h"
-#include "unicode/ucnv.h"
 #include "unicode/ucol.h"
 int32_t                 icu_toUChars (BufType_T buf,
                                       BufPointer_T bf_ptr,

Modified: trunk/Build/source/texk/bibtex-x/gblvars.h
===================================================================
--- trunk/Build/source/texk/bibtex-x/gblvars.h	2021-08-13 23:48:51 UTC (rev 60238)
+++ trunk/Build/source/texk/bibtex-x/gblvars.h	2021-08-14 14:03:26 UTC (rev 60239)
@@ -218,7 +218,7 @@
 __EXTERN__ Integer_T                    glob_chr_ptr;
 
 __EXTERN__ Boolean_T                    hash_found;
-__EXTERN__ Integer16_T		        hash_used;
+__EXTERN__ Integer16_T                  hash_used;
 __EXTERN__ Integer8_T                   history;
 
 __EXTERN__ Integer_T                    impl_fn_num;
@@ -376,7 +376,7 @@
 */
 __EXTERN__ unsigned char                c8upcase[LAST_TEXT_CHAR + 1];
 __EXTERN__ unsigned char                c8lowcase[LAST_TEXT_CHAR + 1];
-__EXTERN__ int 			        c8order[LAST_TEXT_CHAR + 1];
+__EXTERN__ int                          c8order[LAST_TEXT_CHAR + 1];
 #endif                          /* SUPPORT_8BIT */
 
 
@@ -427,6 +427,7 @@
 __EXTERN__ char                        *Str_language;
 __EXTERN__ Boolean_T                    Flag_location;
 __EXTERN__ char                        *Str_location;
+__EXTERN__ UCollator *                  u_coll;
 #endif
 __EXTERN__ Boolean_T                    Flag_7bit;
 __EXTERN__ Boolean_T                    Flag_8bit;



More information about the tex-live-commits mailing list.