texlive[60292] Build/source/texk/bibtex-x: bibtexu: fix bug about upper/lower cases, collation

commits+takuji at tug.org commits+takuji at tug.org
Sat Aug 21 11:09:31 CEST 2021


Revision: 60292
          http://tug.org/svn/texlive?view=revision&revision=60292
Author:   takuji
Date:     2021-08-21 11:09:30 +0200 (Sat, 21 Aug 2021)
Log Message:
-----------
bibtexu: fix bug about upper/lower cases, collation

Modified Paths:
--------------
    trunk/Build/source/texk/bibtex-x/ChangeLog
    trunk/Build/source/texk/bibtex-x/bibtex-2.c
    trunk/Build/source/texk/bibtex-x/bibtex-4.c
    trunk/Build/source/texk/bibtex-x/gblprocs.h
    trunk/Build/source/texk/bibtex-x/tests/bibtexu-yannis.test
    trunk/Build/source/texk/bibtex-x/tests/ubasic.bbl
    trunk/Build/source/texk/bibtex-x/tests/ubasic.bib
    trunk/Build/source/texk/bibtex-x/tests/yannis.bbl
    trunk/Build/source/texk/bibtex-x/tests/yannis.bib

Modified: trunk/Build/source/texk/bibtex-x/ChangeLog
===================================================================
--- trunk/Build/source/texk/bibtex-x/ChangeLog	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/ChangeLog	2021-08-21 09:09:30 UTC (rev 60292)
@@ -1,3 +1,18 @@
+2021-08-21  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
+
+	* bibtex-4.c:
+	Fix bug of collation (check END_OF_STRING, compare arg1 and arg2).
+	* bibtex-2.c, gblproc.h:
+	Fix bug of making upper/lower cases.
+	It may remain a bug when lengths are different between
+	from-string and to-string, for example:
+	  U+00DF ß LATIN SMALL LETTER SHARP S, 0xC3 0x9F in UTF-8
+	  U+1E9E ẞ LATIN CAPITAL LETTER SHARP S, 0xE1 0xBA 0x9E in UTF-8
+	* tests/ubasic.{bib,bbl},yannis.{bib,bbl}:
+	Update.
+	* tests/bibtexu-yannis.test:
+	Reenable again (hopefully fixed the issue).
+
 2021-08-14  TANAKA Takuji  <ttk at t-lab.opal.ne.jp>
 
 	* bibtex.c, bibtex-[23].c, gbl{proc,vars}.h:
@@ -100,7 +115,7 @@
 
 	Merge bibtex8/ and bibtexu/ into bibtex-x (extended).
 	* bibtex-[234].c, bibtex.[ch], gblprocs.h, gblvars.h, utils.c:
-	Use '#ifndef UTF_8 <bibtex8 code> #ekse <bibtexu code> #endif'.
+	Use '#ifndef UTF_8 <bibtex8 code> #else <bibtexu code> #endif'.
 	* version.h: Set date to '18 mar 2013'.
 	Makefile.am, configure.ac: Adapted.
 

Modified: trunk/Build/source/texk/bibtex-x/bibtex-2.c
===================================================================
--- trunk/Build/source/texk/bibtex-x/bibtex-2.c	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/bibtex-2.c	2021-08-21 09:09:30 UTC (rev 60292)
@@ -2928,17 +2928,18 @@
 BEGIN
   StrEntLoc_T		ptr1,
 			ptr2;
+  Boolean_T		less_than;
 #ifdef UTF_8
 /*
 We use ICU Collator ucol_strcollUTF8() to compare the Unicode in UTF-8.
-There is an command line option "-o", "--location" to set the rule of collation.
+There is a command line option "-o", "--location" to set the rule of collation.
 */
   Integer_T lenk1, lenk2;
-  UBool u_less;
+  UBool u_cmp;
   UErrorCode err1 = U_ZERO_ERROR;
+  char *eos1, *eos2;
   const char *ustr1, *ustr2;
 #else
-  Boolean_T		less_than;
   Integer_T		char_ptr;
   ASCIICode_T		char1,
 			char2;
@@ -2953,10 +2954,12 @@
 #ifdef UTF_8
   ustr1 = (const char *)&ENTRY_STRS(ptr1, 0);
   ustr2 = (const char *)&ENTRY_STRS(ptr2, 0);
-  lenk1 = strlen(ustr1);
-  lenk2 = strlen(ustr2);
+  eos1 = strchr(ustr1, END_OF_STRING);
+  eos2 = strchr(ustr2, END_OF_STRING);
+  lenk1 = eos1 ? eos1-ustr1 : strlen(ustr1);
+  lenk2 = eos2 ? eos2-ustr2 : strlen(ustr2);
 
-  u_less = ucol_strcollUTF8(u_coll, ustr1, lenk1, ustr2, lenk2, &err1)==UCOL_LESS;
+  u_cmp = ucol_strcollUTF8(u_coll, ustr1, lenk1, ustr2, lenk2, &err1);
   if (!U_SUCCESS(err1))
   BEGIN
 	printf("Error in ucol_strcollUTF8.\n");
@@ -2966,11 +2969,22 @@
 #endif                      			/* TRACE */
   END
 
-#ifdef TRACE
-  if (Flag_trace)
-    TRACE_PR_LN2 ("... first is smaller than second? -- %s (ICU)", (u_less?"T":"F"));
-#endif                      			/* TRACE */
-  return u_less;
+  if (u_cmp==UCOL_EQUAL)
+      BEGIN
+        if (arg1 < arg2)
+        BEGIN
+	  COMPARE_RETURN (TRUE);
+	END
+        else if (arg1 > arg2)
+        BEGIN
+	  COMPARE_RETURN (FALSE);
+        END
+        else
+        BEGIN
+          CONFUSION ("Duplicate sort key");
+        END
+      END
+  less_than = u_cmp==UCOL_LESS;
 #else
   char_ptr = 0;
   LOOP
@@ -3013,6 +3027,7 @@
     END
     INCR (char_ptr);
   END
+#endif
 Exit_Label:
 #ifdef TRACE
   if (Flag_trace)
@@ -3019,7 +3034,6 @@
     TRACE_PR_LN2 ("... first is smaller than second? -- %s", (less_than?"T":"F"));
 #endif                      			/* TRACE */
   return (less_than);
-#endif
 END
 /*^^^^^^^^^^^^^^^^^^^^^^^^^^ END OF SECTION 301 ^^^^^^^^^^^^^^^^^^^^^^^^^^^*/
 

Modified: trunk/Build/source/texk/bibtex-x/bibtex-4.c
===================================================================
--- trunk/Build/source/texk/bibtex-x/bibtex-4.c	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/bibtex-4.c	2021-08-21 09:09:30 UTC (rev 60292)
@@ -459,15 +459,10 @@
 When we do lower_case_uni, the length of string have been changed. So we should do some job 
 for the precessing after lower case. Here there may be some potential bug.      23/sep/2009
 */
-		int16_t i=ex_buf_ptr;
 		int16_t llen;
 
-		while ((ex_buf[i] != COLON) && (ex_buf[i] != WHITE_SPACE) && (i< ex_buf_length))
-		BEGIN
-		  i++;		
-		END
-		llen=(i-ex_buf_ptr);
-		ex_buf_ptr=ex_buf_length-1+lower_case_uni (ex_buf, ex_buf_ptr, llen)+ex_buf_ptr;
+		llen=utf8len(ex_buf[ex_buf_ptr]);
+		ex_buf_ptr=ex_buf_ptr+lower_case_uni (ex_buf, ex_buf_ptr, llen)-1;
 #else
 		lower_case (ex_buf, ex_buf_ptr, 1);
 #endif
@@ -487,14 +482,10 @@
 Here the same for processing the length of string after change case. 23/sep/2009
 */
 	      BEGIN
-	        int16_t i=ex_buf_ptr;
-	        int16_t llen;
-	        while ((ex_buf[i] != COLON) && (ex_buf[i] != WHITE_SPACE) && (i< ex_buf_length))
-       	        BEGIN
-		  i++;		
-	        END
-	        llen=(i-ex_buf_ptr+1);
-		ex_buf_ptr=ex_buf_ptr-1+lower_case_uni (ex_buf, ex_buf_ptr, llen);
+		int16_t llen;
+
+		llen=utf8len(ex_buf[ex_buf_ptr]);
+		ex_buf_ptr=ex_buf_ptr+lower_case_uni (ex_buf, ex_buf_ptr, llen)-1;
 	      END
 #else
 	      lower_case (ex_buf, ex_buf_ptr, 1);
@@ -506,14 +497,10 @@
 Here the same for processing the length of string after change case. 23/sep/2009
 */
               BEGIN
-	        int16_t i=ex_buf_ptr;
-	        int16_t ulen;
-	        while ((ex_buf[i] != COLON) && (ex_buf[i] != WHITE_SPACE) && (i< ex_buf_length))
-       	        BEGIN
-		  i++;		
-	        END
-	        ulen=(i-ex_buf_ptr+1);
-	        ex_buf_ptr=ex_buf_ptr-1+upper_case_uni (ex_buf, ex_buf_ptr, ulen);
+		int16_t ulen;
+
+		ulen=utf8len(ex_buf[ex_buf_ptr]);
+		ex_buf_ptr=ex_buf_ptr+upper_case_uni (ex_buf, ex_buf_ptr, ulen)-1;
               END
 #else
 	      upper_case (ex_buf, ex_buf_ptr, 1);

Modified: trunk/Build/source/texk/bibtex-x/gblprocs.h
===================================================================
--- trunk/Build/source/texk/bibtex-x/gblprocs.h	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/gblprocs.h	2021-08-21 09:09:30 UTC (rev 60292)
@@ -115,6 +115,7 @@
                                 int32_t tucap,
                                 UChar * target,
                                 int32_t tarlen);
+#define  utf8len(a)  ((a)<0x80 ? 1 : ((a)<0xc2 ? -2 : ((a)<0xe0 ? 2 : ((a)<0xf0 ? 3 : ((a)<0xf5 ? 4 : -1)))))
 #endif
 void                    a_close (const AlphaFile_T file_pointer);
 Boolean_T               a_open_in (AlphaFile_T *file_pointer,

Modified: trunk/Build/source/texk/bibtex-x/tests/bibtexu-yannis.test
===================================================================
--- trunk/Build/source/texk/bibtex-x/tests/bibtexu-yannis.test	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/tests/bibtexu-yannis.test	2021-08-21 09:09:30 UTC (rev 60292)
@@ -8,7 +8,6 @@
 # https://tug.org/pipermail/tlbuild/2021q2/004918.html
 # 
 # Looking for someone interested in debugging ...
-exit 77
 
 test -d tests || mkdir -p tests
 

Modified: trunk/Build/source/texk/bibtex-x/tests/ubasic.bbl
===================================================================
--- trunk/Build/source/texk/bibtex-x/tests/ubasic.bbl	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/tests/ubasic.bbl	2021-08-21 09:09:30 UTC (rev 60292)
@@ -2,7 +2,7 @@
 
 \bibitem{li2018fuzzing}
 Jun Li, Bodong Zhao, and Chao Zhang.
-\newblock Fuzzing: a survey.
+\newblock Fuzzing: a {Survey} abc def.
 \newblock {\em Cybersecurity}, 1(1):6, 2018.
 
 \end{thebibliography}

Modified: trunk/Build/source/texk/bibtex-x/tests/ubasic.bib
===================================================================
--- trunk/Build/source/texk/bibtex-x/tests/ubasic.bib	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/tests/ubasic.bib	2021-08-21 09:09:30 UTC (rev 60292)
@@ -2,7 +2,7 @@
 % Public domain. Submitted by Dmitry Zheleznyakov, 2021.
 
 @article{li2018fuzzing,
-  title={Fuzzing: a survey},
+  title={Fuzzing: a {Survey} ABC Def},
   author={Li, Jun and Zhao, Bodong and Zhang, Chao},
   journal={Cybersecurity},
   volume={1},

Modified: trunk/Build/source/texk/bibtex-x/tests/yannis.bbl
===================================================================
--- trunk/Build/source/texk/bibtex-x/tests/yannis.bbl	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/tests/yannis.bbl	2021-08-21 09:09:30 UTC (rev 60292)
@@ -7,7 +7,7 @@
 
 \bibitem[Œtingen, 2009]{cle8}
 Œtingen, N. (2009).
-\newblock {\em Un livre}.
+\newblock {\em Un livre, ABC Def {GHI} {Jkl} ÀÁÂ Ñæß {ÀÁÂ} {Ñæß}}.
 \newblock Les Éditions du désastre.
 
 \bibitem[Tarantin, 2009]{cle4}
@@ -22,7 +22,7 @@
 
 \bibitem[Ἕλληνας, 2009]{cle7}
 Ἕλληνας, Μ. (2009).
-\newblock {\em Τὸ καλὸ παράδειγμα}.
+\newblock {\em Τὸ καλὸ παράδειγμα, ΑΒΓ {Δεζ} {ΗΘΙ}}.
 \newblock Ἐκδοτικὸς οἶκος «Δὲν βαριέσαι».
 
 \bibitem[Ζαχαρόπουλος, 2009]{cle5}
@@ -31,6 +31,11 @@
   κάνω, ἀφοῦ σὲ χάνω}.
 \newblock Ἐκδόσεις τῶν χαμένων ἐλπίδων.
 
+\bibitem[Достое́вский, 1880]{cle9}
+Достое́вский, Ф.~М. (1880).
+\newblock {\em Братья Карамазовы, АБВ {Где} {ЖЗИ}}.
+\newblock Русскій Вѣстникъ.
+
 \bibitem[天堂最新的章节表, 2009]{cle3}
 天堂最新的章节表, 格. (2009).
 \newblock {\em 製版・文書処理システム Ω}.

Modified: trunk/Build/source/texk/bibtex-x/tests/yannis.bib
===================================================================
--- trunk/Build/source/texk/bibtex-x/tests/yannis.bib	2021-08-20 23:49:00 UTC (rev 60291)
+++ trunk/Build/source/texk/bibtex-x/tests/yannis.bib	2021-08-21 09:09:30 UTC (rev 60292)
@@ -40,11 +40,17 @@
 @book{cle7,
 	Author = {Ἕλληνας, Μῆτσος},
 	Publisher = {Ἐκδοτικὸς οἶκος «Δὲν βαριέσαι»},
-	Title = {Τὸ καλὸ παράδειγμα},
+	Title = {Τὸ καλὸ παράδειγμα, ΑΒΓ {Δεζ} {ΗΘΙ}},
 	Year = {2009}}
 
 @book{cle8,
 	Author = {Œtingen, Nicolas},
 	Publisher = {Les Éditions du désastre},
-	Title = {Un livre},
+	Title = {Un livre, ABC Def {GHI} {Jkl} ÀÁÂ Ñæß {ÀÁÂ} {Ñæß}},
 	Year = {2009}}
+
+@book{cle9,
+	Author = {Фёдор Миха́йлович Достое́вский},
+	Publisher = {Русскій Вѣстникъ},
+	Title = {Братья Карамазовы, АБВ {Где} {ЖЗИ}},
+	Year = {1880}}



More information about the tex-live-commits mailing list.