diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..d87f5613ec4234f82f8eaeebc563711f587fdf88 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.xz filter=lfs diff=lfs merge=lfs -text diff --git a/.lfsconfig b/.lfsconfig new file mode 100644 index 0000000000000000000000000000000000000000..3fdd4bd555d97c9354bc9b72f461e9236370f6d9 --- /dev/null +++ b/.lfsconfig @@ -0,0 +1,2 @@ +[lfs] + url = https://artlfs.openeuler.openatom.cn/src-openEuler/coreutils diff --git a/backport-cat-don-t-trust-st_size-on-proc-files.patch b/backport-cat-don-t-trust-st_size-on-proc-files.patch deleted file mode 100644 index 2324d7a2834bebb578de52de3d988be339fd19fb..0000000000000000000000000000000000000000 --- a/backport-cat-don-t-trust-st_size-on-proc-files.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 225cb8d7473eadb481a4884e929bf23589d4bd82 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 6 Apr 2024 15:13:23 -0700 -Subject: [PATCH] =?UTF-8?q?cat:=20don=E2=80=99t=20trust=20st=5Fsize=20on?= - =?UTF-8?q?=20/proc=20files?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -* src/cat.c (main): -Improve test for when copying will exhaust the output device. -Do not rely on st_size, which is unreliable in /proc. -Use lseek instead; this is good enough here. -* tests/cat/cat-self.sh: Test the relaxation of the heuristic -for self-copying. ---- - src/cat.c | 31 +++++++++++++++++++++---------- - tests/cat/cat-self.sh | 20 ++++++++++++++++++++ - 2 files changed, 41 insertions(+), 10 deletions(-) - -diff --git a/src/cat.c b/src/cat.c -index 4ed404363..b33faeb35 100644 ---- a/src/cat.c -+++ b/src/cat.c -@@ -645,9 +645,10 @@ main (int argc, char **argv) - /* Optimal size of i/o operations of output. */ - idx_t outsize = io_blksize (&stat_buf); - -- /* Device and I-node number of the output. */ -+ /* Device, I-node number and lazily-acquired flags of the output. */ - dev_t out_dev = stat_buf.st_dev; - ino_t out_ino = stat_buf.st_ino; -+ int out_flags = -2; - - /* True if the output is a regular file. */ - bool out_isreg = S_ISREG (stat_buf.st_mode) != 0; -@@ -701,17 +702,27 @@ main (int argc, char **argv) - - fdadvise (input_desc, 0, 0, FADVISE_SEQUENTIAL); - -- /* Don't copy a nonempty regular file to itself, as that would -- merely exhaust the output device. It's better to catch this -- error earlier rather than later. */ -+ /* Don't copy a file to itself if that would merely exhaust the -+ output device. It's better to catch this error earlier -+ rather than later. */ - -- if (out_isreg -- && stat_buf.st_dev == out_dev && stat_buf.st_ino == out_ino -- && lseek (input_desc, 0, SEEK_CUR) < stat_buf.st_size) -+ if (stat_buf.st_dev == out_dev && stat_buf.st_ino == out_ino) - { -- error (0, 0, _("%s: input file is output file"), quotef (infile)); -- ok = false; -- goto contin; -+ if (out_flags < -1) -+ out_flags = fcntl (STDOUT_FILENO, F_GETFL); -+ bool exhausting = 0 <= out_flags && out_flags & O_APPEND; -+ if (!exhausting) -+ { -+ off_t in_pos = lseek (input_desc, 0, SEEK_CUR); -+ if (0 <= in_pos) -+ exhausting = in_pos < lseek (STDOUT_FILENO, 0, SEEK_CUR); -+ } -+ if (exhausting) -+ { -+ error (0, 0, _("%s: input file is output file"), quotef (infile)); -+ ok = false; -+ goto contin; -+ } - } - - /* Pointer to the input buffer. */ -diff --git a/tests/cat/cat-self.sh b/tests/cat/cat-self.sh -index e0f6455c0..854825def 100755 ---- a/tests/cat/cat-self.sh -+++ b/tests/cat/cat-self.sh -@@ -30,4 +30,24 @@ echo y >doc.end || framework_failure_ - cat doc doc.end >doc || fail=1 - compare doc doc.end || fail=1 - -+# This terminates even though it copies a file to itself. -+# Coreutils 9.5 and earlier rejected this. -+echo x >fx || framework_failure_ -+echo y >fy || framework_failure_ -+cat fx fy >fxy || fail=1 -+for i in 1 2; do -+ cat fx >fxy$i || fail=1 -+done -+for i in 3 4 5 6; do -+ cat fx >fx$i || fail=1 -+done -+cat - fy fxy1 || fail=1 -+compare fxy fxy1 || fail=1 -+cat fxy2 fy 1<>fxy2 || fail=1 -+compare fxy fxy2 || fail=1 -+returns_ 1 cat fx fx3 1<>fx3 || fail=1 -+returns_ 1 cat - fx4 fx4 || fail=1 -+returns_ 1 cat fx5 >>fx5 || fail=1 -+returns_ 1 cat >fx6 || fail=1 -+ - Exit $fail --- -2.43.0 - diff --git a/backport-chroot-whoami-use-uintmax_t-for-printing-uids.patch b/backport-chroot-whoami-use-uintmax_t-for-printing-uids.patch deleted file mode 100644 index e07aa6d7eadc4f33fefebc7e3f8c2640e2c20054..0000000000000000000000000000000000000000 --- a/backport-chroot-whoami-use-uintmax_t-for-printing-uids.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 8083944484f2cdf6c9b737642567bcdb54db784d Mon Sep 17 00:00:00 2001 -From: Collin Funk -Date: Sun, 6 Oct 2024 22:19:51 -0700 -Subject: [PATCH] chroot,whoami: use uintmax_t for printing uids - -Reference:https://github.com/coreutils/coreutils/commit/8083944484f2cdf6c9b737642567bcdb54db784d -* src/chroot.c (main): Cast the uid to uintmax_t instead of int. -* src/whoami.c (main): Cast the uid to uintmax_t instead of unsigned -long int. ---- - src/chroot.c | 3 ++- - src/whoami.c | 4 ++-- - 2 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/src/chroot.c b/src/chroot.c -index e12454c..dd1ec62 100644 ---- a/src/chroot.c -+++ b/src/chroot.c -@@ -375,7 +375,8 @@ main (int argc, char **argv) - else if (gid_unset (gid)) - { - error (EXIT_CANCELED, errno, -- _("no group specified for unknown uid: %d"), (int) uid); -+ _("no group specified for unknown uid: %ju"), -+ (uintmax_t) uid); - } - } - -diff --git a/src/whoami.c b/src/whoami.c -index f64171d..834ca9f 100644 ---- a/src/whoami.c -+++ b/src/whoami.c -@@ -81,8 +81,8 @@ main (int argc, char **argv) - uid = geteuid (); - pw = uid == NO_UID && errno ? nullptr : getpwuid (uid); - if (!pw) -- error (EXIT_FAILURE, errno, _("cannot find name for user ID %lu"), -- (unsigned long int) uid); -+ error (EXIT_FAILURE, errno, _("cannot find name for user ID %ju"), -+ (uintmax_t) uid); - puts (pw->pw_name); - return EXIT_SUCCESS; - } --- -2.27.0 - diff --git a/backport-coreutils-df-direct.patch b/backport-coreutils-df-direct.patch deleted file mode 100644 index 9e3434aeabec5735c3d3c048ff44a41584046c83..0000000000000000000000000000000000000000 --- a/backport-coreutils-df-direct.patch +++ /dev/null @@ -1,187 +0,0 @@ -From 6e36198f10a2f63b89c89ebb5d5c185b20fb3a63 Mon Sep 17 00:00:00 2001 -From: Kamil Dudka -Date: Mon, 29 Mar 2010 17:20:34 +0000 -Subject: [PATCH] coreutils-df-direct.patch - ---- - doc/coreutils.texi | 7 ++++++ - src/df.c | 34 ++++++++++++++++++++++++++-- - tests/df/direct.sh | 55 ++++++++++++++++++++++++++++++++++++++++++++++ - 3 files changed, 94 insertions(+), 2 deletions(-) - create mode 100755 tests/df/direct.sh - -diff --git a/doc/coreutils.texi b/doc/coreutils.texi -index 5b9a597..6810c15 100644 ---- a/doc/coreutils.texi -+++ b/doc/coreutils.texi -@@ -12074,6 +12074,13 @@ some systems (notably Solaris), doing this yields more up to date results, - but in general this option makes @command{df} much slower, especially when - there are many or very busy file systems. - -+@item --direct -+@opindex --direct -+@cindex direct statfs for a file -+Do not resolve mount point and show statistics directly for a file. It can be -+especially useful for NFS mount points if there is a boundary between two -+storage policies behind the mount point. -+ - @item --total - @opindex --total - @cindex grand total of file system size, usage and available space -diff --git a/src/df.c b/src/df.c -index 48025b9..c8efa5b 100644 ---- a/src/df.c -+++ b/src/df.c -@@ -125,6 +125,9 @@ static bool print_type; - /* If true, print a grand total at the end. */ - static bool print_grand_total; - -+/* If true, show statistics for a file instead of mount point. */ -+static bool direct_statfs; -+ - /* Grand total data. */ - static struct fs_usage grand_fsu; - -@@ -252,13 +255,15 @@ enum - NO_SYNC_OPTION = CHAR_MAX + 1, - SYNC_OPTION, - TOTAL_OPTION, -- OUTPUT_OPTION -+ OUTPUT_OPTION, -+ DIRECT_OPTION - }; - - static struct option const long_options[] = - { - {"all", no_argument, nullptr, 'a'}, - {"block-size", required_argument, nullptr, 'B'}, -+ {"direct", no_argument, nullptr, DIRECT_OPTION}, - {"inodes", no_argument, nullptr, 'i'}, - {"human-readable", no_argument, nullptr, 'h'}, - {"si", no_argument, nullptr, 'H'}, -@@ -583,7 +588,10 @@ get_header (void) - for (col = 0; col < ncolumns; col++) - { - char *cell = nullptr; -- char const *header = _(columns[col]->caption); -+ char const *header = (columns[col]->field == TARGET_FIELD -+ && direct_statfs)? -+ _("File") : -+ _(columns[col]->caption); - - if (columns[col]->field == SIZE_FIELD - && (header_mode == DEFAULT_MODE -@@ -1486,6 +1494,17 @@ get_point (char const *point, const struct stat *statp) - static void - get_entry (char const *name, struct stat const *statp) - { -+ if (direct_statfs) -+ { -+ char *resolved = canonicalize_file_name (name); -+ if (resolved) -+ { -+ get_dev (NULL, resolved, name, NULL, NULL, false, false, NULL, false); -+ free (resolved); -+ return; -+ } -+ } -+ - if ((S_ISBLK (statp->st_mode) || S_ISCHR (statp->st_mode)) - && get_device (name)) - return; -@@ -1556,6 +1575,7 @@ or all file systems by default.\n\ - -B, --block-size=SIZE scale sizes by SIZE before printing them; e.g.,\n\ - '-BM' prints sizes in units of 1,048,576 bytes;\n\ - see SIZE format below\n\ -+ --direct show statistics for a file instead of mount point\n\ - -h, --human-readable print sizes in powers of 1024 (e.g., 1023M)\n\ - -H, --si print sizes in powers of 1000 (e.g., 1.1G)\n\ - "), stdout); -@@ -1646,6 +1666,9 @@ main (int argc, char **argv) - xstrtol_fatal (e, oi, c, long_options, optarg); - } - break; -+ case DIRECT_OPTION: -+ direct_statfs = true; -+ break; - case 'i': - if (header_mode == OUTPUT_MODE) - { -@@ -1742,6 +1765,13 @@ main (int argc, char **argv) - } - } - -+ if (direct_statfs && show_local_fs) -+ { -+ error (0, 0, _("options --direct and --local (-l) are mutually " -+ "exclusive")); -+ usage (EXIT_FAILURE); -+ } -+ - if (human_output_opts == -1) - { - if (posix_format) -diff --git a/tests/df/direct.sh b/tests/df/direct.sh -new file mode 100755 -index 0000000..8e4cfb8 ---- /dev/null -+++ b/tests/df/direct.sh -@@ -0,0 +1,55 @@ -+#!/bin/sh -+# Ensure "df --direct" works as documented -+ -+# Copyright (C) 2010 Free Software Foundation, Inc. -+ -+# This program is free software: you can redistribute it and/or modify -+# it under the terms of the GNU General Public License as published by -+# the Free Software Foundation, either version 3 of the License, or -+# (at your option) any later version. -+ -+# This program is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+# GNU General Public License for more details. -+ -+# You should have received a copy of the GNU General Public License -+# along with this program. If not, see . -+ -+. "${srcdir=.}/init.sh"; path_prepend_ ../src -+print_ver_ df -+ -+df || skip_ "df fails" -+ -+DIR=`pwd` || framework_failure -+FILE="$DIR/file" -+touch "$FILE" || framework_failure -+echo "$FILE" > file_exp || framework_failure -+echo "Mounted on" > header_mounted_exp || framework_failure -+echo "File" > header_file_exp || framework_failure -+ -+fail=0 -+ -+df --portability "$FILE" > df_out || fail=1 -+df --portability --direct "$FILE" > df_direct_out || fail=1 -+df --portability --direct --local "$FILE" > /dev/null 2>&1 && fail=1 -+ -+# check df header -+$AWK '{ if (NR==1) print $6 " " $7; }' df_out > header_mounted_out \ -+ || framework_failure -+$AWK '{ if (NR==1) print $6; }' df_direct_out > header_file_out \ -+ || framework_failure -+compare header_mounted_out header_mounted_exp || fail=1 -+compare header_file_out header_file_exp || fail=1 -+ -+# check df output (without --direct) -+$AWK '{ if (NR==2) print $6; }' df_out > file_out \ -+ || framework_failure -+compare file_out file_exp && fail=1 -+ -+# check df output (with --direct) -+$AWK '{ if (NR==2) print $6; }' df_direct_out > file_out \ -+ || framework_failure -+compare file_out file_exp || fail=1 -+ -+Exit $fail --- -2.31.1 - diff --git a/backport-coreutils-i18n.patch b/backport-coreutils-i18n.patch deleted file mode 100644 index aa21a355250ee1bed44ef282ff0edb3d642852c6..0000000000000000000000000000000000000000 --- a/backport-coreutils-i18n.patch +++ /dev/null @@ -1,5187 +0,0 @@ -From 94cf02dfcb1be23dedf8a39af295f28ee2de6013 Mon Sep 17 00:00:00 2001 -From: rpm-build -Date: Wed, 30 Aug 2023 17:19:58 +0200 -Subject: [PATCH] coreutils-i18n.patch - ---- - bootstrap.conf | 2 + - configure.ac | 6 + - lib/linebuffer.h | 8 + - lib/mbchar.c | 23 ++ - lib/mbchar.h | 373 +++++++++++++++++ - lib/mbfile.c | 20 + - lib/mbfile.h | 267 ++++++++++++ - m4/mbchar.m4 | 13 + - m4/mbfile.m4 | 14 + - src/cut.c | 508 +++++++++++++++++++++-- - src/expand-common.c | 114 ++++++ - src/expand-common.h | 12 + - src/expand.c | 90 +++- - src/fold.c | 312 ++++++++++++-- - src/local.mk | 4 +- - src/pr.c | 443 ++++++++++++++++++-- - src/sort.c | 792 +++++++++++++++++++++++++++++++++--- - src/unexpand.c | 102 ++++- - tests/Coreutils.pm | 3 + - tests/expand/mb.sh | 183 +++++++++ - tests/i18n/sort.sh | 29 ++ - tests/local.mk | 4 + - tests/misc/expand.pl | 42 ++ - tests/misc/fold.pl | 50 ++- - tests/misc/sort-mb-tests.sh | 45 ++ - tests/misc/unexpand.pl | 39 ++ - tests/pr/pr-tests.pl | 49 +++ - tests/sort/sort-merge.pl | 42 ++ - tests/sort/sort.pl | 40 +- - tests/unexpand/mb.sh | 172 ++++++++ - 30 files changed, 3605 insertions(+), 196 deletions(-) - create mode 100644 lib/mbchar.c - create mode 100644 lib/mbchar.h - create mode 100644 lib/mbfile.c - create mode 100644 lib/mbfile.h - create mode 100644 m4/mbchar.m4 - create mode 100644 m4/mbfile.m4 - create mode 100644 tests/expand/mb.sh - create mode 100644 tests/i18n/sort.sh - create mode 100644 tests/misc/sort-mb-tests.sh - create mode 100644 tests/unexpand/mb.sh - -diff --git a/bootstrap.conf b/bootstrap.conf -index 126e1e8..b4ccebf 100644 ---- a/bootstrap.conf -+++ b/bootstrap.conf -@@ -163,6 +163,8 @@ gnulib_modules=" - maintainer-makefile - malloc-gnu - manywarnings -+ mbchar -+ mbfile - mbrlen - mbrtoc32 - mbrtowc -diff --git a/configure.ac b/configure.ac -index 9cb6ee1..1131ce3 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -504,6 +504,12 @@ fi - # I'm leaving it here for now. This whole thing needs to be modernized... - gl_WINSIZE_IN_PTEM - -+gl_MBFILE -+dnl Do not use gl_MODULE_INDICATOR([mbfile]) here: we don't want 'struct mbchar' -+dnl to have a different size in lib/ than in tests/. -+AC_DEFINE([GNULIB_MBFILE], [1], -+ [Define to 1 if the gnulib module 'mbfile' is in use.]) -+ - gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H - - if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \ -diff --git a/lib/linebuffer.h b/lib/linebuffer.h -index ae0d55d..5bf5350 100644 ---- a/lib/linebuffer.h -+++ b/lib/linebuffer.h -@@ -22,6 +22,11 @@ - # include "idx.h" - # include - -+/* Get mbstate_t. */ -+# if HAVE_WCHAR_H -+# include -+# endif -+ - /* A 'struct linebuffer' holds a line of text. */ - - struct linebuffer -@@ -29,6 +34,9 @@ struct linebuffer - idx_t size; /* Allocated. */ - idx_t length; /* Used. */ - char *buffer; -+# if HAVE_WCHAR_H -+ mbstate_t state; -+# endif - }; - - /* Initialize linebuffer LINEBUFFER for use. */ -diff --git a/lib/mbchar.c b/lib/mbchar.c -new file mode 100644 -index 0000000..d94b7c3 ---- /dev/null -+++ b/lib/mbchar.c -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2001, 2006, 2009-2024 Free Software Foundation, Inc. -+ -+ This file is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ This file is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with this program. If not, see . */ -+ -+ -+#include -+ -+#define MBCHAR_INLINE _GL_EXTERN_INLINE -+ -+#include -+ -+#include "mbchar.h" -diff --git a/lib/mbchar.h b/lib/mbchar.h -new file mode 100644 -index 0000000..c06ef11 ---- /dev/null -+++ b/lib/mbchar.h -@@ -0,0 +1,373 @@ -+/* Multibyte character data type. -+ Copyright (C) 2001, 2005-2007, 2009-2024 Free Software Foundation, Inc. -+ -+ This file is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ This file is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with this program. If not, see . */ -+ -+/* Written by Bruno Haible . */ -+ -+/* A multibyte character is a short subsequence of a char* string, -+ representing a single 32-bit wide character. -+ -+ We use multibyte characters instead of 32-bit wide characters because -+ of the following goals: -+ 1) correct multibyte handling, i.e. operate according to the LC_CTYPE -+ locale, -+ 2) ease of maintenance, i.e. the maintainer needs not know all details -+ of the ISO C 99 standard, -+ 3) don't fail grossly if the input is not in the encoding set by the -+ locale, because often different encodings are in use in the same -+ countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...), -+ 4) fast in the case of ASCII characters. -+ -+ Multibyte characters are only accessed through the mb* macros. -+ -+ mb_ptr (mbc) -+ return a pointer to the beginning of the multibyte sequence. -+ -+ mb_len (mbc) -+ returns the number of bytes occupied by the multibyte sequence. -+ Always > 0. -+ -+ mb_iseq (mbc, sc) -+ returns true if mbc is the standard ASCII character sc. -+ -+ mb_isnul (mbc) -+ returns true if mbc is the nul character. -+ -+ mb_cmp (mbc1, mbc2) -+ returns a positive, zero, or negative value depending on whether mbc1 -+ sorts after, same or before mbc2. -+ -+ mb_casecmp (mbc1, mbc2) -+ returns a positive, zero, or negative value depending on whether mbc1 -+ sorts after, same or before mbc2, modulo upper/lowercase conversion. -+ -+ mb_equal (mbc1, mbc2) -+ returns true if mbc1 and mbc2 are equal. -+ -+ mb_caseequal (mbc1, mbc2) -+ returns true if mbc1 and mbc2 are equal modulo upper/lowercase conversion. -+ -+ mb_isalnum (mbc) -+ returns true if mbc is alphanumeric. -+ -+ mb_isalpha (mbc) -+ returns true if mbc is alphabetic. -+ -+ mb_isascii(mbc) -+ returns true if mbc is plain ASCII. -+ -+ mb_isblank (mbc) -+ returns true if mbc is a blank. -+ -+ mb_iscntrl (mbc) -+ returns true if mbc is a control character. -+ -+ mb_isdigit (mbc) -+ returns true if mbc is a decimal digit. -+ -+ mb_isgraph (mbc) -+ returns true if mbc is a graphic character. -+ -+ mb_islower (mbc) -+ returns true if mbc is lowercase. -+ -+ mb_isprint (mbc) -+ returns true if mbc is a printable character. -+ -+ mb_ispunct (mbc) -+ returns true if mbc is a punctuation character. -+ -+ mb_isspace (mbc) -+ returns true if mbc is a space character. -+ -+ mb_isupper (mbc) -+ returns true if mbc is uppercase. -+ -+ mb_isxdigit (mbc) -+ returns true if mbc is a hexadecimal digit. -+ -+ mb_width (mbc) -+ returns the number of columns on the output device occupied by mbc. -+ Always >= 0. -+ -+ mb_putc (mbc, stream) -+ outputs mbc on stream, a byte oriented FILE stream opened for output. -+ -+ mb_setascii (&mbc, sc) -+ assigns the standard ASCII character sc to mbc. -+ (Only available if the 'mbfile' module is in use.) -+ -+ mb_copy (&destmbc, &srcmbc) -+ copies srcmbc to destmbc. -+ -+ Here are the function prototypes of the macros. -+ -+ extern const char * mb_ptr (const mbchar_t mbc); -+ extern size_t mb_len (const mbchar_t mbc); -+ extern bool mb_iseq (const mbchar_t mbc, char sc); -+ extern bool mb_isnul (const mbchar_t mbc); -+ extern int mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2); -+ extern int mb_casecmp (const mbchar_t mbc1, const mbchar_t mbc2); -+ extern bool mb_equal (const mbchar_t mbc1, const mbchar_t mbc2); -+ extern bool mb_caseequal (const mbchar_t mbc1, const mbchar_t mbc2); -+ extern bool mb_isalnum (const mbchar_t mbc); -+ extern bool mb_isalpha (const mbchar_t mbc); -+ extern bool mb_isascii (const mbchar_t mbc); -+ extern bool mb_isblank (const mbchar_t mbc); -+ extern bool mb_iscntrl (const mbchar_t mbc); -+ extern bool mb_isdigit (const mbchar_t mbc); -+ extern bool mb_isgraph (const mbchar_t mbc); -+ extern bool mb_islower (const mbchar_t mbc); -+ extern bool mb_isprint (const mbchar_t mbc); -+ extern bool mb_ispunct (const mbchar_t mbc); -+ extern bool mb_isspace (const mbchar_t mbc); -+ extern bool mb_isupper (const mbchar_t mbc); -+ extern bool mb_isxdigit (const mbchar_t mbc); -+ extern int mb_width (const mbchar_t mbc); -+ extern void mb_putc (const mbchar_t mbc, FILE *stream); -+ extern void mb_setascii (mbchar_t *new, char sc); -+ extern void mb_copy (mbchar_t *new, const mbchar_t *old); -+ */ -+ -+#ifndef _MBCHAR_H -+#define _MBCHAR_H 1 -+ -+/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE. */ -+#if !_GL_CONFIG_H_INCLUDED -+ #error "Please include config.h first." -+#endif -+ -+#include -+#include -+ -+_GL_INLINE_HEADER_BEGIN -+#ifndef MBCHAR_INLINE -+# define MBCHAR_INLINE _GL_INLINE -+#endif -+ -+/* The longest multibyte characters, nowadays, are 4 bytes long. -+ Regardless of the values of MB_CUR_MAX and MB_LEN_MAX. */ -+#define MBCHAR_BUF_SIZE 4 -+ -+struct mbchar -+{ -+ const char *ptr; /* pointer to current character */ -+ size_t bytes; /* number of bytes of current character, > 0 */ -+ bool wc_valid; /* true if wc is a valid 32-bit wide character */ -+ char32_t wc; /* if wc_valid: the current character */ -+#if defined GNULIB_MBFILE -+ char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */ -+#endif -+}; -+ -+/* EOF (not a real character) is represented with bytes = 0 and -+ wc_valid = false. */ -+ -+typedef struct mbchar mbchar_t; -+ -+/* Access the current character. */ -+#define mb_ptr(mbc) ((mbc).ptr) -+#define mb_len(mbc) ((mbc).bytes) -+ -+/* Comparison of characters. */ -+#define mb_iseq(mbc, sc) ((mbc).wc_valid && (mbc).wc == (sc)) -+#define mb_isnul(mbc) ((mbc).wc_valid && (mbc).wc == 0) -+#define mb_cmp(mbc1, mbc2) \ -+ ((mbc1).wc_valid \ -+ ? ((mbc2).wc_valid \ -+ ? _GL_CMP ((mbc1).wc, (mbc2).wc) \ -+ : -1) \ -+ : ((mbc2).wc_valid \ -+ ? 1 \ -+ : (mbc1).bytes == (mbc2).bytes \ -+ ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) \ -+ : (mbc1).bytes < (mbc2).bytes \ -+ ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \ -+ : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1))) -+#define mb_casecmp(mbc1, mbc2) \ -+ ((mbc1).wc_valid \ -+ ? ((mbc2).wc_valid \ -+ ? _GL_CMP (c32tolower ((mbc1).wc), c32tolower ((mbc2).wc)) \ -+ : -1) \ -+ : ((mbc2).wc_valid \ -+ ? 1 \ -+ : (mbc1).bytes == (mbc2).bytes \ -+ ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) \ -+ : (mbc1).bytes < (mbc2).bytes \ -+ ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \ -+ : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1))) -+#define mb_equal(mbc1, mbc2) \ -+ ((mbc1).wc_valid && (mbc2).wc_valid \ -+ ? (mbc1).wc == (mbc2).wc \ -+ : (mbc1).bytes == (mbc2).bytes \ -+ && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0) -+#define mb_caseequal(mbc1, mbc2) \ -+ ((mbc1).wc_valid && (mbc2).wc_valid \ -+ ? c32tolower ((mbc1).wc) == c32tolower ((mbc2).wc) \ -+ : (mbc1).bytes == (mbc2).bytes \ -+ && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0) -+ -+/* , classification. */ -+#define mb_isascii(mbc) \ -+ ((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127) -+#define mb_isalnum(mbc) ((mbc).wc_valid && c32isalnum ((mbc).wc)) -+#define mb_isalpha(mbc) ((mbc).wc_valid && c32isalpha ((mbc).wc)) -+#define mb_isblank(mbc) ((mbc).wc_valid && c32isblank ((mbc).wc)) -+#define mb_iscntrl(mbc) ((mbc).wc_valid && c32iscntrl ((mbc).wc)) -+#define mb_isdigit(mbc) ((mbc).wc_valid && c32isdigit ((mbc).wc)) -+#define mb_isgraph(mbc) ((mbc).wc_valid && c32isgraph ((mbc).wc)) -+#define mb_islower(mbc) ((mbc).wc_valid && c32islower ((mbc).wc)) -+#define mb_isprint(mbc) ((mbc).wc_valid && c32isprint ((mbc).wc)) -+#define mb_ispunct(mbc) ((mbc).wc_valid && c32ispunct ((mbc).wc)) -+#define mb_isspace(mbc) ((mbc).wc_valid && c32isspace ((mbc).wc)) -+#define mb_isupper(mbc) ((mbc).wc_valid && c32isupper ((mbc).wc)) -+#define mb_isxdigit(mbc) ((mbc).wc_valid && c32isxdigit ((mbc).wc)) -+ -+/* Extra function. */ -+ -+/* Unprintable characters appear as a small box of width 1. */ -+#define MB_UNPRINTABLE_WIDTH 1 -+ -+MBCHAR_INLINE int -+mb_width_aux (char32_t wc) -+{ -+ int w = c32width (wc); -+ /* For unprintable characters, arbitrarily return 0 for control characters -+ and MB_UNPRINTABLE_WIDTH otherwise. */ -+ return (w >= 0 ? w : c32iscntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH); -+} -+ -+#define mb_width(mbc) \ -+ ((mbc).wc_valid ? mb_width_aux ((mbc).wc) : MB_UNPRINTABLE_WIDTH) -+ -+/* Output. */ -+#define mb_putc(mbc, stream) fwrite ((mbc).ptr, 1, (mbc).bytes, (stream)) -+ -+#if defined GNULIB_MBFILE -+/* Assignment. */ -+# define mb_setascii(mbc, sc) \ -+ ((mbc)->ptr = (mbc)->buf, (mbc)->bytes = 1, (mbc)->wc_valid = 1, \ -+ (mbc)->wc = (mbc)->buf[0] = (sc)) -+#endif -+ -+/* Copying a character. */ -+MBCHAR_INLINE void -+mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc) -+{ -+#if defined GNULIB_MBFILE -+ if (old_mbc->ptr == &old_mbc->buf[0]) -+ { -+ memcpy (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes); -+ new_mbc->ptr = &new_mbc->buf[0]; -+ } -+ else -+#endif -+ new_mbc->ptr = old_mbc->ptr; -+ new_mbc->bytes = old_mbc->bytes; -+ if ((new_mbc->wc_valid = old_mbc->wc_valid)) -+ new_mbc->wc = old_mbc->wc; -+} -+ -+ -+/* is_basic(c) tests whether the single-byte character c is -+ - in the ISO C "basic character set" or is one of '@', '$', and '`' -+ which ISO C 23 § 5.2.1.1.(1) guarantees to be single-byte and in -+ practice are safe to treat as basic in the execution character set, -+ or -+ - in the POSIX "portable character set", which -+ -+ equally guarantees to be single-byte. -+ This is a convenience function, and is in this file only to share code -+ between mbiter.h, mbuiter.h, and mbfile.h. */ -+#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ -+ && ('$' == 36) && ('%' == 37) && ('&' == 38) && ('\'' == 39) \ -+ && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) \ -+ && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) \ -+ && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) \ -+ && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) \ -+ && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) \ -+ && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) \ -+ && ('@' == 64) && ('A' == 65) && ('B' == 66) && ('C' == 67) \ -+ && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) \ -+ && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) \ -+ && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) \ -+ && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) \ -+ && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) \ -+ && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) \ -+ && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) \ -+ && ('`' == 96) && ('a' == 97) && ('b' == 98) && ('c' == 99) \ -+ && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) \ -+ && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) \ -+ && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) \ -+ && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) \ -+ && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) \ -+ && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) \ -+ && ('|' == 124) && ('}' == 125) && ('~' == 126) -+/* The character set is ISO-646, not EBCDIC. */ -+# define IS_BASIC_ASCII 1 -+ -+/* All locale encodings (see localcharset.h) map the characters 0x00..0x7F -+ to U+0000..U+007F, like ASCII, except for -+ CP864 different mapping of '%' -+ SHIFT_JIS different mappings of 0x5C, 0x7E -+ JOHAB different mapping of 0x5C -+ However, these characters in the range 0x20..0x7E are in the ISO C -+ "basic character set" and in the POSIX "portable character set", which -+ ISO C and POSIX guarantee to be single-byte. Thus, locales with these -+ encodings are not POSIX compliant. And they are most likely not in use -+ any more (as of 2023). */ -+# define is_basic(c) ((unsigned char) (c) < 0x80) -+ -+#else -+ -+MBCHAR_INLINE bool -+is_basic (char c) -+{ -+ switch (c) -+ { -+ case '\0': -+ case '\007': case '\010': -+ case '\t': case '\n': case '\v': case '\f': case '\r': -+ case ' ': case '!': case '"': case '#': case '$': case '%': -+ case '&': case '\'': case '(': case ')': case '*': -+ case '+': case ',': case '-': case '.': case '/': -+ case '0': case '1': case '2': case '3': case '4': -+ case '5': case '6': case '7': case '8': case '9': -+ case ':': case ';': case '<': case '=': case '>': -+ case '?': case '@': -+ case 'A': case 'B': case 'C': case 'D': case 'E': -+ case 'F': case 'G': case 'H': case 'I': case 'J': -+ case 'K': case 'L': case 'M': case 'N': case 'O': -+ case 'P': case 'Q': case 'R': case 'S': case 'T': -+ case 'U': case 'V': case 'W': case 'X': case 'Y': -+ case 'Z': -+ case '[': case '\\': case ']': case '^': case '_': case '`': -+ case 'a': case 'b': case 'c': case 'd': case 'e': -+ case 'f': case 'g': case 'h': case 'i': case 'j': -+ case 'k': case 'l': case 'm': case 'n': case 'o': -+ case 'p': case 'q': case 'r': case 's': case 't': -+ case 'u': case 'v': case 'w': case 'x': case 'y': -+ case 'z': case '{': case '|': case '}': case '~': -+ return 1; -+ default: -+ return 0; -+ } -+} -+ -+#endif -+ -+_GL_INLINE_HEADER_END -+ -+#endif /* _MBCHAR_H */ -diff --git a/lib/mbfile.c b/lib/mbfile.c -new file mode 100644 -index 0000000..8d2957b ---- /dev/null -+++ b/lib/mbfile.c -@@ -0,0 +1,20 @@ -+/* Multibyte character I/O: macros for multi-byte encodings. -+ Copyright (C) 2012-2023 Free Software Foundation, Inc. -+ -+ This file is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation, either version 3 of the -+ License, or (at your option) any later version. -+ -+ This file is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with this program. If not, see . */ -+ -+#include -+ -+#define MBFILE_INLINE _GL_EXTERN_INLINE -+#include "mbfile.h" -diff --git a/lib/mbfile.h b/lib/mbfile.h -new file mode 100644 -index 0000000..ad61c19 ---- /dev/null -+++ b/lib/mbfile.h -@@ -0,0 +1,267 @@ -+/* Multibyte character I/O: macros for multi-byte encodings. -+ Copyright (C) 2001, 2005, 2009-2023 Free Software Foundation, Inc. -+ -+ This file is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation, either version 3 of the -+ License, or (at your option) any later version. -+ -+ This file is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with this program. If not, see . */ -+ -+/* Written by Mitsuru Chinen -+ and Bruno Haible . */ -+ -+/* The macros in this file implement multi-byte character input from a -+ stream. -+ -+ mb_file_t -+ is the type for multibyte character input stream, usable for variable -+ declarations. -+ -+ mbf_char_t -+ is the type for multibyte character or EOF, usable for variable -+ declarations. -+ -+ mbf_init (mbf, stream) -+ initializes the MB_FILE for reading from stream. -+ -+ mbf_getc (mbc, mbf) -+ reads the next multibyte character from mbf and stores it in mbc. -+ -+ mb_iseof (mbc) -+ returns true if mbc represents the EOF value. -+ -+ Here are the function prototypes of the macros. -+ -+ extern void mbf_init (mb_file_t mbf, FILE *stream); -+ extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf); -+ extern bool mb_iseof (const mbf_char_t mbc); -+ */ -+ -+#ifndef _MBFILE_H -+#define _MBFILE_H 1 -+ -+/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE. */ -+#if !_GL_CONFIG_H_INCLUDED -+ #error "Please include config.h first." -+#endif -+ -+#include -+#include -+#include -+#include -+ -+#include "mbchar.h" -+ -+_GL_INLINE_HEADER_BEGIN -+#ifndef MBFILE_INLINE -+# define MBFILE_INLINE _GL_INLINE -+#endif -+ -+struct mbfile_multi { -+ FILE *fp; -+ bool eof_seen; -+ bool have_pushback; -+ mbstate_t state; -+ unsigned int bufcount; -+ char buf[MBCHAR_BUF_SIZE]; -+ struct mbchar pushback; -+}; -+ -+MBFILE_INLINE void -+mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) -+{ -+ unsigned int new_bufcount; -+ size_t bytes; -+ -+ /* If EOF has already been seen, don't use getc. This matters if -+ mbf->fp is connected to an interactive tty. */ -+ if (mbf->eof_seen) -+ goto eof; -+ -+ /* Return character pushed back, if there is one. */ -+ if (mbf->have_pushback) -+ { -+ mb_copy (mbc, &mbf->pushback); -+ mbf->have_pushback = false; -+ return; -+ } -+ -+ new_bufcount = mbf->bufcount; -+ -+ /* If mbf->state is not in an initial state, some more 32-bit wide character -+ may be hiding in the state. We need to call mbrtoc32 again. */ -+ #if GNULIB_MBRTOC32_REGULAR -+ assert (mbsinit (&mbf->state)); -+ #else -+ if (mbsinit (&mbf->state)) -+ #endif -+ { -+ /* Before using mbrtoc32, we need at least one byte. */ -+ if (new_bufcount == 0) -+ { -+ int c = getc (mbf->fp); -+ if (c == EOF) -+ { -+ mbf->eof_seen = true; -+ goto eof; -+ } -+ mbf->buf[0] = (unsigned char) c; -+ new_bufcount++; -+ } -+ -+ /* Handle most ASCII characters quickly, without calling mbrtoc32(). */ -+ if (new_bufcount == 1 && is_basic (mbf->buf[0])) -+ { -+ /* These characters are part of the POSIX portable character set. -+ For most of them, namely those in the ISO C basic character set, -+ ISO C 99 guarantees that their wide character code is identical to -+ their char code. For the few other ones, this is the case as well, -+ in all locale encodings that are in use. The 32-bit wide character -+ code is the same as well. */ -+ mbc->wc = mbc->buf[0] = mbf->buf[0]; -+ mbc->wc_valid = true; -+ mbc->ptr = &mbc->buf[0]; -+ mbc->bytes = 1; -+ mbf->bufcount = 0; -+ return; -+ } -+ } -+ -+ /* Use mbrtoc32 on an increasing number of bytes. Read only as many bytes -+ from mbf->fp as needed. This is needed to give reasonable interactive -+ behaviour when mbf->fp is connected to an interactive tty. */ -+ for (;;) -+ { -+ /* Feed the bytes one by one into mbrtoc32. */ -+ bytes = mbrtoc32 (&mbc->wc, &mbf->buf[mbf->bufcount], new_bufcount - mbf->bufcount, &mbf->state); -+ -+ if (bytes == (size_t) -1) -+ { -+ /* An invalid multibyte sequence was encountered. */ -+ mbf->bufcount = new_bufcount; -+ /* Return a single byte. */ -+ bytes = 1; -+ mbc->wc_valid = false; -+ /* Allow the next invocation to continue from a sane state. */ -+ mbszero (&mbf->state); -+ break; -+ } -+ else if (bytes == (size_t) -2) -+ { -+ /* An incomplete multibyte character. */ -+ mbf->bufcount = new_bufcount; -+ if (mbf->bufcount == MBCHAR_BUF_SIZE) -+ { -+ /* An overlong incomplete multibyte sequence was encountered. */ -+ /* Return a single byte. */ -+ bytes = 1; -+ mbc->wc_valid = false; -+ break; -+ } -+ else -+ { -+ /* Read one more byte and retry mbrtoc32. */ -+ int c = getc (mbf->fp); -+ if (c == EOF) -+ { -+ /* An incomplete multibyte character at the end. */ -+ mbf->eof_seen = true; -+ bytes = new_bufcount; -+ mbc->wc_valid = false; -+ break; -+ } -+ mbf->buf[new_bufcount] = (unsigned char) c; -+ new_bufcount++; -+ } -+ } -+ else -+ { -+ #if !GNULIB_MBRTOC32_REGULAR -+ if (bytes == (size_t) -3) -+ { -+ /* The previous multibyte sequence produced an additional 32-bit -+ wide character. */ -+ mbf->bufcount = new_bufcount; -+ bytes = 0; -+ } -+ else -+ #endif -+ { -+ bytes = mbf->bufcount + bytes; -+ mbf->bufcount = new_bufcount; -+ if (bytes == 0) -+ { -+ /* A null 32-bit wide character was encountered. */ -+ bytes = 1; -+ assert (mbf->buf[0] == '\0'); -+ assert (mbc->wc == 0); -+ } -+ } -+ mbc->wc_valid = true; -+ break; -+ } -+ } -+ -+ /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ -+ mbc->ptr = &mbc->buf[0]; -+ memcpy (&mbc->buf[0], &mbf->buf[0], bytes); -+ mbc->bytes = bytes; -+ -+ mbf->bufcount -= bytes; -+ if (mbf->bufcount > 0) -+ { -+ /* It's not worth calling memmove() for so few bytes. */ -+ unsigned int count = mbf->bufcount; -+ char *p = &mbf->buf[0]; -+ -+ do -+ { -+ *p = *(p + bytes); -+ p++; -+ } -+ while (--count > 0); -+ } -+ return; -+ -+eof: -+ /* An mbchar_t with bytes == 0 is used to indicate EOF. */ -+ mbc->ptr = NULL; -+ mbc->bytes = 0; -+ mbc->wc_valid = false; -+ return; -+} -+ -+MBFILE_INLINE void -+mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf) -+{ -+ mb_copy (&mbf->pushback, mbc); -+ mbf->have_pushback = true; -+} -+ -+typedef struct mbfile_multi mb_file_t; -+ -+typedef mbchar_t mbf_char_t; -+ -+#define mbf_init(mbf, stream) \ -+ ((mbf).fp = (stream), \ -+ (mbf).eof_seen = false, \ -+ (mbf).have_pushback = false, \ -+ mbszero (&(mbf).state), \ -+ (mbf).bufcount = 0) -+ -+#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf)) -+ -+#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf)) -+ -+#define mb_iseof(mbc) ((mbc).bytes == 0) -+ -+_GL_INLINE_HEADER_END -+ -+#endif /* _MBFILE_H */ -diff --git a/m4/mbchar.m4 b/m4/mbchar.m4 -new file mode 100644 -index 0000000..471e8c4 ---- /dev/null -+++ b/m4/mbchar.m4 -@@ -0,0 +1,13 @@ -+# mbchar.m4 serial 9 -+dnl Copyright (C) 2005-2007, 2009-2024 Free Software Foundation, Inc. -+dnl This file is free software; the Free Software Foundation -+dnl gives unlimited permission to copy and/or distribute it, -+dnl with or without modifications, as long as this notice is preserved. -+ -+dnl autoconf tests required for use of mbchar.m4 -+dnl From Bruno Haible. -+ -+AC_DEFUN([gl_MBCHAR], -+[ -+ AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS]) -+]) -diff --git a/m4/mbfile.m4 b/m4/mbfile.m4 -new file mode 100644 -index 0000000..83068a9 ---- /dev/null -+++ b/m4/mbfile.m4 -@@ -0,0 +1,14 @@ -+# mbfile.m4 serial 7 -+dnl Copyright (C) 2005, 2008-2023 Free Software Foundation, Inc. -+dnl This file is free software; the Free Software Foundation -+dnl gives unlimited permission to copy and/or distribute it, -+dnl with or without modifications, as long as this notice is preserved. -+ -+dnl autoconf tests required for use of mbfile.h -+dnl From Bruno Haible. -+ -+AC_DEFUN([gl_MBFILE], -+[ -+ AC_REQUIRE([AC_TYPE_MBSTATE_T]) -+ : -+]) -diff --git a/src/cut.c b/src/cut.c -index 061e09c..6d10425 100644 ---- a/src/cut.c -+++ b/src/cut.c -@@ -27,6 +27,11 @@ - #include - #include - #include -+ -+/* Get mbstate_t, mbrtowc(). */ -+#if HAVE_WCHAR_H -+# include -+#endif - #include "system.h" - - #include "assure.h" -@@ -35,6 +40,18 @@ - - #include "set-fields.h" - -+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC -+ installation; work around this configuration error. */ -+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 -+# undef MB_LEN_MAX -+# define MB_LEN_MAX 16 -+#endif -+ -+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ -+#if HAVE_MBRTOWC && defined mbstate_t -+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) -+#endif -+ - /* The official name of this program (e.g., no 'g' prefix). */ - #define PROGRAM_NAME "cut" - -@@ -51,6 +68,52 @@ - } \ - while (0) - -+/* Refill the buffer BUF to get a multibyte character. */ -+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \ -+ do \ -+ { \ -+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \ -+ { \ -+ memmove (BUF, BUFPOS, BUFLEN); \ -+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \ -+ BUFPOS = BUF; \ -+ } \ -+ } \ -+ while (0) -+ -+/* Get wide character on BUFPOS. BUFPOS is not included after that. -+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */ -+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \ -+ do \ -+ { \ -+ mbstate_t state_bak; \ -+ \ -+ if (BUFLEN < 1) \ -+ { \ -+ WC = WEOF; \ -+ break; \ -+ } \ -+ \ -+ /* Get a wide character. */ \ -+ CONVFAIL = false; \ -+ state_bak = STATE; \ -+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \ -+ \ -+ switch (MBLENGTH) \ -+ { \ -+ case (size_t)-1: \ -+ case (size_t)-2: \ -+ CONVFAIL = true; \ -+ STATE = state_bak; \ -+ /* Fall througn. */ \ -+ \ -+ case 0: \ -+ MBLENGTH = 1; \ -+ break; \ -+ } \ -+ } \ -+ while (0) -+ - - /* Pointer inside RP. When checking if a byte or field is selected - by a finite range, we check if it is between CURRENT_RP.LO -@@ -58,6 +121,9 @@ - CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */ - static struct field_range_pair *current_rp; - -+/* Length of the delimiter given as argument to -d. */ -+size_t delimlen; -+ - /* This buffer is used to support the semantics of the -s option - (or lack of same) when the specified field list includes (does - not include) the first field. In both of those cases, the entire -@@ -70,6 +136,29 @@ static char *field_1_buffer; - /* The number of bytes allocated for FIELD_1_BUFFER. */ - static size_t field_1_bufsize; - -+enum operating_mode -+ { -+ undefined_mode, -+ -+ /* Output bytes that are at the given positions. */ -+ byte_mode, -+ -+ /* Output characters that are at the given positions. */ -+ character_mode, -+ -+ /* Output the given delimiter-separated fields. */ -+ field_mode -+ }; -+ -+static enum operating_mode operating_mode; -+ -+/* If nonzero, when in byte mode, don't split multibyte characters. */ -+static int byte_mode_character_aware; -+ -+/* If nonzero, the function for single byte locale is work -+ if this program runs on multibyte locale. */ -+static int force_singlebyte_mode; -+ - /* If true, do not output lines containing no delimiter characters. - Otherwise, all such lines are printed. This option is valid only - with field mode. */ -@@ -81,10 +170,16 @@ static bool complement; - - /* The delimiter character for field mode. */ - static unsigned char delim; -+#if HAVE_WCHAR_H -+static wchar_t wcdelim; -+#endif - - /* The delimiter for each line/record. */ - static unsigned char line_delim = '\n'; - -+/* True if the --output-delimiter=STRING option was specified. */ -+static bool output_delimiter_specified; -+ - /* The length of output_delimiter_string. */ - static size_t output_delimiter_length; - -@@ -92,9 +187,6 @@ static size_t output_delimiter_length; - string consisting of the input delimiter. */ - static char *output_delimiter_string; - --/* The output delimiter string contents, if the default. */ --static char output_delimiter_default[1]; -- - /* True if we have ever read standard input. */ - static bool have_read_stdin; - -@@ -148,7 +240,7 @@ Print selected parts of lines from each FILE to standard output.\n\ - -f, --fields=LIST select only these fields; also print any line\n\ - that contains no delimiter character, unless\n\ - the -s option is specified\n\ -- -n (ignored)\n\ -+ -n with -b: don't split multibyte characters\n\ - "), stdout); - fputs (_("\ - --complement complement the set of selected bytes, characters\n\ -@@ -252,7 +344,7 @@ cut_bytes (FILE *stream) - next_item (&byte_idx); - if (print_kth (byte_idx)) - { -- if (output_delimiter_string != output_delimiter_default) -+ if (output_delimiter_specified) - { - if (print_delimiter && is_range_start_index (byte_idx)) - { -@@ -271,6 +363,82 @@ cut_bytes (FILE *stream) - } - } - -+#if HAVE_MBRTOWC -+/* This function is in use for the following case. -+ -+ 1. Read from the stream STREAM, printing to standard output any selected -+ characters. -+ -+ 2. Read from stream STREAM, printing to standard output any selected bytes, -+ without splitting multibyte characters. */ -+ -+static void -+cut_characters_or_cut_bytes_no_split (FILE *stream) -+{ -+ uintmax_t idx; /* number of bytes or characters in the line so far. */ -+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ -+ char *bufpos; /* Next read position of BUF. */ -+ size_t buflen; /* The length of the byte sequence in buf. */ -+ wint_t wc; /* A gotten wide character. */ -+ size_t mblength; /* The byte size of a multibyte character which shows -+ as same character as WC. */ -+ mbstate_t state; /* State of the stream. */ -+ bool convfail = false; /* true, when conversion failed. Otherwise false. */ -+ /* Whether to begin printing delimiters between ranges for the current line. -+ Set after we've begun printing data corresponding to the first range. */ -+ bool print_delimiter = false; -+ -+ idx = 0; -+ buflen = 0; -+ bufpos = buf; -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ current_rp = frp; -+ -+ while (1) -+ { -+ REFILL_BUFFER (buf, bufpos, buflen, stream); -+ -+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail); -+ (void) convfail; /* ignore unused */ -+ -+ if (wc == WEOF) -+ { -+ if (idx > 0) -+ putchar (line_delim); -+ break; -+ } -+ else if (wc == line_delim) -+ { -+ putchar (line_delim); -+ idx = 0; -+ print_delimiter = false; -+ current_rp = frp; -+ } -+ else -+ { -+ next_item (&idx); -+ if (print_kth (idx)) -+ { -+ if (output_delimiter_specified) -+ { -+ if (print_delimiter && is_range_start_index (idx)) -+ { -+ fwrite (output_delimiter_string, sizeof (char), -+ output_delimiter_length, stdout); -+ } -+ print_delimiter = true; -+ } -+ fwrite (bufpos, mblength, sizeof(char), stdout); -+ } -+ } -+ -+ buflen -= mblength; -+ bufpos += mblength; -+ } -+} -+#endif -+ - /* Read from stream STREAM, printing to standard output any selected fields. */ - - static void -@@ -433,11 +601,218 @@ cut_fields (FILE *stream) - } - } - --/* Process file FILE to standard output, using CUT_STREAM. -+#if HAVE_MBRTOWC -+static void -+cut_fields_mb (FILE *stream) -+{ -+ int c; -+ uintmax_t field_idx; -+ int found_any_selected_field; -+ int buffer_first_field; -+ int empty_input; -+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ -+ char *bufpos; /* Next read position of BUF. */ -+ size_t buflen; /* The length of the byte sequence in buf. */ -+ wint_t wc = 0; /* A gotten wide character. */ -+ size_t mblength; /* The byte size of a multibyte character which shows -+ as same character as WC. */ -+ mbstate_t state; /* State of the stream. */ -+ bool convfail = false; /* true, when conversion failed. Otherwise false. */ -+ -+ current_rp = frp; -+ -+ found_any_selected_field = 0; -+ field_idx = 1; -+ bufpos = buf; -+ buflen = 0; -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ c = getc (stream); -+ empty_input = (c == EOF); -+ if (c != EOF) -+ { -+ ungetc (c, stream); -+ wc = 0; -+ } -+ else -+ wc = WEOF; -+ -+ /* To support the semantics of the -s flag, we may have to buffer -+ all of the first field to determine whether it is `delimited.' -+ But that is unnecessary if all non-delimited lines must be printed -+ and the first field has been selected, or if non-delimited lines -+ must be suppressed and the first field has *not* been selected. -+ That is because a non-delimited line has exactly one field. */ -+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); -+ -+ while (1) -+ { -+ if (field_idx == 1 && buffer_first_field) -+ { -+ int len = 0; -+ -+ while (1) -+ { -+ REFILL_BUFFER (buf, bufpos, buflen, stream); -+ -+ GET_NEXT_WC_FROM_BUFFER -+ (wc, bufpos, buflen, mblength, state, convfail); -+ -+ if (wc == WEOF) -+ break; -+ -+ field_1_buffer = xrealloc (field_1_buffer, len + mblength); -+ memcpy (field_1_buffer + len, bufpos, mblength); -+ len += mblength; -+ buflen -= mblength; -+ bufpos += mblength; -+ -+ if (!convfail && (wc == line_delim || wc == wcdelim)) -+ break; -+ } -+ -+ if (len <= 0 && wc == WEOF) -+ break; -+ -+ /* If the first field extends to the end of line (it is not -+ delimited) and we are printing all non-delimited lines, -+ print this one. */ -+ if (convfail || (!convfail && wc != wcdelim)) -+ { -+ if (suppress_non_delimited) -+ { -+ /* Empty. */ -+ } -+ else -+ { -+ fwrite (field_1_buffer, sizeof (char), len, stdout); -+ /* Make sure the output line is newline terminated. */ -+ if (convfail || (!convfail && wc != line_delim)) -+ putchar (line_delim); -+ } -+ continue; -+ } -+ -+ if (print_kth (1)) -+ { -+ /* Print the field, but not the trailing delimiter. */ -+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout); -+ found_any_selected_field = 1; -+ } -+ next_item (&field_idx); -+ } -+ -+ if (wc != WEOF) -+ { -+ if (print_kth (field_idx)) -+ { -+ if (found_any_selected_field) -+ { -+ fwrite (output_delimiter_string, sizeof (char), -+ output_delimiter_length, stdout); -+ } -+ found_any_selected_field = 1; -+ } -+ -+ while (1) -+ { -+ REFILL_BUFFER (buf, bufpos, buflen, stream); -+ -+ GET_NEXT_WC_FROM_BUFFER -+ (wc, bufpos, buflen, mblength, state, convfail); -+ -+ if (wc == WEOF) -+ break; -+ else if (!convfail && (wc == wcdelim || wc == line_delim)) -+ { -+ buflen -= mblength; -+ bufpos += mblength; -+ break; -+ } -+ -+ if (print_kth (field_idx)) -+ fwrite (bufpos, mblength, sizeof(char), stdout); -+ -+ buflen -= mblength; -+ bufpos += mblength; -+ } -+ } -+ -+ if ((!convfail || wc == line_delim) && buflen < 1) -+ wc = WEOF; -+ -+ if (!convfail && wc == wcdelim) -+ next_item (&field_idx); -+ else if (wc == WEOF || (!convfail && wc == line_delim)) -+ { -+ if (found_any_selected_field -+ || (!empty_input && !(suppress_non_delimited && field_idx == 1))) -+ putchar (line_delim); -+ if (wc == WEOF) -+ break; -+ field_idx = 1; -+ current_rp = frp; -+ found_any_selected_field = 0; -+ } -+ } -+} -+#endif -+ -+static void -+cut_stream (FILE *stream) -+{ -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) -+ { -+ switch (operating_mode) -+ { -+ case byte_mode: -+ if (byte_mode_character_aware) -+ cut_characters_or_cut_bytes_no_split (stream); -+ else -+ cut_bytes (stream); -+ break; -+ -+ case character_mode: -+ cut_characters_or_cut_bytes_no_split (stream); -+ break; -+ -+ case field_mode: -+ if (delimlen == 1) -+ { -+ /* Check if we have utf8 multibyte locale, so we can use this -+ optimization because of uniqueness of characters, which is -+ not true for e.g. SJIS */ -+ char * loc = setlocale(LC_CTYPE, NULL); -+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") || -+ strstr (loc, "UTF8") || strstr (loc, "utf8"))) -+ { -+ cut_fields (stream); -+ break; -+ } -+ } -+ cut_fields_mb (stream); -+ break; -+ -+ default: -+ abort (); -+ } -+ } -+ else -+#endif -+ { -+ if (operating_mode == field_mode) -+ cut_fields (stream); -+ else -+ cut_bytes (stream); -+ } -+} -+ -+/* Process file FILE to standard output. - Return true if successful. */ - - static bool --cut_file (char const *file, void (*cut_stream) (FILE *)) -+cut_file (char const *file) - { - FILE *stream; - -@@ -482,8 +857,8 @@ main (int argc, char **argv) - int optc; - bool ok; - bool delim_specified = false; -- bool byte_mode = false; -- char *spec_list_string = nullptr; -+ char *spec_list_string IF_LINT ( = nullptr); -+ char mbdelim[MB_LEN_MAX + 1]; - - initialize_main (&argc, &argv); - set_program_name (argv[0]); -@@ -493,6 +868,8 @@ main (int argc, char **argv) - - atexit (close_stdout); - -+ operating_mode = undefined_mode; -+ - /* By default, all non-delimited lines are printed. */ - suppress_non_delimited = false; - -@@ -505,35 +882,77 @@ main (int argc, char **argv) - switch (optc) - { - case 'b': -- case 'c': - /* Build the byte list. */ -- byte_mode = true; -- FALLTHROUGH; -+ if (operating_mode != undefined_mode) -+ FATAL_ERROR (_("only one type of list may be specified")); -+ operating_mode = byte_mode; -+ spec_list_string = optarg; -+ break; -+ -+ case 'c': -+ /* Build the character list. */ -+ if (operating_mode != undefined_mode) -+ FATAL_ERROR (_("only one type of list may be specified")); -+ operating_mode = character_mode; -+ spec_list_string = optarg; -+ break; -+ - case 'f': - /* Build the field list. */ -- if (spec_list_string) -- FATAL_ERROR (_("only one list may be specified")); -+ if (operating_mode != undefined_mode) -+ FATAL_ERROR (_("only one type of list may be specified")); -+ operating_mode = field_mode; - spec_list_string = optarg; - break; - - case 'd': - /* New delimiter. */ - /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ -- if (optarg[0] != '\0' && optarg[1] != '\0') -- FATAL_ERROR (_("the delimiter must be a single character")); -- delim = optarg[0]; -- delim_specified = true; -+ { -+#if HAVE_MBRTOWC -+ if(MB_CUR_MAX > 1) -+ { -+ mbstate_t state; -+ -+ memset (&state, '\0', sizeof(mbstate_t)); -+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state); -+ -+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2) -+ ++force_singlebyte_mode; -+ else -+ { -+ delimlen = (delimlen < 1) ? 1 : delimlen; -+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0') -+ FATAL_ERROR (_("the delimiter must be a single character")); -+ memcpy (mbdelim, optarg, delimlen); -+ mbdelim[delimlen] = '\0'; -+ if (delimlen == 1) -+ delim = *optarg; -+ } -+ } -+ -+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) -+#endif -+ { -+ if (optarg[0] != '\0' && optarg[1] != '\0') -+ FATAL_ERROR (_("the delimiter must be a single character")); -+ delim = (unsigned char) optarg[0]; -+ } -+ delim_specified = true; -+ } - break; - - case OUTPUT_DELIMITER_OPTION: -+ output_delimiter_specified = true; - /* Interpret --output-delimiter='' to mean - 'use the NUL byte as the delimiter.' */ - output_delimiter_length = (optarg[0] == '\0' - ? 1 : strlen (optarg)); -- output_delimiter_string = optarg; -+ output_delimiter_string = xstrdup (optarg); - break; - - case 'n': -+ byte_mode_character_aware = 1; - break; - - case 's': -@@ -555,40 +974,57 @@ main (int argc, char **argv) - } - } - -- if (!spec_list_string) -+ if (operating_mode == undefined_mode) - FATAL_ERROR (_("you must specify a list of bytes, characters, or fields")); - -- if (byte_mode) -- { -- if (delim_specified) -- FATAL_ERROR (_("an input delimiter may be specified only\ -+ if (delim_specified && operating_mode != field_mode) -+ FATAL_ERROR (_("an input delimiter may be specified only\ - when operating on fields")); - -- if (suppress_non_delimited) -- FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\ -+ if (suppress_non_delimited && operating_mode != field_mode) -+ FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\ - \tonly when operating on fields")); -- } - - set_fields (spec_list_string, -- ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0) -- | (complement ? SETFLD_COMPLEMENT : 0))); -+ ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS) -+ | (complement ? SETFLD_COMPLEMENT : 0) ); - - if (!delim_specified) -- delim = '\t'; -+ { -+ delim = '\t'; -+#ifdef HAVE_MBRTOWC -+ wcdelim = L'\t'; -+ mbdelim[0] = '\t'; -+ mbdelim[1] = '\0'; -+ delimlen = 1; -+#endif -+ } - - if (output_delimiter_string == nullptr) - { -- output_delimiter_default[0] = delim; -- output_delimiter_string = output_delimiter_default; -- output_delimiter_length = 1; -+#ifdef HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) -+ { -+ output_delimiter_string = xstrdup(mbdelim); -+ output_delimiter_length = delimlen; -+ } -+ -+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) -+#endif -+ { -+ static char dummy[2]; -+ dummy[0] = delim; -+ dummy[1] = '\0'; -+ output_delimiter_string = dummy; -+ output_delimiter_length = 1; -+ } - } - -- void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields; - if (optind == argc) -- ok = cut_file ("-", cut_stream); -+ ok = cut_file ("-"); - else - for (ok = true; optind < argc; optind++) -- ok &= cut_file (argv[optind], cut_stream); -+ ok &= cut_file (argv[optind]); - - - if (have_read_stdin && fclose (stdin) == EOF) -diff --git a/src/expand-common.c b/src/expand-common.c -index c95998d..d4386fe 100644 ---- a/src/expand-common.c -+++ b/src/expand-common.c -@@ -19,6 +19,7 @@ - #include - #include - #include -+#include - #include "system.h" - #include "fadvise.h" - #include "quote.h" -@@ -123,6 +124,119 @@ set_increment_size (uintmax_t tabval) - return ok; - } - -+extern int -+set_utf_locale (void) -+{ -+ /*try using some predefined locale */ -+ const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"}; -+ -+ const int predef_locales_count=3; -+ for (int i=0;ibufcount=0; -+ if (c == 0xEF) -+ { -+ c=fgetc(fp); -+ } -+ else -+ { -+ if (c != EOF) -+ { -+ ungetc(c,fp); -+ } -+ return false; -+ } -+ -+ if (c == 0xBB) -+ { -+ c=fgetc(fp); -+ } -+ else -+ { -+ if ( c!= EOF ) -+ { -+ mbf->buf[0]=(unsigned char) 0xEF; -+ mbf->bufcount=1; -+ ungetc(c,fp); -+ return false; -+ } -+ else -+ { -+ ungetc(0xEF,fp); -+ return false; -+ } -+ } -+ if (c == 0xBF) -+ { -+ mbf->bufcount=0; -+ return true; -+ } -+ else -+ { -+ if (c != EOF) -+ { -+ mbf->buf[0]=(unsigned char) 0xEF; -+ mbf->buf[1]=(unsigned char) 0xBB; -+ mbf->bufcount=2; -+ ungetc(c,fp); -+ return false; -+ } -+ else -+ { -+ mbf->buf[0]=(unsigned char) 0xEF; -+ mbf->bufcount=1; -+ ungetc(0xBB,fp); -+ return false; -+ } -+ } -+ return false; -+} -+ -+extern void -+print_bom(void) -+{ -+ putc (0xEF, stdout); -+ putc (0xBB, stdout); -+ putc (0xBF, stdout); -+} -+ - /* Add the comma or blank separated list of tab stops STOPS - to the list of tab stops. */ - extern void -diff --git a/src/expand-common.h b/src/expand-common.h -index 1a57108..6025652 100644 ---- a/src/expand-common.h -+++ b/src/expand-common.h -@@ -25,6 +25,18 @@ extern size_t max_column_width; - /* The desired exit status. */ - extern int exit_status; - -+extern int -+set_utf_locale (void); -+ -+extern bool -+check_utf_locale(void); -+ -+extern bool -+check_bom(FILE* fp, mb_file_t *mbf); -+ -+extern void -+print_bom(void); -+ - /* Add tab stop TABVAL to the end of 'tab_list'. */ - extern void - add_tab_stop (uintmax_t tabval); -diff --git a/src/expand.c b/src/expand.c -index a6176a9..60b1b8e 100644 ---- a/src/expand.c -+++ b/src/expand.c -@@ -38,6 +38,9 @@ - #include - #include - #include -+ -+#include -+ - #include "system.h" - #include "expand-common.h" - -@@ -96,19 +99,41 @@ expand (void) - { - /* Input stream. */ - FILE *fp = next_file (nullptr); -+ mb_file_t mbf; -+ mbf_char_t c; -+ /* True if the starting locale is utf8. */ -+ bool using_utf_locale; -+ -+ /* True if the first file contains BOM header. */ -+ bool found_bom; -+ using_utf_locale=check_utf_locale(); - - if (!fp) - return; -+ mbf_init (mbf, fp); -+ found_bom=check_bom(fp,&mbf); - -- while (true) -+ if (using_utf_locale == false && found_bom == true) -+ { -+ /*try using some predefined locale */ -+ -+ if (set_utf_locale () != 0) - { -- /* Input character, or EOF. */ -- int c; -+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); -+ } -+ } -+ -+ -+ if (found_bom == true) -+ { -+ print_bom(); -+ } - -+ while (true) -+ { - /* If true, perform translations. */ - bool convert = true; - -- - /* The following variables have valid values only when CONVERT - is true: */ - -@@ -118,17 +143,48 @@ expand (void) - /* Index in TAB_LIST of next tab stop to examine. */ - size_t tab_index = 0; - -- - /* Convert a line of text. */ - - do - { -- while ((c = getc (fp)) < 0 && (fp = next_file (fp))) -- continue; -+ while (true) { -+ mbf_getc (c, mbf); -+ if ((mb_iseof (c)) && (fp = next_file (fp))) -+ { -+ mbf_init (mbf, fp); -+ if (fp!=NULL) -+ { -+ if (check_bom(fp,&mbf)==true) -+ { -+ /*Not the first file - check BOM header*/ -+ if (using_utf_locale==false && found_bom==false) -+ { -+ /*BOM header in subsequent file but not in the first one. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ else -+ { -+ if(using_utf_locale==false && found_bom==true) -+ { -+ /*First file conatined BOM header - locale was switched to UTF -+ *all subsequent files should contain BOM. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ } -+ continue; -+ } -+ else -+ { -+ break; -+ } -+ } -+ - - if (convert) - { -- if (c == '\t') -+ if (mb_iseq (c, '\t')) - { - /* Column the next input tab stop is on. */ - uintmax_t next_tab_column; -@@ -147,32 +203,34 @@ expand (void) - if (putchar (' ') < 0) - write_error (); - -- c = ' '; -+ mb_setascii (&c, ' '); - } -- else if (c == '\b') -+ else if (mb_iseq (c, '\b')) - { - /* Go back one column, and force recalculation of the - next tab stop. */ - column -= !!column; - tab_index -= !!tab_index; - } -- else -+ /* A leading control character could make us trip over. */ -+ else if (!mb_iscntrl (c)) - { -- column++; -+ column += mb_width (c); - if (!column) - error (EXIT_FAILURE, 0, _("input line is too long")); - } - -- convert &= convert_entire_line || !! isblank (c); -+ convert &= convert_entire_line || mb_isblank (c); - } - -- if (c < 0) -+ if (mb_iseof (c)) - return; - -- if (putchar (c) < 0) -+ mb_putc (c, stdout); -+ if (ferror (stdout)) - write_error (); - } -- while (c != '\n'); -+ while (!mb_iseq (c, '\n')); - } - } - -diff --git a/src/fold.c b/src/fold.c -index 941ad11..cf1e747 100644 ---- a/src/fold.c -+++ b/src/fold.c -@@ -23,10 +23,32 @@ - #include - #include - -+/* Get mbstate_t, mbrtowc(), wcwidth(). */ -+#if HAVE_WCHAR_H -+# include -+#endif -+ -+/* Get iswprint(), iswblank(), wcwidth(). */ -+#if HAVE_WCTYPE_H -+# include -+#endif -+ - #include "system.h" - #include "fadvise.h" - #include "xdectoint.h" - -+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC -+ installation; work around this configuration error. */ -+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 -+# undef MB_LEN_MAX -+# define MB_LEN_MAX 16 -+#endif -+ -+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ -+#if HAVE_MBRTOWC && defined mbstate_t -+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) -+#endif -+ - #define TAB_WIDTH 8 - - /* The official name of this program (e.g., no 'g' prefix). */ -@@ -34,20 +56,41 @@ - - #define AUTHORS proper_name ("David MacKenzie") - -+#define FATAL_ERROR(Message) \ -+ do \ -+ { \ -+ error (0, 0, (Message)); \ -+ usage (2); \ -+ } \ -+ while (0) -+ -+enum operating_mode -+{ -+ /* Fold texts by columns that are at the given positions. */ -+ column_mode, -+ -+ /* Fold texts by bytes that are at the given positions. */ -+ byte_mode, -+ -+ /* Fold texts by characters that are at the given positions. */ -+ character_mode, -+}; -+ -+/* The argument shows current mode. (Default: column_mode) */ -+static enum operating_mode operating_mode; -+ - /* If nonzero, try to break on whitespace. */ - static bool break_spaces; - --/* If nonzero, count bytes, not column positions. */ --static bool count_bytes; -- - /* If nonzero, at least one of the files we read was standard input. */ - static bool have_read_stdin; - --static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::"; -+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::"; - - static struct option const longopts[] = - { - {"bytes", no_argument, nullptr, 'b'}, -+ {"characters", no_argument, nullptr, 'c'}, - {"spaces", no_argument, nullptr, 's'}, - {"width", required_argument, nullptr, 'w'}, - {GETOPT_HELP_OPTION_DECL}, -@@ -75,6 +118,7 @@ Wrap input lines in each FILE, writing to standard output.\n\ - - fputs (_("\ - -b, --bytes count bytes rather than columns\n\ -+ -c, --characters count characters rather than columns\n\ - -s, --spaces break at spaces\n\ - -w, --width=WIDTH use WIDTH columns instead of 80\n\ - "), stdout); -@@ -92,7 +136,7 @@ Wrap input lines in each FILE, writing to standard output.\n\ - static size_t - adjust_column (size_t column, char c) - { -- if (!count_bytes) -+ if (operating_mode != byte_mode) - { - if (c == '\b') - { -@@ -115,30 +159,14 @@ adjust_column (size_t column, char c) - to stdout, with maximum line length WIDTH. - Return true if successful. */ - --static bool --fold_file (char const *filename, size_t width) -+static void -+fold_text (FILE *istream, size_t width, int *saved_errno) - { -- FILE *istream; - int c; - size_t column = 0; /* Screen column where next char will go. */ - size_t offset_out = 0; /* Index in 'line_out' for next char. */ - static char *line_out = nullptr; - static size_t allocated_out = 0; -- int saved_errno; -- -- if (STREQ (filename, "-")) -- { -- istream = stdin; -- have_read_stdin = true; -- } -- else -- istream = fopen (filename, "r"); -- -- if (istream == nullptr) -- { -- error (0, errno, "%s", quotef (filename)); -- return false; -- } - - fadvise (istream, FADVISE_SEQUENTIAL); - -@@ -168,6 +196,15 @@ fold_file (char const *filename, size_t width) - bool found_blank = false; - size_t logical_end = offset_out; - -+ /* If LINE_OUT has no wide character, -+ put a new wide character in LINE_OUT -+ if column is bigger than width. */ -+ if (offset_out == 0) -+ { -+ line_out[offset_out++] = c; -+ continue; -+ } -+ - /* Look for the last blank. */ - while (logical_end) - { -@@ -214,13 +251,225 @@ fold_file (char const *filename, size_t width) - line_out[offset_out++] = c; - } - -- saved_errno = errno; -+ *saved_errno = errno; - if (!ferror (istream)) -- saved_errno = 0; -+ *saved_errno = 0; - - if (offset_out) - fwrite (line_out, sizeof (char), (size_t) offset_out, stdout); - -+} -+ -+#if HAVE_MBRTOWC -+static void -+fold_multibyte_text (FILE *istream, size_t width, int *saved_errno) -+{ -+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ -+ size_t buflen = 0; /* The length of the byte sequence in buf. */ -+ char *bufpos = buf; /* Next read position of BUF. */ -+ wint_t wc; /* A gotten wide character. */ -+ size_t mblength; /* The byte size of a multibyte character which shows -+ as same character as WC. */ -+ mbstate_t state, state_bak; /* State of the stream. */ -+ int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */ -+ -+ static char *line_out = NULL; -+ size_t offset_out = 0; /* Index in `line_out' for next char. */ -+ static size_t allocated_out = 0; -+ -+ int increment; -+ size_t column = 0; -+ -+ size_t last_blank_pos; -+ size_t last_blank_column; -+ int is_blank_seen; -+ int last_blank_increment = 0; -+ int is_bs_following_last_blank; -+ size_t bs_following_last_blank_num; -+ int is_cr_after_last_blank; -+ -+#define CLEAR_FLAGS \ -+ do \ -+ { \ -+ last_blank_pos = 0; \ -+ last_blank_column = 0; \ -+ is_blank_seen = 0; \ -+ is_bs_following_last_blank = 0; \ -+ bs_following_last_blank_num = 0; \ -+ is_cr_after_last_blank = 0; \ -+ } \ -+ while (0) -+ -+#define START_NEW_LINE \ -+ do \ -+ { \ -+ putchar ('\n'); \ -+ column = 0; \ -+ offset_out = 0; \ -+ CLEAR_FLAGS; \ -+ } \ -+ while (0) -+ -+ CLEAR_FLAGS; -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ for (;; bufpos += mblength, buflen -= mblength) -+ { -+ if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream)) -+ { -+ memmove (buf, bufpos, buflen); -+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream); -+ bufpos = buf; -+ } -+ -+ if (buflen < 1) -+ break; -+ -+ /* Get a wide character. */ -+ state_bak = state; -+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state); -+ -+ switch (mblength) -+ { -+ case (size_t)-1: -+ case (size_t)-2: -+ convfail++; -+ state = state_bak; -+ /* Fall through. */ -+ -+ case 0: -+ mblength = 1; -+ break; -+ } -+ -+rescan: -+ if (operating_mode == byte_mode) /* byte mode */ -+ increment = mblength; -+ else if (operating_mode == character_mode) /* character mode */ -+ increment = 1; -+ else /* column mode */ -+ { -+ if (convfail) -+ increment = 1; -+ else -+ { -+ switch (wc) -+ { -+ case L'\n': -+ fwrite (line_out, sizeof(char), offset_out, stdout); -+ START_NEW_LINE; -+ continue; -+ -+ case L'\b': -+ increment = (column > 0) ? -1 : 0; -+ break; -+ -+ case L'\r': -+ increment = -1 * column; -+ break; -+ -+ case L'\t': -+ increment = 8 - column % 8; -+ break; -+ -+ default: -+ increment = wcwidth (wc); -+ increment = (increment < 0) ? 0 : increment; -+ } -+ } -+ } -+ -+ if (column + increment > width && break_spaces && last_blank_pos) -+ { -+ fwrite (line_out, sizeof(char), last_blank_pos, stdout); -+ putchar ('\n'); -+ -+ offset_out = offset_out - last_blank_pos; -+ column = column - last_blank_column + ((is_cr_after_last_blank) -+ ? last_blank_increment : bs_following_last_blank_num); -+ memmove (line_out, line_out + last_blank_pos, offset_out); -+ CLEAR_FLAGS; -+ goto rescan; -+ } -+ -+ if (column + increment > width && column != 0) -+ { -+ fwrite (line_out, sizeof(char), offset_out, stdout); -+ START_NEW_LINE; -+ goto rescan; -+ } -+ -+ if (allocated_out < offset_out + mblength) -+ { -+ line_out = X2REALLOC (line_out, &allocated_out); -+ } -+ -+ memcpy (line_out + offset_out, bufpos, mblength); -+ offset_out += mblength; -+ column += increment; -+ -+ if (is_blank_seen && !convfail && wc == L'\r') -+ is_cr_after_last_blank = 1; -+ -+ if (is_bs_following_last_blank && !convfail && wc == L'\b') -+ ++bs_following_last_blank_num; -+ else -+ is_bs_following_last_blank = 0; -+ -+ if (break_spaces && !convfail && iswblank (wc)) -+ { -+ last_blank_pos = offset_out; -+ last_blank_column = column; -+ is_blank_seen = 1; -+ last_blank_increment = increment; -+ is_bs_following_last_blank = 1; -+ bs_following_last_blank_num = 0; -+ is_cr_after_last_blank = 0; -+ } -+ } -+ -+ *saved_errno = errno; -+ if (!ferror (istream)) -+ *saved_errno = 0; -+ -+ if (offset_out) -+ fwrite (line_out, sizeof (char), (size_t) offset_out, stdout); -+ -+} -+#endif -+ -+/* Fold file FILENAME, or standard input if FILENAME is "-", -+ to stdout, with maximum line length WIDTH. -+ Return 0 if successful, 1 if an error occurs. */ -+ -+static bool -+fold_file (char const *filename, size_t width) -+{ -+ FILE *istream; -+ int saved_errno; -+ -+ if (STREQ (filename, "-")) -+ { -+ istream = stdin; -+ have_read_stdin = 1; -+ } -+ else -+ istream = fopen (filename, "r"); -+ -+ if (istream == NULL) -+ { -+ error (0, errno, "%s", filename); -+ return 1; -+ } -+ -+ /* Define how ISTREAM is being folded. */ -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ fold_multibyte_text (istream, width, &saved_errno); -+ else -+#endif -+ fold_text (istream, width, &saved_errno); -+ - if (STREQ (filename, "-")) - clearerr (istream); - else if (fclose (istream) != 0 && !saved_errno) -@@ -251,7 +500,8 @@ main (int argc, char **argv) - - atexit (close_stdout); - -- break_spaces = count_bytes = have_read_stdin = false; -+ operating_mode = column_mode; -+ break_spaces = have_read_stdin = false; - - while ((optc = getopt_long (argc, argv, shortopts, longopts, nullptr)) != -1) - { -@@ -260,7 +510,15 @@ main (int argc, char **argv) - switch (optc) - { - case 'b': /* Count bytes rather than columns. */ -- count_bytes = true; -+ if (operating_mode != column_mode) -+ FATAL_ERROR (_("only one way of folding may be specified")); -+ operating_mode = byte_mode; -+ break; -+ -+ case 'c': -+ if (operating_mode != column_mode) -+ FATAL_ERROR (_("only one way of folding may be specified")); -+ operating_mode = character_mode; - break; - - case 's': /* Break at word boundaries. */ -diff --git a/src/local.mk b/src/local.mk -index 96ee941..8fdb8fc 100644 ---- a/src/local.mk -+++ b/src/local.mk -@@ -450,8 +450,8 @@ src_base32_CPPFLAGS = -DBASE_TYPE=32 $(AM_CPPFLAGS) - src_basenc_SOURCES = src/basenc.c - src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS) - --src_expand_SOURCES = src/expand.c src/expand-common.c --src_unexpand_SOURCES = src/unexpand.c src/expand-common.c -+src_expand_SOURCES = src/expand.c src/expand-common.c lib/mbfile.c lib/mbchar.c -+src_unexpand_SOURCES = src/unexpand.c src/expand-common.c lib/mbfile.c lib/mbchar.c - - src_wc_SOURCES = src/wc.c - if USE_AVX2_WC_LINECOUNT -diff --git a/src/pr.c b/src/pr.c -index 09c6fa8..7552b62 100644 ---- a/src/pr.c -+++ b/src/pr.c -@@ -312,6 +312,24 @@ - #include - #include - #include -+ -+/* Get MB_LEN_MAX. */ -+#include -+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC -+ installation; work around this configuration error. */ -+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1 -+# define MB_LEN_MAX 16 -+#endif -+ -+/* Get MB_CUR_MAX. */ -+#include -+ -+/* Solaris 2.5 has a bug: must be included before . */ -+/* Get mbstate_t, mbrtowc(), wcwidth(). */ -+#if HAVE_WCHAR_H -+# include -+#endif -+ - #include "system.h" - #include "fadvise.h" - #include "hard-locale.h" -@@ -324,6 +342,18 @@ - #include "xstrtol-error.h" - #include "xdectoint.h" - -+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ -+#if HAVE_MBRTOWC && defined mbstate_t -+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) -+#endif -+ -+#ifndef HAVE_DECL_WCWIDTH -+"this configure-time declaration test was not run" -+#endif -+#if !HAVE_DECL_WCWIDTH -+extern int wcwidth (); -+#endif -+ - /* The official name of this program (e.g., no 'g' prefix). */ - #define PROGRAM_NAME "pr" - -@@ -416,7 +446,20 @@ struct COLUMN - - typedef struct COLUMN COLUMN; - --static int char_to_clump (char c); -+/* Funtion pointers to switch functions for single byte locale or for -+ multibyte locale. If multibyte functions do not exist in your sysytem, -+ these pointers always point the function for single byte locale. */ -+static void (*print_char) (char c); -+static int (*char_to_clump) (char c); -+ -+/* Functions for single byte locale. */ -+static void print_char_single (char c); -+static int char_to_clump_single (char c); -+ -+/* Functions for multibyte locale. */ -+static void print_char_multi (char c); -+static int char_to_clump_multi (char c); -+ - static bool read_line (COLUMN *p); - static bool print_page (void); - static bool print_stored (COLUMN *p); -@@ -428,6 +471,7 @@ static void add_line_number (COLUMN *p); - static void getoptnum (char const *n_str, int min, int *num, - char const *errfmt); - static void getoptarg (char *arg, char switch_char, char *character, -+ int *character_length, int *character_width, - int *number); - static void print_files (int number_of_files, char **av); - static void init_parameters (int number_of_files); -@@ -441,7 +485,6 @@ static void store_char (char c); - static void pad_down (unsigned int lines); - static void read_rest_of_line (COLUMN *p); - static void skip_read (COLUMN *p, int column_number); --static void print_char (char c); - static void cleanup (void); - static void print_sep_string (void); - static void separator_string (char const *optarg_S); -@@ -453,7 +496,7 @@ static COLUMN *column_vector; - we store the leftmost columns contiguously in buff. - To print a line from buff, get the index of the first character - from line_vector[i], and print up to line_vector[i + 1]. */ --static char *buff; -+static unsigned char *buff; - - /* Index of the position in buff where the next character - will be stored. */ -@@ -557,7 +600,7 @@ static int chars_per_column; - static bool untabify_input = false; - - /* (-e) The input tab character. */ --static char input_tab_char = '\t'; -+static char input_tab_char[MB_LEN_MAX] = "\t"; - - /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ... - where the leftmost column is 1. */ -@@ -567,7 +610,10 @@ static int chars_per_input_tab = 8; - static bool tabify_output = false; - - /* (-i) The output tab character. */ --static char output_tab_char = '\t'; -+static char output_tab_char[MB_LEN_MAX] = "\t"; -+ -+/* (-i) The byte length of output tab character. */ -+static int output_tab_char_length = 1; - - /* (-i) The width of the output tab. */ - static int chars_per_output_tab = 8; -@@ -637,7 +683,13 @@ static int line_number; - static bool numbered_lines = false; - - /* (-n) Character which follows each line number. */ --static char number_separator = '\t'; -+static char number_separator[MB_LEN_MAX] = "\t"; -+ -+/* (-n) The byte length of the character which follows each line number. */ -+static int number_separator_length = 1; -+ -+/* (-n) The character width of the character which follows each line number. */ -+static int number_separator_width = 0; - - /* (-n) line counting starts with 1st line of input file (not with 1st - line of 1st page printed). */ -@@ -690,6 +742,7 @@ static bool use_col_separator = false; - -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */ - static char const *col_sep_string = ""; - static int col_sep_length = 0; -+static int col_sep_width = 0; - static char *column_separator = (char *) " "; - static char *line_separator = (char *) "\t"; - -@@ -852,6 +905,13 @@ separator_string (char const *optarg_S) - integer_overflow (); - col_sep_length = len; - col_sep_string = optarg_S; -+ -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ col_sep_width = mbswidth (col_sep_string, 0); -+ else -+#endif -+ col_sep_width = col_sep_length; - } - - int -@@ -876,6 +936,21 @@ main (int argc, char **argv) - - atexit (close_stdout); - -+/* Define which functions are used, the ones for single byte locale or the ones -+ for multibyte locale. */ -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ { -+ print_char = print_char_multi; -+ char_to_clump = char_to_clump_multi; -+ } -+ else -+#endif -+ { -+ print_char = print_char_single; -+ char_to_clump = char_to_clump_single; -+ } -+ - n_files = 0; - file_names = (argc > 1 - ? xnmalloc (argc - 1, sizeof (char *)) -@@ -952,8 +1027,12 @@ main (int argc, char **argv) - break; - case 'e': - if (optarg) -- getoptarg (optarg, 'e', &input_tab_char, -- &chars_per_input_tab); -+ { -+ int dummy_length, dummy_width; -+ -+ getoptarg (optarg, 'e', input_tab_char, &dummy_length, -+ &dummy_width, &chars_per_input_tab); -+ } - /* Could check tab width > 0. */ - untabify_input = true; - break; -@@ -966,8 +1045,12 @@ main (int argc, char **argv) - break; - case 'i': - if (optarg) -- getoptarg (optarg, 'i', &output_tab_char, -- &chars_per_output_tab); -+ { -+ int dummy_width; -+ -+ getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length, -+ &dummy_width, &chars_per_output_tab); -+ } - /* Could check tab width > 0. */ - tabify_output = true; - break; -@@ -985,8 +1068,8 @@ main (int argc, char **argv) - case 'n': - numbered_lines = true; - if (optarg) -- getoptarg (optarg, 'n', &number_separator, -- &chars_per_number); -+ getoptarg (optarg, 'n', number_separator, &number_separator_length, -+ &number_separator_width, &chars_per_number); - break; - case 'N': - skip_count = false; -@@ -1011,6 +1094,7 @@ main (int argc, char **argv) - /* Reset an additional input of -s, -S dominates -s */ - col_sep_string = ""; - col_sep_length = 0; -+ col_sep_width = 0; - use_col_separator = true; - if (optarg) - separator_string (optarg); -@@ -1165,7 +1249,8 @@ getoptnum (char const *n_str, int min, int *num, char const *err) - a number. */ - - static void --getoptarg (char *arg, char switch_char, char *character, int *number) -+getoptarg (char *arg, char switch_char, char *character, int *character_length, -+ int *character_width, int *number) - { - if (!*arg) - { -@@ -1174,7 +1259,41 @@ getoptarg (char *arg, char switch_char, char *character, int *number) - } - - if (!ISDIGIT (*arg)) -- *character = *arg++; -+ { -+#ifdef HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) /* for multibyte locale. */ -+ { -+ wchar_t wc; -+ size_t mblength; -+ int width; -+ mbstate_t state = {'\0'}; -+ -+ mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state); -+ -+ if (mblength == (size_t)-1 || mblength == (size_t)-2) -+ { -+ *character_length = 1; -+ *character_width = 1; -+ } -+ else -+ { -+ *character_length = (mblength < 1) ? 1 : mblength; -+ width = wcwidth (wc); -+ *character_width = (width < 0) ? 0 : width; -+ } -+ -+ strncpy (character, arg, *character_length); -+ arg += *character_length; -+ } -+ else /* for single byte locale. */ -+#endif -+ { -+ *character = *arg++; -+ *character_length = 1; -+ *character_width = 1; -+ } -+ } -+ - if (*arg) - { - long int tmp_long; -@@ -1203,6 +1322,11 @@ static void - init_parameters (int number_of_files) - { - int chars_used_by_number = 0; -+ int mb_len = 1; -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ mb_len = MB_LEN_MAX; -+#endif - - lines_per_body = lines_per_page - lines_per_header - lines_per_footer; - if (lines_per_body <= 0) -@@ -1240,7 +1364,7 @@ init_parameters (int number_of_files) - else - col_sep_string = column_separator; - -- col_sep_length = 1; -+ col_sep_length = col_sep_width = 1; - use_col_separator = true; - } - /* It's rather pointless to define a TAB separator with column -@@ -1272,11 +1396,11 @@ init_parameters (int number_of_files) - + TAB_WIDTH (chars_per_input_tab, chars_per_number); */ - - /* Estimate chars_per_text without any margin and keep it constant. */ -- if (number_separator == '\t') -+ if (number_separator[0] == '\t') - number_width = (chars_per_number - + TAB_WIDTH (chars_per_default_tab, chars_per_number)); - else -- number_width = chars_per_number + 1; -+ number_width = chars_per_number + number_separator_width; - - /* The number is part of the column width unless we are - printing files in parallel. */ -@@ -1285,7 +1409,7 @@ init_parameters (int number_of_files) - } - - int sep_chars, useful_chars; -- if (ckd_mul (&sep_chars, columns - 1, col_sep_length)) -+ if (ckd_mul (&sep_chars, columns - 1, col_sep_width)) - sep_chars = INT_MAX; - if (ckd_sub (&useful_chars, chars_per_line - chars_used_by_number, - sep_chars)) -@@ -1308,7 +1432,7 @@ init_parameters (int number_of_files) - We've to use 8 as the lower limit, if we use chars_per_default_tab = 8 - to expand a tab which is not an input_tab-char. */ - free (clump_buff); -- clump_buff = xmalloc (MAX (8, chars_per_input_tab)); -+ clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab)); - } - - /* Open the necessary files, -@@ -1414,7 +1538,7 @@ init_funcs (void) - - /* Enlarge p->start_position of first column to use the same form of - padding_not_printed with all columns. */ -- h = h + col_sep_length; -+ h = h + col_sep_width; - - /* This loop takes care of all but the rightmost column. */ - -@@ -1448,7 +1572,7 @@ init_funcs (void) - } - else - { -- h = h_next + col_sep_length; -+ h = h_next + col_sep_width; - h_next = h + chars_per_column; - } - } -@@ -1745,9 +1869,9 @@ static void - align_column (COLUMN *p) - { - padding_not_printed = p->start_position; -- if (col_sep_length < padding_not_printed) -+ if (col_sep_width < padding_not_printed) - { -- pad_across_to (padding_not_printed - col_sep_length); -+ pad_across_to (padding_not_printed - col_sep_width); - padding_not_printed = ANYWHERE; - } - -@@ -2021,13 +2145,13 @@ store_char (char c) - /* May be too generous. */ - buff = X2REALLOC (buff, &buff_allocated); - } -- buff[buff_current++] = c; -+ buff[buff_current++] = (unsigned char) c; - } - - static void - add_line_number (COLUMN *p) - { -- int i; -+ int i, j; - char *s; - int num_width; - -@@ -2044,22 +2168,24 @@ add_line_number (COLUMN *p) - /* Tabification is assumed for multiple columns, also for n-separators, - but 'default n-separator = TAB' hasn't been given priority over - equal column_width also specified by POSIX. */ -- if (number_separator == '\t') -+ if (number_separator[0] == '\t') - { - i = number_width - chars_per_number; - while (i-- > 0) - (p->char_func) (' '); - } - else -- (p->char_func) (number_separator); -+ for (j = 0; j < number_separator_length; j++) -+ (p->char_func) (number_separator[j]); - } - else - /* To comply with POSIX, we avoid any expansion of default TAB - separator with a single column output. No column_width requirement - has to be considered. */ - { -- (p->char_func) (number_separator); -- if (number_separator == '\t') -+ for (j = 0; j < number_separator_length; j++) -+ (p->char_func) (number_separator[j]); -+ if (number_separator[0] == '\t') - output_position = POS_AFTER_TAB (chars_per_output_tab, - output_position); - } -@@ -2218,7 +2344,7 @@ print_white_space (void) - while (goal - h_old > 1 - && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal) - { -- putchar (output_tab_char); -+ fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout); - h_old = h_new; - } - while (++h_old <= goal) -@@ -2238,6 +2364,7 @@ print_sep_string (void) - { - char const *s = col_sep_string; - int l = col_sep_length; -+ int not_space_flag; - - if (separators_not_printed <= 0) - { -@@ -2249,6 +2376,7 @@ print_sep_string (void) - { - for (; separators_not_printed > 0; --separators_not_printed) - { -+ not_space_flag = 0; - while (l-- > 0) - { - /* 3 types of sep_strings: spaces only, spaces and chars, -@@ -2262,12 +2390,15 @@ print_sep_string (void) - } - else - { -+ not_space_flag = 1; - if (spaces_not_printed > 0) - print_white_space (); - putchar (*s++); -- ++output_position; - } - } -+ if (not_space_flag) -+ output_position += col_sep_width; -+ - /* sep_string ends with some spaces */ - if (spaces_not_printed > 0) - print_white_space (); -@@ -2295,7 +2426,7 @@ print_clump (COLUMN *p, int n, char *clump) - required number of tabs and spaces. */ - - static void --print_char (char c) -+print_char_single (char c) - { - if (tabify_output) - { -@@ -2319,6 +2450,74 @@ print_char (char c) - putchar (c); - } - -+#ifdef HAVE_MBRTOWC -+static void -+print_char_multi (char c) -+{ -+ static size_t mbc_pos = 0; -+ static char mbc[MB_LEN_MAX] = {'\0'}; -+ static mbstate_t state = {'\0'}; -+ mbstate_t state_bak; -+ wchar_t wc; -+ size_t mblength; -+ int width; -+ -+ if (tabify_output) -+ { -+ state_bak = state; -+ mbc[mbc_pos++] = c; -+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state); -+ -+ while (mbc_pos > 0) -+ { -+ switch (mblength) -+ { -+ case (size_t)-2: -+ state = state_bak; -+ return; -+ -+ case (size_t)-1: -+ state = state_bak; -+ ++output_position; -+ putchar (mbc[0]); -+ memmove (mbc, mbc + 1, MB_CUR_MAX - 1); -+ --mbc_pos; -+ break; -+ -+ case 0: -+ mblength = 1; -+ -+ default: -+ if (wc == L' ') -+ { -+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); -+ --mbc_pos; -+ ++spaces_not_printed; -+ return; -+ } -+ else if (spaces_not_printed > 0) -+ print_white_space (); -+ -+ /* Nonprintables are assumed to have width 0, except L'\b'. */ -+ if ((width = wcwidth (wc)) < 1) -+ { -+ if (wc == L'\b') -+ --output_position; -+ } -+ else -+ output_position += width; -+ -+ fwrite (mbc, sizeof(char), mblength, stdout); -+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); -+ mbc_pos -= mblength; -+ } -+ } -+ return; -+ } -+ putchar (c); -+} -+#endif -+ - /* Skip to page PAGE before printing. - PAGE may be larger than total number of pages. */ - -@@ -2495,9 +2694,9 @@ read_line (COLUMN *p) - align_empty_cols = false; - } - -- if (col_sep_length < padding_not_printed) -+ if (col_sep_width < padding_not_printed) - { -- pad_across_to (padding_not_printed - col_sep_length); -+ pad_across_to (padding_not_printed - col_sep_width); - padding_not_printed = ANYWHERE; - } - -@@ -2566,7 +2765,7 @@ print_stored (COLUMN *p) - COLUMN *q; - - int line = p->current_line++; -- char *first = &buff[line_vector[line]]; -+ unsigned char *first = &buff[line_vector[line]]; - /* FIXME - UMR: Uninitialized memory read: - * This is occurring while in: -@@ -2578,7 +2777,7 @@ print_stored (COLUMN *p) - xmalloc [xmalloc.c:94] - init_store_cols [pr.c:1648] - */ -- char *last = &buff[line_vector[line + 1]]; -+ unsigned char *last = &buff[line_vector[line + 1]]; - - pad_vertically = true; - -@@ -2598,9 +2797,9 @@ print_stored (COLUMN *p) - } - } - -- if (col_sep_length < padding_not_printed) -+ if (col_sep_width < padding_not_printed) - { -- pad_across_to (padding_not_printed - col_sep_length); -+ pad_across_to (padding_not_printed - col_sep_width); - padding_not_printed = ANYWHERE; - } - -@@ -2613,8 +2812,8 @@ print_stored (COLUMN *p) - if (spaces_not_printed == 0) - { - output_position = p->start_position + end_vector[line]; -- if (p->start_position - col_sep_length == chars_per_margin) -- output_position -= col_sep_length; -+ if (p->start_position - col_sep_width == chars_per_margin) -+ output_position -= col_sep_width; - } - - return true; -@@ -2633,7 +2832,7 @@ print_stored (COLUMN *p) - number of characters is 1.) */ - - static int --char_to_clump (char c) -+char_to_clump_single (char c) - { - unsigned char uc = c; - char *s = clump_buff; -@@ -2643,10 +2842,10 @@ char_to_clump (char c) - int chars; - int chars_per_c = 8; - -- if (c == input_tab_char) -+ if (c == input_tab_char[0]) - chars_per_c = chars_per_input_tab; - -- if (c == input_tab_char || c == '\t') -+ if (c == input_tab_char[0] || c == '\t') - { - width = TAB_WIDTH (chars_per_c, input_position); - -@@ -2727,6 +2926,164 @@ char_to_clump (char c) - return chars; - } - -+#ifdef HAVE_MBRTOWC -+static int -+char_to_clump_multi (char c) -+{ -+ static size_t mbc_pos = 0; -+ static char mbc[MB_LEN_MAX] = {'\0'}; -+ static mbstate_t state = {'\0'}; -+ mbstate_t state_bak; -+ wchar_t wc; -+ size_t mblength; -+ int wc_width; -+ register char *s = clump_buff; -+ register int i, j; -+ char esc_buff[4]; -+ int width; -+ int chars; -+ int chars_per_c = 8; -+ -+ state_bak = state; -+ mbc[mbc_pos++] = c; -+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state); -+ -+ width = 0; -+ chars = 0; -+ while (mbc_pos > 0) -+ { -+ switch (mblength) -+ { -+ case (size_t)-2: -+ state = state_bak; -+ return 0; -+ -+ case (size_t)-1: -+ state = state_bak; -+ mblength = 1; -+ -+ if (use_esc_sequence || use_cntrl_prefix) -+ { -+ width = +4; -+ chars = +4; -+ *s++ = '\\'; -+ sprintf (esc_buff, "%03o", (unsigned char) mbc[0]); -+ for (i = 0; i <= 2; ++i) -+ *s++ = (int) esc_buff[i]; -+ } -+ else -+ { -+ width += 1; -+ chars += 1; -+ *s++ = mbc[0]; -+ } -+ break; -+ -+ case 0: -+ mblength = 1; -+ /* Fall through */ -+ -+ default: -+ if (memcmp (mbc, input_tab_char, mblength) == 0) -+ chars_per_c = chars_per_input_tab; -+ -+ if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t') -+ { -+ int width_inc; -+ -+ width_inc = TAB_WIDTH (chars_per_c, input_position); -+ width += width_inc; -+ -+ if (untabify_input) -+ { -+ for (i = width_inc; i; --i) -+ *s++ = ' '; -+ chars += width_inc; -+ } -+ else -+ { -+ for (i = 0; i < mblength; i++) -+ *s++ = mbc[i]; -+ chars += mblength; -+ } -+ } -+ else if ((wc_width = wcwidth (wc)) < 1) -+ { -+ if (use_esc_sequence) -+ { -+ for (i = 0; i < mblength; i++) -+ { -+ width += 4; -+ chars += 4; -+ *s++ = '\\'; -+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]); -+ for (j = 0; j <= 2; ++j) -+ *s++ = (int) esc_buff[j]; -+ } -+ } -+ else if (use_cntrl_prefix) -+ { -+ if (wc < 0200) -+ { -+ width += 2; -+ chars += 2; -+ *s++ = '^'; -+ *s++ = wc ^ 0100; -+ } -+ else -+ { -+ for (i = 0; i < mblength; i++) -+ { -+ width += 4; -+ chars += 4; -+ *s++ = '\\'; -+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]); -+ for (j = 0; j <= 2; ++j) -+ *s++ = (int) esc_buff[j]; -+ } -+ } -+ } -+ else if (wc == L'\b') -+ { -+ width += -1; -+ chars += 1; -+ *s++ = c; -+ } -+ else -+ { -+ width += 0; -+ chars += mblength; -+ for (i = 0; i < mblength; i++) -+ *s++ = mbc[i]; -+ } -+ } -+ else -+ { -+ width += wc_width; -+ chars += mblength; -+ for (i = 0; i < mblength; i++) -+ *s++ = mbc[i]; -+ } -+ } -+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); -+ mbc_pos -= mblength; -+ } -+ -+ /* Too many backspaces must put us in position 0 -- never negative. */ -+ if (width < 0 && input_position == 0) -+ { -+ chars = 0; -+ input_position = 0; -+ } -+ else if (width < 0 && input_position <= -width) -+ input_position = 0; -+ else -+ input_position += width; -+ -+ return chars; -+} -+#endif -+ - /* We've just printed some files and need to clean up things before - looking for more options and printing the next batch of files. - -diff --git a/src/sort.c b/src/sort.c -index 2d8324c..46331b8 100644 ---- a/src/sort.c -+++ b/src/sort.c -@@ -29,6 +29,14 @@ - #include - #include - #include -+#if HAVE_WCHAR_H -+# include -+#endif -+/* Get isw* functions. */ -+#if HAVE_WCTYPE_H -+# include -+#endif -+ - #include "system.h" - #include "argmatch.h" - #include "assure.h" -@@ -157,14 +165,39 @@ static int thousands_sep; - /* We currently ignore multi-byte grouping chars. */ - static bool thousands_sep_ignored; - -+/* True if -f is specified. */ -+static bool folding; -+ - /* Nonzero if the corresponding locales are hard. */ - static bool hard_LC_COLLATE; --#if HAVE_NL_LANGINFO -+#if HAVE_LANGINFO_CODESET - static bool hard_LC_TIME; - #endif - - #define NONZERO(x) ((x) != 0) - -+/* get a multibyte character's byte length. */ -+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \ -+ do \ -+ { \ -+ wchar_t wc; \ -+ mbstate_t state_bak; \ -+ \ -+ state_bak = STATE; \ -+ mblength = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \ -+ \ -+ switch (MBLENGTH) \ -+ { \ -+ case (size_t)-1: \ -+ case (size_t)-2: \ -+ STATE = state_bak; \ -+ /* Fall through. */ \ -+ case 0: \ -+ MBLENGTH = 1; \ -+ } \ -+ } \ -+ while (0) -+ - /* The kind of blanks for '-b' to skip in various options. */ - enum blanktype { bl_start, bl_end, bl_both }; - -@@ -341,13 +374,11 @@ static bool stable; - /* An int value outside char range. */ - enum { NON_CHAR = CHAR_MAX + 1 }; - --/* If TAB has this value, blanks separate fields. */ --enum { TAB_DEFAULT = CHAR_MAX + 1 }; -- --/* Tab character separating fields. If TAB_DEFAULT, then fields are -+/* Tab character separating fields. If tab_length is 0, then fields are - separated by the empty string between a non-blank character and a blank - character. */ --static int tab = TAB_DEFAULT; -+static char tab[MB_LEN_MAX + 1]; -+static size_t tab_length = 0; - - /* Flag to remove consecutive duplicate lines from the output. - Only the last of a sequence of equal lines will be output. */ -@@ -804,6 +835,46 @@ reap_all (void) - reap (-1); - } - -+/* Function pointers. */ -+static void -+(*inittables) (void); -+static char * -+(*begfield) (const struct line*, const struct keyfield *); -+static char * -+(*limfield) (const struct line*, const struct keyfield *); -+static void -+(*skipblanks) (char **ptr, char *lim); -+static int -+(*getmonth) (char const *, size_t, char **); -+static int -+(*keycompare) (const struct line *, const struct line *); -+static int -+(*numcompare) (const char *, const char *); -+ -+/* Test for white space multibyte character. -+ Set LENGTH the byte length of investigated multibyte character. */ -+#if HAVE_MBRTOWC -+static int -+ismbblank (const char *str, size_t len, size_t *length) -+{ -+ size_t mblength; -+ wchar_t wc; -+ mbstate_t state; -+ -+ memset (&state, '\0', sizeof(mbstate_t)); -+ mblength = mbrtowc (&wc, str, len, &state); -+ -+ if (mblength == (size_t)-1 || mblength == (size_t)-2) -+ { -+ *length = 1; -+ return 0; -+ } -+ -+ *length = (mblength < 1) ? 1 : mblength; -+ return iswblank (wc) || wc == '\n'; -+} -+#endif -+ - /* Clean up any remaining temporary files. */ - - static void -@@ -1271,7 +1342,7 @@ zaptemp (char const *name) - free (node); - } - --#if HAVE_NL_LANGINFO -+#if HAVE_LANGINFO_CODESET - - static int - struct_month_cmp (void const *m1, void const *m2) -@@ -1286,7 +1357,7 @@ struct_month_cmp (void const *m1, void const *m2) - /* Initialize the character class tables. */ - - static void --inittables (void) -+inittables_uni (void) - { - size_t i; - -@@ -1298,7 +1369,7 @@ inittables (void) - fold_toupper[i] = toupper (i); - } - --#if HAVE_NL_LANGINFO -+#if HAVE_LANGINFO_CODESET - /* If we're not in the "C" locale, read different names for months. */ - if (hard_LC_TIME) - { -@@ -1380,6 +1451,84 @@ specify_nmerge (int oi, char c, char const *s) - xstrtol_fatal (e, oi, c, long_options, s); - } - -+#if HAVE_MBRTOWC -+static void -+inittables_mb (void) -+{ -+ int i, j, k, l; -+ char *name, *s, *lc_time, *lc_ctype; -+ size_t s_len, mblength; -+ char mbc[MB_LEN_MAX]; -+ wchar_t wc, pwc; -+ mbstate_t state_mb, state_wc; -+ -+ lc_time = setlocale (LC_TIME, ""); -+ if (lc_time) -+ lc_time = xstrdup (lc_time); -+ -+ lc_ctype = setlocale (LC_CTYPE, ""); -+ if (lc_ctype) -+ lc_ctype = xstrdup (lc_ctype); -+ -+ if (lc_time && lc_ctype) -+ /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert -+ * the names of months to upper case */ -+ setlocale (LC_CTYPE, lc_time); -+ -+ for (i = 0; i < MONTHS_PER_YEAR; i++) -+ { -+ s = (char *) nl_langinfo (ABMON_1 + i); -+ s_len = strlen (s); -+ monthtab[i].name = name = (char *) xmalloc (s_len + 1); -+ monthtab[i].val = i + 1; -+ -+ memset (&state_mb, '\0', sizeof (mbstate_t)); -+ memset (&state_wc, '\0', sizeof (mbstate_t)); -+ -+ for (j = 0; j < s_len;) -+ { -+ if (!ismbblank (s + j, s_len - j, &mblength)) -+ break; -+ j += mblength; -+ } -+ -+ for (k = 0; j < s_len;) -+ { -+ mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb); -+ assert (mblength != (size_t)-1 && mblength != (size_t)-2); -+ if (mblength == 0) -+ break; -+ -+ pwc = towupper (wc); -+ if (pwc == wc) -+ { -+ memcpy (mbc, s + j, mblength); -+ j += mblength; -+ } -+ else -+ { -+ j += mblength; -+ mblength = wcrtomb (mbc, pwc, &state_wc); -+ assert (mblength != (size_t)0 && mblength != (size_t)-1); -+ } -+ -+ for (l = 0; l < mblength; l++) -+ name[k++] = mbc[l]; -+ } -+ name[k] = '\0'; -+ } -+ qsort ((void *) monthtab, MONTHS_PER_YEAR, -+ sizeof (struct month), struct_month_cmp); -+ -+ if (lc_time && lc_ctype) -+ /* restore the original locales */ -+ setlocale (LC_CTYPE, lc_ctype); -+ -+ free (lc_ctype); -+ free (lc_time); -+} -+#endif -+ - /* Specify the amount of main memory to use when sorting. */ - static void - specify_sort_size (int oi, char c, char const *s) -@@ -1611,7 +1760,7 @@ buffer_linelim (struct buffer const *buf) - by KEY in LINE. */ - - static char * --begfield (struct line const *line, struct keyfield const *key) -+begfield_uni (const struct line *line, const struct keyfield *key) - { - char *ptr = line->text, *lim = ptr + line->length - 1; - size_t sword = key->sword; -@@ -1620,10 +1769,10 @@ begfield (struct line const *line, struct keyfield const *key) - /* The leading field separator itself is included in a field when -t - is absent. */ - -- if (tab != TAB_DEFAULT) -+ if (tab_length) - while (ptr < lim && sword--) - { -- while (ptr < lim && *ptr != tab) -+ while (ptr < lim && *ptr != tab[0]) - ++ptr; - if (ptr < lim) - ++ptr; -@@ -1649,12 +1798,71 @@ begfield (struct line const *line, struct keyfield const *key) - return ptr; - } - -+#if HAVE_MBRTOWC -+static char * -+begfield_mb (const struct line *line, const struct keyfield *key) -+{ -+ int i; -+ char *ptr = line->text, *lim = ptr + line->length - 1; -+ size_t sword = key->sword; -+ size_t schar = key->schar; -+ size_t mblength; -+ mbstate_t state; -+ -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ if (tab_length) -+ while (ptr < lim && sword--) -+ { -+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ if (ptr < lim) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ } -+ else -+ while (ptr < lim && sword--) -+ { -+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) -+ ptr += mblength; -+ if (ptr < lim) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength)) -+ ptr += mblength; -+ } -+ -+ if (key->skipsblanks) -+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) -+ ptr += mblength; -+ -+ for (i = 0; i < schar; i++) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ -+ if (ptr + mblength > lim) -+ break; -+ else -+ ptr += mblength; -+ } -+ -+ return ptr; -+} -+#endif -+ - /* Return the limit of (a pointer to the first character after) the field - in LINE specified by KEY. */ - - ATTRIBUTE_PURE - static char * --limfield (struct line const *line, struct keyfield const *key) -+limfield_uni (struct line const *line, struct keyfield const *key) - { - char *ptr = line->text, *lim = ptr + line->length - 1; - size_t eword = key->eword, echar = key->echar; -@@ -1669,10 +1877,10 @@ limfield (struct line const *line, struct keyfield const *key) - 'beginning' is the first character following the delimiting TAB. - Otherwise, leave PTR pointing at the first 'blank' character after - the preceding field. */ -- if (tab != TAB_DEFAULT) -+ if (tab_length) - while (ptr < lim && eword--) - { -- while (ptr < lim && *ptr != tab) -+ while (ptr < lim && *ptr != tab[0]) - ++ptr; - if (ptr < lim && (eword || echar)) - ++ptr; -@@ -1718,10 +1926,10 @@ limfield (struct line const *line, struct keyfield const *key) - */ - - /* Make LIM point to the end of (one byte past) the current field. */ -- if (tab != TAB_DEFAULT) -+ if (tab_length) - { - char *newlim; -- newlim = memchr (ptr, tab, lim - ptr); -+ newlim = memchr (ptr, tab[0], lim - ptr); - if (newlim) - lim = newlim; - } -@@ -1752,6 +1960,130 @@ limfield (struct line const *line, struct keyfield const *key) - return ptr; - } - -+#if HAVE_MBRTOWC -+static char * _GL_ATTRIBUTE_PURE -+limfield_mb (const struct line *line, const struct keyfield *key) -+{ -+ char *ptr = line->text, *lim = ptr + line->length - 1; -+ size_t eword = key->eword, echar = key->echar; -+ int i; -+ size_t mblength; -+ mbstate_t state; -+ -+ if (echar == 0) -+ eword++; /* skip all of end field. */ -+ -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ if (tab_length) -+ while (ptr < lim && eword--) -+ { -+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ if (ptr < lim && (eword | echar)) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ } -+ else -+ while (ptr < lim && eword--) -+ { -+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) -+ ptr += mblength; -+ if (ptr < lim) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength)) -+ ptr += mblength; -+ } -+ -+ -+# ifdef POSIX_UNSPECIFIED -+ /* Make LIM point to the end of (one byte past) the current field. */ -+ if (tab_length) -+ { -+ char *newlim, *p; -+ -+ newlim = NULL; -+ for (p = ptr; p < lim;) -+ { -+ if (memcmp (p, tab, tab_length) == 0) -+ { -+ newlim = p; -+ break; -+ } -+ -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ p += mblength; -+ } -+ } -+ else -+ { -+ char *newlim; -+ newlim = ptr; -+ -+ while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength)) -+ newlim += mblength; -+ if (ptr < lim) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ ptr += mblength; -+ } -+ while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength)) -+ newlim += mblength; -+ lim = newlim; -+ } -+# endif -+ -+ if (echar != 0) -+ { -+ /* If we're skipping leading blanks, don't start counting characters -+ * until after skipping past any leading blanks. */ -+ if (key->skipeblanks) -+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) -+ ptr += mblength; -+ -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */ -+ for (i = 0; i < echar; i++) -+ { -+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); -+ -+ if (ptr + mblength > lim) -+ break; -+ else -+ ptr += mblength; -+ } -+ } -+ -+ return ptr; -+} -+#endif -+ -+static void -+skipblanks_uni (char **ptr, char *lim) -+{ -+ while (*ptr < lim && blanks[to_uchar (**ptr)]) -+ ++(*ptr); -+} -+ -+#if HAVE_MBRTOWC -+static void -+skipblanks_mb (char **ptr, char *lim) -+{ -+ size_t mblength; -+ while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength)) -+ (*ptr) += mblength; -+} -+#endif -+ - /* Fill BUF reading from FP, moving buf->left bytes from the end - of buf->buf to the beginning first. If EOF is reached and the - file wasn't terminated by a newline, supply one. Set up BUF's line -@@ -1838,8 +2170,22 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file) - else - { - if (key->skipsblanks) -- while (blanks[to_uchar (*line_start)]) -- line_start++; -+ { -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ { -+ size_t mblength; -+ while (line_start < line->keylim && -+ ismbblank (line_start, -+ line->keylim - line_start, -+ &mblength)) -+ line_start += mblength; -+ } -+ else -+#endif -+ while (blanks[to_uchar (*line_start)]) -+ line_start++; -+ } - line->keybeg = line_start; - } - } -@@ -1977,12 +2323,10 @@ find_unit_order (char const *number) - - ATTRIBUTE_PURE - static int --human_numcompare (char const *a, char const *b) -+human_numcompare (char *a, char *b) - { -- while (blanks[to_uchar (*a)]) -- a++; -- while (blanks[to_uchar (*b)]) -- b++; -+ skipblanks(&a, a + strlen(a)); -+ skipblanks(&b, b + strlen(b)); - - int diff = find_unit_order (a) - find_unit_order (b); - return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); -@@ -1994,7 +2338,7 @@ human_numcompare (char const *a, char const *b) - - ATTRIBUTE_PURE - static int --numcompare (char const *a, char const *b) -+numcompare_uni (const char *a, const char *b) - { - while (blanks[to_uchar (*a)]) - a++; -@@ -2004,6 +2348,25 @@ numcompare (char const *a, char const *b) - return strnumcmp (a, b, decimal_point, thousands_sep); - } - -+#if HAVE_MBRTOWC -+static int -+numcompare_mb (const char *a, const char *b) -+{ -+ size_t mblength, len; -+ len = strlen (a); /* okay for UTF-8 */ -+ while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength)) -+ { -+ a += mblength; -+ len -= mblength; -+ } -+ len = strlen (b); /* okay for UTF-8 */ -+ while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength)) -+ b += mblength; -+ -+ return strnumcmp (a, b, decimal_point, thousands_sep); -+} -+#endif /* HAV_EMBRTOWC */ -+ - static int - nan_compare (long double a, long double b) - { -@@ -2045,7 +2408,7 @@ general_numcompare (char const *sa, char const *sb) - Return 0 if the name in S is not recognized. */ - - static int --getmonth (char const *month, char **ea) -+getmonth_uni (char const *month, size_t len, char **ea) - { - size_t lo = 0; - size_t hi = MONTHS_PER_YEAR; -@@ -2372,15 +2735,14 @@ debug_key (struct line const *line, struct keyfield const *key) - char saved = *lim; - *lim = '\0'; - -- while (blanks[to_uchar (*beg)]) -- beg++; -+ skipblanks (&beg, lim); - - char *tighter_lim = beg; - - if (lim < beg) - tighter_lim = lim; - else if (key->month) -- getmonth (beg, &tighter_lim); -+ getmonth (beg, lim-beg, &tighter_lim); - else if (key->general_numeric) - ignore_value (strtold (beg, &tighter_lim)); - else if (key->numeric || key->human_numeric) -@@ -2526,7 +2888,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) - /* Warn about significant leading blanks. */ - bool implicit_skip = key_numeric (key) || key->month; - bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */ -- if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset -+ if (!zero_width && !gkey_only && !tab_length && !line_offset - && ((!key->skipsblanks && !implicit_skip) - || (!key->skipsblanks && key->schar) - || (!key->skipeblanks && key->echar))) -@@ -2574,9 +2936,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) - bool number_locale_warned = false; - if (basic_numeric_field_span) - { -- if (tab == TAB_DEFAULT -- ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep))) -- : tab == thousands_sep) -+ if (tab_length -+ ? tab[0] == thousands_sep -+ : thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))) - { - error (0, 0, - _("field separator %s is treated as a " -@@ -2587,9 +2949,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) - } - if (basic_numeric_field_span || general_numeric_field_span) - { -- if (tab == TAB_DEFAULT -- ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point))) -- : tab == decimal_point) -+ if (tab_length -+ ? tab[0] == decimal_point -+ : thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))) - { - error (0, 0, - _("field separator %s is treated as a " -@@ -2597,19 +2959,19 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) - quote (((char []) {decimal_point, 0}))); - number_locale_warned = true; - } -- else if (tab == '-') -+ else if (tab_length && tab[0] == '-') - { - error (0, 0, - _("field separator %s is treated as a " - "minus sign in numbers"), -- quote (((char []) {tab, 0}))); -+ quote (((char []) {tab[0], 0}))); - } -- else if (general_numeric_field_span && tab == '+') -+ else if (general_numeric_field_span && tab_length && tab[0] == '+') - { - error (0, 0, - _("field separator %s is treated as a " - "plus sign in numbers"), -- quote (((char []) {tab, 0}))); -+ quote (((char []) {tab[0], 0}))); - } - } - -@@ -2620,7 +2982,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) - { - error (0, 0, - _("%snumbers use %s as a decimal point in this locale"), -- tab == decimal_point ? "" : _("note "), -+ (tab_length && tab[0] == decimal_point) ? "" : _("note "), - quote (((char []) {decimal_point, 0}))); - - } -@@ -2662,11 +3024,87 @@ diff_reversed (int diff, bool reversed) - return reversed ? (diff < 0) - (diff > 0) : diff; - } - -+#if HAVE_MBRTOWC -+static int -+getmonth_mb (const char *s, size_t len, char **ea) -+{ -+ char *month; -+ register size_t i; -+ register int lo = 0, hi = MONTHS_PER_YEAR, result; -+ char *tmp; -+ size_t wclength, mblength; -+ const char *pp; -+ const wchar_t *wpp; -+ wchar_t *month_wcs; -+ mbstate_t state; -+ -+ while (len > 0 && ismbblank (s, len, &mblength)) -+ { -+ s += mblength; -+ len -= mblength; -+ } -+ -+ if (len == 0) -+ return 0; -+ -+ if (SIZE_MAX - len < 1) -+ xalloc_die (); -+ -+ month = (char *) xnmalloc (len + 1, MB_CUR_MAX); -+ -+ pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX); -+ memcpy (tmp, s, len); -+ tmp[len] = '\0'; -+ wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t)); -+ memset (&state, '\0', sizeof (mbstate_t)); -+ -+ wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state); -+ if (wclength == (size_t)-1 || pp != NULL) -+ error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s)); -+ -+ for (i = 0; i < wclength; i++) -+ { -+ month_wcs[i] = towupper(month_wcs[i]); -+ if (iswblank (month_wcs[i])) -+ { -+ month_wcs[i] = L'\0'; -+ break; -+ } -+ } -+ -+ mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state); -+ assert (mblength != (-1) && wpp == NULL); -+ -+ do -+ { -+ int ix = (lo + hi) / 2; -+ -+ if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0) -+ hi = ix; -+ else -+ lo = ix; -+ } -+ while (hi - lo > 1); -+ -+ result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name)) -+ ? monthtab[lo].val : 0); -+ -+ if (ea && result) -+ *ea = (char*) s + strlen (monthtab[lo].name); -+ -+ free (month); -+ free (tmp); -+ free (month_wcs); -+ -+ return result; -+} -+#endif -+ - /* Compare two lines A and B trying every key in sequence until there - are no more keys or a difference is found. */ - - static int --keycompare (struct line const *a, struct line const *b) -+keycompare_uni (const struct line *a, const struct line *b) - { - struct keyfield *key = keylist; - -@@ -2747,7 +3185,7 @@ keycompare (struct line const *a, struct line const *b) - else if (key->human_numeric) - diff = human_numcompare (ta, tb); - else if (key->month) -- diff = getmonth (ta, nullptr) - getmonth (tb, nullptr); -+ diff = getmonth (ta, tlena, nullptr) - getmonth (tb, tlenb, nullptr); - else if (key->random) - diff = compare_random (ta, tlena, tb, tlenb); - else if (key->version) -@@ -2857,6 +3295,211 @@ keycompare (struct line const *a, struct line const *b) - return diff_reversed (diff, key->reverse); - } - -+#if HAVE_MBRTOWC -+static int -+keycompare_mb (const struct line *a, const struct line *b) -+{ -+ struct keyfield *key = keylist; -+ -+ /* For the first iteration only, the key positions have been -+ precomputed for us. */ -+ char *texta = a->keybeg; -+ char *textb = b->keybeg; -+ char *lima = a->keylim; -+ char *limb = b->keylim; -+ -+ size_t mblength_a, mblength_b; -+ wchar_t wc_a, wc_b; -+ mbstate_t state_a, state_b; -+ -+ int diff = 0; -+ -+ memset (&state_a, '\0', sizeof(mbstate_t)); -+ memset (&state_b, '\0', sizeof(mbstate_t)); -+ /* Ignore keys with start after end. */ -+ if (a->keybeg - a->keylim > 0) -+ return 0; -+ -+ -+ /* Ignore and/or translate chars before comparing. */ -+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \ -+ do \ -+ { \ -+ wchar_t uwc; \ -+ char mbc[MB_LEN_MAX]; \ -+ mbstate_t state_wc; \ -+ \ -+ for (NEW_LEN = i = 0; i < LEN;) \ -+ { \ -+ mbstate_t state_bak; \ -+ \ -+ state_bak = STATE; \ -+ MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \ -+ \ -+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \ -+ || MBLENGTH == 0) \ -+ { \ -+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \ -+ STATE = state_bak; \ -+ if (!ignore) \ -+ COPY[NEW_LEN++] = TEXT[i]; \ -+ i++; \ -+ continue; \ -+ } \ -+ \ -+ if (ignore) \ -+ { \ -+ if ((ignore == nonprinting && !iswprint (WC)) \ -+ || (ignore == nondictionary \ -+ && !iswalnum (WC) && !iswblank (WC))) \ -+ { \ -+ i += MBLENGTH; \ -+ continue; \ -+ } \ -+ } \ -+ \ -+ if (translate) \ -+ { \ -+ \ -+ uwc = towupper(WC); \ -+ if (WC == uwc) \ -+ { \ -+ memcpy (mbc, TEXT + i, MBLENGTH); \ -+ i += MBLENGTH; \ -+ } \ -+ else \ -+ { \ -+ i += MBLENGTH; \ -+ WC = uwc; \ -+ memset (&state_wc, '\0', sizeof (mbstate_t)); \ -+ \ -+ MBLENGTH = wcrtomb (mbc, WC, &state_wc); \ -+ assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \ -+ } \ -+ \ -+ for (j = 0; j < MBLENGTH; j++) \ -+ COPY[NEW_LEN++] = mbc[j]; \ -+ } \ -+ else \ -+ for (j = 0; j < MBLENGTH; j++) \ -+ COPY[NEW_LEN++] = TEXT[i++]; \ -+ } \ -+ COPY[NEW_LEN] = '\0'; \ -+ } \ -+ while (0) -+ -+ /* Actually compare the fields. */ -+ -+ for (;;) -+ { -+ /* Find the lengths. */ -+ size_t lena = lima <= texta ? 0 : lima - texta; -+ size_t lenb = limb <= textb ? 0 : limb - textb; -+ -+ char enda IF_LINT (= 0); -+ char endb IF_LINT (= 0); -+ -+ char const *translate = key->translate; -+ bool const *ignore = key->ignore; -+ -+ if (ignore || translate) -+ { -+ if (SIZE_MAX - lenb - 2 < lena) -+ xalloc_die (); -+ char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX); -+ char *copy_b = copy_a + lena * MB_CUR_MAX + 1; -+ size_t new_len_a, new_len_b; -+ size_t i, j; -+ -+ IGNORE_CHARS (new_len_a, lena, texta, copy_a, -+ wc_a, mblength_a, state_a); -+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b, -+ wc_b, mblength_b, state_b); -+ texta = copy_a; textb = copy_b; -+ lena = new_len_a; lenb = new_len_b; -+ } -+ else -+ { -+ /* Use the keys in-place, temporarily null-terminated. */ -+ enda = texta[lena]; texta[lena] = '\0'; -+ endb = textb[lenb]; textb[lenb] = '\0'; -+ } -+ -+ if (key->random) -+ diff = compare_random (texta, lena, textb, lenb); -+ else if (key->numeric | key->general_numeric | key->human_numeric) -+ { -+ char savea = *lima, saveb = *limb; -+ -+ *lima = *limb = '\0'; -+ diff = (key->numeric ? numcompare (texta, textb) -+ : key->general_numeric ? general_numcompare (texta, textb) -+ : human_numcompare (texta, textb)); -+ *lima = savea, *limb = saveb; -+ } -+ else if (key->version) -+ diff = filevercmp (texta, textb); -+ else if (key->month) -+ diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL); -+ else if (lena == 0) -+ diff = - NONZERO (lenb); -+ else if (lenb == 0) -+ diff = 1; -+ else if (hard_LC_COLLATE && !folding) -+ { -+ diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1); -+ } -+ else -+ { -+ diff = memcmp (texta, textb, MIN (lena, lenb)); -+ if (diff == 0) -+ diff = lena < lenb ? -1 : lena != lenb; -+ } -+ -+ if (ignore || translate) -+ free (texta); -+ else -+ { -+ texta[lena] = enda; -+ textb[lenb] = endb; -+ } -+ -+ if (diff) -+ goto not_equal; -+ -+ key = key->next; -+ if (! key) -+ break; -+ -+ /* Find the beginning and limit of the next field. */ -+ if (key->eword != -1) -+ lima = limfield (a, key), limb = limfield (b, key); -+ else -+ lima = a->text + a->length - 1, limb = b->text + b->length - 1; -+ -+ if (key->sword != -1) -+ texta = begfield (a, key), textb = begfield (b, key); -+ else -+ { -+ texta = a->text, textb = b->text; -+ if (key->skipsblanks) -+ { -+ while (texta < lima && ismbblank (texta, lima - texta, &mblength_a)) -+ texta += mblength_a; -+ while (textb < limb && ismbblank (textb, limb - textb, &mblength_b)) -+ textb += mblength_b; -+ } -+ } -+ } -+ -+not_equal: -+ if (key && key->reverse) -+ return -diff; -+ else -+ return diff; -+} -+#endif -+ - /* Compare two lines A and B, returning negative, zero, or positive - depending on whether A compares less than, equal to, or greater than B. */ - -@@ -2884,7 +3527,7 @@ compare (struct line const *a, struct line const *b) - diff = - NONZERO (blen); - else if (blen == 0) - diff = 1; -- else if (hard_LC_COLLATE) -+ else if (hard_LC_COLLATE && !folding) - { - /* xmemcoll0 is a performance enhancement as - it will not unconditionally write '\0' after the -@@ -4272,6 +4915,7 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) - break; - case 'f': - key->translate = fold_toupper; -+ folding = true; - break; - case 'g': - key->general_numeric = true; -@@ -4351,7 +4995,7 @@ main (int argc, char **argv) - initialize_exit_failure (SORT_FAILURE); - - hard_LC_COLLATE = hard_locale (LC_COLLATE); --#if HAVE_NL_LANGINFO -+#if HAVE_LANGINFO_CODESET - hard_LC_TIME = hard_locale (LC_TIME); - #endif - -@@ -4374,6 +5018,29 @@ main (int argc, char **argv) - thousands_sep = NON_CHAR; - } - -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ { -+ inittables = inittables_mb; -+ begfield = begfield_mb; -+ limfield = limfield_mb; -+ skipblanks = skipblanks_mb; -+ getmonth = getmonth_mb; -+ keycompare = keycompare_mb; -+ numcompare = numcompare_mb; -+ } -+ else -+#endif -+ { -+ inittables = inittables_uni; -+ begfield = begfield_uni; -+ limfield = limfield_uni; -+ skipblanks = skipblanks_uni; -+ getmonth = getmonth_uni; -+ keycompare = keycompare_uni; -+ numcompare = numcompare_uni; -+ } -+ - have_read_stdin = false; - inittables (); - -@@ -4644,13 +5311,34 @@ main (int argc, char **argv) - - case 't': - { -- char newtab = optarg[0]; -- if (! newtab) -+ char newtab[MB_LEN_MAX + 1]; -+ size_t newtab_length = 1; -+ strncpy (newtab, optarg, MB_LEN_MAX); -+ if (! newtab[0]) - error (SORT_FAILURE, 0, _("empty tab")); -- if (optarg[1]) -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ { -+ wchar_t wc; -+ mbstate_t state; -+ -+ memset (&state, '\0', sizeof (mbstate_t)); -+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab, -+ MB_LEN_MAX), -+ &state); -+ switch (newtab_length) -+ { -+ case (size_t) -1: -+ case (size_t) -2: -+ case 0: -+ newtab_length = 1; -+ } -+ } -+#endif -+ if (newtab_length == 1 && optarg[1]) - { - if (STREQ (optarg, "\\0")) -- newtab = '\0'; -+ newtab[0] = '\0'; - else - { - /* Provoke with 'sort -txx'. Complain about -@@ -4661,9 +5349,11 @@ main (int argc, char **argv) - quote (optarg)); - } - } -- if (tab != TAB_DEFAULT && tab != newtab) -+ if (tab_length && (tab_length != newtab_length -+ || memcmp (tab, newtab, tab_length) != 0)) - error (SORT_FAILURE, 0, _("incompatible tabs")); -- tab = newtab; -+ memcpy (tab, newtab, newtab_length); -+ tab_length = newtab_length; - } - break; - -diff --git a/src/unexpand.c b/src/unexpand.c -index aca67dd..f79c808 100644 ---- a/src/unexpand.c -+++ b/src/unexpand.c -@@ -39,6 +39,9 @@ - #include - #include - #include -+ -+#include -+ - #include "system.h" - #include "expand-common.h" - -@@ -105,24 +108,47 @@ unexpand (void) - { - /* Input stream. */ - FILE *fp = next_file (nullptr); -+ mb_file_t mbf; - - /* The array of pending blanks. In non-POSIX locales, blanks can - include characters other than spaces, so the blanks must be - stored, not merely counted. */ -- char *pending_blank; -+ mbf_char_t *pending_blank; -+ /* True if the starting locale is utf8. */ -+ bool using_utf_locale; -+ -+ /* True if the first file contains BOM header. */ -+ bool found_bom; -+ using_utf_locale=check_utf_locale(); - - if (!fp) - return; -+ mbf_init (mbf, fp); -+ found_bom=check_bom(fp,&mbf); -+ -+ if (using_utf_locale == false && found_bom == true) -+ { -+ /*try using some predefined locale */ - -+ if (set_utf_locale () != 0) -+ { -+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); -+ } -+ } - /* The worst case is a non-blank character, then one blank, then a - tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so - allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ -- pending_blank = xmalloc (max_column_width); -+ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); -+ -+ if (found_bom == true) -+ { -+ print_bom(); -+ } - - while (true) - { - /* Input character, or EOF. */ -- int c; -+ mbf_char_t c; - - /* If true, perform translations. */ - bool convert = true; -@@ -156,12 +182,44 @@ unexpand (void) - - do - { -- while ((c = getc (fp)) < 0 && (fp = next_file (fp))) -- continue; -+ while (true) { -+ mbf_getc (c, mbf); -+ if ((mb_iseof (c)) && (fp = next_file (fp))) -+ { -+ mbf_init (mbf, fp); -+ if (fp!=NULL) -+ { -+ if (check_bom(fp,&mbf)==true) -+ { -+ /*Not the first file - check BOM header*/ -+ if (using_utf_locale==false && found_bom==false) -+ { -+ /*BOM header in subsequent file but not in the first one. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ else -+ { -+ if(using_utf_locale==false && found_bom==true) -+ { -+ /*First file conatined BOM header - locale was switched to UTF -+ *all subsequent files should contain BOM. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ } -+ continue; -+ } -+ else -+ { -+ break; -+ } -+ } -+ - - if (convert) - { -- bool blank = !! isblank (c); -+ bool blank = mb_isblank (c); - - if (blank) - { -@@ -178,16 +236,16 @@ unexpand (void) - if (next_tab_column < column) - error (EXIT_FAILURE, 0, _("input line is too long")); - -- if (c == '\t') -+ if (mb_iseq (c, '\t')) - { - column = next_tab_column; - - if (pending) -- pending_blank[0] = '\t'; -+ mb_setascii (&pending_blank[0], '\t'); - } - else - { -- column++; -+ column += mb_width (c); - - if (! (prev_blank && column == next_tab_column)) - { -@@ -195,13 +253,14 @@ unexpand (void) - will be replaced by tabs. */ - if (column == next_tab_column) - one_blank_before_tab_stop = true; -- pending_blank[pending++] = c; -+ mb_copy (&pending_blank[pending++], &c); - prev_blank = true; - continue; - } - - /* Replace the pending blanks by a tab or two. */ -- pending_blank[0] = c = '\t'; -+ mb_setascii (&c, '\t'); -+ mb_setascii (&pending_blank[0], '\t'); - } - - /* Discard pending blanks, unless it was a single -@@ -209,7 +268,7 @@ unexpand (void) - pending = one_blank_before_tab_stop; - } - } -- else if (c == '\b') -+ else if (mb_iseq (c, '\b')) - { - /* Go back one column, and force recalculation of the - next tab stop. */ -@@ -219,16 +278,20 @@ unexpand (void) - } - else - { -- column++; -- if (!column) -+ const uintmax_t orig_column = column; -+ column += mb_width (c); -+ if (column < orig_column) - error (EXIT_FAILURE, 0, _("input line is too long")); - } - - if (pending) - { - if (pending > 1 && one_blank_before_tab_stop) -- pending_blank[0] = '\t'; -- if (fwrite (pending_blank, 1, pending, stdout) != pending) -+ mb_setascii (&pending_blank[0], '\t'); -+ -+ for (int n = 0; n < pending; ++n) -+ mb_putc (pending_blank[n], stdout); -+ if (ferror (stdout)) - write_error (); - pending = 0; - one_blank_before_tab_stop = false; -@@ -238,16 +301,17 @@ unexpand (void) - convert &= convert_entire_line || blank; - } - -- if (c < 0) -+ if (mb_iseof (c)) - { - free (pending_blank); - return; - } - -- if (putchar (c) < 0) -+ mb_putc (c, stdout); -+ if (ferror (stdout)) - write_error (); - } -- while (c != '\n'); -+ while (!mb_iseq (c, '\n')); - } - } - -diff --git a/tests/Coreutils.pm b/tests/Coreutils.pm -index 18e7bea..24a141b 100644 ---- a/tests/Coreutils.pm -+++ b/tests/Coreutils.pm -@@ -269,6 +269,9 @@ sub run_tests ($$$$$) - # Yes, this is an arbitrary limit. If it causes trouble, - # consider removing it. - my $max = 30; -+ # The downstream i18n multi-byte tests have a "-mb" suffix. -+ # Therefore add 3 to the maximum test name length. -+ $max += 3; - if ($max < length $test_name) - { - warn "$program_name: $test_name: test name is too long (> $max)\n"; -diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh -new file mode 100644 -index 0000000..dd6007c ---- /dev/null -+++ b/tests/expand/mb.sh -@@ -0,0 +1,183 @@ -+#!/bin/sh -+ -+# Copyright (C) 2012-2015 Free Software Foundation, Inc. -+ -+# This program is free software: you can redistribute it and/or modify -+# it under the terms of the GNU General Public License as published by -+# the Free Software Foundation, either version 3 of the License, or -+# (at your option) any later version. -+ -+# This program is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+# GNU General Public License for more details. -+ -+# You should have received a copy of the GNU General Public License -+# along with this program. If not, see . -+ -+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src -+print_ver_ expand -+ -+export LC_ALL=en_US.UTF-8 -+ -+#input containing multibyte characters -+cat <<\EOF > in || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ -+ -+cat <<\EOF > exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#multiple files as an input -+cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+expand ./in ./in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#test characters with display widths != 1 -+env printf '12345678 -+e\t|ascii(1) -+\u00E9\t|composed(1) -+e\u0301\t|decomposed(1) -+\u3000\t|ideo-space(2) -+\uFF0D\t|full-hypen(2) -+' > in || framework_failure_ -+ -+env printf '12345678 -+e |ascii(1) -+\u00E9 |composed(1) -+e\u0301 |decomposed(1) -+\u3000 |ideo-space(2) -+\uFF0D |full-hypen(2) -+' > exp || framework_failure_ -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#shouldn't fail with "input line too long" -+#when a line starts with a control character -+env printf '\n' > in || framework_failure_ -+ -+expand < in > out || fail=1 -+compare in out > /dev/null 2>&1 || fail=1 -+ -+#non-Unicode characters interspersed between Unicode ones -+env printf '12345678 -+\t\xFF| -+\xFF\t| -+\t\xFFä| -+ä\xFF\t| -+\tä\xFF| -+\xFF\tä| -+äbcdef\xFF\t| -+' > in || framework_failure_ -+ -+env printf '12345678 -+ \xFF| -+\xFF | -+ \xFFä| -+ä\xFF | -+ ä\xFF| -+\xFF ä| -+äbcdef\xFF | -+' > exp || framework_failure_ -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+ -+ -+#BOM header test 1 -+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ -+ -+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+ -+printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ -+ -+ -+printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+expand in1 in1 > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C expand in1 in1 > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C expand in1 in1 > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+exit $fail -diff --git a/tests/i18n/sort.sh b/tests/i18n/sort.sh -new file mode 100644 -index 0000000..26c95de ---- /dev/null -+++ b/tests/i18n/sort.sh -@@ -0,0 +1,29 @@ -+#!/bin/sh -+# Verify sort's multi-byte support. -+ -+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src -+print_ver_ sort -+ -+export LC_ALL=en_US.UTF-8 -+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \ -+ || skip_ "No UTF-8 locale available" -+ -+# Enable heap consistency checkng on older systems -+export MALLOC_CHECK_=2 -+ -+ -+# check buffer overflow issue due to -+# expanding multi-byte representation due to case conversion -+# https://bugzilla.suse.com/show_bug.cgi?id=928749 -+cat < exp -+. -+ɑ -+EOF -+cat < out || fail=1 -+. -+ɑ -+EOF -+compare exp out || { fail=1; cat out; } -+ -+ -+Exit $fail -diff --git a/tests/local.mk b/tests/local.mk -index fdbf369..a6ce49c 100644 ---- a/tests/local.mk -+++ b/tests/local.mk -@@ -387,6 +387,8 @@ all_tests = \ - tests/sort/sort-discrim.sh \ - tests/sort/sort-files0-from.pl \ - tests/sort/sort-float.sh \ -+ tests/misc/sort-mb-tests.sh \ -+ tests/i18n/sort.sh \ - tests/sort/sort-h-thousands-sep.sh \ - tests/sort/sort-merge.pl \ - tests/sort/sort-merge-fdlimit.sh \ -@@ -590,6 +592,7 @@ all_tests = \ - tests/du/threshold.sh \ - tests/du/trailing-slash.sh \ - tests/du/two-args.sh \ -+ tests/expand/mb.sh \ - tests/id/gnu-zero-uids.sh \ - tests/id/no-context.sh \ - tests/id/context.sh \ -@@ -746,6 +749,7 @@ all_tests = \ - tests/touch/read-only.sh \ - tests/touch/relative.sh \ - tests/touch/trailing-slash.sh \ -+ tests/unexpand/mb.sh \ - $(all_root_tests) - - # See tests/factor/create-test.sh. -diff --git a/tests/misc/expand.pl b/tests/misc/expand.pl -index 11f3fc4..d609a2c 100755 ---- a/tests/misc/expand.pl -+++ b/tests/misc/expand.pl -@@ -27,6 +27,15 @@ my $prog = 'expand'; - # Turn off localization of executable's output. - @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; - -+#comment out next line to disable multibyte tests -+my $mb_locale = $ENV{LOCALE_FR_UTF8}; -+! defined $mb_locale || $mb_locale eq 'none' -+ and $mb_locale = 'C'; -+ -+my $prog = 'expand'; -+my $try = "Try \`$prog --help' for more information.\n"; -+my $inval = "$prog: invalid byte, character or field list\n$try"; -+ - my @Tests = - ( - ['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}], -@@ -168,6 +177,8 @@ my @Tests = - - - # Test errors -+ # FIXME: The following tests contain ‘quoting’ specific to LC_MESSAGES -+ # So we force LC_MESSAGES=C to make them pass. - ['e1', '--tabs="a"', {IN=>''}, {OUT=>''}, {EXIT=>1}, - {ERR => "$prog: tab size contains invalid character(s): 'a'\n"}], - ['e2', "-t $UINTMAX_OFLOW", {IN=>''}, {OUT=>''}, {EXIT=>1}, -@@ -184,6 +195,37 @@ my @Tests = - {ERR => "$prog: '/' specifier not at start of number: '/'\n"}], - ); - -+if ($mb_locale ne 'C') -+ { -+ # Duplicate each test vector, appending "-mb" to the test name and -+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we -+ # provide coverage for the distro-added multi-byte code paths. -+ my @new; -+ foreach my $t (@Tests) -+ { -+ my @new_t = @$t; -+ my $test_name = shift @new_t; -+ -+ # Depending on whether expand is multi-byte-patched, -+ # it emits different diagnostics: -+ # non-MB: invalid byte or field list -+ # MB: invalid byte, character or field list -+ # Adjust the expected error output accordingly. -+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval} -+ (@new_t)) -+ { -+ my $sub = {ERR_SUBST => 's/, character//'}; -+ push @new_t, $sub; -+ push @$t, $sub; -+ } -+ push @new, ["$test_name-mb", @new_t, {ENV => "LANG=$mb_locale LC_MESSAGES=C"}]; -+ } -+ push @Tests, @new; -+ } -+ -+ -+@Tests = triple_test \@Tests; -+ - my $save_temps = $ENV{DEBUG}; - my $verbose = $ENV{VERBOSE}; - -diff --git a/tests/misc/fold.pl b/tests/misc/fold.pl -index 00b4362..7d51bea 100755 ---- a/tests/misc/fold.pl -+++ b/tests/misc/fold.pl -@@ -20,9 +20,18 @@ use strict; - - (my $program_name = $0) =~ s|.*/||; - -+my $prog = 'fold'; -+my $try = "Try \`$prog --help' for more information.\n"; -+my $inval = "$prog: invalid byte, character or field list\n$try"; -+ - # Turn off localization of executable's output. - @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; - -+# uncommented to enable multibyte paths -+my $mb_locale = $ENV{LOCALE_FR_UTF8}; -+! defined $mb_locale || $mb_locale eq 'none' -+ and $mb_locale = 'C'; -+ - my @Tests = - ( - ['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}], -@@ -31,9 +40,48 @@ my @Tests = - ['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}], - ); - -+# Add _POSIX2_VERSION=199209 to the environment of each test -+# that uses an old-style option like +1. -+if ($mb_locale ne 'C') -+ { -+ # Duplicate each test vector, appending "-mb" to the test name and -+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we -+ # provide coverage for the distro-added multi-byte code paths. -+ my @new; -+ foreach my $t (@Tests) -+ { -+ my @new_t = @$t; -+ my $test_name = shift @new_t; -+ -+ # Depending on whether fold is multi-byte-patched, -+ # it emits different diagnostics: -+ # non-MB: invalid byte or field list -+ # MB: invalid byte, character or field list -+ # Adjust the expected error output accordingly. -+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval} -+ (@new_t)) -+ { -+ my $sub = {ERR_SUBST => 's/, character//'}; -+ push @new_t, $sub; -+ push @$t, $sub; -+ } -+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; -+ } -+ push @Tests, @new; -+ } -+ -+@Tests = triple_test \@Tests; -+ -+# Remember that triple_test creates from each test with exactly one "IN" -+# file two more tests (.p and .r suffix on name) corresponding to reading -+# input from a file and from a pipe. The pipe-reading test would fail -+# due to a race condition about 1 in 20 times. -+# Remove the IN_PIPE version of the "output-is-input" test above. -+# The others aren't susceptible because they have three inputs each. -+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests; -+ - my $save_temps = $ENV{DEBUG}; - my $verbose = $ENV{VERBOSE}; - --my $prog = 'fold'; - my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); - exit $fail; -diff --git a/tests/misc/sort-mb-tests.sh b/tests/misc/sort-mb-tests.sh -new file mode 100644 -index 0000000..11836ba ---- /dev/null -+++ b/tests/misc/sort-mb-tests.sh -@@ -0,0 +1,45 @@ -+#!/bin/sh -+# Verify sort's multi-byte support. -+ -+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src -+print_ver_ sort -+ -+export LC_ALL=en_US.UTF-8 -+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \ -+ || skip_ "No UTF-8 locale available" -+ -+ -+cat < exp -+Banana@5 -+Apple@10 -+Citrus@20 -+Cherry@30 -+EOF -+ -+cat < out || fail=1 -+Apple@10 -+Banana@5 -+Citrus@20 -+Cherry@30 -+EOF -+ -+compare exp out || { fail=1; cat out; } -+ -+ -+cat < exp -+Citrus@AA20@@5 -+Cherry@AA30@@10 -+Apple@AA10@@20 -+Banana@AA5@@30 -+EOF -+ -+cat < out || fail=1 -+Apple@AA10@@20 -+Banana@AA5@@30 -+Citrus@AA20@@5 -+Cherry@AA30@@10 -+EOF -+ -+compare exp out || { fail=1; cat out; } -+ -+Exit $fail -diff --git a/tests/misc/unexpand.pl b/tests/misc/unexpand.pl -index 76bcbd4..59eb819 100755 ---- a/tests/misc/unexpand.pl -+++ b/tests/misc/unexpand.pl -@@ -27,6 +27,14 @@ my $limits = getlimits (); - - my $prog = 'unexpand'; - -+# comment out next line to disable multibyte tests -+my $mb_locale = $ENV{LOCALE_FR_UTF8}; -+! defined $mb_locale || $mb_locale eq 'none' -+ and $mb_locale = 'C'; -+ -+my $try = "Try \`$prog --help' for more information.\n"; -+my $inval = "$prog: invalid byte, character or field list\n$try"; -+ - my @Tests = - ( - ['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}], -@@ -128,6 +136,37 @@ my @Tests = - ['ts2', '-t5,8', {IN=>"x\t \t y\n"}, {OUT=>"x\t\t y\n"}], - ); - -+if ($mb_locale ne 'C') -+ { -+ # Duplicate each test vector, appending "-mb" to the test name and -+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we -+ # provide coverage for the distro-added multi-byte code paths. -+ my @new; -+ foreach my $t (@Tests) -+ { -+ my @new_t = @$t; -+ my $test_name = shift @new_t; -+ -+ # Depending on whether unexpand is multi-byte-patched, -+ # it emits different diagnostics: -+ # non-MB: invalid byte or field list -+ # MB: invalid byte, character or field list -+ # Adjust the expected error output accordingly. -+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval} -+ (@new_t)) -+ { -+ my $sub = {ERR_SUBST => 's/, character//'}; -+ push @new_t, $sub; -+ push @$t, $sub; -+ } -+ next if ($test_name =~ 'b-1'); -+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; -+ } -+ push @Tests, @new; -+ } -+ -+@Tests = triple_test \@Tests; -+ - my $save_temps = $ENV{DEBUG}; - my $verbose = $ENV{VERBOSE}; - -diff --git a/tests/pr/pr-tests.pl b/tests/pr/pr-tests.pl -index 6b34e0b..34b4aeb 100755 ---- a/tests/pr/pr-tests.pl -+++ b/tests/pr/pr-tests.pl -@@ -24,6 +24,15 @@ use strict; - my $prog = 'pr'; - my $normalize_strerror = "s/': .*/'/"; - -+my $mb_locale; -+#Uncomment the following line to enable multibyte tests -+$mb_locale = $ENV{LOCALE_FR_UTF8}; -+! defined $mb_locale || $mb_locale eq 'none' -+ and $mb_locale = 'C'; -+ -+my $try = "Try \`$prog --help' for more information.\n"; -+my $inval = "$prog: invalid byte, character or field list\n$try"; -+ - my @tv = ( - - # -b option is no longer an official option. But it's still working to -@@ -515,8 +524,48 @@ push @Tests, - {IN=>"x\tx\tx\tx\tx\nx\tx\tx\tx\tx\n"}, - {OUT=>"x\tx\tx\tx\tx\tx\tx\tx\tx\tx\n"} ]; - -+# Add _POSIX2_VERSION=199209 to the environment of each test -+# that uses an old-style option like +1. -+if ($mb_locale ne 'C') -+ { -+ # Duplicate each test vector, appending "-mb" to the test name and -+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we -+ # provide coverage for the distro-added multi-byte code paths. -+ my @new; -+ foreach my $t (@Tests) -+ { -+ my @new_t = @$t; -+ my $test_name = shift @new_t; -+ -+ # Depending on whether pr is multi-byte-patched, -+ # it emits different diagnostics: -+ # non-MB: invalid byte or field list -+ # MB: invalid byte, character or field list -+ # Adjust the expected error output accordingly. -+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval} -+ (@new_t)) -+ { -+ my $sub = {ERR_SUBST => 's/, character//'}; -+ push @new_t, $sub; -+ push @$t, $sub; -+ } -+ #temporarily skip some failing tests -+ next if ($test_name =~ "col-0" or $test_name =~ "col-inval" or $test_name =~ "asan1"); -+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; -+ } -+ push @Tests, @new; -+ } -+ - @Tests = triple_test \@Tests; - -+# Remember that triple_test creates from each test with exactly one "IN" -+# file two more tests (.p and .r suffix on name) corresponding to reading -+# input from a file and from a pipe. The pipe-reading test would fail -+# due to a race condition about 1 in 20 times. -+# Remove the IN_PIPE version of the "output-is-input" test above. -+# The others aren't susceptible because they have three inputs each. -+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests; -+ - my $save_temps = $ENV{DEBUG}; - my $verbose = $ENV{VERBOSE}; - -diff --git a/tests/sort/sort-merge.pl b/tests/sort/sort-merge.pl -index 89eed0c..b855d73 100755 ---- a/tests/sort/sort-merge.pl -+++ b/tests/sort/sort-merge.pl -@@ -26,6 +26,15 @@ my $prog = 'sort'; - # Turn off localization of executable's output. - @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; - -+my $mb_locale; -+# uncommented according to upstream commit enabling multibyte paths -+$mb_locale = $ENV{LOCALE_FR_UTF8}; -+! defined $mb_locale || $mb_locale eq 'none' -+ and $mb_locale = 'C'; -+ -+my $try = "Try \`$prog --help' for more information.\n"; -+my $inval = "$prog: invalid byte, character or field list\n$try"; -+ - # three empty files and one that says 'foo' - my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}}); - -@@ -77,6 +86,39 @@ my @Tests = - {OUT=>$big_input}], - ); - -+# Add _POSIX2_VERSION=199209 to the environment of each test -+# that uses an old-style option like +1. -+if ($mb_locale ne 'C') -+ { -+ # Duplicate each test vector, appending "-mb" to the test name and -+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we -+ # provide coverage for the distro-added multi-byte code paths. -+ my @new; -+ foreach my $t (@Tests) -+ { -+ my @new_t = @$t; -+ my $test_name = shift @new_t; -+ -+ # Depending on whether sort is multi-byte-patched, -+ # it emits different diagnostics: -+ # non-MB: invalid byte or field list -+ # MB: invalid byte, character or field list -+ # Adjust the expected error output accordingly. -+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval} -+ (@new_t)) -+ { -+ my $sub = {ERR_SUBST => 's/, character//'}; -+ push @new_t, $sub; -+ push @$t, $sub; -+ } -+ next if ($test_name =~ "nmerge-."); -+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; -+ } -+ push @Tests, @new; -+ } -+ -+@Tests = triple_test \@Tests; -+ - my $save_temps = $ENV{DEBUG}; - my $verbose = $ENV{VERBOSE}; - -diff --git a/tests/sort/sort.pl b/tests/sort/sort.pl -index d49f65f..ebba925 100755 ---- a/tests/sort/sort.pl -+++ b/tests/sort/sort.pl -@@ -24,10 +24,15 @@ my $prog = 'sort'; - # Turn off localization of executable's output. - @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; - --my $mb_locale = $ENV{LOCALE_FR_UTF8}; -+my $mb_locale; -+#Comment out next line to disable multibyte tests -+$mb_locale = $ENV{LOCALE_FR_UTF8}; - ! defined $mb_locale || $mb_locale eq 'none' - and $mb_locale = 'C'; - -+my $try = "Try \`$prog --help' for more information.\n"; -+my $inval = "$prog: invalid byte, character or field list\n$try"; -+ - # Since each test is run with a file name and with redirected stdin, - # the name in the diagnostic is either the file name or "-". - # Normalize each diagnostic to use '-'. -@@ -423,6 +428,38 @@ foreach my $t (@Tests) - } - } - -+if ($mb_locale ne 'C') -+ { -+ # Duplicate each test vector, appending "-mb" to the test name and -+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we -+ # provide coverage for the distro-added multi-byte code paths. -+ my @new; -+ foreach my $t (@Tests) -+ { -+ my @new_t = @$t; -+ my $test_name = shift @new_t; -+ -+ # Depending on whether sort is multi-byte-patched, -+ # it emits different diagnostics: -+ # non-MB: invalid byte or field list -+ # MB: invalid byte, character or field list -+ # Adjust the expected error output accordingly. -+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval} -+ (@new_t)) -+ { -+ my $sub = {ERR_SUBST => 's/, character//'}; -+ push @new_t, $sub; -+ push @$t, $sub; -+ } -+ #disable several failing tests until investigation, disable all tests with envvars set -+ next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t)); -+ next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a"); -+ next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules. -+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; -+ } -+ push @Tests, @new; -+ } -+ - @Tests = triple_test \@Tests; - - # Remember that triple_test creates from each test with exactly one "IN" -@@ -432,6 +469,7 @@ foreach my $t (@Tests) - # Remove the IN_PIPE version of the "output-is-input" test above. - # The others aren't susceptible because they have three inputs each. - @Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests; -+@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests; - - my $save_temps = $ENV{DEBUG}; - my $verbose = $ENV{VERBOSE}; -diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh -new file mode 100644 -index 0000000..8a82d74 ---- /dev/null -+++ b/tests/unexpand/mb.sh -@@ -0,0 +1,172 @@ -+#!/bin/sh -+ -+# Copyright (C) 2012-2015 Free Software Foundation, Inc. -+ -+# This program is free software: you can redistribute it and/or modify -+# it under the terms of the GNU General Public License as published by -+# the Free Software Foundation, either version 3 of the License, or -+# (at your option) any later version. -+ -+# This program is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+# GNU General Public License for more details. -+ -+# You should have received a copy of the GNU General Public License -+# along with this program. If not, see . -+ -+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src -+print_ver_ unexpand -+ -+export LC_ALL=en_US.UTF-8 -+ -+#input containing multibyte characters -+cat > in <<\EOF -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+cat > exp <<\EOF -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+ -+#multiple files as an input -+cat >> exp <<\EOF -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+ -+unexpand -a ./in ./in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#test characters with a display width larger than 1 -+ -+env printf '12345678 -+e |ascii(1) -+\u00E9 |composed(1) -+e\u0301 |decomposed(1) -+\u3000 |ideo-space(2) -+\uFF0D |full-hypen(2) -+' > in || framework_failure_ -+ -+env printf '12345678 -+e\t|ascii(1) -+\u00E9\t|composed(1) -+e\u0301\t|decomposed(1) -+\u3000\t|ideo-space(2) -+\uFF0D\t|full-hypen(2) -+' > exp || framework_failure_ -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#test input where a blank of width > 1 is not being substituted -+in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')" -+exp='   ö ü ß' -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#non-Unicode characters interspersed between Unicode ones -+env printf '12345678 -+ \xFF| -+\xFF | -+ \xFFä| -+ä\xFF | -+ ä\xFF| -+\xFF ä| -+äbcdef\xFF | -+' > in || framework_failure_ -+ -+env printf '12345678 -+\t\xFF| -+\xFF\t| -+\t\xFFä| -+ä\xFF\t| -+\tä\xFF| -+\xFF\tä| -+äbcdef\xFF\t| -+' > exp || framework_failure_ -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#BOM header test 1 -+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ -+ -+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+unexpand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C unexpand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C unexpand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+ -+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+ -+unexpand in in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C unexpand in in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C unexpand in in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 --- -2.44.0 - diff --git a/backport-dd-don-t-trust-st_size-on-proc-files.patch b/backport-dd-don-t-trust-st_size-on-proc-files.patch deleted file mode 100644 index d28a5de700b37a767910d8463992f44144e7b301..0000000000000000000000000000000000000000 --- a/backport-dd-don-t-trust-st_size-on-proc-files.patch +++ /dev/null @@ -1,30 +0,0 @@ -From ac6b8d8224de140f5a6f2ca66e6ce279604a37e6 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 6 Apr 2024 15:15:04 -0700 -Subject: [PATCH] =?UTF-8?q?dd:=20don=E2=80=99t=20trust=20st=5Fsize=20on=20?= - =?UTF-8?q?/proc/files?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -* src/dd.c (skip): Don’t trust st_size == 0 ---- - src/dd.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/dd.c b/src/dd.c -index b50b841..a582111 100644 ---- a/src/dd.c -+++ b/src/dd.c -@@ -1813,7 +1813,7 @@ skip (int fdesc, char const *file, intmax_t records, idx_t blocksize, - struct stat st; - if (ifstat (STDIN_FILENO, &st) != 0) - error (EXIT_FAILURE, errno, _("cannot fstat %s"), quoteaf (file)); -- if (usable_st_size (&st) && 0 <= input_offset -+ if (usable_st_size (&st) && 0 < st.st_size && 0 <= input_offset - && st.st_size - input_offset < offset) - { - /* When skipping past EOF, return the number of _full_ blocks --- -2.27.0 - diff --git a/backport-head-fix-overflows-in-elide_tail_bytes_pipe.patch b/backport-head-fix-overflows-in-elide_tail_bytes_pipe.patch deleted file mode 100644 index 0a4d9a718b71c36686a442ac4d0ec9c47247ca90..0000000000000000000000000000000000000000 --- a/backport-head-fix-overflows-in-elide_tail_bytes_pipe.patch +++ /dev/null @@ -1,252 +0,0 @@ -From 8fe800a06e50be3c905ab1694a2d1bfd6e70be42 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 10 Aug 2024 22:19:17 -0700 -Subject: [PATCH] head: fix overflows in elide_tail_bytes_pipe -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Not clear that the overflows could be exploited, -but they made the code confusing. -* src/head.c (elide_tail_bytes_pipe): Don’t convert uintmax_t -to size_t first thing; wait until it’s known the value will fit, -and then use idx_t rather than size_t to prefer signed types. -Prefer idx_t in nearby code, too. -Rename locals n_elide_0 to n_elide (for consistency elsewhere) -and n_elide to in_elide. -Remove bogus (SIZE_MAX < n_elide + READ_BUFSIZE) test; -in the typical case where n_elide’s type was the same as -that of SIZE_MAX, the test never succeeded, and in the -less-common case where n_elide was wider than size_t, -the addition could silently overflow, causing the test -to fail when it should succeed. The test is not needed anyway now. -Add static asserts to document code assumptions. -Redo the ! (n_elide <= HEAD_TAIL_PIPE_BYTECOUNT_THRESHOLD) case -so that it works with enormous values of n_elide even on -32-bit platforms; for example, n_bufs is now uintmax_t not size_t. -Simplify by using xpalloc instead of by-hand code. -Remove bogus ‘if (rem)’ test, as rem is always nonzero. ---- - src/head.c | 129 ++++++++++++++++++++++++----------------------------- - 1 file changed, 58 insertions(+), 71 deletions(-) - -diff --git a/src/head.c b/src/head.c -index a9155c24c..9715b7b73 100644 ---- a/src/head.c -+++ b/src/head.c -@@ -237,17 +237,16 @@ elseek (int fd, off_t offset, int whence, char const *filename) - } - - /* For an input file with name FILENAME and descriptor FD, -- output all but the last N_ELIDE_0 bytes. -+ output all but the last N_ELIDE bytes. - If CURRENT_POS is nonnegative, assume that the input file is - positioned at CURRENT_POS and that it should be repositioned to - just before the elided bytes before returning. - Return true upon success. - Give a diagnostic and return false upon error. */ - static bool --elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, -+elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide, - off_t current_pos) - { -- size_t n_elide = n_elide_0; - uintmax_t desired_pos = current_pos; - bool ok = true; - -@@ -265,16 +264,9 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - #endif - - #if HEAD_TAIL_PIPE_BYTECOUNT_THRESHOLD < 2 * READ_BUFSIZE -- "HEAD_TAIL_PIPE_BYTECOUNT_THRESHOLD must be at least 2 * READ_BUFSIZE" -+# error "HEAD_TAIL_PIPE_BYTECOUNT_THRESHOLD must be at least 2 * READ_BUFSIZE" - #endif - -- if (SIZE_MAX < n_elide_0 + READ_BUFSIZE) -- { -- char umax_buf[INT_BUFSIZE_BOUND (n_elide_0)]; -- error (EXIT_FAILURE, 0, _("%s: number of bytes is too large"), -- umaxtostr (n_elide_0, umax_buf)); -- } -- - /* Two cases to consider... - 1) n_elide is small enough that we can afford to double-buffer: - allocate 2 * (READ_BUFSIZE + n_elide) bytes -@@ -286,11 +278,14 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - CAUTION: do not fail (out of memory) when asked to elide - a ridiculous amount, but when given only a small input. */ - -+ static_assert (READ_BUFSIZE <= IDX_MAX); -+ static_assert (HEAD_TAIL_PIPE_BYTECOUNT_THRESHOLD <= IDX_MAX - READ_BUFSIZE); - if (n_elide <= HEAD_TAIL_PIPE_BYTECOUNT_THRESHOLD) - { -+ idx_t in_elide = n_elide; - bool first = true; - bool eof = false; -- size_t n_to_read = READ_BUFSIZE + n_elide; -+ size_t n_to_read = READ_BUFSIZE + in_elide; - bool i; - char *b[2]; - b[0] = xnmalloc (2, n_to_read); -@@ -310,7 +305,7 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - } - - /* reached EOF */ -- if (n_read <= n_elide) -+ if (n_read <= in_elide) - { - if (first) - { -@@ -320,7 +315,7 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - } - else - { -- delta = n_elide - n_read; -+ delta = in_elide - n_read; - } - } - eof = true; -@@ -330,15 +325,15 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - the previous round. */ - if (! first) - { -- desired_pos += n_elide - delta; -- xwrite_stdout (b[!i] + READ_BUFSIZE, n_elide - delta); -+ desired_pos += in_elide - delta; -+ xwrite_stdout (b[!i] + READ_BUFSIZE, in_elide - delta); - } - first = false; - -- if (n_elide < n_read) -+ if (in_elide < n_read) - { -- desired_pos += n_read - n_elide; -- xwrite_stdout (b[i], n_read - n_elide); -+ desired_pos += n_read - in_elide; -+ xwrite_stdout (b[i], n_read - in_elide); - } - } - -@@ -350,31 +345,24 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - bytes. Then, for each new buffer we read, also write an old one. */ - - bool eof = false; -- size_t n_read; -- bool buffered_enough; -- size_t i, i_next; -+ idx_t n_read; - char **b = nullptr; -- /* Round n_elide up to a multiple of READ_BUFSIZE. */ -- size_t rem = READ_BUFSIZE - (n_elide % READ_BUFSIZE); -- size_t n_elide_round = n_elide + rem; -- size_t n_bufs = n_elide_round / READ_BUFSIZE + 1; -- size_t n_alloc = 0; -- size_t n_array_alloc = 0; -- -- buffered_enough = false; -+ -+ idx_t remainder = n_elide % READ_BUFSIZE; -+ /* The number of buffers needed to hold n_elide bytes plus one -+ extra buffer. They are allocated lazily, so don't report -+ overflow now simply because the number does not fit into idx_t. */ -+ uintmax_t n_bufs = n_elide / READ_BUFSIZE + (remainder != 0) + 1; -+ idx_t n_alloc = 0; -+ idx_t n_array_alloc = 0; -+ -+ bool buffered_enough = false; -+ idx_t i, i_next; - for (i = 0, i_next = 1; !eof; i = i_next, i_next = (i_next + 1) % n_bufs) - { - if (n_array_alloc == i) -- { -- /* reallocate between 16 and n_bufs entries. */ -- if (n_array_alloc == 0) -- n_array_alloc = MIN (n_bufs, 16); -- else if (n_array_alloc <= n_bufs / 2) -- n_array_alloc *= 2; -- else -- n_array_alloc = n_bufs; -- b = xnrealloc (b, n_array_alloc, sizeof *b); -- } -+ b = xpalloc (b, &n_array_alloc, 1, MIN (n_bufs, PTRDIFF_MAX), -+ sizeof *b); - - if (! buffered_enough) - { -@@ -403,43 +391,42 @@ elide_tail_bytes_pipe (char const *filename, int fd, uintmax_t n_elide_0, - } - } - -- /* Output any remainder: rem bytes from b[i] + n_read. */ -- if (rem) -+ /* Output the remainder: rem bytes from b[i] + n_read. */ -+ idx_t rem = READ_BUFSIZE - remainder; -+ if (buffered_enough) - { -- if (buffered_enough) -+ idx_t n_bytes_left_in_b_i = READ_BUFSIZE - n_read; -+ desired_pos += rem; -+ if (rem < n_bytes_left_in_b_i) - { -- size_t n_bytes_left_in_b_i = READ_BUFSIZE - n_read; -- desired_pos += rem; -- if (rem < n_bytes_left_in_b_i) -- { -- xwrite_stdout (b[i] + n_read, rem); -- } -- else -- { -- xwrite_stdout (b[i] + n_read, n_bytes_left_in_b_i); -- xwrite_stdout (b[i_next], rem - n_bytes_left_in_b_i); -- } -+ xwrite_stdout (b[i] + n_read, rem); - } -- else if (i + 1 == n_bufs) -+ else - { -- /* This happens when n_elide < file_size < n_elide_round. -- -- |READ_BUF.| -- | | rem | -- |---------!---------!---------!---------| -- |---- n_elide ---------| -- | | x | -- | |y | -- |---- file size -----------| -- | |n_read| -- |---- n_elide_round ----------| -- */ -- size_t y = READ_BUFSIZE - rem; -- size_t x = n_read - y; -- desired_pos += x; -- xwrite_stdout (b[i_next], x); -+ xwrite_stdout (b[i] + n_read, n_bytes_left_in_b_i); -+ xwrite_stdout (b[i_next], rem - n_bytes_left_in_b_i); - } - } -+ else if (i + 1 == n_bufs) -+ { -+ /* This happens when -+ n_elide < file_size < (n_bufs - 1) * READ_BUFSIZE. -+ -+ |READ_BUF.| -+ | | rem | -+ |---------!---------!---------!---------| -+ |---- n_elide----------| -+ | | x | -+ | |y | -+ |---- file size -----------| -+ | |n_read| -+ |(n_bufs - 1) * READ_BUFSIZE--| -+ */ -+ idx_t y = READ_BUFSIZE - rem; -+ idx_t x = n_read - y; -+ desired_pos += x; -+ xwrite_stdout (b[i_next], x); -+ } - - free_mem: - for (i = 0; i < n_alloc; i++) --- -2.33.0 - diff --git a/backport-head-off_t-not-uintmax_t-for-file-offset.patch b/backport-head-off_t-not-uintmax_t-for-file-offset.patch deleted file mode 100644 index 421a129d559c3e57bbf64118013f10b67db7f7da..0000000000000000000000000000000000000000 --- a/backport-head-off_t-not-uintmax_t-for-file-offset.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 0f9e2719e0dd2366f0381daa832f9415f3162af2 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 10 Aug 2024 18:55:09 -0700 -Subject: [PATCH] head: off_t not uintmax_t for file offset - -* src/head.c (elide_tail_lines_pipe): -Use off_t, not uintmax_t, for a local var that is -a file offset. ---- - src/head.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/head.c b/src/head.c -index 2795ae486..a9155c24c 100644 ---- a/src/head.c -+++ b/src/head.c -@@ -504,7 +504,7 @@ elide_tail_lines_pipe (char const *filename, int fd, uintmax_t n_elide, - size_t nlines; - struct linebuffer *next; - }; -- uintmax_t desired_pos = current_pos; -+ off_t desired_pos = current_pos; - typedef struct linebuffer LBUFFER; - LBUFFER *first, *last, *tmp; - size_t total_lines = 0; /* Total number of newlines in all buffers. */ --- -2.43.0 - diff --git a/backport-putenv-Don-t-crash-upon-out-of-memory.patch b/backport-putenv-Don-t-crash-upon-out-of-memory.patch deleted file mode 100644 index 13d132ea41b1d1a1e7f3606aafcdf87ad48f9c14..0000000000000000000000000000000000000000 --- a/backport-putenv-Don-t-crash-upon-out-of-memory.patch +++ /dev/null @@ -1,30 +0,0 @@ -From adb76c754290c328a88438af89e491ece7e6a9c5 Mon Sep 17 00:00:00 2001 -From: Bruno Haible -Date: Thu, 6 Jun 2024 02:24:44 +0200 -Subject: [PATCH] putenv: Don't crash upon out-of-memory. - -* lib/putenv.c (_unsetenv): Handle malloc failure. - -Reference:https://github.com/coreutils/gnulib/commit/adb76c754290c328a88438af89e491ece7e6a9c5 -Conflict:delete ChangeLog - ---- - lib/putenv.c | 2 ++ - 1 files changed, 2 insertions(+) - -diff --git a/lib/putenv.c b/lib/putenv.c -index 525d12ae..1d70717e 100644 ---- a/lib/putenv.c -+++ b/lib/putenv.c -@@ -92,6 +92,8 @@ _unsetenv (const char *name) - { - int putenv_result; - char *name_ = malloc (len + 2); -+ if (name_ == NULL) -+ return -1; - memcpy (name_, name, len); - name_[len] = '='; - name_[len + 1] = 0; --- -2.43.0 - diff --git a/backport-shuf-avoid-integer-overflow-on-huge-inputs.patch b/backport-shuf-avoid-integer-overflow-on-huge-inputs.patch deleted file mode 100644 index 7d02db5a9697788261c43def876cac5817a4e880..0000000000000000000000000000000000000000 --- a/backport-shuf-avoid-integer-overflow-on-huge-inputs.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 1ea7255f8b0661cdfabbd13f8f443f81665a07e0 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 3 Aug 2024 22:59:12 -0700 -Subject: [PATCH] shuf: avoid integer overflow on huge inputs - -* gl/lib/randperm.c: Include . -(randperm_bound): Return SIZE_MAX if the multiplication overflows. -Do not overflow when converting bit count to byte count. - -Reference:https://github.com/coreutils/coreutils/commit/1ea7255f8b0661cdfabbd13f8f443f81665a07e0 -Conflict:change gl/lib/randperm.c to lib/randperm.c; Adaptation to floor_lg() - ---- - gl/lib/randperm.c | 11 +++++++---- - 1 file changed, 7 insertions(+), 4 deletions(-) - -diff --git a/lib/randperm.c b/lib/randperm.c -index 50328cd9a..14a304524 100644 ---- a/lib/randperm.c -+++ b/lib/randperm.c -@@ -23,6 +23,7 @@ - - #include - #include -+#include - #include - - #include "attribute.h" - -@@ -39,13 +40,15 @@ randperm_bound (size_t h, size_t n) - { - /* Upper bound on number of bits needed to generate the first number - of the permutation. */ -- uintmax_t lg_n = floor_lg (n) + 1; -+ unsigned int lg_n = floor_lg (n) + 1; - -- /* Upper bound on number of bits needed to generated the first H elements. */ -- uintmax_t ar = lg_n * h; -+ /* Upper bound on number of bits needed to generate the first H elements. */ -+ uintmax_t ar; -+ if (ckd_mul (&ar, lg_n, h)) -+ return SIZE_MAX; - - /* Convert the bit count to a byte count. */ -- size_t bound = (ar + CHAR_BIT - 1) / CHAR_BIT; -+ size_t bound = ar / CHAR_BIT + (ar % CHAR_BIT != 0); - - return bound; - } --- -2.43.0 - diff --git a/backport-shuf-fix-randomness-bug.patch b/backport-shuf-fix-randomness-bug.patch deleted file mode 100644 index 565666ba7b0852fa89b692605be6d6b2e978e22c..0000000000000000000000000000000000000000 --- a/backport-shuf-fix-randomness-bug.patch +++ /dev/null @@ -1,40 +0,0 @@ -From bfbb3ec7f798b179d7fa7b42673e068b18048899 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 3 Aug 2024 22:31:20 -0700 -Subject: [PATCH] shuf: fix randomness bug - -Problem reported by Daniel Carpenter . -* gl/lib/randread.c (randread_new): Fill the ISAAC buffer -instead of storing at most BYTES_BOUND bytes into it. ---- - gl/lib/randread.c | 12 +++++++++++- - 1 files changed, 12 insertions(+), 1 deletion(-) - -diff --git a/lib/randread.c b/lib/randread.c -index cbee224bb..43c0cf09f 100644 ---- a/lib/randread.c -+++ b/lib/randread.c -@@ -189,9 +189,19 @@ randread_new (char const *name, size_t bytes_bound) - setvbuf (source, s->buf.c, _IOFBF, MIN (sizeof s->buf.c, bytes_bound)); - else - { -+ /* Fill the ISAAC buffer. Although it is tempting to read at -+ most BYTES_BOUND bytes, this is incorrect for two reasons. -+ First, BYTES_BOUND is just an estimate. -+ Second, even if the estimate is correct -+ ISAAC64 poorly randomizes when BYTES_BOUND is small -+ and just the first few bytes of s->buf.isaac.state.m -+ are random while the other bytes are all zero. See: -+ Aumasson J-P. On the pseudo-random generator ISAAC. -+ Cryptology ePrint Archive. 2006;438. -+ . */ - s->buf.isaac.buffered = 0; - if (! get_nonce (s->buf.isaac.state.m, -- MIN (sizeof s->buf.isaac.state.m, bytes_bound))) -+ sizeof s->buf.isaac.state.m)) - { - int e = errno; - randread_free_body (s); --- -2.43.0 - diff --git a/backport-sort-don-t-trust-st_size-on-proc-files.patch b/backport-sort-don-t-trust-st_size-on-proc-files.patch deleted file mode 100644 index 210774e28df5f49becc7006e141419193fe56020..0000000000000000000000000000000000000000 --- a/backport-sort-don-t-trust-st_size-on-proc-files.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 8ff3903281e03d36dd1aa2a202a56f38af726d91 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 6 Apr 2024 15:17:14 -0700 -Subject: [PATCH] =?UTF-8?q?sort:=20don=E2=80=99t=20trust=20st=5Fsize=20on?= - =?UTF-8?q?=20/proc=20files?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Problem and fix reported by Takashi Kusumi in: -https://bugs.gnu.org/70231 -* src/sort.c (sort_buffer_size): Don’t trust st_size == 0. ---- - src/sort.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/sort.c b/src/sort.c -index e779845..b983ca2 100644 ---- a/src/sort.c -+++ b/src/sort.c -@@ -1538,7 +1538,7 @@ sort_buffer_size (FILE *const *fps, size_t nfps, - != 0) - sort_die (_("stat failed"), files[i]); - -- if (S_ISREG (st.st_mode)) -+ if (usable_st_size (&st) && 0 < st.st_size) - file_size = st.st_size; - else - { --- -2.27.0 - diff --git a/backport-split-don-t-trust-st_size-on-proc-files.patch b/backport-split-don-t-trust-st_size-on-proc-files.patch deleted file mode 100644 index dc051b97b1e274308af9967e486221d3e74b79a9..0000000000000000000000000000000000000000 --- a/backport-split-don-t-trust-st_size-on-proc-files.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 11163675818ab877f20d3740a7c3e59d565b8e9c Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Sat, 6 Apr 2024 15:18:04 -0700 -Subject: [PATCH] =?UTF-8?q?split:=20don=E2=80=99t=20trust=20st=5Fsize=20on?= - =?UTF-8?q?=20/proc=20files?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -* src/split.c (create): Don’t trust st_size == 0. ---- - src/split.c | 6 ++---- - 1 file changed, 2 insertions(+), 4 deletions(-) - -diff --git a/src/split.c b/src/split.c -index 037960a59..f82a7f74b 100644 ---- a/src/split.c -+++ b/src/split.c -@@ -489,10 +489,8 @@ create (char const *name) - if (psame_inode (&in_stat_buf, &out_stat_buf)) - error (EXIT_FAILURE, 0, _("%s would overwrite input; aborting"), - quoteaf (name)); -- bool regularish -- = S_ISREG (out_stat_buf.st_mode) || S_TYPEISSHM (&out_stat_buf); -- if (! (regularish && out_stat_buf.st_size == 0) -- && ftruncate (fd, 0) < 0 && regularish) -+ if (ftruncate (fd, 0) < 0 -+ && (S_ISREG (out_stat_buf.st_mode) || S_TYPEISSHM (&out_stat_buf))) - error (EXIT_FAILURE, errno, _("%s: error truncating"), quotef (name)); - - return fd; --- -2.43.0 - diff --git a/backport-tail-avoid-infloop-with-c-on-dev-zero.patch b/backport-tail-avoid-infloop-with-c-on-dev-zero.patch deleted file mode 100644 index 6e8f62be1681019af614381cc45ed043cf0b0fe9..0000000000000000000000000000000000000000 --- a/backport-tail-avoid-infloop-with-c-on-dev-zero.patch +++ /dev/null @@ -1,84 +0,0 @@ -From fb543b6b82c1f3a20ff88f44cc3ed367bfe811b6 Mon Sep 17 00:00:00 2001 -From: Paul Eggert -Date: Fri, 19 Apr 2024 21:44:32 -0700 -Subject: [PATCH] tail: avoid infloop with -c on /dev/zero - -Problem reported by Ionut Nicula in: -https://bugs.gnu.org/70477 -* src/tail.c (tail_bytes): Do not loop forever on commands -like 'tail -c 4096 /dev/zero'. -* tests/tail/tail-c.sh: Test this fix. ---- - src/tail.c | 24 +++++++++++++++++++----- - tests/tail/tail-c.sh | 10 ++++++++++ - 2 files changed, 29 insertions(+), 5 deletions(-) - -diff --git a/src/tail.c b/src/tail.c -index 52c0810..a3b46ca 100644 ---- a/src/tail.c -+++ b/src/tail.c -@@ -760,7 +760,8 @@ free_lbuffers: - return ok; - } - --/* Print the last N_BYTES characters from the end of pipe FD. -+/* Print the last N_BYTES characters from the end of FD. -+ Work even if the input is a pipe. - This is a stripped down version of pipe_lines. - Return true if successful. */ - -@@ -1875,15 +1876,28 @@ tail_bytes (char const *pretty_filename, int fd, uintmax_t n_bytes, - { - off_t end_pos = -1; - off_t current_pos = -1; -+ bool copy_from_current_pos = false; - - if (! presume_input_pipe && n_bytes <= OFF_T_MAX) - { - if (usable_st_size (&stats)) -- end_pos = stats.st_size; -- else if ((current_pos = lseek (fd, -n_bytes, SEEK_END)) != -1) -- end_pos = current_pos + n_bytes; -+ { -+ /* Use st_size only if it's so large that this is -+ probably not a /proc or similar file, where st_size -+ is notional. */ -+ end_pos = stats.st_size; -+ off_t smallish_size = STP_BLKSIZE (&stats); -+ copy_from_current_pos = smallish_size < end_pos; -+ } -+ else -+ { -+ current_pos = lseek (fd, -n_bytes, SEEK_END); -+ copy_from_current_pos = current_pos != -1; -+ if (copy_from_current_pos) -+ end_pos = current_pos + n_bytes; -+ } - } -- if (end_pos <= (off_t) STP_BLKSIZE (&stats)) -+ if (! copy_from_current_pos) - return pipe_bytes (pretty_filename, fd, n_bytes, read_pos); - if (current_pos == -1) - current_pos = xlseek (fd, 0, SEEK_CUR, pretty_filename); -diff --git a/tests/tail/tail-c.sh b/tests/tail/tail-c.sh -index f518e5b..a9f2bc2 100755 ---- a/tests/tail/tail-c.sh -+++ b/tests/tail/tail-c.sh -@@ -35,4 +35,14 @@ printf '123456' | tail -c3 > out || fail=1 - printf '456' > exp || framework_failure_ - compare exp out || fail=1 - -+# Any part of /dev/zero should be valid for tail -c. -+head -c 4096 /dev/zero >exp || fail=1 -+tail -c 4096 /dev/zero >out || fail=1 -+compare exp out || fail=1 -+ -+# Any part of /dev/urandom, if it exists, should be valid for tail -c. -+if test -r /dev/urandom; then -+ timeout --verbose 1 tail -c 4096 /dev/urandom >/dev/null || fail=1 -+fi -+ - Exit $fail --- -2.43.0 - diff --git a/coreutils-9.5-gcc14-gnulib-lto.patch b/coreutils-9.5-gcc14-gnulib-lto.patch deleted file mode 100644 index cc4d60564e2bfd54252fa55d7ea4083d252f7691..0000000000000000000000000000000000000000 --- a/coreutils-9.5-gcc14-gnulib-lto.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 4602765093f04e597f87d78cf29d21eea03b6fa4 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?P=C3=A1draig=20Brady?= -Date: Wed, 28 Aug 2024 12:10:43 +0100 -Subject: avoid GCC -Wmaybe-uninitialized false positives with LTO - -Avoids false warnings with GCC 14.2.1 with -flto - -* lib/canonicalize.c: Initialize END_IDX. -* lib/getndelim2.c: Initialize C. ---- - ChangeLog | 8 ++++++++ - lib/canonicalize.c | 9 ++++++++- - lib/getndelim2.c | 8 +++++--- - 3 files changed, 21 insertions(+), 4 deletions(-) - ---- a/lib/canonicalize.c -+++ b/lib/canonicalize.c -@@ -34,6 +34,13 @@ - #include "hash-triple.h" - #include "xalloc.h" - -+/* Suppress bogus GCC -Wmaybe-uninitialized warnings. */ -+#if defined GCC_LINT || defined lint -+# define IF_LINT(Code) Code -+#else -+# define IF_LINT(Code) /* empty */ -+#endif -+ - #ifndef DOUBLE_SLASH_IS_DISTINCT_ROOT - # define DOUBLE_SLASH_IS_DISTINCT_ROOT false - #endif -@@ -367,7 +374,7 @@ canonicalize_filename_mode_stk (const char *name, canonicalize_mode_t can_mode, - buf[n] = '\0'; - - char *extra_buf = bufs->extra.data; -- idx_t end_idx; -+ idx_t end_idx IF_LINT (= 0); - if (end_in_extra_buffer) - end_idx = end - extra_buf; - size_t len = strlen (end); -diff --git a/lib/getndelim2.c b/lib/getndelim2.c -index 89989ae..db61e2a 100644 ---- a/lib/getndelim2.c -+++ b/lib/getndelim2.c -@@ -47,8 +47,10 @@ - #include "memchr2.h" - - /* Avoid false GCC warning "'c' may be used uninitialized". */ --#if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4 --# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -+#if defined GCC_LINT || defined lint -+# define IF_LINT(Code) Code -+#else -+# define IF_LINT(Code) /* empty */ - #endif - - /* The maximum value that getndelim2 can return without suffering from -@@ -102,7 +104,7 @@ getndelim2 { - /* Here always ptr + size == read_pos + nbytes_avail. - Also nbytes_avail > 0 || size < nmax. */ - -- int c; -+ int c IF_LINT (= EOF); - const char *buffer; - size_t buffer_len; diff --git a/coreutils-9.5.tar.xz b/coreutils-9.5.tar.xz deleted file mode 100644 index 3a6c9dba57057be00225ffa49c7840d166699801..0000000000000000000000000000000000000000 Binary files a/coreutils-9.5.tar.xz and /dev/null differ diff --git a/coreutils-9.5-sw.patch b/coreutils-9.6-sw.patch similarity index 84% rename from coreutils-9.5-sw.patch rename to coreutils-9.6-sw.patch index a43c1cda5cc2eb1aa1323c817414c32298f5cae3..9db13dfec66dd19979b7c80d80dee3252e09bc7b 100755 --- a/coreutils-9.5-sw.patch +++ b/coreutils-9.6-sw.patch @@ -1,6 +1,6 @@ -diff -ur coreutils-9.5.orig/build-aux/config.guess coreutils-9.5/build-aux/config.guess ---- coreutils-9.5.orig/build-aux/config.guess 2024-09-02 09:08:54.651545374 +0800 -+++ coreutils-9.5/build-aux/config.guess 2024-09-02 09:10:13.785486378 +0800 +diff -ur coreutils-9.6.orig/build-aux/config.guess coreutils-9.6/build-aux/config.guess +--- coreutils-9.6.orig/build-aux/config.guess 2024-09-02 09:08:54.651545374 +0800 ++++ coreutils-9.6/build-aux/config.guess 2024-09-02 09:10:13.785486378 +0800 @@ -1008,6 +1008,14 @@ UNAME_MACHINE=aarch64_be GUESS=$UNAME_MACHINE-unknown-linux-$LIBC @@ -16,20 +16,20 @@ diff -ur coreutils-9.5.orig/build-aux/config.guess coreutils-9.5/build-aux/confi alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in EV5) UNAME_MACHINE=alphaev5 ;; -diff -ur coreutils-9.5.orig/build-aux/config.sub coreutils-9.5/build-aux/config.sub ---- coreutils-9.5.orig/build-aux/config.sub 2024-09-02 09:08:54.652545386 +0800 -+++ coreutils-9.5/build-aux/config.sub 2024-09-02 09:10:31.238693918 +0800 -@@ -1183,6 +1183,7 @@ - | a29k \ - | aarch64 | aarch64_be | aarch64c | arm64ec \ +diff -ur coreutils-9.6.orig/build-aux/config.sub coreutils-9.6/build-aux/config.sub +--- coreutils-9.6.orig/build-aux/config.sub 2024-09-02 09:08:54.652545386 +0800 ++++ coreutils-9.6/build-aux/config.sub 2024-09-02 09:10:31.238693918 +0800 +@@ -1254,6 +1254,7 @@ + | aarch64_be \ + | aarch64c \ | abacus \ + | sw_64 \ - | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \ - | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \ - | alphapca5[67] | alpha64pca5[67] \ -diff -ur coreutils-9.5.orig/configure coreutils-9.5/configure ---- coreutils-9.5.orig/configure 2024-09-02 09:08:53.994537562 +0800 -+++ coreutils-9.5/configure 2024-09-02 09:12:45.750293437 +0800 + | alpha \ + | alpha64 \ + | alpha64ev56 \ +diff -ur coreutils-9.6.orig/configure coreutils-9.6/configure +--- coreutils-9.6.orig/configure 2024-09-02 09:08:53.994537562 +0800 ++++ coreutils-9.6/configure 2024-09-02 09:12:45.750293437 +0800 @@ -8769,6 +8769,12 @@ # (according to the test results of Bruno Haible's ieeefp/fenv_default.m4 # and the GCC 4.1.2 manual). @@ -61,9 +61,9 @@ diff -ur coreutils-9.5.orig/configure coreutils-9.5/configure gl_cv_host_cpu_c_abi_32bit=no ;; *) gl_cv_host_cpu_c_abi_32bit=unknown ;; -diff -ur coreutils-9.5.orig/lib/uname.c coreutils-9.5/lib/uname.c ---- coreutils-9.5.orig/lib/uname.c 2024-09-02 09:08:54.765546730 +0800 -+++ coreutils-9.5/lib/uname.c 2024-09-02 09:13:34.891877792 +0800 +diff -ur coreutils-9.6.orig/lib/uname.c coreutils-9.6/lib/uname.c +--- coreutils-9.6.orig/lib/uname.c 2024-09-02 09:08:54.765546730 +0800 ++++ coreutils-9.6/lib/uname.c 2024-09-02 09:13:34.891877792 +0800 @@ -228,6 +228,10 @@ case PROCESSOR_ARCHITECTURE_MIPS: strcpy (buf->machine, "mips"); @@ -75,9 +75,9 @@ diff -ur coreutils-9.5.orig/lib/uname.c coreutils-9.5/lib/uname.c case PROCESSOR_ARCHITECTURE_ALPHA: case PROCESSOR_ARCHITECTURE_ALPHA64: strcpy (buf->machine, "alpha"); -diff -ur coreutils-9.5.orig/m4/fpieee.m4 coreutils-9.5/m4/fpieee.m4 ---- coreutils-9.5.orig/m4/fpieee.m4 2024-09-02 09:08:54.055538287 +0800 -+++ coreutils-9.5/m4/fpieee.m4 2024-09-02 09:14:22.824447773 +0800 +diff -ur coreutils-9.6.orig/m4/fpieee.m4 coreutils-9.6/m4/fpieee.m4 +--- coreutils-9.6.orig/m4/fpieee.m4 2024-09-02 09:08:54.055538287 +0800 ++++ coreutils-9.6/m4/fpieee.m4 2024-09-02 09:14:22.824447773 +0800 @@ -30,6 +30,12 @@ # (according to the test results of Bruno Haible's ieeefp/fenv_default.m4 # and the GCC 4.1.2 manual). @@ -91,9 +91,9 @@ diff -ur coreutils-9.5.orig/m4/fpieee.m4 coreutils-9.5/m4/fpieee.m4 alpha*) # On Alpha systems, a compiler option provides the behaviour. # See the ieee(3) manual page, also available at -diff -ur coreutils-9.5.orig/m4/host-cpu-c-abi.m4 coreutils-9.5/m4/host-cpu-c-abi.m4 ---- coreutils-9.5.orig/m4/host-cpu-c-abi.m4 2024-09-02 09:08:54.075538525 +0800 -+++ coreutils-9.5/m4/host-cpu-c-abi.m4 2024-09-02 09:16:15.858791895 +0800 +diff -ur coreutils-9.6.orig/m4/host-cpu-c-abi.m4 coreutils-9.6/m4/host-cpu-c-abi.m4 +--- coreutils-9.6.orig/m4/host-cpu-c-abi.m4 2024-09-02 09:08:54.075538525 +0800 ++++ coreutils-9.6/m4/host-cpu-c-abi.m4 2024-09-02 09:16:15.858791895 +0800 @@ -91,6 +91,12 @@ ;; @@ -135,9 +135,9 @@ diff -ur coreutils-9.5.orig/m4/host-cpu-c-abi.m4 coreutils-9.5/m4/host-cpu-c-abi gl_cv_host_cpu_c_abi_32bit=no ;; *) gl_cv_host_cpu_c_abi_32bit=unknown ;; -diff -ur coreutils-9.5.orig/src/longlong.h coreutils-9.5/src/longlong.h ---- coreutils-9.5.orig/src/longlong.h 2024-09-02 09:08:53.933536836 +0800 -+++ coreutils-9.5/src/longlong.h 2024-09-02 09:17:29.550668195 +0800 +diff -ur coreutils-9.6.orig/src/longlong.h coreutils-9.6/src/longlong.h +--- coreutils-9.6.orig/src/longlong.h 2024-09-02 09:08:53.933536836 +0800 ++++ coreutils-9.6/src/longlong.h 2024-09-02 09:17:29.550668195 +0800 @@ -170,6 +170,92 @@ don't need to be under !NO_ASM */ #if ! defined (NO_ASM) diff --git a/coreutils-9.6.tar.xz b/coreutils-9.6.tar.xz new file mode 100644 index 0000000000000000000000000000000000000000..0d191eaa9bf9ed0ae9b1bf723eea51da4b4beb52 --- /dev/null +++ b/coreutils-9.6.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a0124327b398fd9eb1a6abde583389821422c744ffa10734b24f557610d3283 +size 6134764 diff --git a/coreutils.spec b/coreutils.spec index e792c2d369177d0379053146baebfbc9054d0f64..da9b7b52a66801e40284abad3fabb8f049ef15ea 100644 --- a/coreutils.spec +++ b/coreutils.spec @@ -1,7 +1,7 @@ Name: coreutils -Version: 9.5 -Release: 7 -License: GPLv3+ +Version: 9.6 +Release: 1 +License: GPL-3.0-or-later Summary: A set of basic GNU tools commonly used in shell scripts Url: https://www.gnu.org/software/coreutils/ Source0: https://ftp.gnu.org/gnu/%{name}/%{name}-%{version}.tar.xz @@ -17,23 +17,9 @@ Patch2: bugfix-remove-usr-local-lib-from-m4.patch Patch3: bugfix-dummy_help2man.patch Patch4: skip-the-tests-that-require-selinux-if-selinux-is-di.patch Patch5: backport-config-color-alias-for-ls.patch -Patch6: backport-coreutils-df-direct.patch -Patch7: backport-coreutils-i18n.patch Patch8: test-skip-overlay-filesystem-because-of-no-inotify_add_watch.patch -Patch9: coreutils-9.5-gcc14-gnulib-lto.patch -patch10: backport-sort-don-t-trust-st_size-on-proc-files.patch -patch11: backport-cat-don-t-trust-st_size-on-proc-files.patch -patch12: backport-dd-don-t-trust-st_size-on-proc-files.patch -patch13: backport-split-don-t-trust-st_size-on-proc-files.patch -patch14: backport-putenv-Don-t-crash-upon-out-of-memory.patch -patch15: backport-head-off_t-not-uintmax_t-for-file-offset.patch -patch16: backport-shuf-avoid-integer-overflow-on-huge-inputs.patch -patch17: backport-shuf-fix-randomness-bug.patch -patch18: backport-chroot-whoami-use-uintmax_t-for-printing-uids.patch -patch19: backport-tail-avoid-infloop-with-c-on-dev-zero.patch -patch20: backport-head-fix-overflows-in-elide_tail_bytes_pipe.patch - -Patch9001: coreutils-9.5-sw.patch + +Patch9001: coreutils-9.6-sw.patch Conflicts: filesystem < 3 @@ -52,8 +38,6 @@ BuildRequires: gettext-devel, gmp-devel, libacl-devel, libattr-devel BuildRequires: libcap-devel, libselinux-devel, libselinux-utils, openssl-devel tcl Requires: ncurses, gmp -Requires(preun): /sbin/install-info -Requires(post): /sbin/install-info Provides: coreutils-full = %{version}-%{release} Provides: fileutils = %{version}-%{release} @@ -151,7 +135,6 @@ popd %{_libexecdir}/coreutils/*.so %doc ABOUT-NLS NEWS README THANKS TODO %license COPYING -%exclude %{_infodir}/dir %config(noreplace) %{_sysconfdir}/profile.d/* %config(noreplace) %{_sysconfdir}/DIR_COLORS* @@ -161,6 +144,9 @@ popd %{_mandir}/man*/* %changelog +* Sat Jan 18 2025 Funda Wang - 9.6-1 +- update to 9.6 + * Thu Nov 28 2024 huyubiao - 9.5-7 - sync patches from community - add backport-head-fix-overflows-in-elide_tail_bytes_pipe.patch