[libcxx] Fix using std::wcout/wcin on Windows with streams configured in wide mode On Windows, the underlying file descriptors for stdout/stdin/stderr can be reconfigured to wide mode. In the default (narrow) mode, the charset usually isn't utf8 (as libcxx assumes), but normally a locale specific codepage (where each codepage can only represent a small subset of unicode characters). By configuring the stdout file descriptor to wide mode, the user can output wchar_t based strings without conversion to the narrow charset. Within libcxx, don't try to use codecvt to convert this to a narrow character encoding, but output these strings as such with fputwc. In wide mode, such strings could be output directly with fwrite too, but if the file descriptor hasn't been configured in wide mode, that breaks the output (which currently works reasonably). By always outputting one character at a time with fputwc, it works regardless of the mode of the stdout file descriptor. For the narrow output stream, std::cout, outputting (via fwrite) does fail when the file descriptor is set to wide mode. This matches how it behaves with both MS STL and GNU libstdc++ too, so this is probably acceptable. This fixes https://github.com/llvm/llvm-project/issues/46646, and the downstream bugs https://github.com/mstorsjo/llvm-mingw/issues/145 and https://github.com/mstorsjo/llvm-mingw/issues/222. Differential Revision: https://reviews.llvm.org/D146398 GitOrigin-RevId: fcbbd9649ac165aaf7fc7d60b8fef3b23755179a

commit: 7f2ea8aa46cc32735c42cb9ae439af7042f12979 [log] [tgz]
author: Martin Storsjö <martin@martin.st> Wed Mar 15 12:11:28 2023 +0200
committer: Copybara-Service <copybara-worker@google.com> Sat Jun 03 13:16:23 2023 -0700
tree: 449bc01707e412b0532ea8625022a92d97a12d78
parent: 167b9ab17d2b2b6ebb4c7623bdde8873ba6650d4 [diff]
diff --git a/docs/UsingLibcxx.rst b/docs/UsingLibcxx.rst
index 3acb07b..703c324 100644
--- a/docs/UsingLibcxx.rst
+++ b/docs/UsingLibcxx.rst

@@ -552,3 +552,26 @@
 
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
+
+Platform specific behavior
+==========================
+
+Windows
+-------
+
+The ``stdout``, ``stderr``, and ``stdin`` file streams can be placed in
+Unicode mode by a suitable call to ``_setmode()``. When in this mode,
+the sequence of bytes read from, or written to, these streams is interpreted
+as a sequence of little-endian ``wchar_t`` elements. Thus, using
+``std::cout``, ``std::cerr``, or ``std::cin`` with streams in Unicode mode
+will not behave as it usually does since bytes read or written won't be
+interpreted as individual ``char`` elements. However, ``std::wcout``,
+``std::wcerr``, and ``std::wcin`` will behave as expected.
+
+Wide character streams such as ``std::wcin`` or ``std::wcout`` imbued with a
+locale behave differently than they otherwise do. By default, wide character
+streams don't convert wide characters but input/output them as is. If a
+specific locale is imbued, the IO with the underlying stream happens with
+regular ``char`` elements, which are converted to/from wide characters
+according to the locale. Note that this doesn't behave as expected if the
+stream has been set in Unicode mode.

diff --git a/src/std_stream.h b/src/std_stream.h
index 9c15d7d..0febf42 100644
--- a/src/std_stream.h
+++ b/src/std_stream.h

@@ -60,6 +60,12 @@
     bool __last_consumed_is_next_;
     bool __always_noconv_;
 
+#if defined(_LIBCPP_WIN32API)
+    static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>;
+#else
+    static constexpr bool __is_win32api_wide_char = false;
+#endif
+
     __stdinbuf(const __stdinbuf&);
     __stdinbuf& operator=(const __stdinbuf&);
 
@@ -74,6 +80,12 @@
       __last_consumed_is_next_(false)
 {
     imbue(this->getloc());
+    // On Windows, in wchar_t mode, ignore the codecvt from the locale by
+    // default and assume noconv; this passes wchar_t through unmodified from
+    // getwc. If the user sets a custom locale with imbue(), that gets honored,
+    // the IO is done with getc() and converted with the provided codecvt.
+    if constexpr (__is_win32api_wide_char)
+        __always_noconv_ = true;
 }
 
 template <class _CharT>
@@ -101,6 +113,36 @@
     return __getchar(true);
 }
 
+static bool __do_getc(FILE *__fp, char *__pbuf) {
+    int __c = getc(__fp);
+    if (__c == EOF)
+        return false;
+    *__pbuf = static_cast<char>(__c);
+    return true;
+}
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+static bool __do_getc(FILE *__fp, wchar_t *__pbuf) {
+    wint_t __c = getwc(__fp);
+    if (__c == WEOF)
+        return false;
+    *__pbuf = static_cast<wchar_t>(__c);
+    return true;
+}
+#endif
+
+static bool __do_ungetc(int __c, FILE *__fp, char __dummy) {
+    if (ungetc(__c, __fp) == EOF)
+        return false;
+    return true;
+}
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+static bool __do_ungetc(std::wint_t __c, FILE *__fp, wchar_t __dummy) {
+    if (ungetwc(__c, __fp) == WEOF)
+        return false;
+    return true;
+}
+#endif
+
 template <class _CharT>
 typename __stdinbuf<_CharT>::int_type
 __stdinbuf<_CharT>::__getchar(bool __consume)
@@ -115,6 +157,20 @@
         }
         return __result;
     }
+    if (__always_noconv_) {
+        char_type __1buf;
+        if (!__do_getc(__file_, &__1buf))
+            return traits_type::eof();
+        if (!__consume)
+        {
+            if (!__do_ungetc(traits_type::to_int_type(__1buf), __file_, __1buf))
+                return traits_type::eof();
+        }
+        else
+            __last_consumed_ = traits_type::to_int_type(__1buf);
+        return traits_type::to_int_type(__1buf);
+    }
+
     char __extbuf[__limit];
     int __nread = _VSTD::max(1, __encoding_);
     for (int __i = 0; __i < __nread; ++__i)
@@ -125,42 +181,37 @@
         __extbuf[__i] = static_cast<char>(__c);
     }
     char_type __1buf;
-    if (__always_noconv_)
-        __1buf = static_cast<char_type>(__extbuf[0]);
-    else
+    const char* __enxt;
+    char_type* __inxt;
+    codecvt_base::result __r;
+    do
     {
-        const char* __enxt;
-        char_type* __inxt;
-        codecvt_base::result __r;
-        do
+        state_type __sv_st = *__st_;
+        __r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt,
+                               &__1buf, &__1buf + 1, __inxt);
+        switch (__r)
         {
-            state_type __sv_st = *__st_;
-            __r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt,
-                                   &__1buf, &__1buf + 1, __inxt);
-            switch (__r)
-            {
-            case _VSTD::codecvt_base::ok:
-                break;
-            case codecvt_base::partial:
-                *__st_ = __sv_st;
-                if (__nread == sizeof(__extbuf))
-                    return traits_type::eof();
-                {
-                    int __c = getc(__file_);
-                    if (__c == EOF)
-                        return traits_type::eof();
-                    __extbuf[__nread] = static_cast<char>(__c);
-                }
-                ++__nread;
-                break;
-            case codecvt_base::error:
+        case _VSTD::codecvt_base::ok:
+            break;
+        case codecvt_base::partial:
+            *__st_ = __sv_st;
+            if (__nread == sizeof(__extbuf))
                 return traits_type::eof();
-            case _VSTD::codecvt_base::noconv:
-                __1buf = static_cast<char_type>(__extbuf[0]);
-                break;
+            {
+                int __c = getc(__file_);
+                if (__c == EOF)
+                    return traits_type::eof();
+                __extbuf[__nread] = static_cast<char>(__c);
             }
-        } while (__r == _VSTD::codecvt_base::partial);
-    }
+            ++__nread;
+            break;
+        case codecvt_base::error:
+            return traits_type::eof();
+        case _VSTD::codecvt_base::noconv:
+            __1buf = static_cast<char_type>(__extbuf[0]);
+            break;
+        }
+    } while (__r == _VSTD::codecvt_base::partial);
     if (!__consume)
     {
         for (int __i = __nread; __i > 0;)
@@ -188,8 +239,11 @@
         }
         return __c;
     }
-    if (__last_consumed_is_next_)
-    {
+    if (__always_noconv_ && __last_consumed_is_next_) {
+        if (!__do_ungetc(__last_consumed_, __file_,
+                         traits_type::to_char_type(__last_consumed_)))
+            return traits_type::eof();
+    } else if (__last_consumed_is_next_) {
         char __extbuf[__limit];
         char* __enxt;
         const char_type __ci = traits_type::to_char_type(__last_consumed_);
@@ -244,6 +298,12 @@
     state_type* __st_;
     bool __always_noconv_;
 
+#if defined(_LIBCPP_WIN32API)
+    static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>;
+#else
+    static constexpr bool __is_win32api_wide_char = false;
+#endif
+
     __stdoutbuf(const __stdoutbuf&);
     __stdoutbuf& operator=(const __stdoutbuf&);
 };
@@ -255,8 +315,31 @@
       __st_(__st),
       __always_noconv_(__cv_->always_noconv())
 {
+    // On Windows, in wchar_t mode, ignore the codecvt from the locale by
+    // default and assume noconv; this passes wchar_t through unmodified to
+    // fputwc, which handles it correctly depending on the actual mode of the
+    // output stream. If the user sets a custom locale with imbue(), that
+    // gets honored.
+    if constexpr (__is_win32api_wide_char)
+        __always_noconv_ = true;
 }
 
+static bool __do_fputc(char __c, FILE* __fp) {
+    if (fwrite(&__c, sizeof(__c), 1, __fp) != 1)
+        return false;
+    return true;
+}
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+static bool __do_fputc(wchar_t __c, FILE* __fp) {
+    // fputwc works regardless of wide/narrow mode of stdout, while
+    // fwrite of wchar_t only works if the stream actually has been set
+    // into wide mode.
+    if (fputwc(__c, __fp) == WEOF)
+        return false;
+    return true;
+}
+#endif
+
 template <class _CharT>
 typename __stdoutbuf<_CharT>::int_type
 __stdoutbuf<_CharT>::overflow(int_type __c)
@@ -268,7 +351,7 @@
         __1buf = traits_type::to_char_type(__c);
         if (__always_noconv_)
         {
-            if (fwrite(&__1buf, sizeof(char_type), 1, __file_) != 1)
+            if (!__do_fputc(__1buf, __file_))
                 return traits_type::eof();
         }
         else
@@ -313,7 +396,10 @@
 streamsize
 __stdoutbuf<_CharT>::xsputn(const char_type* __s, streamsize __n)
 {
-    if (__always_noconv_)
+    // For wchar_t on Windows, don't call fwrite(), but write characters one
+    // at a time with fputwc(); that works both when stdout is in the default
+    // mode and when it is set to Unicode mode.
+    if (__always_noconv_ && !__is_win32api_wide_char)
         return fwrite(__s, sizeof(char_type), __n, __file_);
     streamsize __i = 0;
     for (; __i < __n; ++__i, ++__s)

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh b/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh
new file mode 100644
index 0000000..7edd63b
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh

@@ -0,0 +1,5 @@
+# Check that the stderr of the executed program matches a reference file.
+program=${1}
+expected_file=${2}
+${program} 2>stderr.log >stdout.log
+cmp stderr.log "${expected_file}"

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh b/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh
new file mode 100644
index 0000000..996cae5
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh

@@ -0,0 +1,5 @@
+# Check that the stdout of the executed program matches a reference file.
+program=${1}
+expected_file=${2}
+${program} 2>stderr.log >stdout.log
+cmp stdout.log "${expected_file}"

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh b/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh
new file mode 100644
index 0000000..70a2a6f
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh

@@ -0,0 +1,4 @@
+# Pass a reference file as stdin to a test executable.
+program=${1}
+input=${2}
+cat ${input} | ${program}

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/test.dat b/test/std/input.output/iostream.objects/wide.stream.objects/test.dat
new file mode 100644
index 0000000..2b0e567
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/test.dat
Binary files differ

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-imbue.sh.cpp b/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-imbue.sh.cpp
new file mode 100644
index 0000000..bc57f82
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-imbue.sh.cpp

@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// ostream wcerr;
+
+// UNSUPPORTED: no-wide-characters
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: ../check-stderr.sh
+// RUN: %{build}
+// RUN: %{exec} bash check-stderr.sh "%t.exe" "zzzz"
+
+#include <iostream>
+
+struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {
+  using base = std::codecvt<wchar_t, char, std::mbstate_t>;
+protected:
+  result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end,
+                const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const {
+    while (from != from_end && to != to_end) {
+      ++from;
+      *to++ = 'z';
+    }
+    from_next = from;
+    to_next = to;
+    return ok;
+  }
+};
+
+int main(int, char**) {
+    std::locale loc(std::locale::classic(), new custom_codecvt);
+    std::wcerr.imbue(loc);
+    std::wcerr << L"1234";
+    return 0;
+}

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp b/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp
new file mode 100644
index 0000000..78bc6de
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp

@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// ostream wcerr;
+
+// UNSUPPORTED: no-wide-characters
+// REQUIRES: target={{.+}}-windows-{{.+}}
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: check-stderr.sh, test.dat
+// RUN: %{build}
+// RUN: %{exec} bash check-stderr.sh "%t.exe" "test.dat"
+
+// Check that wcerr works, preserving the unicode characters, after switching
+// stderr to wide mode.
+
+#include <iostream>
+#include <io.h>
+#include <fcntl.h>
+
+int main(int, char**) {
+    _setmode(_fileno(stderr), _O_WTEXT);
+    std::wcerr << L"1234\u20ac\u00e5\u00e4\u00f6";
+    return 0;
+}

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp b/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp
new file mode 100644
index 0000000..1e310a6
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp

@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// istream wcin;
+
+// UNSUPPORTED: no-wide-characters
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: ../send-stdin.sh
+// RUN: %{build}
+// RUN: %{exec} bash send-stdin.sh "%t.exe" "1234"
+
+#include <iostream>
+#include <cassert>
+
+struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {
+  using base = std::codecvt<wchar_t, char, std::mbstate_t>;
+protected:
+  result do_in(std::mbstate_t&, const char *from, const char *from_end,
+                const char *&from_next, wchar_t *to, wchar_t *to_end, wchar_t *&to_next) const {
+    while (from != from_end && to != to_end) {
+      ++from;
+      *to++ = L'z';
+    }
+    from_next = from;
+    to_next = to;
+    return ok;
+  }
+};
+
+int main(int, char**) {
+    std::locale loc(std::locale::classic(), new custom_codecvt);
+    std::wcin.imbue(loc);
+    std::wstring str;
+    std::wcin >> str;
+    assert(str == L"zzzz");
+    return 0;
+}

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp b/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp
new file mode 100644
index 0000000..b5a6c25
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp

@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// istream wcin;
+
+// UNSUPPORTED: no-wide-characters
+// REQUIRES: target={{.+}}-windows-{{.+}}
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: send-stdin.sh, test.dat
+// RUN: %{build}
+// RUN: %{exec} bash send-stdin.sh "%t.exe" "test.dat"
+
+// Check that wcin works, preserving the unicode characters, after switching
+// stdin to wide mode.
+
+#include <iostream>
+#include <cassert>
+#include <io.h>
+#include <fcntl.h>
+
+int main(int, char**) {
+    _setmode(_fileno(stdin), _O_WTEXT);
+    std::wstring str;
+    std::wcin >> str;
+    assert(str == L"1234\u20ac\u00e5\u00e4\u00f6");
+    return 0;
+}

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp b/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp
new file mode 100644
index 0000000..a0968ba
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp

@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// ostream wcout;
+
+// UNSUPPORTED: no-wide-characters
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: ../check-stdout.sh
+// RUN: %{build}
+// RUN: %{exec} bash check-stdout.sh "%t.exe" "zzzz"
+
+#include <iostream>
+
+struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {
+  using base = std::codecvt<wchar_t, char, std::mbstate_t>;
+protected:
+  result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end,
+                const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const {
+    while (from != from_end && to != to_end) {
+      ++from;
+      *to++ = 'z';
+    }
+    from_next = from;
+    to_next = to;
+    return ok;
+  }
+};
+
+int main(int, char**) {
+    std::locale loc(std::locale::classic(), new custom_codecvt);
+    std::wcout.imbue(loc);
+    std::wcout << L"1234";
+    return 0;
+}

diff --git a/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp b/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp
new file mode 100644
index 0000000..26c19dd
--- /dev/null
+++ b/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp

@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// ostream wcout;
+
+// UNSUPPORTED: no-wide-characters
+// REQUIRES: target={{.+}}-windows-{{.+}}
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: check-stdout.sh, test.dat
+// RUN: %{build}
+// RUN: %{exec} bash check-stdout.sh "%t.exe" "test.dat"
+
+// Check that wcout works, preserving the unicode characters, after switching
+// stdout to wide mode.
+
+#include <iostream>
+#include <io.h>
+#include <fcntl.h>
+
+int main(int, char**) {
+    _setmode(_fileno(stdout), _O_WTEXT);
+    std::wcout << L"1234\u20ac\u00e5\u00e4\u00f6";
+    return 0;
+}
commit	7f2ea8aa46cc32735c42cb9ae439af7042f12979	[log] [tgz]
author	Martin Storsjö <martin@martin.st>	Wed Mar 15 12:11:28 2023 +0200
committer	Copybara-Service <copybara-worker@google.com>	Sat Jun 03 13:16:23 2023 -0700
tree	449bc01707e412b0532ea8625022a92d97a12d78
parent	167b9ab17d2b2b6ebb4c7623bdde8873ba6650d4 [diff]