// blob: e45358ddc68c4809f69398bf0915af061368cc4e [file] [log] [blame]
//===-- Unittests for StringConverter class -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "hdr/errno_macros.h"
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/error_or.h"
#include "src/__support/macros/properties/os.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/string_converter.h"
#include "test/UnitTest/Test.h"
// TODO: add support for 16-bit widechars to StringConverter to remove this
// macro
#ifdef LIBC_TARGET_OS_IS_WINDOWS
TEST(LlvmLibcStringConverterTest, Windows) {
// Placeholder test: the real StringConverter tests in the #else branch are
// compiled out when LIBC_TARGET_OS_IS_WINDOWS is defined (see the TODO above
// about 16-bit widechars), so this empty body simply passes on Windows for now.
}
#else
TEST(LlvmLibcStringConverterTest, UTF8To32) {
  using Utf8Converter = LIBC_NAMESPACE::internal::StringConverter<char8_t>;

  // Source layout (UTF-8), followed by the implicit null terminator:
  //   bytes 0-3 : clown emoji (U+1F921)
  //   bytes 4-6 : sigma symbol (U+2211)
  //   bytes 7-8 : y with diaeresis (U+00FF)
  //   byte  9   : the letter A
  const char *utf8_text = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41";
  LIBC_NAMESPACE::internal::mbstate conversion_state;
  Utf8Converter converter(reinterpret_cast<const char8_t *>(utf8_text),
                          &conversion_state, SIZE_MAX);

  // Each successful pop yields one code point and moves the source index just
  // past the bytes that encoded it.
  struct Expectation {
    int code_point;
    int source_index;
  };
  const Expectation expectations[] = {
      {0x1f921, 4}, {0x2211, 7}, {0xff, 9}, {0x41, 10},
      {0, 11}, // null terminator
  };
  for (const Expectation &expected : expectations) {
    auto popped = converter.pop<char32_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected.code_point);
    ASSERT_EQ(static_cast<int>(converter.getSourceIndex()),
              expected.source_index);
  }

  // Popping after the null terminator fails with -1 and the source index stays
  // where it was.
  auto after_terminator = converter.pop<char32_t>();
  ASSERT_FALSE(after_terminator.has_value());
  ASSERT_EQ(after_terminator.error(), -1);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 11);
}
TEST(LlvmLibcStringConverterTest, UTF32To8) {
  using Utf32Converter = LIBC_NAMESPACE::internal::StringConverter<char32_t>;

  // Wide source string: clown emoji, sigma symbol, y with diaeresis, letter A,
  // then the null terminator.
  const wchar_t wide_text[] = {
      static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0x2211),
      static_cast<wchar_t>(0xff), static_cast<wchar_t>(0x41),
      static_cast<wchar_t>(0x0)};
  LIBC_NAMESPACE::internal::mbstate conversion_state;
  Utf32Converter converter(reinterpret_cast<const char32_t *>(wide_text),
                           &conversion_state, SIZE_MAX);

  // Expected UTF-8 output, one byte per pop. The source index counts wide
  // characters, so it is unchanged while the bytes of one character are being
  // popped.
  struct Expectation {
    int byte;
    int source_index;
  };
  const Expectation expectations[] = {
      // clown emoji (4 bytes)
      {0xF0, 1},
      {0x9F, 1},
      {0xA4, 1},
      {0xA1, 1},
      // sigma symbol (3 bytes)
      {0xE2, 2},
      {0x88, 2},
      {0x91, 2},
      // y with diaeresis (2 bytes)
      {0xC3, 3},
      {0xBF, 3},
      // letter A
      {0x41, 4},
      // null byte
      {0, 5},
  };
  for (const Expectation &expected : expectations) {
    auto popped = converter.pop<char8_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected.byte);
    ASSERT_EQ(static_cast<int>(converter.getSourceIndex()),
              expected.source_index);
  }

  // Nothing is left after the null byte: the pop fails with -1 and the source
  // index does not move.
  auto after_terminator = converter.pop<char8_t>();
  ASSERT_FALSE(after_terminator.has_value());
  ASSERT_EQ(after_terminator.error(), -1);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 5);
}
TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
  using Utf32Converter = LIBC_NAMESPACE::internal::StringConverter<char32_t>;

  // clown emoji, sigma symbol, null terminator
  const wchar_t wide_text[] = {static_cast<wchar_t>(0x1f921),
                               static_cast<wchar_t>(0x2211),
                               static_cast<wchar_t>(0x0)};
  LIBC_NAMESPACE::internal::mbstate conversion_state;
  // Source length is limited to a single wide character.
  Utf32Converter converter(reinterpret_cast<const char32_t *>(wide_text),
                           &conversion_state, SIZE_MAX, 1);

  // The four UTF-8 bytes of the clown emoji come out one pop at a time, with
  // the source index sitting at 1 throughout.
  const int emoji_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int expected_byte : emoji_bytes) {
    auto popped = converter.pop<char8_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected_byte);
    ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 1);
  }

  // Only one source character may be read, so the next pop must fail.
  auto beyond_limit = converter.pop<char8_t>();
  ASSERT_FALSE(beyond_limit.has_value());
  ASSERT_EQ(beyond_limit.error(), -1);
}
TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
  using Utf8Converter = LIBC_NAMESPACE::internal::StringConverter<char8_t>;

  // 4-byte clown emoji followed by the 3-byte sigma symbol.
  const char *utf8_text = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
  LIBC_NAMESPACE::internal::mbstate conversion_state;
  // Source length of 5 covers the whole emoji but only the first byte of the
  // sigma symbol.
  Utf8Converter converter(reinterpret_cast<const char8_t *>(utf8_text),
                          &conversion_state, SIZE_MAX, 5);

  // The emoji is fully available and converts normally.
  auto emoji = converter.pop<char32_t>();
  ASSERT_TRUE(emoji.has_value());
  ASSERT_EQ(static_cast<int>(emoji.value()), 0x1f921);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 4);

  // The sigma symbol is cut off by the source limit: the pop reports -1 and
  // the source index ends up at the limit (5).
  auto truncated = converter.pop<char32_t>();
  ASSERT_FALSE(truncated.has_value());
  ASSERT_EQ(static_cast<int>(truncated.error()), -1);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 5);
}
TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
  using Utf32Converter = LIBC_NAMESPACE::internal::StringConverter<char32_t>;

  // clown emoji, an invalid UTF-32 value, null terminator
  const wchar_t wide_text[] = {static_cast<wchar_t>(0x1f921),
                               static_cast<wchar_t>(0xffffff),
                               static_cast<wchar_t>(0x0)};
  LIBC_NAMESPACE::internal::mbstate conversion_state;
  Utf32Converter converter(reinterpret_cast<const char32_t *>(wide_text),
                           &conversion_state, SIZE_MAX);

  // The valid emoji converts to its four UTF-8 bytes first.
  const int emoji_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int expected_byte : emoji_bytes) {
    auto popped = converter.pop<char8_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected_byte);
    ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 1);
  }

  // The invalid value is rejected with EILSEQ and the source index is not
  // advanced past it.
  auto invalid = converter.pop<char8_t>();
  ASSERT_FALSE(invalid.has_value());
  ASSERT_EQ(static_cast<int>(invalid.error()), EILSEQ);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 1);
}
TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
  using Utf8Converter = LIBC_NAMESPACE::internal::StringConverter<char8_t>;

  // Bytes 0-3 encode the clown emoji; the following three bytes form an
  // invalid character.
  const char *utf8_text = "\xF0\x9F\xA4\xA1\x90\x88\x30";
  LIBC_NAMESPACE::internal::mbstate conversion_state;
  Utf8Converter converter(reinterpret_cast<const char8_t *>(utf8_text),
                          &conversion_state, SIZE_MAX);

  // The well-formed emoji converts normally.
  auto emoji = converter.pop<char32_t>();
  ASSERT_TRUE(emoji.has_value());
  ASSERT_EQ(static_cast<int>(emoji.value()), 0x1f921);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 4);

  // The malformed sequence is reported as EILSEQ and the source index stays
  // at the start of that sequence.
  auto invalid = converter.pop<char32_t>();
  ASSERT_FALSE(invalid.has_value());
  ASSERT_EQ(static_cast<int>(invalid.error()), EILSEQ);
  ASSERT_EQ(static_cast<int>(converter.getSourceIndex()), 4);
}
TEST(LlvmLibcStringConverterTest, InvalidCharacterOutsideBounds) {
  using MBState = LIBC_NAMESPACE::internal::mbstate;
  using Utf8Converter = LIBC_NAMESPACE::internal::StringConverter<char8_t>;
  using Utf32Converter = LIBC_NAMESPACE::internal::StringConverter<char32_t>;

  // When the source contains an invalid character but the destination has no
  // room left for it anyway, the converter should report the "stop converting"
  // error (-1) instead of the invalid character error (EILSEQ).

  // --- UTF-8 -> UTF-32, destination limited to one code point ---
  // Bytes 0-3 encode the clown emoji; the following three bytes form an
  // invalid character.
  const char *utf8_text = "\xF0\x9F\xA4\xA1\x90\x88\x30";
  MBState utf8_state;
  Utf8Converter utf8_converter(reinterpret_cast<const char8_t *>(utf8_text),
                               &utf8_state, 1);

  auto emoji = utf8_converter.pop<char32_t>();
  ASSERT_TRUE(emoji.has_value());
  ASSERT_EQ(static_cast<int>(emoji.value()), 0x1f921);
  ASSERT_EQ(static_cast<int>(utf8_converter.getSourceIndex()), 4);

  auto utf8_overflow = utf8_converter.pop<char32_t>();
  ASSERT_FALSE(utf8_overflow.has_value());
  // out of destination space (-1), NOT an invalid character (EILSEQ)
  ASSERT_EQ(static_cast<int>(utf8_overflow.error()), -1);
  ASSERT_EQ(static_cast<int>(utf8_converter.getSourceIndex()), 4);

  // --- UTF-32 -> UTF-8, destination limited to four bytes ---
  // clown emoji, an invalid UTF-32 value, null terminator
  const wchar_t wide_text[] = {static_cast<wchar_t>(0x1f921),
                               static_cast<wchar_t>(0xffffff),
                               static_cast<wchar_t>(0x0)};
  MBState utf32_state;
  Utf32Converter utf32_converter(
      reinterpret_cast<const char32_t *>(wide_text), &utf32_state, 4);

  // The emoji's four bytes exactly fill the destination.
  const int emoji_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int expected_byte : emoji_bytes) {
    auto popped = utf32_converter.pop<char8_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected_byte);
    ASSERT_EQ(static_cast<int>(utf32_converter.getSourceIndex()), 1);
  }

  auto utf32_overflow = utf32_converter.pop<char8_t>();
  ASSERT_FALSE(utf32_overflow.has_value());
  // out of destination space (-1), NOT an invalid character (EILSEQ)
  ASSERT_EQ(static_cast<int>(utf32_overflow.error()), -1);
  ASSERT_EQ(static_cast<int>(utf32_converter.getSourceIndex()), 1);
}
TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
  using Utf32Converter = LIBC_NAMESPACE::internal::StringConverter<char32_t>;

  // Note: handing over to a second StringConverter in the middle of a
  // partially popped character is deliberately NOT tested. That is not
  // expected to work and is considered invalid usage.

  // clown emoji, y with diaeresis, null terminator
  const wchar_t wide_text[] = {static_cast<wchar_t>(0x1f921),
                               static_cast<wchar_t>(0xff),
                               static_cast<wchar_t>(0x0)};
  const char32_t *source = reinterpret_cast<const char32_t *>(wide_text);
  LIBC_NAMESPACE::internal::mbstate shared_state;

  // The first converter may read a single wide character: the emoji.
  Utf32Converter first(source, &shared_state, SIZE_MAX, 1);
  const int emoji_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int expected_byte : emoji_bytes) {
    auto popped = first.pop<char8_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected_byte);
    ASSERT_EQ(static_cast<int>(first.getSourceIndex()), 1);
  }

  // The second converter shares the mbstate and starts at the position the
  // first one stopped at, so it continues with the next character.
  Utf32Converter second(source + first.getSourceIndex(), &shared_state,
                        SIZE_MAX, 1);
  const int diaeresis_bytes[] = {0xC3, 0xBF};
  for (int expected_byte : diaeresis_bytes) {
    auto popped = second.pop<char8_t>();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), expected_byte);
    ASSERT_EQ(static_cast<int>(second.getSourceIndex()), 1);
  }
}
TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
  using Utf8Converter = LIBC_NAMESPACE::internal::StringConverter<char8_t>;

  const char *utf8_text = "\xF0\x9F\xA4\xA1"; // clown emoji
  const char8_t *source = reinterpret_cast<const char8_t *>(utf8_text);
  LIBC_NAMESPACE::internal::mbstate shared_state;

  // The first converter sees only the first two bytes of the 4-byte emoji, so
  // it cannot produce a character: it reports -1 after consuming both bytes.
  Utf8Converter first(source, &shared_state, SIZE_MAX, 2);
  auto incomplete = first.pop<char32_t>();
  ASSERT_FALSE(incomplete.has_value());
  ASSERT_EQ(static_cast<int>(incomplete.error()), -1);
  ASSERT_EQ(static_cast<int>(first.getSourceIndex()), 2);

  // The second converter shares the mbstate and starts at the position the
  // first one stopped at, so it finishes the same character.
  Utf8Converter second(source + first.getSourceIndex(), &shared_state,
                       SIZE_MAX, 3);
  auto emoji = second.pop<char32_t>();
  ASSERT_TRUE(emoji.has_value());
  ASSERT_EQ(static_cast<int>(emoji.value()), 0x1f921);
  ASSERT_EQ(static_cast<int>(second.getSourceIndex()), 2);

  // The remaining byte within the source limit is the null terminator.
  auto terminator = second.pop<char32_t>();
  ASSERT_TRUE(terminator.has_value());
  ASSERT_EQ(static_cast<int>(terminator.value()), 0);
  ASSERT_EQ(static_cast<int>(second.getSourceIndex()), 3);
}
TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
  // Checks that the destination limit caps the number of code points that can
  // be popped, even when more valid input is available.
  const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
  LIBC_NAMESPACE::internal::mbstate state;
  // Destination limit of 1: room for exactly one UTF-32 code point.
  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
      reinterpret_cast<const char8_t *>(src), &state, 1);

  // The first emoji fits; also verify the decoded value, not just success.
  auto res = sc.pop<char32_t>();
  ASSERT_TRUE(res.has_value());
  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);

  // No space to pop the second emoji into: expect the "stop converting"
  // error (-1) rather than a conversion error, with the source index left at
  // the start of the unconverted character.
  res = sc.pop<char32_t>();
  ASSERT_FALSE(res.has_value());
  ASSERT_EQ(static_cast<int>(res.error()), -1);
  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
}
TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
  // Checks that a character which does not fit entirely in the remaining
  // destination space is not emitted at all.
  // 2 clown emojis; null-terminated like the other wide sources in this file.
  const wchar_t src[] = {static_cast<wchar_t>(0x1f921),
                         static_cast<wchar_t>(0x1f921),
                         static_cast<wchar_t>(0x0)};
  LIBC_NAMESPACE::internal::mbstate state;
  // Destination limit of 5 bytes: the first emoji needs 4, leaving 1 byte,
  // which is not enough for the 4-byte second emoji.
  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
      reinterpret_cast<const char32_t *>(src), &state, 5);

  // The first emoji is emitted byte by byte; verify the bytes themselves as
  // well as the source index (which counts wide characters).
  const int first_emoji_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int expected_byte : first_emoji_bytes) {
    auto res = sc.pop<char8_t>();
    ASSERT_TRUE(res.has_value());
    ASSERT_EQ(static_cast<int>(res.value()), expected_byte);
    ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
  }

  // The second emoji does not fit in the single remaining byte: expect the
  // "stop converting" error (-1) and an unchanged source index.
  auto res = sc.pop<char8_t>();
  ASSERT_FALSE(res.has_value());
  ASSERT_EQ(static_cast<int>(res.error()), -1);
  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
}
#endif