| //===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// This file provides utility classes to convert between different character |
| /// encodings. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Support/TextEncoding.h" |
| #include "llvm/ADT/SmallString.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/StringExtras.h" |
| #include "llvm/Support/ConvertEBCDIC.h" |
| #include <system_error> |
| |
| #if HAVE_ICU |
| #include <unicode/ucnv.h> |
| #elif HAVE_ICONV |
| #include <iconv.h> |
| #endif |
| |
| using namespace llvm; |
| |
| // Normalize the charset name with the charset alias matching algorithm proposed |
| // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching. |
| static void normalizeCharSetName(StringRef CSName, |
| SmallVectorImpl<char> &Normalized) { |
| bool PrevDigit = false; |
| for (auto Ch : CSName) { |
| if (isAlnum(Ch)) { |
| Ch = toLower(Ch); |
| if (Ch != '0' || PrevDigit) { |
| PrevDigit = isDigit(Ch); |
| Normalized.push_back(Ch); |
| } |
| } |
| } |
| } |
| |
| // Maps the encoding name to enum constant if possible. |
| static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { |
| SmallString<16> Normalized; |
| normalizeCharSetName(Name, Normalized); |
| if (Normalized.equals("utf8")) |
| return TextEncoding::UTF8; |
| if (Normalized.equals("ibm1047")) |
| return TextEncoding::IBM1047; |
| return std::nullopt; |
| } |
| |
| LLVM_ATTRIBUTE_UNUSED static void |
| HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, |
| SmallVectorImpl<char> &Result) { |
| // No space left in output buffer. Double the size of the underlying |
| // memory in the SmallVectorImpl, adjust pointer and length and continue |
| // the conversion. |
| Capacity = |
| (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size(); |
| Result.resize(0); |
| Result.resize_for_overwrite(Capacity); |
| Output = static_cast<char *>(Result.data()); |
| OutputLength = Capacity; |
| } |
| |
| namespace { |
| enum ConversionType { |
| UTF8ToIBM1047, |
| IBM1047ToUTF8, |
| }; |
| |
| // Support conversion between EBCDIC 1047 and UTF-8. This class uses |
| // built-in translation tables that allow for translation between the |
| // aforementioned encodings. The use of tables for conversion is only |
| // possible because EBCDIC 1047 is a single-byte, stateless encoding; other |
| // encodings are not supported. |
| class TextEncodingConverterTable final |
| : public details::TextEncodingConverterImplBase { |
| const ConversionType ConvType; |
| |
| public: |
| TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {} |
| |
| std::error_code convertString(StringRef Source, |
| SmallVectorImpl<char> &Result) override; |
| |
| void reset() override {} |
| }; |
| |
| std::error_code |
| TextEncodingConverterTable::convertString(StringRef Source, |
| SmallVectorImpl<char> &Result) { |
| switch (ConvType) { |
| case IBM1047ToUTF8: |
| ConverterEBCDIC::convertToUTF8(Source, Result); |
| return std::error_code(); |
| case UTF8ToIBM1047: |
| return ConverterEBCDIC::convertToEBCDIC(Source, Result); |
| } |
| llvm_unreachable("Invalid ConvType!"); |
| return std::error_code(); |
| } |
| |
| #if HAVE_ICU |
| struct UConverterDeleter { |
| void operator()(UConverter *Converter) const { |
| if (Converter) |
| ucnv_close(Converter); |
| } |
| }; |
| using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>; |
| |
| class TextEncodingConverterICU final |
| : public details::TextEncodingConverterImplBase { |
| UConverterUniquePtr FromConvDesc; |
| UConverterUniquePtr ToConvDesc; |
| |
| public: |
| TextEncodingConverterICU(UConverterUniquePtr FromConverter, |
| UConverterUniquePtr ToConverter) |
| : FromConvDesc(std::move(FromConverter)), |
| ToConvDesc(std::move(ToConverter)) {} |
| |
| std::error_code convertString(StringRef Source, |
| SmallVectorImpl<char> &Result) override; |
| |
| void reset() override; |
| }; |
| |
| // TODO: The current implementation discards the partial result and restarts the |
| // conversion from the beginning if there is a conversion error due to |
| // insufficient buffer size. In the future, it would better to save the partial |
| // result and resume the conversion for the remaining string. |
| // TODO: Improve translation of ICU errors to error_code |
| std::error_code |
| TextEncodingConverterICU::convertString(StringRef Source, |
| SmallVectorImpl<char> &Result) { |
| // Setup the input in case it has no backing data. |
| size_t InputLength = Source.size(); |
| const char *In = InputLength ? const_cast<char *>(Source.data()) : ""; |
| |
| // Setup the output. We directly write into the SmallVector. |
| size_t Capacity = Result.capacity(); |
| size_t OutputLength = Capacity; |
| Result.resize_for_overwrite(Capacity); |
| char *Output; |
| UErrorCode EC = U_ZERO_ERROR; |
| |
| ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, |
| &EC); |
| ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, |
| NULL, &EC); |
| assert(U_SUCCESS(EC)); |
| |
| do { |
| EC = U_ZERO_ERROR; |
| const char *Input = In; |
| |
| Output = InputLength ? static_cast<char *>(Result.data()) : nullptr; |
| ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input, |
| In + InputLength, /*pivotStart=*/NULL, |
| /*pivotSource=*/NULL, /*pivotTarget=*/NULL, |
| /*pivotLimit=*/NULL, /*reset=*/true, |
| /*flush=*/true, &EC); |
| if (U_FAILURE(EC)) { |
| if (EC == U_BUFFER_OVERFLOW_ERROR) { |
| if (Capacity < Result.max_size()) { |
| HandleOverflow(Capacity, Output, OutputLength, Result); |
| continue; |
| } else |
| return std::error_code(E2BIG, std::generic_category()); |
| } |
| // Some other error occured. |
| Result.resize(Output - Result.data()); |
| return std::error_code(EILSEQ, std::generic_category()); |
| } |
| break; |
| } while (true); |
| |
| Result.resize(Output - Result.data()); |
| return std::error_code(); |
| } |
| |
| void TextEncodingConverterICU::reset() { |
| ucnv_reset(&*FromConvDesc); |
| ucnv_reset(&*ToConvDesc); |
| } |
| |
| #elif HAVE_ICONV |
| class TextEncodingConverterIconv final |
| : public details::TextEncodingConverterImplBase { |
| class UniqueIconvT { |
| iconv_t ConvDesc; |
| |
| public: |
| operator iconv_t() const { return ConvDesc; } |
| UniqueIconvT(iconv_t CD) : ConvDesc(CD) {} |
| ~UniqueIconvT() { |
| if (ConvDesc != (iconv_t)-1) { |
| iconv_close(ConvDesc); |
| ConvDesc = (iconv_t)-1; |
| } |
| } |
| UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) { |
| Other.ConvDesc = (iconv_t)-1; |
| } |
| UniqueIconvT &operator=(UniqueIconvT &&Other) { |
| if (&Other != this) { |
| ConvDesc = Other.ConvDesc; |
| Other.ConvDesc = (iconv_t)-1; |
| } |
| return *this; |
| } |
| }; |
| UniqueIconvT ConvDesc; |
| |
| public: |
| TextEncodingConverterIconv(UniqueIconvT ConvDesc) |
| : ConvDesc(std::move(ConvDesc)) {} |
| |
| std::error_code convertString(StringRef Source, |
| SmallVectorImpl<char> &Result) override; |
| |
| void reset() override; |
| }; |
| |
| // TODO: The current implementation discards the partial result and restarts the |
| // conversion from the beginning if there is a conversion error due to |
| // insufficient buffer size. In the future, it would better to save the partial |
| // result and resume the conversion for the remaining string. |
| std::error_code |
| TextEncodingConverterIconv::convertString(StringRef Source, |
| SmallVectorImpl<char> &Result) { |
| // Setup the output. We directly write into the SmallVector. |
| size_t Capacity = Result.capacity(); |
| char *Output = static_cast<char *>(Result.data()); |
| size_t OutputLength = Capacity; |
| Result.resize_for_overwrite(Capacity); |
| |
| size_t Ret; |
| // Handle errors returned from iconv(). |
| auto HandleError = [&Capacity, &Output, &OutputLength, &Result, |
| this](size_t Ret) { |
| if (Ret == static_cast<size_t>(-1)) { |
| // An error occured. Check if we can gracefully handle it. |
| if (errno == E2BIG && Capacity < Result.max_size()) { |
| HandleOverflow(Capacity, Output, OutputLength, Result); |
| // Reset converter |
| reset(); |
| return std::error_code(); |
| } else { |
| // Some other error occured. |
| Result.resize(Output - Result.data()); |
| return std::error_code(errno, std::generic_category()); |
| } |
| } else { |
| // A positive return value indicates that some characters were converted |
| // in a nonreversible way, that is, replaced with a SUB symbol. Returning |
| // an error in this case makes sure that both conversion routines behave |
| // in the same way. |
| return std::make_error_code(std::errc::illegal_byte_sequence); |
| } |
| }; |
| |
| do { |
| // Setup the input. Use nullptr to reset iconv state if input length is |
| // zero. |
| size_t InputLength = Source.size(); |
| char *Input = InputLength ? const_cast<char *>(Source.data()) : ""; |
| Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength); |
| if (Ret != 0) { |
| if (auto EC = HandleError(Ret)) |
| return EC; |
| continue; |
| } |
| // Flush the converter |
| Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength); |
| if (Ret != 0) { |
| if (auto EC = HandleError(Ret)) |
| return EC; |
| continue; |
| } |
| break; |
| } while (true); |
| |
| // Re-adjust size to actual size. |
| Result.resize(Output - Result.data()); |
| return std::error_code(); |
| } |
| |
| inline void TextEncodingConverterIconv::reset() { |
| iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr); |
| } |
| |
| #endif // HAVE_ICONV |
| } // namespace |
| |
| ErrorOr<TextEncodingConverter> |
| TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) { |
| |
| // Text encodings should be distinct. |
| if (CPFrom == CPTo) |
| return std::make_error_code(std::errc::invalid_argument); |
| |
| ConversionType Conversion; |
| if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047) |
| Conversion = UTF8ToIBM1047; |
| else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8) |
| Conversion = IBM1047ToUTF8; |
| else |
| return std::make_error_code(std::errc::invalid_argument); |
| |
| return TextEncodingConverter( |
| std::make_unique<TextEncodingConverterTable>(Conversion)); |
| } |
| |
| ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, |
| StringRef To) { |
| std::optional<TextEncoding> FromEncoding = getKnownEncoding(From); |
| std::optional<TextEncoding> ToEncoding = getKnownEncoding(To); |
| if (FromEncoding && ToEncoding) { |
| ErrorOr<TextEncodingConverter> Converter = |
| create(*FromEncoding, *ToEncoding); |
| if (Converter) |
| return Converter; |
| } |
| #if HAVE_ICU |
| UErrorCode EC = U_ZERO_ERROR; |
| UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC)); |
| if (U_FAILURE(EC)) |
| return std::make_error_code(std::errc::invalid_argument); |
| |
| UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC)); |
| if (U_FAILURE(EC)) |
| return std::make_error_code(std::errc::invalid_argument); |
| |
| auto Converter = std::make_unique<TextEncodingConverterICU>( |
| std::move(FromConvDesc), std::move(ToConvDesc)); |
| return TextEncodingConverter(std::move(Converter)); |
| #elif HAVE_ICONV |
| iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str()); |
| if (ConvDesc == (iconv_t)-1) |
| return std::make_error_code(std::errc::invalid_argument); |
| return TextEncodingConverter( |
| std::make_unique<TextEncodingConverterIconv>(ConvDesc)); |
| #else |
| return std::make_error_code(std::errc::invalid_argument); |
| #endif |
| } |