blob: f48ea625908e99d850b3b809be7a7a179e99e6ad [file] [log] [blame]
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// WARNING, this entire header is generated by
// utils/generate_indic_conjunct_break_table.py
// DO NOT MODIFY!
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.
#ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
#define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
#include <__algorithm/ranges_upper_bound.h>
#include <__config>
#include <__cstddef/ptrdiff_t.h>
#include <__iterator/access.h>
#include <cstdint>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER >= 20
namespace __indic_conjunct_break {
enum class __property : uint8_t {
  // Values generated from the data files. The numeric value of each
  // enumerator is what is stored in the two least significant bits of the
  // elements of __entries, so the values are spelled out explicitly.
  __Consonant = 0,
  __Extend    = 1,
  __Linker    = 2,

  // Sentinel: the code point has none of the above properties.
  __none = 3
};
/// The entries of the Indic conjunct break property table.
///
/// The data is generated from
/// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
///
/// Every 32-bit entry packs 3 values
/// - bits [0, 1] The property. One of the values generated from the data files
/// of \ref __property
/// - bits [2, 10] The size of the range.
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
/// the range is lower bound + size; the upper bound is part of the range.
///
/// \ref __get_property locates an entry with a binary search, therefore the
/// entries need to stay sorted in ascending order.
///
/// The 9 bits for the size allow a maximum range of 512 elements. Some ranges
/// in the Unicode tables are larger. They are stored in multiple consecutive
/// ranges in the data table. An alternative would be to store the sizes in a
/// separate 16-bit value. The original MSVC STL code had such an approach, but
/// this approach uses less space for the data and is about 4% faster in the
/// following benchmark.
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
// clang-format off
_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[403] = {
0x001801bd,
0x00241819,
0x002c88b1,
0x002df801,
0x002e0805,
0x002e2005,
0x002e3801,
0x00308029,
0x00325851,
0x00338001,
0x0036b019,
0x0036f815,
0x00373805,
0x0037500d,
0x00388801,
0x00398069,
0x003d3029,
0x003f5821,
0x003fe801,
0x0040b00d,
0x0040d821,
0x00412809,
0x00414811,
0x0042c809,
0x0044b821,
0x0046505d,
0x0047187d,
0x0048a890,
0x0049d001,
0x0049e001,
0x004a081d,
0x004a6802,
0x004a8819,
0x004ac01c,
0x004b1005,
0x004bc01c,
0x004c0801,
0x004ca84c,
0x004d5018,
0x004d9000,
0x004db00c,
0x004de001,
0x004df001,
0x004e080d,
0x004e6802,
0x004eb801,
0x004ee004,
0x004ef800,
0x004f1005,
0x004f8004,
0x004ff001,
0x00500805,
0x0051e001,
0x00520805,
0x00523805,
0x00525809,
0x00528801,
0x00538005,
0x0053a801,
0x00540805,
0x0054a84c,
0x00555018,
0x00559004,
0x0055a810,
0x0055e001,
0x00560811,
0x00563805,
0x00566802,
0x00571005,
0x0057c800,
0x0057d015,
0x00580801,
0x0058a84c,
0x00595018,
0x00599004,
0x0059a810,
0x0059e001,
0x0059f005,
0x005a080d,
0x005a6802,
0x005aa809,
0x005ae004,
0x005af800,
0x005b1005,
0x005b8800,
0x005c1001,
0x005df001,
0x005e0001,
0x005e6801,
0x005eb801,
0x00600001,
0x00602001,
0x0060a84c,
0x0061503c,
0x0061e001,
0x0061f009,
0x00623009,
0x00625009,
0x00626802,
0x0062a805,
0x0062c008,
0x00631005,
0x00640801,
0x0065e001,
0x0065f805,
0x00661001,
0x00663009,
0x0066500d,
0x0066a805,
0x00671005,
0x00680005,
0x0068a894,
0x0069d805,
0x0069f001,
0x006a080d,
0x006a6802,
0x006ab801,
0x006b1005,
0x006c0801,
0x006e5001,
0x006e7801,
0x006e9009,
0x006eb001,
0x006ef801,
0x00718801,
0x0071a019,
0x0072381d,
0x00758801,
0x0075a021,
0x00764019,
0x0078c005,
0x0079a801,
0x0079b801,
0x0079c801,
0x007b8835,
0x007c0011,
0x007c3005,
0x007c6829,
0x007cc88d,
0x007e3001,
0x0081680d,
0x00819015,
0x0081c805,
0x0081e805,
0x0082c005,
0x0082f009,
0x0083880d,
0x00841001,
0x00842805,
0x00846801,
0x0084e801,
0x009ae809,
0x00b8900d,
0x00b99009,
0x00ba9005,
0x00bb9005,
0x00bda005,
0x00bdb819,
0x00be3001,
0x00be4829,
0x00bee801,
0x00c05809,
0x00c07801,
0x00c42805,
0x00c54801,
0x00c90009,
0x00c93805,
0x00c99001,
0x00c9c809,
0x00d0b805,
0x00d0d801,
0x00d2b001,
0x00d2c019,
0x00d30001,
0x00d31001,
0x00d3281d,
0x00d39825,
0x00d3f801,
0x00d58079,
0x00d8000d,
0x00d9a025,
0x00da1009,
0x00db5821,
0x00dc0005,
0x00dd100d,
0x00dd4015,
0x00df3001,
0x00df4005,
0x00df6801,
0x00df7811,
0x00e1601d,
0x00e1b005,
0x00e68009,
0x00e6a031,
0x00e71019,
0x00e76801,
0x00e7a001,
0x00e7c005,
0x00ee00fd,
0x01006801,
0x01068081,
0x01677809,
0x016bf801,
0x016f007d,
0x01815015,
0x0184c805,
0x0533780d,
0x0533a025,
0x0534f005,
0x05378005,
0x05401001,
0x05403001,
0x05405801,
0x05412805,
0x05416001,
0x05462005,
0x05470045,
0x0547f801,
0x0549301d,
0x054a3829,
0x054a9801,
0x054c0009,
0x054d9801,
0x054db00d,
0x054de005,
0x054e0001,
0x054f2801,
0x05514815,
0x05518805,
0x0551a805,
0x05521801,
0x05526001,
0x0553e001,
0x05558001,
0x05559009,
0x0555b805,
0x0555f005,
0x05560801,
0x05576005,
0x0557b001,
0x055f2801,
0x055f4001,
0x055f6801,
0x07d8f001,
0x07f0003d,
0x07f1003d,
0x07fcf005,
0x080fe801,
0x08170001,
0x081bb011,
0x08500809,
0x08502805,
0x0850600d,
0x0851c009,
0x0851f801,
0x08572805,
0x0869200d,
0x086b4811,
0x08755805,
0x0877e00d,
0x087a3029,
0x087c100d,
0x08800801,
0x0881c039,
0x08838001,
0x08839805,
0x0883f809,
0x0885980d,
0x0885c805,
0x08861001,
0x08880009,
0x08893811,
0x0889681d,
0x088b9801,
0x088c0005,
0x088db021,
0x088e0001,
0x088e480d,
0x088e7801,
0x08917809,
0x0891a00d,
0x0891f001,
0x08920801,
0x0896f801,
0x0897181d,
0x08980005,
0x0899d805,
0x0899f001,
0x089a0001,
0x089a6801,
0x089ab801,
0x089b3019,
0x089b8011,
0x089dc001,
0x089dd815,
0x089e1001,
0x089e2801,
0x089e3809,
0x089e7009,
0x089e9001,
0x089f0805,
0x08a1c01d,
0x08a21009,
0x08a23001,
0x08a2f001,
0x08a58001,
0x08a59815,
0x08a5d001,
0x08a5e801,
0x08a5f805,
0x08a61005,
0x08ad7801,
0x08ad900d,
0x08ade005,
0x08adf805,
0x08aee005,
0x08b1981d,
0x08b1e801,
0x08b1f805,
0x08b55801,
0x08b56801,
0x08b5801d,
0x08b8e801,
0x08b8f801,
0x08b9100d,
0x08b93811,
0x08c17821,
0x08c1c805,
0x08c98001,
0x08c9d80d,
0x08ca1801,
0x08cea00d,
0x08ced005,
0x08cf0001,
0x08d00825,
0x08d19815,
0x08d1d80d,
0x08d23801,
0x08d28815,
0x08d2c809,
0x08d45031,
0x08d4c005,
0x08e18019,
0x08e1c015,
0x08e1f801,
0x08e49055,
0x08e55019,
0x08e59005,
0x08e5a805,
0x08e98815,
0x08e9d001,
0x08e9e005,
0x08e9f819,
0x08ea3801,
0x08ec8005,
0x08eca801,
0x08ecb801,
0x08f79805,
0x08f80005,
0x08f9b011,
0x08fa0009,
0x08fad001,
0x09a20001,
0x09a23839,
0x0b08f02d,
0x0b096809,
0x0b578011,
0x0b598019,
0x0b7a7801,
0x0b7c780d,
0x0b7f2001,
0x0b7f8005,
0x0de4e805,
0x0e7800b5,
0x0e798059,
0x0e8b2811,
0x0e8b6815,
0x0e8bd81d,
0x0e8c2819,
0x0e8d500d,
0x0e921009,
0x0ed000d9,
0x0ed1d8c5,
0x0ed3a801,
0x0ed42001,
0x0ed4d811,
0x0ed50839,
0x0f000019,
0x0f004041,
0x0f00d819,
0x0f011805,
0x0f013011,
0x0f047801,
0x0f098019,
0x0f157001,
0x0f17600d,
0x0f27600d,
0x0f2f7005,
0x0f468019,
0x0f4a2019,
0x0f9fd811,
0x7001017d,
0x700803bd};
// clang-format on
/// Returns the Indic conjunct break property of a code point.
///
/// \param __code_point The code point to look up.
/// \returns The property stored in the \ref __entries element whose range
///          contains \a __code_point, or \ref __property::__none when no
///          element contains it.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {
  // A code point can be anywhere inside a range, so the table is searched for
  // the first entry beyond the code point and the entry before that one is
  // the only candidate. That candidate is then validated against the upper
  // bound of its range.
  //
  // Every entry stores the property and the size below the lower bound code
  // point, so the search key can't simply be the shifted code point. Suppose
  // an entry has property 0, size 1, and lower bound 3; this is stored as
  // 0x1804. Searching for code point 3 with the key 0x1800 would find 0x1804
  // as the upper bound and step back to the entry before it, so a code point
  // equal to a lower bound would never be matched.
  // Setting all property and size bits in the key (0x1fff for code point 3)
  // makes the key compare greater than or equal to every entry sharing that
  // lower bound; the upper bound is then the entry after 0x1804 and stepping
  // back arrives at the correct entry.
  const auto __key                = (__code_point << 11) | 0x7ffu;
  const uint32_t* const __past_it = std::ranges::upper_bound(__entries, __key);

  // Every entry is beyond the key: the code point lies before the first range.
  if (__past_it == __entries)
    return __property::__none;

  // Decode the candidate entry.
  const uint32_t __entry       = *(__past_it - 1);
  const uint32_t __lower_bound = __entry >> 11;
  const uint32_t __size        = (__entry >> 2) & 0x1ffu;

  // The upper bound (lower bound + size) is part of the range.
  if (__code_point > __lower_bound + __size)
    return __property::__none;

  return static_cast<__property>(__entry & 0x3u);
}
} // namespace __indic_conjunct_break
#endif // _LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H