|  | #!/usr/bin/env python | 
|  | # ===----------------------------------------------------------------------===## | 
|  | # | 
|  | # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | # See https://llvm.org/LICENSE.txt for license information. | 
|  | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | # | 
|  | # ===----------------------------------------------------------------------===## | 
|  |  | 
|  | # The code is based on | 
|  | # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py | 
|  | # | 
|  | # Copyright (c) Microsoft Corporation. | 
|  | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  |  | 
|  | from io import StringIO | 
|  | from pathlib import Path | 
|  | from dataclasses import dataclass | 
|  | from typing import Optional | 
|  | import re | 
|  | import sys | 
|  |  | 
|  |  | 
|  | @dataclass | 
|  | class PropertyRange: | 
|  | lower: int = -1 | 
|  | upper: int = -1 | 
|  | prop: str = None | 
|  |  | 
|  |  | 
|  | @dataclass | 
|  | class Entry: | 
|  | lower: int = -1 | 
|  | offset: int = -1 | 
|  |  | 
|  |  | 
|  | LINE_REGEX = re.compile( | 
|  | r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)" | 
|  | ) | 
|  |  | 
|  |  | 
|  | def filterProperty(element: PropertyRange) -> Optional[PropertyRange]: | 
|  | ### Matches property predicate? | 
|  | if element.prop in ["W", "F"]: | 
|  | return element | 
|  |  | 
|  | ### Matches hardcode ranges predicate? | 
|  |  | 
|  | # Yijing Hexagram Symbols | 
|  | if element.lower >= 0x4DC0 and element.upper <= 0x4DFF: | 
|  | return element | 
|  |  | 
|  | # Miscellaneous Symbols and Pictographs | 
|  | if element.lower >= 0x1F300 and element.upper <= 0x1F5FF: | 
|  | return element | 
|  |  | 
|  | # Supplemental Symbols and Pictographs | 
|  | if element.lower >= 0x1F900 and element.upper <= 0x1F9FF: | 
|  | return element | 
|  |  | 
|  | return None | 
|  |  | 
|  |  | 
|  | def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: | 
|  | result = PropertyRange() | 
|  | if m := LINE_REGEX.match(inputLine): | 
|  | lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") | 
|  | result.lower = int(lower_str, base=16) | 
|  | result.upper = result.lower | 
|  | if upper_str is not None: | 
|  | result.upper = int(upper_str, base=16) | 
|  | return result | 
|  |  | 
|  | else: | 
|  | return None | 
|  |  | 
|  |  | 
|  | def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: | 
|  | """ | 
|  | Merges overlapping and consecutive ranges to one range. | 
|  |  | 
|  | Since the input properties are filtered the exact property isn't | 
|  | interesting anymore. The properties in the output are merged to aid | 
|  | debugging. | 
|  | Merging the ranges results in fewer ranges in the output table, | 
|  | reducing binary and improving lookup performance. | 
|  | """ | 
|  | result = list() | 
|  | for x in input: | 
|  | if ( | 
|  | len(result) | 
|  | and x.lower > result[-1].lower | 
|  | and x.lower <= result[-1].upper + 1 | 
|  | ): | 
|  | result[-1].upper = max(result[-1].upper, x.upper) | 
|  | result[-1].prop += f" {x.prop}" | 
|  | continue | 
|  | result.append(x) | 
|  | return result | 
|  |  | 
|  |  | 
|  | DATA_ARRAY_TEMPLATE = r""" | 
|  | /// The entries of the characters with an estimated width of 2. | 
|  | /// | 
|  | /// Contains the entries for [format.string.std]/12 | 
|  | ///  -  Any code point with the East_Asian_Width="W" or East_Asian_Width="F" | 
|  | ///     Derived Extracted Property as described by UAX #44 | 
|  | /// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols) | 
|  | /// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs) | 
|  | /// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs) | 
|  | /// | 
|  | /// The data is generated from | 
|  | /// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt | 
|  | /// - The "overrides" in [format.string.std]/12 | 
|  | /// | 
|  | /// The format of EastAsianWidth.txt is two fields separated by a semicolon. | 
|  | /// Field 0: Unicode code point value or range of code point values | 
|  | /// Field 1: East_Asian_Width property, consisting of one of the following values: | 
|  | ///         "A", "F", "H", "N", "Na", "W" | 
|  | ///  - All code points, assigned or unassigned, that are not listed | 
|  | ///      explicitly are given the value "N". | 
|  | ///  - The unassigned code points in the following blocks default to "W": | 
|  | ///         CJK Unified Ideographs Extension A: U+3400..U+4DBF | 
|  | ///         CJK Unified Ideographs:             U+4E00..U+9FFF | 
|  | ///         CJK Compatibility Ideographs:       U+F900..U+FAFF | 
|  | ///  - All undesignated code points in Planes 2 and 3, whether inside or | 
|  | ///      outside of allocated blocks, default to "W": | 
|  | ///         Plane 2:                            U+20000..U+2FFFD | 
|  | ///         Plane 3:                            U+30000..U+3FFFD | 
|  | /// | 
|  | /// The table is similar to the table | 
|  | ///  __extended_grapheme_custer_property_boundary::__entries | 
|  | /// which explains the details of these classes. The only difference is this | 
|  | /// table lacks a property, thus having more bits available for the size. | 
|  | /// | 
|  | /// The maximum code point that has an estimated width of 2 is U+3FFFD. This | 
|  | /// value can be encoded in 18 bits. Thus the upper 3 bits of the code point | 
|  | /// are always 0. These 3 bits are used to enlarge the offset range. This | 
|  | /// optimization reduces the table in Unicode 15 from 184 to 104 entries, | 
|  | /// saving 320 bytes. | 
|  | /// | 
|  | /// The data has 2 values: | 
|  | /// - bits [0, 13] The size of the range, allowing 16384 elements. | 
|  | /// - bits [14, 31] The lower bound code point of the range. The upper bound of | 
|  | ///   the range is lower bound + size. | 
|  | _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{ | 
|  | {entries}}}; | 
|  |  | 
|  | /// The upper bound entry of EastAsianWidth.txt. | 
|  | /// | 
|  | /// Values greater than this value may have more than 18 significant bits. | 
|  | /// They always have a width of 1. This property makes it possible to store | 
|  | /// the table in its compact form. | 
|  | inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x}; | 
|  |  | 
|  | /// Returns the estimated width of a Unicode code point. | 
|  | /// | 
|  | /// \\pre The code point is a valid Unicode code point. | 
|  | [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{ | 
|  | // Since __table_upper_bound contains the unshifted range do the | 
|  | // comparison without shifting. | 
|  | if (__code_point > __table_upper_bound) [[unlikely]] | 
|  | return 1; | 
|  |  | 
|  | // When the code-point is less than the first element in the table | 
|  | // the lookup is quite expensive. Since quite some scripts are in | 
|  | // that range, it makes sense to validate that first. | 
|  | // The std_format_spec_string_unicode benchmark gives a measurable | 
|  | // improvement. | 
|  | if (__code_point < (__entries[0] >> 14)) | 
|  | return 1; | 
|  |  | 
|  | ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries; | 
|  | if (__i == 0) | 
|  | return 1; | 
|  |  | 
|  | --__i; | 
|  | uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu); | 
|  | return 1 + (__code_point <= __upper_bound); | 
|  | }} | 
|  | """ | 
|  |  | 
|  | TABLES_HPP_TEMPLATE = """ | 
|  | // -*- C++ -*- | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | // WARNING, this entire header is generated by | 
|  | // utils/generate_width_estimation_table.py | 
|  | // DO NOT MODIFY! | 
|  |  | 
|  | // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE | 
|  | // | 
|  | // See Terms of Use <https://www.unicode.org/copyright.html> | 
|  | // for definitions of Unicode Inc.'s Data Files and Software. | 
|  | // | 
|  | // NOTICE TO USER: Carefully read the following legal agreement. | 
|  | // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S | 
|  | // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), | 
|  | // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | 
|  | // TERMS AND CONDITIONS OF THIS AGREEMENT. | 
|  | // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE | 
|  | // THE DATA FILES OR SOFTWARE. | 
|  | // | 
|  | // COPYRIGHT AND PERMISSION NOTICE | 
|  | // | 
|  | // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. | 
|  | // Distributed under the Terms of Use in https://www.unicode.org/copyright.html. | 
|  | // | 
|  | // Permission is hereby granted, free of charge, to any person obtaining | 
|  | // a copy of the Unicode data files and any associated documentation | 
|  | // (the "Data Files") or Unicode software and any associated documentation | 
|  | // (the "Software") to deal in the Data Files or Software | 
|  | // without restriction, including without limitation the rights to use, | 
|  | // copy, modify, merge, publish, distribute, and/or sell copies of | 
|  | // the Data Files or Software, and to permit persons to whom the Data Files | 
|  | // or Software are furnished to do so, provided that either | 
|  | // (a) this copyright and permission notice appear with all copies | 
|  | // of the Data Files or Software, or | 
|  | // (b) this copyright and permission notice appear in associated | 
|  | // Documentation. | 
|  | // | 
|  | // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF | 
|  | // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE | 
|  | // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | 
|  | // NONINFRINGEMENT OF THIRD PARTY RIGHTS. | 
|  | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS | 
|  | // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL | 
|  | // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, | 
|  | // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER | 
|  | // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | 
|  | // PERFORMANCE OF THE DATA FILES OR SOFTWARE. | 
|  | // | 
|  | // Except as contained in this notice, the name of a copyright holder | 
|  | // shall not be used in advertising or otherwise to promote the sale, | 
|  | // use or other dealings in these Data Files or Software without prior | 
|  | // written authorization of the copyright holder. | 
|  |  | 
|  | #ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H | 
|  | #define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H | 
|  |  | 
|  | #include <__algorithm/ranges_upper_bound.h> | 
|  | #include <__config> | 
|  | #include <__cstddef/ptrdiff_t.h> | 
|  | #include <cstdint> | 
|  |  | 
|  | #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) | 
|  | #  pragma GCC system_header | 
|  | #endif | 
|  |  | 
|  | _LIBCPP_BEGIN_NAMESPACE_STD | 
|  |  | 
|  | #if _LIBCPP_STD_VER >= 20 | 
|  |  | 
|  | namespace __width_estimation_table {{ | 
|  | {content} | 
|  | }} // namespace __width_estimation_table | 
|  |  | 
|  | #endif // _LIBCPP_STD_VER >= 20 | 
|  |  | 
|  | _LIBCPP_END_NAMESPACE_STD | 
|  |  | 
|  | #endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H""" | 
|  |  | 
|  |  | 
|  | def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]: | 
|  | # The maximum value that can be encoded in the available bits in the | 
|  | # __entries table. | 
|  | upper_bound = 0x3FFFF | 
|  | # The maximum offset in an __entries entry. Larger offsets will be | 
|  | # splitted and stored in multiple entries. | 
|  | chunk = 16384 | 
|  | result = list[Entry]() | 
|  | high = -1 | 
|  | for range in sorted(ranges, key=lambda x: x.lower): | 
|  | # Validate overlapping ranges | 
|  | assert range.lower > high | 
|  | high = range.upper | 
|  | assert high <= upper_bound | 
|  |  | 
|  | while True: | 
|  | e = Entry(range.lower, range.upper - range.lower) | 
|  | if e.offset < chunk: | 
|  | result.append(e) | 
|  | break | 
|  | e.offset = chunk - 1 | 
|  | result.append(e) | 
|  | range.lower += chunk | 
|  | return result | 
|  |  | 
|  |  | 
|  | cpp_entrytemplate = "    0x{:08x} /* {:08x} - {:08x} [{:>5}] */" | 
|  |  | 
|  |  | 
|  | def generate_cpp_data(ranges: list[PropertyRange], upper_bound: int) -> str: | 
|  | result = StringIO() | 
|  | table = property_ranges_to_table(ranges) | 
|  | result.write( | 
|  | DATA_ARRAY_TEMPLATE.format( | 
|  | size=len(table), | 
|  | entries=", //\n".join( | 
|  | [ | 
|  | cpp_entrytemplate.format( | 
|  | x.lower << 14 | x.offset, | 
|  | x.lower, | 
|  | x.lower + x.offset, | 
|  | x.offset + 1, | 
|  | ) | 
|  | for x in table | 
|  | ] | 
|  | ), | 
|  | upper_bound=upper_bound, | 
|  | ) | 
|  | ) | 
|  |  | 
|  | return result.getvalue() | 
|  |  | 
|  |  | 
|  | def generate_data_tables() -> str: | 
|  | """ | 
|  | Generate Unicode data for [format.string.std]/12 | 
|  | """ | 
|  | east_asian_width_path = ( | 
|  | Path(__file__).absolute().parent / "data" / "unicode" / "EastAsianWidth.txt" | 
|  | ) | 
|  |  | 
|  | properties = list() | 
|  | with east_asian_width_path.open(encoding="utf-8") as f: | 
|  | properties.extend( | 
|  | list( | 
|  | filter( | 
|  | filterProperty, | 
|  | [x for line in f if (x := parsePropertyLine(line))], | 
|  | ) | 
|  | ) | 
|  | ) | 
|  | # The range U+4DC0 - U+4DFF is neutral and should not be in the table | 
|  | # The range U+1F300 - U+1F5FF is partly in the range, for example | 
|  | #   1F300..1F320;W   # So    [33] CYCLONE..SHOOTING STAR | 
|  | #   1F321..1F32C;N   # So    [12] THERMOMETER..WIND BLOWING FACE | 
|  | #   1F32D..1F335;W   # So     [9] HOT DOG..CACTUS | 
|  | # The first and last ranges are present, but the second isn't | 
|  |  | 
|  | # Validate the hardcode ranges are present | 
|  |  | 
|  | # Yijing Hexagram Symbols | 
|  | for i in range(0x4DC0, 0x4DFF + 1): | 
|  | assert [x for x in properties if i >= x.lower and i <= x.upper] | 
|  |  | 
|  | # Miscellaneous Symbols and Pictographs | 
|  | for i in range(0x1F300, 0x1F5FF + 1): | 
|  | assert [x for x in properties if i >= x.lower and i <= x.upper] | 
|  |  | 
|  | # Miscellaneous Symbols and Pictographs | 
|  | for i in range(0x1F900, 0x1F9FF + 1): | 
|  | assert [x for x in properties if i >= x.lower and i <= x.upper] | 
|  |  | 
|  | data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower)) | 
|  |  | 
|  | return "\n".join([generate_cpp_data(data, data[-1].upper)]) | 
|  |  | 
|  |  | 
|  | if __name__ == "__main__": | 
|  | if len(sys.argv) == 2: | 
|  | sys.stdout = open(sys.argv[1], "w") | 
|  | print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables())) |