Mark de Wever | 59e66c5 | 2024-04-09 19:20:06 +0200 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # ===----------------------------------------------------------------------===## |
| 3 | # |
| 4 | # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 5 | # See https://llvm.org/LICENSE.txt for license information. |
| 6 | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 7 | # |
| 8 | # ===----------------------------------------------------------------------===## |
| 9 | |
| 10 | # The code is based on |
| 11 | # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py |
| 12 | # |
| 13 | # Copyright (c) Microsoft Corporation. |
| 14 | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 15 | |
| 16 | from io import StringIO |
| 17 | from pathlib import Path |
| 18 | from dataclasses import dataclass |
| 19 | from typing import Optional |
| 20 | import re |
| 21 | import sys |
| 22 | |
| 23 | |
| 24 | @dataclass |
| 25 | class PropertyRange: |
| 26 | lower: int = -1 |
| 27 | upper: int = -1 |
| 28 | prop: str = None |
| 29 | |
| 30 | |
| 31 | @dataclass |
| 32 | class Entry: |
| 33 | lower: int = -1 |
| 34 | offset: int = -1 |
| 35 | prop: int = -1 |
| 36 | |
| 37 | |
| 38 | LINE_REGEX = re.compile( |
| 39 | r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*InCB;\s*(?P<prop>\w+)" |
| 40 | ) |
| 41 | |
| 42 | def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: |
| 43 | result = PropertyRange() |
| 44 | if m := LINE_REGEX.match(inputLine): |
| 45 | lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") |
| 46 | result.lower = int(lower_str, base=16) |
| 47 | result.upper = result.lower |
| 48 | if upper_str is not None: |
| 49 | result.upper = int(upper_str, base=16) |
| 50 | return result |
| 51 | |
| 52 | else: |
| 53 | return None |
| 54 | |
| 55 | |
| 56 | |
| 57 | def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: |
| 58 | """ |
| 59 | Merges consecutive ranges with the same property to one range. |
| 60 | |
| 61 | Merging the ranges results in fewer ranges in the output table, |
| 62 | reducing binary and improving lookup performance. |
| 63 | """ |
| 64 | result = list() |
| 65 | for x in input: |
| 66 | if ( |
| 67 | len(result) |
| 68 | and result[-1].prop == x.prop |
| 69 | and result[-1].upper + 1 == x.lower |
| 70 | ): |
| 71 | result[-1].upper = x.upper |
| 72 | continue |
| 73 | result.append(x) |
| 74 | return result |
| 75 | |
| 76 | |
| 77 | PROP_VALUE_ENUMERATOR_TEMPLATE = " __{}" |
| 78 | PROP_VALUE_ENUM_TEMPLATE = """ |
| 79 | enum class __property : uint8_t {{ |
| 80 | // Values generated from the data files. |
| 81 | {enumerators}, |
| 82 | |
| 83 | // The code unit has none of above properties. |
| 84 | __none |
| 85 | }}; |
| 86 | """ |
| 87 | |
| 88 | DATA_ARRAY_TEMPLATE = """ |
| 89 | /// The entries of the indic conjunct break property table. |
| 90 | /// |
| 91 | /// The data is generated from |
| 92 | /// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
| 93 | /// |
| 94 | /// The data has 3 values |
| 95 | /// - bits [0, 1] The property. One of the values generated from the datafiles |
| 96 | /// of \\ref __property |
| 97 | /// - bits [2, 10] The size of the range. |
| 98 | /// - bits [11, 31] The lower bound code point of the range. The upper bound of |
| 99 | /// the range is lower bound + size. |
| 100 | /// |
| 101 | /// The 9 bits for the size allow a maximum range of 512 elements. Some ranges |
| 102 | /// in the Unicode tables are larger. They are stored in multiple consecutive |
| 103 | /// ranges in the data table. An alternative would be to store the sizes in a |
| 104 | /// separate 16-bit value. The original MSVC STL code had such an approach, but |
| 105 | /// this approach uses less space for the data and is about 4% faster in the |
| 106 | /// following benchmark. |
| 107 | /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp |
| 108 | // clang-format off |
| 109 | _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{ |
| 110 | {entries}}}; |
| 111 | // clang-format on |
| 112 | |
| 113 | /// Returns the indic conjuct break property of a code point. |
| 114 | [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{ |
| 115 | // The algorithm searches for the upper bound of the range and, when found, |
| 116 | // steps back one entry. This algorithm is used since the code point can be |
| 117 | // anywhere in the range. After a lower bound is found the next step is to |
| 118 | // compare whether the code unit is indeed in the range. |
| 119 | // |
| 120 | // Since the entry contains a code unit, size, and property the code point |
| 121 | // being sought needs to be adjusted. Just shifting the code point to the |
| 122 | // proper position doesn't work; suppose an entry has property 0, size 1, |
| 123 | // and lower bound 3. This results in the entry 0x1810. |
| 124 | // When searching for code point 3 it will search for 0x1800, find 0x1810 |
| 125 | // and moves to the previous entry. Thus the lower bound value will never |
| 126 | // be found. |
| 127 | // The simple solution is to set the bits belonging to the property and |
| 128 | // size. Then the upper bound for code point 3 will return the entry after |
| 129 | // 0x1810. After moving to the previous entry the algorithm arrives at the |
| 130 | // correct entry. |
| 131 | ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries; |
| 132 | if (__i == 0) |
| 133 | return __property::__none; |
| 134 | |
| 135 | --__i; |
| 136 | uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 2) & 0b1'1111'1111); |
| 137 | if (__code_point <= __upper_bound) |
| 138 | return static_cast<__property>(__entries[__i] & 0b11); |
| 139 | |
| 140 | return __property::__none; |
| 141 | }} |
| 142 | """ |
| 143 | |
| 144 | MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """ |
| 145 | // -*- C++ -*- |
| 146 | //===----------------------------------------------------------------------===// |
| 147 | // |
| 148 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 149 | // See https://llvm.org/LICENSE.txt for license information. |
| 150 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 151 | // |
| 152 | //===----------------------------------------------------------------------===// |
| 153 | |
| 154 | // WARNING, this entire header is generated by |
| 155 | // utils/generate_indic_conjunct_break_table.py |
| 156 | // DO NOT MODIFY! |
| 157 | |
| 158 | // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE |
| 159 | // |
| 160 | // See Terms of Use <https://www.unicode.org/copyright.html> |
| 161 | // for definitions of Unicode Inc.'s Data Files and Software. |
| 162 | // |
| 163 | // NOTICE TO USER: Carefully read the following legal agreement. |
| 164 | // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S |
| 165 | // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), |
| 166 | // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE |
| 167 | // TERMS AND CONDITIONS OF THIS AGREEMENT. |
| 168 | // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE |
| 169 | // THE DATA FILES OR SOFTWARE. |
| 170 | // |
| 171 | // COPYRIGHT AND PERMISSION NOTICE |
| 172 | // |
| 173 | // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. |
| 174 | // Distributed under the Terms of Use in https://www.unicode.org/copyright.html. |
| 175 | // |
| 176 | // Permission is hereby granted, free of charge, to any person obtaining |
| 177 | // a copy of the Unicode data files and any associated documentation |
| 178 | // (the "Data Files") or Unicode software and any associated documentation |
| 179 | // (the "Software") to deal in the Data Files or Software |
| 180 | // without restriction, including without limitation the rights to use, |
| 181 | // copy, modify, merge, publish, distribute, and/or sell copies of |
| 182 | // the Data Files or Software, and to permit persons to whom the Data Files |
| 183 | // or Software are furnished to do so, provided that either |
| 184 | // (a) this copyright and permission notice appear with all copies |
| 185 | // of the Data Files or Software, or |
| 186 | // (b) this copyright and permission notice appear in associated |
| 187 | // Documentation. |
| 188 | // |
| 189 | // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF |
| 190 | // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE |
| 191 | // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 192 | // NONINFRINGEMENT OF THIRD PARTY RIGHTS. |
| 193 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS |
| 194 | // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL |
| 195 | // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
| 196 | // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
| 197 | // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
| 198 | // PERFORMANCE OF THE DATA FILES OR SOFTWARE. |
| 199 | // |
| 200 | // Except as contained in this notice, the name of a copyright holder |
| 201 | // shall not be used in advertising or otherwise to promote the sale, |
| 202 | // use or other dealings in these Data Files or Software without prior |
| 203 | // written authorization of the copyright holder. |
| 204 | |
| 205 | #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H |
| 206 | #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H |
| 207 | |
| 208 | #include <__algorithm/ranges_upper_bound.h> |
| 209 | #include <__config> |
Nikolas Klauser | e99c490 | 2024-10-31 02:20:10 +0100 | [diff] [blame] | 210 | #include <__cstddef/ptrdiff_t.h> |
Mark de Wever | 59e66c5 | 2024-04-09 19:20:06 +0200 | [diff] [blame] | 211 | #include <__iterator/access.h> |
Mark de Wever | 59e66c5 | 2024-04-09 19:20:06 +0200 | [diff] [blame] | 212 | #include <cstdint> |
| 213 | |
| 214 | #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) |
| 215 | # pragma GCC system_header |
| 216 | #endif |
| 217 | |
| 218 | _LIBCPP_BEGIN_NAMESPACE_STD |
| 219 | |
| 220 | #if _LIBCPP_STD_VER >= 20 |
| 221 | |
| 222 | namespace __indic_conjunct_break {{ |
| 223 | {content} |
| 224 | }} // namespace __indic_conjunct_break |
| 225 | |
Louis Dionne | 348e741 | 2024-08-30 12:07:07 -0400 | [diff] [blame] | 226 | #endif // _LIBCPP_STD_VER >= 20 |
Mark de Wever | 59e66c5 | 2024-04-09 19:20:06 +0200 | [diff] [blame] | 227 | |
| 228 | _LIBCPP_END_NAMESPACE_STD |
| 229 | |
| 230 | #endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H""" |
| 231 | |
| 232 | |
| 233 | def property_ranges_to_table( |
| 234 | ranges: list[PropertyRange], props: list[str] |
| 235 | ) -> list[Entry]: |
| 236 | assert len(props) < 4 |
| 237 | result = list[Entry]() |
| 238 | high = -1 |
| 239 | for range in sorted(ranges, key=lambda x: x.lower): |
| 240 | # Validate overlapping ranges |
| 241 | assert range.lower > high |
| 242 | high = range.upper |
| 243 | |
| 244 | while True: |
| 245 | e = Entry(range.lower, range.upper - range.lower, props.index(range.prop)) |
| 246 | if e.offset <= 511: |
| 247 | result.append(e) |
| 248 | break |
| 249 | e.offset = 511 |
| 250 | result.append(e) |
| 251 | range.lower += 512 |
| 252 | return result |
| 253 | |
| 254 | |
| 255 | cpp_entrytemplate = " 0x{:08x}" |
| 256 | |
| 257 | |
| 258 | def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str: |
| 259 | result = StringIO() |
| 260 | prop_values = sorted(set(x.prop for x in ranges)) |
| 261 | table = property_ranges_to_table(ranges, prop_values) |
| 262 | enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values] |
| 263 | result.write( |
| 264 | PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values)) |
| 265 | ) |
| 266 | result.write( |
| 267 | DATA_ARRAY_TEMPLATE.format( |
| 268 | prop_name=prop_name, |
| 269 | size=len(table), |
| 270 | entries=",\n".join( |
| 271 | [ |
| 272 | cpp_entrytemplate.format(x.lower << 11 | x.offset << 2 | x.prop) |
| 273 | for x in table |
| 274 | ] |
| 275 | ), |
| 276 | ) |
| 277 | ) |
| 278 | |
| 279 | return result.getvalue() |
| 280 | |
| 281 | |
| 282 | def generate_data_tables() -> str: |
| 283 | """ |
| 284 | Generate Unicode data for inclusion into <format> from |
| 285 | - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
| 286 | |
| 287 | These files are expected to be stored in the same directory as this script. |
| 288 | """ |
| 289 | root = Path(__file__).absolute().parent / "data" / "unicode" |
| 290 | derived_core_path = root / "DerivedCoreProperties.txt" |
| 291 | |
| 292 | indic_conjunct_break = list() |
| 293 | with derived_core_path.open(encoding="utf-8") as f: |
| 294 | indic_conjunct_break_ranges = compactPropertyRanges( |
| 295 | [x for line in f if (x := parsePropertyLine(line))] |
| 296 | ) |
| 297 | |
| 298 | indic_conjunct_break_data = generate_cpp_data("Grapheme_Break", indic_conjunct_break_ranges) |
| 299 | return "\n".join([indic_conjunct_break_data]) |
| 300 | |
| 301 | |
| 302 | if __name__ == "__main__": |
| 303 | if len(sys.argv) == 2: |
| 304 | sys.stdout = open(sys.argv[1], "w") |
| 305 | print( |
| 306 | MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format( |
| 307 | content=generate_data_tables() |
| 308 | ) |
| 309 | ) |