| #!/usr/bin/env python |
| # ===- gen_std.py - ------------------------------------------*- python -*--===# |
| # |
| # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| # |
| # ===------------------------------------------------------------------------===# |
| |
| """gen_std.py is a tool to generate a lookup table (from qualified names to |
| include headers) for C/C++ Standard Library symbols by parsing archived HTML |
| files from cppreference. |
| |
| The generated files are located in clang/include/Tooling/Inclusions. |
| |
| Caveats and FIXMEs: |
  - std sub-namespace symbols (e.g. chrono, filesystem, ranges) are read from
    their own symbol index pages; std literal operators and std::placeholders
    symbols are not generated by this tool.
| - symbols with multiple variants or defined in multiple headers aren't added, |
| e.g. std::move, std::swap |
| |
| Usage: |
| 1. Install BeautifulSoup dependency, see instruction: |
| https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup |
| 2. Download cppreference offline HTML files (html_book_20220730.zip in Unofficial Release) at |
| https://en.cppreference.com/w/Cppreference:Archives |
| 3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should |
| get a "cppreference/reference" directory. |
| 4. Run the command: |
| // Generate C++ symbols |
| python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc |
| // Generate C symbols |
| python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc |
| """ |
| |
| |
| import cppreference_parser |
| import argparse |
| import datetime |
| import os |
| import sys |
| import re |
| |
| |
# Banner printed at the top of every generated .inc file. It is a %-format
# template with two "%s" slots that main() fills in, in order, with:
#   1. the upper-cased value of the -symbols argument,
#   2. the modification date (YYYY-MM-DD) of the cppreference index page.
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""
| |
| |
def ParseArg():
    """Parse the command-line arguments of gen_std.py.

    Returns:
      An argparse.Namespace with two attributes:
        cppreference: path to the cppreference offline HTML directory.
        symbols: which symbol table to generate, either "cpp" or "c".

    Missing or invalid arguments make argparse print a usage message to
    stderr and exit the process.
    """
    parser = argparse.ArgumentParser(description="Generate StdGen file")
    # Both options are required, so no default value is given: it could never
    # be used.
    parser.add_argument(
        "-cppreference",
        metavar="PATH",
        help="path to the cppreference offline HTML directory",
        required=True,
    )
    parser.add_argument(
        "-symbols",
        # main() only implements these two modes; rejecting anything else here
        # yields a clear usage error instead of a crash later on.
        choices=["cpp", "c"],
        help="Generate c or cpp symbols. One of {cpp, c}.",
        required=True,
    )
    return parser.parse_args()
| |
| |
def AdditionalHeadersForIOSymbols(symbol):
    """Return extra headers that also provide the given IO-related symbol.

    Args:
      symbol: an object with a `name` string and a `headers` list that holds
        exactly one header (e.g. "<ostream>").

    Returns:
      A list containing "<iostream>" and/or "<iosfwd>" (in that order, so
      <iostream> is preferred over <iosfwd>), or an empty list when neither
      applies.
    """
    # Names declared in the <iosfwd> header, per C++ [iosfwd.syn 31.3.1].
    forward_declared = frozenset(
        (
            "basic_ios",
            "basic_streambuf",
            "basic_istream",
            "basic_ostream",
            "basic_iostream",
            "basic_stringbuf",
            "basic_istringstream",
            "basic_ostringstream",
            "basic_stringstream",
            "basic_spanbuf",
            "basic_ispanstream",
            "basic_ospanstream",
            "basic_spanstream",
            "basic_filebuf",
            "basic_ifstream",
            "basic_ofstream",
            "basic_fstream",
            "basic_syncbuf",
            "basic_osyncstream",
            "istreambuf_iterator",
            "ostreambuf_iterator",
            "ios",
            "wios",
            "streambuf",
            "istream",
            "ostream",
            "iostream",
            "stringbuf",
            "istringstream",
            "ostringstream",
            "stringstream",
            "spanbuf",
            "ispanstream",
            "ospanstream",
            "spanstream",
            "filebuf",
            "ifstream",
            "ofstream",
            "fstream",
            "syncbuf",
            "osyncstream",
            "wstreambuf",
            "wistream",
            "wostream",
            "wiostream",
            "wstringbuf",
            "wistringstream",
            "wostringstream",
            "wstringstream",
            "wspanbuf",
            "wispanstream",
            "wospanstream",
            "wspanstream",
            "wfilebuf",
            "wifstream",
            "wofstream",
            "wfstream",
            "wsyncbuf",
            "wosyncstream",
            "fpos",
            "streampos",
            "wstreampos",
            "u8streampos",
            "u16streampos",
            "u32streampos",
        )
    )
    # <iostream> is an alternative to these headers, per C++
    # [iostream.syn 31.4.1].
    covered_by_iostream = ("<ios>", "<istream>", "<ostream>", "<streambuf>")

    assert len(symbol.headers) == 1
    primary_header = symbol.headers[0]

    # Listed in output order: <iostream> first, then <iosfwd>.
    candidates = (
        ("<iostream>", primary_header in covered_by_iostream),
        ("<iosfwd>", symbol.name in forward_declared),
    )
    return [extra for extra, applies in candidates if applies]
| |
| |
def GetCCompatibilitySymbols(symbol):
    """Return the global-namespace variants of a symbol from a <cname> header.

    Args:
      symbol: an object with `name`, `namespace` and `headers` attributes;
        `headers` must hold exactly one header.

    Returns:
      A list of new cppreference_parser.Symbol objects built with None in the
      namespace position: one for the C++-compat header (only when the input
      symbol has a namespace) and one for the corresponding C header. The list
      is empty when the symbol's header is not a <cname> header, or when the
      standard exempts the symbol from being placed in the global namespace.
    """
    # C++ form of the C standard headers.
    c_compat_headers = {
        "<cassert>",
        "<cctype>",
        "<cerrno>",
        "<cfenv>",
        "<cfloat>",
        "<cinttypes>",
        "<climits>",
        "<clocale>",
        "<cmath>",
        "<csetjmp>",
        "<csignal>",
        "<cstdarg>",
        "<cstddef>",
        "<cstdint>",
        "<cstdio>",
        "<cstdlib>",
        "<cstring>",
        "<ctime>",
        "<cuchar>",
        "<cwchar>",
        "<cwctype>",
    }
    # C++ [support.c.headers.other] 17.14.7
    #   ..., behaves as if each name placed in the standard library namespace by
    #   the corresponding <cname> header is placed within the global namespace
    #   scope, except for the functions described in [sf.cmath], the
    #   std::lerp function overloads ([c.math.lerp]), the declaration of
    #   std::byte ([cstddef.syn]), and the functions and function templates
    #   described in [support.types.byteops].
    #
    # Each entry is a regular expression that must match the whole name; the
    # optional trailing "f"/"l" covers the float and long double variants.
    exception_symbols = (
        r"(assoc_)?laguerre[fl]?",
        r"(assoc_|sph_)?legendre[fl]?",
        r"beta[fl]?",
        r"(comp_)?ellint_[1-3][fl]?",
        r"(cyl_|sph_)?bessel_[i-k][fl]?",
        # sph_bessel has no _i/_j/_k suffix, so it needs its own pattern.
        r"sph_bessel[fl]?",
        r"(cyl_|sph_)?neumann[fl]?",
        r"expint[fl]?",
        r"hermite[fl]?",
        r"riemann_zeta[fl]?",
        r"lerp",
        r"byte",
        # Function template from [support.types.byteops].
        r"to_integer",
    )
    assert len(symbol.headers) == 1
    header = symbol.headers[0]
    if header not in c_compat_headers:
        return []
    if any(re.fullmatch(pattern, symbol.name) for pattern in exception_symbols):
        return []

    # Introduce two more entries, both in the global namespace, one using the
    # C++-compat header and another using the C header.
    results = []
    if symbol.namespace is not None:
        # A symbol without a namespace (a C macro) already is the
        # global-namespace entry for this header; adding it again would print
        # a duplicated entry.
        results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
    c_header = "<" + header[2:-1] + ".h>"  # <cstdio> => <stdio.h>
    results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
    return results
| |
| |
def main():
    """Generate the symbol-to-header table and print it to stdout.

    Reads the command-line arguments, collects the symbols from the selected
    cppreference index pages, and prints the file banner followed by one
    SYMBOL(name, namespace, header) line per symbol/header pair. Symbols with
    no header or with several candidate headers are reported on stderr and
    skipped.

    Exits with an error message when the -symbols value is unsupported or the
    expected cppreference directory does not exist.
    """
    args = ParseArg()
    if args.symbols == "cpp":
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [
            (page_root, "symbol_index.html", "std::"),
            # std sub-namespace symbols have separated pages.
            # We don't index std literal operators (e.g.
            # std::literals::chrono_literals::operator""d), these symbols can't be
            # accessed by std::<symbol_name>.
            #
            # std::placeholders symbols are handled manually in StdSpecialSymbolMap.inc
            (symbol_index_root, "chrono.html", "std::chrono::"),
            (symbol_index_root, "execution.html", "std::execution::"),
            (symbol_index_root, "numbers.html", "std::numbers::"),
            (symbol_index_root, "filesystem.html", "std::filesystem::"),
            (symbol_index_root, "pmr.html", "std::pmr::"),
            (symbol_index_root, "ranges.html", "std::ranges::"),
            (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
            (symbol_index_root, "this_thread.html", "std::this_thread::"),
            # Zombie symbols that were available from the Standard Library, but are
            # removed in the following standards.
            (symbol_index_root, "zombie_names.html", "std::"),
            (symbol_index_root, "macro.html", None),
        ]
    elif args.symbols == "c":
        page_root = os.path.join(args.cppreference, "en", "c")
        symbol_index_root = page_root
        parse_pages = [(page_root, "index.html", None)]
    else:
        # Without this branch the names assigned above would be unbound and
        # the next statement would fail with an UnboundLocalError.
        sys.exit(
            "Unsupported -symbols value %r, expected one of: cpp, c" % args.symbols
        )

    if not os.path.exists(symbol_index_root):
        sys.exit("Path %s doesn't exist!" % symbol_index_root)

    symbols = cppreference_parser.GetSymbols(parse_pages)

    # We don't have version information from the unzipped offline HTML files,
    # so we use the modified time of index.html under the page root as the
    # version.
    index_page_path = os.path.join(page_root, "index.html")
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime
    ).strftime("%Y-%m-%d")
    print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
    for symbol in symbols:
        if not symbol.headers:
            sys.stderr.write("No header found for symbol %s\n" % symbol.name)
            continue
        if len(symbol.headers) > 1:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            sys.stderr.write(
                "Ambiguous header for symbol %s: %s\n"
                % (symbol.name, ", ".join(symbol.headers))
            )
            continue

        # Exactly one header: emit the symbol itself plus its global-namespace
        # C-compatibility variants, each with any additional IO headers.
        augmented_symbols = [symbol]
        augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
        for s in augmented_symbols:
            s.headers.extend(AdditionalHeadersForIOSymbols(s))
            for header in s.headers:
                # SYMBOL(unqualified_name, namespace, header)
                print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header))
| |
| |
| if __name__ == "__main__": |
| main() |