blob: 944fd17263700160a457038d08dbc2e7453dc110 [file] [log] [blame]
#!/usr/bin/env python
#===- gen_std.py - ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.
The generated files are located in clang/include/Tooling/Inclusions.
Caveats and FIXMEs:
- only symbols in "std" and in the std sub-namespaces listed in main() (e.g.
chrono, ranges) are added; std::placeholders and std literal operators are
not generated by this tool.
- symbols with multiple variants or defined in multiple headers aren't added,
e.g. std::move, std::swap
Usage:
1. Install BeautifulSoup dependency, see instruction:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
2. Download cppreference offline HTML files (html_book_20220730.zip in Unofficial Release) at
https://en.cppreference.com/w/Cppreference:Archives
3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
get a "cppreference/reference" directory.
4. Run the command:
// Generate C++ symbols
python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
// Generate C symbols
python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
"""
import cppreference_parser
import argparse
import datetime
import os
import sys
import re
# Banner printed once at the top of the generated output. It has two %s
# placeholders that main() fills in, in this order: the upper-cased -symbols
# value and the modification date of the cppreference index page.
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""
def ParseArg():
    """Parse the command-line options of gen_std.py from sys.argv.

    Returns:
        The argparse namespace with two attributes: `cppreference` (path to
        the unpacked offline HTML directory) and `symbols` (which symbol set
        to generate). Both options are mandatory.
    """
    # One (flag, add_argument keyword arguments) pair per accepted option.
    option_specs = (
        ('-cppreference', dict(
            metavar='PATH',
            default='',
            help='path to the cppreference offline HTML directory',
            required=True,
        )),
        ('-symbols', dict(
            default='cpp',
            help='Generate c or cpp (removed) symbols. One of {cpp, c, cpp_removed}.',
            required=True,
        )),
    )
    arg_parser = argparse.ArgumentParser(description='Generate StdGen file')
    for flag, options in option_specs:
        arg_parser.add_argument(flag, **options)
    return arg_parser.parse_args()
def AdditionalHeadersForIOSymbols(symbol):
    """Return the extra headers that also provide an IO-related symbol.

    Args:
        symbol: object with a `name` and a `headers` list holding exactly one
            header.

    Returns:
        A new list containing "<iostream>" and/or "<iosfwd>" (in that order)
        when they apply to the symbol, otherwise an empty list.
    """
    assert len(symbol.headers) == 1
    (declared_header,) = symbol.headers

    # IO-related names declared in the <iosfwd> header, per C++
    # [iosfwd.syn 31.3.1]. Each stem below is listed in three spellings:
    # basic_<stem>, <stem> and w<stem>.
    stream_stems = (
        'ios', 'streambuf', 'istream', 'ostream', 'iostream',
        'stringbuf', 'istringstream', 'ostringstream', 'stringstream',
        'spanbuf', 'ispanstream', 'ospanstream', 'spanstream',
        'filebuf', 'ifstream', 'ofstream', 'fstream',
        'syncbuf', 'osyncstream',
    )
    declared_in_iosfwd = {
        prefix + stem
        for prefix in ('basic_', '', 'w')
        for stem in stream_stems
    }
    # Names that do not follow the three-spelling scheme.
    declared_in_iosfwd.update((
        'istreambuf_iterator', 'ostreambuf_iterator',
        'fpos', 'streampos', 'wstreampos',
        'u8streampos', 'u16streampos', 'u32streampos',
    ))

    extra_headers = []
    # <iostream> is preferred over <iosfwd>, so it is emitted first.
    # <iostream> is an alternative to <streambuf>, <istream>, <ostream> and
    # <ios>, per C++ [iostream.syn 31.4.1].
    if declared_header in ("<ios>", "<istream>", "<ostream>", "<streambuf>"):
        extra_headers.append("<iostream>")
    if symbol.name in declared_in_iosfwd:
        extra_headers.append("<iosfwd>")
    return extra_headers
def GetCCompatibilitySymbols(symbol):
    """Return the global-namespace variants of a symbol from a <cname> header.

    Args:
        symbol: object with `name`, `namespace` and a `headers` list holding
            exactly one header.

    Returns:
        An empty list when the symbol's header is not one of the C++ forms of
        the C standard headers, or when the symbol is one of the names that
        the standard keeps out of the global namespace. Otherwise a list of
        new cppreference_parser.Symbol objects in the global namespace
        (namespace None): one for the <cname> header (skipped when the input
        is already in the global namespace) and one for the matching <name.h>
        header.
    """
    # C++ form of the C standard headers.
    c_compat_headers = {
        "<cassert>",
        "<cctype>",
        "<cerrno>",
        "<cfenv>",
        "<cfloat>",
        "<cinttypes>",
        "<climits>",
        "<clocale>",
        "<cmath>",
        "<csetjmp>",
        "<csignal>",
        "<cstdarg>",
        "<cstddef>",
        "<cstdint>",
        "<cstdio>",
        "<cstdlib>",
        "<cstring>",
        "<ctime>",
        "<cuchar>",
        "<cwchar>",
        "<cwctype>",
    }
    # C++ [support.c.headers.other] 17.14.7
    #   ..., behaves as if each name placed in the standard library namespace
    #   by the corresponding <cname> header is placed within the global
    #   namespace scope, except for the functions described in [sf.cmath], the
    #   std::lerp function overloads ([c.math.lerp]), the declaration of
    #   std::byte ([cstddef.syn]), and the functions and function templates
    #   described in [support.types.byteops].
    #
    # Each entry is a regex matched against the whole symbol name. "[fl]?"
    # covers the optional float / long double suffix.
    exception_symbols = (
        r"(assoc_)?laguerre[fl]?",
        r"(assoc_|sph_)?legendre[fl]?",
        r"beta[fl]?",
        r"(comp_)?ellint_[1-3][fl]?",
        r"(cyl_|sph_)?bessel_[i-k][fl]?",
        # sph_bessel has no _i/_j/_k suffix, so the pattern above misses it.
        r"sph_bessel[fl]?",
        r"(cyl_|sph_)?neumann[fl]?",
        r"expint[fl]?",
        r"hermite[fl]?",
        r"riemann_zeta[fl]?",
        r"lerp",
        r"byte",
    )
    assert len(symbol.headers) == 1
    header = symbol.headers[0]
    if header not in c_compat_headers:
        return []
    if any(re.fullmatch(pattern, symbol.name) for pattern in exception_symbols):
        return []

    # Introduce two more entries, both in the global namespace, one using the
    # C++-compat header and another using the C header.
    results = []
    if symbol.namespace is not None:
        # Symbols already in the global namespace (C macros) would otherwise
        # be printed twice for the <cname> header.
        results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
    c_header = "<" + header[2:-1] + ".h>"  # <cstdio> => <stdio.h>
    results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
    return results
def main():
    """Generate the symbol table selected on the command line.

    Parses the cppreference index pages for the requested symbol set, prints
    the file banner followed by one SYMBOL(name, namespace, header) line per
    symbol/header pair to stdout, and reports symbols with no header or with
    several candidate headers on stderr.

    Exits with an error message when the -symbols value is not supported or
    when the expected cppreference directory does not exist.
    """
    args = ParseArg()
    if args.symbols == 'cpp':
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [
            (page_root, "symbol_index.html", "std::"),
            # std sub-namespace symbols have separated pages.
            # We don't index std literal operators (e.g.
            # std::literals::chrono_literals::operator""d), these symbols
            # can't be accessed by std::<symbol_name>.
            #
            # std::placeholders symbols are handled manually in
            # StdSpecialSymbolMap.inc
            (symbol_index_root, "chrono.html", "std::chrono::"),
            (symbol_index_root, "execution.html", "std::execution::"),
            (symbol_index_root, "numbers.html", "std::numbers::"),
            (symbol_index_root, "filesystem.html", "std::filesystem::"),
            (symbol_index_root, "pmr.html", "std::pmr::"),
            (symbol_index_root, "ranges.html", "std::ranges::"),
            (symbol_index_root, "regex_constants.html",
             "std::regex_constants::"),
            (symbol_index_root, "this_thread.html", "std::this_thread::"),
            # Zombie symbols that were available from the Standard Library,
            # but are removed in the following standards.
            (symbol_index_root, "zombie_names.html", "std::"),
            (symbol_index_root, "macro.html", None),
        ]
    elif args.symbols == 'c':
        page_root = os.path.join(args.cppreference, "en", "c")
        symbol_index_root = page_root
        parse_pages = [(page_root, "index.html", None)]
    else:
        # Without this branch any other value falls through and fails below
        # with an UnboundLocalError on symbol_index_root.
        sys.exit("Unsupported -symbols value '%s': expected 'cpp' or 'c'."
                 % args.symbols)

    if not os.path.exists(symbol_index_root):
        sys.exit("Path %s doesn't exist!" % symbol_index_root)

    symbols = cppreference_parser.GetSymbols(parse_pages)

    # We don't have version information from the unzipped offline HTML files,
    # so we use the modified time of index.html under page_root as the
    # version.
    index_page_path = os.path.join(page_root, "index.html")
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
    print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))

    for symbol in symbols:
        if len(symbol.headers) == 1:
            augmented_symbols = [symbol]
            augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
            for s in augmented_symbols:
                # Build a fresh list instead of extending s.headers in place,
                # so the parsed Symbol objects are left untouched.
                headers = s.headers + AdditionalHeadersForIOSymbols(s)
                for header in headers:
                    # SYMBOL(unqualified_name, namespace, header)
                    print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace,
                                                  header))
        elif len(symbol.headers) == 0:
            sys.stderr.write("No header found for symbol %s\n" % symbol.name)
        else:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            sys.stderr.write("Ambiguous header for symbol %s: %s\n" % (
                symbol.name, ', '.join(symbol.headers)))
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()