| #!/usr/bin/env python |
| #===- cppreference_parser.py - ------------------------------*- python -*--===# |
| # |
| # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| # |
| #===------------------------------------------------------------------------===# |
| |
| from bs4 import BeautifulSoup, NavigableString |
| |
| import collections |
| import multiprocessing |
| import os |
| import re |
| import signal |
| import sys |
| |
| |
class Symbol:
    """A library symbol together with the headers that provide it."""

    def __init__(self, name, namespace, headers):
        # name:      unqualified symbol name, e.g. "move".
        # namespace: namespace of the symbol with a trailing "::", e.g. "std::";
        #            "" denotes the global scope and None marks a C symbol.
        # headers:   a list of corresponding headers.
        self.name, self.namespace, self.headers = name, namespace, headers
| |
| |
| def _HasClass(tag, *classes): |
| for c in tag.get('class', []): |
| if c in classes: |
| return True |
| return False |
| |
| |
def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Parse a symbol page and retrieve the include headers named on it.

    The symbol page provides the header for the symbol in a
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: symbol name to look for in the first cell of each
        declaration row.

    Returns a set of header strings (the text of the <code> elements). If no
    declaration row names the symbol, every header seen on the page is
    returned instead.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
        current_headers = []
        was_decl = False
        for row in table.select('tr'):
            if _HasClass(row, 't-dcl', 't-dsc'):
                was_decl = True
                # Symbols are in the first cell. A declaration row without any
                # <td> has nothing to match against, so skip it rather than
                # dereferencing None.
                first_cell = row.find('td')
                if first_cell is None:
                    continue
                if symbol_name in first_cell.stripped_strings:
                    headers.update(current_headers)
            elif _HasClass(row, 't-dsc-header'):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                    was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped
                # in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers
| |
| |
def _ParseIndexPage(index_page_html):
    """Extract the symbol entries from an index page.

    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuples
    (symbol_name, relative_path_to_symbol_page, variant).
    """
    document = BeautifulSoup(index_page_html, "html.parser")
    entries = []
    for anchor in document.select("a[title]"):
        # Annotated symbols like "acos<>() (std::complex)" are flagged as
        # variants. These tend to be overloads, and the primary is more useful.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug is
        # unfixed.
        trailing = anchor.next_sibling
        is_variant = isinstance(trailing, NavigableString) and "(" in trailing
        name_tag = anchor.find("tt")
        if not name_tag:
            continue
        # Strip any trailing <>() from the displayed name.
        bare_name = name_tag.text.rstrip("<>()")
        entries.append((bare_name, anchor["href"], is_variant))
    return entries
| |
| |
def _ReadSymbolPage(path, name):
    """Read the symbol page stored at `path` and parse it for symbol `name`.

    Returns whatever _ParseSymbolPage yields for the file's contents.
    """
    with open(path) as page_file:
        page_html = page_file.read()
    return _ParseSymbolPage(page_html, name)
| |
| |
def _GetSymbols(pool, root_dir, index_page_name, namespace):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: worker pool providing apply_async(); symbol pages are parsed on it.
      root_dir: directory containing the index page; symbol page hrefs are
        joined onto it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace recorded on every returned Symbol.

    Returns a list of Symbols sorted by name, each with a sorted header list.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with open(index_page_path, "r") as f:
        index_entries = _ParseIndexPage(f.read())

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in index_entries:
        # Variant symbols (e.g. the std::locale version of isalpha) add
        # ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        if variant:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        promise = pool.apply_async(_ReadSymbolPage, (path, symbol_name))
        results.append((symbol_name, promise))

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers as well as the names: iterating a set of strings has no
    # stable order across runs, which would make the output non-reproducible.
    return [
        Symbol(name, namespace, sorted(symbol_headers[name]))
        for name in sorted(symbol_headers)
    ]
| |
| |
def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a list of Symbols, in the order the pages were given.
    """
    symbols = []
    # Run many workers to process individual symbol pages under the symbol
    # index. Don't allow workers to capture Ctrl-C.
    # The initializer is passed as a module-level function plus initargs rather
    # than a lambda: a lambda cannot be pickled, so pool creation fails under
    # the "spawn"/"forkserver" start methods.
    pool = multiprocessing.Pool(
        initializer=signal.signal,
        initargs=(signal.SIGINT, signal.SIG_IGN))
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
    finally:
        # Always tear the workers down, even if parsing raised.
        pool.terminate()
        pool.join()
    return symbols