clang-tools-extra/clangd/include-mapping/cppreference_parser.py - llvm-project - Git at Google

 #!/usr/bin/env python
 #===- cppreference_parser.py -  ------------------------------*- python -*--===#
 #
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 #===------------------------------------------------------------------------===#

 from bs4 import BeautifulSoup, NavigableString

 import collections
 import multiprocessing
 import os
 import re
 import signal
 import sys


 class Symbol:
   """A library symbol paired with the headers that provide it.

   Attributes:
     name: unqualified symbol name, e.g. "move".
     namespace: namespace of the symbol with a trailing "::", e.g. "std::";
       "" denotes the global scope and None is used for C symbols.
     headers: a list of the corresponding headers.
   """

   def __init__(self, name, namespace, headers):
     self.name, self.namespace, self.headers = name, namespace, headers


 def _HasClass(tag, *classes):
   """Return True if `tag` carries at least one of the given CSS classes.

   The classes are read with tag.get('class', []), so a tag without a
   "class" attribute never matches.
   """
   return any(css_class in classes for css_class in tag.get('class', []))


 def _ParseSymbolPage(symbol_page_html, symbol_name):
   """Parse a symbol page and retrieve the headers that declare the symbol.

   The symbol page provides the header for the symbol, specifically in the
   "Defined in header <header>" section. An example:

   <tr class="t-dsc-header">
     <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
   </td></tr>

   Args:
     symbol_page_html: HTML text of the symbol page.
     symbol_name: symbol name to look for in the declaration rows.

   Returns:
     A set of header strings. If no declaration row names the symbol, the set
     of every header named in a "Defined in header" row is returned instead.
   """
   headers = set()
   all_headers = set()

   soup = BeautifulSoup(symbol_page_html, "html.parser")
   # Rows in table are like:
   #   Defined in header <foo>      .t-dsc-header
   #   Defined in header <bar>      .t-dsc-header
   #   decl1                        .t-dcl
   #   Defined in header <baz>      .t-dsc-header
   #   decl2                        .t-dcl
   for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
     current_headers = []
     was_decl = False
     for row in table.select('tr'):
       if _HasClass(row, 't-dcl', 't-dsc'):
         was_decl = True
         # Symbols are in the first cell. find() returns None when the row
         # has no <td> at all; such a row cannot name the symbol, so skip it
         # instead of failing on the attribute access.
         first_cell = row.find('td')
         if first_cell is None:
           continue
         if symbol_name not in first_cell.stripped_strings:
           continue
         headers.update(current_headers)
       elif _HasClass(row, 't-dsc-header'):
         # If we saw a decl since the last header, this is a new block of
         # headers for a new block of decls.
         if was_decl:
           current_headers = []
         was_decl = False
         # There are also .t-dsc-header for "defined in namespace".
         if "Defined in header " not in row.text:
           continue
         # The interesting header content (e.g. <cstdlib>) is wrapped in
         # <code>.
         for header_code in row.find_all("code"):
           current_headers.append(header_code.text)
           all_headers.add(header_code.text)
   # If the symbol was never named, consider all named headers.
   return headers or all_headers


 def _ParseIndexPage(index_page_html):
   """Parse the symbol index page.

   The index page lists all std symbols together with hrefs to their detail
   pages (which contain the defining header). An example:

   <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
   <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

   Returns a list of (symbol_name, relative_path_to_symbol_page, variant)
   tuples, where variant is true when the text right after the link contains
   a "(" annotation.
   """
   soup = BeautifulSoup(index_page_html, "html.parser")
   entries = []
   for link in soup.select("a[title]"):
     # Annotated symbols like "acos<>() (std::complex)" are flagged as
     # variants. They tend to be overloads, and the primary is more useful.
     # begin/end slip through despite their (iterator) caption because the
     # (since C++11) note comes first. They are good symbols, so that quirk
     # is left alone.
     trailing = link.next_sibling
     is_variant = isinstance(trailing, NavigableString) and "(" in trailing
     name_tag = link.find("tt")
     if not name_tag:
       continue
     # Strip any trailing <>() from the displayed name.
     bare_name = name_tag.text.rstrip("<>()")
     entries.append((bare_name, link["href"], is_variant))
   return entries


 def _ReadSymbolPage(path, name):
   """Read the symbol page at `path` and return the headers found for `name`.

   The file is decoded explicitly as UTF-8 so the result does not depend on
   the platform's default encoding.
   NOTE(review): assumes the cppreference pages are UTF-8 encoded - confirm.

   Args:
     path: filesystem path of the symbol page.
     name: symbol name passed through to _ParseSymbolPage.

   Returns whatever _ParseSymbolPage returns for the page contents.
   """
   # io.open accepts `encoding` on both Python 2 and Python 3.
   import io
   with io.open(path, encoding="utf-8") as f:
     return _ParseSymbolPage(f.read(), name)


 def _GetSymbols(pool, root_dir, index_page_name, namespace):
   """Get all symbols listed in the index page. All symbols should be in the
   given namespace.

   Args:
     pool: worker pool; its apply_async() is used to parse symbol pages in
       parallel.
     root_dir: directory that contains the index page; symbol page hrefs are
       joined onto it.
     index_page_name: name of the index page file inside root_dir.
     namespace: namespace recorded on every returned Symbol.

   Returns a list of Symbols sorted by name; the headers of each Symbol are
   sorted as well, so the output is deterministic.
   """
   # io.open accepts `encoding` on both Python 2 and Python 3.
   import io

   # Workflow steps:
   #   1. Parse index page which lists all symbols to get symbol
   #      name (unqualified name) and its href link to the symbol page which
   #      contains the defined header.
   #   2. Parse the symbol page to get the defined header.
   index_page_path = os.path.join(root_dir, index_page_name)
   # Decode explicitly instead of relying on the platform default encoding,
   # and close the index file before waiting on the workers.
   with io.open(index_page_path, "r", encoding="utf-8") as f:
     index_page_html = f.read()

   # Read each symbol page in parallel.
   results = [] # (symbol_name, promise of [header...])
   index_entries = _ParseIndexPage(index_page_html)
   for symbol_name, symbol_page_path, variant in index_entries:
     # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
     # FIXME: use these as a fallback rather than ignoring entirely.
     if variant:
       continue
     path = os.path.join(root_dir, symbol_page_path)
     results.append((symbol_name,
                     pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

   # Build map from symbol name to a set of headers.
   symbol_headers = collections.defaultdict(set)
   for symbol_name, lazy_headers in results:
     symbol_headers[symbol_name].update(lazy_headers.get())

   # Sets have no stable iteration order, so sort the headers to keep the
   # result identical from run to run.
   return [Symbol(name, namespace, sorted(symbol_headers[name]))
           for name in sorted(symbol_headers)]


 def _IgnoreSigint():
   """Pool-worker initializer: make the worker process ignore SIGINT."""
   signal.signal(signal.SIGINT, signal.SIG_IGN)


 def GetSymbols(parse_pages):
   """Get all symbols by parsing the given pages.

   Args:
     parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

   Returns a list of Symbols, accumulated page by page in the order given.
   """
   symbols = []
   # Run many workers to process individual symbol pages under the symbol index.
   # Don't allow workers to capture Ctrl-C.
   # The initializer is a named module-level function rather than a lambda:
   # a lambda cannot be pickled, which multiprocessing needs in order to hand
   # the initializer to workers under the "spawn" start method.
   pool = multiprocessing.Pool(initializer=_IgnoreSigint)
   try:
     for root_dir, page_name, namespace in parse_pages:
       symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
   finally:
     # Always tear the workers down, even if parsing a page raised.
     pool.terminate()
     pool.join()
   return symbols
	#!/usr/bin/env python
	#===- cppreference_parser.py - ------------------------------- python ---===#
	#
	# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	# See https://llvm.org/LICENSE.txt for license information.
	# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	#
	#===------------------------------------------------------------------------===#

	from bs4 import BeautifulSoup, NavigableString

	import collections
	import multiprocessing
	import os
	import re
	import signal
	import sys


	class Symbol:
	  """A library symbol paired with the headers that provide it.

	  Attributes:
	    name: unqualified symbol name, e.g. "move".
	    namespace: namespace of the symbol with a trailing "::", e.g. "std::";
	      "" denotes the global scope and None is used for C symbols.
	    headers: a list of the corresponding headers.
	  """

	  def __init__(self, name, namespace, headers):
	    self.name, self.namespace, self.headers = name, namespace, headers


	def _HasClass(tag, *classes):
	  """Return True if `tag` carries at least one of the given CSS classes.

	  The classes are read with tag.get('class', []), so a tag without a
	  "class" attribute never matches.
	  """
	  return any(css_class in classes for css_class in tag.get('class', []))


	def _ParseSymbolPage(symbol_page_html, symbol_name):
	  """Parse a symbol page and retrieve the headers that declare the symbol.

	  The symbol page provides the header for the symbol, specifically in the
	  "Defined in header <header>" section. An example:

	  <tr class="t-dsc-header">
	    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
	  </td></tr>

	  Args:
	    symbol_page_html: HTML text of the symbol page.
	    symbol_name: symbol name to look for in the declaration rows.

	  Returns:
	    A set of header strings. If no declaration row names the symbol, the set
	    of every header named in a "Defined in header" row is returned instead.
	  """
	  headers = set()
	  all_headers = set()

	  soup = BeautifulSoup(symbol_page_html, "html.parser")
	  # Rows in table are like:
	  #   Defined in header <foo>      .t-dsc-header
	  #   Defined in header <bar>      .t-dsc-header
	  #   decl1                        .t-dcl
	  #   Defined in header <baz>      .t-dsc-header
	  #   decl2                        .t-dcl
	  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
	    current_headers = []
	    was_decl = False
	    for row in table.select('tr'):
	      if _HasClass(row, 't-dcl', 't-dsc'):
	        was_decl = True
	        # Symbols are in the first cell. find() returns None when the row
	        # has no <td> at all; such a row cannot name the symbol, so skip it
	        # instead of failing on the attribute access.
	        first_cell = row.find('td')
	        if first_cell is None:
	          continue
	        if symbol_name not in first_cell.stripped_strings:
	          continue
	        headers.update(current_headers)
	      elif _HasClass(row, 't-dsc-header'):
	        # If we saw a decl since the last header, this is a new block of
	        # headers for a new block of decls.
	        if was_decl:
	          current_headers = []
	        was_decl = False
	        # There are also .t-dsc-header for "defined in namespace".
	        if "Defined in header " not in row.text:
	          continue
	        # The interesting header content (e.g. <cstdlib>) is wrapped in
	        # <code>.
	        for header_code in row.find_all("code"):
	          current_headers.append(header_code.text)
	          all_headers.add(header_code.text)
	  # If the symbol was never named, consider all named headers.
	  return headers or all_headers


	def _ParseIndexPage(index_page_html):
	  """Parse the symbol index page.

	  The index page lists all std symbols together with hrefs to their detail
	  pages (which contain the defining header). An example:

	  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
	  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

	  Returns a list of (symbol_name, relative_path_to_symbol_page, variant)
	  tuples, where variant is true when the text right after the link contains
	  a "(" annotation.
	  """
	  soup = BeautifulSoup(index_page_html, "html.parser")
	  entries = []
	  for link in soup.select("a[title]"):
	    # Annotated symbols like "acos<>() (std::complex)" are flagged as
	    # variants. They tend to be overloads, and the primary is more useful.
	    # begin/end slip through despite their (iterator) caption because the
	    # (since C++11) note comes first. They are good symbols, so that quirk
	    # is left alone.
	    trailing = link.next_sibling
	    is_variant = isinstance(trailing, NavigableString) and "(" in trailing
	    name_tag = link.find("tt")
	    if not name_tag:
	      continue
	    # Strip any trailing <>() from the displayed name.
	    bare_name = name_tag.text.rstrip("<>()")
	    entries.append((bare_name, link["href"], is_variant))
	  return entries


	def _ReadSymbolPage(path, name):
	  """Read the symbol page at `path` and return the headers found for `name`.

	  The file is decoded explicitly as UTF-8 so the result does not depend on
	  the platform's default encoding.
	  NOTE(review): assumes the cppreference pages are UTF-8 encoded - confirm.

	  Args:
	    path: filesystem path of the symbol page.
	    name: symbol name passed through to _ParseSymbolPage.

	  Returns whatever _ParseSymbolPage returns for the page contents.
	  """
	  # io.open accepts `encoding` on both Python 2 and Python 3.
	  import io
	  with io.open(path, encoding="utf-8") as f:
	    return _ParseSymbolPage(f.read(), name)


	def _GetSymbols(pool, root_dir, index_page_name, namespace):
	  """Get all symbols listed in the index page. All symbols should be in the
	  given namespace.

	  Args:
	    pool: worker pool; its apply_async() is used to parse symbol pages in
	      parallel.
	    root_dir: directory that contains the index page; symbol page hrefs are
	      joined onto it.
	    index_page_name: name of the index page file inside root_dir.
	    namespace: namespace recorded on every returned Symbol.

	  Returns a list of Symbols sorted by name; the headers of each Symbol are
	  sorted as well, so the output is deterministic.
	  """
	  # io.open accepts `encoding` on both Python 2 and Python 3.
	  import io

	  # Workflow steps:
	  #   1. Parse index page which lists all symbols to get symbol
	  #      name (unqualified name) and its href link to the symbol page which
	  #      contains the defined header.
	  #   2. Parse the symbol page to get the defined header.
	  index_page_path = os.path.join(root_dir, index_page_name)
	  # Decode explicitly instead of relying on the platform default encoding,
	  # and close the index file before waiting on the workers.
	  with io.open(index_page_path, "r", encoding="utf-8") as f:
	    index_page_html = f.read()

	  # Read each symbol page in parallel.
	  results = [] # (symbol_name, promise of [header...])
	  index_entries = _ParseIndexPage(index_page_html)
	  for symbol_name, symbol_page_path, variant in index_entries:
	    # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
	    # FIXME: use these as a fallback rather than ignoring entirely.
	    if variant:
	      continue
	    path = os.path.join(root_dir, symbol_page_path)
	    results.append((symbol_name,
	                    pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

	  # Build map from symbol name to a set of headers.
	  symbol_headers = collections.defaultdict(set)
	  for symbol_name, lazy_headers in results:
	    symbol_headers[symbol_name].update(lazy_headers.get())

	  # Sets have no stable iteration order, so sort the headers to keep the
	  # result identical from run to run.
	  return [Symbol(name, namespace, sorted(symbol_headers[name]))
	          for name in sorted(symbol_headers)]


	def _IgnoreSigint():
	  """Pool-worker initializer: make the worker process ignore SIGINT."""
	  signal.signal(signal.SIGINT, signal.SIG_IGN)


	def GetSymbols(parse_pages):
	  """Get all symbols by parsing the given pages.

	  Args:
	    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

	  Returns a list of Symbols, accumulated page by page in the order given.
	  """
	  symbols = []
	  # Run many workers to process individual symbol pages under the symbol index.
	  # Don't allow workers to capture Ctrl-C.
	  # The initializer is a named module-level function rather than a lambda:
	  # a lambda cannot be pickled, which multiprocessing needs in order to hand
	  # the initializer to workers under the "spawn" start method.
	  pool = multiprocessing.Pool(initializer=_IgnoreSigint)
	  try:
	    for root_dir, page_name, namespace in parse_pages:
	      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
	  finally:
	    # Always tear the workers down, even if parsing a page raised.
	    pool.terminate()
	    pool.join()
	  return symbols