utils/unicode-case-fold.py - llvm-project/llvm - Git at Google

 #!/usr/bin/env python
 """
 Unicode case folding database conversion utility

 Parses the database and generates a C++ function which implements the case
 folding algorithm. The database entries are of the form:

   <code>; <status>; <mapping>; # <name>

 <status> can be one of four characters:
   C - Common mappings
   S - mappings for Simple case folding
   F - mappings for Full case folding
   T - special case for Turkish I characters

 Right now this generates a function which implements simple case folding (C+S
 entries).
 """

 from __future__ import print_function

 import sys
 import re

 try:
     from urllib.request import urlopen
 except ImportError:
     from urllib2 import urlopen


 # This variable will body of the mappings function
 body = ""

 # Reads file line-by-line, extracts Common and Simple case fold mappings and
 # returns a (from_char, to_char, from_name) tuple.
 def mappings(f):
     previous_from = -1
     expr = re.compile(r"^(.*); [CS]; (.*); # (.*)")
     for line in f:
         m = expr.match(line)
         if not m:
             continue
         from_char = int(m.group(1), 16)
         to_char = int(m.group(2), 16)
         from_name = m.group(3)

         if from_char <= previous_from:
             raise Exception("Duplicate or unsorted characters in input")
         yield from_char, to_char, from_name
         previous_from = from_char


 # Computes the shift (to_char - from_char) in a mapping.
 def shift(mapping):
     return mapping[1] - mapping[0]


 # Computes the stride (from_char2 - from_char1) of two mappings.
 def stride2(mapping1, mapping2):
     return mapping2[0] - mapping1[0]


 # Computes the stride of a list of mappings. The list should have at least two
 # mappings. All mappings in the list are assumed to have the same stride.
 def stride(block):
     return stride2(block[0], block[1])


 # b is a list of mappings. All the mappings are assumed to have the same
 # shift and the stride between adjecant mappings (if any) is constant.
 def dump_block(b):
     global body

     if len(b) == 1:
         # Special case for handling blocks of length 1. We don't even need to
         # emit the "if (C < X) return C" check below as all characters in this
         # range will be caught by the "C < X" check emitted by the first
         # non-trivial block.
         body += "  // {2}\n  if (C == {0:#06x})\n    return {1:#06x};\n".format(*b[0])
         return

     first = b[0][0]
     last = first + stride(b) * (len(b) - 1)
     modulo = first % stride(b)

     # All characters before this block map to themselves.
     body += "  if (C < {0:#06x})\n    return C;\n".format(first)
     body += "  // {0} characters\n".format(len(b))

     # Generic pattern: check upper bound (lower bound is checked by the "if"
     # above) and modulo of C, return C+shift.
     pattern = "  if (C <= {0:#06x} && C % {1} == {2})\n    return C + {3};\n"

     if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
         # Special case:
         # We can elide the modulo-check because the expression "C|1" will map
         # the intervening characters to themselves.
         pattern = "  if (C <= {0:#06x})\n    return C | 1;\n"
     elif stride(b) == 1:
         # Another special case: X % 1 is always zero, so don't emit the
         # modulo-check.
         pattern = "  if (C <= {0:#06x})\n    return C + {3};\n"

     body += pattern.format(last, stride(b), modulo, shift(b[0]))


 current_block = []
 f = urlopen(sys.argv[1])
 for m in mappings(f):
     if len(current_block) == 0:
         current_block.append(m)
         continue

     if shift(current_block[0]) != shift(m):
         # Incompatible shift, start a new block.
         dump_block(current_block)
         current_block = [m]
         continue

     if len(current_block) == 1 or stride(current_block) == stride2(
         current_block[-1], m
     ):
         current_block.append(m)
         continue

     # Incompatible stride, start a new block.
     dump_block(current_block)
     current_block = [m]
 f.close()

 dump_block(current_block)

 print(
     "//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//"
 )
 print("//")
 print("// This file was generated by utils/unicode-case-fold.py from the Unicode")
 print("// case folding database at")
 print("//   ", sys.argv[1])
 print("//")
 print("// To regenerate this file, run:")
 print("//   utils/unicode-case-fold.py \\")
 print('//     "{}" \\'.format(sys.argv[1]))
 print("//     > lib/Support/UnicodeCaseFold.cpp")
 print("//")
 print(
     "//===----------------------------------------------------------------------===//"
 )
 print("")
 print('#include "llvm/Support/Unicode.h"')
 print("")
 print("int llvm::sys::unicode::foldCharSimple(int C) {")
 print(body)
 print("  return C;")
 print("}")
	#!/usr/bin/env python
	"""
	Unicode case folding database conversion utility

	Parses the database and generates a C++ function which implements the case
	folding algorithm. The database entries are of the form:

	<code>; <status>; <mapping>; # <name>

	<status> can be one of four characters:
	C - Common mappings
	S - mappings for Simple case folding
	F - mappings for Full case folding
	T - special case for Turkish I characters

	Right now this generates a function which implements simple case folding (C+S
	entries).
	"""

	from __future__ import print_function

	import sys
	import re

	try:
	from urllib.request import urlopen
	except ImportError:
	from urllib2 import urlopen


	# This variable will body of the mappings function
	body = ""

	# Reads file line-by-line, extracts Common and Simple case fold mappings and
	# returns a (from_char, to_char, from_name) tuple.
	def mappings(f):
	previous_from = -1
	expr = re.compile(r"^(.); [CS]; (.); # (.*)")
	for line in f:
	m = expr.match(line)
	if not m:
	continue
	from_char = int(m.group(1), 16)
	to_char = int(m.group(2), 16)
	from_name = m.group(3)

	if from_char <= previous_from:
	raise Exception("Duplicate or unsorted characters in input")
	yield from_char, to_char, from_name
	previous_from = from_char


	# Computes the shift (to_char - from_char) in a mapping.
	def shift(mapping):
	return mapping[1] - mapping[0]


	# Computes the stride (from_char2 - from_char1) of two mappings.
	def stride2(mapping1, mapping2):
	return mapping2[0] - mapping1[0]


	# Computes the stride of a list of mappings. The list should have at least two
	# mappings. All mappings in the list are assumed to have the same stride.
	def stride(block):
	return stride2(block[0], block[1])


	# b is a list of mappings. All the mappings are assumed to have the same
	# shift and the stride between adjecant mappings (if any) is constant.
	def dump_block(b):
	global body

	if len(b) == 1:
	# Special case for handling blocks of length 1. We don't even need to
	# emit the "if (C < X) return C" check below as all characters in this
	# range will be caught by the "C < X" check emitted by the first
	# non-trivial block.
	body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
	return

	first = b[0][0]
	last = first + stride(b) * (len(b) - 1)
	modulo = first % stride(b)

	# All characters before this block map to themselves.
	body += " if (C < {0:#06x})\n return C;\n".format(first)
	body += " // {0} characters\n".format(len(b))

	# Generic pattern: check upper bound (lower bound is checked by the "if"
	# above) and modulo of C, return C+shift.
	pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"

	if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
	# Special case:
	# We can elide the modulo-check because the expression "C\|1" will map
	# the intervening characters to themselves.
	pattern = " if (C <= {0:#06x})\n return C \| 1;\n"
	elif stride(b) == 1:
	# Another special case: X % 1 is always zero, so don't emit the
	# modulo-check.
	pattern = " if (C <= {0:#06x})\n return C + {3};\n"

	body += pattern.format(last, stride(b), modulo, shift(b[0]))


	current_block = []
	f = urlopen(sys.argv[1])
	for m in mappings(f):
	if len(current_block) == 0:
	current_block.append(m)
	continue

	if shift(current_block[0]) != shift(m):
	# Incompatible shift, start a new block.
	dump_block(current_block)
	current_block = [m]
	continue

	if len(current_block) == 1 or stride(current_block) == stride2(
	current_block[-1], m
	):
	current_block.append(m)
	continue

	# Incompatible stride, start a new block.
	dump_block(current_block)
	current_block = [m]
	f.close()

	dump_block(current_block)

	print(
	"//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//"
	)
	print("//")
	print("// This file was generated by utils/unicode-case-fold.py from the Unicode")
	print("// case folding database at")
	print("// ", sys.argv[1])
	print("//")
	print("// To regenerate this file, run:")
	print("// utils/unicode-case-fold.py \\")
	print('// "{}" \\'.format(sys.argv[1]))
	print("// > lib/Support/UnicodeCaseFold.cpp")
	print("//")
	print(
	"//===----------------------------------------------------------------------===//"
	)
	print("")
	print('#include "llvm/Support/Unicode.h"')
	print("")
	print("int llvm::sys::unicode::foldCharSimple(int C) {")
	print(body)
	print(" return C;")
	print("}")