libc/src/stdio/scanf_core/int_converter.cpp - llvm-project - Git at Google

 //===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "src/stdio/scanf_core/int_converter.h"

 #include "src/__support/CPP/limits.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/scanf_core/converter_utils.h"
 #include "src/stdio/scanf_core/core_structs.h"
 #include "src/stdio/scanf_core/reader.h"

 #include <stddef.h>

 namespace LIBC_NAMESPACE_DECL {
 namespace scanf_core {

 // This code is very similar to the code in __support/str_to_integer.h but is
 // not quite the same. Here is the list of differences and why they exist:
 //  1) This takes a reader and a format section instead of a char* and the base.
 //      This should be fairly self explanatory. While the char* could be adapted
 //      to a reader and the base could be calculated ahead of time, the
 //      semantics are slightly different, specifically a char* can be indexed
 //      freely (I can read str[2] and then str[0]) whereas a File (which the
 //      reader may contain) cannot.
 //  2) Because this uses a Reader, this function can only unget once.
 //      This is relevant because scanf specifies it reads the "longest sequence
 //      of input characters which does not exceed any specified field width and
 //      which is, or is a prefix of, a matching input sequence." Whereas the
 //      strtol function accepts "the longest initial subsequence of the input
 //      string (...) that is of the expected form." This is demonstrated by the
 //      differences in how they deal with the string "0xZZZ" when parsing as
 //      hexadecimal. Scanf will read the "0x" as a valid prefix and then report
 //      a matching failure (so no value is converted), since it reads the first
 //      'Z', sees that it's not a valid hex digit, and reverses one character.
 //      The strtol function on the other hand only
 //      accepts the "0" since that's the longest valid hexadecimal sequence. It
 //      sees the 'Z' after the "0x" and determines that this is not the prefix
 //      to a valid hex string.
 //  3) This conversion may have a maximum width.
 //      If a maximum width is specified, this conversion is only allowed to
 //      accept a certain number of characters. Strtol doesn't have any such
 //      limitation.
 // Reads one integer input item from `reader` as described by `to_conv` and
 // passes the parsed value to write_int_with_length. Returns READ_OK once at
 // least one digit has been accepted, and MATCHING_FAILURE otherwise.
 int convert_int(Reader *reader, const FormatSection &to_conv) {
   // Conversion specifiers handled here, following the C standard's wording:
   //   %d    optionally signed decimal integer (base 10), signed result.
   //   %i    optionally signed integer, base detected from the input (base 0),
   //         signed result.
   //   %u    optionally signed decimal integer (base 10), unsigned result.
   //   %o    optionally signed octal integer (base 8), unsigned result.
   //   %x/%X optionally signed hexadecimal integer (base 16), unsigned result.
   // A conv_name of 'p' is routed through the base 16 path as well.

   // Without a positive field width the limit stays at the largest size_t.
   size_t width_left = cpp::numeric_limits<size_t>::max();
   if (to_conv.max_width > 0)
     width_left = to_conv.max_width;

   const auto conv = to_conv.conv_name;
   const bool is_signed = (conv == 'i' || conv == 'd');

   // Base 10 covers 'd' and every conv_name not singled out below ('u').
   int base = 10;
   if (conv == 'i') {
     base = 0;
   } else if (conv == 'o') {
     base = 8;
   } else if (internal::tolower(conv) == 'x' || conv == 'p') {
     base = 16;
   }

   char cur = reader->getc();

   // An optional leading sign is consumed and counted against the width.
   bool is_negative = false;
   if (cur == '+' || cur == '-') {
     is_negative = (cur == '-');
     if (width_left <= 1) {
       // The sign used up the whole field width, so no digit can follow. Zero
       // is written and the conversion is reported as failed.
       write_int_with_length(0, to_conv);
       return MATCHING_FAILURE;
     }
     --width_left;
     cur = reader->getc();
   }

   bool saw_digit = false;

   // Only base 0 (auto-detect) and base 16 care about a leading '0': it may
   // introduce a "0x" prefix, and for base 0 it otherwise selects octal.
   if (base == 0 || base == 16) {
     if (cur == '0') {
       if (width_left <= 1) {
         // The single '0' is the whole input item.
         write_int_with_length(0, to_conv);
         return READ_OK;
       }
       --width_left;
       cur = reader->getc();

       if (internal::tolower(cur) == 'x') {
         // "0x" by itself is a prefix of a matching sequence but not a number,
         // so the leading '0' is not counted as a digit here. The standard
         // defines an input item as the longest sequence that is, or is a
         // prefix of, a matching sequence, and says the directive fails when
         // that item is not itself a matching sequence. A hex digit therefore
         // has to follow for this conversion to succeed.
         base = 16;
         if (width_left <= 1)
           return MATCHING_FAILURE;
         --width_left;
         cur = reader->getc();
       } else {
         // The '0' stands on its own as a valid digit.
         saw_digit = true;
         if (base == 0)
           base = 8;
       }
     } else if (base == 0) {
       if (!internal::isdigit(cur)) {
         // Not a digit at all: give the character back, write zero, and
         // report the failure.
         reader->ungetc(cur);
         write_int_with_length(0, to_conv);
         return MATCHING_FAILURE;
       }
       // Any other leading decimal digit means base 10.
       base = 10;
     }
   }

   constexpr uintmax_t UNSIGNED_LIMIT = cpp::numeric_limits<uintmax_t>::max();
   constexpr uintmax_t POSITIVE_LIMIT =
       static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
   // Magnitude of the most negative intmax_t value.
   constexpr uintmax_t NEGATIVE_LIMIT = POSITIVE_LIMIT + 1;

   // Largest magnitude that may be accumulated before the value saturates.
   uintmax_t limit = UNSIGNED_LIMIT;
   if (is_signed)
     limit = is_negative ? NEGATIVE_LIMIT : POSITIVE_LIMIT;

   const uintmax_t limit_div_by_base = limit / base;

   // True when `c` is a digit of the (now final) base.
   const auto fits_base = [base](char c) {
     return internal::isalnum(c) && internal::b36_char_to_int(c) < base;
   };

   if (fits_base(cur))
     saw_digit = true;

   uintmax_t magnitude = 0;
   bool overflowed = false;
   size_t taken = 0;
   while (taken < width_left && fits_base(cur)) {
     const uintmax_t digit = internal::b36_char_to_int(cur);

     if (magnitude == limit) {
       // Already saturated: keep consuming digits but ignore their value.
       overflowed = true;
     } else {
       if (magnitude > limit_div_by_base) {
         magnitude = limit;
         overflowed = true;
       } else {
         magnitude = magnitude * base;
       }

       if (magnitude > limit - digit) {
         magnitude = limit;
         overflowed = true;
       } else {
         magnitude = magnitude + digit;
       }
     }

     ++taken;
     cur = reader->getc();
   }

   // The loop always fetches one character beyond the ones it used, so that
   // character is handed back to the reader.
   reader->ungetc(cur);

   if (!saw_digit)
     return MATCHING_FAILURE;

   // On overflow the saturation limit is stored; otherwise the magnitude with
   // the sign applied.
   uintmax_t to_store = limit;
   if (!overflowed)
     to_store = is_negative ? -magnitude : magnitude;

   write_int_with_length(to_store, to_conv);
   return READ_OK;
 }

 } // namespace scanf_core
 } // namespace LIBC_NAMESPACE_DECL
	//===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "src/stdio/scanf_core/int_converter.h"

	#include "src/__support/CPP/limits.h"
	#include "src/__support/ctype_utils.h"
	#include "src/__support/macros/config.h"
	#include "src/stdio/scanf_core/converter_utils.h"
	#include "src/stdio/scanf_core/core_structs.h"
	#include "src/stdio/scanf_core/reader.h"

	#include <stddef.h>

	namespace LIBC_NAMESPACE_DECL {
	namespace scanf_core {

	// This code is very similar to the code in __support/str_to_integer.h but is
	// not quite the same. Here is the list of differences and why they exist:
	// 1) This takes a reader and a format section instead of a char* and the base.
	// This should be fairly self explanatory. While the char* could be adapted
	// to a reader and the base could be calculated ahead of time, the
	// semantics are slightly different, specifically a char* can be indexed
	// freely (I can read str[2] and then str[0]) whereas a File (which the
	// reader may contain) cannot.
	// 2) Because this uses a Reader, this function can only unget once.
	// This is relevant because scanf specifies it reads the "longest sequence
	// of input characters which does not exceed any specified field width and
	// which is, or is a prefix of, a matching input sequence." Whereas the
	// strtol function accepts "the longest initial subsequence of the input
	// string (...) that is of the expected form." This is demonstrated by the
	// differences in how they deal with the string "0xZZZ" when parsing as
	// hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
	// since it reads the first 'Z', sees that it's not a valid hex digit, and
	// reverses one character. The strtol function on the other hand only
	// accepts the "0" since that's the longest valid hexadecimal sequence. It
	// sees the 'Z' after the "0x" and determines that this is not the prefix
	// to a valid hex string.
	// 3) This conversion may have a maximum width.
	// If a maximum width is specified, this conversion is only allowed to
	// accept a certain number of characters. Strtol doesn't have any such
	// limitation.
	// Reads one integer input item from `reader` as described by `to_conv` and
	// passes the parsed value to write_int_with_length. Returns READ_OK once at
	// least one digit has been accepted, and MATCHING_FAILURE otherwise.
	int convert_int(Reader *reader, const FormatSection &to_conv) {
	  // %d "Matches an optionally signed decimal integer [...] with the value 10
	  // for the base argument. The corresponding argument shall be a pointer to
	  // signed integer."

	  // %i "Matches an optionally signed integer [...] with the value 0 for the
	  // base argument. The corresponding argument shall be a pointer to signed
	  // integer."

	  // %u "Matches an optionally signed decimal integer [...] with the value 10
	  // for the base argument. The corresponding argument shall be a pointer to
	  // unsigned integer"

	  // %o "Matches an optionally signed octal integer [...] with the value 8 for
	  // the base argument. The corresponding argument shall be a pointer to
	  // unsigned integer"

	  // %x/X "Matches an optionally signed hexadecimal integer [...] with the value
	  // 16 for the base argument. The corresponding argument shall be a pointer to
	  // unsigned integer"

	  // Without a positive field width the limit stays at the largest size_t.
	  size_t max_width = cpp::numeric_limits<size_t>::max();
	  if (to_conv.max_width > 0) {
	    max_width = to_conv.max_width;
	  }

	  uintmax_t result = 0;
	  bool is_number = false;
	  bool is_signed = false;
	  int base = 0;
	  if (to_conv.conv_name == 'i') {
	    base = 0;
	    is_signed = true;
	  } else if (to_conv.conv_name == 'o') {
	    base = 8;
	  } else if (internal::tolower(to_conv.conv_name) == 'x' ||
	             to_conv.conv_name == 'p') {
	    base = 16;
	  } else if (to_conv.conv_name == 'd') {
	    base = 10;
	    is_signed = true;
	  } else { // conv_name must be 'u'
	    base = 10;
	  }

	  char cur_char = reader->getc();

	  // An optional leading sign is consumed and counted against the width.
	  char result_sign = '+';
	  if (cur_char == '+' || cur_char == '-') {
	    result_sign = cur_char;
	    if (max_width > 1) {
	      --max_width;
	      cur_char = reader->getc();
	    } else {
	      // If the max width has been hit already, then the return value must be 0
	      // since no actual digits of the number have been parsed yet.
	      write_int_with_length(0, to_conv);
	      return MATCHING_FAILURE;
	    }
	  }
	  const bool is_negative = result_sign == '-';

	  // Base of 0 means automatically determine the base. Base of 16 may have a
	  // prefix of "0x"
	  if (base == 0 || base == 16) {
	    // If the first character is 0, then it could be octal or hex.
	    if (cur_char == '0') {
	      is_number = true;

	      // Read the next character to check.
	      if (max_width > 1) {
	        --max_width;
	        cur_char = reader->getc();
	      } else {
	        // The single '0' is the whole input item.
	        write_int_with_length(0, to_conv);
	        return READ_OK;
	      }

	      if (internal::tolower(cur_char) == 'x') {
	        // This is a valid hex prefix.

	        is_number = false;
	        // A valid hex prefix is not necessarily a valid number. For the
	        // conversion to be valid it needs to use all of the characters it
	        // consumes. From the standard:
	        // 7.23.6.2 paragraph 9: "An input item is defined as the longest
	        // sequence of input characters which does not exceed any specified
	        // field width and which is, or is a prefix of, a matching input
	        // sequence."
	        // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence,
	        // the execution of the directive fails: this condition is a matching
	        // failure"
	        base = 16;
	        if (max_width > 1) {
	          --max_width;
	          cur_char = reader->getc();
	        } else {
	          return MATCHING_FAILURE;
	        }

	      } else {
	        if (base == 0) {
	          base = 8;
	        }
	      }
	    } else if (base == 0) {
	      if (internal::isdigit(cur_char)) {
	        // If the first character is a different number, then it's 10.
	        base = 10;
	      } else {
	        // If the first character isn't a valid digit, then there are no valid
	        // digits at all. The number is 0.
	        reader->ungetc(cur_char);
	        write_int_with_length(0, to_conv);
	        return MATCHING_FAILURE;
	      }
	    }
	  }

	  constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
	  constexpr uintmax_t SIGNED_MAX =
	      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
	  // Magnitude of the most negative intmax_t value.
	  constexpr uintmax_t NEGATIVE_SIGNED_MAX =
	      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;

	  // Largest magnitude that may be accumulated before the value saturates.
	  const uintmax_t MAX =
	      (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX)
	                 : UNSIGNED_MAX);

	  const uintmax_t max_div_by_base = MAX / base;

	  if (internal::isalnum(cur_char) &&
	      internal::b36_char_to_int(cur_char) < base) {
	    is_number = true;
	  }

	  bool has_overflow = false;
	  size_t i = 0;
	  for (; i < max_width && internal::isalnum(cur_char) &&
	         internal::b36_char_to_int(cur_char) < base;
	       ++i, cur_char = reader->getc()) {

	    uintmax_t cur_digit = internal::b36_char_to_int(cur_char);

	    if (result == MAX) {
	      // Already saturated: keep consuming digits but ignore their value.
	      has_overflow = true;
	      continue;
	    } else if (result > max_div_by_base) {
	      result = MAX;
	      has_overflow = true;
	    } else {
	      result = result * base;
	    }

	    if (result > MAX - cur_digit) {
	      result = MAX;
	      has_overflow = true;
	    } else {
	      result = result + cur_digit;
	    }
	  }

	  // We always read one more character than will be used, so we have to put the
	  // last one back.
	  reader->ungetc(cur_char);

	  if (!is_number)
	    return MATCHING_FAILURE;

	  if (has_overflow) {
	    write_int_with_length(MAX, to_conv);
	  } else {
	    if (is_negative)
	      result = -result;

	    write_int_with_length(result, to_conv);
	  }

	  return READ_OK;
	}

	} // namespace scanf_core
	} // namespace LIBC_NAMESPACE_DECL