| //===-- Int type specifier converters for scanf -----------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "src/stdio/scanf_core/int_converter.h" |
| |
| #include "src/__support/CPP/limits.h" |
| #include "src/__support/ctype_utils.h" |
| #include "src/__support/macros/config.h" |
| #include "src/stdio/scanf_core/converter_utils.h" |
| #include "src/stdio/scanf_core/core_structs.h" |
| #include "src/stdio/scanf_core/reader.h" |
| |
| #include <stddef.h> |
| |
| namespace LIBC_NAMESPACE_DECL { |
| namespace scanf_core { |
| |
| // This code is very similar to the code in __support/str_to_integer.h but is |
| // not quite the same. Here is the list of differences and why they exist: |
| // 1) This takes a reader and a format section instead of a char* and the base. |
| // This should be fairly self explanatory. While the char* could be adapted |
| // to a reader and the base could be calculated ahead of time, the |
| // semantics are slightly different, specifically a char* can be indexed |
| // freely (I can read str[2] and then str[0]) whereas a File (which the |
| // reader may contain) cannot. |
| // 2) Because this uses a Reader, this function can only unget once. |
| // This is relevant because scanf specifies it reads the "longest sequence |
| // of input characters which does not exceed any specified field width and |
| // which is, or is a prefix of, a matching input sequence." Whereas the |
| // strtol function accepts "the longest initial subsequence of the input |
| // string (...) that is of the expected form." This is demonstrated by the |
| // differences in how they deal with the string "0xZZZ" when parsing as |
| // hexadecimal. Scanf will read the "0x" as a valid prefix and return 0, |
| // since it reads the first 'Z', sees that it's not a valid hex digit, and |
| // reverses one character. The strtol function on the other hand only |
| // accepts the "0" since that's the longest valid hexadecimal sequence. It |
| // sees the 'Z' after the "0x" and determines that this is not the prefix |
| // to a valid hex string. |
| // 3) This conversion may have a maximum width. |
| // If a maximum width is specified, this conversion is only allowed to |
| // accept a certain number of characters. Strtol doesn't have any such |
| // limitation. |
| int convert_int(Reader *reader, const FormatSection &to_conv) { |
| // %d "Matches an optionally signed decimal integer [...] with the value 10 |
| // for the base argument. The corresponding argument shall be a pointer to |
| // signed integer." |
| |
| // %i "Matches an optionally signed integer [...] with the value 0 for the |
| // base argument. The corresponding argument shall be a pointer to signed |
| // integer." |
| |
| // %u "Matches an optionally signed decimal integer [...] with the value 10 |
| // for the base argument. The corresponding argument shall be a pointer to |
| // unsigned integer" |
| |
| // %o "Matches an optionally signed octal integer [...] with the value 8 for |
| // the base argument. The corresponding argument shall be a pointer to |
| // unsigned integer" |
| |
| // %x/X "Matches an optionally signed hexadecimal integer [...] with the value |
| // 16 for the base argument. The corresponding argument shall be a pointer to |
| // unsigned integer" |
| |
| size_t max_width = cpp::numeric_limits<size_t>::max(); |
| if (to_conv.max_width > 0) { |
| max_width = to_conv.max_width; |
| } |
| |
| uintmax_t result = 0; |
| bool is_number = false; |
| bool is_signed = false; |
| int base = 0; |
| if (to_conv.conv_name == 'i') { |
| base = 0; |
| is_signed = true; |
| } else if (to_conv.conv_name == 'o') { |
| base = 8; |
| } else if (internal::tolower(to_conv.conv_name) == 'x' || |
| to_conv.conv_name == 'p') { |
| base = 16; |
| } else if (to_conv.conv_name == 'd') { |
| base = 10; |
| is_signed = true; |
| } else { // conv_name must be 'u' |
| base = 10; |
| } |
| |
| char cur_char = reader->getc(); |
| |
| char result_sign = '+'; |
| if (cur_char == '+' || cur_char == '-') { |
| result_sign = cur_char; |
| if (max_width > 1) { |
| --max_width; |
| cur_char = reader->getc(); |
| } else { |
| // If the max width has been hit already, then the return value must be 0 |
| // since no actual digits of the number have been parsed yet. |
| write_int_with_length(0, to_conv); |
| return MATCHING_FAILURE; |
| } |
| } |
| const bool is_negative = result_sign == '-'; |
| |
| // Base of 0 means automatically determine the base. Base of 16 may have a |
| // prefix of "0x" |
| if (base == 0 || base == 16) { |
| // If the first character is 0, then it could be octal or hex. |
| if (cur_char == '0') { |
| is_number = true; |
| |
| // Read the next character to check. |
| if (max_width > 1) { |
| --max_width; |
| cur_char = reader->getc(); |
| } else { |
| write_int_with_length(0, to_conv); |
| return READ_OK; |
| } |
| |
| if (internal::tolower(cur_char) == 'x') { |
| // This is a valid hex prefix. |
| |
| is_number = false; |
| // A valid hex prefix is not necessarily a valid number. For the |
| // conversion to be valid it needs to use all of the characters it |
| // consumes. From the standard: |
| // 7.23.6.2 paragraph 9: "An input item is defined as the longest |
| // sequence of input characters which does not exceed any specified |
| // field width and which is, or is a prefix of, a matching input |
| // sequence." |
| // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence, |
| // the execution of the directive fails: this condition is a matching |
| // failure" |
| base = 16; |
| if (max_width > 1) { |
| --max_width; |
| cur_char = reader->getc(); |
| } else { |
| return MATCHING_FAILURE; |
| } |
| |
| } else { |
| if (base == 0) { |
| base = 8; |
| } |
| } |
| } else if (base == 0) { |
| if (internal::isdigit(cur_char)) { |
| // If the first character is a different number, then it's 10. |
| base = 10; |
| } else { |
| // If the first character isn't a valid digit, then there are no valid |
| // digits at all. The number is 0. |
| reader->ungetc(cur_char); |
| write_int_with_length(0, to_conv); |
| return MATCHING_FAILURE; |
| } |
| } |
| } |
| |
| constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max(); |
| constexpr uintmax_t SIGNED_MAX = |
| static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()); |
| constexpr uintmax_t NEGATIVE_SIGNED_MAX = |
| static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1; |
| |
| const uintmax_t MAX = |
| (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) |
| : UNSIGNED_MAX); |
| |
| const uintmax_t max_div_by_base = MAX / base; |
| |
| if (internal::isalnum(cur_char) && |
| internal::b36_char_to_int(cur_char) < base) { |
| is_number = true; |
| } |
| |
| bool has_overflow = false; |
| size_t i = 0; |
| for (; i < max_width && internal::isalnum(cur_char) && |
| internal::b36_char_to_int(cur_char) < base; |
| ++i, cur_char = reader->getc()) { |
| |
| uintmax_t cur_digit = internal::b36_char_to_int(cur_char); |
| |
| if (result == MAX) { |
| has_overflow = true; |
| continue; |
| } else if (result > max_div_by_base) { |
| result = MAX; |
| has_overflow = true; |
| } else { |
| result = result * base; |
| } |
| |
| if (result > MAX - cur_digit) { |
| result = MAX; |
| has_overflow = true; |
| } else { |
| result = result + cur_digit; |
| } |
| } |
| |
| // We always read one more character than will be used, so we have to put the |
| // last one back. |
| reader->ungetc(cur_char); |
| |
| if (!is_number) |
| return MATCHING_FAILURE; |
| |
| if (has_overflow) { |
| write_int_with_length(MAX, to_conv); |
| } else { |
| if (is_negative) |
| result = -result; |
| |
| write_int_with_length(result, to_conv); |
| } |
| |
| return READ_OK; |
| } |
| |
| } // namespace scanf_core |
| } // namespace LIBC_NAMESPACE_DECL |