| //===-- Format string parser for scanf -------------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
| #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
| |
| #include "src/__support/arg_list.h" |
| #include "src/__support/ctype_utils.h" |
| #include "src/__support/str_to_integer.h" |
| #include "src/stdio/scanf_core/core_structs.h" |
| #include "src/stdio/scanf_core/scanf_config.h" |
| |
| #include <stddef.h> |
| |
| namespace LIBC_NAMESPACE { |
| namespace scanf_core { |
| |
| #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index) |
| #else |
| #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>() |
| #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| |
| template <typename ArgProvider> class Parser { |
| const char *__restrict str; |
| |
| size_t cur_pos = 0; |
| ArgProvider args_cur; |
| |
| #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| // args_start stores the start of the va_args, which is used when a previous |
| // argument is needed. In that case, we have to read the arguments from the |
| // beginning since they don't support reading backwards. |
| ArgProvider args_start; |
| size_t args_index = 1; |
| #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| |
| public: |
| #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) |
| : str(new_str), args_cur(args), args_start(args) {} |
| #else |
| LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) |
| : str(new_str), args_cur(args) {} |
| #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| |
| // get_next_section will parse the format string until it has a fully |
| // specified format section. This can either be a raw format section with no |
| // conversion, or a format section with a conversion that has all of its |
| // variables stored in the format section. |
| LIBC_INLINE FormatSection get_next_section() { |
| FormatSection section; |
| size_t starting_pos = cur_pos; |
| if (str[cur_pos] == '%') { |
| // format section |
| section.has_conv = true; |
| |
| ++cur_pos; |
| [[maybe_unused]] size_t conv_index = 0; |
| |
| #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| conv_index = parse_index(&cur_pos); |
| #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| |
| if (str[cur_pos] == '*') { |
| ++cur_pos; |
| section.flags = FormatFlags::NO_WRITE; |
| } |
| |
| // handle width |
| section.max_width = -1; |
| if (internal::isdigit(str[cur_pos])) { |
| auto result = internal::strtointeger<int>(str + cur_pos, 10); |
| section.max_width = result.value; |
| cur_pos = cur_pos + result.parsed_len; |
| } |
| |
| // TODO(michaelrj): add posix allocate flag support. |
| // if (str[cur_pos] == 'm') { |
| // ++cur_pos; |
| // section.flags = FormatFlags::ALLOCATE; |
| // } |
| |
| LengthModifier lm = parse_length_modifier(&cur_pos); |
| section.length_modifier = lm; |
| |
| section.conv_name = str[cur_pos]; |
| |
| // If NO_WRITE is not set, then read the next arg as the output pointer. |
| if ((section.flags & FormatFlags::NO_WRITE) == 0) { |
| // Since all outputs are pointers, there's no need to distinguish when |
| // reading from va_args. They're all the same size and stored the same. |
| section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); |
| } |
| |
| // If the end of the format section is on the '\0'. This means we need to |
| // not advance the cur_pos and we should not count this has having a |
| // conversion. |
| if (str[cur_pos] != '\0') { |
| ++cur_pos; |
| } else { |
| section.has_conv = false; |
| } |
| |
| // If the format is a bracketed one, then we need to parse out the insides |
| // of the brackets. |
| if (section.conv_name == '[') { |
| constexpr char CLOSING_BRACKET = ']'; |
| constexpr char INVERT_FLAG = '^'; |
| constexpr char RANGE_OPERATOR = '-'; |
| |
| cpp::bitset<256> scan_set; |
| bool invert = false; |
| |
| // The circumflex in the first position represents the inversion flag, |
| // but it's easier to apply that at the end so we just store it for now. |
| if (str[cur_pos] == INVERT_FLAG) { |
| invert = true; |
| ++cur_pos; |
| } |
| |
| // This is used to determine if a hyphen is being used as a literal or |
| // as a range operator. |
| size_t set_start_pos = cur_pos; |
| |
| // Normally the right bracket closes the set, but if it's the first |
| // character (possibly after the inversion flag) then it's instead |
| // included as a character in the set and the second right bracket |
| // closes the set. |
| if (str[cur_pos] == CLOSING_BRACKET) { |
| scan_set.set(CLOSING_BRACKET); |
| ++cur_pos; |
| } |
| |
| while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { |
| // If a hyphen is being used as a range operator, since it's neither |
| // at the beginning nor end of the set. |
| if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && |
| str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { |
| // Technically there is no requirement to correct the ordering of |
| // the range, but since the range operator is entirely |
| // implementation defined it seems like a good convenience. |
| char a = str[cur_pos - 1]; |
| char b = str[cur_pos + 1]; |
| char start = (a < b ? a : b); |
| char end = (a < b ? b : a); |
| scan_set.set_range(start, end); |
| cur_pos += 2; |
| } else { |
| scan_set.set(str[cur_pos]); |
| ++cur_pos; |
| } |
| } |
| if (invert) |
| scan_set.flip(); |
| |
| if (str[cur_pos] == CLOSING_BRACKET) { |
| ++cur_pos; |
| section.scan_set = scan_set; |
| } else { |
| // if the end of the string was encountered, this is not a valid set. |
| section.has_conv = false; |
| } |
| } |
| } else { |
| // raw section |
| section.has_conv = false; |
| while (str[cur_pos] != '%' && str[cur_pos] != '\0') |
| ++cur_pos; |
| } |
| section.raw_string = {str + starting_pos, cur_pos - starting_pos}; |
| return section; |
| } |
| |
| private: |
| // parse_length_modifier parses the length modifier inside a format string. It |
| // assumes that str[*local_pos] is inside a format specifier. It returns a |
| // LengthModifier with the length modifier it found. It will advance local_pos |
| // after the format specifier if one is found. |
| LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { |
| switch (str[*local_pos]) { |
| case ('l'): |
| if (str[*local_pos + 1] == 'l') { |
| *local_pos += 2; |
| return LengthModifier::ll; |
| } else { |
| ++*local_pos; |
| return LengthModifier::l; |
| } |
| case ('h'): |
| if (str[*local_pos + 1] == 'h') { |
| *local_pos += 2; |
| return LengthModifier::hh; |
| } else { |
| ++*local_pos; |
| return LengthModifier::h; |
| } |
| case ('L'): |
| ++*local_pos; |
| return LengthModifier::L; |
| case ('j'): |
| ++*local_pos; |
| return LengthModifier::j; |
| case ('z'): |
| ++*local_pos; |
| return LengthModifier::z; |
| case ('t'): |
| ++*local_pos; |
| return LengthModifier::t; |
| default: |
| return LengthModifier::NONE; |
| } |
| } |
| |
| // get_next_arg_value gets the next value from the arg list as type T. |
| template <class T> LIBC_INLINE T get_next_arg_value() { |
| return args_cur.template next_var<T>(); |
| } |
| |
| //---------------------------------------------------- |
| // INDEX MODE ONLY FUNCTIONS AFTER HERE: |
| //---------------------------------------------------- |
| |
| #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| |
| // parse_index parses the index of a value inside a format string. It |
| // assumes that str[*local_pos] points to character after a '%' or '*', and |
| // returns 0 if there is no closing $, or if it finds no number. If it finds a |
| // number, it will move local_pos past the end of the $, else it will not move |
| // local_pos. |
| LIBC_INLINE size_t parse_index(size_t *local_pos) { |
| if (internal::isdigit(str[*local_pos])) { |
| auto result = internal::strtointeger<int>(str + *local_pos, 10); |
| size_t index = result.value; |
| if (str[*local_pos + result.parsed_len] != '$') |
| return 0; |
| *local_pos = 1 + result.parsed_len + *local_pos; |
| return index; |
| } |
| return 0; |
| } |
| |
| // get_arg_value gets the value from the arg list at index (starting at 1). |
| // This may require parsing the format string. An index of 0 is interpreted as |
| // the next value. |
| template <class T> LIBC_INLINE T get_arg_value(size_t index) { |
| if (!(index == 0 || index == args_index)) |
| args_to_index(index); |
| |
| ++args_index; |
| return get_next_arg_value<T>(); |
| } |
| |
| // the ArgList can only return the next item in the list. This function is |
| // used in index mode when the item that needs to be read is not the next one. |
| // It moves cur_args to the index requested so the appropriate value may |
| // be read. This may involve parsing the format string, and is in the worst |
| // case an O(n^2) operation. |
| LIBC_INLINE void args_to_index(size_t index) { |
| if (args_index > index) { |
| args_index = 1; |
| args_cur = args_start; |
| } |
| |
| while (args_index < index) { |
| // Since all arguments must be pointers, we can just read all of them as |
| // void * and not worry about type issues. |
| args_cur.template next_var<void *>(); |
| ++args_index; |
| } |
| } |
| |
| #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| }; |
| |
| } // namespace scanf_core |
| } // namespace LIBC_NAMESPACE |
| |
| #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |