llvm/lib/Support/GlobPattern.cpp - llvm-project - Git at Google

 //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements a glob pattern matcher.
 //
 //===----------------------------------------------------------------------===//

 #include "llvm/Support/GlobPattern.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Errc.h"

 using namespace llvm;

 // Expands character ranges and returns a bitmap.
 // For example, "a-cf-hz" is expanded to "abcfghz".
 static Expected<BitVector> expand(StringRef S, StringRef Original) {
   BitVector BV(256, false);

   // Expand X-Y.
   for (;;) {
     if (S.size() < 3)
       break;

     uint8_t Start = S[0];
     uint8_t End = S[2];

     // If it doesn't start with something like X-Y,
     // consume the first character and proceed.
     if (S[1] != '-') {
       BV[Start] = true;
       S = S.substr(1);
       continue;
     }

     // It must be in the form of X-Y.
     // Validate it and then interpret the range.
     if (Start > End)
       return make_error<StringError>("invalid glob pattern: " + Original,
                                      errc::invalid_argument);

     for (int C = Start; C <= End; ++C)
       BV[(uint8_t)C] = true;
     S = S.substr(3);
   }

   for (char C : S)
     BV[(uint8_t)C] = true;
   return BV;
 }

 // Identify brace expansions in S and return the list of patterns they expand
 // into.
 static Expected<SmallVector<std::string, 1>>
 parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
   SmallVector<std::string> SubPatterns = {S.str()};
   if (!MaxSubPatterns || !S.contains('{'))
     return std::move(SubPatterns);

   struct BraceExpansion {
     size_t Start;
     size_t Length;
     SmallVector<StringRef, 2> Terms;
   };
   SmallVector<BraceExpansion, 0> BraceExpansions;

   BraceExpansion *CurrentBE = nullptr;
   size_t TermBegin;
   for (size_t I = 0, E = S.size(); I != E; ++I) {
     if (S[I] == '[') {
       I = S.find(']', I + 2);
       if (I == std::string::npos)
         return make_error<StringError>("invalid glob pattern, unmatched '['",
                                        errc::invalid_argument);
     } else if (S[I] == '{') {
       if (CurrentBE)
         return make_error<StringError>(
             "nested brace expansions are not supported",
             errc::invalid_argument);
       CurrentBE = &BraceExpansions.emplace_back();
       CurrentBE->Start = I;
       TermBegin = I + 1;
     } else if (S[I] == ',') {
       if (!CurrentBE)
         continue;
       CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
       TermBegin = I + 1;
     } else if (S[I] == '}') {
       if (!CurrentBE)
         continue;
       if (CurrentBE->Terms.empty())
         return make_error<StringError>(
             "empty or singleton brace expansions are not supported",
             errc::invalid_argument);
       CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
       CurrentBE->Length = I - CurrentBE->Start + 1;
       CurrentBE = nullptr;
     } else if (S[I] == '\\') {
       if (++I == E)
         return make_error<StringError>("invalid glob pattern, stray '\\'",
                                        errc::invalid_argument);
     }
   }
   if (CurrentBE)
     return make_error<StringError>("incomplete brace expansion",
                                    errc::invalid_argument);

   size_t NumSubPatterns = 1;
   for (auto &BE : BraceExpansions) {
     if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
       NumSubPatterns = std::numeric_limits<size_t>::max();
       break;
     }
     NumSubPatterns *= BE.Terms.size();
   }
   if (NumSubPatterns > *MaxSubPatterns)
     return make_error<StringError>("too many brace expansions",
                                    errc::invalid_argument);
   // Replace brace expansions in reverse order so that we don't invalidate
   // earlier start indices
   for (auto &BE : reverse(BraceExpansions)) {
     SmallVector<std::string> OrigSubPatterns;
     std::swap(SubPatterns, OrigSubPatterns);
     for (StringRef Term : BE.Terms)
       for (StringRef Orig : OrigSubPatterns)
         SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
   }
   return std::move(SubPatterns);
 }

 Expected<GlobPattern>
 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
   GlobPattern Pat;

   // Store the prefix that does not contain any metacharacter.
   size_t PrefixSize = S.find_first_of("?*[{\\");
   Pat.Prefix = S.substr(0, PrefixSize);
   if (PrefixSize == std::string::npos)
     return Pat;
   S = S.substr(PrefixSize);

   SmallVector<std::string, 1> SubPats;
   if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
     return std::move(Err);
   for (StringRef SubPat : SubPats) {
     auto SubGlobOrErr = SubGlobPattern::create(SubPat);
     if (!SubGlobOrErr)
       return SubGlobOrErr.takeError();
     Pat.SubGlobs.push_back(*SubGlobOrErr);
   }

   return Pat;
 }

 Expected<GlobPattern::SubGlobPattern>
 GlobPattern::SubGlobPattern::create(StringRef S) {
   SubGlobPattern Pat;

   // Parse brackets.
   Pat.Pat.assign(S.begin(), S.end());
   for (size_t I = 0, E = S.size(); I != E; ++I) {
     if (S[I] == '[') {
       // ']' is allowed as the first character of a character class. '[]' is
       // invalid. So, just skip the first character.
       ++I;
       size_t J = S.find(']', I + 1);
       if (J == StringRef::npos)
         return make_error<StringError>("invalid glob pattern, unmatched '['",
                                        errc::invalid_argument);
       StringRef Chars = S.substr(I, J - I);
       bool Invert = S[I] == '^' || S[I] == '!';
       Expected<BitVector> BV =
           Invert ? expand(Chars.substr(1), S) : expand(Chars, S);
       if (!BV)
         return BV.takeError();
       if (Invert)
         BV->flip();
       Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
       I = J;
     } else if (S[I] == '\\') {
       if (++I == E)
         return make_error<StringError>("invalid glob pattern, stray '\\'",
                                        errc::invalid_argument);
     }
   }
   return Pat;
 }

 bool GlobPattern::match(StringRef S) const {
   if (!S.consume_front(Prefix))
     return false;
   if (SubGlobs.empty() && S.empty())
     return true;
   for (auto &Glob : SubGlobs)
     if (Glob.match(S))
       return true;
   return false;
 }

 // Factor the pattern into segments split by '*'. The segment is matched
 // sequentianlly by finding the first occurrence past the end of the previous
 // match.
 bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
   const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
              *SavedS = S;
   const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
   size_t B = 0, SavedB = 0;
   while (S != End) {
     if (P == PEnd)
       ;
     else if (*P == '*') {
       // The non-* substring on the left of '*' matches the tail of S. Save the
       // positions to be used by backtracking if we see a mismatch later.
       SegmentBegin = ++P;
       SavedS = S;
       SavedB = B;
       continue;
     } else if (*P == '[') {
       if (Brackets[B].Bytes[uint8_t(*S)]) {
         P = Pat.data() + Brackets[B++].NextOffset;
         ++S;
         continue;
       }
     } else if (*P == '\\') {
       if (*++P == *S) {
         ++P;
         ++S;
         continue;
       }
     } else if (*P == *S || *P == '?') {
       ++P;
       ++S;
       continue;
     }
     if (!SegmentBegin)
       return false;
     // We have seen a '*'. Backtrack to the saved positions. Shift the S
     // position to probe the next starting position in the segment.
     P = SegmentBegin;
     S = ++SavedS;
     B = SavedB;
   }
   // All bytes in Str have been matched. Return true if the rest part of Pat is
   // empty or contains only '*'.
   return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
 }
	//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a glob pattern matcher.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Support/GlobPattern.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/Support/Errc.h"

	using namespace llvm;

	// Expands character ranges and returns a bitmap.
	// For example, "a-cf-hz" is expanded to "abcfghz".
	static Expected<BitVector> expand(StringRef S, StringRef Original) {
	BitVector BV(256, false);

	// Expand X-Y.
	for (;;) {
	if (S.size() < 3)
	break;

	uint8_t Start = S[0];
	uint8_t End = S[2];

	// If it doesn't start with something like X-Y,
	// consume the first character and proceed.
	if (S[1] != '-') {
	BV[Start] = true;
	S = S.substr(1);
	continue;
	}

	// It must be in the form of X-Y.
	// Validate it and then interpret the range.
	if (Start > End)
	return make_error<StringError>("invalid glob pattern: " + Original,
	errc::invalid_argument);

	for (int C = Start; C <= End; ++C)
	BV[(uint8_t)C] = true;
	S = S.substr(3);
	}

	for (char C : S)
	BV[(uint8_t)C] = true;
	return BV;
	}

	// Identify brace expansions in S and return the list of patterns they expand
	// into.
	static Expected<SmallVector<std::string, 1>>
	parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
	SmallVector<std::string> SubPatterns = {S.str()};
	if (!MaxSubPatterns \|\| !S.contains('{'))
	return std::move(SubPatterns);

	struct BraceExpansion {
	size_t Start;
	size_t Length;
	SmallVector<StringRef, 2> Terms;
	};
	SmallVector<BraceExpansion, 0> BraceExpansions;

	BraceExpansion *CurrentBE = nullptr;
	size_t TermBegin;
	for (size_t I = 0, E = S.size(); I != E; ++I) {
	if (S[I] == '[') {
	I = S.find(']', I + 2);
	if (I == std::string::npos)
	return make_error<StringError>("invalid glob pattern, unmatched '['",
	errc::invalid_argument);
	} else if (S[I] == '{') {
	if (CurrentBE)
	return make_error<StringError>(
	"nested brace expansions are not supported",
	errc::invalid_argument);
	CurrentBE = &BraceExpansions.emplace_back();
	CurrentBE->Start = I;
	TermBegin = I + 1;
	} else if (S[I] == ',') {
	if (!CurrentBE)
	continue;
	CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
	TermBegin = I + 1;
	} else if (S[I] == '}') {
	if (!CurrentBE)
	continue;
	if (CurrentBE->Terms.empty())
	return make_error<StringError>(
	"empty or singleton brace expansions are not supported",
	errc::invalid_argument);
	CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
	CurrentBE->Length = I - CurrentBE->Start + 1;
	CurrentBE = nullptr;
	} else if (S[I] == '\\') {
	if (++I == E)
	return make_error<StringError>("invalid glob pattern, stray '\\'",
	errc::invalid_argument);
	}
	}
	if (CurrentBE)
	return make_error<StringError>("incomplete brace expansion",
	errc::invalid_argument);

	size_t NumSubPatterns = 1;
	for (auto &BE : BraceExpansions) {
	if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
	NumSubPatterns = std::numeric_limits<size_t>::max();
	break;
	}
	NumSubPatterns *= BE.Terms.size();
	}
	if (NumSubPatterns > *MaxSubPatterns)
	return make_error<StringError>("too many brace expansions",
	errc::invalid_argument);
	// Replace brace expansions in reverse order so that we don't invalidate
	// earlier start indices
	for (auto &BE : reverse(BraceExpansions)) {
	SmallVector<std::string> OrigSubPatterns;
	std::swap(SubPatterns, OrigSubPatterns);
	for (StringRef Term : BE.Terms)
	for (StringRef Orig : OrigSubPatterns)
	SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
	}
	return std::move(SubPatterns);
	}

	Expected<GlobPattern>
	GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
	GlobPattern Pat;

	// Store the prefix that does not contain any metacharacter.
	size_t PrefixSize = S.find_first_of("?*[{\\");
	Pat.Prefix = S.substr(0, PrefixSize);
	if (PrefixSize == std::string::npos)
	return Pat;
	S = S.substr(PrefixSize);

	SmallVector<std::string, 1> SubPats;
	if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
	return std::move(Err);
	for (StringRef SubPat : SubPats) {
	auto SubGlobOrErr = SubGlobPattern::create(SubPat);
	if (!SubGlobOrErr)
	return SubGlobOrErr.takeError();
	Pat.SubGlobs.push_back(*SubGlobOrErr);
	}

	return Pat;
	}

	Expected<GlobPattern::SubGlobPattern>
	GlobPattern::SubGlobPattern::create(StringRef S) {
	SubGlobPattern Pat;

	// Parse brackets.
	Pat.Pat.assign(S.begin(), S.end());
	for (size_t I = 0, E = S.size(); I != E; ++I) {
	if (S[I] == '[') {
	// ']' is allowed as the first character of a character class. '[]' is
	// invalid. So, just skip the first character.
	++I;
	size_t J = S.find(']', I + 1);
	if (J == StringRef::npos)
	return make_error<StringError>("invalid glob pattern, unmatched '['",
	errc::invalid_argument);
	StringRef Chars = S.substr(I, J - I);
	bool Invert = S[I] == '^' \|\| S[I] == '!';
	Expected<BitVector> BV =
	Invert ? expand(Chars.substr(1), S) : expand(Chars, S);
	if (!BV)
	return BV.takeError();
	if (Invert)
	BV->flip();
	Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
	I = J;
	} else if (S[I] == '\\') {
	if (++I == E)
	return make_error<StringError>("invalid glob pattern, stray '\\'",
	errc::invalid_argument);
	}
	}
	return Pat;
	}

	bool GlobPattern::match(StringRef S) const {
	if (!S.consume_front(Prefix))
	return false;
	if (SubGlobs.empty() && S.empty())
	return true;
	for (auto &Glob : SubGlobs)
	if (Glob.match(S))
	return true;
	return false;
	}

	// Factor the pattern into segments split by '*'. The segment is matched
	// sequentianlly by finding the first occurrence past the end of the previous
	// match.
	bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
	const char P = Pat.data(), SegmentBegin = nullptr, *S = Str.data(),
	*SavedS = S;
	const char const PEnd = P + Pat.size(), const End = S + Str.size();
	size_t B = 0, SavedB = 0;
	while (S != End) {
	if (P == PEnd)
	;
	else if (P == '') {
	// The non-* substring on the left of '*' matches the tail of S. Save the
	// positions to be used by backtracking if we see a mismatch later.
	SegmentBegin = ++P;
	SavedS = S;
	SavedB = B;
	continue;
	} else if (*P == '[') {
	if (Brackets[B].Bytes[uint8_t(*S)]) {
	P = Pat.data() + Brackets[B++].NextOffset;
	++S;
	continue;
	}
	} else if (*P == '\\') {
	if (++P == S) {
	++P;
	++S;
	continue;
	}
	} else if (P == S \|\| *P == '?') {
	++P;
	++S;
	continue;
	}
	if (!SegmentBegin)
	return false;
	// We have seen a '*'. Backtrack to the saved positions. Shift the S
	// position to probe the next starting position in the segment.
	P = SegmentBegin;
	S = ++SavedS;
	B = SavedB;
	}
	// All bytes in Str have been matched. Return true if the rest part of Pat is
	// empty or contains only '*'.
	return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
	}