llvm / llvm-project / 0a302f66673720a0d3fd3f0ce32ec9cfda428cf1 / . / llvm / include / llvm / Support / SuffixTree.h

//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// | |

// | |

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |

// See https://llvm.org/LICENSE.txt for license information. | |

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |

// | |

//===----------------------------------------------------------------------===// | |

// | |

// This file defines the Suffix Tree class and Suffix Tree Node struct. | |

// | |

//===----------------------------------------------------------------------===// | |

#ifndef LLVM_SUPPORT_SUFFIXTREE_H | |

#define LLVM_SUPPORT_SUFFIXTREE_H | |

#include "llvm/ADT/ArrayRef.h" | |

#include "llvm/ADT/DenseMap.h" | |

#include "llvm/Support/Allocator.h" | |

#include <vector> | |

namespace llvm { | |

/// Represents an undefined index in the suffix tree. | |

const unsigned EmptyIdx = -1; | |

/// A node in a suffix tree which represents a substring or suffix. | |

/// | |

/// Each node has either no children or at least two children, with the root | |

/// being a exception in the empty tree. | |

/// | |

/// Children are represented as a map between unsigned integers and nodes. If | |

/// a node N has a child M on unsigned integer k, then the mapping represented | |

/// by N is a proper prefix of the mapping represented by M. Note that this, | |

/// although similar to a trie is somewhat different: each node stores a full | |

/// substring of the full mapping rather than a single character state. | |

/// | |

/// Each internal node contains a pointer to the internal node representing | |

/// the same string, but with the first character chopped off. This is stored | |

/// in \p Link. Each leaf node stores the start index of its respective | |

/// suffix in \p SuffixIdx. | |

struct SuffixTreeNode { | |

/// The children of this node. | |

/// | |

/// A child existing on an unsigned integer implies that from the mapping | |

/// represented by the current node, there is a way to reach another | |

/// mapping by tacking that character on the end of the current string. | |

llvm::DenseMap<unsigned, SuffixTreeNode *> Children; | |

/// The start index of this node's substring in the main string. | |

unsigned StartIdx = EmptyIdx; | |

/// The end index of this node's substring in the main string. | |

/// | |

/// Every leaf node must have its \p EndIdx incremented at the end of every | |

/// step in the construction algorithm. To avoid having to update O(N) | |

/// nodes individually at the end of every step, the end index is stored | |

/// as a pointer. | |

unsigned *EndIdx = nullptr; | |

/// For leaves, the start index of the suffix represented by this node. | |

/// | |

/// For all other nodes, this is ignored. | |

unsigned SuffixIdx = EmptyIdx; | |

/// For internal nodes, a pointer to the internal node representing | |

/// the same sequence with the first character chopped off. | |

/// | |

/// This acts as a shortcut in Ukkonen's algorithm. One of the things that | |

/// Ukkonen's algorithm does to achieve linear-time construction is | |

/// keep track of which node the next insert should be at. This makes each | |

/// insert O(1), and there are a total of O(N) inserts. The suffix link | |

/// helps with inserting children of internal nodes. | |

/// | |

/// Say we add a child to an internal node with associated mapping S. The | |

/// next insertion must be at the node representing S - its first character. | |

/// This is given by the way that we iteratively build the tree in Ukkonen's | |

/// algorithm. The main idea is to look at the suffixes of each prefix in the | |

/// string, starting with the longest suffix of the prefix, and ending with | |

/// the shortest. Therefore, if we keep pointers between such nodes, we can | |

/// move to the next insertion point in O(1) time. If we don't, then we'd | |

/// have to query from the root, which takes O(N) time. This would make the | |

/// construction algorithm O(N^2) rather than O(N). | |

SuffixTreeNode *Link = nullptr; | |

/// The length of the string formed by concatenating the edge labels from the | |

/// root to this node. | |

unsigned ConcatLen = 0; | |

/// Returns true if this node is a leaf. | |

bool isLeaf() const { return SuffixIdx != EmptyIdx; } | |

/// Returns true if this node is the root of its owning \p SuffixTree. | |

bool isRoot() const { return StartIdx == EmptyIdx; } | |

/// Return the number of elements in the substring associated with this node. | |

size_t size() const { | |

// Is it the root? If so, it's the empty string so return 0. | |

if (isRoot()) | |

return 0; | |

assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); | |

// Size = the number of elements in the string. | |

// For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. | |

return *EndIdx - StartIdx + 1; | |

} | |

SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link) | |

: StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {} | |

SuffixTreeNode() {} | |

}; | |

/// A data structure for fast substring queries. | |

/// | |

/// Suffix trees represent the suffixes of their input strings in their leaves. | |

/// A suffix tree is a type of compressed trie structure where each node | |

/// represents an entire substring rather than a single character. Each leaf | |

/// of the tree is a suffix. | |

/// | |

/// A suffix tree can be seen as a type of state machine where each state is a | |

/// substring of the full string. The tree is structured so that, for a string | |

/// of length N, there are exactly N leaves in the tree. This structure allows | |

/// us to quickly find repeated substrings of the input string. | |

/// | |

/// In this implementation, a "string" is a vector of unsigned integers. | |

/// These integers may result from hashing some data type. A suffix tree can | |

/// contain 1 or many strings, which can then be queried as one large string. | |

/// | |

/// The suffix tree is implemented using Ukkonen's algorithm for linear-time | |

/// suffix tree construction. Ukkonen's algorithm is explained in more detail | |

/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The | |

/// paper is available at | |

/// | |

/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf | |

class SuffixTree { | |

public: | |

/// Each element is an integer representing an instruction in the module. | |

llvm::ArrayRef<unsigned> Str; | |

/// A repeated substring in the tree. | |

struct RepeatedSubstring { | |

/// The length of the string. | |

unsigned Length; | |

/// The start indices of each occurrence. | |

std::vector<unsigned> StartIndices; | |

}; | |

private: | |

/// Maintains each node in the tree. | |

llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; | |

/// The root of the suffix tree. | |

/// | |

/// The root represents the empty string. It is maintained by the | |

/// \p NodeAllocator like every other node in the tree. | |

SuffixTreeNode *Root = nullptr; | |

/// Maintains the end indices of the internal nodes in the tree. | |

/// | |

/// Each internal node is guaranteed to never have its end index change | |

/// during the construction algorithm; however, leaves must be updated at | |

/// every step. Therefore, we need to store leaf end indices by reference | |

/// to avoid updating O(N) leaves at every step of construction. Thus, | |

/// every internal node must be allocated its own end index. | |

llvm::BumpPtrAllocator InternalEndIdxAllocator; | |

/// The end index of each leaf in the tree. | |

unsigned LeafEndIdx = -1; | |

/// Helper struct which keeps track of the next insertion point in | |

/// Ukkonen's algorithm. | |

struct ActiveState { | |

/// The next node to insert at. | |

SuffixTreeNode *Node = nullptr; | |

/// The index of the first character in the substring currently being added. | |

unsigned Idx = EmptyIdx; | |

/// The length of the substring we have to add at the current step. | |

unsigned Len = 0; | |

}; | |

/// The point the next insertion will take place at in the | |

/// construction algorithm. | |

ActiveState Active; | |

/// Allocate a leaf node and add it to the tree. | |

/// | |

/// \param Parent The parent of this node. | |

/// \param StartIdx The start index of this node's associated string. | |

/// \param Edge The label on the edge leaving \p Parent to this node. | |

/// | |

/// \returns A pointer to the allocated leaf node. | |

SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, | |

unsigned Edge); | |

/// Allocate an internal node and add it to the tree. | |

/// | |

/// \param Parent The parent of this node. Only null when allocating the root. | |

/// \param StartIdx The start index of this node's associated string. | |

/// \param EndIdx The end index of this node's associated string. | |

/// \param Edge The label on the edge leaving \p Parent to this node. | |

/// | |

/// \returns A pointer to the allocated internal node. | |

SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, | |

unsigned EndIdx, unsigned Edge); | |

/// Set the suffix indices of the leaves to the start indices of their | |

/// respective suffixes. | |

void setSuffixIndices(); | |

/// Construct the suffix tree for the prefix of the input ending at | |

/// \p EndIdx. | |

/// | |

/// Used to construct the full suffix tree iteratively. At the end of each | |

/// step, the constructed suffix tree is either a valid suffix tree, or a | |

/// suffix tree with implicit suffixes. At the end of the final step, the | |

/// suffix tree is a valid tree. | |

/// | |

/// \param EndIdx The end index of the current prefix in the main string. | |

/// \param SuffixesToAdd The number of suffixes that must be added | |

/// to complete the suffix tree at the current phase. | |

/// | |

/// \returns The number of suffixes that have not been added at the end of | |

/// this step. | |

unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); | |

public: | |

/// Construct a suffix tree from a sequence of unsigned integers. | |

/// | |

/// \param Str The string to construct the suffix tree for. | |

SuffixTree(const std::vector<unsigned> &Str); | |

/// Iterator for finding all repeated substrings in the suffix tree. | |

struct RepeatedSubstringIterator { | |

private: | |

/// The current node we're visiting. | |

SuffixTreeNode *N = nullptr; | |

/// The repeated substring associated with this node. | |

RepeatedSubstring RS; | |

/// The nodes left to visit. | |

std::vector<SuffixTreeNode *> ToVisit; | |

/// The minimum length of a repeated substring to find. | |

/// Since we're outlining, we want at least two instructions in the range. | |

/// FIXME: This may not be true for targets like X86 which support many | |

/// instruction lengths. | |

const unsigned MinLength = 2; | |

/// Move the iterator to the next repeated substring. | |

void advance() { | |

// Clear the current state. If we're at the end of the range, then this | |

// is the state we want to be in. | |

RS = RepeatedSubstring(); | |

N = nullptr; | |

// Each leaf node represents a repeat of a string. | |

std::vector<SuffixTreeNode *> LeafChildren; | |

// Continue visiting nodes until we find one which repeats more than once. | |

while (!ToVisit.empty()) { | |

SuffixTreeNode *Curr = ToVisit.back(); | |

ToVisit.pop_back(); | |

LeafChildren.clear(); | |

// Keep track of the length of the string associated with the node. If | |

// it's too short, we'll quit. | |

unsigned Length = Curr->ConcatLen; | |

// Iterate over each child, saving internal nodes for visiting, and | |

// leaf nodes in LeafChildren. Internal nodes represent individual | |

// strings, which may repeat. | |

for (auto &ChildPair : Curr->Children) { | |

// Save all of this node's children for processing. | |

if (!ChildPair.second->isLeaf()) | |

ToVisit.push_back(ChildPair.second); | |

// It's not an internal node, so it must be a leaf. If we have a | |

// long enough string, then save the leaf children. | |

else if (Length >= MinLength) | |

LeafChildren.push_back(ChildPair.second); | |

} | |

// The root never represents a repeated substring. If we're looking at | |

// that, then skip it. | |

if (Curr->isRoot()) | |

continue; | |

// Do we have any repeated substrings? | |

if (LeafChildren.size() >= 2) { | |

// Yes. Update the state to reflect this, and then bail out. | |

N = Curr; | |

RS.Length = Length; | |

for (SuffixTreeNode *Leaf : LeafChildren) | |

RS.StartIndices.push_back(Leaf->SuffixIdx); | |

break; | |

} | |

} | |

// At this point, either NewRS is an empty RepeatedSubstring, or it was | |

// set in the above loop. Similarly, N is either nullptr, or the node | |

// associated with NewRS. | |

} | |

public: | |

/// Return the current repeated substring. | |

RepeatedSubstring &operator*() { return RS; } | |

RepeatedSubstringIterator &operator++() { | |

advance(); | |

return *this; | |

} | |

RepeatedSubstringIterator operator++(int I) { | |

RepeatedSubstringIterator It(*this); | |

advance(); | |

return It; | |

} | |

bool operator==(const RepeatedSubstringIterator &Other) const { | |

return N == Other.N; | |

} | |

bool operator!=(const RepeatedSubstringIterator &Other) const { | |

return !(*this == Other); | |

} | |

RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) { | |

// Do we have a non-null node? | |

if (N) { | |

// Yes. At the first step, we need to visit all of N's children. | |

// Note: This means that we visit N last. | |

ToVisit.push_back(N); | |

advance(); | |

} | |

} | |

}; | |

typedef RepeatedSubstringIterator iterator; | |

iterator begin() { return iterator(Root); } | |

iterator end() { return iterator(nullptr); } | |

}; | |

} // namespace llvm | |

#endif // LLVM_SUPPORT_SUFFIXTREE_H |