blob: 7d0d1cf0c58b95f89e3574312efbe02b8aa32954 [file] [log] [blame]
//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines nodes for use within a SuffixTree.
//
// Each node has either no children or at least two children, with the root
// being a exception in the empty tree.
//
// Children are represented as a map between unsigned integers and nodes. If
// a node N has a child M on unsigned integer k, then the mapping represented
// by N is a proper prefix of the mapping represented by M. Note that this,
// although similar to a trie is somewhat different: each node stores a full
// substring of the full mapping rather than a single character state.
//
// Each internal node contains a pointer to the internal node representing
// the same string, but with the first character chopped off. This is stored
// in \p Link. Each leaf node stores the start index of its respective
// suffix in \p SuffixIdx.
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
#include "llvm/ADT/DenseMap.h"
namespace llvm {
/// A node in a suffix tree which represents a substring or suffix.
struct SuffixTreeNode {
public:
/// Represents an undefined index in the suffix tree.
static const unsigned EmptyIdx = -1;
enum class NodeKind { ST_Leaf, ST_Internal };
private:
const NodeKind Kind;
/// The start index of this node's substring in the main string.
unsigned StartIdx = EmptyIdx;
/// The length of the string formed by concatenating the edge labels from
/// the root to this node.
unsigned ConcatLen = 0;
public:
// LLVM RTTI boilerplate.
NodeKind getKind() const { return Kind; }
/// \return the start index of this node's substring in the entire string.
unsigned getStartIdx() const;
/// \returns the end index of this node.
virtual unsigned getEndIdx() const = 0;
/// Advance this node's StartIdx by \p Inc.
void incrementStartIdx(unsigned Inc);
/// Set the length of the string from the root to this node to \p Len.
void setConcatLen(unsigned Len);
/// \returns the length of the string from the root to this node.
unsigned getConcatLen() const;
SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
: Kind(Kind), StartIdx(StartIdx) {}
virtual ~SuffixTreeNode() = default;
};
// A node with two or more children, or the root.
struct SuffixTreeInternalNode : SuffixTreeNode {
private:
/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned EndIdx = EmptyIdx;
/// A pointer to the internal node representing the same sequence with the
/// first character chopped off.
///
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that
/// Ukkonen's algorithm does to achieve linear-time construction is
/// keep track of which node the next insert should be at. This makes each
/// insert O(1), and there are a total of O(N) inserts. The suffix link
/// helps with inserting children of internal nodes.
///
/// Say we add a child to an internal node with associated mapping S. The
/// next insertion must be at the node representing S - its first character.
/// This is given by the way that we iteratively build the tree in Ukkonen's
/// algorithm. The main idea is to look at the suffixes of each prefix in the
/// string, starting with the longest suffix of the prefix, and ending with
/// the shortest. Therefore, if we keep pointers between such nodes, we can
/// move to the next insertion point in O(1) time. If we don't, then we'd
/// have to query from the root, which takes O(N) time. This would make the
/// construction algorithm O(N^2) rather than O(N).
SuffixTreeInternalNode *Link = nullptr;
public:
// LLVM RTTI boilerplate.
static bool classof(const SuffixTreeNode *N) {
return N->getKind() == NodeKind::ST_Internal;
}
/// \returns true if this node is the root of its owning \p SuffixTree.
bool isRoot() const;
/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override;
/// Sets \p Link to \p L. Assumes \p L is not null.
void setLink(SuffixTreeInternalNode *L);
/// \returns the pointer to the Link node.
SuffixTreeInternalNode *getLink() const;
/// The children of this node.
///
/// A child existing on an unsigned integer implies that from the mapping
/// represented by the current node, there is a way to reach another
/// mapping by tacking that character on the end of the current string.
DenseMap<unsigned, SuffixTreeNode *> Children;
SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
SuffixTreeInternalNode *Link)
: SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
Link(Link) {}
virtual ~SuffixTreeInternalNode() = default;
};
// A node representing a suffix.
struct SuffixTreeLeafNode : SuffixTreeNode {
private:
/// The start index of the suffix represented by this leaf.
unsigned SuffixIdx = EmptyIdx;
/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned *EndIdx = nullptr;
public:
// LLVM RTTI boilerplate.
static bool classof(const SuffixTreeNode *N) {
return N->getKind() == NodeKind::ST_Leaf;
}
/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override;
/// \returns the start index of the suffix represented by this leaf.
unsigned getSuffixIdx() const;
/// Sets the start index of the suffix represented by this leaf to \p Idx.
void setSuffixIdx(unsigned Idx);
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
: SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
virtual ~SuffixTreeLeafNode() = default;
};
} // namespace llvm
#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H