Counting Distinct Substrings In A Given String Using Trie

To count all substrings in a given string, we do the following :

  • Construct a Trie with all the substrings that are the suffixes of a given string.
  • Keep a count of nodes that are being created in the Trie while inserting the substrings (suffixes).

The number of nodes in the Trie is the same as the number of unique substrings in a given string.

Example: Consider the string [ ababa ].
The number of unique substrings for string [ ababa ] : [ a ], [ ab ], [ aba ], [ abab ], [ ababa ], [ b ], [ ba ], [ bab ] and [ baba ]. Thus we have 9 unqiue substrings.

We insert the following substrings in the Trie : [ ababc ], [ baba ], [ aba ], [ ba ] & [ a ]. The nodes marked in yellow indicate the end of the string.
We see that the number of nodes (excluding root) are 9 i.e same as the number of unique substrings.

All_Substrings



Program for counting the number of distinct substrings in a given string

class TrieNode:
    """A trie node"""

    def __init__(self, char):

        # Character stored in this node
        self.char = char

        # A flag that marks if the word ends on this particular node.
        self.end_of_word = False

        # A dictionary of child nodes where the keys are the characters (letters) 
        # and values are the nodes
        self.children = {}

class Trie:
    """The trie structure"""

    def __init__(self):
        """
        The root node of a trie is empty and does not store any character.
        """
        self.root = TrieNode("")
        self.node_count = 0

    def Insert (self, string):
        """Insert a string i.e a word into the trie"""
        node = self.root

        # Check each character in the string 
        # If none of the children of the current node contains the character, 
        # create a new child of the current node for storing the character.
        for char in string:
            if char in node.children:
                node = node.children[char]
            else:
                # As the character is not found, create a new trie node
                new_node = TrieNode(char)
                node.children[char] = new_node
                node = new_node
                self.node_count += 1

        # Mark the end of a word
        node.end_of_word = True

def main():

    t = Trie()

    string = input("Enter string : ")
    # For a given string abc
    # Insert abc
    # Insert bc
    # Insert c
    for beg in range (0, len(string)):
        t.Insert (string[beg:])

    print ("Unique substring count : " + str(t.node_count))

if __name__ == "__main__" :
    main()

Output

Enter string : abc
Unique substring count : 6

Enter string : goalgo
Unique substring count : 18

Enter string : a
Unique substring count : 1

Enter string : xyzzy
Unique substring count : 13
#include<iostream>
using namespace std;

class TrieNode {

    public:

    TrieNode * children[129];

    TrieNode() {
        for (int i=0; i<=128; i++) {
            children[i] = NULL;
        }
    }
};

class Trie {

    public:

    TrieNode root;
    int node_count;

    Trie () {
        // Track the number of nodes in the trie.
        node_count = 0;
    }

    // Insert a string into the trie.
    void Insert(string str) {

        TrieNode * parent = &root;

        for (size_t i=0; i<str.size(); i++) {
            int N = str.at(i);
            if (parent->children[N] == NULL) {
                parent->children[N] = new TrieNode;
                node_count++;
            }
            parent = parent->children[N];
        }
    }
};

int main() {

    int tests;
    cout << "Enter number of strings : ";
    cin >> tests;

    while (tests--) {

        Trie t;
        string str;
        cin >> str;
        /* Insert substrings in the trie like below
        Example for string abc, 
        Insert : abc
        Insert : bc
        Insert : c
        */
        for (size_t i=0; i<str.size(); i++) {
            t.Insert(str.substr(i, str.size()-i));
        }
        cout << "Unique substring count : " << t.node_count << endl;
    }
    return 0;
}

Output

Enter number of strings : 5

abc
Unique substring count : 6

ab
Unique substring count : 3

a
Unique substring count : 1

ababa
Unique substring count : 9

goalgo
Unique substring count : 18
import java.util.Scanner;
import java.util.HashMap;

class TrieNode {

    public char letter;
    public boolean end_of_word; // A flag that marks if the word ends on this particular node.
    public HashMap <Character, TrieNode> children;

    TrieNode (char c) {
        // Character stored in this node
        letter = c;
        end_of_word = false;

        // A dictionary of child nodes where the keys are the characters (letters) 
        // and values are the nodes
        children = new HashMap<Character, TrieNode>();
    }

};

class Trie {

    TrieNode root;
    public int node_count;

    Trie () {
        // The root node of a trie is empty and does not store any character.
        root = new TrieNode('0');
        node_count = 0;
    }

    // Insert a string i.e a word into the trie
    public void Insert (String word) {

        TrieNode node = root;

        // Check each character in the string 
        // If none of the children of the current node contains the character, 
        // create a new child of the current node for storing the character.
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (node.children.containsKey(c)) {
                node = node.children.get(c);
            } else {
                // As the character is not found, create a new trie node
                TrieNode new_node = new TrieNode(c);
                node.children.put(c, new_node);
                node = new_node;
                node_count++;
            }
        }
        // Mark the end of a word
        node.end_of_word = true;
    }

    public static void main (String args[]) {

        Trie t = new Trie();

        // Search for a prefix in the Trie
        Scanner scanner_obj = new Scanner(System.in); // Create a Scanner object
        System.out.print("Enter string: ");

        String str = scanner_obj.nextLine();

        /* For a given string abc
         # Insert abc
         # Insert bc
         # Insert c */
         
        for (int beg = 0; beg < str.length(); beg++) {
            int end = str.length();
            t.Insert (str.substring(beg, end));
        }
        System.out.println("Unique substring count : " +  t.node_count);
    }
}

Output

Enter string: abc 
Unique substring count : 6

Enter string: goalgo
Unique substring count : 18

Enter string: ababa
Unique substring count : 9

Enter string: a
Unique substring count : 1



Copyright (c) 2019-2024, Algotree.org.
All rights reserved.