Given dictionary words, find words matching the given pattern
Problem
Input was given as
List dict = new List();
List
that contains a list of 10000 words. Find all the words that match the given pattern without regular expressions.
Ex: You are given a pattern like (h*t). The words matching it would be (hot, hit, hat, etc.)
This problem is similar to "Finding the words by ranking" in that it utilize Trie data structure to store the words. Only difference from "Finding the words by ranking" case, is that the Trie we implement for this question does not keep the list of words by ranking.
Instead, it keeps either empty string in case of the current node is not leaf node or the word. Figure 1. illustrate the structure of the Trie.
Figure 1. Trie example |
After building the Trie, we can perform the DFS to find the word matching the given pattern.
If pattern contains "*", we search all the children of the node. We can pass the std::vector
Running time of this algorithm will be as blow:
- Building Trie from the N words: O(K×N), where K is the longest word in the input list.
- Searching Trie for the given pattern, O(K×MT), where K is the length of the pattern, T is the number of "*" in the pattern and M is the number of children in each node.
This is the worst case running time when input pattern is "∗∗∗∗"
With one "*", in the pattern with the length K, the running time will be O(K×M).
Here is the complete code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<list> | |
#include<iostream> | |
#include<string> | |
#include<vector> | |
#include<stdlib.h> | |
using namespace std; | |
struct node { | |
string value; | |
node* children[256]; | |
node(){ | |
memset(children, 0, 256*sizeof(struct node *)); | |
} | |
}; | |
class Trie { | |
public: | |
Trie() | |
{ | |
root = new node(); | |
} | |
void add(string word) | |
{ | |
add_word(root, word, 0); | |
} | |
vector<string> find(string pattern) | |
{ | |
vector<string> found; | |
if(root) | |
{ | |
search(pattern, "", 0, root, found); | |
} | |
return found; | |
} | |
private: | |
void add_word(node* cur, string word, int pos) | |
{ | |
if (pos == word.length()) | |
{ | |
cur->value = word; | |
return; | |
} | |
char next = word[pos]; | |
if (!cur->children[next]) | |
{ | |
cur->children[next] = new node(); | |
} | |
add_word(cur->children[next], word, pos+1); | |
} | |
void search(string pattern, string prev, int pos, node* cur, vector<string>& found) | |
{ | |
if (pos == pattern.length()) | |
{ | |
if (cur->value.length() > 0) | |
found.push_back(prev); | |
return; | |
} | |
char next = pattern[pos]; | |
if (next == '*') | |
{ | |
for(int i = 0; i < 256; i++) | |
{ | |
if (cur->children[i]) | |
{ | |
prev.push_back(i); | |
search(pattern, prev, pos+1, cur->children[i], found); | |
prev.pop_back(); | |
} | |
} | |
} else { | |
if (cur->children[next]) | |
{ | |
prev.push_back(next); | |
search(pattern, prev, pos+1, cur->children[next],found); | |
} | |
} | |
} | |
node * root; | |
}; | |
int main() { | |
list<string> dic; | |
list<string>::iterator iter; | |
dic.push_back("hash"); | |
dic.push_back("hat"); | |
dic.push_back("hot"); | |
dic.push_back("max"); | |
dic.push_back("mat"); | |
dic.push_back("box"); | |
dic.push_back("fox"); | |
Trie dictionary; | |
for(iter = dic.begin(); iter != dic.end(); iter++) | |
{ | |
dictionary.add(*iter); | |
} | |
vector<string> result = dictionary.find("h*t"); | |
cout<<" result for h*t : "<<endl; | |
for (int i =0; i < result.size(); i++) | |
cout<<result[i]<<", "; | |
cout<<endl; | |
cout<<" result for ma* : "<<endl; | |
result = dictionary.find("ma*"); | |
for (int i =0; i < result.size(); i++) | |
cout<<result[i]<<", "; | |
cout<<endl; | |
cout<<" result for *ox : "<<endl; | |
result = dictionary.find("*ox"); | |
for (int i =0; i < result.size(); i++) | |
cout<<result[i]<<", "; | |
cout<<endl; | |
cout<<" result for ma : "<<endl; | |
result = dictionary.find("ma"); | |
for (int i =0; i < result.size(); i++) | |
cout<<result[i]<<", "; | |
cout<<endl; | |
return 1; | |
} |
Practice statistics:
25:49: to write up the solution without building Trie. (initial data structure had a flaw)
10:31: to write up the logic building the Trie.
10:00: to fix up the flaw, which returning the non-existing words. Made the Trie's leaf has the value and intermediate node does not have value.
UPDATE ( 2019-06-03):
Retried the same question again. Not sure why I used Trie. Building the Trie alone already takes O(N) and search the list will take O(N) any way depending on the number of '*' in a search pattern.
This means simple brute-force search can be legitimate solution, which takes O(KN), where K is the longest words in the dictionary and N is the number of words in the dictionary.
I wish there is a faster way like O(logn). I did not find the solution yet.
Here is a python solution implementing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
input = ['abc', 'hot', 'hit', 'hat', 'haa', 'etc'] | |
pattern = "h*t" | |
class RegFinder: | |
def find(self, L, pattern): | |
found = [] | |
for i in L: | |
if self.matched(i, pattern): | |
found.append(i) | |
return found | |
def matched(self, word, pattern): | |
if len(word) != len(pattern): | |
return False | |
for i, c in enumerate(word): | |
if pattern[i] == '*': | |
continue | |
elif (c != pattern[i]): | |
return False | |
return True | |
r = RegFinder() | |
print("input = {} result = {}".format(input, r.find(input, pattern))) | |
pattern2 = "h**" | |
print("input = {} result = {}".format(input, r.find(input, pattern2))) |
end
Comments
Post a Comment