Finding the shortest sequence containing the keywords (Minimum window subsequence problem)
Problem
Find the shortest string [] containing the keyword inside
Example,
Words: sky cloud google search sky work blue
Keywords: sky blue
Return: sky work blue
Explanation
First of all, this is the same question as the "Finding the minimum sub-string" question. It was modified by using words instead of characters to fool the interviewees.
In the first attempt, I fell for this trick and thought I add to use a different approach such as binary search. The second attempt was not far from the first approach but still did not figure out the fact that it was another variation of the minimum sub-string sequence problem.
In the third attempt (a few months after the second attempt), I tried to simplify the word into just numbers and found that it was, indeed, a minimum sub-string sequence. I took the greedy approach to solve this problem in O(N) time.
For a detailed explanation of the algorithm, you can refer to "Finding the minimum sub-string".
Key assumptions made in this solution are:
Here is the python solution. This code has lots of room for improvement if I had more time.
Practice statistics:
42:34 to write up the code without test cases (I should move faster next time) - yay!! I wrote the code in under 45 mins
06:04: to write up the test cases and refactor the code
03:41: debug the code. --> This should not happen next time. !!
Find the shortest string [] containing the keyword inside
Example,
Words: sky cloud google search sky work blue
Keywords: sky blue
Return: sky work blue
Explanation
First of all, this is the same question as the "Finding the minimum sub-string" question. It was modified by using words instead of characters to fool the interviewees.
In the first attempt, I fell for this trick and thought I add to use a different approach such as binary search. The second attempt was not far from the first approach but still did not figure out the fact that it was another variation of the minimum sub-string sequence problem.
In the third attempt (a few months after the second attempt), I tried to simplify the word into just numbers and found that it was, indeed, a minimum sub-string sequence. I took the greedy approach to solve this problem in O(N) time.
For a detailed explanation of the algorithm, you can refer to "Finding the minimum sub-string".
Key assumptions made in this solution are:
- keywords don't include the duplicate word
- the answer does not have to contain keywords in order.
UPDATE(2023-03-01): Solved the problem again and chose the wrong direction and spent 1 hr writing the wrong code (it worked but had bugs).
After understanding that it was a minimum span problem, rewrite the code and found that the original code had a bug. Fixed the bug.
Here is the new solution without a bug.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Find the shortest string [] containing the keyword inside | |
Example, | |
Words: sky cloud google search sky work blue | |
Keywords: sky blue | |
Return: sky work blue | |
input = "sky cloud google search sky work blue" | |
keyword = "cloud sky" | |
Return = "sky cloud" | |
''' | |
def find_shortest_substring(input, keywords): | |
input_words = input.split(' ') | |
targets = keywords.split(' ') | |
keyword_map = {} | |
for i, w in enumerate(targets): | |
keyword_map[w] = 0 | |
answer = None | |
spos = None | |
for i, word in enumerate(input_words): | |
if word in keyword_map: | |
if spos is None: | |
spos = i | |
keyword_map[word] += 1 | |
else: | |
continue | |
if is_found(keyword_map) == True: | |
epos = i | |
if answer is None: | |
answer = input_words[spos: epos + 1] | |
for p in range(spos, epos+1): | |
cur = input_words[p] | |
spos = p | |
if cur in keyword_map: | |
keyword_map[cur] -= 1 | |
if keyword_map[cur] <= 0: | |
if epos - spos +1 < len(answer): | |
answer = input_words[spos: epos + 1] | |
# move spos forward by one since we removed what spos was pointing | |
spos +=1 | |
break | |
return " ".join(answer) | |
def is_found(keyword_map): | |
count = 0 | |
for k in keyword_map: | |
if keyword_map[k] > 0: | |
count += 1 | |
print(keyword_map) | |
return count == len(keyword_map) | |
# test code | |
input = "sky cloud google search sky work blue" | |
keyword = "sky blue" | |
minimum_span = find_shortest_substring(input, keywords=keyword) | |
print(minimum_span) | |
assert minimum_span == 'sky work blue' | |
input = "sky cloud google search sky work blue" | |
keyword = "cloud sky" | |
minimum_span = find_shortest_substring(input, keywords=keyword) | |
print(minimum_span) | |
assert minimum_span == 'sky cloud' | |
input = "sky cloud google search sky work blue" | |
keyword = "sky cloud blue" | |
minimum_span = find_shortest_substring(input, keywords=keyword) | |
print(minimum_span) | |
assert minimum_span == 'cloud google search sky work blue' | |
input = "sky cloud google search sky work blue" | |
keyword = "google work" | |
minimum_span = find_shortest_substring(input, keywords=keyword) | |
print(minimum_span) | |
assert minimum_span == 'google search sky work' |
Here is the python solution. This code has lots of room for improvement if I had more time.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FindShortestString: | |
def find(self, words, keywords): | |
#create a hashmap to find answer | |
kmap = {} | |
for k in keywords: | |
kmap[k] = 0 | |
answer = [] | |
spos = epos = None | |
for i, w in enumerate(words): | |
# if w is keyword and first encountered | |
if w in kmap: | |
#update spos if it was not set | |
spos = i if kmap[w] == 0 and spos == None else spos | |
kmap[w] +=1 | |
else: | |
continue | |
#check if we found all the words | |
if self.is_found(kmap): | |
epos = i | |
answer = words[spos:epos+1] | |
#refine the answer if possible. move spos as much as possible. | |
for i in range(spos, epos + 1): | |
current = words[i] | |
spos = i | |
if current in kmap: | |
kmap[current] -= 1 | |
if kmap[current] <= 0: | |
#update date the answer | |
if epos - spos + 1 < len(answer): | |
answer = words[spos:epos+1] | |
#move the spos to next one and reset counter | |
spos = None | |
break | |
return answer | |
def is_found(self, map): | |
count = 0 | |
for k in map: | |
if map[k] > 0: | |
count +=1 | |
return count == len(map) | |
f = FindShortestString() | |
words = ["sky", "cloud", "blah", "test", "google", "bright" ,"search", "sky", "work", "bright", "test2","blue"] | |
keywords = ["sky", "bright" ,"blue"] | |
print ("\nwords = {}, keywords = {}, answer = {}".format(words, keywords, f.find(words, keywords))) | |
words = ["sky", "cloud", "google", "search", "sky", "work" ,"blue"] | |
keywords = ["sky","blue"] | |
print ("\nwords = {}, keywords = {}, answer = {}".format(words, keywords, f.find(words, keywords))) | |
words = ["sky", "cloud", "blah", "test", "google", "bright" ,"search", "bright", "bright", "sky", "test2","blue"] | |
keywords = ["sky", "bright" ,"blue"] | |
print ("words = {}, keywords = {}, answer = {}\n\n".format(words, keywords, f.find(words, keywords))) |
Practice statistics:
42:34 to write up the code without test cases (I should move faster next time) - yay!! I wrote the code in under 45 mins
06:04: to write up the test cases and refactor the code
03:41: debug the code. --> This should not happen next time. !!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
def find_closest_bigger_number(array, target, next_closest): | |
if(len(array) == 1): | |
return array[0] if (array[0] > target) and (next_closest - target) > (array[0] - target) else next_closest | |
half = len(array)/2 | |
if array[half] > target: | |
next_closest = array[half] if (next_closest - target) > (array[half] - target) else next_closest | |
return find_closest_bigger_number(array[:half], target, next_closest) | |
else: | |
return find_closest_bigger_number(array[half+1:], target, next_closest) | |
def find_shortest_sentence(keywords, words): | |
keywordHash ={} | |
answer =[] | |
shortest = sys.maxint | |
keyword_occurrence_map = [] | |
for pos, keyword in enumerate(keywords): | |
keywordHash[keyword] = pos | |
keyword_occurrence_map.append([]) | |
#go through the words and count the occurrence, O(N) | |
for p, word in enumerate(words): | |
matched_key_word_pos = keywordHash.get(word, -1) | |
if matched_key_word_pos == -1: | |
continue | |
keyword_occurrence_map[matched_key_word_pos].append(p) | |
for first_pos in keyword_occurrence_map[0]: | |
prev_pos = first_pos | |
last_pos = sys.maxint | |
for next_word_occurrence_map in keyword_occurrence_map[1:] : | |
next_word_pos = find_closest_bigger_number(next_word_occurrence_map, prev_pos, sys.maxint) | |
if (next_word_pos == sys.maxint): | |
last_pos = sys.maxint | |
break | |
last_pos = next_word_pos | |
if last_pos == sys.maxint: | |
continue | |
if (last_pos - first_pos < shortest): | |
shortest = last_pos - first_pos | |
answer= [first_pos, last_pos] | |
return answer | |
''' | |
2. keywords can be found in any order | |
''' | |
if __name__ == "__main__": | |
words = ["sky", "cloud", "blah", "test", "google", "search", "blue", "sky", "work", "bright", "test2","blue"] | |
keywords = ["sky", "bright" ,"blue"] | |
answer = find_shortest_sentence(keywords, words) | |
if (len(answer) > 0): | |
print " ".join(word for word in words[answer[0]: answer[1]+1]) | |
print "\n" | |
else: | |
print "no matching sentence found" | |
#this code will fail for this input. | |
words2 = ["sky", "cloud", "blah", "test", "google", "bright" ,"search", "sky", "work", "bright", "test2","blue"] | |
keywords2 = ["sky", "bright" ,"blue"] | |
answer = find_shortest_sentence(keywords2, words2) | |
if (len(answer) > 0): | |
print " ".join(word for word in words2[answer[0]: answer[1]+1]) | |
print "\n" | |
else: | |
print "no matching sentence found" | |
#original question | |
words3 = ["sky", "cloud", "google", "search", "sky", "work", "blue"] | |
keywords3 = ["sky", "blue"] | |
answer = find_shortest_sentence(keywords3, words3) | |
if (len(answer) > 0): | |
print " ".join(word for word in words3[answer[0]: answer[1]+1]) | |
print "\n" | |
else: | |
print "no matching sentence found" |
Comments
Post a Comment