Saturday 14 November 2015

Trie - Live Stream Words Detector

1. Problem Description
"
".

2. Algorithm
The idea is to use Trie structure to build up the dictionary. Feed the live stream into the dictionary (assuming a new character is coming at a time). Go down deep into the tree (Trie data structure) one more level given the newly coming character. Keep a list of valid tree paths and go through the list when a new character is coming.
    - Generate signal to call the API, if a new word is found with the new character.
      Keep this tree path because following character(s) can form new words.
    - Keep this tree path in the list, if does not find a new word but the tree path is valid.
      Because it is one of candidates to form a new word with the following stream.
    - Remove this tree path from the list, if the tree path is invalid.
    - Search the new character in the root of dictionary
        - Call the API and add the tree path into the list, if this single character is a word.
        - Add the tree path into the list, if the tree path is valid.

3. C++ Implementation
This implementation is based on my previous blog entry about Trie structure, Data Structure and Algorithm - Trie.
// ********************************************************************************
// Implementation
// ********************************************************************************
// Header 
#pragma once

#include <list>
#include <string>
#include <vector>

struct DictionaryTrie;

class LiveStreamWordsDetector
{
public:
    LiveStreamWordsDetector();
    LiveStreamWordsDetector(std::vector<std::string> const &words);
    ~LiveStreamWordsDetector();

    void AddWordIntoDict(std::string const& word);
    void AddWordsIntoDict(std::vector<std::string> const &words);

    void AppendLiveStream(char const ch,
                          std::vector<std::string> &liveWords);

private:
    DictionaryTrie* mDictRoot;
    std::list<DictionaryTrie*> mPossibleWords;
};

// CPP
#include "LiveStreamWordsDetector.h"

#include "DictionaryTrie.h"
#include "DictionaryTrie_internal.h"

LiveStreamWordsDetector::LiveStreamWordsDetector()
: mDictRoot(new DictionaryTrie())
{
}

LiveStreamWordsDetector::LiveStreamWordsDetector(std::vector<std::string> const &words)
: mDictRoot(new DictionaryTrie())
{
    AddWordsIntoDict(words);
}

LiveStreamWordsDetector::~LiveStreamWordsDetector()
{
    DestoryDictionaryTrie(mDictRoot);
    mDictRoot = nullptr;
}

void LiveStreamWordsDetector::AddWordIntoDict(std::string const &word)
{
    AddWord(mDictRoot, word.c_str());
}

void LiveStreamWordsDetector::AddWordsIntoDict(std::vector<std::string> const &words)
{
    auto iterEnd = words.end();
    for (auto iter = words.begin(); iter != iterEnd; ++iter) {
        AddWordIntoDict(*iter);
    }
}

void LiveStreamWordsDetector::AppendLiveStream(char const ch, std::vector<std::string> &words)
{
    words.clear();
    DictionaryTrie* temp = nullptr;
    //  check the previous valid words
    for (auto iter = mPossibleWords.begin(); iter != mPossibleWords.end();) {
        temp = LocateChar(*iter, ch);
        if (temp){
            // replace with the new pointer after appending the new character
            *iter = temp;
            if (temp->str) {
                // a word found
                words.push_back(temp->str);
            }
            ++iter;
        }
        else {
            // remove from the list if the appending char makes the word invalid
            iter = mPossibleWords.erase(iter);
        }
    }
    // add the new starting character into list if exists
    temp = LocateChar(mDictRoot, ch);
    if (temp) {
        mPossibleWords.push_front(temp);
        if (temp->str) {
            // a word found - single character word
            words.push_back(&ch);
        }
    }
}

// ********************************************************************************
// Test
// ********************************************************************************
void TestLiveStream()
{
    const std::vector<std::string> words =
        {"ok", "test", "one", "try", "trying"};
    LiveStreamWordsDetector lsd(words);

    const std::string input("abcokdeftrying");
    std::vector<std::string> output;

    lsd.AppendLiveStream('a', output);
    assert(output.empty());
    lsd.AppendLiveStream('b', output);
    assert(output.empty());
    lsd.AppendLiveStream('c', output);
    assert(output.empty());
    lsd.AppendLiveStream('o', output);
    assert(output.empty());
    lsd.AppendLiveStream('k', output);
    assert(output.size() == 1);
    assert(output[0] == "ok");

    lsd.AppendLiveStream('d', output);
    assert(output.empty());
    lsd.AppendLiveStream('e', output);
    assert(output.empty());
    lsd.AppendLiveStream('f', output);
    assert(output.empty());
    lsd.AppendLiveStream('t', output);
    assert(output.empty());
    lsd.AppendLiveStream('r', output);
    assert(output.empty());
    lsd.AppendLiveStream('y', output);
    assert(output.size() == 1);
    assert(output[0] == "try");
    lsd.AppendLiveStream('i', output);
    assert(output.empty());
    lsd.AppendLiveStream('n', output);
    assert(output.empty());
    lsd.AppendLiveStream('g', output);
    assert(output.size() == 1);
    assert(output[0] == "trying");
}

No comments:

Post a Comment