How can I load a very large dictionary from JavaScript without freezing the DOM?

I wrote a spell check script that uses a long list of words. I have formatted this big wordlist as a JavaScript object and I am loading the list as a script. So when it is loaded, this very large object is parsed.

dictionary.js

var dictionary = {
   "apple" : "apple",
   "banana" : "banana",
   "orange" : "orange"
}

      

Formatted this way to validate words:

if (dictionary["apple"]){
    //word is valid
}

      

The problem is that this giant parsed object is causing the DOM to freeze significantly.

How can I load my data structure in such a way that I can parse it piece by piece? How can I store / load my data structure so that the DOM can handle it without hovering?

+3


source to share


2 answers


Write your JS file in the form

var words = JSON.parse('{"aardvark": "aardvark", ...}');

      

Parsing JSON will be orders of magnitude faster than a JS parser.

The actual search will be approximately 0.01ms according to my measurements.



Background

There are several aspects to consider when analyzing performance in this situation, including load throughput, parsing, preprocessing, or memory creation and retrieval if necessary. In this case, all other performance issues are overwhelmed by JS parsing time, which can be up to 10 seconds for a 120K input hash. When downloading from a server, the 2.5MB size can be an issue, but it will shrink to 20% of its original size. In terms of search performance, JS hashes are already optimized for fast retrieval; the actual seek time can be less than 0.01 ms, especially for subsequent access to the same key. As far as memory is concerned, there seems to be a couple of ways to optimize this, but most browsers can hold an object of this size without breaking a sweat.

Patricia trie's approach is interesting and mainly deals with memory and bandwidth utilization issues, but in this case they are not the main areas of concern.

+6


source


If you have a lot of words in your list, you can reduce the size of the dictionary using patricia trie . Patricia trie is a special kind of tree that is optimized for finding strings. Basically, each node in a trie is one letter. For example:

var dictionary = {
    'a': {
        '': 1, // a
        'a': {
            '': 1, // aa
            'l': {
                '': 1, // aal
                'ii': 1 // aalii
            },
            'm': 1, // aam
            'ni': 1, // aani
            'r': {
                'd': {
                    'vark': 1, // aardvark
                    'wolf': 1 // aardwolf
                },
                'on': {
                    '': 1, // aaron
                    'ic': 1 // aaronic
                }
            }
        },
    },
}

      

In the above example, I am using an empty string '' (actually a valid property ID!) To denote the end of a word.

You can implement search here:

function isWord_internal(str, dic) {
    var strlen, i, substr, substrlen;
    strlen = str.length;
    for(i = strlen; i > 0; i--) {
        substr = str.slice(0, i);
        substrlen = substr.length;
        if(dic[substr]) {
            if(dic[substr] === 1) {
                if(substrlen === strlen) {
                    return true;
                }
                return false; // end of the line, folks
            }
            if(dic[substr][''] && substrlen === strlen) {
                return true;
            }
            return isWord_internal(str.slice(i), dic[substr]);
        } // else keep going
    }
    // not found
    return false;
}

function isWord(str) {
    return isWord_internal(str, dictionary); // assumes that the dictionary variable exists already
}

      

Since objects are O (log n) hash tables and the algorithm is linear with word size, your performance should be O (n log n).

This should also reduce the size of your list, since English words are not random and often have substrings in common (you probably want to minify it).

To convert the current dictionary, you can use this function to create it in memory:

function patriciafy_internal(str, pat) {
    var i, substr, patkeys, patkeyslen, j, patkey, pos, portion;
    for(i = str.length; i > 0; i--) {
        substr = str.slice(0, i);
        patkeys = Object.keys(pat);
        patkeyslen = patkeys.length;
        for(j = 0; j < patkeyslen; j++) {
            patkey = patkeys[j];
            pos = patkey.indexOf(substr);
            if(pos !== -1) {
                if(pat[patkey] !== 1) {
                    // keep going down the rabbit hole
                    patriciafy_internal(str.slice(i), pat[patkey]);
                } else {
                    // split this node and store the new key
                    portion = patkey.slice(0, pos + 1);
                    delete pat[patkey];
                    pat[portion] = {'': 1};
                    pat[portion][str.slice(1)] = 1;
                }
                return;
            }
        }
    }
    // no substring found - store the whole thing at this level
    pat[str] = 1;
}

function patriciafy(dic) {
    var pat, keys, keyslen, str, i;
    pat = {};
    keys = Object.keys(dic);
    keyslen = keys.length;
    for(i = 0; i < keyslen; i++) {
        // insert this str into the trie
        str = keys[i];
        patriciafy_internal(str, pat);
    }
    return pat;
}

      

Then just use JSON.stringify

to print it for use.

A test on my machine with a 235,887 word dictionary (taken from here ) showed that Patricia's tree was slightly larger than 1/3 smaller than the flat dictionary of words where the flat dictionary format was {"name":1,"name2":1,...}

.

Patricia tree: 2,296,106 characters, Flat dictionary: 3,419,872 characters

This is still huge, but much smaller. Perhaps you can also save a significant amount of space by storing it in a non-JS format like.

var dictionary = {
    'a': {
        '': 1,
        'a': {
            '': 1,
            'l': {
                '': 1,
                'ii': 1
            }
        }
    }
};

      

which reduces to

// 57 characters
var dictionary={'a':{'':1,'a':{'':1,'l':{'':1,'ii':1}}}};

      



can be

// 17 characters - single @ represents ''
@a{@@a{@@l{@@i}}}

      

which you then analyzed.

=====

Patricia tree to - @ - syntax:

function patToAt_internal(pat) {
    var keys, i, s = "", key;
    s += "{";
    keys = Object.keys(pat);
    for(i = 0; i < keys.length; i++) {
        key = keys[i];
        s += "@";
        s += key; // empty string will do nothing
        if(pat[key] !== 1) { // concat the sub-trie
            s += "{" + patToAt_internal(pat[key]) + "}";
        }
    }
    s += "}";
    return s;
}

function patToAt(pat) {
    return patToAt_internal(pat);
}

      

@ is the syntax for the Patricia tree:

function atToPat_internal(at, pos) {

    var s = "", result, ff = true, ch;
    s += "{";
    pos++; // eat { character
    while((ch = at.charAt(pos)) !== "}") {
        // eat @ character
        pos++;
        // comma
        if(!ff) s += ",";
        else ff = false;
        // start key quote
        s += '"';
        // get the key
        while((ch = at.charAt(pos)) !== "@" && ch !== "{" && ch !== "}") {
            s += ch;
            pos++;
        }
        // close key quote and colon
        s += '":';
        // sub-trie
        if(ch === "{") {
            result = atToPat_internal(at, pos);
            s += result.s;
            pos = result.pos + 1;
        } else {
            s += "1";
        }
    }
    s += "}";
    return {pos: pos, s: s};

}

// this part is difficult, so we'll just delegate the heavy lifting to JSON.parse
function atToPat(at) {
    return JSON.parse(atToPat_internal(at, 0).s);
}

      

Using @ -syntax, the total size is reduced to one third of the original flat dictionary:

enter image description here

Summarizing...

Using this approach, you must first create an @ -tree (just once) by doing

var dictionary = { ... } // your words here
// usually you can get all of the log if you triple click, or just output it into a textarea
console.log(patToAt(patriciafy(dictionary)));

      

and then once you have done that, in your page you can do

var dictionary = atToPat("..."); // @-tree goes in the quotes

// and look up things like this:
isWord('apple');

      

+3


source







All Articles