/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"use strict";
var doublearray = require("doublearray");
var DynamicDictionaries = require("../dict/DynamicDictionaries.js");
var TokenInfoDictionary = require("../dict/TokenInfoDictionary.js");
var ConnectionCosts = require("../dict/ConnectionCosts.js");
var UnknownDictionary = require("../dict/UnknownDictionary.js");
var CharacterDefinition = require("../dict/CharacterDefinition.js"); // TODO Remove this dependency
/**
 * Builder for dictionaries (token info, connection costs, unknown words).
 *
 * Sources are collected through the chainable methods on the prototype and
 * turned into dictionaries by build().
 *
 * Generates from matrix.def
 *   cc.dat: Connection costs
 *
 * Generates from *.csv
 *   dat.dat: Double array
 *   tid.dat: Token info dictionary
 *   tid_map.dat: targetMap
 *   tid_pos.dat: posList (part of speech)
 *
 * @constructor
 */
function DictionaryBuilder() {
    var self = this;
    // Token info rows, one array of fields per entry, in MeCab column order:
    // 0: surface form, 1: left id, 2: right id, 3: word cost,
    // 4: part of speech id, 5-: other features
    self.tid_entries = [];
    // Unknown-word rows, one array of fields per entry
    self.unk_entries = [];
    // Connection cost matrix text; "0 0" is the placeholder used until
    // costMatrix() supplies real contents
    self.matrix_text = "0 0";
    // Character definition text; empty until charDef() supplies it
    self.char_text = "";
}
/**
 * Append token info rows parsed from CSV text.
 *
 * The text is split into lines (LF or CRLF) and every line is split on ","
 * without any quote handling. The resulting rows are appended after the rows
 * added by earlier calls, so this can be called once per source text.
 * Blank lines (for example the one produced by a trailing newline) are kept
 * as single-field rows.
 *
 * @param {string} text One comma-separated entry per line
 * @returns {DictionaryBuilder} This builder, for chaining
 */
DictionaryBuilder.prototype.addTokenInfoDictionary = function (text) {
    // Accept "\r\n" as well as "\n": splitting on "\n" alone would leave a
    // stray "\r" glued to the last field of every row of a CRLF file.
    var new_entries = text.split(/\r?\n/).map(function (row) {
        return row.split(",");
    });
    this.tid_entries = this.tid_entries.concat(new_entries);
    return this;
};
/**
 * Set the connection cost matrix source used by buildConnectionCosts().
 *
 * @param {string} matrix_text Contents of file "matrix.def"
 * @returns {DictionaryBuilder} This builder, for chaining
 */
DictionaryBuilder.prototype.costMatrix = function (matrix_text) {
    var builder = this;
    builder.matrix_text = matrix_text;
    return builder;
};
/**
 * Set the character definition source. The text is stored as-is and is
 * handed to CharacterDefinition.readCharacterDefinition() when the unknown
 * dictionary is built.
 *
 * @param {string} char_text Character definition text
 * @returns {DictionaryBuilder} This builder, for chaining
 */
DictionaryBuilder.prototype.charDef = function (char_text) {
    var builder = this;
    builder.char_text = char_text;
    return builder;
};
/**
 * Set the unknown-word definition rows from CSV text.
 *
 * The text is split into lines (LF or CRLF) and every line is split on ","
 * without any quote handling. Unlike addTokenInfoDictionary(), the parsed
 * rows replace any rows set by an earlier call. Blank lines (for example the
 * one produced by a trailing newline) are kept as single-field rows.
 *
 * @param {string} text One comma-separated entry per line
 * @returns {DictionaryBuilder} This builder, for chaining
 */
DictionaryBuilder.prototype.unkDef = function (text) {
    // Accept "\r\n" as well as "\n": splitting on "\n" alone would leave a
    // stray "\r" glued to the last field of every row of a CRLF file.
    this.unk_entries = text.split(/\r?\n/).map(function (row) {
        return row.split(",");
    });
    return this;
};
/**
 * Build every dictionary from the sources collected so far and bundle them.
 *
 * The token info dictionary (with its trie) is built first, then the
 * connection costs, then the unknown dictionary.
 *
 * @returns {DynamicDictionaries} Bundle of trie, token info dictionary,
 *     connection costs and unknown dictionary
 */
DictionaryBuilder.prototype.build = function () {
    var token_info = this.buildTokenInfoDictionary();
    var costs = this.buildConnectionCosts();
    var unknown = this.buildUnknownDictionary();
    return new DynamicDictionaries(
        token_info.trie,
        token_info.token_info_dictionary,
        costs,
        unknown
    );
};
/**
 * Build the TokenInfoDictionary and the double-array trie that indexes it.
 *
 * Every token info id produced by TokenInfoDictionary#buildDictionary is
 * registered under the id the trie returns for that entry's surface form.
 *
 * @returns {{trie: *, token_info_dictionary: *}}
 */
DictionaryBuilder.prototype.buildTokenInfoDictionary = function () {
    var tid = new TokenInfoDictionary();
    // Plain object used as a hashmap, string -> string
    // (token info id -> surface form)
    var surface_by_id = tid.buildDictionary(this.tid_entries);
    var double_array = this.buildDoubleArray();
    var id;
    for (id in surface_by_id) {
        // The lookup result is recorded as-is; a surface form that is missing
        // from the trie is not detected or reported here.
        var trie_key = double_array.lookup(surface_by_id[id]);
        tid.addMapping(trie_key, id);
    }
    return { trie: double_array, token_info_dictionary: tid };
};
/**
 * Build the UnknownDictionary from the unknown-word rows and the character
 * definition text.
 *
 * Every token info id produced by UnknownDictionary#buildDictionary is
 * registered under the class id that the character definition's
 * invoke_definition_map returns for that entry's character class name.
 *
 * @returns {UnknownDictionary}
 */
DictionaryBuilder.prototype.buildUnknownDictionary = function () {
    var unknown = new UnknownDictionary();
    // Plain object used as a hashmap, string -> string
    // (token info id -> character class name)
    var class_name_by_id = unknown.buildDictionary(this.unk_entries);
    // Factory method: parses the character definition text
    var definition = CharacterDefinition.readCharacterDefinition(this.char_text);
    unknown.characterDefinition(definition);
    var id;
    for (id in class_name_by_id) {
        // The lookup result is recorded as-is; an unknown class name is not
        // detected or reported here.
        var class_key = definition.invoke_definition_map.lookup(class_name_by_id[id]);
        unknown.addMapping(class_key, id);
    }
    return unknown;
};
/**
 * Build the connection costs dictionary from the stored matrix text
 * (the value set by costMatrix(), or the constructor's "0 0" placeholder).
 *
 * @returns {*} Whatever ConnectionCosts.build() produces for the matrix text
 */
DictionaryBuilder.prototype.buildConnectionCosts = function () {
    var matrix_source = this.matrix_text;
    return ConnectionCosts.build(matrix_source);
};
/**
 * Build the double-array trie over the surface forms of all token info rows.
 *
 * Each row contributes one { k, v } pair: k is the row's first field (the
 * surface form) and v is a running number starting at 0, in row order.
 *
 * @returns {DoubleArray} Double-Array trie
 */
DictionaryBuilder.prototype.buildDoubleArray = function () {
    var next_id = 0;
    var keyed_words = this.tid_entries.map(function (fields) {
        var word = { k: fields[0], v: next_id };
        next_id += 1;
        return word;
    });
    // NOTE(review): 1024 * 1024 is passed straight to doublearray.builder();
    // presumably the initial buffer size - confirm against doublearray docs.
    return doublearray.builder(1024 * 1024).build(keyed_words);
};
// CommonJS export: the DictionaryBuilder constructor is this module's export.
module.exports = DictionaryBuilder;