all files / packages/search/processors/ search-index.js

28.21% Statements 11/39
0% Branches 0/26
11.11% Functions 1/9
28.21% Lines 11/39
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114                                                                                                                                                                                                             
'use strict';
 
var _ = require('lodash');
var stemmer = require('stemmer');
var tokenizer = require('keyword-extractor');
var decode = require('ent/decode');
var bloom = require('bloomfilter');
var ByteBuffer = require("bytebuffer");
 
module.exports = function generateSearchIndexProcessorFactory(aliasMap, log, renderMarkdown) {
  /**
   * Reduces rendered markup to plain text.
   *
   * Runs of line breaks are collapsed to a single space first, then every
   * `<code…code>` span and every remaining `<…>` tag is replaced by a space,
   * and the result is finally passed through `decode` (from `ent/decode`).
   *
   * @param {string} md - rendered content; any falsy value yields ''.
   * @returns {string} the stripped, decoded text, or '' for falsy input.
   */
  function cleanText(md) {
    if (!md) {
      return '';
    }

    // Line breaks go first so that the `.` in the patterns below can span
    // what used to be several lines.
    var singleLine = md.replace(/[\r\n]+/g, ' ');
    var withoutCode = singleLine.replace(/<code.+?code>/mgi, ' ');
    var withoutTags = withoutCode.replace(/<.+?>/gm, ' ');

    // Decoding happens last, after all tag-like sequences have been removed.
    return decode(withoutTags);
  }
 
  var debug = log.debug;
 
  var errorRate = 0.001;
 
  return {
    $runAfter: ['inlineTagProcessor'],
    $runBefore: ['writing-files'],
    errorRate: function (v) {
      if (v) {
        errorRate = v;
        return this;
      }
      return errorRate;
    },
    $process: function generateSearchIndexProcessor (docs) {
 
      var tokens = [];
      var buffer = new ByteBuffer(0);
 
      var index = docs.filter(function (v) {
        // build search index for rendered docs only
        // do not dig down to the document sources
        // and skip extra content
        return ['componentGroup', 'config', 'nav-data', 'website'].indexOf(v.docType) < 0 &&
               !!v.renderedContent;
      }).map(function(v) {
        // for each document extract only certain properties
        var index = tokenizer.extract(
          ['events', 'properties', 'methods', 'name', 'codeName', 'renderedContent', 'docType'].map(function(k) {
            if ('string' === typeof v[k]) {
              // rendered content should be cleaned a bit
              return 'renderedContent' === k ? cleanText(v[k]) : v[k];
            } else if (v[k]) {
              var m = v[k];
              return [m.codeName || '',
                      m.name || '',
                      cleanText(m.renderedContent || ''),
                      m.alias || '',
                      m.eventTarget || '',
                      m.eventType || ''
              ].join(' ');
            } else {
              return '';
            }
          }).join(' ').replace(/(\$(\w+))/g, '$1 $2'), {
          language: "english",
          remove_digits: true,
          return_changed_case: true,
          remove_duplicates: true
        });
 
        var filter = new bloom.BloomFilter(
          Math.ceil(index.length * Math.abs(Math.log(errorRate)) / (Math.pow(Math.LN2, 2))),
          Math.ceil(Math.log2(1/errorRate)));
 
        index.forEach(function(v) {
          filter.add(v);
        })
 
        buffer.ensureCapacity(buffer.limit + filter.buckets.length*4);
 
        // append filter
        // use lodash for node <4.0 compat because buckets is a typed array
        // TODO: revert to array method by the end of node 0.12 LTS (https://github.com/nodejs/LTS#lts_schedule)
        _.forEach(filter.buckets, function(v) {
          buffer.writeInt32(v);
        })
 
        // update index
        tokens.push({
          name: v.name,
          path: v.areaKey || v.path,
          type: v.docType,
          size: filter.buckets.length
        });
      });
 
      docs.push({
        docType: 'search-data-index',
        id: 'search-index',
        name: 'search.index',
        renderedContent: buffer.toBinary(0,buffer.offset)
      });
 
      docs.push({
        docType: 'search-data',
        id: 'search-meta',
        renderedContent: JSON.stringify(tokens)
      });
    }
  };
};