All files / ramekin/lib simple-cluster.js

81.97% Statements 50/61
88.89% Branches 16/18
87.5% Functions 7/8
82.69% Lines 43/52

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116      1x       1x         1x   91x       1x 1x             13x       13x   13x 1105x 46501x 51x 51x         13x 1x 1x   12x         5097x 5097x 4556x   541x       12x 12x   12x   12x       1x                       1x 91x 91x   4186x         1x 1x 12x   12x   12x 1026x     12x     12x 1014x     1x       1x 1x       1x  
/**
 * Simple clustering algorithm.
 */
const _ = require('lodash')
 
/**
 * Agglomerative (bottom-up) clustering of trend phrases.
 *
 * Every phrase starts out as its own cluster. The two closest clusters are
 * merged repeatedly until the closest remaining pair is further apart than
 * `minDistance`, or only one cluster is left.
 */
class SimpleCluster {
  /**
   * Builds one single-phrase cluster per input entry.
   *
   * @param {Array<Object>} trendPhrases - entries read as
   *   `{ phrase, docs, score }`; `docs` is treated as an array of document
   *   identifiers (assumed to be primitives comparable by value — confirm
   *   against the caller).
   */
  constructor (trendPhrases) {
    // Largest distance at which two clusters are still merged.
    this.minDistance = 0.3
    // Initialise state up front so cluster() is safe even with no input.
    this.c = []
    this.d = []
    if (!trendPhrases || trendPhrases.length === 0) {
      console.error('No phrases to cluster')
      return
    }
    // Each cluster keeps parallel lists of its phrases and their scores,
    // plus the documents the phrases occur in.
    this.c = trendPhrases.map((phrase) => ({
      phrases: [phrase.phrase],
      docs: phrase.docs,
      score: [phrase.score]
    }))
  }

  /**
   * Finds the pair of clusters with the smallest distance.
   *
   * @param {Array<Array<number>>} d - square, symmetric distance matrix
   * @param {number} threshold - largest distance that still counts as a match
   * @returns {{i: number, j: number}|undefined} indices with `i < j`, or
   *   `undefined` when fewer than two clusters remain or the closest pair
   *   is further apart than `threshold`
   */
  closetMatch (d, threshold) {
    // Fewer than two clusters: there is no pair left to compare.
    if (d.length < 2) {
      return undefined
    }
    // Start from the first off-diagonal cell; only the upper triangle is
    // scanned because the matrix is symmetric.
    const min = { i: 0, j: 1 }
    for (let i = 0; i < d.length; i++) {
      for (let j = i + 1; j < d.length; j++) {
        if (d[i][j] < d[min.i][min.j]) {
          min.i = i
          min.j = j
        }
      }
    }
    if (d[min.i][min.j] > threshold) {
      console.log(d[min.i][min.j] + ' is above the threshold, so we won\'t be merging')
      return undefined
    }
    return min
  }

  /**
   * Distance between two clusters based on the documents they share.
   *
   * @param {{docs: Array}} a
   * @param {{docs: Array}} b
   * @returns {number} 1 when no document is shared, otherwise
   *   `1 - shared / min(a.docs.length, b.docs.length)`
   */
  distance (a, b) {
    const docsOfB = new Set(b.docs || [])
    // A Set keeps each shared document once, even if `a.docs` repeats it.
    const shared = new Set()
    for (const doc of (a.docs || [])) {
      if (docsOfB.has(doc)) {
        shared.add(doc)
      }
    }
    if (shared.size === 0) {
      return 1
    }
    return 1 - (shared.size / Math.min(a.docs.length, b.docs.length))
  }

  /**
   * Folds cluster `j` into cluster `i` (mutating `i`) and returns it.
   *
   * @param {Object} i - cluster that receives the merged content
   * @param {Object} j - cluster being absorbed; left unchanged
   * @returns {Object} `i`
   */
  merge (i, j) {
    i.phrases = i.phrases.concat(j.phrases)
    i.score = i.score.concat(j.score)
    // Union of both document lists, first occurrence wins the position.
    i.docs = [...new Set(i.docs.concat(j.docs))]

    return i
  }

  /**
   * Runs the clustering loop, shrinking `c` and `d` in place.
   *
   * @param {Function} distance - `(a, b) => number`
   * @param {Function} merge - `(a, b) => merged cluster`
   * @param {Function} closetMatch - `(d, threshold) => {i, j} | undefined`
   * @param {Array<Object>} c - clusters; merged entries are removed in place
   * @param {Array<Array<number>>} d - distance matrix, (re)filled in place
   * @param {number} minDistance - merge threshold handed to `closetMatch`
   * @returns {Array<Object>} `c`
   */
  hcluster (distance, merge, closetMatch, c, d, minDistance) {
    // Fill the lower triangle and mirror it; the diagonal is Infinity so a
    // cluster is never picked as its own closest match.
    for (let i = 0; i < c.length; i++) {
      d[i] = d[i] || []
      for (let j = 0; j <= i; j++) {
        d[i][j] = d[j][i] = (i === j) ? Infinity : distance(c[i], c[j])
      }
    }

    // Every iteration removes one cluster, so the loop ends at the latest
    // when a single cluster is left.
    let match = closetMatch(d, minDistance)
    while (match !== undefined) {
      // match.i < match.j, so index match.i stays valid after the removal.
      c[match.i] = merge(c[match.i], c[match.j])
      c.splice(match.j, 1)
      // Drop column j from every row, then row j itself.
      for (let row = 0; row < d.length; row++) {
        d[row].splice(match.j, 1)
      }
      d.splice(match.j, 1)

      // Only distances involving the merged cluster have changed.
      for (let k = 0; k < d.length; k++) {
        d[match.i][k] = d[k][match.i] = (k === match.i) ? Infinity : distance(c[match.i], c[k])
      }
      match = closetMatch(d, minDistance)
    }
    return c
  }

  /**
   * Clusters the phrases given to the constructor.
   *
   * @returns {Array<Object>} the remaining clusters (also kept on `this.c`)
   */
  cluster () {
    this.hcluster(this.distance, this.merge, this.closetMatch, this.c, this.d, this.minDistance)
    return this.c
  }
}
 
// Expose the SimpleCluster class as this module's CommonJS export.
module.exports = SimpleCluster