All files / ramekin/lib simple-cluster.js

0% Statements 0/61
0% Branches 0/18
0% Functions 0/8
0% Lines 0/52

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116                                                                                                                                                                                                                                       
/**
 * Simple clustering algorithm.
 */
const _ = require('lodash')
 
/**
 * Groups trend phrases into clusters by repeatedly merging the pair of
 * clusters whose document lists overlap the most, until no pair is closer
 * than `minDistance`.
 *
 * Instance state:
 *  - `c`: the current clusters, each `{ phrases, docs, score }`
 *  - `d`: the square, symmetric distance matrix between the clusters in `c`
 *  - `minDistance`: largest distance at which two clusters are still merged
 */
class SimpleCluster {
  /**
   * Create one single-phrase cluster per input phrase.
   *
   * An empty (or missing) list is reported on stderr but still yields a
   * usable instance whose `cluster()` returns an empty array.
   *
   * @param {Array<{phrase: *, docs: Array, score: *}>} trendPhrases
   */
  constructor (trendPhrases) {
    const phrases = trendPhrases || []
    if (phrases.length === 0) {
      console.error('No phrases to cluster')
    }
    this.minDistance = 0.3
    // every phrase starts out as its own cluster
    this.c = phrases.map(phrase => ({
      phrases: [phrase.phrase],
      docs: phrase.docs,
      score: [phrase.score]
    }))
    // rows are created lazily by hcluster(); each row must be its own array
    this.d = []
  }

  /**
   * Find the closest pair of clusters in a distance matrix.
   *
   * Only the upper triangle (j > i) is scanned, so the diagonal is never
   * considered. On ties the first pair in row-major order wins.
   *
   * @param {Array<Array<number>>} d square distance matrix
   * @param {number} threshold largest distance that still counts as a match
   * @returns {{i: number, j: number}|undefined} indices with i < j, or
   *   undefined when fewer than two clusters remain or the closest pair is
   *   further apart than `threshold`
   */
  closetMatch (d, threshold) {
    // fewer than two clusters -> there is no pair to compare
    if (d.length < 2) {
      return undefined
    }
    // start from the first off-diagonal cell
    const min = { i: 0, j: 1 }
    for (let i = 0; i < d.length; i++) {
      for (let j = i + 1; j < d.length; j++) {
        if (d[i][j] < d[min.i][min.j]) {
          min.i = i
          min.j = j
        }
      }
    }
    if (d[min.i][min.j] > threshold) {
      console.log(`${d[min.i][min.j]} is above the threshold, so we won't be merging`)
      return undefined
    }
    return min
  }

  /**
   * Distance between two clusters based on their shared documents:
   * 1 - |A ∩ B| / min(|A|, |B|), where A and B are the sets of distinct
   * entries in `a.docs` and `b.docs`.
   *
   * Duplicate entries are ignored so that two identical doc lists always
   * have distance 0. A `docs` value that is not an array is treated as empty.
   *
   * @param {{docs: Array}} a
   * @param {{docs: Array}} b
   * @returns {number} 0 when one doc set contains the other, 1 when nothing
   *   is shared
   */
  distance (a, b) {
    const docsA = new Set(Array.isArray(a.docs) ? a.docs : [])
    const docsB = new Set(Array.isArray(b.docs) ? b.docs : [])
    let shared = 0
    docsA.forEach(doc => {
      if (docsB.has(doc)) {
        shared++
      }
    })
    if (shared === 0) {
      return 1
    }
    return 1 - (shared / Math.min(docsA.size, docsB.size))
  }

  /**
   * Fold cluster `j` into cluster `i`. Mutates and returns `i`; `j` is left
   * untouched.
   *
   * @param {{phrases: Array, docs: Array, score: Array}} i
   * @param {{phrases: Array, docs: Array, score: Array}} j
   * @returns {{phrases: Array, docs: Array, score: Array}} `i`
   */
  merge (i, j) {
    i.phrases = i.phrases.concat(j.phrases)
    i.score = i.score.concat(j.score)
    // merge docs & remove duplicates (first occurrence order is kept)
    i.docs = Array.from(new Set(i.docs.concat(j.docs)))
    return i
  }

  /**
   * Merge clusters pairwise until `closetMatch` reports no remaining pair.
   *
   * Both `c` and `d` are modified in place.
   *
   * @param {function(Object, Object): number} distance
   * @param {function(Object, Object): Object} merge
   * @param {function(Array, number): ({i: number, j: number}|undefined)} closetMatch
   * @param {Array<Object>} c clusters
   * @param {Array<Array<number>>} d distance matrix (filled in here)
   * @param {number} minDistance threshold handed to `closetMatch`
   * @returns {Array<Object>} `c`, after merging
   */
  hcluster (distance, merge, closetMatch, c, d, minDistance) {
    // calculate the initial distance matrix; the diagonal is Infinity so a
    // cluster can never be selected as its own closest match
    for (let i = 0; i < c.length; i++) {
      d[i] = d[i] || []
      for (let j = 0; j <= i; j++) {
        d[i][j] = d[j][i] = (i === j) ? Infinity : distance(c[i], c[j])
      }
    }

    // every pass removes one row from d, and closetMatch returns undefined
    // once fewer than two rows remain, so this loop always terminates
    let match
    while ((match = closetMatch(d, minDistance)) !== undefined) {
      c[match.i] = merge(c[match.i], c[match.j])
      // remove the jth cluster; match.i < match.j, so index i stays valid
      c.splice(match.j, 1)
      // remove the jth column
      for (let row = 0; row < d.length; row++) {
        d[row].splice(match.j, 1)
      }
      // remove the jth row
      d.splice(match.j, 1)

      // only distances involving the merged cluster have changed
      for (let k = 0; k < d.length; k++) {
        d[match.i][k] = d[k][match.i] = (k === match.i) ? Infinity : distance(c[match.i], c[k])
      }
    }
    return c
  }

  /**
   * Run the clustering on this instance's state.
   *
   * @returns {Array<{phrases: Array, docs: Array, score: Array}>} the merged
   *   clusters (the same array as `this.c`)
   */
  cluster () {
    this.hcluster(this.distance, this.merge, this.closetMatch, this.c, this.d, this.minDistance)
    return this.c
  }
}
 
// CommonJS export: requiring this module yields the SimpleCluster class.
module.exports = SimpleCluster