// wink-sentiment
// Accurate and fast sentiment scoring of phrases with hashtags, emoticons & emojis.
//
// Copyright (C) 2017-18 GRAYPE Systems Private Limited
//
// This file is part of “wink-sentiment”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var emojis = require( './emojis.js' );
var afinn = require( './afinn-en-165.js' );
var emoticons = require( './emoticons.js' );
var negations = require( './negations.js' );
var affin2Grams = require( './afinn-en-165-2grams.js' );
var tokenize = require( 'wink-tokenizer' )().tokenize;
/* eslint max-depth: 0 */
// ### normalize
/**
*
* Computes the normalized sentiment score from the absolute scores.
*
* @param {number} hss absolute sentiment scrore of hashtags.
* @param {number} wss absolute sentiment scrore of words/emojis/emoticons.
* @param {number} sentiHashtags number of hashtags that have an associated sentiment score.
* @param {number} sentiWords wnumber of words that have an associated sentiment score.
* @param {number} totalWords total number of words in the text.
* @return {number} normalized score.
* @private
*/
var normalize = function ( hss, wss, sentiHashtags, sentiWords, totalWords ) {
// **N**ormalized **h**ashtags & **w**ords **s**entiment **s**cores.
let nhss = 0,
nwss = 0;
// 1. Normalize hashtags sentiment score by computing the average.
if ( sentiHashtags ) nhss = hss / sentiHashtags;
if ( sentiWords ) {
// 2. Normalize words sentiment score by computing the average.
nwss = wss / sentiWords;
// 3. Normalized words sentiment score is further adjusted on the basis of the
// total number of words in the text.
// Average sentence length in words (assumed).
const avgLength = 15;
// Make adjustments.
nwss /= Math.sqrt( ( totalWords > avgLength ) ? ( totalWords / avgLength ) : 1 );
}
return ( nhss && nwss ) ? ( ( nhss + nwss ) / 2 ) : ( nwss || nhss );
}; // normalize()
// ### sentiment
/**
*
* Computes the absolue and normalized sentiment scores of the input `phrase`,
* after tokenizing it.
*
* The normalized score is computed by taking into account of absolute scores of
* words, emojis, emoticons, and hashtags and adjusting it on the basis of total
* words in the text; this is always between -5 and +5. A score of less than 0 indicates
* negative sentiments and a score of more than 0 indicates positive sentiments;
* wheras a near zero score suggests a neutral sentiment. While counting tokens
* only the ones tagged as **`word`**, **`emoji`**, or **`emoticon`** are counted;
* and one letter words are ignored.
*
* It performs tokenization using [wink-tokenizer](http://winkjs.org/wink-tokenizer/).
* During sentiment analysis, each token may be assigned up to 3 new properties.
* These properties are:
*
* 1. **`score`** — contains the sentiment score of the word, emoji, emoticon or hashtag, which is always
* between -5 and +5. This is added only when the word in question has a positive or
* negative sentiment associated with it.
* 2. **`negation`** — is added & set to **true** whenever the `score` of the
* token has beeen impacted due to a negation word apprearing prior to it.
* 3. **`grouped`** — is added whenever, the token is the first
* word of a short idiom or a phrase. It's value provides the number of tokens
* that have been grouped together to form the phrase/idiom.
*
* @param {string} phrase whoes sentiment score needs to be computed.
* @return {object} absolute `score`, `normalizedScore` and `tokenizedPhrase` of `phrase`.
*
* @example
* sentiment( 'not a good product #fail' );
* // -> { score: -5,
* // normalizedScore: -2.5,
* // tokenizedPhrase: [
* // { value: 'not', tag: 'word' },
* // { value: 'a', tag: 'word' },
* // { value: 'good', tag: 'word', negation: true, score: -3 },
* // { value: 'product', tag: 'word' },
* // { value: '#fail', tag: 'hashtag', score: -2 }
* // ]
* // }
*/
var sentiment = function ( phrase ) {
if ( typeof phrase !== 'string' ) {
throw Error( 'wink-sentiment: input phrase must be a string, instead found: ' + typeof phrase );
}
// Early exit.
var tokenizedPhrase = tokenize( phrase );
if ( tokenizedPhrase.length === 0 ) return { score: 0, normalizedScore: 0 };
// Sentiment Score.
var ss = 0;
// Hash Tags SS.
var hss = 0;
// Number of sentiment containing hashtags and words encountered.
var sentiHashtags = 0,
sentiWords = 0;
// Number of words encountered.
var words = 0;
// Helpers: for loop indexes, token, temp ss, and word count.
var k, kmax, t, tkn, tss, wc;
for ( k = 0, kmax = tokenizedPhrase.length; k < kmax; k += 1 ) {
tkn = tokenizedPhrase[ k ];
t = tkn.value;
switch ( tkn.tag ) {
case 'emoji':
tss = emojis[ t ];
if ( tss ) {
ss += tss;
tkn.score = tss;
sentiWords += 1;
}
words += 1;
break;
case 'emoticon':
tss = emoticons[ t ];
if ( tss ) {
ss += tss;
tkn.score = tss;
sentiWords += 1;
}
words += 1;
break;
case 'hashtag':
tss = afinn[ t.slice( 1 ).toLowerCase() ];
if ( tss ) {
tkn.score = tss;
hss += tss;
sentiHashtags += 1;
}
break;
case 'word':
t = t.toLowerCase();
wc = 1;
// Check for bigram configurations i.e. token at `k` and `k+1`. Accordingly
// compute the sentiment score in `tss`. Convert to Lower Case for case insensitive comparison.
if ( ( k < ( kmax - 1 ) ) && affin2Grams[ t ] && ( affin2Grams[ t ][ tokenizedPhrase[ k + 1 ].value.toLowerCase() ] !== undefined ) ) {
tss = affin2Grams[ t ][ tokenizedPhrase[ k + 1 ].value.toLowerCase() ];
tkn.grouped = 1;
// Will have to count `2` words!
wc = 2;
// sentiWords += 1;
} else {
tss = afinn[ t ] || 0;
// sentiWords += 1;
}
// Check for negation — upto two words ahead; even a bigram AFINN config may be negated! Convert to Lower Case for case insensitive comparison.
if ( ( k > 0 && negations[ tokenizedPhrase[ k - 1 ].value.toLowerCase() ] ) || ( k > 1 && negations[ tokenizedPhrase[ k - 2 ].value.toLowerCase() ] ) ) {
tss = -tss;
tkn.negation = true;
}
ss += tss;
// Increment `k` by 1 if a bigram config was found earlier i.e. `wc` was set to **2**.
k += ( wc - 1 );
if ( tss ) {
tkn.score = tss;
sentiWords += 1;
}
// Update number of words accordingly.
words += wc;
break;
default:
// Do Nothing!
} // swtich ( t.tag )
}
// if ( words === 0 ) words = 1;
// Return score and its normalized value.
return {
score: ( ss + hss ),
normalizedScore: +( normalize( hss, ss, sentiHashtags, sentiWords, words ) ).toFixed( 4 ),
tokenizedPhrase: tokenizedPhrase
};
}; // sentiment()
module.exports = sentiment;