Files
talk/services/wordlist.js
T
Wyatt Johnson 2f7f0249b2 Added status changing + history
- Status's for comments is now an array of objects
2016-12-05 11:28:27 -05:00

165 lines
4.7 KiB
JavaScript

const debug = require('debug')('talk:services:wordlist');
const _ = require('lodash');
const natural = require('natural');
const tokenizer = new natural.WordTokenizer();
const Setting = require('../models/setting');
/**
 * Module-level state of the wordlist service. The service functions in this
 * file are attached to this same object.
 * @type {Object}
 */
const wordlist = {
  // Tokenized blockphrases loaded so far; starts out empty.
  list: [],

  // Stays false until words have been inserted into the list.
  enabled: false,
};
/**
 * Loads the wordlist stored in the application settings and inserts it into
 * this service (which also enables the wordlist).
 * @return {Promise} resolves with whatever wordlist.insert resolves with once
 *                   the words from the settings have been inserted
 */
wordlist.init = () => {
  return Setting
    .retrieve()
    // Hand the insert promise back to the chain instead of dropping it, so
    // callers of init() observe the outcome of the insert itself.
    .then((settings) => wordlist.insert(settings.wordlist));
};
/**
 * Inserts the wordlist data and enables the wordlist.
 *
 * Every word/phrase is lowercased and tokenized; the resulting token arrays
 * are what gets stored in `wordlist.list`. Entries that are not strings, or
 * that produce no tokens at all, are skipped, and entries whose tokens are
 * already present are not added a second time.
 * @param {Array} list list of words to be added to the wordlist; anything
 *                     that is not an array is treated as an empty list
 * @return {Promise} resolves with the wordlist object
 */
wordlist.insert = (list) => {
  // A missing list must not prevent the service from starting.
  const incoming = Array.isArray(list) ? list : [];

  // Lowercase the words so that an easy comparison can take place, and split
  // them into tokens so that multi-word phrases can be matched word by word.
  const tokenized = incoming
    .filter((word) => typeof word === 'string')
    .map((word) => tokenizer.tokenize(word.toLowerCase()));

  // Entries are arrays, so they have to be compared by content rather than by
  // identity; the JSON form of the token array serves as the comparison key.
  const seen = new Set();
  wordlist.list = wordlist.list.concat(tokenized).filter((tokens) => {
    // An entry without tokens can never match anything.
    if (tokens.length === 0) {
      return false;
    }

    const key = JSON.stringify(tokens);
    if (seen.has(key)) {
      return false;
    }

    seen.add(key);
    return true;
  });

  debug(`Added ${incoming.length} words to the wordlist, now the wordlist is ${wordlist.list.length} entries long.`);

  // Enable the wordlist.
  wordlist.enabled = true;

  return Promise.resolve(wordlist);
};
/**
 * Tests the phrase to see if it contains any of the defined blockwords.
 *
 * The phrase is lowercased and tokenized, and every blockphrase is compared
 * against every possible position in the phrase, so a blockphrase is found
 * even when its first word also appears earlier in the phrase on its own.
 * @param {String} phrase value to check for blockwords; null/undefined never
 *                        match, other non-string values are stringified first
 * @return {Boolean} true if a blockword is found, false otherwise.
 */
wordlist.match = (phrase) => {
  // Nothing to inspect.
  if (phrase === null || phrase === undefined) {
    return false;
  }

  // Lowercase the phrase to ensure that we don't miss a match due to
  // capitalization. String() keeps a non-string value from throwing here.
  const phraseWords = tokenizer.tokenize(String(phrase).toLowerCase());

  // This will return true in the event that at least one blockphrase is found
  // in the phrase.
  return wordlist.list.some((blockphrase) => {
    // An empty blockphrase would trivially "match" everywhere; treat it as a
    // non-match instead.
    if (blockphrase.length === 0) {
      return false;
    }

    // Last index at which the whole blockphrase still fits into the phrase.
    const lastStart = phraseWords.length - blockphrase.length;

    // Try every start position, not just the first occurrence of the first
    // blockword, so that "bad day, bad word" still matches "bad word".
    for (let start = 0; start <= lastStart; start++) {
      const matchesHere = blockphrase.every(
        (word, offset) => phraseWords[start + offset] === word
      );

      if (matchesHere) {
        return true;
      }
    }

    return false;
  });
};
// ErrContainsProfanity is the single shared error instance used to signal
// that a payload contained profanity/wordlisted words. It carries a `status`
// of 400.
const ErrContainsProfanity = Object.assign(new Error('contains profanity'), {
  status: 400,
});
/**
 * Builds a Connect middleware that scans the selected request body fields for
 * wordlisted words. The middleware always sets `req.wordlist.matched`: it is
 * ErrContainsProfanity when one of the fields matched and false otherwise.
 * @param {Array} fields selectors for the body to extract the fields to be
 *                       tested
 * @return {Function} the Connect middleware
 */
wordlist.filter = (...fields) => {
  return (req, res, next) => {
    // Assume the content is clean until a match proves otherwise.
    req.wordlist = {
      matched: false
    };

    // With the wordlist disabled there is nothing to check, so the request is
    // forwarded right away.
    if (!wordlist.enabled) {
      return next();
    }

    // Walk the selected fields and stop at the first one that matches.
    for (const selector of fields) {
      // Fields missing from the body fall back to false and are skipped.
      const value = _.get(req.body, selector, false);

      if (value && wordlist.match(value)) {
        debug(`the field "${selector}" contained a phrase "${value}" which contained a wordlisted word/phrase`);

        // Record the match on the request for later handlers.
        req.wordlist.matched = ErrContainsProfanity;
        break;
      }
    }

    next();
  };
};
// The wordlist object itself is the module's export.
module.exports = wordlist;
// The profanity error is additionally exposed as a property of that export,
// so consumers of this module can reference the same error instance.
module.exports.ErrContainsProfanity = ErrContainsProfanity;