Files
talk/services/wordlist.js
T
Wyatt Johnson 8680d6cab8 Fixes issues with wordlist and usernames
- Ensure that the wordlist can't contain emojis
- Ensure that username checks use the correct tokenizer
2017-03-29 10:57:12 -06:00

274 lines
7.3 KiB
JavaScript

// Debug logger for this module, namespaced as `talk:services:wordlist`.
const debug = require('debug')('talk:services:wordlist');
const _ = require('lodash');
const natural = require('natural');
// Default tokenizer: used to break wordlist entries and scanned phrases into
// words (see Wordlist.parseList and Wordlist#match).
const tokenizer = new natural.WordTokenizer();
// Tokenizer passed to Wordlist#match when checking usernames (see
// Wordlist#checkName). Its pattern matches a literal underscore, so usernames
// are presumably split on `_` -- confirm against natural's RegexpTokenizer
// defaults.
const nameTokenizer = new natural.RegexpTokenizer({pattern: /\_/});
const SettingsService = require('./settings');
const Errors = require('../errors');
// Pattern used by Wordlist.parseList to keep emoji out of the wordlist. It
// matches a character in U+2700-U+27BF, a pair of regional indicator symbols,
// or any UTF-16 surrogate pair, optionally followed by a variation selector,
// a combining mark or skin-tone modifier, and zero-width-joiner sequences.
// Note that "any surrogate pair" covers every astral-plane character, not
// only emoji. The regex has no `g` flag, so `.test()` keeps no state between
// calls.
const EMOJI_REGEX = /(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?(?:\u200d(?:[^\ud800-\udfff]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?)*/;
/**
 * Holds the `banned` and `suspect` wordlists and implements the matching used
 * to screen request fields and usernames.
 *
 * Each list is stored as an array of "blockphrases", where a blockphrase is an
 * array of lowercased word tokens (e.g. the entry "Bad Word" is stored as
 * `['bad', 'word']`).
 * @type {Object}
 */
class Wordlist {
  constructor() {
    this.lists = {
      banned: [],
      suspect: []
    };
  }

  /**
   * Loads the wordlists from the settings stored in the database.
   * @return {Promise} resolves (with no value) once the lists are loaded
   */
  load() {
    return SettingsService
      .retrieve()
      .then((settings) => {
        // Insert the settings wordlist.
        this.upsert(settings.wordlist);
      });
  }

  /**
   * Replaces the stored wordlists with the supplied ones. Only the `banned`
   * and `suspect` keys are read; a key that is absent leaves the
   * corresponding stored list untouched.
   * @param  {Object} lists object optionally containing `banned` and/or
   *                        `suspect` arrays of words/phrases
   * @return {Promise}      resolves with this Wordlist instance
   */
  upsert(lists) {
    // Settings without a wordlist have nothing to insert; `k in lists` would
    // otherwise throw on null/undefined.
    if (!lists) {
      return Promise.resolve(this);
    }

    ['banned', 'suspect'].forEach((k) => {
      if (!(k in lists)) {
        return;
      }

      this.lists[k] = Wordlist.parseList(lists[k]);

      // Report the number of entries actually kept after parsing.
      debug(`Added ${this.lists[k].length} words to the ${k} wordlist.`);
    });

    return Promise.resolve(this);
  }

  /**
   * Parses raw wordlist entries into lowercased token arrays, dropping
   * non-string entries, entries containing emoji, entries that produce no
   * tokens, and duplicates.
   * @param  {Array} list array of words/phrases to parse for a list.
   * @return {Array}      array of token arrays (one per kept entry)
   */
  static parseList(list) {
    const seen = new Set();

    return list
      // Emoji are not allowed in the wordlist; non-strings can't be tokenized.
      .filter((word) => typeof word === 'string' && !EMOJI_REGEX.test(word))
      .map((word) => {
        const lowered = word.toLowerCase();

        // Single characters bypass the tokenizer so that they are kept as-is,
        // but are still lowercased because phrases are lowercased before
        // matching.
        if (lowered.length === 1) {
          return [lowered];
        }

        return tokenizer.tokenize(lowered);
      })
      .filter((tokens) => {
        if (tokens.length === 0) {
          return false;
        }

        // Deduplicate by content: the entries are arrays, so an identity
        // based comparison would never consider two of them equal.
        const key = tokens.join(' ');
        if (seen.has(key)) {
          return false;
        }
        seen.add(key);

        return true;
      });
  }

  /**
   * Tests the phrase to see if it contains any of the blockphrases in the
   * given list. A blockphrase matches when all of its tokens appear
   * consecutively, in order, in the tokenized phrase.
   * @param  {Array}   list   array of blockphrases (arrays of lowercase tokens)
   * @param  {String}  phrase value to check for blockwords.
   * @param  {Object}  tk     tokenizer exposing `tokenize(String) -> Array`,
   *                          defaults to the module's word tokenizer
   * @return {Boolean}        true if a blockword is found, false otherwise.
   */
  match(list, phrase, tk = tokenizer) {
    // Lowercase the phrase to ensure that we don't miss a match due to
    // capitalization.
    const words = tk.tokenize(phrase.toLowerCase());

    return list.some((blockphrase) => {
      if (blockphrase.length === 0) {
        return false;
      }

      const first = blockphrase[0];

      // Last index at which the whole blockphrase could still fit.
      const lastStart = words.length - blockphrase.length;

      // Try every occurrence of the first word, not just the earliest one:
      // "bad good bad word" must still match the blockphrase "bad word".
      for (
        let start = words.indexOf(first);
        start !== -1 && start <= lastStart;
        start = words.indexOf(first, start + 1)
      ) {
        if (blockphrase.every((word, i) => words[start + i] === word)) {
          return true;
        }
      }

      return false;
    });
  }

  /**
   * Scans a single field value for wordlist violations.
   * @param  {String} fieldName name of the field (used for debug output only)
   * @param  {String} phrase    the field's value
   * @return {Object}           `{banned: Error}` if a banned phrase was found,
   *                            `{suspect: Error}` if only a suspect phrase was
   *                            found, `{}` otherwise
   */
  scan(fieldName, phrase) {
    const errors = {};

    // If the field has no content, then it can't be profane!
    if (!phrase) {
      return errors;
    }

    // Check if the field contains a banned word. This is the worst possible
    // outcome, so the suspect list isn't consulted when it matches.
    if (this.match(this.lists.banned, phrase)) {
      debug(`the field "${fieldName}" contained a phrase "${phrase}" which contained a banned word/phrase`);
      errors.banned = Errors.ErrContainsProfanity;
      return errors;
    }

    // Check if the field contains a suspect word.
    if (this.match(this.lists.suspect, phrase)) {
      debug(`the field "${fieldName}" contained a phrase "${phrase}" which contained a suspected word/phrase`);
      errors.suspect = Errors.ErrContainsProfanity;
    }

    // Always hand back an object, including when nothing matched.
    return errors;
  }

  /**
   * Performs the filtering of the given body fields based on the loaded
   * wordlists.
   * @param  {Object}    body   object to read the fields from
   * @param  {...String} fields selectors (lodash `_.get` paths) of the fields
   *                            to check
   * @return {Object}           merged result of `scan` over the fields; may
   *                            contain `banned` and/or `suspect`
   */
  filter(body, ...fields) {
    // Start with the sensible default that the content does not contain
    // profanity.
    const errors = {};

    for (const fieldName of fields) {
      const phrase = _.get(body, fieldName, false);

      // If the field doesn't exist in the body, then it can't be profane!
      if (!phrase) {
        continue;
      }

      Object.assign(errors, this.scan(fieldName, phrase));

      // Stop looping through the fields once we've discovered the worst
      // possible situation (a banned word). A suspect word does not stop the
      // loop, as a later field could still contain a banned word.
      if (errors.banned) {
        break;
      }
    }

    return errors;
  }

  /**
   * Checks a potential username against the banned wordlist stored in the
   * settings.
   *
   * NOTE(review): a banned username makes the promise *resolve* with
   * `Errors.ErrContainsProfanity` rather than reject; confirm that callers
   * inspect the resolved value.
   * @param  {String} username the username to check
   * @return {Promise}         resolves with `Errors.ErrContainsProfanity` when
   *                           the username contains a banned word, otherwise
   *                           with `undefined`
   */
  static usernameCheck(username) {
    const wl = new Wordlist();

    return wl
      .load()
      .then(() => {
        if (!wl.checkName(wl.lists.banned, username)) {
          return Errors.ErrContainsProfanity;
        }
      });
  }

  /**
   * Checks a name against a list using the username tokenizer.
   * @param  {Array}   list array of blockphrases to check against
   * @param  {String}  name the name to check
   * @return {Boolean}      true if the name is acceptable (no blockphrase
   *                        matched), false otherwise
   */
  checkName(list, name) {
    return !this.match(list, name, nameTokenizer);
  }

  /**
   * Connect middleware for scanning request bodies for wordlisted words. The
   * result of the scan (see the instance `filter` method) is attached to
   * `req.wordlist`; it is an empty object when nothing matched.
   * @param  {Array} fields selectors for the body to extract the fields to be
   *                        tested
   * @return {Function} the Connect middleware
   */
  static filter(...fields) {
    return (req, res, next) => {
      // Create a new instance of the Wordlist.
      const wl = new Wordlist();

      wl
        .load()
        .then(() => {
          // Perform a filtering operation using the new instance of the
          // Wordlist.
          req.wordlist = wl.filter(req.body, ...fields);

          // Call the next piece of middleware.
          next();
        })
        // Forward load/scan failures to the error handler instead of leaving
        // the request hanging on an unhandled rejection.
        .catch(next);
    };
  }
}
// The Wordlist class is this module's sole export.
module.exports = Wordlist;