const debug = require('debug')('talk:services:wordlist');
const _ = require('lodash');
const natural = require('natural');
const tokenizer = new natural.WordTokenizer();
// Tokenizer used for usernames.
// NOTE(review): presumably splits names on underscores — confirm against the
// `natural` RegexpTokenizer documentation.
const nameTokenizer = new natural.RegexpTokenizer({pattern: /\_/});
const SettingsService = require('./settings');
const Errors = require('../errors');

// Regular expression used to keep emoji out of the wordlist.
const EMOJI_REGEX = /(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?(?:\u200d(?:[^\ud800-\udfff]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?)*/;

/**
 * The root wordlist object. Holds the parsed `banned` and `suspect` lists,
 * where every entry is an array of lowercased tokens (one array per
 * word/phrase).
 * @type {Object}
 */
class Wordlist {

  constructor() {
    this.lists = {
      banned: [],
      suspect: []
    };
  }

  /**
   * Loads wordlists in from the database.
   * @return {Promise} resolves once the settings wordlist has been applied
   */
  load() {
    return SettingsService
      .retrieve()
      .then((settings) => {

        // Insert the settings wordlist.
        this.upsert(settings.wordlist);
      });
  }

  /**
   * Inserts the wordlist data.
   * @param  {Object} lists object that may carry `banned` and/or `suspect`
   *                        arrays of words; keys that are absent leave the
   *                        corresponding list untouched
   * @return {Promise}      resolves with this Wordlist instance
   */
  upsert(lists) {

    // Nothing to insert when no wordlist was supplied at all (guards the `in`
    // check below, which throws on null/undefined).
    if (lists === null || lists === undefined) {
      return Promise.resolve(this);
    }

    // Add the words to this array, but also lowercase the words so that an
    // easy comparison can take place.
    ['banned', 'suspect'].forEach((k) => {
      if (!(k in lists)) {
        return;
      }

      this.lists[k] = Wordlist.parseList(lists[k]);

      debug(`Added ${lists[k].length} words to the ${k} wordlist.`);
    });

    return Promise.resolve(this);
  }

  /**
   * Parses the list content: drops entries containing emoji, lowercases and
   * tokenizes each remaining entry, removes entries that produced no tokens
   * and removes duplicate phrases.
   * @param  {Array} list array of words to parse for a list.
   * @return {Array}      the parsed list, an array of token arrays
   */
  static parseList(list) {
    const phrases = list
      .filter((word) => !EMOJI_REGEX.test(word))
      .map((word) => {
        const lowered = word.toLowerCase();

        // Single character entries bypass the tokenizer and are kept as-is,
        // but still lowercased because phrases are lowercased before matching.
        if (word.length === 1) {
          return [lowered];
        }

        return tokenizer.tokenize(lowered);
      })
      .filter((tokens) => tokens.length > 0);

    // Every phrase is a distinct array instance, so duplicates have to be
    // detected by content rather than by reference.
    const seen = new Set();
    return phrases.filter((tokens) => {
      const key = tokens.join('\u0000');
      if (seen.has(key)) {
        return false;
      }

      seen.add(key);
      return true;
    });
  }

  /**
   * Tests the phrase to see if it contains any of the defined blockwords.
   * @param  {Array}  list   parsed list (array of token arrays) to test against
   * @param  {String} phrase value to check for blockwords.
   * @param  {Object} tk     tokenizer used to split the phrase, defaults to the
   *                         word tokenizer
   * @return {Boolean}       true if a blockword is found, false otherwise.
   */
  match(list, phrase, tk = tokenizer) {

    // Lowercase the phrase to ensure that we don't miss a match due to
    // capitalization.
    const phraseWords = tk.tokenize(phrase.toLowerCase());

    // This will return true in the event that at least one blockword is found
    // in the phrase.
    return list.some((blockphrase) => {

      // Last index at which the blockphrase could start and still fit entirely
      // inside the phrase.
      const lastStart = phraseWords.length - blockphrase.length;

      // Try every occurrence of the first blockword, not just the first one:
      // an early partial match must not hide a later complete match.
      let start = phraseWords.indexOf(blockphrase[0]);
      while (start !== -1 && start <= lastStart) {
        const matches = blockphrase.every((word, offset) => phraseWords[start + offset] === word);
        if (matches) {
          return true;
        }

        start = phraseWords.indexOf(blockphrase[0], start + 1);
      }

      return false;
    });
  }

  /**
   * Scans a specific field for wordlist violations.
   * @param  {String} fieldName name of the field, used for debug output only
   * @param  {String} phrase    value of the field to scan
   * @return {Object}           errors object; has a `banned` key when a banned
   *                            word/phrase was found, otherwise a `suspect` key
   *                            when a suspect word/phrase was found, otherwise
   *                            it is empty
   */
  scan(fieldName, phrase) {
    let errors = {};

    // If the field doesn't exist in the body, then it can't be profane!
    if (!phrase) {

      // Return that there wasn't a profane word here.
      return errors;
    }

    // Check if the field contains a banned word.
    if (this.match(this.lists.banned, phrase)) {
      debug(`the field "${fieldName}" contained a phrase "${phrase}" which contained a banned word/phrase`);

      errors.banned = Errors.ErrContainsProfanity;

      // A banned word is the worst possible situation, there is no need to
      // check the suspect list as well.
      return errors;
    }

    // Check if the field contains a suspect word.
    if (this.match(this.lists.suspect, phrase)) {
      debug(`the field "${fieldName}" contained a phrase "${phrase}" which contained a suspected word/phrase`);

      errors.suspect = Errors.ErrContainsProfanity;
    }

    // Always hand back the errors object, even when nothing matched.
    return errors;
  }

  /**
   * Perform the filtering based on the loaded wordlists.
   * @param  {Object}    body   object to read the fields from
   * @param  {...String} fields selectors for the fields of the body to scan
   * @return {Object}           accumulated errors object (see `scan`)
   */
  filter(body, ...fields) {

    // Start with the sensible default that the content does not contain
    // profanity.
    let errors = {};

    // Loop over all the fields from the body that we want to check.
    for (let i = 0; i < fields.length; i++) {
      const fieldName = fields[i];
      const phrase = _.get(body, fieldName, false);

      // If the field doesn't exist in the body, then it can't be profane!
      if (!phrase) {
        continue;
      }

      errors = Object.assign(errors, this.scan(fieldName, phrase));

      // Stop looping through the fields once a banned word was discovered, it
      // is the worst possible situation. A suspect word does not stop the
      // loop because a later field could still contain a banned word.
      if (errors.banned) {
        break;
      }
    }

    return errors;
  }

  /**
   * Check a potential username for banned words.
   * @param  {String} username the username to check
   * @return {Promise}         resolves with Errors.ErrContainsProfanity when
   *                           the username matches the banned list, resolves
   *                           with undefined otherwise
   */
  static usernameCheck(username) {
    const wl = new Wordlist();

    return wl
      .load()
      .then(() => {
        if (!wl.checkName(wl.lists.banned, username)) {
          return Errors.ErrContainsProfanity;
        }
      });
  }

  /**
   * Checks a name against a list using the name tokenizer.
   * @param  {Array}  list parsed list (array of token arrays) to test against
   * @param  {String} name the name to check
   * @return {Boolean}     true when the name does NOT match the list
   */
  checkName(list, name) {
    return !this.match(list, name, nameTokenizer);
  }

  /**
   * Connect middleware for scanning request bodies for wordlisted words. The
   * errors object produced by the instance `filter` method is attached to
   * `req.wordlist`; it is empty when nothing matched.
   * @param  {Array} fields selectors for the body to extract the fields to be
   *                        tested
   * @return {Function} the Connect middleware
   */
  static filter(...fields) {
    return (req, res, next) => {

      // Create a new instance of the Wordlist.
      const wl = new Wordlist();

      wl
        .load()
        .then(() => {

          // Perform a filtering operation using the new instance of the
          // Wordlist.
          req.wordlist = wl.filter(req.body, ...fields);

          // Call the next piece of middleware.
          next();
        })

        // Forward failures (e.g. the settings could not be loaded) to the
        // error handling middleware instead of leaving the request hanging.
        .catch(next);
    };
  }
}

module.exports = Wordlist;