From 9fa1f3d958bfa7c88c1313fc8eced34b958611ba Mon Sep 17 00:00:00 2001 From: Belen Curcio Date: Tue, 22 Aug 2017 16:15:41 -0300 Subject: [PATCH 1/5] Adding auto escape --- client/coral-admin/src/components/CommentBodyHighlighter.js | 1 + 1 file changed, 1 insertion(+) diff --git a/client/coral-admin/src/components/CommentBodyHighlighter.js b/client/coral-admin/src/components/CommentBodyHighlighter.js index 39be90d81..9a430f7c7 100644 --- a/client/coral-admin/src/components/CommentBodyHighlighter.js +++ b/client/coral-admin/src/components/CommentBodyHighlighter.js @@ -17,6 +17,7 @@ export default ({suspectWords, bannedWords, body, ...rest}) => { return ( From b6951488c42fdba9f17e6f59010066bc9e9db71e Mon Sep 17 00:00:00 2001 From: Belen Curcio Date: Tue, 22 Aug 2017 16:37:24 -0300 Subject: [PATCH 2/5] Tokenizer is messing with escaped regexes --- services/wordlist.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/services/wordlist.js b/services/wordlist.js index 2ae8ed90c..477baace6 100644 --- a/services/wordlist.js +++ b/services/wordlist.js @@ -9,6 +9,10 @@ const Errors = require('../errors'); // REGEX to prevent emoji's from entering the wordlist. const EMOJI_REGEX = /(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?(?:\u200d(?:[^\ud800-\udfff]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?)*/; +const escapeRegExp = function(s) { + return String(s).replace(/[\\^$*+?.()|[\]{}]/g, '\\$&'); +}; + /** * The root wordlist object. * @type {Object} @@ -74,7 +78,7 @@ class Wordlist { return [word]; } - return tokenizer.tokenize(word.toLowerCase()); + return tokenizer.tokenize(escapeRegExp(word.toLowerCase())); }) .filter((tokens) => { if (tokens.length === 0) { From 0342b6c826af21510f5c7c373a51131392e17d4d Mon Sep 17 00:00:00 2001 From: Belen Curcio Date: Wed, 23 Aug 2017 10:06:39 -0300 Subject: [PATCH 3/5] rolling back --- services/wordlist.js | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/services/wordlist.js b/services/wordlist.js index 477baace6..2ae8ed90c 100644 --- a/services/wordlist.js +++ b/services/wordlist.js @@ -9,10 +9,6 @@ const Errors = require('../errors'); // REGEX to prevent emoji's from entering the wordlist. const EMOJI_REGEX = /(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?(?:\u200d(?:[^\ud800-\udfff]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff])[\ufe0e\ufe0f]?(?:[\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0]|\ud83c[\udffb-\udfff])?)*/; -const escapeRegExp = function(s) { - return String(s).replace(/[\\^$*+?.()|[\]{}]/g, '\\$&'); -}; - /** * The root wordlist object. * @type {Object} @@ -78,7 +74,7 @@ class Wordlist { return [word]; } - return tokenizer.tokenize(escapeRegExp(word.toLowerCase())); + return tokenizer.tokenize(word.toLowerCase()); }) .filter((tokens) => { if (tokens.length === 0) { From a68513227abd9ec1b5c20dd72ba32d19ce9f1b18 Mon Sep 17 00:00:00 2001 From: Wyatt Johnson Date: Wed, 23 Aug 2017 09:54:00 -0600 Subject: [PATCH 4/5] adjusted tokenizer to use regex based tokenizer instead --- services/wordlist.js | 8 ++++---- test/server/services/wordlist.js | 29 ++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/services/wordlist.js b/services/wordlist.js index 2ae8ed90c..8744de4ba 100644 --- a/services/wordlist.js +++ b/services/wordlist.js @@ -1,8 +1,8 @@ const debug = require('debug')('talk:services:wordlist'); const _ = require('lodash'); -const natural = require('natural'); -const tokenizer = new natural.WordTokenizer(); -const nameTokenizer = new natural.RegexpTokenizer({pattern: /\_/}); +const {RegexpTokenizer} = require('natural'); +const tokenizer = new RegexpTokenizer({pattern: /[\.\s\'\"\?\!]/}); +const nameTokenizer = new RegexpTokenizer({pattern: /\_/}); const SettingsService = require('./settings'); const Errors = require('../errors'); @@ -73,7 +73,7 @@ class Wordlist { if (word.length === 1) { return [word]; } - + return tokenizer.tokenize(word.toLowerCase()); }) .filter((tokens) => { diff --git a/test/server/services/wordlist.js b/test/server/services/wordlist.js index 417844da4..6d0116e26 100644 --- a/test/server/services/wordlist.js +++ b/test/server/services/wordlist.js @@ -10,10 +10,12 @@ describe('services.Wordlist', () => { 'cookies', 'how to do bad things', 'how to do really bad things', - 's h i t' + 's h i t', + '$hit', + 'p**ch', ], suspect: [ - 'do bad things' + 'do bad things', ] }; @@ -26,9 +28,18 @@ describe('services.Wordlist', () => { before(() => wordlist.upsert(wordlists)); - it('has entries', () => { - expect(wordlist.lists.banned).to.not.be.empty; - expect(wordlist.lists.suspect).to.not.be.empty; + it('parses the wordlists correctly', () => { + expect(wordlist.lists.banned).to.deep.equal([ + [ 'cookies' ], + [ 'how', 'to', 'do', 'bad', 'things' ], + [ 'how', 'to', 'do', 'really', 'bad', 'things' ], + [ 's', 'h', 'i', 't' ], + [ '$hit' ], + [ 'p**ch' ], + ]); + expect(wordlist.lists.suspect).to.deep.equal([ + [ 'do', 'bad', 'things' ], + ]); }); }); @@ -57,7 +68,9 @@ describe('services.Wordlist', () => { 'cookies', 'COOKIES.', 'how to do bad things', - 'How To do bad things!' + 'How To do bad things!', + 'This stuff is $hit!', + 'That\'s a p**ch!', ].forEach((word) => { expect(wordlist.match(bannedList, word)).to.be.true; }); @@ -68,7 +81,9 @@ describe('services.Wordlist', () => { 'how to', 'cookie', 'how to be a great person?', - 'how to not do really bad things?' + 'how to not do really bad things?', + 'i have $100 dollars.', + 'I have bad $ hit lling', ].forEach((word) => { expect(wordlist.match(bannedList, word)).to.be.false; }); From 13796fb26863505144b4cf1659ff6cdacf3ddeab Mon Sep 17 00:00:00 2001 From: Wyatt Johnson Date: Thu, 24 Aug 2017 11:18:17 -0600 Subject: [PATCH 5/5] added more test cases --- test/server/services/wordlist.js | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/server/services/wordlist.js b/test/server/services/wordlist.js index 6d0116e26..19545ff3e 100644 --- a/test/server/services/wordlist.js +++ b/test/server/services/wordlist.js @@ -13,6 +13,7 @@ describe('services.Wordlist', () => { 's h i t', '$hit', 'p**ch', + 'p*ch', ], suspect: [ 'do bad things', @@ -36,6 +37,7 @@ describe('services.Wordlist', () => { [ 's', 'h', 'i', 't' ], [ '$hit' ], [ 'p**ch' ], + [ 'p*ch' ], ]); expect(wordlist.lists.suspect).to.deep.equal([ [ 'do', 'bad', 'things' ], @@ -84,6 +86,7 @@ describe('services.Wordlist', () => { 'how to not do really bad things?', 'i have $100 dollars.', 'I have bad $ hit lling', + 'That\'s a p***ch!', ].forEach((word) => { expect(wordlist.match(bannedList, word)).to.be.false; }); @@ -91,6 +94,39 @@ describe('services.Wordlist', () => { }); + describe('#scan', () => { + + it('does match on a bad word', () => { + [ + 'how to do really bad things', + 'what is cookies', + 'cookies', + 'COOKIES.', + 'how to do bad things', + 'How To do bad things!', + 'This stuff is $hit!', + 'That\'s a p**ch!', + ].forEach((word) => { + expect(wordlist.scan('body', word)).to.not.be.undefined; + }); + }); + + it('does not match on a good word', () => { + [ + 'how to', + 'cookie', + 'how to be a great person?', + 'how to not do really bad things?', + 'i have $100 dollars.', + 'I have bad $ hit lling', + 'That\'s a p***ch!', + ].forEach((word) => { + expect(wordlist.scan('body', word)).to.be.undefined; + }); + }); + + }); + describe('#checkName', () => { [ 'flowers',