feat: expanded regexp generation, locale support, caching (#2869)

Co-authored-by: Kim Gardner <kgardnr@gmail.com>
This commit is contained in:
Wyatt Johnson
2020-03-10 16:05:44 +00:00
committed by GitHub
parent 7d967fc93b
commit 45b778c522
15 changed files with 461 additions and 245 deletions
@@ -7,8 +7,13 @@ import CommentContent from "./CommentContent";
it("renders correctly", () => {
const props: PropTypesOf<typeof CommentContent> = {
suspectWords: ["worse"],
bannedWords: ["bad"],
phrases: {
locale: "en-US",
wordList: {
suspect: ["worse"],
banned: ["bad"],
},
},
className: "custom",
children: "Hello <b>Bob</b>, you bad guy",
};
@@ -19,8 +24,13 @@ it("renders correctly", () => {
it("renders empty words correctly", () => {
const props: PropTypesOf<typeof CommentContent> = {
suspectWords: [],
bannedWords: [],
phrases: {
locale: "en-US",
wordList: {
suspect: [],
banned: [],
},
},
className: "custom",
children: "Hello <b>Bob</b>, you bad guy",
};
@@ -1,7 +1,7 @@
import cn from "classnames";
import { memoize } from "lodash";
import React, { FunctionComponent } from "react";
import React, { FunctionComponent, useMemo } from "react";
import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
import { createPurify } from "coral-common/utils/purify";
import styles from "./CommentContent.css";
@@ -14,8 +14,7 @@ const purify = createPurify(window, false);
interface Props {
className?: string;
children: string | React.ReactElement;
suspectWords: ReadonlyArray<string>;
bannedWords: ReadonlyArray<string>;
phrases: GetPhrasesRegExpOptions;
}
function escapeHTML(unsafe: string) {
@@ -27,50 +26,11 @@ function escapeHTML(unsafe: string) {
.replace(/'/g, "&#039;");
}
function escapeRegExp(str: string) {
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
}
// generate a regulare expression that catches the `phrases`.
function generateRegExp(phrases: ReadonlyArray<string>) {
const inner = phrases
.map(phrase =>
phrase
.split(/\s+/)
.map(word => escapeRegExp(word))
.join('[\\s"?!.]+')
)
.join("|");
const pattern = `(^|[^\\w])(${inner})(?=[^\\w]|$)`;
try {
return new RegExp(pattern, "iu");
} catch (_err) {
// IE does not support unicode support, so we'll create one without.
return new RegExp(pattern, "i");
}
}
// Generate a regular expression detecting `suspectWords` and `bannedWords` phrases.
function getPhrasesRegexp(
suspectWords: ReadonlyArray<string>,
bannedWords: ReadonlyArray<string>
) {
return generateRegExp([...suspectWords, ...bannedWords]);
}
// Memoized version as arguments rarely change.
const getPhrasesRegexpMemoized = memoize(getPhrasesRegexp);
// markPhrasesHTML looks for `supsectWords` and `bannedWords` inside `text` and highlights them by returning
// a HTML string.
function markPhrasesHTML(
text: string,
suspectWords: ReadonlyArray<string>,
bannedWords: ReadonlyArray<string>
) {
const regexp = getPhrasesRegexpMemoized(suspectWords, bannedWords);
const tokens = text.split(regexp);
// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
// the settings applied for the locale and highlights them by returning an HTML
// string.
function markPhrasesHTML(text: string, expression: RegExp) {
const tokens = text.split(expression);
if (tokens.length === 1) {
return text;
}
@@ -87,45 +47,42 @@ function markPhrasesHTML(
// markHTMLNode manipulates the node by looking for #text nodes and adding markers
// for `suspectWords` and `bannedWords`.
function markHTMLNode(
parentNode: Node,
suspectWords: ReadonlyArray<string>,
bannedWords: ReadonlyArray<string>
) {
function markHTMLNode(parentNode: Node, expression: RegExp) {
parentNode.childNodes.forEach(node => {
if (node.nodeName === "#text") {
const newContent = markPhrasesHTML(
node.textContent!,
suspectWords,
bannedWords
);
const newContent = markPhrasesHTML(node.textContent!, expression);
if (newContent !== node.textContent) {
const newNode = document.createElement("span");
newNode.innerHTML = newContent;
parentNode.replaceChild(newNode, node);
}
} else {
markHTMLNode(node, suspectWords, bannedWords);
markHTMLNode(node, expression);
}
});
}
const CommentContent: FunctionComponent<Props> = ({
suspectWords,
bannedWords,
phrases,
className,
children,
}) => {
// Cache the expression used via memo. This will reduce duplicate renders of
// this comment content when the children change but the phrase configuration
// does not change. The regExp is already cached on a deeper level
// automatically, this is just lessening that impact further.
const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
if (typeof children === "string") {
// We create a Shadow DOM Tree with the HTML body content and
// use it as a parser.
const node = document.createElement("div");
node.innerHTML = purify.sanitize(children);
if (suspectWords.length || bannedWords.length) {
if (expression) {
// Then we traverse it recursively and manipulate it to highlight suspect words
// and banned words.
markHTMLNode(node, suspectWords, bannedWords);
markHTMLNode(node, expression);
}
// Finally we render the content of the Shadow DOM Tree
@@ -30,10 +30,7 @@ const CommentRevisionContainer: FunctionComponent<Props> = ({
.map(c => (
<div key={c.id}>
<Timestamp>{c.createdAt}</Timestamp>
<CommentContent
suspectWords={settings.wordList.suspect}
bannedWords={settings.wordList.banned}
>
<CommentContent phrases={settings}>
{c.body ? c.body : ""}
</CommentContent>
</div>
@@ -57,6 +54,7 @@ const enhanced = withFragmentContainer<Props>({
`,
settings: graphql`
fragment CommentRevisionContainer_settings on Settings {
locale
wordList {
banned
suspect
@@ -21,8 +21,13 @@ const baseProps: PropTypesOf<typeof ModerateCardN> = {
status: "undecided",
featured: false,
viewContextHref: "http://localhost/comment",
suspectWords: ["suspect"],
bannedWords: ["banned"],
phrases: {
locale: "en-US",
wordList: {
suspect: ["suspect"],
banned: ["banned"],
},
},
siteName: null,
onApprove: noop,
onReject: noop,
@@ -10,6 +10,7 @@ import React, {
} from "react";
import { HOTKEYS } from "coral-admin/constants";
import { GetPhrasesRegExpOptions } from "coral-admin/helpers";
import { PropTypesOf } from "coral-framework/types";
import {
BaseButton,
@@ -48,8 +49,7 @@ interface Props {
featured: boolean;
moderatedBy: React.ReactNode | null;
viewContextHref: string;
suspectWords: ReadonlyArray<string>;
bannedWords: ReadonlyArray<string>;
phrases: GetPhrasesRegExpOptions;
showStory: boolean;
storyTitle?: React.ReactNode;
storyHref?: string;
@@ -87,8 +87,7 @@ const ModerateCard: FunctionComponent<Props> = ({
viewContextHref,
status,
featured,
suspectWords,
bannedWords,
phrases,
onApprove,
onReject,
onFeature,
@@ -219,11 +218,7 @@ const ModerateCard: FunctionComponent<Props> = ({
)}
</div>
<div className={styles.contentArea}>
<CommentContent
suspectWords={suspectWords}
bannedWords={bannedWords}
className={styles.content}
>
<CommentContent phrases={phrases} className={styles.content}>
{commentBody}
</CommentContent>
<div className={styles.viewContext}>
@@ -222,8 +222,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
status={getStatus(comment)}
featured={isFeatured(comment)}
viewContextHref={comment.permalink}
suspectWords={settings.wordList.suspect}
bannedWords={settings.wordList.banned}
phrases={settings}
onApprove={handleApprove}
onReject={handleReject}
onFeature={onFeature}
@@ -319,6 +318,7 @@ const enhanced = withFragmentContainer<Props>({
`,
settings: graphql`
fragment ModerateCardContainer_settings on Settings {
locale
wordList {
banned
suspect
@@ -47,16 +47,19 @@ exports[`renders approved correctly 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
content
@@ -177,16 +180,19 @@ exports[`renders correctly 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
content
@@ -307,16 +313,19 @@ exports[`renders dangling correctly 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
content
@@ -437,16 +446,19 @@ exports[`renders rejected correctly 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
content
@@ -576,16 +588,19 @@ exports[`renders reply correctly 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
content
@@ -706,16 +721,19 @@ exports[`renders story info 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
content
@@ -874,16 +892,19 @@ exports[`renders tombstoned when comment is deleted 1`] = `
className="ModerateCard-contentArea"
>
<CommentContent
bannedWords={
Array [
"banned",
]
}
className="ModerateCard-content"
suspectWords={
Array [
"suspect",
]
phrases={
Object {
"locale": "en-US",
"wordList": Object {
"banned": Array [
"banned",
],
"suspect": Array [
"suspect",
],
},
}
}
>
<Localized
@@ -0,0 +1,64 @@
import { LanguageCode } from "coral-common/helpers";
import { createWordListRegExp } from "coral-common/utils";
/**
 * GetPhrasesRegExpOptions is the input required to build the expression that
 * matches banned and suspect phrases.
 */
export interface GetPhrasesRegExpOptions {
// locale is forwarded (cast to a LanguageCode) to createWordListRegExp.
locale: string;
// wordList holds the phrases that are combined into a single expression.
wordList: {
banned: ReadonlyArray<string>;
suspect: ReadonlyArray<string>;
};
}
/**
 * getPhrasesRegExp builds a single expression matching every banned and
 * suspect phrase for the given locale.
 *
 * @param options the locale and word lists to build the expression from
 * @returns the expression, or `null` when both lists are empty
 */
export function getPhrasesRegExp(options: GetPhrasesRegExpOptions) {
  const { banned, suspect } = options.wordList;

  // Banned phrases come first, followed by the suspect phrases.
  const phrases = [...banned, ...suspect];
  if (phrases.length === 0) {
    // Nothing to look for, so there is no expression to build.
    return null;
  }

  return createWordListRegExp(options.locale as LanguageCode, phrases);
}
// cache is used as a global validator to the cached RegExp used by the
// application. We expect that generally, there is only ever one word list used
// by the client at a time, so this ensures that we only re-create the word list
// if we must.
const cache = {
  keys: {
    locale: "",
    suspect: [] as ReadonlyArray<string>,
    banned: [] as ReadonlyArray<string>,
  },
  value: null as RegExp | null,
};

/**
 * Cached variant of `getPhrasesRegExp`. The expression is only regenerated
 * when the locale string or the identity of either word list array differs
 * from the previous call; otherwise the previously generated value is reused.
 *
 * @param options the locale and word lists to build the expression from
 * @returns the (possibly cached) expression, or `null` when both lists are
 * empty
 */
export default function(options: GetPhrasesRegExpOptions) {
  const {
    locale,
    wordList: { banned, suspect },
  } = options;

  // The lists are compared by reference, not by content, so callers must pass
  // the same array instances to benefit from the cache.
  const expired =
    cache.keys.locale !== locale ||
    cache.keys.banned !== banned ||
    cache.keys.suspect !== suspect;

  if (expired) {
    // Generate the new value *before* touching the cache keys. If generation
    // throws, the cache keeps describing the value it actually holds instead
    // of pairing the new keys with a stale expression.
    const value = getPhrasesRegExp(options);

    cache.keys.locale = locale;
    cache.keys.banned = banned;
    cache.keys.suspect = suspect;
    cache.value = value;
  }

  return cache.value;
}
+4
View File
@@ -1 +1,5 @@
export { default as getQueueConnection } from "./getQueueConnection";
export {
default as getPhrasesRegExp,
GetPhrasesRegExpOptions,
} from "./getPhrasesRegExp";
@@ -0,0 +1,76 @@
import { defaults } from "lodash";
import { LanguageCode } from "coral-common/helpers";
import { DeepPartial } from "coral-common/types";
/**
 * WordListRule holds the regular expression source fragments used by
 * createWordListRegExp to assemble the final pattern.
 */
interface WordListRule {
// split is the fragment placed on both sides of the phrase alternation,
// alongside the start/end of string anchors.
split: string;
// punctuation is the fragment used to rejoin the words of a phrase.
punctuation: string;
// whitespace is the pattern source used to split each phrase into words.
whitespace: string;
}
// DefaultWordListRule supplies the values for any field a language specific
// rule leaves unset.
const DefaultWordListRule: WordListRule = {
split: "[^\\w]",
punctuation: '[\\s"?!.]+',
whitespace: "\\s+",
};
// WordListRules maps a language code to its (possibly partial) rule overrides.
const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
"en-US": DefaultWordListRule,
};
/**
 * Prefix every regular expression metacharacter in the input with a backslash
 * so the result matches the input literally when embedded in a pattern.
 *
 * @param str the raw text to neutralise
 */
function escapeRegExp(str: string) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metacharacters, match => `\\${match}`);
}
/**
 * createWordListRegExp will generate the tester that can be used to test
 * strings for matches on phrases.
 *
 * The generated pattern has three capture groups: the leading delimiter (or
 * start of string), the matched phrase, and the trailing delimiter (or end of
 * string).
 *
 * Phrases are split into words on the rule's whitespace pattern. Empty words
 * (produced by leading/trailing whitespace) and phrases that contain no words
 * at all are dropped, because they would otherwise require stray punctuation
 * around the phrase or match bare punctuation on their own. When no usable
 * phrase remains, an expression that never matches is returned.
 *
 * @param lang the language to possibly swap word list rules
 * @param phrases the phrases to use for creating the expression
 * @returns a case-insensitive expression matching any of the phrases
 */
export default function createWordListRegExp(
  lang: LanguageCode,
  phrases: string[]
) {
  // Get the rule list for this language, fallback to english if we haven't
  // provided any overrides.
  const rule: WordListRule = defaults(
    WordListRules[lang] || {},
    DefaultWordListRule
  );

  const whitespace = new RegExp(rule.whitespace);

  // Split up the words from the list into a regex escaped string.
  const words = phrases
    .map(phrase =>
      phrase
        // Split each phrase by whitespace.
        .split(whitespace)
        // Leading or trailing whitespace yields empty words, drop them so they
        // do not turn into mandatory punctuation around the phrase.
        .filter(word => word.length > 0)
        // Escape each word, we don't expect any of them to contain regex.
        .map(word => escapeRegExp(word))
        // Rejoin to ensure that any variation of the phrase separated by a
        // punctuation character should also be caught.
        .join(rule.punctuation)
    )
    // A phrase made only of whitespace has no words left, skip it.
    .filter(phrase => phrase.length > 0)
    // Separate each of the phrases with a `|` (OR).
    .join("|");

  if (words.length === 0) {
    // An empty alternation would match almost any input. An empty negative
    // lookahead can never succeed, so this expression matches nothing.
    return new RegExp("(?!)");
  }

  // Wrap the pattern in split rules.
  const pattern = `(^|${rule.split})(${words})($|${rule.split})`;

  try {
    return new RegExp(pattern, "iu");
  } catch {
    // IE does not support the unicode flag, so we'll create one without.
    return new RegExp(pattern, "i");
  }
}
+1
View File
@@ -12,3 +12,4 @@ export { default as isPromiseLike } from "./isPromiseLike";
export { default as isPromise } from "./isPromise";
export { default as startsWith } from "./startsWith";
export { default as getOrigin } from "./getOrigin";
export { default as createWordListRegExp } from "./createWordListRegExp";
@@ -1,18 +1,24 @@
import {
GQLCOMMENT_FLAG_REASON,
GQLCOMMENT_STATUS,
} from "coral-server/graph/schema/__generated__/types";
import { ACTION_TYPE } from "coral-server/models/action/comment";
import {
IntermediateModerationPhase,
IntermediatePhaseResult,
} from "coral-server/services/comments/pipeline";
import { containsMatchingPhraseMemoized } from "coral-server/services/comments/pipeline/wordList";
import {
GQLCOMMENT_FLAG_REASON,
GQLCOMMENT_STATUS,
} from "coral-server/graph/schema/__generated__/types";
import { WordList } from "../wordList";
// Create a new wordlist instance to use.
const list = new WordList();
// This phase checks the comment against the wordList.
export const wordList: IntermediateModerationPhase = ({
tenant,
comment,
htmlStripped,
}): IntermediatePhaseResult | void => {
// If there isn't a body, there can't be a bad word!
if (!comment.body) {
@@ -23,7 +29,7 @@ export const wordList: IntermediateModerationPhase = ({
// has pre-mod enabled or not. If the comment was rejected based on the
// wordList, then reject it, otherwise if the moderation setting is
// premod, set it to `premod`.
if (containsMatchingPhraseMemoized(tenant.wordList.banned, comment.body)) {
if (list.test(tenant, "banned", htmlStripped)) {
// Add the flag related to Trust to the comment.
return {
status: GQLCOMMENT_STATUS.REJECTED,
@@ -43,7 +49,7 @@ export const wordList: IntermediateModerationPhase = ({
// If the wordList has matched the suspect word filter and we haven't disabled
// auto-flagging suspect words, then we should flag the comment!
if (containsMatchingPhraseMemoized(tenant.wordList.suspect, comment.body)) {
if (list.test(tenant, "suspect", htmlStripped)) {
return {
actions: [
{
@@ -1,66 +1,82 @@
import {
containsMatchingPhrase,
containsMatchingPhraseMemoized,
Options,
WordList,
} from "coral-server/services/comments/pipeline/wordList";
const phrases = [
"cookies",
"how to do bad things",
"how to do really bad things",
"s h i t",
"$hit",
"p**ch",
"p*ch",
];
describe("en-US", () => {
const list = new WordList();
const options: Options = {
id: "tenant_1",
locale: "en-US",
wordList: {
banned: [
"cookies",
"how to do bad things",
"how to do really bad things",
"s h i t",
"$hit",
"p**ch",
"p*ch",
"banned",
"ban",
],
suspect: [],
},
};
describe("containsMatchingPhrase", () => {
it("does match on a word in the list", () => {
[
"how to do really bad things",
"what is cookies",
"cookies",
"COOKIES.",
"how to do bad things",
"How To do bad things!",
"This stuff is $hit!",
"That's a p**ch!",
].forEach(word => {
expect(containsMatchingPhrase(phrases, word)).toEqual(true);
describe("containsMatchingPhrase", () => {
it("does match on a word in the list", () => {
[
"how to do really bad things",
"what is cookies",
"cookies",
"COOKIES.",
"how to do bad things",
"How To do bad things!",
"How.To.do.bad.things!",
"This stuff is $hit!",
"This is a test.\nTo see if cookies are found, in the second line.",
"That's a p**ch!",
"Banned words should be detected",
].forEach(word => {
expect(list.test(options, "banned", word)).toEqual(true);
});
});
it("does not match on a word not in the list", () => {
[
"how to",
"cookie",
"how to be a great person?",
"how to not do really bad things?",
"i have $100 dollars.",
"I have bad $ hit lling",
"That's a p***ch!",
"When bann is spelt wrong, it won't be caught.",
].forEach(word => {
expect(list.test(options, "banned", word)).toEqual(false);
});
});
it("allows an empty list", () => {
expect(list.test(options, "banned", "test")).toEqual(false);
});
});
it("does not match on a word not in the list", () => {
[
"how to",
"cookie",
"how to be a great person?",
"how to not do really bad things?",
"i have $100 dollars.",
"I have bad $ hit lling",
"That's a p***ch!",
].forEach(word => {
expect(containsMatchingPhrase(phrases, word)).toEqual(false);
});
});
it("allows an empty list", () => {
expect(containsMatchingPhrase([], "test")).toEqual(false);
});
});
describe("containsMatchingPhraseMemoized", () => {
it("return true for all cases after memoizing the first result", () => {
[
"cookies 1",
"cookies 2",
"cookies 4",
"cookies 5",
"this is for cookies 6",
"this is for cookies 7",
"this is for cookies 8",
"this is for cookies 9",
].forEach(word => {
expect(containsMatchingPhraseMemoized(phrases, word)).toEqual(true);
describe("containsMatchingPhraseMemoized", () => {
it("return true for all cases after memoizing the first result", () => {
[
"cookies 1",
"cookies 2",
"cookies 4",
"cookies 5",
"this is for cookies 6",
"this is for cookies 7",
"this is for cookies 8",
"this is for cookies 9",
].forEach(word => {
expect(list.test(options, "banned", word)).toEqual(true);
});
});
});
});
@@ -1,37 +1,100 @@
import { memoize } from "lodash";
import ms from "ms";
import now from "performance-now";
// Replace `memoize.Cache`.
memoize.Cache = WeakMap;
import { LanguageCode } from "coral-common/helpers";
import { createWordListRegExp } from "coral-common/utils";
import logger from "coral-server/logger";
import { Tenant } from "coral-server/models/tenant";
/**
* Escape string for special regular expression characters.
*/
export function escapeRegExp(str: string) {
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
interface Lists {
banned: RegExp | false;
suspect: RegExp | false;
}
/**
* Generate a regular expression that catches the `phrases`.
*/
export function generateRegExp(phrases: string[]) {
const inner = phrases
.map(phrase =>
phrase
.split(/\s+/)
.map(word => escapeRegExp(word))
.join('[\\s"?!.]+')
)
.join("|");
return new RegExp(`(^|[^\\w])(${inner})(?=[^\\w]|$)`, "miu");
export type Options = Pick<Tenant, "id" | "locale" | "wordList">;
/**
 * WordList tests strings against the banned and suspect phrase lists, caching
 * the generated testers per options object.
 */
export class WordList {
  // Generated testers, keyed by the options object they were built from.
  private readonly cache = new WeakMap<Options, Lists>();

  /**
   * generate builds the tester for one list of phrases.
   *
   * @param locale the locale handed to createWordListRegExp
   * @param list the phrases to build the tester from
   * @returns `false` when the list has no entries, otherwise the expression
   */
  private generate(locale: LanguageCode, list: string[]) {
    // An empty list can never match, so no expression is needed.
    return list.length === 0 ? false : createWordListRegExp(locale, list);
  }

  /**
   * create builds the testers for both the banned and suspect lists.
   *
   * @param options options used to generate Lists
   */
  private create(options: Options): Lists {
    const { locale, wordList } = options;
    const banned = this.generate(locale, wordList.banned);
    const suspect = this.generate(locale, wordList.suspect);

    return { banned, suspect };
  }

  /**
   * lists returns the testers for the provided options, building them when
   * they are not cached or when caching was not requested.
   *
   * @param options the options object that is also used as the cache key
   * @param cache when false, the cache is neither read nor written
   */
  private lists(options: Options, cache: boolean): Lists {
    if (!cache) {
      // Caller opted out of caching, always build a fresh set.
      return this.create(options);
    }

    const cached = this.cache.get(options);
    if (cached) {
      return cached;
    }

    // Nothing cached for this options object yet, build it and record how long
    // the generation took.
    const startedAt = now();
    const created = this.create(options);
    logger.info(
      { tenantID: options.id, took: ms(now() - startedAt) },
      "regenerated word list cache"
    );
    this.cache.set(options, created);

    return created;
  }

  /**
   * test will test the string against the selected list. The generated lists
   * are cached and re-used on subsequent calls.
   *
   * @param options the options object that is also used as the cache key
   * @param listName the list to test against
   * @param testString the string to test to see if they match anything on the
   * list
   * @param cache when true, will re-use the cached testers based on the lists
   * @returns true when the string matches the selected list, false when it
   * does not or when the selected list is empty
   */
  public test(
    options: Options,
    listName: keyof Lists,
    testString: string,
    cache = true
  ): boolean {
    const tester = this.lists(options, cache)[listName];
    if (!tester) {
      // The selected list is empty, nothing can match.
      return false;
    }

    const startedAt = now();
    const matched = tester.test(testString);
    logger.debug(
      { tenantID: options.id, took: ms(now() - startedAt) },
      "word list phrase test complete"
    );

    return matched;
  }
}
export const generateRegExpMemoized = memoize(generateRegExp);
export const containsMatchingPhrase = (phrases: string[], testString: string) =>
phrases.length > 0 ? generateRegExp(phrases).test(testString) : false;
export const containsMatchingPhraseMemoized = (
phrases: string[],
testString: string
) =>
phrases.length > 0 ? generateRegExpMemoized(phrases).test(testString) : false;
+3 -3
View File
@@ -1,5 +1,5 @@
import { Redis } from "ioredis";
import { isUndefined } from "lodash";
import { isUndefined, lowerCase, uniqBy } from "lodash";
import { DateTime } from "luxon";
import { Db } from "mongodb";
import { URL } from "url";
@@ -45,11 +45,11 @@ function cleanWordList(
list: GQLSettingsWordListInput
): GQLSettingsWordListInput {
if (list.banned) {
list.banned = list.banned.filter(Boolean);
list.banned = uniqBy(list.banned.filter(Boolean), lowerCase) as string[];
}
if (list.suspect) {
list.suspect = list.suspect.filter(Boolean);
list.suspect = uniqBy(list.suspect.filter(Boolean), lowerCase) as string[];
}
return list;