mirror of
https://github.com/wassname/talk.git
synced 2026-06-27 19:17:09 +08:00
feat: expanded regexp generation, locale support, caching (#2869)
Co-authored-by: Kim Gardner <kgardnr@gmail.com>
This commit is contained in:
@@ -7,8 +7,13 @@ import CommentContent from "./CommentContent";
|
||||
|
||||
it("renders correctly", () => {
|
||||
const props: PropTypesOf<typeof CommentContent> = {
|
||||
suspectWords: ["worse"],
|
||||
bannedWords: ["bad"],
|
||||
phrases: {
|
||||
locale: "en-US",
|
||||
wordList: {
|
||||
suspect: ["worse"],
|
||||
banned: ["bad"],
|
||||
},
|
||||
},
|
||||
className: "custom",
|
||||
children: "Hello <b>Bob</b>, you bad guy",
|
||||
};
|
||||
@@ -19,8 +24,13 @@ it("renders correctly", () => {
|
||||
|
||||
it("renders empty words correctly", () => {
|
||||
const props: PropTypesOf<typeof CommentContent> = {
|
||||
suspectWords: [],
|
||||
bannedWords: [],
|
||||
phrases: {
|
||||
locale: "en-US",
|
||||
wordList: {
|
||||
suspect: [],
|
||||
banned: [],
|
||||
},
|
||||
},
|
||||
className: "custom",
|
||||
children: "Hello <b>Bob</b>, you bad guy",
|
||||
};
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import cn from "classnames";
|
||||
import { memoize } from "lodash";
|
||||
import React, { FunctionComponent } from "react";
|
||||
import React, { FunctionComponent, useMemo } from "react";
|
||||
|
||||
import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
|
||||
import { createPurify } from "coral-common/utils/purify";
|
||||
|
||||
import styles from "./CommentContent.css";
|
||||
@@ -14,8 +14,7 @@ const purify = createPurify(window, false);
|
||||
interface Props {
|
||||
className?: string;
|
||||
children: string | React.ReactElement;
|
||||
suspectWords: ReadonlyArray<string>;
|
||||
bannedWords: ReadonlyArray<string>;
|
||||
phrases: GetPhrasesRegExpOptions;
|
||||
}
|
||||
|
||||
function escapeHTML(unsafe: string) {
|
||||
@@ -27,50 +26,11 @@ function escapeHTML(unsafe: string) {
|
||||
.replace(/'/g, "'");
|
||||
}
|
||||
|
||||
function escapeRegExp(str: string) {
|
||||
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
|
||||
}
|
||||
|
||||
// Generate a regular expression that catches the `phrases`.
|
||||
function generateRegExp(phrases: ReadonlyArray<string>) {
|
||||
const inner = phrases
|
||||
.map(phrase =>
|
||||
phrase
|
||||
.split(/\s+/)
|
||||
.map(word => escapeRegExp(word))
|
||||
.join('[\\s"?!.]+')
|
||||
)
|
||||
.join("|");
|
||||
|
||||
const pattern = `(^|[^\\w])(${inner})(?=[^\\w]|$)`;
|
||||
try {
|
||||
return new RegExp(pattern, "iu");
|
||||
} catch (_err) {
|
||||
// IE does not support unicode support, so we'll create one without.
|
||||
return new RegExp(pattern, "i");
|
||||
}
|
||||
}
|
||||
|
||||
// Generate a regular expression detecting `suspectWords` and `bannedWords` phrases.
|
||||
function getPhrasesRegexp(
|
||||
suspectWords: ReadonlyArray<string>,
|
||||
bannedWords: ReadonlyArray<string>
|
||||
) {
|
||||
return generateRegExp([...suspectWords, ...bannedWords]);
|
||||
}
|
||||
|
||||
// Memoized version as arguments rarely change.
|
||||
const getPhrasesRegexpMemoized = memoize(getPhrasesRegexp);
|
||||
|
||||
// markPhrasesHTML looks for `suspectWords` and `bannedWords` inside `text` and highlights them by returning
|
||||
// a HTML string.
|
||||
function markPhrasesHTML(
|
||||
text: string,
|
||||
suspectWords: ReadonlyArray<string>,
|
||||
bannedWords: ReadonlyArray<string>
|
||||
) {
|
||||
const regexp = getPhrasesRegexpMemoized(suspectWords, bannedWords);
|
||||
const tokens = text.split(regexp);
|
||||
// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
|
||||
// the settings applied for the locale and highlights them by returning an HTML
|
||||
// string.
|
||||
function markPhrasesHTML(text: string, expression: RegExp) {
|
||||
const tokens = text.split(expression);
|
||||
if (tokens.length === 1) {
|
||||
return text;
|
||||
}
|
||||
@@ -87,45 +47,42 @@ function markPhrasesHTML(
|
||||
|
||||
// markHTMLNode manipulates the node by looking for #text nodes and adding markers
|
||||
// for `suspectWords` and `bannedWords`.
|
||||
function markHTMLNode(
|
||||
parentNode: Node,
|
||||
suspectWords: ReadonlyArray<string>,
|
||||
bannedWords: ReadonlyArray<string>
|
||||
) {
|
||||
function markHTMLNode(parentNode: Node, expression: RegExp) {
|
||||
parentNode.childNodes.forEach(node => {
|
||||
if (node.nodeName === "#text") {
|
||||
const newContent = markPhrasesHTML(
|
||||
node.textContent!,
|
||||
suspectWords,
|
||||
bannedWords
|
||||
);
|
||||
const newContent = markPhrasesHTML(node.textContent!, expression);
|
||||
if (newContent !== node.textContent) {
|
||||
const newNode = document.createElement("span");
|
||||
newNode.innerHTML = newContent;
|
||||
parentNode.replaceChild(newNode, node);
|
||||
}
|
||||
} else {
|
||||
markHTMLNode(node, suspectWords, bannedWords);
|
||||
markHTMLNode(node, expression);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
const CommentContent: FunctionComponent<Props> = ({
|
||||
suspectWords,
|
||||
bannedWords,
|
||||
phrases,
|
||||
className,
|
||||
children,
|
||||
}) => {
|
||||
// Cache the expression used via memo. This will reduce duplicate renders of
|
||||
// this comment content when the children change but the phrase configuration
|
||||
// does not change. The regExp is already cached on a deeper level
|
||||
// automatically, this is just lessening that impact further.
|
||||
const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
|
||||
|
||||
if (typeof children === "string") {
|
||||
// We create a Shadow DOM Tree with the HTML body content and
|
||||
// use it as a parser.
|
||||
const node = document.createElement("div");
|
||||
node.innerHTML = purify.sanitize(children);
|
||||
|
||||
if (suspectWords.length || bannedWords.length) {
|
||||
if (expression) {
|
||||
// Then we traverse it recursively and manipulate it to highlight suspect words
|
||||
// and banned words.
|
||||
markHTMLNode(node, suspectWords, bannedWords);
|
||||
markHTMLNode(node, expression);
|
||||
}
|
||||
|
||||
// Finally we render the content of the Shadow DOM Tree
|
||||
|
||||
@@ -30,10 +30,7 @@ const CommentRevisionContainer: FunctionComponent<Props> = ({
|
||||
.map(c => (
|
||||
<div key={c.id}>
|
||||
<Timestamp>{c.createdAt}</Timestamp>
|
||||
<CommentContent
|
||||
suspectWords={settings.wordList.suspect}
|
||||
bannedWords={settings.wordList.banned}
|
||||
>
|
||||
<CommentContent phrases={settings}>
|
||||
{c.body ? c.body : ""}
|
||||
</CommentContent>
|
||||
</div>
|
||||
@@ -57,6 +54,7 @@ const enhanced = withFragmentContainer<Props>({
|
||||
`,
|
||||
settings: graphql`
|
||||
fragment CommentRevisionContainer_settings on Settings {
|
||||
locale
|
||||
wordList {
|
||||
banned
|
||||
suspect
|
||||
|
||||
@@ -21,8 +21,13 @@ const baseProps: PropTypesOf<typeof ModerateCardN> = {
|
||||
status: "undecided",
|
||||
featured: false,
|
||||
viewContextHref: "http://localhost/comment",
|
||||
suspectWords: ["suspect"],
|
||||
bannedWords: ["banned"],
|
||||
phrases: {
|
||||
locale: "en-US",
|
||||
wordList: {
|
||||
suspect: ["suspect"],
|
||||
banned: ["banned"],
|
||||
},
|
||||
},
|
||||
siteName: null,
|
||||
onApprove: noop,
|
||||
onReject: noop,
|
||||
|
||||
@@ -10,6 +10,7 @@ import React, {
|
||||
} from "react";
|
||||
|
||||
import { HOTKEYS } from "coral-admin/constants";
|
||||
import { GetPhrasesRegExpOptions } from "coral-admin/helpers";
|
||||
import { PropTypesOf } from "coral-framework/types";
|
||||
import {
|
||||
BaseButton,
|
||||
@@ -48,8 +49,7 @@ interface Props {
|
||||
featured: boolean;
|
||||
moderatedBy: React.ReactNode | null;
|
||||
viewContextHref: string;
|
||||
suspectWords: ReadonlyArray<string>;
|
||||
bannedWords: ReadonlyArray<string>;
|
||||
phrases: GetPhrasesRegExpOptions;
|
||||
showStory: boolean;
|
||||
storyTitle?: React.ReactNode;
|
||||
storyHref?: string;
|
||||
@@ -87,8 +87,7 @@ const ModerateCard: FunctionComponent<Props> = ({
|
||||
viewContextHref,
|
||||
status,
|
||||
featured,
|
||||
suspectWords,
|
||||
bannedWords,
|
||||
phrases,
|
||||
onApprove,
|
||||
onReject,
|
||||
onFeature,
|
||||
@@ -219,11 +218,7 @@ const ModerateCard: FunctionComponent<Props> = ({
|
||||
)}
|
||||
</div>
|
||||
<div className={styles.contentArea}>
|
||||
<CommentContent
|
||||
suspectWords={suspectWords}
|
||||
bannedWords={bannedWords}
|
||||
className={styles.content}
|
||||
>
|
||||
<CommentContent phrases={phrases} className={styles.content}>
|
||||
{commentBody}
|
||||
</CommentContent>
|
||||
<div className={styles.viewContext}>
|
||||
|
||||
@@ -222,8 +222,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
|
||||
status={getStatus(comment)}
|
||||
featured={isFeatured(comment)}
|
||||
viewContextHref={comment.permalink}
|
||||
suspectWords={settings.wordList.suspect}
|
||||
bannedWords={settings.wordList.banned}
|
||||
phrases={settings}
|
||||
onApprove={handleApprove}
|
||||
onReject={handleReject}
|
||||
onFeature={onFeature}
|
||||
@@ -319,6 +318,7 @@ const enhanced = withFragmentContainer<Props>({
|
||||
`,
|
||||
settings: graphql`
|
||||
fragment ModerateCardContainer_settings on Settings {
|
||||
locale
|
||||
wordList {
|
||||
banned
|
||||
suspect
|
||||
|
||||
+84
-63
@@ -47,16 +47,19 @@ exports[`renders approved correctly 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
content
|
||||
@@ -177,16 +180,19 @@ exports[`renders correctly 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
content
|
||||
@@ -307,16 +313,19 @@ exports[`renders dangling correctly 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
content
|
||||
@@ -437,16 +446,19 @@ exports[`renders rejected correctly 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
content
|
||||
@@ -576,16 +588,19 @@ exports[`renders reply correctly 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
content
|
||||
@@ -706,16 +721,19 @@ exports[`renders story info 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
content
|
||||
@@ -874,16 +892,19 @@ exports[`renders tombstoned when comment is deleted 1`] = `
|
||||
className="ModerateCard-contentArea"
|
||||
>
|
||||
<CommentContent
|
||||
bannedWords={
|
||||
Array [
|
||||
"banned",
|
||||
]
|
||||
}
|
||||
className="ModerateCard-content"
|
||||
suspectWords={
|
||||
Array [
|
||||
"suspect",
|
||||
]
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
"wordList": Object {
|
||||
"banned": Array [
|
||||
"banned",
|
||||
],
|
||||
"suspect": Array [
|
||||
"suspect",
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
>
|
||||
<Localized
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
import { LanguageCode } from "coral-common/helpers";
|
||||
import { createWordListRegExp } from "coral-common/utils";
|
||||
|
||||
/**
 * GetPhrasesRegExpOptions describes the settings needed to build the phrase
 * matching expression: the locale and the two word lists.
 */
export interface GetPhrasesRegExpOptions {
  /** Locale identifier, e.g. "en-US". */
  locale: string;

  /** Word lists whose entries should be matched. */
  wordList: {
    /** Phrases on the banned list. */
    banned: ReadonlyArray<string>;

    /** Phrases on the suspect list. */
    suspect: ReadonlyArray<string>;
  };
}
|
||||
|
||||
/**
 * getPhrasesRegExp builds a single expression covering both the banned and
 * the suspect phrases for the given locale.
 *
 * @param options the locale and word lists to build the expression from
 * @returns the generated expression, or null when both lists are empty
 */
export function getPhrasesRegExp(options: GetPhrasesRegExpOptions) {
  const { banned, suspect } = options.wordList;

  // Banned phrases come first, followed by the suspect ones.
  const phrases = [...banned, ...suspect];
  if (phrases.length === 0) {
    // Nothing to match against, so there is no expression to create.
    return null;
  }

  return createWordListRegExp(options.locale as LanguageCode, phrases);
}
|
||||
|
||||
// PhrasesRegExpCache is the shape of the module-level cache: the inputs that
// produced the cached expression, and the expression itself.
interface PhrasesRegExpCache {
  keys: {
    locale: string;
    suspect: ReadonlyArray<string>;
    banned: ReadonlyArray<string>;
  };
  value: RegExp | null;
}

// cache holds the most recently generated RegExp together with the locale and
// word lists it was generated from. We expect that generally, there is only
// ever one word list used by the client at a time, so remembering a single
// entry ensures that we only re-create the expression if we must.
const cache: PhrasesRegExpCache = {
  keys: {
    locale: "",
    suspect: [],
    banned: [],
  },
  value: null,
};
|
||||
|
||||
/**
 * Returns the RegExp for the given phrase options, re-using the module-level
 * cached expression for as long as the locale and both word list arrays are
 * unchanged.
 *
 * The word lists are compared by reference (`!==`), not by content, so a new
 * array holding identical entries still triggers a regeneration.
 *
 * @param options the locale and word lists to build the expression from
 * @returns the cached or freshly generated value of `getPhrasesRegExp`
 */
export default function(options: GetPhrasesRegExpOptions) {
  const { keys } = cache;
  const {
    locale,
    wordList: { banned, suspect },
  } = options;

  // The cache is valid only when every key matches the incoming options.
  const expired =
    keys.locale !== locale ||
    keys.banned !== banned ||
    keys.suspect !== suspect;

  if (expired) {
    // Regenerate first and record the new keys afterwards. If generation
    // throws, the keys still describe the value that is actually cached, so
    // a later call with the same options retries instead of returning the
    // expression that belongs to the previous word list.
    cache.value = getPhrasesRegExp(options);

    keys.locale = locale;
    keys.banned = banned;
    keys.suspect = suspect;
  }

  return cache.value;
}
|
||||
@@ -1 +1,5 @@
|
||||
export { default as getQueueConnection } from "./getQueueConnection";
|
||||
export {
|
||||
default as getPhrasesRegExp,
|
||||
GetPhrasesRegExpOptions,
|
||||
} from "./getPhrasesRegExp";
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
import { defaults } from "lodash";
|
||||
|
||||
import { LanguageCode } from "coral-common/helpers";
|
||||
import { DeepPartial } from "coral-common/types";
|
||||
|
||||
/**
 * WordListRule holds the regular expression source fragments used to turn a
 * list of phrases into a single matching expression.
 */
interface WordListRule {
  /** Fragment wrapped around the alternatives to delimit a match. */
  split: string;

  /** Fragment placed between the words of a multi-word phrase. */
  punctuation: string;

  /** Fragment used to split a phrase into its individual words. */
  whitespace: string;
}

/**
 * DefaultWordListRule is the rule applied when a language does not provide
 * its own fragments.
 */
const DefaultWordListRule: WordListRule = {
  split: "[^\\w]",
  punctuation: '[\\s"?!.]+',
  whitespace: "\\s+",
};
|
||||
|
||||
/**
 * WordListRules maps a language code to its (possibly partial) rule
 * overrides. Only "en-US" is listed, and it uses the default rule as-is.
 */
const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
  "en-US": DefaultWordListRule,
};
|
||||
|
||||
/**
 * Escape every character of the string that has a special meaning inside a
 * regular expression by prefixing it with a backslash.
 *
 * @param str the string to escape from regex characters
 */
function escapeRegExp(str: string) {
  const special = /[.*+?^${}()|[\]\\]/g;

  // Each special character is replaced by itself preceded by a backslash.
  return str.replace(special, match => `\\${match}`);
}
|
||||
|
||||
/**
 * createWordListRegExp will generate the tester that can be used to test
 * strings for matches on phrases.
 *
 * The generated expression is case-insensitive and, for a non-empty list, has
 * three capture groups: the leading boundary, the matched phrase and the
 * trailing boundary. Phrases that are empty or whitespace-only are ignored;
 * when no usable phrase remains, an expression that never matches is
 * returned.
 *
 * @param lang the language to possibly swap word list rules
 * @param phrases the phrases to use for creating the expression
 */
export default function createWordListRegExp(
  lang: LanguageCode,
  phrases: ReadonlyArray<string>
) {
  // Get the rule list for this language, fallback to the default rule for
  // anything the language does not override. lodash `defaults` mutates its
  // first argument, so merge into a fresh object instead of the shared
  // `WordListRules` entry.
  const rule: WordListRule = defaults(
    {},
    WordListRules[lang],
    DefaultWordListRule
  );

  const whitespace = new RegExp(rule.whitespace);

  // Split up the words from the list into a regex escaped string.
  const words = phrases
    .map(phrase =>
      phrase
        // Split each phrase by whitespace.
        .split(whitespace)
        // Leading or trailing whitespace produces empty segments. Drop them,
        // otherwise the join below would demand punctuation around the
        // phrase (or, for a blank phrase, match punctuation on its own).
        .filter(word => word.length > 0)
        // Escape each word, we don't expect any of them to contain regex.
        .map(word => escapeRegExp(word))
        // Rejoin to ensure that any variation of the phrase separated by a
        // punctuation character should also be caught.
        .join(rule.punctuation)
    )
    // Blank phrases collapse to an empty alternative, which would match
    // nothing-between-boundaries; leave them out.
    .filter(alternative => alternative.length > 0)
    // Combine the alternatives with `|` (OR).
    .join("|");

  // Wrap the alternatives in the split rules. With no alternatives left, use
  // an empty negative lookahead, which can never match.
  const pattern =
    words.length > 0
      ? `(^|${rule.split})(${words})($|${rule.split})`
      : "(?!)";

  try {
    return new RegExp(pattern, "iu");
  } catch {
    // IE does not support the unicode flag, so we'll create one without it.
    return new RegExp(pattern, "i");
  }
}
|
||||
@@ -12,3 +12,4 @@ export { default as isPromiseLike } from "./isPromiseLike";
|
||||
export { default as isPromise } from "./isPromise";
|
||||
export { default as startsWith } from "./startsWith";
|
||||
export { default as getOrigin } from "./getOrigin";
|
||||
export { default as createWordListRegExp } from "./createWordListRegExp";
|
||||
|
||||
@@ -1,18 +1,24 @@
|
||||
import {
|
||||
GQLCOMMENT_FLAG_REASON,
|
||||
GQLCOMMENT_STATUS,
|
||||
} from "coral-server/graph/schema/__generated__/types";
|
||||
import { ACTION_TYPE } from "coral-server/models/action/comment";
|
||||
import {
|
||||
IntermediateModerationPhase,
|
||||
IntermediatePhaseResult,
|
||||
} from "coral-server/services/comments/pipeline";
|
||||
import { containsMatchingPhraseMemoized } from "coral-server/services/comments/pipeline/wordList";
|
||||
|
||||
import {
|
||||
GQLCOMMENT_FLAG_REASON,
|
||||
GQLCOMMENT_STATUS,
|
||||
} from "coral-server/graph/schema/__generated__/types";
|
||||
|
||||
import { WordList } from "../wordList";
|
||||
|
||||
// Create a new wordlist instance to use.
|
||||
const list = new WordList();
|
||||
|
||||
// This phase checks the comment against the wordList.
|
||||
export const wordList: IntermediateModerationPhase = ({
|
||||
tenant,
|
||||
comment,
|
||||
htmlStripped,
|
||||
}): IntermediatePhaseResult | void => {
|
||||
// If there isn't a body, there can't be a bad word!
|
||||
if (!comment.body) {
|
||||
@@ -23,7 +29,7 @@ export const wordList: IntermediateModerationPhase = ({
|
||||
// has pre-mod enabled or not. If the comment was rejected based on the
|
||||
// wordList, then reject it, otherwise if the moderation setting is
|
||||
// premod, set it to `premod`.
|
||||
if (containsMatchingPhraseMemoized(tenant.wordList.banned, comment.body)) {
|
||||
if (list.test(tenant, "banned", htmlStripped)) {
|
||||
// Add the flag related to Trust to the comment.
|
||||
return {
|
||||
status: GQLCOMMENT_STATUS.REJECTED,
|
||||
@@ -43,7 +49,7 @@ export const wordList: IntermediateModerationPhase = ({
|
||||
|
||||
// If the wordList has matched the suspect word filter and we haven't disabled
|
||||
// auto-flagging suspect words, then we should flag the comment!
|
||||
if (containsMatchingPhraseMemoized(tenant.wordList.suspect, comment.body)) {
|
||||
if (list.test(tenant, "suspect", htmlStripped)) {
|
||||
return {
|
||||
actions: [
|
||||
{
|
||||
|
||||
@@ -1,66 +1,82 @@
|
||||
import {
|
||||
containsMatchingPhrase,
|
||||
containsMatchingPhraseMemoized,
|
||||
Options,
|
||||
WordList,
|
||||
} from "coral-server/services/comments/pipeline/wordList";
|
||||
|
||||
const phrases = [
|
||||
"cookies",
|
||||
"how to do bad things",
|
||||
"how to do really bad things",
|
||||
"s h i t",
|
||||
"$hit",
|
||||
"p**ch",
|
||||
"p*ch",
|
||||
];
|
||||
describe("en-US", () => {
|
||||
const list = new WordList();
|
||||
const options: Options = {
|
||||
id: "tenant_1",
|
||||
locale: "en-US",
|
||||
wordList: {
|
||||
banned: [
|
||||
"cookies",
|
||||
"how to do bad things",
|
||||
"how to do really bad things",
|
||||
"s h i t",
|
||||
"$hit",
|
||||
"p**ch",
|
||||
"p*ch",
|
||||
"banned",
|
||||
"ban",
|
||||
],
|
||||
suspect: [],
|
||||
},
|
||||
};
|
||||
|
||||
describe("containsMatchingPhrase", () => {
|
||||
it("does match on a word in the list", () => {
|
||||
[
|
||||
"how to do really bad things",
|
||||
"what is cookies",
|
||||
"cookies",
|
||||
"COOKIES.",
|
||||
"how to do bad things",
|
||||
"How To do bad things!",
|
||||
"This stuff is $hit!",
|
||||
"That's a p**ch!",
|
||||
].forEach(word => {
|
||||
expect(containsMatchingPhrase(phrases, word)).toEqual(true);
|
||||
describe("containsMatchingPhrase", () => {
|
||||
it("does match on a word in the list", () => {
|
||||
[
|
||||
"how to do really bad things",
|
||||
"what is cookies",
|
||||
"cookies",
|
||||
"COOKIES.",
|
||||
"how to do bad things",
|
||||
"How To do bad things!",
|
||||
"How.To.do.bad.things!",
|
||||
"This stuff is $hit!",
|
||||
"This is a test.\nTo see if cookies are found, in the second line.",
|
||||
"That's a p**ch!",
|
||||
"Banned words should be detected",
|
||||
].forEach(word => {
|
||||
expect(list.test(options, "banned", word)).toEqual(true);
|
||||
});
|
||||
});
|
||||
|
||||
it("does not match on a word not in the list", () => {
|
||||
[
|
||||
"how to",
|
||||
"cookie",
|
||||
"how to be a great person?",
|
||||
"how to not do really bad things?",
|
||||
"i have $100 dollars.",
|
||||
"I have bad $ hit lling",
|
||||
"That's a p***ch!",
|
||||
"When bann is spelt wrong, it won't be caught.",
|
||||
].forEach(word => {
|
||||
expect(list.test(options, "banned", word)).toEqual(false);
|
||||
});
|
||||
});
|
||||
|
||||
it("allows an empty list", () => {
|
||||
expect(list.test(options, "banned", "test")).toEqual(false);
|
||||
});
|
||||
});
|
||||
|
||||
it("does not match on a word not in the list", () => {
|
||||
[
|
||||
"how to",
|
||||
"cookie",
|
||||
"how to be a great person?",
|
||||
"how to not do really bad things?",
|
||||
"i have $100 dollars.",
|
||||
"I have bad $ hit lling",
|
||||
"That's a p***ch!",
|
||||
].forEach(word => {
|
||||
expect(containsMatchingPhrase(phrases, word)).toEqual(false);
|
||||
});
|
||||
});
|
||||
|
||||
it("allows an empty list", () => {
|
||||
expect(containsMatchingPhrase([], "test")).toEqual(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("containsMatchingPhraseMemoized", () => {
|
||||
it("return true for all cases after memoizing the first result", () => {
|
||||
[
|
||||
"cookies 1",
|
||||
"cookies 2",
|
||||
"cookies 4",
|
||||
"cookies 5",
|
||||
"this is for cookies 6",
|
||||
"this is for cookies 7",
|
||||
"this is for cookies 8",
|
||||
"this is for cookies 9",
|
||||
].forEach(word => {
|
||||
expect(containsMatchingPhraseMemoized(phrases, word)).toEqual(true);
|
||||
describe("containsMatchingPhraseMemoized", () => {
|
||||
it("return true for all cases after memoizing the first result", () => {
|
||||
[
|
||||
"cookies 1",
|
||||
"cookies 2",
|
||||
"cookies 4",
|
||||
"cookies 5",
|
||||
"this is for cookies 6",
|
||||
"this is for cookies 7",
|
||||
"this is for cookies 8",
|
||||
"this is for cookies 9",
|
||||
].forEach(word => {
|
||||
expect(list.test(options, "banned", word)).toEqual(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,37 +1,100 @@
|
||||
import { memoize } from "lodash";
|
||||
import ms from "ms";
|
||||
import now from "performance-now";
|
||||
|
||||
// Replace `memoize.Cache`.
|
||||
memoize.Cache = WeakMap;
|
||||
import { LanguageCode } from "coral-common/helpers";
|
||||
import { createWordListRegExp } from "coral-common/utils";
|
||||
import logger from "coral-server/logger";
|
||||
import { Tenant } from "coral-server/models/tenant";
|
||||
|
||||
/**
|
||||
* Escape string for special regular expression characters.
|
||||
*/
|
||||
export function escapeRegExp(str: string) {
|
||||
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
|
||||
interface Lists {
|
||||
banned: RegExp | false;
|
||||
suspect: RegExp | false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a regular expression that catches the `phrases`.
|
||||
*/
|
||||
export function generateRegExp(phrases: string[]) {
|
||||
const inner = phrases
|
||||
.map(phrase =>
|
||||
phrase
|
||||
.split(/\s+/)
|
||||
.map(word => escapeRegExp(word))
|
||||
.join('[\\s"?!.]+')
|
||||
)
|
||||
.join("|");
|
||||
return new RegExp(`(^|[^\\w])(${inner})(?=[^\\w]|$)`, "miu");
|
||||
export type Options = Pick<Tenant, "id" | "locale" | "wordList">;
|
||||
|
||||
/**
 * WordList tests strings against the banned and suspect word lists of the
 * provided options, keeping the generated expressions in a WeakMap keyed by
 * the options object itself.
 */
export class WordList {
  private readonly cache = new WeakMap<Options, Lists>();

  /**
   * generate will create the tester for a single list, or `false` when the
   * list has no entries and therefore can never match.
   *
   * @param locale the locale used to generate the expression
   * @param list the phrases that make up the list
   */
  private generate(locale: LanguageCode, list: string[]) {
    return list.length === 0 ? false : createWordListRegExp(locale, list);
  }

  /**
   * create will create the testers for both lists.
   *
   * @param options options used to generate Lists
   */
  private create(options: Options): Lists {
    const { locale, wordList } = options;

    const banned = this.generate(locale, wordList.banned);
    const suspect = this.generate(locale, wordList.suspect);

    return { banned, suspect };
  }

  /**
   * lists will create/return a cached set of testers for the provided word
   * lists.
   *
   * @param options the options object that is also used as the cache key
   * @param cache when false, the cache is bypassed and new testers are made
   */
  private lists(options: Options, cache: boolean): Lists {
    if (cache) {
      const cached = this.cache.get(options);
      if (cached) {
        return cached;
      }

      // Nothing cached for this options object yet, so generate the testers,
      // log how long it took and remember them.
      const startedAt = now();
      const created = this.create(options);
      logger.info(
        { tenantID: options.id, took: ms(now() - startedAt) },
        "regenerated word list cache"
      );

      this.cache.set(options, created);

      return created;
    }

    // The caller opted out of caching, hand back fresh testers every time.
    return this.create(options);
  }

  /**
   * test will test the string against the selected list. The generated lists
   * are cached and re-used on subsequent calls.
   *
   * @param options the options object that is also used as the cache key
   * @param listName the list to test against
   * @param testString the string to test to see if they match anything on the
   * list
   * @param cache when true, will re-use the cached testers based on the lists
   */
  public test(
    options: Options,
    listName: keyof Lists,
    testString: string,
    cache = true
  ): boolean {
    const tester = this.lists(options, cache)[listName];
    if (tester) {
      const startedAt = now();
      const matched = tester.test(testString);
      logger.debug(
        { tenantID: options.id, took: ms(now() - startedAt) },
        "word list phrase test complete"
      );

      return matched;
    }

    // An empty list has no tester and never matches.
    return false;
  }
}
|
||||
|
||||
export const generateRegExpMemoized = memoize(generateRegExp);
|
||||
|
||||
export const containsMatchingPhrase = (phrases: string[], testString: string) =>
|
||||
phrases.length > 0 ? generateRegExp(phrases).test(testString) : false;
|
||||
|
||||
export const containsMatchingPhraseMemoized = (
|
||||
phrases: string[],
|
||||
testString: string
|
||||
) =>
|
||||
phrases.length > 0 ? generateRegExpMemoized(phrases).test(testString) : false;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { Redis } from "ioredis";
|
||||
import { isUndefined } from "lodash";
|
||||
import { isUndefined, lowerCase, uniqBy } from "lodash";
|
||||
import { DateTime } from "luxon";
|
||||
import { Db } from "mongodb";
|
||||
import { URL } from "url";
|
||||
@@ -45,11 +45,11 @@ function cleanWordList(
|
||||
list: GQLSettingsWordListInput
|
||||
): GQLSettingsWordListInput {
|
||||
if (list.banned) {
|
||||
list.banned = list.banned.filter(Boolean);
|
||||
list.banned = uniqBy(list.banned.filter(Boolean), lowerCase) as string[];
|
||||
}
|
||||
|
||||
if (list.suspect) {
|
||||
list.suspect = list.suspect.filter(Boolean);
|
||||
list.suspect = uniqBy(list.suspect.filter(Boolean), lowerCase) as string[];
|
||||
}
|
||||
|
||||
return list;
|
||||
|
||||
Reference in New Issue
Block a user