feat: expanded regexp generation, locale support, caching (#2869)

Co-authored-by: Kim Gardner <kgardnr@gmail.com>
2026-06-27 19:17:09 +08:00 · 2020-03-10 16:05:44 +00:00
parent 7d967fc93b
commit 45b778c522
15 changed files with 461 additions and 245 deletions
@@ -7,8 +7,13 @@ import CommentContent from "./CommentContent";

 it("renders correctly", () => {
  const props: PropTypesOf<typeof CommentContent> = {
-    suspectWords: ["worse"],
-    bannedWords: ["bad"],
+    phrases: {
+      locale: "en-US",
+      wordList: {
+        suspect: ["worse"],
+        banned: ["bad"],
+      },
+    },
    className: "custom",
    children: "Hello <b>Bob</b>, you bad guy",
  };
@@ -19,8 +24,13 @@ it("renders correctly", () => {

 it("renders empty words correctly", () => {
  const props: PropTypesOf<typeof CommentContent> = {
-    suspectWords: [],
-    bannedWords: [],
+    phrases: {
+      locale: "en-US",
+      wordList: {
+        suspect: [],
+        banned: [],
+      },
+    },
    className: "custom",
    children: "Hello <b>Bob</b>, you bad guy",
  };
@@ -1,7 +1,7 @@
 import cn from "classnames";
-import { memoize } from "lodash";
-import React, { FunctionComponent } from "react";
+import React, { FunctionComponent, useMemo } from "react";

+import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
 import { createPurify } from "coral-common/utils/purify";

 import styles from "./CommentContent.css";
@@ -14,8 +14,7 @@ const purify = createPurify(window, false);
 interface Props {
  className?: string;
  children: string | React.ReactElement;
-  suspectWords: ReadonlyArray<string>;
-  bannedWords: ReadonlyArray<string>;
+  phrases: GetPhrasesRegExpOptions;
 }

 function escapeHTML(unsafe: string) {
@@ -27,50 +26,11 @@ function escapeHTML(unsafe: string) {
    .replace(/'/g, "&#039;");
 }

-function escapeRegExp(str: string) {
-  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
-}
-
-// generate a regulare expression that catches the `phrases`.
-function generateRegExp(phrases: ReadonlyArray<string>) {
-  const inner = phrases
-    .map(phrase =>
-      phrase
-        .split(/\s+/)
-        .map(word => escapeRegExp(word))
-        .join('[\\s"?!.]+')
-    )
-    .join("|");
-
-  const pattern = `(^|[^\\w])(${inner})(?=[^\\w]|$)`;
-  try {
-    return new RegExp(pattern, "iu");
-  } catch (_err) {
-    // IE does not support unicode support, so we'll create one without.
-    return new RegExp(pattern, "i");
-  }
-}
-
-// Generate a regular expression detecting `suspectWords` and `bannedWords` phrases.
-function getPhrasesRegexp(
-  suspectWords: ReadonlyArray<string>,
-  bannedWords: ReadonlyArray<string>
-) {
-  return generateRegExp([...suspectWords, ...bannedWords]);
-}
-
-// Memoized version as arguments rarely change.
-const getPhrasesRegexpMemoized = memoize(getPhrasesRegexp);
-
-// markPhrasesHTML looks for `supsectWords` and `bannedWords` inside `text` and highlights them by returning
-// a HTML string.
-function markPhrasesHTML(
-  text: string,
-  suspectWords: ReadonlyArray<string>,
-  bannedWords: ReadonlyArray<string>
-) {
-  const regexp = getPhrasesRegexpMemoized(suspectWords, bannedWords);
-  const tokens = text.split(regexp);
+// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
+// the settings applied for the locale and highlights them by returning an HTML
+// string.
+function markPhrasesHTML(text: string, expression: RegExp) {
+  const tokens = text.split(expression);
  if (tokens.length === 1) {
    return text;
  }
@@ -87,45 +47,42 @@ function markPhrasesHTML(

 // markHTMLNode manipulates the node by looking for #text nodes and adding markers
 // for `supsectWords` and `bannedWords`.
-function markHTMLNode(
-  parentNode: Node,
-  suspectWords: ReadonlyArray<string>,
-  bannedWords: ReadonlyArray<string>
-) {
+function markHTMLNode(parentNode: Node, expression: RegExp) {
  parentNode.childNodes.forEach(node => {
    if (node.nodeName === "#text") {
-      const newContent = markPhrasesHTML(
-        node.textContent!,
-        suspectWords,
-        bannedWords
-      );
+      const newContent = markPhrasesHTML(node.textContent!, expression);
      if (newContent !== node.textContent) {
        const newNode = document.createElement("span");
        newNode.innerHTML = newContent;
        parentNode.replaceChild(newNode, node);
      }
    } else {
-      markHTMLNode(node, suspectWords, bannedWords);
+      markHTMLNode(node, expression);
    }
  });
 }

 const CommentContent: FunctionComponent<Props> = ({
-  suspectWords,
-  bannedWords,
+  phrases,
  className,
  children,
 }) => {
+  // Cache the expression used via memo. This will reduce duplicate renders of
+  // this comment content when the children change but the phrase configuration
+  // does not change. The regExp is already cached on a deeper level
+  // automatically, this is just lessening that impact further.
+  const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
+
  if (typeof children === "string") {
    // We create a Shadow DOM Tree with the HTML body content and
    // use it as a parser.
    const node = document.createElement("div");
    node.innerHTML = purify.sanitize(children);

-    if (suspectWords.length || bannedWords.length) {
+    if (expression) {
      // Then we traverse it recursively and manipulate it to highlight suspect words
      // and banned words.
-      markHTMLNode(node, suspectWords, bannedWords);
+      markHTMLNode(node, expression);
    }

    // Finally we render the content of the Shadow DOM Tree
@@ -30,10 +30,7 @@ const CommentRevisionContainer: FunctionComponent<Props> = ({
        .map(c => (
          <div key={c.id}>
            <Timestamp>{c.createdAt}</Timestamp>
-            <CommentContent
-              suspectWords={settings.wordList.suspect}
-              bannedWords={settings.wordList.banned}
-            >
+            <CommentContent phrases={settings}>
              {c.body ? c.body : ""}
            </CommentContent>
          </div>
@@ -57,6 +54,7 @@ const enhanced = withFragmentContainer<Props>({
  `,
  settings: graphql`
    fragment CommentRevisionContainer_settings on Settings {
+      locale
      wordList {
        banned
        suspect
@@ -21,8 +21,13 @@ const baseProps: PropTypesOf<typeof ModerateCardN> = {
  status: "undecided",
  featured: false,
  viewContextHref: "http://localhost/comment",
-  suspectWords: ["suspect"],
-  bannedWords: ["banned"],
+  phrases: {
+    locale: "en-US",
+    wordList: {
+      suspect: ["suspect"],
+      banned: ["banned"],
+    },
+  },
  siteName: null,
  onApprove: noop,
  onReject: noop,
@@ -10,6 +10,7 @@ import React, {
 } from "react";

 import { HOTKEYS } from "coral-admin/constants";
+import { GetPhrasesRegExpOptions } from "coral-admin/helpers";
 import { PropTypesOf } from "coral-framework/types";
 import {
  BaseButton,
@@ -48,8 +49,7 @@ interface Props {
  featured: boolean;
  moderatedBy: React.ReactNode | null;
  viewContextHref: string;
-  suspectWords: ReadonlyArray<string>;
-  bannedWords: ReadonlyArray<string>;
+  phrases: GetPhrasesRegExpOptions;
  showStory: boolean;
  storyTitle?: React.ReactNode;
  storyHref?: string;
@@ -87,8 +87,7 @@ const ModerateCard: FunctionComponent<Props> = ({
  viewContextHref,
  status,
  featured,
-  suspectWords,
-  bannedWords,
+  phrases,
  onApprove,
  onReject,
  onFeature,
@@ -219,11 +218,7 @@ const ModerateCard: FunctionComponent<Props> = ({
            )}
          </div>
          <div className={styles.contentArea}>
-            <CommentContent
-              suspectWords={suspectWords}
-              bannedWords={bannedWords}
-              className={styles.content}
-            >
+            <CommentContent phrases={phrases} className={styles.content}>
              {commentBody}
            </CommentContent>
            <div className={styles.viewContext}>
@@ -222,8 +222,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
          status={getStatus(comment)}
          featured={isFeatured(comment)}
          viewContextHref={comment.permalink}
-          suspectWords={settings.wordList.suspect}
-          bannedWords={settings.wordList.banned}
+          phrases={settings}
          onApprove={handleApprove}
          onReject={handleReject}
          onFeature={onFeature}
@@ -319,6 +318,7 @@ const enhanced = withFragmentContainer<Props>({
  `,
  settings: graphql`
    fragment ModerateCardContainer_settings on Settings {
+      locale
      wordList {
        banned
        suspect
@@ -47,16 +47,19 @@ exports[`renders approved correctly 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          content
@@ -177,16 +180,19 @@ exports[`renders correctly 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          content
@@ -307,16 +313,19 @@ exports[`renders dangling correctly 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          content
@@ -437,16 +446,19 @@ exports[`renders rejected correctly 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          content
@@ -576,16 +588,19 @@ exports[`renders reply correctly 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          content
@@ -706,16 +721,19 @@ exports[`renders story info 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          content
@@ -874,16 +892,19 @@ exports[`renders tombstoned when comment is deleted 1`] = `
        className="ModerateCard-contentArea"
      >
        <CommentContent
-          bannedWords={
-            Array [
-              "banned",
-            ]
-          }
          className="ModerateCard-content"
-          suspectWords={
-            Array [
-              "suspect",
-            ]
+          phrases={
+            Object {
+              "locale": "en-US",
+              "wordList": Object {
+                "banned": Array [
+                  "banned",
+                ],
+                "suspect": Array [
+                  "suspect",
+                ],
+              },
+            }
          }
        >
          <Localized
@@ -0,0 +1,64 @@
+import { LanguageCode } from "coral-common/helpers";
+import { createWordListRegExp } from "coral-common/utils";
+
+export interface GetPhrasesRegExpOptions {
+  locale: string;
+  wordList: {
+    banned: ReadonlyArray<string>;
+    suspect: ReadonlyArray<string>;
+  };
+}
+
+export function getPhrasesRegExp({
+  locale,
+  wordList: { banned, suspect },
+}: GetPhrasesRegExpOptions) {
+  if (banned.length === 0 && suspect.length === 0) {
+    return null;
+  }
+
+  return createWordListRegExp(locale as LanguageCode, [...banned, ...suspect]);
+}
+
+// cache holds the last generated RegExp along with the locale and word list
+// references it was built from. We expect that generally, there is only ever
+// one word list used by the client at a time, so a single entry ensures that
+// we only re-create the expression if we must.
+const cache = {
+  keys: {
+    locale: "",
+    suspect: [] as ReadonlyArray<string>,
+    banned: [] as ReadonlyArray<string>,
+  },
+  value: null as RegExp | null,
+};
+
+export default function(options: GetPhrasesRegExpOptions) {
+  // We assume that the cache is valid unless one of the below checks fails.
+  let expired = false;
+
+  // Check the locale.
+  if (cache.keys.locale !== options.locale) {
+    cache.keys.locale = options.locale;
+    expired = true;
+  }
+
+  // Check the banned words list (compared by reference, not by contents).
+  if (cache.keys.banned !== options.wordList.banned) {
+    cache.keys.banned = options.wordList.banned;
+    expired = true;
+  }
+
+  // Check the suspect words list (compared by reference, not by contents).
+  if (cache.keys.suspect !== options.wordList.suspect) {
+    cache.keys.suspect = options.wordList.suspect;
+    expired = true;
+  }
+
+  // If any of the keys changed, regenerate the value (null for empty lists).
+  if (expired) {
+    cache.value = getPhrasesRegExp(options);
+  }
+
+  return cache.value;
+}
@@ -1 +1,5 @@
 export { default as getQueueConnection } from "./getQueueConnection";
+export {
+  default as getPhrasesRegExp,
+  GetPhrasesRegExpOptions,
+} from "./getPhrasesRegExp";
@@ -0,0 +1,76 @@
+import { defaults } from "lodash";
+
+import { LanguageCode } from "coral-common/helpers";
+import { DeepPartial } from "coral-common/types";
+
+interface WordListRule {
+  split: string;
+  punctuation: string;
+  whitespace: string;
+}
+
+const DefaultWordListRule: WordListRule = {
+  split: "[^\\w]",
+  punctuation: '[\\s"?!.]+',
+  whitespace: "\\s+",
+};
+
+const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
+  "en-US": DefaultWordListRule,
+};
+
+/**
+ * Escape string for special regular expression characters.
+ *
+ * @param str the string to escape from regex characters
+ */
+function escapeRegExp(str: string) {
+  // $& means the whole matched string
+  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+/**
+ * createWordListRegExp will generate the tester that can be used to test
+ * strings for matches on phrases.
+ *
+ * @param lang the language used to look up any word list rule overrides
+ * @param phrases the phrases to use for creating the expression
+ */
+export default function createWordListRegExp(
+  lang: LanguageCode,
+  phrases: string[]
+) {
+  // Get the rule list for this language, falling back to the default rule for
+  // anything we haven't provided overrides for.
+  const rule: WordListRule = defaults(
+    WordListRules[lang] || {},
+    DefaultWordListRule
+  );
+
+  const whitespace = new RegExp(rule.whitespace);
+
+  // Turn the phrases from the list into a single regex escaped alternation.
+  const words = phrases
+    .map(phrase =>
+      phrase
+        // Split each phrase by whitespace.
+        .split(whitespace)
+        // Escape each word, we don't expect any of them to contain regex.
+        .map(word => escapeRegExp(word))
+        // Rejoin to ensure that any variation of the word separated by a
+        // punctuation character should also be caught.
+        .join(rule.punctuation)
+    )
+    // Join the escaped phrase patterns together with `|` (OR).
+    .join("|");
+
+  // Wrap the pattern in split rules.
+  const pattern = `(^|${rule.split})(${words})($|${rule.split})`;
+
+  try {
+    return new RegExp(pattern, "iu");
+  } catch {
+    // IE does not support the unicode flag, so we'll create one without.
+    return new RegExp(pattern, "i");
+  }
+}
@@ -12,3 +12,4 @@ export { default as isPromiseLike } from "./isPromiseLike";
 export { default as isPromise } from "./isPromise";
 export { default as startsWith } from "./startsWith";
 export { default as getOrigin } from "./getOrigin";
+export { default as createWordListRegExp } from "./createWordListRegExp";
@@ -1,18 +1,24 @@
-import {
-  GQLCOMMENT_FLAG_REASON,
-  GQLCOMMENT_STATUS,
-} from "coral-server/graph/schema/__generated__/types";
 import { ACTION_TYPE } from "coral-server/models/action/comment";
 import {
  IntermediateModerationPhase,
  IntermediatePhaseResult,
 } from "coral-server/services/comments/pipeline";
-import { containsMatchingPhraseMemoized } from "coral-server/services/comments/pipeline/wordList";
+
+import {
+  GQLCOMMENT_FLAG_REASON,
+  GQLCOMMENT_STATUS,
+} from "coral-server/graph/schema/__generated__/types";
+
+import { WordList } from "../wordList";
+
+// Create a new wordlist instance to use.
+const list = new WordList();

 // This phase checks the comment against the wordList.
 export const wordList: IntermediateModerationPhase = ({
  tenant,
  comment,
+  htmlStripped,
 }): IntermediatePhaseResult | void => {
  // If there isn't a body, there can't be a bad word!
  if (!comment.body) {
@@ -23,7 +29,7 @@ export const wordList: IntermediateModerationPhase = ({
  // has pre-mod enabled or not. If the comment was rejected based on the
  // wordList, then reject it, otherwise if the moderation setting is
  // premod, set it to `premod`.
-  if (containsMatchingPhraseMemoized(tenant.wordList.banned, comment.body)) {
+  if (list.test(tenant, "banned", htmlStripped)) {
    // Add the flag related to Trust to the comment.
    return {
      status: GQLCOMMENT_STATUS.REJECTED,
@@ -43,7 +49,7 @@ export const wordList: IntermediateModerationPhase = ({

  // If the wordList has matched the suspect word filter and we haven't disabled
  // auto-flagging suspect words, then we should flag the comment!
-  if (containsMatchingPhraseMemoized(tenant.wordList.suspect, comment.body)) {
+  if (list.test(tenant, "suspect", htmlStripped)) {
    return {
      actions: [
        {
@@ -1,66 +1,82 @@
 import {
-  containsMatchingPhrase,
-  containsMatchingPhraseMemoized,
+  Options,
+  WordList,
 } from "coral-server/services/comments/pipeline/wordList";

-const phrases = [
-  "cookies",
-  "how to do bad things",
-  "how to do really bad things",
-  "s h i t",
-  "$hit",
-  "p**ch",
-  "p*ch",
-];
+describe("en-US", () => {
+  const list = new WordList();
+  const options: Options = {
+    id: "tenant_1",
+    locale: "en-US",
+    wordList: {
+      banned: [
+        "cookies",
+        "how to do bad things",
+        "how to do really bad things",
+        "s h i t",
+        "$hit",
+        "p**ch",
+        "p*ch",
+        "banned",
+        "ban",
+      ],
+      suspect: [],
+    },
+  };

-describe("containsMatchingPhrase", () => {
-  it("does match on a word in the list", () => {
-    [
-      "how to do really bad things",
-      "what is cookies",
-      "cookies",
-      "COOKIES.",
-      "how to do bad things",
-      "How To do bad things!",
-      "This stuff is $hit!",
-      "That's a p**ch!",
-    ].forEach(word => {
-      expect(containsMatchingPhrase(phrases, word)).toEqual(true);
+  describe("containsMatchingPhrase", () => {
+    it("does match on a word in the list", () => {
+      [
+        "how to do really bad things",
+        "what is cookies",
+        "cookies",
+        "COOKIES.",
+        "how to do bad things",
+        "How To do bad things!",
+        "How.To.do.bad.things!",
+        "This stuff is $hit!",
+        "This is a test.\nTo see if cookies are found, in the second line.",
+        "That's a p**ch!",
+        "Banned words should be detected",
+      ].forEach(word => {
+        expect(list.test(options, "banned", word)).toEqual(true);
+      });
+    });
+
+    it("does not match on a word not in the list", () => {
+      [
+        "how to",
+        "cookie",
+        "how to be a great person?",
+        "how to not do really bad things?",
+        "i have $100 dollars.",
+        "I have bad $ hit lling",
+        "That's a p***ch!",
+        "When bann is spelt wrong, it won't be caught.",
+      ].forEach(word => {
+        expect(list.test(options, "banned", word)).toEqual(false);
+      });
+    });
+
+    it("allows an empty list", () => {
+      // The `suspect` list in `options` is empty, so this exercises the
+      // empty-list path rather than a non-matching lookup in `banned`.
+      expect(list.test(options, "suspect", "test")).toEqual(false);
+    });
  });

-  it("does not match on a word not in the list", () => {
-    [
-      "how to",
-      "cookie",
-      "how to be a great person?",
-      "how to not do really bad things?",
-      "i have $100 dollars.",
-      "I have bad $ hit lling",
-      "That's a p***ch!",
-    ].forEach(word => {
-      expect(containsMatchingPhrase(phrases, word)).toEqual(false);
-    });
-  });
-
-  it("allows an empty list", () => {
-    expect(containsMatchingPhrase([], "test")).toEqual(false);
-  });
-});
-
-describe("containsMatchingPhraseMemoized", () => {
-  it("return true for all cases after memoizing the first result", () => {
-    [
-      "cookies 1",
-      "cookies 2",
-      "cookies 4",
-      "cookies 5",
-      "this is for cookies 6",
-      "this is for cookies 7",
-      "this is for cookies 8",
-      "this is for cookies 9",
-    ].forEach(word => {
-      expect(containsMatchingPhraseMemoized(phrases, word)).toEqual(true);
+  describe("containsMatchingPhraseMemoized", () => {
+    it("return true for all cases after memoizing the first result", () => {
+      [
+        "cookies 1",
+        "cookies 2",
+        "cookies 4",
+        "cookies 5",
+        "this is for cookies 6",
+        "this is for cookies 7",
+        "this is for cookies 8",
+        "this is for cookies 9",
+      ].forEach(word => {
+        expect(list.test(options, "banned", word)).toEqual(true);
+      });
    });
  });
 });
@@ -1,37 +1,100 @@
-import { memoize } from "lodash";
+import ms from "ms";
+import now from "performance-now";

-// Replace `memoize.Cache`.
-memoize.Cache = WeakMap;
+import { LanguageCode } from "coral-common/helpers";
+import { createWordListRegExp } from "coral-common/utils";
+import logger from "coral-server/logger";
+import { Tenant } from "coral-server/models/tenant";

-/**
- * Escape string for special regular expression characters.
- */
-export function escapeRegExp(str: string) {
-  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
+interface Lists {
+  banned: RegExp | false;
+  suspect: RegExp | false;
 }

-/**
- * Generate a regular expression that catches the `phrases`.
- */
-export function generateRegExp(phrases: string[]) {
-  const inner = phrases
-    .map(phrase =>
-      phrase
-        .split(/\s+/)
-        .map(word => escapeRegExp(word))
-        .join('[\\s"?!.]+')
-    )
-    .join("|");
-  return new RegExp(`(^|[^\\w])(${inner})(?=[^\\w]|$)`, "miu");
+export type Options = Pick<Tenant, "id" | "locale" | "wordList">;
+
+export class WordList {
+  private readonly cache = new WeakMap<Options, Lists>();
+
+  private generate(locale: LanguageCode, list: string[]) {
+    // A word list with no entries can never match, so return false instead.
+    if (list.length === 0) {
+      return false;
+    }
+
+    return createWordListRegExp(locale, list);
+  }
+
+  /**
+   * create will generate the testers for both the banned and suspect lists.
+   *
+   * @param options options used to generate Lists
+   */
+  private create(options: Options): Lists {
+    return {
+      banned: this.generate(options.locale, options.wordList.banned),
+      suspect: this.generate(options.locale, options.wordList.suspect),
+    };
+  }
+
+  /**
+   * lists will create/return the testers for the provided word lists.
+   *
+   * @param options the options object that is also used as the cache key
+   * @param cache when false, skips the cache and always creates new testers
+   */
+  private lists(options: Options, cache: boolean): Lists {
+    // If the request isn't supposed to use the cache, then just return a new
+    // one.
+    if (!cache) {
+      return this.create(options);
+    }
+
+    // As this is supposed to be cached, try to get it from the cache, or create
+    // it.
+    let lists = this.cache.get(options);
+    if (!lists) {
+      const startedAt = now();
+      lists = this.create(options);
+      logger.info(
+        { tenantID: options.id, took: ms(now() - startedAt) },
+        "regenerated word list cache"
+      );
+
+      this.cache.set(options, lists);
+    }
+
+    return lists;
+  }
+
+  /**
+   * test will test the string against the selected list. The generated lists
+   * are cached and re-used on subsequent calls.
+   *
+   * @param options the options object that is also used as the cache key
+   * @param listName the list to test against
+   * @param testString the string to test to see if they match anything on the
+   *                   list
+   * @param cache when true, will re-use the cached testers based on the lists
+   */
+  public test(
+    options: Options,
+    listName: keyof Lists,
+    testString: string,
+    cache = true
+  ): boolean {
+    const list = this.lists(options, cache)[listName];
+    if (!list) {
+      return false;
+    }
+
+    const startedAt = now();
+    const result = list.test(testString);
+    logger.debug(
+      { tenantID: options.id, took: ms(now() - startedAt) },
+      "word list phrase test complete"
+    );
+
+    return result;
+  }
 }
-
-export const generateRegExpMemoized = memoize(generateRegExp);
-
-export const containsMatchingPhrase = (phrases: string[], testString: string) =>
-  phrases.length > 0 ? generateRegExp(phrases).test(testString) : false;
-
-export const containsMatchingPhraseMemoized = (
-  phrases: string[],
-  testString: string
-) =>
-  phrases.length > 0 ? generateRegExpMemoized(phrases).test(testString) : false;
@@ -1,5 +1,5 @@
 import { Redis } from "ioredis";
-import { isUndefined } from "lodash";
+import { isUndefined, lowerCase, toLower, uniqBy } from "lodash";
 import { DateTime } from "luxon";
 import { Db } from "mongodb";
 import { URL } from "url";
@@ -45,11 +45,11 @@ function cleanWordList(
  list: GQLSettingsWordListInput
 ): GQLSettingsWordListInput {
  if (list.banned) {
-    list.banned = list.banned.filter(Boolean);
+    // Deduplicate case-insensitively with `toLower`. `lowerCase` splits the
+    // string into words and drops punctuation, which would collapse distinct
+    // phrases such as "p*ch" and "p**ch" into a single entry.
+    list.banned = uniqBy(list.banned.filter(Boolean), toLower) as string[];
  }

  if (list.suspect) {
-    list.suspect = list.suspect.filter(Boolean);
+    // Same case-insensitive, punctuation-preserving dedupe as the banned list.
+    list.suspect = uniqBy(list.suspect.filter(Boolean), toLower) as string[];
  }

  return list;