[CORL-1001] Wordlist Fixes (#2920)

* fix: improve wordlist highlighting and perf * fix: updated tests * fix: implmeneted new regexp lib/patterns * fix: improve comment body css * fix: take into account the tree shaking is disabled See: https://github.com/webpack/webpack/issues/7094 Co-authored-by: Chi Vinh Le <vinh@vinh.tech> Co-authored-by: Kim Gardner <kgardnr@gmail.com>
2026-06-27 19:33:06 +08:00 · 2020-04-06 19:04:05 +00:00
parent 8966a8201b
commit 6711f09a79
20 changed files with 616 additions and 185 deletions
@@ -3217,7 +3217,6 @@
      "version": "7.8.3",
      "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.8.3.tgz",
      "integrity": "sha512-lrIU4aVbmlM/wQPzhEvzvNJskKyYptuXb0fGC0lTQTupTOYtR2Vqbu6/jf8vTr4M8Wt1nIzxVrSvPI5qESa/xA==",
-      "dev": true,
      "requires": {
        "core-js-pure": "^3.0.0",
        "regenerator-runtime": "^0.13.2"
@@ -5760,6 +5759,12 @@
        "@types/node": "*"
      }
    },
+    "@types/xregexp": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/@types/xregexp/-/xregexp-4.3.0.tgz",
+      "integrity": "sha512-3gJTS9gt27pS7U9q5IVqo4YvKSlkf2ck8ish6etuDj6LIRxkL/2Y8RMUtK/QzvE1Yv2zwWV5yemI2BS0GGGFnA==",
+      "dev": true
+    },
    "@types/yargs": {
      "version": "12.0.12",
      "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-12.0.12.tgz",
@@ -10721,8 +10726,7 @@
    "core-js-pure": {
      "version": "3.6.4",
      "resolved": "https://registry.npmjs.org/core-js-pure/-/core-js-pure-3.6.4.tgz",
-      "integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw==",
-      "dev": true
+      "integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw=="
    },
    "core-util-is": {
      "version": "1.0.2",
@@ -32077,8 +32081,7 @@
    "regenerator-runtime": {
      "version": "0.13.3",
      "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.3.tgz",
-      "integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw==",
-      "dev": true
+      "integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw=="
    },
    "regenerator-transform": {
      "version": "0.13.4",
@@ -38138,6 +38141,14 @@
          "dev": true,
          "requires": {
            "xregexp": "4.0.0"
+          },
+          "dependencies": {
+            "xregexp": {
+              "version": "4.0.0",
+              "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
+              "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
+              "dev": true
+            }
          }
        },
        "del": {
@@ -39365,10 +39376,12 @@
      "integrity": "sha512-tGkGJkN8XqCod7OT+EvGYK5Z4SfDQGD30zAa58OcnAa0RRWgzUEK72tkXhsX1FZd+rgnhRxFtmO+ihkp8LHSkw=="
    },
    "xregexp": {
-      "version": "4.0.0",
-      "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
-      "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
-      "dev": true
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.3.0.tgz",
+      "integrity": "sha512-7jXDIFXh5yJ/orPn4SXjuVrWWoi4Cr8jfV1eHv9CixKSbU+jY4mxfrBwAuDvupPNKpMUY+FeIqsVw/JLT9+B8g==",
+      "requires": {
+        "@babel/runtime-corejs3": "^7.8.3"
+      }
    },
    "xtend": {
      "version": "4.0.1",
@@ -4,6 +4,7 @@
  "author": "The Coral Project",
  "homepage": "https://coralproject.net/",
  "sideEffects": [
+    "*.css.ts",
    "*.css"
  ],
  "repository": {
@@ -140,7 +141,8 @@
    "tsscmp": "^1.0.6",
    "url-regex": "^5.0.0",
    "uuid": "^3.3.3",
-    "verror": "^1.10.0"
+    "verror": "^1.10.0",
+    "xregexp": "^4.3.0"
  },
  "devDependencies": {
    "@babel/core": "^7.8.3",
@@ -240,6 +242,7 @@
    "@types/webpack-bundle-analyzer": "^2.13.1",
    "@types/webpack-dev-server": "^3.1.5",
    "@types/ws": "^5.1.2",
+    "@types/xregexp": "^4.3.0",
    "@typescript-eslint/eslint-plugin": "2.3.3",
    "@typescript-eslint/eslint-plugin-tslint": "2.3.3",
    "@typescript-eslint/parser": "2.3.3",
@@ -12,12 +12,12 @@ $comment-link-active: var(--v2-palette-primary-darkest);
  color: $comment-content;
  overflow-wrap: break-word;

-  & * bold,
-  & * strong {
+  b,
+  strong {
    font-weight: var(--v2-font-weight-primary-bold);
  }
-  & * italic,
-  & * em {
+  i,
+  em {
    font-style: italic;
  }
  blockquote {
@@ -50,3 +50,7 @@ $comment-link-active: var(--v2-palette-primary-darkest);
    }
  }
 }
+
+.highlight {
+  white-space: pre-wrap;
+}
@@ -16,6 +16,7 @@ it("renders correctly", () => {
    },
    className: "custom",
    children: "Hello <b>Bob</b>, you bad guy",
+    highlight: true,
  };
  const renderer = createRenderer();
  renderer.render(<CommentContent {...props} />);
@@ -33,6 +34,26 @@ it("renders empty words correctly", () => {
    },
    className: "custom",
    children: "Hello <b>Bob</b>, you bad guy",
+    highlight: true,
+  };
+  const renderer = createRenderer();
+  renderer.render(<CommentContent {...props} />);
+  expect(renderer.getRenderOutput()).toMatchSnapshot();
+});
+
+it("renders correctly even if it has consecutive banned words on comments", () => {
+  const props: PropTypesOf<typeof CommentContent> = {
+    phrases: {
+      locale: "en-US",
+      wordList: {
+        suspect: ["worse"],
+        banned: ["bad"],
+      },
+    },
+    className: "custom",
+    children:
+      "This is a very long comment with bad words. Let's try bad and bad. Now bad bad.\nBad BAD bad.\n",
+    highlight: true,
  };
  const renderer = createRenderer();
  renderer.render(<CommentContent {...props} />);
@@ -1,7 +1,12 @@
 import cn from "classnames";
 import React, { FunctionComponent, useMemo } from "react";
+import striptags from "striptags";

-import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
+import {
+  getPhrasesRegExp,
+  GetPhrasesRegExpOptions,
+  markHTMLNode,
+} from "coral-admin/helpers";
 import { createPurify } from "coral-common/utils/purify";

 import styles from "./CommentContent.css";
@@ -15,81 +20,60 @@ interface Props {
  className?: string;
  children: string | React.ReactElement;
  phrases: GetPhrasesRegExpOptions;
-}
-
-function escapeHTML(unsafe: string) {
-  return unsafe
-    .replace(/&/g, "&amp;")
-    .replace(/</g, "&lt;")
-    .replace(/>/g, "&gt;")
-    .replace(/"/g, "&quot;")
-    .replace(/'/g, "&#039;");
-}
-
-// markPhrasesHTML looks for `supsect` and `banned` words inside `text` given
-// the settings applied for the locale and highlights them by returning an HTML
-// string.
-function markPhrasesHTML(text: string, expression: RegExp) {
-  const tokens = text.split(expression);
-  if (tokens.length === 1) {
-    return text;
-  }
-  return tokens
-    .map((token, i) =>
-      // Using our Regexp patterns it returns tokens arranged this way
-      // [STRING_WITH_NO_MATCH, NEW_WORD_DELIMITER, MATCHED_WORD, ...].
-      // This pattern repeats throughout. Next line will mark MATCHED_WORD
-      // and escape all tokens.
-      i % 3 === 2 ? `<mark>${escapeHTML(token)}</mark>` : escapeHTML(token)
-    )
-    .join("");
-}
-
-// markHTMLNode manipulates the node by looking for #text nodes and adding markers
-// for `supsectWords` and `bannedWords`.
-function markHTMLNode(parentNode: Node, expression: RegExp) {
-  parentNode.childNodes.forEach(node => {
-    if (node.nodeName === "#text") {
-      const newContent = markPhrasesHTML(node.textContent!, expression);
-      if (newContent !== node.textContent) {
-        const newNode = document.createElement("span");
-        newNode.innerHTML = newContent;
-        parentNode.replaceChild(newNode, node);
-      }
-    } else {
-      markHTMLNode(node, expression);
-    }
-  });
+  highlight?: boolean;
 }

 const CommentContent: FunctionComponent<Props> = ({
  phrases,
  className,
  children,
+  highlight = false,
 }) => {
  // Cache the expression used via memo. This will reduce duplicate renders of
  // this comment content when the children change but the phrase configuration
  // does not change. The regExp is already cached on a deeper level
  // automatically, this is just lessening that impact further.
-  const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
+  const expression = useMemo(() => {
+    // If we aren't in highlight mode for this comment, don't even attempt to
+    // generate the expression.
+    if (!highlight) {
+      return null;
+    }

-  if (typeof children === "string") {
-    // We create a Shadow DOM Tree with the HTML body content and
-    // use it as a parser.
+    return getPhrasesRegExp(phrases);
+  }, [phrases, highlight]);
+
+  // Cache the parsed comment node. If the children cannot be parsed, this will
+  // be null.
+  const parsed = useMemo(() => {
+    if (typeof children !== "string") {
+      return null;
+    }
+
+    // Sanitize the input for display.
+    let html = purify.sanitize(children);
+    if (highlight) {
+      html = striptags(html, ["a"]);
+    }
+
+    // We create a Shadow DOM Tree with the HTML body content and use it as a
+    // parser.
    const node = document.createElement("div");
-    node.innerHTML = purify.sanitize(children);
+    node.innerHTML = html;

+    // If the expression is available, then mark the nodes.
    if (expression) {
-      // Then we traverse it recursively and manipulate it to highlight suspect words
-      // and banned words.
      markHTMLNode(node, expression);
    }

-    // Finally we render the content of the Shadow DOM Tree
+    return node;
+  }, [children, expression, highlight]);
+
+  if (parsed) {
    return (
      <div
-        className={cn(className, styles.root)}
-        dangerouslySetInnerHTML={{ __html: node.innerHTML }}
+        className={cn(className, styles.root, highlight && styles.highlight)}
+        dangerouslySetInnerHTML={{ __html: parsed.innerHTML }}
      />
    );
  }
@@ -2,10 +2,23 @@

 exports[`renders correctly 1`] = `
 <div
-  className="custom CommentContent-root"
+  className="custom CommentContent-root CommentContent-highlight"
  dangerouslySetInnerHTML={
    Object {
-      "__html": "Hello <b>Bob</b><span>, you <mark>bad</mark> guy</span>",
+      "__html": "<span>Hello Bob, you <mark>bad</mark> guy</span>",
+    }
+  }
+/>
+`;
+
+exports[`renders correctly even if it has consecutive banned words on comments 1`] = `
+<div
+  className="custom CommentContent-root CommentContent-highlight"
+  dangerouslySetInnerHTML={
+    Object {
+      "__html": "<span>This is a very long comment with <mark>bad</mark> words. Let's try <mark>bad</mark> and <mark>bad</mark>. Now <mark>bad</mark> bad.
+<mark>Bad</mark> BAD <mark>bad</mark>.
+</span>",
    }
  }
 />
@@ -13,10 +26,10 @@ exports[`renders correctly 1`] = `

 exports[`renders empty words correctly 1`] = `
 <div
-  className="custom CommentContent-root"
+  className="custom CommentContent-root CommentContent-highlight"
  dangerouslySetInnerHTML={
    Object {
-      "__html": "Hello <b>Bob</b>, you bad guy",
+      "__html": "Hello Bob, you bad guy",
    }
  }
 />
@@ -37,6 +37,7 @@ interface Props {
  username: string;
  createdAt: string;
  body: string;
+  highlight?: boolean;
  inReplyTo?: {
    id: string;
    username: string | null;
@@ -82,6 +83,7 @@ const ModerateCard: FunctionComponent<Props> = ({
  username,
  createdAt,
  body,
+  highlight = false,
  inReplyTo,
  comment,
  settings,
@@ -222,7 +224,11 @@ const ModerateCard: FunctionComponent<Props> = ({
            )}
          </div>
          <div className={styles.contentArea}>
-            <CommentContent phrases={phrases} className={styles.content}>
+            <CommentContent
+              highlight={highlight}
+              phrases={phrases}
+              className={styles.content}
+            >
              {commentBody}
            </CommentContent>
            <div className={styles.viewContext}>
@@ -215,6 +215,16 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
    },
    [comment]
  );
+
+  // Only highlight comments that have been flagged for containing a banned or
+  // suspect word.
+  const highlight = comment.revision
+    ? comment.revision.actionCounts.flag.reasons.COMMENT_DETECTED_BANNED_WORD +
+        comment.revision.actionCounts.flag.reasons
+          .COMMENT_DETECTED_SUSPECT_WORD >
+      0
+    : false;
+
  return (
    <>
      <FadeInTransition active={Boolean(comment.enteredLive)}>
@@ -227,6 +237,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
          }
          createdAt={comment.createdAt}
          body={comment.body!}
+          highlight={highlight}
          inReplyTo={comment.parent && comment.parent.author}
          comment={comment}
          settings={settings}
@@ -296,6 +307,16 @@ const enhanced = withFragmentContainer<Props>({
      statusLiveUpdated
      createdAt
      body
+      revision {
+        actionCounts {
+          flag {
+            reasons {
+              COMMENT_DETECTED_BANNED_WORD
+              COMMENT_DETECTED_SUSPECT_WORD
+            }
+          }
+        }
+      }
      tags {
        code
      }
@@ -44,6 +44,7 @@ exports[`renders approved correctly 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -178,6 +179,7 @@ exports[`renders correctly 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -312,6 +314,7 @@ exports[`renders dangling correctly 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -446,6 +449,7 @@ exports[`renders rejected correctly 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -589,6 +593,7 @@ exports[`renders reply correctly 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -723,6 +728,7 @@ exports[`renders story info 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -895,6 +901,7 @@ exports[`renders tombstoned when comment is deleted 1`] = `
      >
        <CommentContent
          className="ModerateCard-content"
+          highlight={false}
          phrases={
            Object {
              "locale": "en-US",
@@ -1,5 +1,7 @@
+import { lowerCase, uniqBy } from "lodash";
+
 import { LanguageCode } from "coral-common/helpers";
-import { createWordListRegExp } from "coral-common/utils";
+import createWordListRegExp from "coral-common/utils/createWordListRegExp";

 export interface GetPhrasesRegExpOptions {
  locale: string;
@@ -17,20 +19,35 @@ export function getPhrasesRegExp({
    return null;
  }

-  return createWordListRegExp(locale as LanguageCode, [...banned, ...suspect]);
+  // Because the banned and suspect word lists may sometimes overlap, we should
+  // make this list as short as possible before compiling it into a RegExp.
+  const phrases = uniqBy<string>([...banned, ...suspect], lowerCase);
+
+  // The locale is passed down to us from the Graph, we can cast it to a
+  // LanguageCode.
+  return createWordListRegExp(locale as LanguageCode, phrases);
 }

-// cache is used as a global validator to the cached RegExp used by the
+// Cache is used as a global validator to the cached RegExp used by the
 // application. We expect that generally, there is only ever one word list used
 // by the client at a time, so this ensures that we only re-create the word list
 // if we must.
-const cache = {
+interface Cache {
+  keys: {
+    locale: string;
+    suspect: ReadonlyArray<string>;
+    banned: ReadonlyArray<string>;
+  };
+  value: RegExp | null;
+}
+
+const cache: Cache = {
  keys: {
    locale: "",
-    suspect: [] as ReadonlyArray<string>,
-    banned: [] as ReadonlyArray<string>,
+    suspect: [],
+    banned: [],
  },
-  value: null as RegExp | null,
+  value: null,
 };

 export default function(options: GetPhrasesRegExpOptions) {
@@ -57,7 +74,12 @@ export default function(options: GetPhrasesRegExpOptions) {

  // If the cache is expired, or the value doesn't exist, regenerate it.
  if (expired) {
-    cache.value = getPhrasesRegExp(options);
+    try {
+      cache.value = getPhrasesRegExp(options);
+    } catch (err) {
+      window.console.error(err);
+      return null;
+    }
  }

  return cache.value;
@@ -3,3 +3,4 @@ export {
  default as getPhrasesRegExp,
  GetPhrasesRegExpOptions,
 } from "./getPhrasesRegExp";
+export { default as markHTMLNode } from "./markHTMLNode";
@@ -0,0 +1,58 @@
+// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
+// the settings applied for the locale and highlights them by returning an HTML
+// string.
+function markPhrasesHTML(text: string, expression: RegExp) {
+  const tokens = text.split(expression);
+
+  // If there were less than two matches, then there was no matched word
+  // associated with the passed in text.
+  if (tokens.length < 3) {
+    return null;
+  }
+
+  return tokens
+    .map((token, i) =>
+      // Using our Regexp patterns it returns tokens arranged this way:
+      //
+      //  - STRING_WITH_NO_MATCH
+      //  - NEW_WORD_DELIMITER
+      //  - MATCHED_WORD
+      //  - NEW_WORD_DELIMITER
+      //  - ...
+      //
+      // This pattern repeats throughout. Next line will mark MATCHED_WORD.
+      i % 4 === 2 ? "<mark>" + token + "</mark>" : token
+    )
+    .join("");
+}
+
+// markHTMLNode manipulates the node by looking for #text nodes and adding
+// markers.
+export default function markHTMLNode(parentNode: Node, expression: RegExp) {
+  parentNode.childNodes.forEach(node => {
+    // Anchor links are already marked by default, skip them now.
+    if (node.nodeName === "A") {
+      return;
+    }
+
+    // If the node isn't of text type then we can't mark it directly.
+    if (node.nodeName !== "#text") {
+      return markHTMLNode(node, expression);
+    }
+
+    // If the node doesn't have any text content, then we can't mark it either.
+    if (!node.textContent) {
+      return;
+    }
+
+    // We've encountered a text node with text content that isn't in an anchor
+    // link. We should try to mark and replace it's content.
+    const replacement = markPhrasesHTML(node.textContent, expression);
+    if (replacement) {
+      // Create the new span node to replace the old node with.
+      const newNode = document.createElement("span");
+      newNode.innerHTML = replacement;
+      parentNode.replaceChild(newNode, node);
+    }
+  });
+}
@@ -3,6 +3,10 @@ $fullscreenZIndex: 10;

 .wrapper {
  @mixin bodyCopy;
+  i
+  em {
+    font-style: italic;
+  }
  b,
  strong {
    font-weight: var(--font-weight-medium);
@@ -3,12 +3,12 @@
  color: var(--palette-text-dark);
  overflow-wrap: break-word;

-  & * bold,
-  & * strong {
+  b,
+  strong {
    font-weight: var(--font-weight-medium);
  }
-  & * italic,
-  & * em {
+  i
+  em {
    font-style: italic;
  }
  blockquote {
@@ -0,0 +1,179 @@
+// Jest Snapshot v1, https://goo.gl/fbAQLP
+
+exports[`en-US splits the multi-words in a sentence correctly 1`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 2`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french;fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 3`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french!fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 4`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french.fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 5`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french?fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 6`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french¿fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 7`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french:fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words in a sentence correctly 1`] = `
+Array [
+  "this sentence is",
+  " ",
+  "bad",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words in a sentence correctly 2`] = `
+Array [
+  "this sentence is",
+  " ",
+  "worse",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words with unicode in a sentence correctly 1`] = `
+Array [
+  "this sentence has one",
+  " ",
+  "jalapeño",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words with unicode in a sentence correctly 2`] = `
+Array [
+  "this sentence has many jalapeños.",
+]
+`;
+
+exports[`en-US splits words when there are repeat words 1`] = `
+Array [
+  "This is",
+  " ",
+  "bad",
+  " ",
+  "bad, very",
+  " ",
+  "BAD",
+  ".",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 1`] = `
+Array [
+  "biólogo se soletra com",
+  ": ",
+  "bi",
+  " ",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 2`] = `
+Array [
+  "",
+  "",
+  "m.e.r.d.a",
+  "",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 3`] = `
+Array [
+  "não tomo",
+  " ",
+  "café",
+  " ",
+  "pois faz mal",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 4`] = `
+Array [
+  "",
+  "",
+  "Como fazer coisas ruins",
+  "",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 5`] = `
+Array [
+  "O biólogo recomenda este artigo",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 6`] = `
+Array [
+  "cafe",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 7`] = `
+Array [
+  "Ser banido é uma merda",
+]
+`;
@@ -0,0 +1,173 @@
+import createWordListRegExp from "./createWordListRegExp";
+
+const buildTester = (re: RegExp) => (str: string) => re.test(str);
+
+const buildSplitter = (re: RegExp) => (str: string) => str.split(re);
+
+describe("en-US", () => {
+  const re = createWordListRegExp("en-US", [
+    "bad",
+    "french fries",
+    "worse",
+    "jalapeño",
+    "1km",
+  ]);
+
+  const test = buildTester(re);
+
+  it("test words in the list", () => {
+    expect(test("bad")).toBeTruthy();
+    expect(test("worse")).toBeTruthy();
+    expect(test("1km")).toBeTruthy();
+  });
+
+  it("test repeated words in the list", () => {
+    expect(test("bad bad bad")).toBeTruthy();
+    expect(test("worse worse worse")).toBeTruthy();
+  });
+
+  it("test words not in the list", () => {
+    expect(test("fine")).toBeFalsy();
+    expect(test("ok")).toBeFalsy();
+  });
+
+  it("test words that end with a unicode character", () => {
+    expect(test("I have one jalapeño.")).toBeTruthy();
+    expect(test("I have two jalapeños.")).toBeFalsy();
+  });
+
+  it("test words in the list while being case insensitive", () => {
+    expect(test("Bad")).toBeTruthy();
+    expect(test("BAd")).toBeTruthy();
+    expect(test("BAD")).toBeTruthy();
+    expect(test("bAD")).toBeTruthy();
+    expect(test("baD")).toBeTruthy();
+    expect(test("bAd")).toBeTruthy();
+    expect(test("1KM")).toBeTruthy();
+  });
+
+  it("test multi-words", () => {
+    expect(test("french fries")).toBeTruthy();
+    expect(test("french!fries")).toBeTruthy();
+    expect(test("french.fries")).toBeTruthy();
+    expect(test("french?fries")).toBeTruthy();
+    expect(test("french¿fries")).toBeTruthy();
+    expect(test("french:fries")).toBeTruthy();
+    expect(test("french;fries")).toBeTruthy();
+  });
+
+  it("test words at the end of a sentence", () => {
+    expect(test("this sentence is bad.")).toBeTruthy();
+    expect(test("this sentence is worse.")).toBeTruthy();
+    expect(test("this sentence has french fries.")).toBeTruthy();
+    expect(test("this sentence has french!fries.")).toBeTruthy();
+    expect(test("this sentence has french.fries.")).toBeTruthy();
+    expect(test("this sentence has french?fries.")).toBeTruthy();
+    expect(test("this sentence has french¿fries.")).toBeTruthy();
+    expect(test("this sentence has french:fries.")).toBeTruthy();
+    expect(test("this sentence has french;fries.")).toBeTruthy();
+  });
+
+  it("test words at the start of a sentence", () => {
+    expect(test("bad is the start of the sentence.")).toBeTruthy();
+    expect(test("worse is the start of the sentence.")).toBeTruthy();
+    expect(test("french fries is the start of the sentence.")).toBeTruthy();
+  });
+
+  it("test does not preserve state", () => {
+    expect(test("bad 1")).toBeTruthy();
+    expect(test("bad 2")).toBeTruthy();
+    expect(test("more bad 3")).toBeTruthy();
+    expect(test("more bad 4")).toBeTruthy();
+  });
+
+  it("test repeated words", () => {
+    expect(test("This is bad bad, very bad")).toBeTruthy();
+  });
+
+  it("test does not match substrings", () => {
+    expect(test("baddd")).toBeFalsy();
+    expect(test("wwworse")).toBeFalsy();
+    expect(test("fffrench fries")).toBeFalsy();
+  });
+
+  it("test handles when there are numbers", () => {
+    expect(test("bad3")).toBeTruthy();
+    expect(test("3bad")).toBeTruthy();
+  });
+
+  const split = buildSplitter(re);
+
+  it("splits the words in a sentence correctly", () => {
+    expect(split("this sentence is bad.")).toMatchSnapshot();
+    expect(split("this sentence is worse.")).toMatchSnapshot();
+  });
+
+  it("splits words when there are repeat words", () => {
+    expect(split("This is bad bad, very BAD.")).toMatchSnapshot();
+  });
+
+  it("splits the words with unicode in a sentence correctly", () => {
+    expect(split("this sentence has one jalapeño.")).toMatchSnapshot();
+    expect(split("this sentence has many jalapeños.")).toMatchSnapshot();
+  });
+
+  it("splits the multi-words in a sentence correctly", () => {
+    expect(split("this sentence has french fries.")).toMatchSnapshot();
+    expect(split("this sentence has french;fries.")).toMatchSnapshot();
+    expect(split("this sentence has french!fries.")).toMatchSnapshot();
+    expect(split("this sentence has french.fries.")).toMatchSnapshot();
+    expect(split("this sentence has french?fries.")).toMatchSnapshot();
+    expect(split("this sentence has french¿fries.")).toMatchSnapshot();
+    expect(split("this sentence has french:fries.")).toMatchSnapshot();
+  });
+});
+
+describe("es", () => {
+  const re = createWordListRegExp("es", ["adónde vas", "tú"]);
+
+  const test = buildTester(re);
+
+  it("test words in the list", () => {
+    expect(test("Pablo, ¿adónde vas?")).toBeTruthy();
+    expect(test("Estoy cansado, ¿y tú?")).toBeTruthy();
+    expect(test("¿tú?")).toBeTruthy();
+  });
+});
+
+describe("pt-BR", () => {
+  const re = createWordListRegExp("pt-BR", [
+    "bi",
+    "outro",
+    "café",
+    "m e r d a",
+    "Como fazer coisas ruins",
+  ]);
+
+  const test = buildTester(re);
+
+  it("test words in the list", () => {
+    expect(test("biólogo se soletra com: bi ")).toBeTruthy();
+    expect(test("m.e.r.d.a")).toBeTruthy();
+    expect(test("não tomo café pois faz mal")).toBeTruthy();
+    expect(test("Como fazer coisas ruins")).toBeTruthy();
+  });
+
+  it("test words not in the list", () => {
+    expect(test("O biólogo recomenda este artigo")).toBeFalsy();
+    expect(test("cafe")).toBeFalsy();
+    expect(test("Ser banido é uma merda")).toBeFalsy();
+  });
+
+  const split = buildSplitter(re);
+
+  it("splits the words with unicode in a sentence correctly", () => {
+    expect(split("biólogo se soletra com: bi ")).toMatchSnapshot();
+    expect(split("m.e.r.d.a")).toMatchSnapshot();
+    expect(split("não tomo café pois faz mal")).toMatchSnapshot();
+    expect(split("Como fazer coisas ruins")).toMatchSnapshot();
+    expect(split("O biólogo recomenda este artigo")).toMatchSnapshot();
+    expect(split("cafe")).toMatchSnapshot();
+    expect(split("Ser banido é uma merda")).toMatchSnapshot();
+  });
+});
@@ -1,18 +1,20 @@
 import { defaults } from "lodash";
+import XRegExp from "xregexp";

 import { LanguageCode } from "coral-common/helpers";
 import { DeepPartial } from "coral-common/types";

 interface WordListRule {
-  split: string;
+  boundary: string;
  punctuation: string;
-  whitespace: string;
 }

 const DefaultWordListRule: WordListRule = {
-  split: "[^\\w]",
-  punctuation: '[\\s"?!.¿¡`:;]+',
-  whitespace: "\\s+",
+  // The following symbol, \p{L} refers to any letter class within unicode.
+  // Because we're adding the ^, we're also saying to exclude any from that set,
+  // leaving all non-word characters from unicode available for selection.
+  boundary: "[^\\p{L}]+",
+  punctuation: "[\\s\"'?!.,¿¡`:;]+",
 };

 const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
@@ -47,14 +49,12 @@ export default function createWordListRegExp(
    DefaultWordListRule
  );

-  const whitespace = new RegExp(rule.whitespace);
-
  // Split up the words from the list into a regex escaped string.
  const words = phrases
    .map(phrase =>
      phrase
        // Split each phrase by whitespace.
-        .split(whitespace)
+        .split(/\s/)
        // Escape each phrase, we don't expect any of them to contain regex.
        .map(word => escapeRegExp(word))
        // Rejoin to ensure that any variation of the word separated by a
@@ -64,13 +64,18 @@ export default function createWordListRegExp(
    // For each of these words, wrap a `|` or OR.
    .join("|");

-  // Wrap the pattern in split rules.
-  const pattern = `(^|${rule.split})(${words})($|${rule.split})`;
+  // Wrap the pattern in split rules. We want to match any word that either is
+  // at the start of a string, or a word boundary. The word must also either be
+  // at the end of the string or at another word boundary.
+  const pattern = `(^|${rule.boundary})(${words})($|${rule.boundary})`;

  try {
-    return new RegExp(pattern, "iu");
+    // Create the RegExp using xregexp to pre-process the pattern to generate
+    // one with the correct unicode ranges. Including A for "astral" unicode
+    // support for supporting higher character ranges.
+    return XRegExp(pattern, "iuA");
  } catch {
    // IE does not support unicode support, so we'll create one without.
-    return new RegExp(pattern, "i");
+    return XRegExp(pattern, "i");
  }
 }
@@ -12,4 +12,3 @@ export { default as isPromiseLike } from "./isPromiseLike";
 export { default as isPromise } from "./isPromise";
 export { default as startsWith } from "./startsWith";
 export { default as getOrigin } from "./getOrigin";
-export { default as createWordListRegExp } from "./createWordListRegExp";
@@ -1,82 +0,0 @@
-import {
-  Options,
-  WordList,
-} from "coral-server/services/comments/pipeline/wordList";
-
-describe("en-US", () => {
-  const list = new WordList();
-  const options: Options = {
-    id: "tenant_1",
-    locale: "en-US",
-    wordList: {
-      banned: [
-        "cookies",
-        "how to do bad things",
-        "how to do really bad things",
-        "s h i t",
-        "$hit",
-        "p**ch",
-        "p*ch",
-        "banned",
-        "ban",
-      ],
-      suspect: [],
-    },
-  };
-
-  describe("containsMatchingPhrase", () => {
-    it("does match on a word in the list", () => {
-      [
-        "how to do really bad things",
-        "what is cookies",
-        "cookies",
-        "COOKIES.",
-        "how to do bad things",
-        "How To do bad things!",
-        "How.To.do.bad.things!",
-        "This stuff is $hit!",
-        "This is a test.\nTo see if cookies are found, in the second line.",
-        "That's a p**ch!",
-        "Banned words should be detected",
-      ].forEach(word => {
-        expect(list.test(options, "banned", word)).toEqual(true);
-      });
-    });
-
-    it("does not match on a word not in the list", () => {
-      [
-        "how to",
-        "cookie",
-        "how to be a great person?",
-        "how to not do really bad things?",
-        "i have $100 dollars.",
-        "I have bad $ hit lling",
-        "That's a p***ch!",
-        "When bann is spelt wrong, it won't be caught.",
-      ].forEach(word => {
-        expect(list.test(options, "banned", word)).toEqual(false);
-      });
-    });
-
-    it("allows an empty list", () => {
-      expect(list.test(options, "banned", "test")).toEqual(false);
-    });
-  });
-
-  describe("containsMatchingPhraseMemoized", () => {
-    it("return true for all cases after memoizing the first result", () => {
-      [
-        "cookies 1",
-        "cookies 2",
-        "cookies 4",
-        "cookies 5",
-        "this is for cookies 6",
-        "this is for cookies 7",
-        "this is for cookies 8",
-        "this is for cookies 9",
-      ].forEach(word => {
-        expect(list.test(options, "banned", word)).toEqual(true);
-      });
-    });
-  });
-});
@@ -2,7 +2,7 @@ import ms from "ms";
 import now from "performance-now";

 import { LanguageCode } from "coral-common/helpers";
-import { createWordListRegExp } from "coral-common/utils";
+import createWordListRegExp from "coral-common/utils/createWordListRegExp";
 import logger from "coral-server/logger";
 import { Tenant } from "coral-server/models/tenant";

@@ -90,8 +90,8 @@ export class WordList {

    const startedAt = now();
    const result = list.test(testString);
-    logger.debug(
-      { tenantID: options.id, took: ms(now() - startedAt) },
+    logger.info(
+      { tenantID: options.id, listName, took: ms(now() - startedAt) },
      "word list phrase test complete"
    );