From 6711f09a79b06950c28df551c63276fc5ff8be04 Mon Sep 17 00:00:00 2001
From: Wyatt Johnson <wyattjoh@gmail.com>
Date: Mon, 6 Apr 2020 19:04:05 +0000
Subject: [PATCH] [CORL-1001] Wordlist Fixes (#2920)

* fix: improve wordlist highlighting and perf

* fix: updated tests

* fix: implmeneted new regexp lib/patterns

* fix: improve comment body css

* fix: take into account the tree shaking is disabled

See: https://github.com/webpack/webpack/issues/7094

Co-authored-by: Chi Vinh Le <vinh@vinh.tech>
Co-authored-by: Kim Gardner <kgardnr@gmail.com>
---
 package-lock.json                             |  31 ++-
 package.json                                  |   5 +-
 .../components/Comment/CommentContent.css     |  12 +-
 .../Comment/CommentContent.spec.tsx           |  21 ++
 .../components/Comment/CommentContent.tsx     |  96 ++++------
 .../CommentContent.spec.tsx.snap              |  21 +-
 .../components/ModerateCard/ModerateCard.tsx  |   8 +-
 .../ModerateCard/ModerateCardContainer.tsx    |  21 ++
 .../__snapshots__/ModerateCard.spec.tsx.snap  |   7 +
 .../client/admin/helpers/getPhrasesRegExp.ts  |  38 +++-
 src/core/client/admin/helpers/index.ts        |   1 +
 src/core/client/admin/helpers/markHTMLNode.ts |  58 ++++++
 .../MarkdownEditor/MarkdownEditor.css         |   4 +
 src/core/client/stream/shared/htmlContent.css |   8 +-
 .../createWordListRegExp.spec.ts.snap         | 179 ++++++++++++++++++
 .../common/utils/createWordListRegExp.spec.ts | 173 +++++++++++++++++
 src/core/common/utils/createWordListRegExp.ts |  29 +--
 src/core/common/utils/index.ts                |   1 -
 .../comments/pipeline/wordList.spec.ts        |  82 --------
 .../services/comments/pipeline/wordList.ts    |   6 +-
 20 files changed, 616 insertions(+), 185 deletions(-)
 create mode 100644 src/core/client/admin/helpers/markHTMLNode.ts
 create mode 100644 src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap
 create mode 100644 src/core/common/utils/createWordListRegExp.spec.ts
 delete mode 100644 src/core/server/services/comments/pipeline/wordList.spec.ts

diff --git a/package-lock.json b/package-lock.json
index 122acc43f..82b74c83a 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -3217,7 +3217,6 @@
       "version": "7.8.3",
       "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.8.3.tgz",
       "integrity": "sha512-lrIU4aVbmlM/wQPzhEvzvNJskKyYptuXb0fGC0lTQTupTOYtR2Vqbu6/jf8vTr4M8Wt1nIzxVrSvPI5qESa/xA==",
-      "dev": true,
       "requires": {
         "core-js-pure": "^3.0.0",
         "regenerator-runtime": "^0.13.2"
@@ -5760,6 +5759,12 @@
         "@types/node": "*"
       }
     },
+    "@types/xregexp": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/@types/xregexp/-/xregexp-4.3.0.tgz",
+      "integrity": "sha512-3gJTS9gt27pS7U9q5IVqo4YvKSlkf2ck8ish6etuDj6LIRxkL/2Y8RMUtK/QzvE1Yv2zwWV5yemI2BS0GGGFnA==",
+      "dev": true
+    },
     "@types/yargs": {
       "version": "12.0.12",
       "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-12.0.12.tgz",
@@ -10721,8 +10726,7 @@
     "core-js-pure": {
       "version": "3.6.4",
       "resolved": "https://registry.npmjs.org/core-js-pure/-/core-js-pure-3.6.4.tgz",
-      "integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw==",
-      "dev": true
+      "integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw=="
     },
     "core-util-is": {
       "version": "1.0.2",
@@ -32077,8 +32081,7 @@
     "regenerator-runtime": {
       "version": "0.13.3",
       "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.3.tgz",
-      "integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw==",
-      "dev": true
+      "integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw=="
     },
     "regenerator-transform": {
       "version": "0.13.4",
@@ -38138,6 +38141,14 @@
           "dev": true,
           "requires": {
             "xregexp": "4.0.0"
+          },
+          "dependencies": {
+            "xregexp": {
+              "version": "4.0.0",
+              "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
+              "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
+              "dev": true
+            }
           }
         },
         "del": {
@@ -39365,10 +39376,12 @@
       "integrity": "sha512-tGkGJkN8XqCod7OT+EvGYK5Z4SfDQGD30zAa58OcnAa0RRWgzUEK72tkXhsX1FZd+rgnhRxFtmO+ihkp8LHSkw=="
     },
     "xregexp": {
-      "version": "4.0.0",
-      "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
-      "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
-      "dev": true
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.3.0.tgz",
+      "integrity": "sha512-7jXDIFXh5yJ/orPn4SXjuVrWWoi4Cr8jfV1eHv9CixKSbU+jY4mxfrBwAuDvupPNKpMUY+FeIqsVw/JLT9+B8g==",
+      "requires": {
+        "@babel/runtime-corejs3": "^7.8.3"
+      }
     },
     "xtend": {
       "version": "4.0.1",
diff --git a/package.json b/package.json
index b9e41affd..b42b5e301 100644
--- a/package.json
+++ b/package.json
@@ -4,6 +4,7 @@
   "author": "The Coral Project",
   "homepage": "https://coralproject.net/",
   "sideEffects": [
+    "*.css.ts",
     "*.css"
   ],
   "repository": {
@@ -140,7 +141,8 @@
     "tsscmp": "^1.0.6",
     "url-regex": "^5.0.0",
     "uuid": "^3.3.3",
-    "verror": "^1.10.0"
+    "verror": "^1.10.0",
+    "xregexp": "^4.3.0"
   },
   "devDependencies": {
     "@babel/core": "^7.8.3",
@@ -240,6 +242,7 @@
     "@types/webpack-bundle-analyzer": "^2.13.1",
     "@types/webpack-dev-server": "^3.1.5",
     "@types/ws": "^5.1.2",
+    "@types/xregexp": "^4.3.0",
     "@typescript-eslint/eslint-plugin": "2.3.3",
     "@typescript-eslint/eslint-plugin-tslint": "2.3.3",
     "@typescript-eslint/parser": "2.3.3",
diff --git a/src/core/client/admin/components/Comment/CommentContent.css b/src/core/client/admin/components/Comment/CommentContent.css
index 285d85278..403921648 100644
--- a/src/core/client/admin/components/Comment/CommentContent.css
+++ b/src/core/client/admin/components/Comment/CommentContent.css
@@ -12,12 +12,12 @@ $comment-link-active: var(--v2-palette-primary-darkest);
   color: $comment-content;
   overflow-wrap: break-word;
 
-  & * bold,
-  & * strong {
+  b,
+  strong {
     font-weight: var(--v2-font-weight-primary-bold);
   }
-  & * italic,
-  & * em {
+  i,
+  em {
     font-style: italic;
   }
   blockquote {
@@ -50,3 +50,7 @@ $comment-link-active: var(--v2-palette-primary-darkest);
     }
   }
 }
+
+.highlight {
+  white-space: pre-wrap;
+}
diff --git a/src/core/client/admin/components/Comment/CommentContent.spec.tsx b/src/core/client/admin/components/Comment/CommentContent.spec.tsx
index f4dbb5d05..0dadae902 100644
--- a/src/core/client/admin/components/Comment/CommentContent.spec.tsx
+++ b/src/core/client/admin/components/Comment/CommentContent.spec.tsx
@@ -16,6 +16,7 @@ it("renders correctly", () => {
     },
     className: "custom",
     children: "Hello <b>Bob</b>, you bad guy",
+    highlight: true,
   };
   const renderer = createRenderer();
   renderer.render(<CommentContent {...props} />);
@@ -33,6 +34,26 @@ it("renders empty words correctly", () => {
     },
     className: "custom",
     children: "Hello <b>Bob</b>, you bad guy",
+    highlight: true,
+  };
+  const renderer = createRenderer();
+  renderer.render(<CommentContent {...props} />);
+  expect(renderer.getRenderOutput()).toMatchSnapshot();
+});
+
+it("renders correctly even if it has consecutive banned words on comments", () => {
+  const props: PropTypesOf<typeof CommentContent> = {
+    phrases: {
+      locale: "en-US",
+      wordList: {
+        suspect: ["worse"],
+        banned: ["bad"],
+      },
+    },
+    className: "custom",
+    children:
+      "This is a very long comment with bad words. Let's try bad and bad. Now bad bad.\nBad BAD bad.\n",
+    highlight: true,
   };
   const renderer = createRenderer();
   renderer.render(<CommentContent {...props} />);
diff --git a/src/core/client/admin/components/Comment/CommentContent.tsx b/src/core/client/admin/components/Comment/CommentContent.tsx
index a650124ff..933cb90dc 100644
--- a/src/core/client/admin/components/Comment/CommentContent.tsx
+++ b/src/core/client/admin/components/Comment/CommentContent.tsx
@@ -1,7 +1,12 @@
 import cn from "classnames";
 import React, { FunctionComponent, useMemo } from "react";
+import striptags from "striptags";
 
-import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
+import {
+  getPhrasesRegExp,
+  GetPhrasesRegExpOptions,
+  markHTMLNode,
+} from "coral-admin/helpers";
 import { createPurify } from "coral-common/utils/purify";
 
 import styles from "./CommentContent.css";
@@ -15,81 +20,60 @@ interface Props {
   className?: string;
   children: string | React.ReactElement;
   phrases: GetPhrasesRegExpOptions;
-}
-
-function escapeHTML(unsafe: string) {
-  return unsafe
-    .replace(/&/g, "&amp;")
-    .replace(/</g, "&lt;")
-    .replace(/>/g, "&gt;")
-    .replace(/"/g, "&quot;")
-    .replace(/'/g, "&#039;");
-}
-
-// markPhrasesHTML looks for `supsect` and `banned` words inside `text` given
-// the settings applied for the locale and highlights them by returning an HTML
-// string.
-function markPhrasesHTML(text: string, expression: RegExp) {
-  const tokens = text.split(expression);
-  if (tokens.length === 1) {
-    return text;
-  }
-  return tokens
-    .map((token, i) =>
-      // Using our Regexp patterns it returns tokens arranged this way
-      // [STRING_WITH_NO_MATCH, NEW_WORD_DELIMITER, MATCHED_WORD, ...].
-      // This pattern repeats throughout. Next line will mark MATCHED_WORD
-      // and escape all tokens.
-      i % 3 === 2 ? `<mark>${escapeHTML(token)}</mark>` : escapeHTML(token)
-    )
-    .join("");
-}
-
-// markHTMLNode manipulates the node by looking for #text nodes and adding markers
-// for `supsectWords` and `bannedWords`.
-function markHTMLNode(parentNode: Node, expression: RegExp) {
-  parentNode.childNodes.forEach(node => {
-    if (node.nodeName === "#text") {
-      const newContent = markPhrasesHTML(node.textContent!, expression);
-      if (newContent !== node.textContent) {
-        const newNode = document.createElement("span");
-        newNode.innerHTML = newContent;
-        parentNode.replaceChild(newNode, node);
-      }
-    } else {
-      markHTMLNode(node, expression);
-    }
-  });
+  highlight?: boolean;
 }
 
 const CommentContent: FunctionComponent<Props> = ({
   phrases,
   className,
   children,
+  highlight = false,
 }) => {
   // Cache the expression used via memo. This will reduce duplicate renders of
   // this comment content when the children change but the phrase configuration
   // does not change. The regExp is already cached on a deeper level
   // automatically, this is just lessening that impact further.
-  const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
+  const expression = useMemo(() => {
+    // If we aren't in highlight mode for this comment, don't even attempt to
+    // generate the expression.
+    if (!highlight) {
+      return null;
+    }
 
-  if (typeof children === "string") {
-    // We create a Shadow DOM Tree with the HTML body content and
-    // use it as a parser.
+    return getPhrasesRegExp(phrases);
+  }, [phrases, highlight]);
+
+  // Cache the parsed comment node. If the children cannot be parsed, this will
+  // be null.
+  const parsed = useMemo(() => {
+    if (typeof children !== "string") {
+      return null;
+    }
+
+    // Sanitize the input for display.
+    let html = purify.sanitize(children);
+    if (highlight) {
+      html = striptags(html, ["a"]);
+    }
+
+    // We create a Shadow DOM Tree with the HTML body content and use it as a
+    // parser.
     const node = document.createElement("div");
-    node.innerHTML = purify.sanitize(children);
+    node.innerHTML = html;
 
+    // If the expression is available, then mark the nodes.
     if (expression) {
-      // Then we traverse it recursively and manipulate it to highlight suspect words
-      // and banned words.
       markHTMLNode(node, expression);
     }
 
-    // Finally we render the content of the Shadow DOM Tree
+    return node;
+  }, [children, expression, highlight]);
+
+  if (parsed) {
     return (
       <div
-        className={cn(className, styles.root)}
-        dangerouslySetInnerHTML={{ __html: node.innerHTML }}
+        className={cn(className, styles.root, highlight && styles.highlight)}
+        dangerouslySetInnerHTML={{ __html: parsed.innerHTML }}
       />
     );
   }
diff --git a/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap b/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap
index 3a2a72948..5d54e8ada 100644
--- a/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap
+++ b/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap
@@ -2,10 +2,23 @@
 
 exports[`renders correctly 1`] = `
 <div
-  className="custom CommentContent-root"
+  className="custom CommentContent-root CommentContent-highlight"
   dangerouslySetInnerHTML={
     Object {
-      "__html": "Hello <b>Bob</b><span>, you <mark>bad</mark> guy</span>",
+      "__html": "<span>Hello Bob, you <mark>bad</mark> guy</span>",
+    }
+  }
+/>
+`;
+
+exports[`renders correctly even if it has consecutive banned words on comments 1`] = `
+<div
+  className="custom CommentContent-root CommentContent-highlight"
+  dangerouslySetInnerHTML={
+    Object {
+      "__html": "<span>This is a very long comment with <mark>bad</mark> words. Let's try <mark>bad</mark> and <mark>bad</mark>. Now <mark>bad</mark> bad.
+<mark>Bad</mark> BAD <mark>bad</mark>.
+</span>",
     }
   }
 />
@@ -13,10 +26,10 @@ exports[`renders correctly 1`] = `
 
 exports[`renders empty words correctly 1`] = `
 <div
-  className="custom CommentContent-root"
+  className="custom CommentContent-root CommentContent-highlight"
   dangerouslySetInnerHTML={
     Object {
-      "__html": "Hello <b>Bob</b>, you bad guy",
+      "__html": "Hello Bob, you bad guy",
     }
   }
 />
diff --git a/src/core/client/admin/components/ModerateCard/ModerateCard.tsx b/src/core/client/admin/components/ModerateCard/ModerateCard.tsx
index 3ac3b34fb..3b6cb493c 100644
--- a/src/core/client/admin/components/ModerateCard/ModerateCard.tsx
+++ b/src/core/client/admin/components/ModerateCard/ModerateCard.tsx
@@ -37,6 +37,7 @@ interface Props {
   username: string;
   createdAt: string;
   body: string;
+  highlight?: boolean;
   inReplyTo?: {
     id: string;
     username: string | null;
@@ -82,6 +83,7 @@ const ModerateCard: FunctionComponent<Props> = ({
   username,
   createdAt,
   body,
+  highlight = false,
   inReplyTo,
   comment,
   settings,
@@ -222,7 +224,11 @@ const ModerateCard: FunctionComponent<Props> = ({
             )}
           </div>
           <div className={styles.contentArea}>
-            <CommentContent phrases={phrases} className={styles.content}>
+            <CommentContent
+              highlight={highlight}
+              phrases={phrases}
+              className={styles.content}
+            >
               {commentBody}
             </CommentContent>
             <div className={styles.viewContext}>
diff --git a/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx b/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx
index fa2cff8f2..13699b7ab 100644
--- a/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx
+++ b/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx
@@ -215,6 +215,16 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
     },
     [comment]
   );
+
+  // Only highlight comments that have been flagged for containing a banned or
+  // suspect word.
+  const highlight = comment.revision
+    ? comment.revision.actionCounts.flag.reasons.COMMENT_DETECTED_BANNED_WORD +
+        comment.revision.actionCounts.flag.reasons
+          .COMMENT_DETECTED_SUSPECT_WORD >
+      0
+    : false;
+
   return (
     <>
       <FadeInTransition active={Boolean(comment.enteredLive)}>
@@ -227,6 +237,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
           }
           createdAt={comment.createdAt}
           body={comment.body!}
+          highlight={highlight}
           inReplyTo={comment.parent && comment.parent.author}
           comment={comment}
           settings={settings}
@@ -296,6 +307,16 @@ const enhanced = withFragmentContainer<Props>({
       statusLiveUpdated
       createdAt
       body
+      revision {
+        actionCounts {
+          flag {
+            reasons {
+              COMMENT_DETECTED_BANNED_WORD
+              COMMENT_DETECTED_SUSPECT_WORD
+            }
+          }
+        }
+      }
       tags {
         code
       }
diff --git a/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap b/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap
index 4db3819ce..3a4107f45 100644
--- a/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap
+++ b/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap
@@ -44,6 +44,7 @@ exports[`renders approved correctly 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
@@ -178,6 +179,7 @@ exports[`renders correctly 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
@@ -312,6 +314,7 @@ exports[`renders dangling correctly 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
@@ -446,6 +449,7 @@ exports[`renders rejected correctly 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
@@ -589,6 +593,7 @@ exports[`renders reply correctly 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
@@ -723,6 +728,7 @@ exports[`renders story info 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
@@ -895,6 +901,7 @@ exports[`renders tombstoned when comment is deleted 1`] = `
       >
         <CommentContent
           className="ModerateCard-content"
+          highlight={false}
           phrases={
             Object {
               "locale": "en-US",
diff --git a/src/core/client/admin/helpers/getPhrasesRegExp.ts b/src/core/client/admin/helpers/getPhrasesRegExp.ts
index 0d83e479c..df303534c 100644
--- a/src/core/client/admin/helpers/getPhrasesRegExp.ts
+++ b/src/core/client/admin/helpers/getPhrasesRegExp.ts
@@ -1,5 +1,7 @@
+import { lowerCase, uniqBy } from "lodash";
+
 import { LanguageCode } from "coral-common/helpers";
-import { createWordListRegExp } from "coral-common/utils";
+import createWordListRegExp from "coral-common/utils/createWordListRegExp";
 
 export interface GetPhrasesRegExpOptions {
   locale: string;
@@ -17,20 +19,35 @@ export function getPhrasesRegExp({
     return null;
   }
 
-  return createWordListRegExp(locale as LanguageCode, [...banned, ...suspect]);
+  // Because the banned and suspect word lists may sometimes overlap, we should
+  // make this list as short as possible before compiling it into a RegExp.
+  const phrases = uniqBy<string>([...banned, ...suspect], lowerCase);
+
+  // The locale is passed down to us from the Graph, we can cast it to a
+  // LanguageCode.
+  return createWordListRegExp(locale as LanguageCode, phrases);
 }
 
-// cache is used as a global validator to the cached RegExp used by the
+// Cache is used as a global validator to the cached RegExp used by the
 // application. We expect that generally, there is only ever one word list used
 // by the client at a time, so this ensures that we only re-create the word list
 // if we must.
-const cache = {
+interface Cache {
+  keys: {
+    locale: string;
+    suspect: ReadonlyArray<string>;
+    banned: ReadonlyArray<string>;
+  };
+  value: RegExp | null;
+}
+
+const cache: Cache = {
   keys: {
     locale: "",
-    suspect: [] as ReadonlyArray<string>,
-    banned: [] as ReadonlyArray<string>,
+    suspect: [],
+    banned: [],
   },
-  value: null as RegExp | null,
+  value: null,
 };
 
 export default function(options: GetPhrasesRegExpOptions) {
@@ -57,7 +74,12 @@ export default function(options: GetPhrasesRegExpOptions) {
 
   // If the cache is expired, or the value doesn't exist, regenerate it.
   if (expired) {
-    cache.value = getPhrasesRegExp(options);
+    try {
+      cache.value = getPhrasesRegExp(options);
+    } catch (err) {
+      window.console.error(err);
+      return null;
+    }
   }
 
   return cache.value;
diff --git a/src/core/client/admin/helpers/index.ts b/src/core/client/admin/helpers/index.ts
index 553f32f38..16e48a31f 100644
--- a/src/core/client/admin/helpers/index.ts
+++ b/src/core/client/admin/helpers/index.ts
@@ -3,3 +3,4 @@ export {
   default as getPhrasesRegExp,
   GetPhrasesRegExpOptions,
 } from "./getPhrasesRegExp";
+export { default as markHTMLNode } from "./markHTMLNode";
diff --git a/src/core/client/admin/helpers/markHTMLNode.ts b/src/core/client/admin/helpers/markHTMLNode.ts
new file mode 100644
index 000000000..e1ffd617b
--- /dev/null
+++ b/src/core/client/admin/helpers/markHTMLNode.ts
@@ -0,0 +1,58 @@
+// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
+// the settings applied for the locale and highlights them by returning an HTML
+// string.
+function markPhrasesHTML(text: string, expression: RegExp) {
+  const tokens = text.split(expression);
+
+  // If there were less than two matches, then there was no matched word
+  // associated with the passed in text.
+  if (tokens.length < 3) {
+    return null;
+  }
+
+  return tokens
+    .map((token, i) =>
+      // Using our Regexp patterns it returns tokens arranged this way:
+      //
+      //  - STRING_WITH_NO_MATCH
+      //  - NEW_WORD_DELIMITER
+      //  - MATCHED_WORD
+      //  - NEW_WORD_DELIMITER
+      //  - ...
+      //
+      // This pattern repeats throughout. Next line will mark MATCHED_WORD.
+      i % 4 === 2 ? "<mark>" + token + "</mark>" : token
+    )
+    .join("");
+}
+
+// markHTMLNode manipulates the node by looking for #text nodes and adding
+// markers.
+export default function markHTMLNode(parentNode: Node, expression: RegExp) {
+  parentNode.childNodes.forEach(node => {
+    // Anchor links are already marked by default, skip them now.
+    if (node.nodeName === "A") {
+      return;
+    }
+
+    // If the node isn't of text type then we can't mark it directly.
+    if (node.nodeName !== "#text") {
+      return markHTMLNode(node, expression);
+    }
+
+    // If the node doesn't have any text content, then we can't mark it either.
+    if (!node.textContent) {
+      return;
+    }
+
+    // We've encountered a text node with text content that isn't in an anchor
+    // link. We should try to mark and replace it's content.
+    const replacement = markPhrasesHTML(node.textContent, expression);
+    if (replacement) {
+      // Create the new span node to replace the old node with.
+      const newNode = document.createElement("span");
+      newNode.innerHTML = replacement;
+      parentNode.replaceChild(newNode, node);
+    }
+  });
+}
diff --git a/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css b/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css
index fdb176c6e..cfab5c0d4 100644
--- a/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css
+++ b/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css
@@ -3,6 +3,10 @@ $fullscreenZIndex: 10;
 
 .wrapper {
   @mixin bodyCopy;
+  i
+  em {
+    font-style: italic;
+  }
   b,
   strong {
     font-weight: var(--font-weight-medium);
diff --git a/src/core/client/stream/shared/htmlContent.css b/src/core/client/stream/shared/htmlContent.css
index 8fdee109f..86afc2c65 100644
--- a/src/core/client/stream/shared/htmlContent.css
+++ b/src/core/client/stream/shared/htmlContent.css
@@ -3,12 +3,12 @@
   color: var(--palette-text-dark);
   overflow-wrap: break-word;
 
-  & * bold,
-  & * strong {
+  b,
+  strong {
     font-weight: var(--font-weight-medium);
   }
-  & * italic,
-  & * em {
+  i
+  em {
     font-style: italic;
   }
   blockquote {
diff --git a/src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap b/src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap
new file mode 100644
index 000000000..84677322d
--- /dev/null
+++ b/src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap
@@ -0,0 +1,179 @@
+// Jest Snapshot v1, https://goo.gl/fbAQLP
+
+exports[`en-US splits the multi-words in a sentence correctly 1`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 2`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french;fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 3`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french!fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 4`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french.fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 5`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french?fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 6`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french¿fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the multi-words in a sentence correctly 7`] = `
+Array [
+  "this sentence has",
+  " ",
+  "french:fries",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words in a sentence correctly 1`] = `
+Array [
+  "this sentence is",
+  " ",
+  "bad",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words in a sentence correctly 2`] = `
+Array [
+  "this sentence is",
+  " ",
+  "worse",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words with unicode in a sentence correctly 1`] = `
+Array [
+  "this sentence has one",
+  " ",
+  "jalapeño",
+  ".",
+  "",
+]
+`;
+
+exports[`en-US splits the words with unicode in a sentence correctly 2`] = `
+Array [
+  "this sentence has many jalapeños.",
+]
+`;
+
+exports[`en-US splits words when there are repeat words 1`] = `
+Array [
+  "This is",
+  " ",
+  "bad",
+  " ",
+  "bad, very",
+  " ",
+  "BAD",
+  ".",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 1`] = `
+Array [
+  "biólogo se soletra com",
+  ": ",
+  "bi",
+  " ",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 2`] = `
+Array [
+  "",
+  "",
+  "m.e.r.d.a",
+  "",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 3`] = `
+Array [
+  "não tomo",
+  " ",
+  "café",
+  " ",
+  "pois faz mal",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 4`] = `
+Array [
+  "",
+  "",
+  "Como fazer coisas ruins",
+  "",
+  "",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 5`] = `
+Array [
+  "O biólogo recomenda este artigo",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 6`] = `
+Array [
+  "cafe",
+]
+`;
+
+exports[`pt-BR splits the words with unicode in a sentence correctly 7`] = `
+Array [
+  "Ser banido é uma merda",
+]
+`;
diff --git a/src/core/common/utils/createWordListRegExp.spec.ts b/src/core/common/utils/createWordListRegExp.spec.ts
new file mode 100644
index 000000000..174935ccc
--- /dev/null
+++ b/src/core/common/utils/createWordListRegExp.spec.ts
@@ -0,0 +1,173 @@
+import createWordListRegExp from "./createWordListRegExp";
+
+const buildTester = (re: RegExp) => (str: string) => re.test(str);
+
+const buildSplitter = (re: RegExp) => (str: string) => str.split(re);
+
+describe("en-US", () => {
+  const re = createWordListRegExp("en-US", [
+    "bad",
+    "french fries",
+    "worse",
+    "jalapeño",
+    "1km",
+  ]);
+
+  const test = buildTester(re);
+
+  it("test words in the list", () => {
+    expect(test("bad")).toBeTruthy();
+    expect(test("worse")).toBeTruthy();
+    expect(test("1km")).toBeTruthy();
+  });
+
+  it("test repeated words in the list", () => {
+    expect(test("bad bad bad")).toBeTruthy();
+    expect(test("worse worse worse")).toBeTruthy();
+  });
+
+  it("test words not in the list", () => {
+    expect(test("fine")).toBeFalsy();
+    expect(test("ok")).toBeFalsy();
+  });
+
+  it("test words that end with a unicode character", () => {
+    expect(test("I have one jalapeño.")).toBeTruthy();
+    expect(test("I have two jalapeños.")).toBeFalsy();
+  });
+
+  it("test words in the list while being case insensitive", () => {
+    expect(test("Bad")).toBeTruthy();
+    expect(test("BAd")).toBeTruthy();
+    expect(test("BAD")).toBeTruthy();
+    expect(test("bAD")).toBeTruthy();
+    expect(test("baD")).toBeTruthy();
+    expect(test("bAd")).toBeTruthy();
+    expect(test("1KM")).toBeTruthy();
+  });
+
+  it("test multi-words", () => {
+    expect(test("french fries")).toBeTruthy();
+    expect(test("french!fries")).toBeTruthy();
+    expect(test("french.fries")).toBeTruthy();
+    expect(test("french?fries")).toBeTruthy();
+    expect(test("french¿fries")).toBeTruthy();
+    expect(test("french:fries")).toBeTruthy();
+    expect(test("french;fries")).toBeTruthy();
+  });
+
+  it("test words at the end of a sentence", () => {
+    expect(test("this sentence is bad.")).toBeTruthy();
+    expect(test("this sentence is worse.")).toBeTruthy();
+    expect(test("this sentence has french fries.")).toBeTruthy();
+    expect(test("this sentence has french!fries.")).toBeTruthy();
+    expect(test("this sentence has french.fries.")).toBeTruthy();
+    expect(test("this sentence has french?fries.")).toBeTruthy();
+    expect(test("this sentence has french¿fries.")).toBeTruthy();
+    expect(test("this sentence has french:fries.")).toBeTruthy();
+    expect(test("this sentence has french;fries.")).toBeTruthy();
+  });
+
+  it("test words at the start of a sentence", () => {
+    expect(test("bad is the start of the sentence.")).toBeTruthy();
+    expect(test("worse is the start of the sentence.")).toBeTruthy();
+    expect(test("french fries is the start of the sentence.")).toBeTruthy();
+  });
+
+  it("test does not preserve state", () => {
+    expect(test("bad 1")).toBeTruthy();
+    expect(test("bad 2")).toBeTruthy();
+    expect(test("more bad 3")).toBeTruthy();
+    expect(test("more bad 4")).toBeTruthy();
+  });
+
+  it("test repeated words", () => {
+    expect(test("This is bad bad, very bad")).toBeTruthy();
+  });
+
+  it("test does not match substrings", () => {
+    expect(test("baddd")).toBeFalsy();
+    expect(test("wwworse")).toBeFalsy();
+    expect(test("fffrench fries")).toBeFalsy();
+  });
+
+  it("test handles when there are numbers", () => {
+    expect(test("bad3")).toBeTruthy();
+    expect(test("3bad")).toBeTruthy();
+  });
+
+  const split = buildSplitter(re);
+
+  it("splits the words in a sentence correctly", () => {
+    expect(split("this sentence is bad.")).toMatchSnapshot();
+    expect(split("this sentence is worse.")).toMatchSnapshot();
+  });
+
+  it("splits words when there are repeat words", () => {
+    expect(split("This is bad bad, very BAD.")).toMatchSnapshot();
+  });
+
+  it("splits the words with unicode in a sentence correctly", () => {
+    expect(split("this sentence has one jalapeño.")).toMatchSnapshot();
+    expect(split("this sentence has many jalapeños.")).toMatchSnapshot();
+  });
+
+  it("splits the multi-words in a sentence correctly", () => {
+    expect(split("this sentence has french fries.")).toMatchSnapshot();
+    expect(split("this sentence has french;fries.")).toMatchSnapshot();
+    expect(split("this sentence has french!fries.")).toMatchSnapshot();
+    expect(split("this sentence has french.fries.")).toMatchSnapshot();
+    expect(split("this sentence has french?fries.")).toMatchSnapshot();
+    expect(split("this sentence has french¿fries.")).toMatchSnapshot();
+    expect(split("this sentence has french:fries.")).toMatchSnapshot();
+  });
+});
+
+describe("es", () => {
+  const re = createWordListRegExp("es", ["adónde vas", "tú"]);
+
+  const test = buildTester(re);
+
+  it("test words in the list", () => {
+    expect(test("Pablo, ¿adónde vas?")).toBeTruthy();
+    expect(test("Estoy cansado, ¿y tú?")).toBeTruthy();
+    expect(test("¿tú?")).toBeTruthy();
+  });
+});
+
+describe("pt-BR", () => {
+  const re = createWordListRegExp("pt-BR", [
+    "bi",
+    "outro",
+    "café",
+    "m e r d a",
+    "Como fazer coisas ruins",
+  ]);
+
+  const test = buildTester(re);
+
+  it("test words in the list", () => {
+    expect(test("biólogo se soletra com: bi ")).toBeTruthy();
+    expect(test("m.e.r.d.a")).toBeTruthy();
+    expect(test("não tomo café pois faz mal")).toBeTruthy();
+    expect(test("Como fazer coisas ruins")).toBeTruthy();
+  });
+
+  it("test words not in the list", () => {
+    expect(test("O biólogo recomenda este artigo")).toBeFalsy();
+    expect(test("cafe")).toBeFalsy();
+    expect(test("Ser banido é uma merda")).toBeFalsy();
+  });
+
+  const split = buildSplitter(re);
+
+  it("splits the words with unicode in a sentence correctly", () => {
+    expect(split("biólogo se soletra com: bi ")).toMatchSnapshot();
+    expect(split("m.e.r.d.a")).toMatchSnapshot();
+    expect(split("não tomo café pois faz mal")).toMatchSnapshot();
+    expect(split("Como fazer coisas ruins")).toMatchSnapshot();
+    expect(split("O biólogo recomenda este artigo")).toMatchSnapshot();
+    expect(split("cafe")).toMatchSnapshot();
+    expect(split("Ser banido é uma merda")).toMatchSnapshot();
+  });
+});
diff --git a/src/core/common/utils/createWordListRegExp.ts b/src/core/common/utils/createWordListRegExp.ts
index e12144d2e..c6fed401e 100644
--- a/src/core/common/utils/createWordListRegExp.ts
+++ b/src/core/common/utils/createWordListRegExp.ts
@@ -1,18 +1,20 @@
 import { defaults } from "lodash";
+import XRegExp from "xregexp";
 
 import { LanguageCode } from "coral-common/helpers";
 import { DeepPartial } from "coral-common/types";
 
 interface WordListRule {
-  split: string;
+  boundary: string;
   punctuation: string;
-  whitespace: string;
 }
 
 const DefaultWordListRule: WordListRule = {
-  split: "[^\\w]",
-  punctuation: '[\\s"?!.¿¡`:;]+',
-  whitespace: "\\s+",
+  // The following symbol, \p{L} refers to any letter class within unicode.
+  // Because we're adding the ^, we're also saying to exclude any from that set,
+  // leaving all non-word characters from unicode available for selection.
+  boundary: "[^\\p{L}]+",
+  punctuation: "[\\s\"'?!.,¿¡`:;]+",
 };
 
 const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
@@ -47,14 +49,12 @@ export default function createWordListRegExp(
     DefaultWordListRule
   );
 
-  const whitespace = new RegExp(rule.whitespace);
-
   // Split up the words from the list into a regex escaped string.
   const words = phrases
     .map(phrase =>
       phrase
         // Split each phrase by whitespace.
-        .split(whitespace)
+        .split(/\s/)
         // Escape each phrase, we don't expect any of them to contain regex.
         .map(word => escapeRegExp(word))
         // Rejoin to ensure that any variation of the word separated by a
@@ -64,13 +64,18 @@ export default function createWordListRegExp(
     // For each of these words, wrap a `|` or OR.
     .join("|");
 
-  // Wrap the pattern in split rules.
-  const pattern = `(^|${rule.split})(${words})($|${rule.split})`;
+  // Wrap the pattern in split rules. We want to match any word that either is
+  // at the start of a string, or a word boundary. The word must also either be
+  // at the end of the string or at another word boundary.
+  const pattern = `(^|${rule.boundary})(${words})($|${rule.boundary})`;
 
   try {
-    return new RegExp(pattern, "iu");
+    // Create the RegExp using xregexp to pre-process the pattern to generate
+    // one with the correct unicode ranges. Including A for "astral" unicode
+    // support for supporting higher character ranges.
+    return XRegExp(pattern, "iuA");
   } catch {
     // IE does not support unicode support, so we'll create one without.
-    return new RegExp(pattern, "i");
+    return XRegExp(pattern, "i");
   }
 }
diff --git a/src/core/common/utils/index.ts b/src/core/common/utils/index.ts
index a296f0652..4dedb2a47 100644
--- a/src/core/common/utils/index.ts
+++ b/src/core/common/utils/index.ts
@@ -12,4 +12,3 @@ export { default as isPromiseLike } from "./isPromiseLike";
 export { default as isPromise } from "./isPromise";
 export { default as startsWith } from "./startsWith";
 export { default as getOrigin } from "./getOrigin";
-export { default as createWordListRegExp } from "./createWordListRegExp";
diff --git a/src/core/server/services/comments/pipeline/wordList.spec.ts b/src/core/server/services/comments/pipeline/wordList.spec.ts
deleted file mode 100644
index 227c8240b..000000000
--- a/src/core/server/services/comments/pipeline/wordList.spec.ts
+++ /dev/null
@@ -1,82 +0,0 @@
-import {
-  Options,
-  WordList,
-} from "coral-server/services/comments/pipeline/wordList";
-
-describe("en-US", () => {
-  const list = new WordList();
-  const options: Options = {
-    id: "tenant_1",
-    locale: "en-US",
-    wordList: {
-      banned: [
-        "cookies",
-        "how to do bad things",
-        "how to do really bad things",
-        "s h i t",
-        "$hit",
-        "p**ch",
-        "p*ch",
-        "banned",
-        "ban",
-      ],
-      suspect: [],
-    },
-  };
-
-  describe("containsMatchingPhrase", () => {
-    it("does match on a word in the list", () => {
-      [
-        "how to do really bad things",
-        "what is cookies",
-        "cookies",
-        "COOKIES.",
-        "how to do bad things",
-        "How To do bad things!",
-        "How.To.do.bad.things!",
-        "This stuff is $hit!",
-        "This is a test.\nTo see if cookies are found, in the second line.",
-        "That's a p**ch!",
-        "Banned words should be detected",
-      ].forEach(word => {
-        expect(list.test(options, "banned", word)).toEqual(true);
-      });
-    });
-
-    it("does not match on a word not in the list", () => {
-      [
-        "how to",
-        "cookie",
-        "how to be a great person?",
-        "how to not do really bad things?",
-        "i have $100 dollars.",
-        "I have bad $ hit lling",
-        "That's a p***ch!",
-        "When bann is spelt wrong, it won't be caught.",
-      ].forEach(word => {
-        expect(list.test(options, "banned", word)).toEqual(false);
-      });
-    });
-
-    it("allows an empty list", () => {
-      expect(list.test(options, "banned", "test")).toEqual(false);
-    });
-  });
-
-  describe("containsMatchingPhraseMemoized", () => {
-    it("return true for all cases after memoizing the first result", () => {
-      [
-        "cookies 1",
-        "cookies 2",
-        "cookies 4",
-        "cookies 5",
-        "this is for cookies 6",
-        "this is for cookies 7",
-        "this is for cookies 8",
-        "this is for cookies 9",
-      ].forEach(word => {
-        expect(list.test(options, "banned", word)).toEqual(true);
-      });
-    });
-  });
-});
diff --git a/src/core/server/services/comments/pipeline/wordList.ts b/src/core/server/services/comments/pipeline/wordList.ts
index 84393dd38..985144df6 100644
--- a/src/core/server/services/comments/pipeline/wordList.ts
+++ b/src/core/server/services/comments/pipeline/wordList.ts
@@ -2,7 +2,7 @@ import ms from "ms";
 import now from "performance-now";
 
 import { LanguageCode } from "coral-common/helpers";
-import { createWordListRegExp } from "coral-common/utils";
+import createWordListRegExp from "coral-common/utils/createWordListRegExp";
 import logger from "coral-server/logger";
 import { Tenant } from "coral-server/models/tenant";
 
@@ -90,8 +90,8 @@ export class WordList {
 
     const startedAt = now();
     const result = list.test(testString);
-    logger.debug(
-      { tenantID: options.id, took: ms(now() - startedAt) },
+    logger.info(
+      { tenantID: options.id, listName, took: ms(now() - startedAt) },
       "word list phrase test complete"
     );