From 6711f09a79b06950c28df551c63276fc5ff8be04 Mon Sep 17 00:00:00 2001 From: Wyatt Johnson Date: Mon, 6 Apr 2020 19:04:05 +0000 Subject: [PATCH] [CORL-1001] Wordlist Fixes (#2920) * fix: improve wordlist highlighting and perf * fix: updated tests * fix: implmeneted new regexp lib/patterns * fix: improve comment body css * fix: take into account the tree shaking is disabled See: https://github.com/webpack/webpack/issues/7094 Co-authored-by: Chi Vinh Le Co-authored-by: Kim Gardner --- package-lock.json | 31 ++- package.json | 5 +- .../components/Comment/CommentContent.css | 12 +- .../Comment/CommentContent.spec.tsx | 21 ++ .../components/Comment/CommentContent.tsx | 96 ++++------ .../CommentContent.spec.tsx.snap | 21 +- .../components/ModerateCard/ModerateCard.tsx | 8 +- .../ModerateCard/ModerateCardContainer.tsx | 21 ++ .../__snapshots__/ModerateCard.spec.tsx.snap | 7 + .../client/admin/helpers/getPhrasesRegExp.ts | 38 +++- src/core/client/admin/helpers/index.ts | 1 + src/core/client/admin/helpers/markHTMLNode.ts | 58 ++++++ .../MarkdownEditor/MarkdownEditor.css | 4 + src/core/client/stream/shared/htmlContent.css | 8 +- .../createWordListRegExp.spec.ts.snap | 179 ++++++++++++++++++ .../common/utils/createWordListRegExp.spec.ts | 173 +++++++++++++++++ src/core/common/utils/createWordListRegExp.ts | 29 +-- src/core/common/utils/index.ts | 1 - .../comments/pipeline/wordList.spec.ts | 82 -------- .../services/comments/pipeline/wordList.ts | 6 +- 20 files changed, 616 insertions(+), 185 deletions(-) create mode 100644 src/core/client/admin/helpers/markHTMLNode.ts create mode 100644 src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap create mode 100644 src/core/common/utils/createWordListRegExp.spec.ts delete mode 100644 src/core/server/services/comments/pipeline/wordList.spec.ts diff --git a/package-lock.json b/package-lock.json index 122acc43f..82b74c83a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3217,7 +3217,6 @@ "version": "7.8.3", "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.8.3.tgz", "integrity": "sha512-lrIU4aVbmlM/wQPzhEvzvNJskKyYptuXb0fGC0lTQTupTOYtR2Vqbu6/jf8vTr4M8Wt1nIzxVrSvPI5qESa/xA==", - "dev": true, "requires": { "core-js-pure": "^3.0.0", "regenerator-runtime": "^0.13.2" @@ -5760,6 +5759,12 @@ "@types/node": "*" } }, + "@types/xregexp": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/@types/xregexp/-/xregexp-4.3.0.tgz", + "integrity": "sha512-3gJTS9gt27pS7U9q5IVqo4YvKSlkf2ck8ish6etuDj6LIRxkL/2Y8RMUtK/QzvE1Yv2zwWV5yemI2BS0GGGFnA==", + "dev": true + }, "@types/yargs": { "version": "12.0.12", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-12.0.12.tgz", @@ -10721,8 +10726,7 @@ "core-js-pure": { "version": "3.6.4", "resolved": "https://registry.npmjs.org/core-js-pure/-/core-js-pure-3.6.4.tgz", - "integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw==", - "dev": true + "integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw==" }, "core-util-is": { "version": "1.0.2", @@ -32077,8 +32081,7 @@ "regenerator-runtime": { "version": "0.13.3", "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.3.tgz", - "integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw==", - "dev": true + "integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw==" }, "regenerator-transform": { "version": "0.13.4", @@ -38138,6 +38141,14 @@ "dev": true, "requires": { "xregexp": "4.0.0" + }, + "dependencies": { + "xregexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz", + "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==", + "dev": true + } } }, "del": { @@ -39365,10 +39376,12 @@ "integrity": "sha512-tGkGJkN8XqCod7OT+EvGYK5Z4SfDQGD30zAa58OcnAa0RRWgzUEK72tkXhsX1FZd+rgnhRxFtmO+ihkp8LHSkw==" }, "xregexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz", - "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==", - "dev": true + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.3.0.tgz", + "integrity": "sha512-7jXDIFXh5yJ/orPn4SXjuVrWWoi4Cr8jfV1eHv9CixKSbU+jY4mxfrBwAuDvupPNKpMUY+FeIqsVw/JLT9+B8g==", + "requires": { + "@babel/runtime-corejs3": "^7.8.3" + } }, "xtend": { "version": "4.0.1", diff --git a/package.json b/package.json index b9e41affd..b42b5e301 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,7 @@ "author": "The Coral Project", "homepage": "https://coralproject.net/", "sideEffects": [ + "*.css.ts", "*.css" ], "repository": { @@ -140,7 +141,8 @@ "tsscmp": "^1.0.6", "url-regex": "^5.0.0", "uuid": "^3.3.3", - "verror": "^1.10.0" + "verror": "^1.10.0", + "xregexp": "^4.3.0" }, "devDependencies": { "@babel/core": "^7.8.3", @@ -240,6 +242,7 @@ "@types/webpack-bundle-analyzer": "^2.13.1", "@types/webpack-dev-server": "^3.1.5", "@types/ws": "^5.1.2", + "@types/xregexp": "^4.3.0", "@typescript-eslint/eslint-plugin": "2.3.3", "@typescript-eslint/eslint-plugin-tslint": "2.3.3", "@typescript-eslint/parser": "2.3.3", diff --git a/src/core/client/admin/components/Comment/CommentContent.css b/src/core/client/admin/components/Comment/CommentContent.css index 285d85278..403921648 100644 --- a/src/core/client/admin/components/Comment/CommentContent.css +++ b/src/core/client/admin/components/Comment/CommentContent.css @@ -12,12 +12,12 @@ $comment-link-active: var(--v2-palette-primary-darkest); color: $comment-content; overflow-wrap: break-word; - & * bold, - & * strong { + b, + strong { font-weight: var(--v2-font-weight-primary-bold); } - & * italic, - & * em { + i, + em { font-style: italic; } blockquote { @@ -50,3 +50,7 @@ $comment-link-active: var(--v2-palette-primary-darkest); } } } + +.highlight { + white-space: pre-wrap; +} diff --git a/src/core/client/admin/components/Comment/CommentContent.spec.tsx b/src/core/client/admin/components/Comment/CommentContent.spec.tsx index f4dbb5d05..0dadae902 100644 --- a/src/core/client/admin/components/Comment/CommentContent.spec.tsx +++ b/src/core/client/admin/components/Comment/CommentContent.spec.tsx @@ -16,6 +16,7 @@ it("renders correctly", () => { }, className: "custom", children: "Hello Bob, you bad guy", + highlight: true, }; const renderer = createRenderer(); renderer.render(); @@ -33,6 +34,26 @@ it("renders empty words correctly", () => { }, className: "custom", children: "Hello Bob, you bad guy", + highlight: true, + }; + const renderer = createRenderer(); + renderer.render(); + expect(renderer.getRenderOutput()).toMatchSnapshot(); +}); + +it("renders correctly even if it has consecutive banned words on comments", () => { + const props: PropTypesOf = { + phrases: { + locale: "en-US", + wordList: { + suspect: ["worse"], + banned: ["bad"], + }, + }, + className: "custom", + children: + "This is a very long comment with bad words. Let's try bad and bad. Now bad bad.\nBad BAD bad.\n", + highlight: true, }; const renderer = createRenderer(); renderer.render(); diff --git a/src/core/client/admin/components/Comment/CommentContent.tsx b/src/core/client/admin/components/Comment/CommentContent.tsx index a650124ff..933cb90dc 100644 --- a/src/core/client/admin/components/Comment/CommentContent.tsx +++ b/src/core/client/admin/components/Comment/CommentContent.tsx @@ -1,7 +1,12 @@ import cn from "classnames"; import React, { FunctionComponent, useMemo } from "react"; +import striptags from "striptags"; -import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers"; +import { + getPhrasesRegExp, + GetPhrasesRegExpOptions, + markHTMLNode, +} from "coral-admin/helpers"; import { createPurify } from "coral-common/utils/purify"; import styles from "./CommentContent.css"; @@ -15,81 +20,60 @@ interface Props { className?: string; children: string | React.ReactElement; phrases: GetPhrasesRegExpOptions; -} - -function escapeHTML(unsafe: string) { - return unsafe - .replace(/&/g, "&") - .replace(//g, ">") - .replace(/"/g, """) - .replace(/'/g, "'"); -} - -// markPhrasesHTML looks for `supsect` and `banned` words inside `text` given -// the settings applied for the locale and highlights them by returning an HTML -// string. -function markPhrasesHTML(text: string, expression: RegExp) { - const tokens = text.split(expression); - if (tokens.length === 1) { - return text; - } - return tokens - .map((token, i) => - // Using our Regexp patterns it returns tokens arranged this way - // [STRING_WITH_NO_MATCH, NEW_WORD_DELIMITER, MATCHED_WORD, ...]. - // This pattern repeats throughout. Next line will mark MATCHED_WORD - // and escape all tokens. - i % 3 === 2 ? `${escapeHTML(token)}` : escapeHTML(token) - ) - .join(""); -} - -// markHTMLNode manipulates the node by looking for #text nodes and adding markers -// for `supsectWords` and `bannedWords`. -function markHTMLNode(parentNode: Node, expression: RegExp) { - parentNode.childNodes.forEach(node => { - if (node.nodeName === "#text") { - const newContent = markPhrasesHTML(node.textContent!, expression); - if (newContent !== node.textContent) { - const newNode = document.createElement("span"); - newNode.innerHTML = newContent; - parentNode.replaceChild(newNode, node); - } - } else { - markHTMLNode(node, expression); - } - }); + highlight?: boolean; } const CommentContent: FunctionComponent = ({ phrases, className, children, + highlight = false, }) => { // Cache the expression used via memo. This will reduce duplicate renders of // this comment content when the children change but the phrase configuration // does not change. The regExp is already cached on a deeper level // automatically, this is just lessening that impact further. - const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]); + const expression = useMemo(() => { + // If we aren't in highlight mode for this comment, don't even attempt to + // generate the expression. + if (!highlight) { + return null; + } - if (typeof children === "string") { - // We create a Shadow DOM Tree with the HTML body content and - // use it as a parser. + return getPhrasesRegExp(phrases); + }, [phrases, highlight]); + + // Cache the parsed comment node. If the children cannot be parsed, this will + // be null. + const parsed = useMemo(() => { + if (typeof children !== "string") { + return null; + } + + // Sanitize the input for display. + let html = purify.sanitize(children); + if (highlight) { + html = striptags(html, ["a"]); + } + + // We create a Shadow DOM Tree with the HTML body content and use it as a + // parser. const node = document.createElement("div"); - node.innerHTML = purify.sanitize(children); + node.innerHTML = html; + // If the expression is available, then mark the nodes. if (expression) { - // Then we traverse it recursively and manipulate it to highlight suspect words - // and banned words. markHTMLNode(node, expression); } - // Finally we render the content of the Shadow DOM Tree + return node; + }, [children, expression, highlight]); + + if (parsed) { return (
); } diff --git a/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap b/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap index 3a2a72948..5d54e8ada 100644 --- a/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap +++ b/src/core/client/admin/components/Comment/__snapshots__/CommentContent.spec.tsx.snap @@ -2,10 +2,23 @@ exports[`renders correctly 1`] = `
Bob, you bad guy", + "__html": "Hello Bob, you bad guy", + } + } +/> +`; + +exports[`renders correctly even if it has consecutive banned words on comments 1`] = ` +
This is a very long comment with bad words. Let's try bad and bad. Now bad bad. +Bad BAD bad. +", } } /> @@ -13,10 +26,10 @@ exports[`renders correctly 1`] = ` exports[`renders empty words correctly 1`] = `
Bob, you bad guy", + "__html": "Hello Bob, you bad guy", } } /> diff --git a/src/core/client/admin/components/ModerateCard/ModerateCard.tsx b/src/core/client/admin/components/ModerateCard/ModerateCard.tsx index 3ac3b34fb..3b6cb493c 100644 --- a/src/core/client/admin/components/ModerateCard/ModerateCard.tsx +++ b/src/core/client/admin/components/ModerateCard/ModerateCard.tsx @@ -37,6 +37,7 @@ interface Props { username: string; createdAt: string; body: string; + highlight?: boolean; inReplyTo?: { id: string; username: string | null; @@ -82,6 +83,7 @@ const ModerateCard: FunctionComponent = ({ username, createdAt, body, + highlight = false, inReplyTo, comment, settings, @@ -222,7 +224,11 @@ const ModerateCard: FunctionComponent = ({ )}
- + {commentBody}
diff --git a/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx b/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx index fa2cff8f2..13699b7ab 100644 --- a/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx +++ b/src/core/client/admin/components/ModerateCard/ModerateCardContainer.tsx @@ -215,6 +215,16 @@ const ModerateCardContainer: FunctionComponent = ({ }, [comment] ); + + // Only highlight comments that have been flagged for containing a banned or + // suspect word. + const highlight = comment.revision + ? comment.revision.actionCounts.flag.reasons.COMMENT_DETECTED_BANNED_WORD + + comment.revision.actionCounts.flag.reasons + .COMMENT_DETECTED_SUSPECT_WORD > + 0 + : false; + return ( <> @@ -227,6 +237,7 @@ const ModerateCardContainer: FunctionComponent = ({ } createdAt={comment.createdAt} body={comment.body!} + highlight={highlight} inReplyTo={comment.parent && comment.parent.author} comment={comment} settings={settings} @@ -296,6 +307,16 @@ const enhanced = withFragmentContainer({ statusLiveUpdated createdAt body + revision { + actionCounts { + flag { + reasons { + COMMENT_DETECTED_BANNED_WORD + COMMENT_DETECTED_SUSPECT_WORD + } + } + } + } tags { code } diff --git a/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap b/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap index 4db3819ce..3a4107f45 100644 --- a/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap +++ b/src/core/client/admin/components/ModerateCard/__snapshots__/ModerateCard.spec.tsx.snap @@ -44,6 +44,7 @@ exports[`renders approved correctly 1`] = ` > ([...banned, ...suspect], lowerCase); + + // The locale is passed down to us from the Graph, we can cast it to a + // LanguageCode. + return createWordListRegExp(locale as LanguageCode, phrases); } -// cache is used as a global validator to the cached RegExp used by the +// Cache is used as a global validator to the cached RegExp used by the // application. We expect that generally, there is only ever one word list used // by the client at a time, so this ensures that we only re-create the word list // if we must. -const cache = { +interface Cache { + keys: { + locale: string; + suspect: ReadonlyArray; + banned: ReadonlyArray; + }; + value: RegExp | null; +} + +const cache: Cache = { keys: { locale: "", - suspect: [] as ReadonlyArray, - banned: [] as ReadonlyArray, + suspect: [], + banned: [], }, - value: null as RegExp | null, + value: null, }; export default function(options: GetPhrasesRegExpOptions) { @@ -57,7 +74,12 @@ export default function(options: GetPhrasesRegExpOptions) { // If the cache is expired, or the value doesn't exist, regenerate it. if (expired) { - cache.value = getPhrasesRegExp(options); + try { + cache.value = getPhrasesRegExp(options); + } catch (err) { + window.console.error(err); + return null; + } } return cache.value; diff --git a/src/core/client/admin/helpers/index.ts b/src/core/client/admin/helpers/index.ts index 553f32f38..16e48a31f 100644 --- a/src/core/client/admin/helpers/index.ts +++ b/src/core/client/admin/helpers/index.ts @@ -3,3 +3,4 @@ export { default as getPhrasesRegExp, GetPhrasesRegExpOptions, } from "./getPhrasesRegExp"; +export { default as markHTMLNode } from "./markHTMLNode"; diff --git a/src/core/client/admin/helpers/markHTMLNode.ts b/src/core/client/admin/helpers/markHTMLNode.ts new file mode 100644 index 000000000..e1ffd617b --- /dev/null +++ b/src/core/client/admin/helpers/markHTMLNode.ts @@ -0,0 +1,58 @@ +// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given +// the settings applied for the locale and highlights them by returning an HTML +// string. +function markPhrasesHTML(text: string, expression: RegExp) { + const tokens = text.split(expression); + + // If there were less than two matches, then there was no matched word + // associated with the passed in text. + if (tokens.length < 3) { + return null; + } + + return tokens + .map((token, i) => + // Using our Regexp patterns it returns tokens arranged this way: + // + // - STRING_WITH_NO_MATCH + // - NEW_WORD_DELIMITER + // - MATCHED_WORD + // - NEW_WORD_DELIMITER + // - ... + // + // This pattern repeats throughout. Next line will mark MATCHED_WORD. + i % 4 === 2 ? "" + token + "" : token + ) + .join(""); +} + +// markHTMLNode manipulates the node by looking for #text nodes and adding +// markers. +export default function markHTMLNode(parentNode: Node, expression: RegExp) { + parentNode.childNodes.forEach(node => { + // Anchor links are already marked by default, skip them now. + if (node.nodeName === "A") { + return; + } + + // If the node isn't of text type then we can't mark it directly. + if (node.nodeName !== "#text") { + return markHTMLNode(node, expression); + } + + // If the node doesn't have any text content, then we can't mark it either. + if (!node.textContent) { + return; + } + + // We've encountered a text node with text content that isn't in an anchor + // link. We should try to mark and replace it's content. + const replacement = markPhrasesHTML(node.textContent, expression); + if (replacement) { + // Create the new span node to replace the old node with. + const newNode = document.createElement("span"); + newNode.innerHTML = replacement; + parentNode.replaceChild(newNode, node); + } + }); +} diff --git a/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css b/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css index fdb176c6e..cfab5c0d4 100644 --- a/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css +++ b/src/core/client/framework/components/loadables/MarkdownEditor/MarkdownEditor.css @@ -3,6 +3,10 @@ $fullscreenZIndex: 10; .wrapper { @mixin bodyCopy; + i + em { + font-style: italic; + } b, strong { font-weight: var(--font-weight-medium); diff --git a/src/core/client/stream/shared/htmlContent.css b/src/core/client/stream/shared/htmlContent.css index 8fdee109f..86afc2c65 100644 --- a/src/core/client/stream/shared/htmlContent.css +++ b/src/core/client/stream/shared/htmlContent.css @@ -3,12 +3,12 @@ color: var(--palette-text-dark); overflow-wrap: break-word; - & * bold, - & * strong { + b, + strong { font-weight: var(--font-weight-medium); } - & * italic, - & * em { + i + em { font-style: italic; } blockquote { diff --git a/src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap b/src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap new file mode 100644 index 000000000..84677322d --- /dev/null +++ b/src/core/common/utils/__snapshots__/createWordListRegExp.spec.ts.snap @@ -0,0 +1,179 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`en-US splits the multi-words in a sentence correctly 1`] = ` +Array [ + "this sentence has", + " ", + "french fries", + ".", + "", +] +`; + +exports[`en-US splits the multi-words in a sentence correctly 2`] = ` +Array [ + "this sentence has", + " ", + "french;fries", + ".", + "", +] +`; + +exports[`en-US splits the multi-words in a sentence correctly 3`] = ` +Array [ + "this sentence has", + " ", + "french!fries", + ".", + "", +] +`; + +exports[`en-US splits the multi-words in a sentence correctly 4`] = ` +Array [ + "this sentence has", + " ", + "french.fries", + ".", + "", +] +`; + +exports[`en-US splits the multi-words in a sentence correctly 5`] = ` +Array [ + "this sentence has", + " ", + "french?fries", + ".", + "", +] +`; + +exports[`en-US splits the multi-words in a sentence correctly 6`] = ` +Array [ + "this sentence has", + " ", + "french¿fries", + ".", + "", +] +`; + +exports[`en-US splits the multi-words in a sentence correctly 7`] = ` +Array [ + "this sentence has", + " ", + "french:fries", + ".", + "", +] +`; + +exports[`en-US splits the words in a sentence correctly 1`] = ` +Array [ + "this sentence is", + " ", + "bad", + ".", + "", +] +`; + +exports[`en-US splits the words in a sentence correctly 2`] = ` +Array [ + "this sentence is", + " ", + "worse", + ".", + "", +] +`; + +exports[`en-US splits the words with unicode in a sentence correctly 1`] = ` +Array [ + "this sentence has one", + " ", + "jalapeño", + ".", + "", +] +`; + +exports[`en-US splits the words with unicode in a sentence correctly 2`] = ` +Array [ + "this sentence has many jalapeños.", +] +`; + +exports[`en-US splits words when there are repeat words 1`] = ` +Array [ + "This is", + " ", + "bad", + " ", + "bad, very", + " ", + "BAD", + ".", + "", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 1`] = ` +Array [ + "biólogo se soletra com", + ": ", + "bi", + " ", + "", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 2`] = ` +Array [ + "", + "", + "m.e.r.d.a", + "", + "", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 3`] = ` +Array [ + "não tomo", + " ", + "café", + " ", + "pois faz mal", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 4`] = ` +Array [ + "", + "", + "Como fazer coisas ruins", + "", + "", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 5`] = ` +Array [ + "O biólogo recomenda este artigo", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 6`] = ` +Array [ + "cafe", +] +`; + +exports[`pt-BR splits the words with unicode in a sentence correctly 7`] = ` +Array [ + "Ser banido é uma merda", +] +`; diff --git a/src/core/common/utils/createWordListRegExp.spec.ts b/src/core/common/utils/createWordListRegExp.spec.ts new file mode 100644 index 000000000..174935ccc --- /dev/null +++ b/src/core/common/utils/createWordListRegExp.spec.ts @@ -0,0 +1,173 @@ +import createWordListRegExp from "./createWordListRegExp"; + +const buildTester = (re: RegExp) => (str: string) => re.test(str); + +const buildSplitter = (re: RegExp) => (str: string) => str.split(re); + +describe("en-US", () => { + const re = createWordListRegExp("en-US", [ + "bad", + "french fries", + "worse", + "jalapeño", + "1km", + ]); + + const test = buildTester(re); + + it("test words in the list", () => { + expect(test("bad")).toBeTruthy(); + expect(test("worse")).toBeTruthy(); + expect(test("1km")).toBeTruthy(); + }); + + it("test repeated words in the list", () => { + expect(test("bad bad bad")).toBeTruthy(); + expect(test("worse worse worse")).toBeTruthy(); + }); + + it("test words not in the list", () => { + expect(test("fine")).toBeFalsy(); + expect(test("ok")).toBeFalsy(); + }); + + it("test words that end with a unicode character", () => { + expect(test("I have one jalapeño.")).toBeTruthy(); + expect(test("I have two jalapeños.")).toBeFalsy(); + }); + + it("test words in the list while being case insensitive", () => { + expect(test("Bad")).toBeTruthy(); + expect(test("BAd")).toBeTruthy(); + expect(test("BAD")).toBeTruthy(); + expect(test("bAD")).toBeTruthy(); + expect(test("baD")).toBeTruthy(); + expect(test("bAd")).toBeTruthy(); + expect(test("1KM")).toBeTruthy(); + }); + + it("test multi-words", () => { + expect(test("french fries")).toBeTruthy(); + expect(test("french!fries")).toBeTruthy(); + expect(test("french.fries")).toBeTruthy(); + expect(test("french?fries")).toBeTruthy(); + expect(test("french¿fries")).toBeTruthy(); + expect(test("french:fries")).toBeTruthy(); + expect(test("french;fries")).toBeTruthy(); + }); + + it("test words at the end of a sentence", () => { + expect(test("this sentence is bad.")).toBeTruthy(); + expect(test("this sentence is worse.")).toBeTruthy(); + expect(test("this sentence has french fries.")).toBeTruthy(); + expect(test("this sentence has french!fries.")).toBeTruthy(); + expect(test("this sentence has french.fries.")).toBeTruthy(); + expect(test("this sentence has french?fries.")).toBeTruthy(); + expect(test("this sentence has french¿fries.")).toBeTruthy(); + expect(test("this sentence has french:fries.")).toBeTruthy(); + expect(test("this sentence has french;fries.")).toBeTruthy(); + }); + + it("test words at the start of a sentence", () => { + expect(test("bad is the start of the sentence.")).toBeTruthy(); + expect(test("worse is the start of the sentence.")).toBeTruthy(); + expect(test("french fries is the start of the sentence.")).toBeTruthy(); + }); + + it("test does not preserve state", () => { + expect(test("bad 1")).toBeTruthy(); + expect(test("bad 2")).toBeTruthy(); + expect(test("more bad 3")).toBeTruthy(); + expect(test("more bad 4")).toBeTruthy(); + }); + + it("test repeated words", () => { + expect(test("This is bad bad, very bad")).toBeTruthy(); + }); + + it("test does not match substrings", () => { + expect(test("baddd")).toBeFalsy(); + expect(test("wwworse")).toBeFalsy(); + expect(test("fffrench fries")).toBeFalsy(); + }); + + it("test handles when there are numbers", () => { + expect(test("bad3")).toBeTruthy(); + expect(test("3bad")).toBeTruthy(); + }); + + const split = buildSplitter(re); + + it("splits the words in a sentence correctly", () => { + expect(split("this sentence is bad.")).toMatchSnapshot(); + expect(split("this sentence is worse.")).toMatchSnapshot(); + }); + + it("splits words when there are repeat words", () => { + expect(split("This is bad bad, very BAD.")).toMatchSnapshot(); + }); + + it("splits the words with unicode in a sentence correctly", () => { + expect(split("this sentence has one jalapeño.")).toMatchSnapshot(); + expect(split("this sentence has many jalapeños.")).toMatchSnapshot(); + }); + + it("splits the multi-words in a sentence correctly", () => { + expect(split("this sentence has french fries.")).toMatchSnapshot(); + expect(split("this sentence has french;fries.")).toMatchSnapshot(); + expect(split("this sentence has french!fries.")).toMatchSnapshot(); + expect(split("this sentence has french.fries.")).toMatchSnapshot(); + expect(split("this sentence has french?fries.")).toMatchSnapshot(); + expect(split("this sentence has french¿fries.")).toMatchSnapshot(); + expect(split("this sentence has french:fries.")).toMatchSnapshot(); + }); +}); + +describe("es", () => { + const re = createWordListRegExp("es", ["adónde vas", "tú"]); + + const test = buildTester(re); + + it("test words in the list", () => { + expect(test("Pablo, ¿adónde vas?")).toBeTruthy(); + expect(test("Estoy cansado, ¿y tú?")).toBeTruthy(); + expect(test("¿tú?")).toBeTruthy(); + }); +}); + +describe("pt-BR", () => { + const re = createWordListRegExp("pt-BR", [ + "bi", + "outro", + "café", + "m e r d a", + "Como fazer coisas ruins", + ]); + + const test = buildTester(re); + + it("test words in the list", () => { + expect(test("biólogo se soletra com: bi ")).toBeTruthy(); + expect(test("m.e.r.d.a")).toBeTruthy(); + expect(test("não tomo café pois faz mal")).toBeTruthy(); + expect(test("Como fazer coisas ruins")).toBeTruthy(); + }); + + it("test words not in the list", () => { + expect(test("O biólogo recomenda este artigo")).toBeFalsy(); + expect(test("cafe")).toBeFalsy(); + expect(test("Ser banido é uma merda")).toBeFalsy(); + }); + + const split = buildSplitter(re); + + it("splits the words with unicode in a sentence correctly", () => { + expect(split("biólogo se soletra com: bi ")).toMatchSnapshot(); + expect(split("m.e.r.d.a")).toMatchSnapshot(); + expect(split("não tomo café pois faz mal")).toMatchSnapshot(); + expect(split("Como fazer coisas ruins")).toMatchSnapshot(); + expect(split("O biólogo recomenda este artigo")).toMatchSnapshot(); + expect(split("cafe")).toMatchSnapshot(); + expect(split("Ser banido é uma merda")).toMatchSnapshot(); + }); +}); diff --git a/src/core/common/utils/createWordListRegExp.ts b/src/core/common/utils/createWordListRegExp.ts index e12144d2e..c6fed401e 100644 --- a/src/core/common/utils/createWordListRegExp.ts +++ b/src/core/common/utils/createWordListRegExp.ts @@ -1,18 +1,20 @@ import { defaults } from "lodash"; +import XRegExp from "xregexp"; import { LanguageCode } from "coral-common/helpers"; import { DeepPartial } from "coral-common/types"; interface WordListRule { - split: string; + boundary: string; punctuation: string; - whitespace: string; } const DefaultWordListRule: WordListRule = { - split: "[^\\w]", - punctuation: '[\\s"?!.¿¡`:;]+', - whitespace: "\\s+", + // The following symbol, \p{L} refers to any letter class within unicode. + // Because we're adding the ^, we're also saying to exclude any from that set, + // leaving all non-word characters from unicode available for selection. + boundary: "[^\\p{L}]+", + punctuation: "[\\s\"'?!.,¿¡`:;]+", }; const WordListRules: DeepPartial> = { @@ -47,14 +49,12 @@ export default function createWordListRegExp( DefaultWordListRule ); - const whitespace = new RegExp(rule.whitespace); - // Split up the words from the list into a regex escaped string. const words = phrases .map(phrase => phrase // Split each phrase by whitespace. - .split(whitespace) + .split(/\s/) // Escape each phrase, we don't expect any of them to contain regex. .map(word => escapeRegExp(word)) // Rejoin to ensure that any variation of the word separated by a @@ -64,13 +64,18 @@ export default function createWordListRegExp( // For each of these words, wrap a `|` or OR. .join("|"); - // Wrap the pattern in split rules. - const pattern = `(^|${rule.split})(${words})($|${rule.split})`; + // Wrap the pattern in split rules. We want to match any word that either is + // at the start of a string, or a word boundary. The word must also either be + // at the end of the string or at another word boundary. + const pattern = `(^|${rule.boundary})(${words})($|${rule.boundary})`; try { - return new RegExp(pattern, "iu"); + // Create the RegExp using xregexp to pre-process the pattern to generate + // one with the correct unicode ranges. Including A for "astral" unicode + // support for supporting higher character ranges. + return XRegExp(pattern, "iuA"); } catch { // IE does not support unicode support, so we'll create one without. - return new RegExp(pattern, "i"); + return XRegExp(pattern, "i"); } } diff --git a/src/core/common/utils/index.ts b/src/core/common/utils/index.ts index a296f0652..4dedb2a47 100644 --- a/src/core/common/utils/index.ts +++ b/src/core/common/utils/index.ts @@ -12,4 +12,3 @@ export { default as isPromiseLike } from "./isPromiseLike"; export { default as isPromise } from "./isPromise"; export { default as startsWith } from "./startsWith"; export { default as getOrigin } from "./getOrigin"; -export { default as createWordListRegExp } from "./createWordListRegExp"; diff --git a/src/core/server/services/comments/pipeline/wordList.spec.ts b/src/core/server/services/comments/pipeline/wordList.spec.ts deleted file mode 100644 index 227c8240b..000000000 --- a/src/core/server/services/comments/pipeline/wordList.spec.ts +++ /dev/null @@ -1,82 +0,0 @@ -import { - Options, - WordList, -} from "coral-server/services/comments/pipeline/wordList"; - -describe("en-US", () => { - const list = new WordList(); - const options: Options = { - id: "tenant_1", - locale: "en-US", - wordList: { - banned: [ - "cookies", - "how to do bad things", - "how to do really bad things", - "s h i t", - "$hit", - "p**ch", - "p*ch", - "banned", - "ban", - ], - suspect: [], - }, - }; - - describe("containsMatchingPhrase", () => { - it("does match on a word in the list", () => { - [ - "how to do really bad things", - "what is cookies", - "cookies", - "COOKIES.", - "how to do bad things", - "How To do bad things!", - "How.To.do.bad.things!", - "This stuff is $hit!", - "This is a test.\nTo see if cookies are found, in the second line.", - "That's a p**ch!", - "Banned words should be detected", - ].forEach(word => { - expect(list.test(options, "banned", word)).toEqual(true); - }); - }); - - it("does not match on a word not in the list", () => { - [ - "how to", - "cookie", - "how to be a great person?", - "how to not do really bad things?", - "i have $100 dollars.", - "I have bad $ hit lling", - "That's a p***ch!", - "When bann is spelt wrong, it won't be caught.", - ].forEach(word => { - expect(list.test(options, "banned", word)).toEqual(false); - }); - }); - - it("allows an empty list", () => { - expect(list.test(options, "banned", "test")).toEqual(false); - }); - }); - - describe("containsMatchingPhraseMemoized", () => { - it("return true for all cases after memoizing the first result", () => { - [ - "cookies 1", - "cookies 2", - "cookies 4", - "cookies 5", - "this is for cookies 6", - "this is for cookies 7", - "this is for cookies 8", - "this is for cookies 9", - ].forEach(word => { - expect(list.test(options, "banned", word)).toEqual(true); - }); - }); - }); -}); diff --git a/src/core/server/services/comments/pipeline/wordList.ts b/src/core/server/services/comments/pipeline/wordList.ts index 84393dd38..985144df6 100644 --- a/src/core/server/services/comments/pipeline/wordList.ts +++ b/src/core/server/services/comments/pipeline/wordList.ts @@ -2,7 +2,7 @@ import ms from "ms"; import now from "performance-now"; import { LanguageCode } from "coral-common/helpers"; -import { createWordListRegExp } from "coral-common/utils"; +import createWordListRegExp from "coral-common/utils/createWordListRegExp"; import logger from "coral-server/logger"; import { Tenant } from "coral-server/models/tenant"; @@ -90,8 +90,8 @@ export class WordList { const startedAt = now(); const result = list.test(testString); - logger.debug( - { tenantID: options.id, took: ms(now() - startedAt) }, + logger.info( + { tenantID: options.id, listName, took: ms(now() - startedAt) }, "word list phrase test complete" );