[CORL-1001] Wordlist Fixes (#2920)

* fix: improve wordlist highlighting and perf

* fix: updated tests

* fix: implemented new regexp lib/patterns

* fix: improve comment body css

* fix: take into account that tree shaking is disabled

See: https://github.com/webpack/webpack/issues/7094

Co-authored-by: Chi Vinh Le <vinh@vinh.tech>
Co-authored-by: Kim Gardner <kgardnr@gmail.com>
This commit is contained in:
Wyatt Johnson
2020-04-06 19:04:05 +00:00
committed by GitHub
parent 8966a8201b
commit 6711f09a79
20 changed files with 616 additions and 185 deletions
+22 -9
View File
@@ -3217,7 +3217,6 @@
"version": "7.8.3",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.8.3.tgz",
"integrity": "sha512-lrIU4aVbmlM/wQPzhEvzvNJskKyYptuXb0fGC0lTQTupTOYtR2Vqbu6/jf8vTr4M8Wt1nIzxVrSvPI5qESa/xA==",
"dev": true,
"requires": {
"core-js-pure": "^3.0.0",
"regenerator-runtime": "^0.13.2"
@@ -5760,6 +5759,12 @@
"@types/node": "*"
}
},
"@types/xregexp": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/@types/xregexp/-/xregexp-4.3.0.tgz",
"integrity": "sha512-3gJTS9gt27pS7U9q5IVqo4YvKSlkf2ck8ish6etuDj6LIRxkL/2Y8RMUtK/QzvE1Yv2zwWV5yemI2BS0GGGFnA==",
"dev": true
},
"@types/yargs": {
"version": "12.0.12",
"resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-12.0.12.tgz",
@@ -10721,8 +10726,7 @@
"core-js-pure": {
"version": "3.6.4",
"resolved": "https://registry.npmjs.org/core-js-pure/-/core-js-pure-3.6.4.tgz",
"integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw==",
"dev": true
"integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw=="
},
"core-util-is": {
"version": "1.0.2",
@@ -32077,8 +32081,7 @@
"regenerator-runtime": {
"version": "0.13.3",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.3.tgz",
"integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw==",
"dev": true
"integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw=="
},
"regenerator-transform": {
"version": "0.13.4",
@@ -38138,6 +38141,14 @@
"dev": true,
"requires": {
"xregexp": "4.0.0"
},
"dependencies": {
"xregexp": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
"integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
"dev": true
}
}
},
"del": {
@@ -39365,10 +39376,12 @@
"integrity": "sha512-tGkGJkN8XqCod7OT+EvGYK5Z4SfDQGD30zAa58OcnAa0RRWgzUEK72tkXhsX1FZd+rgnhRxFtmO+ihkp8LHSkw=="
},
"xregexp": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
"integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
"dev": true
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.3.0.tgz",
"integrity": "sha512-7jXDIFXh5yJ/orPn4SXjuVrWWoi4Cr8jfV1eHv9CixKSbU+jY4mxfrBwAuDvupPNKpMUY+FeIqsVw/JLT9+B8g==",
"requires": {
"@babel/runtime-corejs3": "^7.8.3"
}
},
"xtend": {
"version": "4.0.1",
+4 -1
View File
@@ -4,6 +4,7 @@
"author": "The Coral Project",
"homepage": "https://coralproject.net/",
"sideEffects": [
"*.css.ts",
"*.css"
],
"repository": {
@@ -140,7 +141,8 @@
"tsscmp": "^1.0.6",
"url-regex": "^5.0.0",
"uuid": "^3.3.3",
"verror": "^1.10.0"
"verror": "^1.10.0",
"xregexp": "^4.3.0"
},
"devDependencies": {
"@babel/core": "^7.8.3",
@@ -240,6 +242,7 @@
"@types/webpack-bundle-analyzer": "^2.13.1",
"@types/webpack-dev-server": "^3.1.5",
"@types/ws": "^5.1.2",
"@types/xregexp": "^4.3.0",
"@typescript-eslint/eslint-plugin": "2.3.3",
"@typescript-eslint/eslint-plugin-tslint": "2.3.3",
"@typescript-eslint/parser": "2.3.3",
@@ -12,12 +12,12 @@ $comment-link-active: var(--v2-palette-primary-darkest);
color: $comment-content;
overflow-wrap: break-word;
& * bold,
& * strong {
b,
strong {
font-weight: var(--v2-font-weight-primary-bold);
}
& * italic,
& * em {
i,
em {
font-style: italic;
}
blockquote {
@@ -50,3 +50,7 @@ $comment-link-active: var(--v2-palette-primary-darkest);
}
}
}
.highlight {
white-space: pre-wrap;
}
@@ -16,6 +16,7 @@ it("renders correctly", () => {
},
className: "custom",
children: "Hello <b>Bob</b>, you bad guy",
highlight: true,
};
const renderer = createRenderer();
renderer.render(<CommentContent {...props} />);
@@ -33,6 +34,26 @@ it("renders empty words correctly", () => {
},
className: "custom",
children: "Hello <b>Bob</b>, you bad guy",
highlight: true,
};
const renderer = createRenderer();
renderer.render(<CommentContent {...props} />);
expect(renderer.getRenderOutput()).toMatchSnapshot();
});
it("renders correctly even if it has consecutive banned words on comments", () => {
const props: PropTypesOf<typeof CommentContent> = {
phrases: {
locale: "en-US",
wordList: {
suspect: ["worse"],
banned: ["bad"],
},
},
className: "custom",
children:
"This is a very long comment with bad words. Let's try bad and bad. Now bad bad.\nBad BAD bad.\n",
highlight: true,
};
const renderer = createRenderer();
renderer.render(<CommentContent {...props} />);
@@ -1,7 +1,12 @@
import cn from "classnames";
import React, { FunctionComponent, useMemo } from "react";
import striptags from "striptags";
import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
import {
getPhrasesRegExp,
GetPhrasesRegExpOptions,
markHTMLNode,
} from "coral-admin/helpers";
import { createPurify } from "coral-common/utils/purify";
import styles from "./CommentContent.css";
@@ -15,81 +20,60 @@ interface Props {
className?: string;
children: string | React.ReactElement;
phrases: GetPhrasesRegExpOptions;
}
function escapeHTML(unsafe: string) {
return unsafe
.replace(/&/g, "&amp;")
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
.replace(/"/g, "&quot;")
.replace(/'/g, "&#039;");
}
// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
// the settings applied for the locale and highlights them by returning an HTML
// string.
function markPhrasesHTML(text: string, expression: RegExp) {
const tokens = text.split(expression);
if (tokens.length === 1) {
return text;
}
return tokens
.map((token, i) =>
// Using our Regexp patterns it returns tokens arranged this way
// [STRING_WITH_NO_MATCH, NEW_WORD_DELIMITER, MATCHED_WORD, ...].
// This pattern repeats throughout. Next line will mark MATCHED_WORD
// and escape all tokens.
i % 3 === 2 ? `<mark>${escapeHTML(token)}</mark>` : escapeHTML(token)
)
.join("");
}
// markHTMLNode manipulates the node by looking for #text nodes and adding markers
// for `suspectWords` and `bannedWords`.
function markHTMLNode(parentNode: Node, expression: RegExp) {
parentNode.childNodes.forEach(node => {
if (node.nodeName === "#text") {
const newContent = markPhrasesHTML(node.textContent!, expression);
if (newContent !== node.textContent) {
const newNode = document.createElement("span");
newNode.innerHTML = newContent;
parentNode.replaceChild(newNode, node);
}
} else {
markHTMLNode(node, expression);
}
});
highlight?: boolean;
}
const CommentContent: FunctionComponent<Props> = ({
phrases,
className,
children,
highlight = false,
}) => {
// Cache the expression used via memo. This will reduce duplicate renders of
// this comment content when the children change but the phrase configuration
// does not change. The regExp is already cached on a deeper level
// automatically, this is just lessening that impact further.
const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
const expression = useMemo(() => {
// If we aren't in highlight mode for this comment, don't even attempt to
// generate the expression.
if (!highlight) {
return null;
}
if (typeof children === "string") {
// We create a Shadow DOM Tree with the HTML body content and
// use it as a parser.
return getPhrasesRegExp(phrases);
}, [phrases, highlight]);
// Cache the parsed comment node. If the children cannot be parsed, this will
// be null.
const parsed = useMemo(() => {
if (typeof children !== "string") {
return null;
}
// Sanitize the input for display.
let html = purify.sanitize(children);
if (highlight) {
html = striptags(html, ["a"]);
}
// We create a Shadow DOM Tree with the HTML body content and use it as a
// parser.
const node = document.createElement("div");
node.innerHTML = purify.sanitize(children);
node.innerHTML = html;
// If the expression is available, then mark the nodes.
if (expression) {
// Then we traverse it recursively and manipulate it to highlight suspect words
// and banned words.
markHTMLNode(node, expression);
}
// Finally we render the content of the Shadow DOM Tree
return node;
}, [children, expression, highlight]);
if (parsed) {
return (
<div
className={cn(className, styles.root)}
dangerouslySetInnerHTML={{ __html: node.innerHTML }}
className={cn(className, styles.root, highlight && styles.highlight)}
dangerouslySetInnerHTML={{ __html: parsed.innerHTML }}
/>
);
}
@@ -2,10 +2,23 @@
exports[`renders correctly 1`] = `
<div
className="custom CommentContent-root"
className="custom CommentContent-root CommentContent-highlight"
dangerouslySetInnerHTML={
Object {
"__html": "Hello <b>Bob</b><span>, you <mark>bad</mark> guy</span>",
"__html": "<span>Hello Bob, you <mark>bad</mark> guy</span>",
}
}
/>
`;
exports[`renders correctly even if it has consecutive banned words on comments 1`] = `
<div
className="custom CommentContent-root CommentContent-highlight"
dangerouslySetInnerHTML={
Object {
"__html": "<span>This is a very long comment with <mark>bad</mark> words. Let's try <mark>bad</mark> and <mark>bad</mark>. Now <mark>bad</mark> bad.
<mark>Bad</mark> BAD <mark>bad</mark>.
</span>",
}
}
/>
@@ -13,10 +26,10 @@ exports[`renders correctly 1`] = `
exports[`renders empty words correctly 1`] = `
<div
className="custom CommentContent-root"
className="custom CommentContent-root CommentContent-highlight"
dangerouslySetInnerHTML={
Object {
"__html": "Hello <b>Bob</b>, you bad guy",
"__html": "Hello Bob, you bad guy",
}
}
/>
@@ -37,6 +37,7 @@ interface Props {
username: string;
createdAt: string;
body: string;
highlight?: boolean;
inReplyTo?: {
id: string;
username: string | null;
@@ -82,6 +83,7 @@ const ModerateCard: FunctionComponent<Props> = ({
username,
createdAt,
body,
highlight = false,
inReplyTo,
comment,
settings,
@@ -222,7 +224,11 @@ const ModerateCard: FunctionComponent<Props> = ({
)}
</div>
<div className={styles.contentArea}>
<CommentContent phrases={phrases} className={styles.content}>
<CommentContent
highlight={highlight}
phrases={phrases}
className={styles.content}
>
{commentBody}
</CommentContent>
<div className={styles.viewContext}>
@@ -215,6 +215,16 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
},
[comment]
);
// Only highlight comments that have been flagged for containing a banned or
// suspect word.
const highlight = comment.revision
? comment.revision.actionCounts.flag.reasons.COMMENT_DETECTED_BANNED_WORD +
comment.revision.actionCounts.flag.reasons
.COMMENT_DETECTED_SUSPECT_WORD >
0
: false;
return (
<>
<FadeInTransition active={Boolean(comment.enteredLive)}>
@@ -227,6 +237,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
}
createdAt={comment.createdAt}
body={comment.body!}
highlight={highlight}
inReplyTo={comment.parent && comment.parent.author}
comment={comment}
settings={settings}
@@ -296,6 +307,16 @@ const enhanced = withFragmentContainer<Props>({
statusLiveUpdated
createdAt
body
revision {
actionCounts {
flag {
reasons {
COMMENT_DETECTED_BANNED_WORD
COMMENT_DETECTED_SUSPECT_WORD
}
}
}
}
tags {
code
}
@@ -44,6 +44,7 @@ exports[`renders approved correctly 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -178,6 +179,7 @@ exports[`renders correctly 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -312,6 +314,7 @@ exports[`renders dangling correctly 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -446,6 +449,7 @@ exports[`renders rejected correctly 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -589,6 +593,7 @@ exports[`renders reply correctly 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -723,6 +728,7 @@ exports[`renders story info 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -895,6 +901,7 @@ exports[`renders tombstoned when comment is deleted 1`] = `
>
<CommentContent
className="ModerateCard-content"
highlight={false}
phrases={
Object {
"locale": "en-US",
@@ -1,5 +1,7 @@
import { lowerCase, uniqBy } from "lodash";
import { LanguageCode } from "coral-common/helpers";
import { createWordListRegExp } from "coral-common/utils";
import createWordListRegExp from "coral-common/utils/createWordListRegExp";
export interface GetPhrasesRegExpOptions {
locale: string;
@@ -17,20 +19,35 @@ export function getPhrasesRegExp({
return null;
}
return createWordListRegExp(locale as LanguageCode, [...banned, ...suspect]);
// Because the banned and suspect word lists may sometimes overlap, we should
// make this list as short as possible before compiling it into a RegExp.
const phrases = uniqBy<string>([...banned, ...suspect], lowerCase);
// The locale is passed down to us from the Graph, we can cast it to a
// LanguageCode.
return createWordListRegExp(locale as LanguageCode, phrases);
}
// cache is used as a global validator to the cached RegExp used by the
// Cache is used as a global validator to the cached RegExp used by the
// application. We expect that generally, there is only ever one word list used
// by the client at a time, so this ensures that we only re-create the word list
// if we must.
const cache = {
interface Cache {
keys: {
locale: string;
suspect: ReadonlyArray<string>;
banned: ReadonlyArray<string>;
};
value: RegExp | null;
}
const cache: Cache = {
keys: {
locale: "",
suspect: [] as ReadonlyArray<string>,
banned: [] as ReadonlyArray<string>,
suspect: [],
banned: [],
},
value: null as RegExp | null,
value: null,
};
export default function(options: GetPhrasesRegExpOptions) {
@@ -57,7 +74,12 @@ export default function(options: GetPhrasesRegExpOptions) {
// If the cache is expired, or the value doesn't exist, regenerate it.
if (expired) {
cache.value = getPhrasesRegExp(options);
try {
cache.value = getPhrasesRegExp(options);
} catch (err) {
window.console.error(err);
return null;
}
}
return cache.value;
+1
View File
@@ -3,3 +3,4 @@ export {
default as getPhrasesRegExp,
GetPhrasesRegExpOptions,
} from "./getPhrasesRegExp";
export { default as markHTMLNode } from "./markHTMLNode";
@@ -0,0 +1,58 @@
// escapeHTML converts the characters that are significant in HTML into their
// entity form so that the result can safely be assigned to `innerHTML`.
function escapeHTML(unsafe: string) {
  return unsafe
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#039;");
}

// markPhrasesHTML looks for `suspect` and `banned` words inside `text` using
// the passed `expression` and highlights them by returning an HTML string
// where every matched word is wrapped in a `<mark>` element. When the
// expression does not match anything, `null` is returned so that the caller
// can leave the original text untouched.
//
// The `text` is treated as plain text (it typically originates from a text
// node's `textContent`, where entities have already been decoded), so every
// token is HTML-escaped before being placed into the returned markup.
// Without this, text such as `&lt;img onerror=...&gt;` that sits next to a
// matched word would be turned back into live markup when the result is
// assigned to `innerHTML`.
function markPhrasesHTML(text: string, expression: RegExp) {
  const tokens = text.split(expression);

  // The expression has three capture groups, so every match contributes the
  // three captured strings plus the text that follows it. A result with fewer
  // than three tokens therefore means that nothing matched.
  if (tokens.length < 3) {
    return null;
  }

  return tokens
    .map((token, i) => {
      // A capture group that did not participate in the match is reported as
      // `undefined` by `split`, treat it as an empty string.
      const safe = escapeHTML(token || "");

      // Using our RegExp patterns the tokens are arranged this way:
      //
      // - STRING_WITH_NO_MATCH
      // - NEW_WORD_DELIMITER
      // - MATCHED_WORD
      // - NEW_WORD_DELIMITER
      // - ...
      //
      // This pattern repeats every four tokens, so only MATCHED_WORD (index 2
      // of each group of four) is marked.
      return i % 4 === 2 ? "<mark>" + safe + "</mark>" : safe;
    })
    .join("");
}
// markHTMLNode walks the children of `parentNode` recursively and replaces
// every text node that contains a phrase matched by `expression` with a
// `<span>` holding the marked up version of that text. Anchor elements are
// left untouched.
export default function markHTMLNode(parentNode: Node, expression: RegExp) {
  parentNode.childNodes.forEach(child => {
    switch (child.nodeName) {
      case "A":
        // Anchor links are already marked by default, nothing to do here.
        return;
      case "#text":
        // Text nodes are handled below.
        break;
      default:
        // Any other node can't be marked directly, descend into it instead.
        markHTMLNode(child, expression);
        return;
    }

    // A text node without any content has nothing that could be marked.
    const text = child.textContent;
    if (!text) {
      return;
    }

    // Try to mark the text, bail when there was nothing to highlight.
    const marked = markPhrasesHTML(text, expression);
    if (!marked) {
      return;
    }

    // Swap the text node with a span that carries the marked up content.
    const wrapper = document.createElement("span");
    wrapper.innerHTML = marked;
    parentNode.replaceChild(wrapper, child);
  });
}
@@ -3,6 +3,10 @@ $fullscreenZIndex: 10;
.wrapper {
@mixin bodyCopy;
/* Italic text: both <i> and <em> (selector list, not a descendant selector). */
i,
em {
font-style: italic;
}
b,
strong {
font-weight: var(--font-weight-medium);
@@ -3,12 +3,12 @@
color: var(--palette-text-dark);
overflow-wrap: break-word;
& * bold,
& * strong {
b,
strong {
font-weight: var(--font-weight-medium);
}
& * italic,
& * em {
/* Italic text: both <i> and <em> (selector list, not a descendant selector). */
i,
em {
font-style: italic;
}
blockquote {
@@ -0,0 +1,179 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`en-US splits the multi-words in a sentence correctly 1`] = `
Array [
"this sentence has",
" ",
"french fries",
".",
"",
]
`;
exports[`en-US splits the multi-words in a sentence correctly 2`] = `
Array [
"this sentence has",
" ",
"french;fries",
".",
"",
]
`;
exports[`en-US splits the multi-words in a sentence correctly 3`] = `
Array [
"this sentence has",
" ",
"french!fries",
".",
"",
]
`;
exports[`en-US splits the multi-words in a sentence correctly 4`] = `
Array [
"this sentence has",
" ",
"french.fries",
".",
"",
]
`;
exports[`en-US splits the multi-words in a sentence correctly 5`] = `
Array [
"this sentence has",
" ",
"french?fries",
".",
"",
]
`;
exports[`en-US splits the multi-words in a sentence correctly 6`] = `
Array [
"this sentence has",
" ",
"french¿fries",
".",
"",
]
`;
exports[`en-US splits the multi-words in a sentence correctly 7`] = `
Array [
"this sentence has",
" ",
"french:fries",
".",
"",
]
`;
exports[`en-US splits the words in a sentence correctly 1`] = `
Array [
"this sentence is",
" ",
"bad",
".",
"",
]
`;
exports[`en-US splits the words in a sentence correctly 2`] = `
Array [
"this sentence is",
" ",
"worse",
".",
"",
]
`;
exports[`en-US splits the words with unicode in a sentence correctly 1`] = `
Array [
"this sentence has one",
" ",
"jalapeño",
".",
"",
]
`;
exports[`en-US splits the words with unicode in a sentence correctly 2`] = `
Array [
"this sentence has many jalapeños.",
]
`;
exports[`en-US splits words when there are repeat words 1`] = `
Array [
"This is",
" ",
"bad",
" ",
"bad, very",
" ",
"BAD",
".",
"",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 1`] = `
Array [
"biólogo se soletra com",
": ",
"bi",
" ",
"",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 2`] = `
Array [
"",
"",
"m.e.r.d.a",
"",
"",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 3`] = `
Array [
"não tomo",
" ",
"café",
" ",
"pois faz mal",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 4`] = `
Array [
"",
"",
"Como fazer coisas ruins",
"",
"",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 5`] = `
Array [
"O biólogo recomenda este artigo",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 6`] = `
Array [
"cafe",
]
`;
exports[`pt-BR splits the words with unicode in a sentence correctly 7`] = `
Array [
"Ser banido é uma merda",
]
`;
@@ -0,0 +1,173 @@
import createWordListRegExp from "./createWordListRegExp";
// buildTester returns a predicate that reports whether the given string is
// matched by `re`.
function buildTester(re: RegExp) {
  return function tester(str: string) {
    return re.test(str);
  };
}
// buildSplitter returns a function that splits the given string using `re` as
// the separator (captured groups are included in the result).
function buildSplitter(re: RegExp) {
  return function splitter(str: string) {
    return str.split(re);
  };
}
// Exercises the expression generated by createWordListRegExp for the "en-US"
// locale with a small word list containing single words, a multi-word phrase,
// a word ending in a non-ASCII letter and a word starting with a digit. The
// first half checks matching via `RegExp.test`, the second half snapshots the
// token layout produced by `String.split`.
describe("en-US", () => {
const re = createWordListRegExp("en-US", [
"bad",
"french fries",
"worse",
"jalapeño",
"1km",
]);
const test = buildTester(re);
it("test words in the list", () => {
expect(test("bad")).toBeTruthy();
expect(test("worse")).toBeTruthy();
expect(test("1km")).toBeTruthy();
});
it("test repeated words in the list", () => {
expect(test("bad bad bad")).toBeTruthy();
expect(test("worse worse worse")).toBeTruthy();
});
it("test words not in the list", () => {
expect(test("fine")).toBeFalsy();
expect(test("ok")).toBeFalsy();
});
// A listed word followed by another letter ("jalapeños") must not match.
it("test words that end with a unicode character", () => {
expect(test("I have one jalapeño.")).toBeTruthy();
expect(test("I have two jalapeños.")).toBeFalsy();
});
it("test words in the list while being case insensitive", () => {
expect(test("Bad")).toBeTruthy();
expect(test("BAd")).toBeTruthy();
expect(test("BAD")).toBeTruthy();
expect(test("bAD")).toBeTruthy();
expect(test("baD")).toBeTruthy();
expect(test("bAd")).toBeTruthy();
expect(test("1KM")).toBeTruthy();
});
// The words of a multi-word phrase may be separated by whitespace or by
// one of the punctuation characters used below.
it("test multi-words", () => {
expect(test("french fries")).toBeTruthy();
expect(test("french!fries")).toBeTruthy();
expect(test("french.fries")).toBeTruthy();
expect(test("french?fries")).toBeTruthy();
expect(test("french¿fries")).toBeTruthy();
expect(test("french:fries")).toBeTruthy();
expect(test("french;fries")).toBeTruthy();
});
it("test words at the end of a sentence", () => {
expect(test("this sentence is bad.")).toBeTruthy();
expect(test("this sentence is worse.")).toBeTruthy();
expect(test("this sentence has french fries.")).toBeTruthy();
expect(test("this sentence has french!fries.")).toBeTruthy();
expect(test("this sentence has french.fries.")).toBeTruthy();
expect(test("this sentence has french?fries.")).toBeTruthy();
expect(test("this sentence has french¿fries.")).toBeTruthy();
expect(test("this sentence has french:fries.")).toBeTruthy();
expect(test("this sentence has french;fries.")).toBeTruthy();
});
it("test words at the start of a sentence", () => {
expect(test("bad is the start of the sentence.")).toBeTruthy();
expect(test("worse is the start of the sentence.")).toBeTruthy();
expect(test("french fries is the start of the sentence.")).toBeTruthy();
});
// Calls the same expression several times in a row; every call must match.
// NOTE(review): presumably guards against a stateful (global/sticky)
// expression that would remember `lastIndex` between calls — confirm.
it("test does not preserve state", () => {
expect(test("bad 1")).toBeTruthy();
expect(test("bad 2")).toBeTruthy();
expect(test("more bad 3")).toBeTruthy();
expect(test("more bad 4")).toBeTruthy();
});
it("test repeated words", () => {
expect(test("This is bad bad, very bad")).toBeTruthy();
});
// A listed word surrounded by additional letters must not match.
it("test does not match substrings", () => {
expect(test("baddd")).toBeFalsy();
expect(test("wwworse")).toBeFalsy();
expect(test("fffrench fries")).toBeFalsy();
});
// Digits adjacent to a listed word do not prevent the match.
it("test handles when there are numbers", () => {
expect(test("bad3")).toBeTruthy();
expect(test("3bad")).toBeTruthy();
});
// The remaining tests snapshot the array returned by `String.split` when the
// generated expression is used as the separator.
const split = buildSplitter(re);
it("splits the words in a sentence correctly", () => {
expect(split("this sentence is bad.")).toMatchSnapshot();
expect(split("this sentence is worse.")).toMatchSnapshot();
});
it("splits words when there are repeat words", () => {
expect(split("This is bad bad, very BAD.")).toMatchSnapshot();
});
it("splits the words with unicode in a sentence correctly", () => {
expect(split("this sentence has one jalapeño.")).toMatchSnapshot();
expect(split("this sentence has many jalapeños.")).toMatchSnapshot();
});
it("splits the multi-words in a sentence correctly", () => {
expect(split("this sentence has french fries.")).toMatchSnapshot();
expect(split("this sentence has french;fries.")).toMatchSnapshot();
expect(split("this sentence has french!fries.")).toMatchSnapshot();
expect(split("this sentence has french.fries.")).toMatchSnapshot();
expect(split("this sentence has french?fries.")).toMatchSnapshot();
expect(split("this sentence has french¿fries.")).toMatchSnapshot();
expect(split("this sentence has french:fries.")).toMatchSnapshot();
});
});
// Exercises the expression generated for the "es" locale: listed phrases must
// match when they are wrapped in Spanish inverted/closing question marks.
describe("es", () => {
const re = createWordListRegExp("es", ["adónde vas", "tú"]);
const test = buildTester(re);
it("test words in the list", () => {
expect(test("Pablo, ¿adónde vas?")).toBeTruthy();
expect(test("Estoy cansado, ¿y tú?")).toBeTruthy();
expect(test("¿tú?")).toBeTruthy();
});
});
// Exercises the expression generated for the "pt-BR" locale with words that
// contain accented letters, a phrase made of single spaced letters and a
// longer multi-word phrase.
describe("pt-BR", () => {
const re = createWordListRegExp("pt-BR", [
"bi",
"outro",
"café",
"m e r d a",
"Como fazer coisas ruins",
]);
const test = buildTester(re);
it("test words in the list", () => {
expect(test("biólogo se soletra com: bi ")).toBeTruthy();
expect(test("m.e.r.d.a")).toBeTruthy();
expect(test("não tomo café pois faz mal")).toBeTruthy();
expect(test("Como fazer coisas ruins")).toBeTruthy();
});
// "bi" must not match as the prefix of "biólogo", "cafe" (no accent) must
// not match "café", and "merda" written without separators must not match
// the spaced phrase "m e r d a".
it("test words not in the list", () => {
expect(test("O biólogo recomenda este artigo")).toBeFalsy();
expect(test("cafe")).toBeFalsy();
expect(test("Ser banido é uma merda")).toBeFalsy();
});
// Snapshots the array returned by `String.split` for both matching and
// non-matching inputs.
const split = buildSplitter(re);
it("splits the words with unicode in a sentence correctly", () => {
expect(split("biólogo se soletra com: bi ")).toMatchSnapshot();
expect(split("m.e.r.d.a")).toMatchSnapshot();
expect(split("não tomo café pois faz mal")).toMatchSnapshot();
expect(split("Como fazer coisas ruins")).toMatchSnapshot();
expect(split("O biólogo recomenda este artigo")).toMatchSnapshot();
expect(split("cafe")).toMatchSnapshot();
expect(split("Ser banido é uma merda")).toMatchSnapshot();
});
});
+17 -12
View File
@@ -1,18 +1,20 @@
import { defaults } from "lodash";
import XRegExp from "xregexp";
import { LanguageCode } from "coral-common/helpers";
import { DeepPartial } from "coral-common/types";
interface WordListRule {
split: string;
boundary: string;
punctuation: string;
whitespace: string;
}
const DefaultWordListRule: WordListRule = {
split: "[^\\w]",
punctuation: '[\\s"?!.¿¡`:;]+',
whitespace: "\\s+",
// The following symbol, \p{L} refers to any letter class within unicode.
// Because we're adding the ^, we're also saying to exclude any from that set,
// leaving all non-word characters from unicode available for selection.
boundary: "[^\\p{L}]+",
punctuation: "[\\s\"'?!.,¿¡`:;]+",
};
const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
@@ -47,14 +49,12 @@ export default function createWordListRegExp(
DefaultWordListRule
);
const whitespace = new RegExp(rule.whitespace);
// Split up the words from the list into a regex escaped string.
const words = phrases
.map(phrase =>
phrase
// Split each phrase by whitespace.
.split(whitespace)
.split(/\s/)
// Escape each phrase, we don't expect any of them to contain regex.
.map(word => escapeRegExp(word))
// Rejoin to ensure that any variation of the word separated by a
@@ -64,13 +64,18 @@ export default function createWordListRegExp(
// For each of these words, wrap a `|` or OR.
.join("|");
// Wrap the pattern in split rules.
const pattern = `(^|${rule.split})(${words})($|${rule.split})`;
// Wrap the pattern in split rules. We want to match any word that either is
// at the start of a string, or a word boundary. The word must also either be
// at the end of the string or at another word boundary.
const pattern = `(^|${rule.boundary})(${words})($|${rule.boundary})`;
try {
return new RegExp(pattern, "iu");
// Create the RegExp using xregexp to pre-process the pattern to generate
// one with the correct unicode ranges. Including A for "astral" unicode
// support for supporting higher character ranges.
return XRegExp(pattern, "iuA");
} catch {
// IE does not support the unicode flag, so we'll create one without it.
return new RegExp(pattern, "i");
return XRegExp(pattern, "i");
}
}
-1
View File
@@ -12,4 +12,3 @@ export { default as isPromiseLike } from "./isPromiseLike";
export { default as isPromise } from "./isPromise";
export { default as startsWith } from "./startsWith";
export { default as getOrigin } from "./getOrigin";
export { default as createWordListRegExp } from "./createWordListRegExp";
@@ -1,82 +0,0 @@
import {
Options,
WordList,
} from "coral-server/services/comments/pipeline/wordList";
describe("en-US", () => {
const list = new WordList();
const options: Options = {
id: "tenant_1",
locale: "en-US",
wordList: {
banned: [
"cookies",
"how to do bad things",
"how to do really bad things",
"s h i t",
"$hit",
"p**ch",
"p*ch",
"banned",
"ban",
],
suspect: [],
},
};
describe("containsMatchingPhrase", () => {
it("does match on a word in the list", () => {
[
"how to do really bad things",
"what is cookies",
"cookies",
"COOKIES.",
"how to do bad things",
"How To do bad things!",
"How.To.do.bad.things!",
"This stuff is $hit!",
"This is a test.\nTo see if cookies are found, in the second line.",
"That's a p**ch!",
"Banned words should be detected",
].forEach(word => {
expect(list.test(options, "banned", word)).toEqual(true);
});
});
it("does not match on a word not in the list", () => {
[
"how to",
"cookie",
"how to be a great person?",
"how to not do really bad things?",
"i have $100 dollars.",
"I have bad $ hit lling",
"That's a p***ch!",
"When bann is spelt wrong, it won't be caught.",
].forEach(word => {
expect(list.test(options, "banned", word)).toEqual(false);
});
});
it("allows an empty list", () => {
expect(list.test(options, "banned", "test")).toEqual(false);
});
});
describe("containsMatchingPhraseMemoized", () => {
it("return true for all cases after memoizing the first result", () => {
[
"cookies 1",
"cookies 2",
"cookies 4",
"cookies 5",
"this is for cookies 6",
"this is for cookies 7",
"this is for cookies 8",
"this is for cookies 9",
].forEach(word => {
expect(list.test(options, "banned", word)).toEqual(true);
});
});
});
});
@@ -2,7 +2,7 @@ import ms from "ms";
import now from "performance-now";
import { LanguageCode } from "coral-common/helpers";
import { createWordListRegExp } from "coral-common/utils";
import createWordListRegExp from "coral-common/utils/createWordListRegExp";
import logger from "coral-server/logger";
import { Tenant } from "coral-server/models/tenant";
@@ -90,8 +90,8 @@ export class WordList {
const startedAt = now();
const result = list.test(testString);
logger.debug(
{ tenantID: options.id, took: ms(now() - startedAt) },
logger.info(
{ tenantID: options.id, listName, took: ms(now() - startedAt) },
"word list phrase test complete"
);