mirror of
https://github.com/wassname/talk.git
synced 2026-06-27 19:33:06 +08:00
[CORL-1001] Wordlist Fixes (#2920)
* fix: improve wordlist highlighting and perf * fix: updated tests * fix: implmeneted new regexp lib/patterns * fix: improve comment body css * fix: take into account the tree shaking is disabled See: https://github.com/webpack/webpack/issues/7094 Co-authored-by: Chi Vinh Le <vinh@vinh.tech> Co-authored-by: Kim Gardner <kgardnr@gmail.com>
This commit is contained in:
Generated
+22
-9
@@ -3217,7 +3217,6 @@
|
||||
"version": "7.8.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.8.3.tgz",
|
||||
"integrity": "sha512-lrIU4aVbmlM/wQPzhEvzvNJskKyYptuXb0fGC0lTQTupTOYtR2Vqbu6/jf8vTr4M8Wt1nIzxVrSvPI5qESa/xA==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"core-js-pure": "^3.0.0",
|
||||
"regenerator-runtime": "^0.13.2"
|
||||
@@ -5760,6 +5759,12 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"@types/xregexp": {
|
||||
"version": "4.3.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/xregexp/-/xregexp-4.3.0.tgz",
|
||||
"integrity": "sha512-3gJTS9gt27pS7U9q5IVqo4YvKSlkf2ck8ish6etuDj6LIRxkL/2Y8RMUtK/QzvE1Yv2zwWV5yemI2BS0GGGFnA==",
|
||||
"dev": true
|
||||
},
|
||||
"@types/yargs": {
|
||||
"version": "12.0.12",
|
||||
"resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-12.0.12.tgz",
|
||||
@@ -10721,8 +10726,7 @@
|
||||
"core-js-pure": {
|
||||
"version": "3.6.4",
|
||||
"resolved": "https://registry.npmjs.org/core-js-pure/-/core-js-pure-3.6.4.tgz",
|
||||
"integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw==",
|
||||
"dev": true
|
||||
"integrity": "sha512-epIhRLkXdgv32xIUFaaAry2wdxZYBi6bgM7cB136dzzXXa+dFyRLTZeLUJxnd8ShrmyVXBub63n2NHo2JAt8Cw=="
|
||||
},
|
||||
"core-util-is": {
|
||||
"version": "1.0.2",
|
||||
@@ -32077,8 +32081,7 @@
|
||||
"regenerator-runtime": {
|
||||
"version": "0.13.3",
|
||||
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.3.tgz",
|
||||
"integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw==",
|
||||
"dev": true
|
||||
"integrity": "sha512-naKIZz2GQ8JWh///G7L3X6LaQUAMp2lvb1rvwwsURe/VXwD6VMfr+/1NuNw3ag8v2kY1aQ/go5SNn79O9JU7yw=="
|
||||
},
|
||||
"regenerator-transform": {
|
||||
"version": "0.13.4",
|
||||
@@ -38138,6 +38141,14 @@
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"xregexp": "4.0.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"xregexp": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
|
||||
"integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
|
||||
"dev": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"del": {
|
||||
@@ -39365,10 +39376,12 @@
|
||||
"integrity": "sha512-tGkGJkN8XqCod7OT+EvGYK5Z4SfDQGD30zAa58OcnAa0RRWgzUEK72tkXhsX1FZd+rgnhRxFtmO+ihkp8LHSkw=="
|
||||
},
|
||||
"xregexp": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz",
|
||||
"integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==",
|
||||
"dev": true
|
||||
"version": "4.3.0",
|
||||
"resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.3.0.tgz",
|
||||
"integrity": "sha512-7jXDIFXh5yJ/orPn4SXjuVrWWoi4Cr8jfV1eHv9CixKSbU+jY4mxfrBwAuDvupPNKpMUY+FeIqsVw/JLT9+B8g==",
|
||||
"requires": {
|
||||
"@babel/runtime-corejs3": "^7.8.3"
|
||||
}
|
||||
},
|
||||
"xtend": {
|
||||
"version": "4.0.1",
|
||||
|
||||
+4
-1
@@ -4,6 +4,7 @@
|
||||
"author": "The Coral Project",
|
||||
"homepage": "https://coralproject.net/",
|
||||
"sideEffects": [
|
||||
"*.css.ts",
|
||||
"*.css"
|
||||
],
|
||||
"repository": {
|
||||
@@ -140,7 +141,8 @@
|
||||
"tsscmp": "^1.0.6",
|
||||
"url-regex": "^5.0.0",
|
||||
"uuid": "^3.3.3",
|
||||
"verror": "^1.10.0"
|
||||
"verror": "^1.10.0",
|
||||
"xregexp": "^4.3.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/core": "^7.8.3",
|
||||
@@ -240,6 +242,7 @@
|
||||
"@types/webpack-bundle-analyzer": "^2.13.1",
|
||||
"@types/webpack-dev-server": "^3.1.5",
|
||||
"@types/ws": "^5.1.2",
|
||||
"@types/xregexp": "^4.3.0",
|
||||
"@typescript-eslint/eslint-plugin": "2.3.3",
|
||||
"@typescript-eslint/eslint-plugin-tslint": "2.3.3",
|
||||
"@typescript-eslint/parser": "2.3.3",
|
||||
|
||||
@@ -12,12 +12,12 @@ $comment-link-active: var(--v2-palette-primary-darkest);
|
||||
color: $comment-content;
|
||||
overflow-wrap: break-word;
|
||||
|
||||
& * bold,
|
||||
& * strong {
|
||||
b,
|
||||
strong {
|
||||
font-weight: var(--v2-font-weight-primary-bold);
|
||||
}
|
||||
& * italic,
|
||||
& * em {
|
||||
i,
|
||||
em {
|
||||
font-style: italic;
|
||||
}
|
||||
blockquote {
|
||||
@@ -50,3 +50,7 @@ $comment-link-active: var(--v2-palette-primary-darkest);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
.highlight {
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ it("renders correctly", () => {
|
||||
},
|
||||
className: "custom",
|
||||
children: "Hello <b>Bob</b>, you bad guy",
|
||||
highlight: true,
|
||||
};
|
||||
const renderer = createRenderer();
|
||||
renderer.render(<CommentContent {...props} />);
|
||||
@@ -33,6 +34,26 @@ it("renders empty words correctly", () => {
|
||||
},
|
||||
className: "custom",
|
||||
children: "Hello <b>Bob</b>, you bad guy",
|
||||
highlight: true,
|
||||
};
|
||||
const renderer = createRenderer();
|
||||
renderer.render(<CommentContent {...props} />);
|
||||
expect(renderer.getRenderOutput()).toMatchSnapshot();
|
||||
});
|
||||
|
||||
it("renders correctly even if it has consecutive banned words on comments", () => {
|
||||
const props: PropTypesOf<typeof CommentContent> = {
|
||||
phrases: {
|
||||
locale: "en-US",
|
||||
wordList: {
|
||||
suspect: ["worse"],
|
||||
banned: ["bad"],
|
||||
},
|
||||
},
|
||||
className: "custom",
|
||||
children:
|
||||
"This is a very long comment with bad words. Let's try bad and bad. Now bad bad.\nBad BAD bad.\n",
|
||||
highlight: true,
|
||||
};
|
||||
const renderer = createRenderer();
|
||||
renderer.render(<CommentContent {...props} />);
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
import cn from "classnames";
|
||||
import React, { FunctionComponent, useMemo } from "react";
|
||||
import striptags from "striptags";
|
||||
|
||||
import { getPhrasesRegExp, GetPhrasesRegExpOptions } from "coral-admin/helpers";
|
||||
import {
|
||||
getPhrasesRegExp,
|
||||
GetPhrasesRegExpOptions,
|
||||
markHTMLNode,
|
||||
} from "coral-admin/helpers";
|
||||
import { createPurify } from "coral-common/utils/purify";
|
||||
|
||||
import styles from "./CommentContent.css";
|
||||
@@ -15,81 +20,60 @@ interface Props {
|
||||
className?: string;
|
||||
children: string | React.ReactElement;
|
||||
phrases: GetPhrasesRegExpOptions;
|
||||
}
|
||||
|
||||
function escapeHTML(unsafe: string) {
|
||||
return unsafe
|
||||
.replace(/&/g, "&")
|
||||
.replace(/</g, "<")
|
||||
.replace(/>/g, ">")
|
||||
.replace(/"/g, """)
|
||||
.replace(/'/g, "'");
|
||||
}
|
||||
|
||||
// markPhrasesHTML looks for `supsect` and `banned` words inside `text` given
|
||||
// the settings applied for the locale and highlights them by returning an HTML
|
||||
// string.
|
||||
function markPhrasesHTML(text: string, expression: RegExp) {
|
||||
const tokens = text.split(expression);
|
||||
if (tokens.length === 1) {
|
||||
return text;
|
||||
}
|
||||
return tokens
|
||||
.map((token, i) =>
|
||||
// Using our Regexp patterns it returns tokens arranged this way
|
||||
// [STRING_WITH_NO_MATCH, NEW_WORD_DELIMITER, MATCHED_WORD, ...].
|
||||
// This pattern repeats throughout. Next line will mark MATCHED_WORD
|
||||
// and escape all tokens.
|
||||
i % 3 === 2 ? `<mark>${escapeHTML(token)}</mark>` : escapeHTML(token)
|
||||
)
|
||||
.join("");
|
||||
}
|
||||
|
||||
// markHTMLNode manipulates the node by looking for #text nodes and adding markers
|
||||
// for `supsectWords` and `bannedWords`.
|
||||
function markHTMLNode(parentNode: Node, expression: RegExp) {
|
||||
parentNode.childNodes.forEach(node => {
|
||||
if (node.nodeName === "#text") {
|
||||
const newContent = markPhrasesHTML(node.textContent!, expression);
|
||||
if (newContent !== node.textContent) {
|
||||
const newNode = document.createElement("span");
|
||||
newNode.innerHTML = newContent;
|
||||
parentNode.replaceChild(newNode, node);
|
||||
}
|
||||
} else {
|
||||
markHTMLNode(node, expression);
|
||||
}
|
||||
});
|
||||
highlight?: boolean;
|
||||
}
|
||||
|
||||
const CommentContent: FunctionComponent<Props> = ({
|
||||
phrases,
|
||||
className,
|
||||
children,
|
||||
highlight = false,
|
||||
}) => {
|
||||
// Cache the expression used via memo. This will reduce duplicate renders of
|
||||
// this comment content when the children change but the phrase configuration
|
||||
// does not change. The regExp is already cached on a deeper level
|
||||
// automatically, this is just lessening that impact further.
|
||||
const expression = useMemo(() => getPhrasesRegExp(phrases), [phrases]);
|
||||
const expression = useMemo(() => {
|
||||
// If we aren't in highlight mode for this comment, don't even attempt to
|
||||
// generate the expression.
|
||||
if (!highlight) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (typeof children === "string") {
|
||||
// We create a Shadow DOM Tree with the HTML body content and
|
||||
// use it as a parser.
|
||||
return getPhrasesRegExp(phrases);
|
||||
}, [phrases, highlight]);
|
||||
|
||||
// Cache the parsed comment node. If the children cannot be parsed, this will
|
||||
// be null.
|
||||
const parsed = useMemo(() => {
|
||||
if (typeof children !== "string") {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Sanitize the input for display.
|
||||
let html = purify.sanitize(children);
|
||||
if (highlight) {
|
||||
html = striptags(html, ["a"]);
|
||||
}
|
||||
|
||||
// We create a Shadow DOM Tree with the HTML body content and use it as a
|
||||
// parser.
|
||||
const node = document.createElement("div");
|
||||
node.innerHTML = purify.sanitize(children);
|
||||
node.innerHTML = html;
|
||||
|
||||
// If the expression is available, then mark the nodes.
|
||||
if (expression) {
|
||||
// Then we traverse it recursively and manipulate it to highlight suspect words
|
||||
// and banned words.
|
||||
markHTMLNode(node, expression);
|
||||
}
|
||||
|
||||
// Finally we render the content of the Shadow DOM Tree
|
||||
return node;
|
||||
}, [children, expression, highlight]);
|
||||
|
||||
if (parsed) {
|
||||
return (
|
||||
<div
|
||||
className={cn(className, styles.root)}
|
||||
dangerouslySetInnerHTML={{ __html: node.innerHTML }}
|
||||
className={cn(className, styles.root, highlight && styles.highlight)}
|
||||
dangerouslySetInnerHTML={{ __html: parsed.innerHTML }}
|
||||
/>
|
||||
);
|
||||
}
|
||||
|
||||
+17
-4
@@ -2,10 +2,23 @@
|
||||
|
||||
exports[`renders correctly 1`] = `
|
||||
<div
|
||||
className="custom CommentContent-root"
|
||||
className="custom CommentContent-root CommentContent-highlight"
|
||||
dangerouslySetInnerHTML={
|
||||
Object {
|
||||
"__html": "Hello <b>Bob</b><span>, you <mark>bad</mark> guy</span>",
|
||||
"__html": "<span>Hello Bob, you <mark>bad</mark> guy</span>",
|
||||
}
|
||||
}
|
||||
/>
|
||||
`;
|
||||
|
||||
exports[`renders correctly even if it has consecutive banned words on comments 1`] = `
|
||||
<div
|
||||
className="custom CommentContent-root CommentContent-highlight"
|
||||
dangerouslySetInnerHTML={
|
||||
Object {
|
||||
"__html": "<span>This is a very long comment with <mark>bad</mark> words. Let's try <mark>bad</mark> and <mark>bad</mark>. Now <mark>bad</mark> bad.
|
||||
<mark>Bad</mark> BAD <mark>bad</mark>.
|
||||
</span>",
|
||||
}
|
||||
}
|
||||
/>
|
||||
@@ -13,10 +26,10 @@ exports[`renders correctly 1`] = `
|
||||
|
||||
exports[`renders empty words correctly 1`] = `
|
||||
<div
|
||||
className="custom CommentContent-root"
|
||||
className="custom CommentContent-root CommentContent-highlight"
|
||||
dangerouslySetInnerHTML={
|
||||
Object {
|
||||
"__html": "Hello <b>Bob</b>, you bad guy",
|
||||
"__html": "Hello Bob, you bad guy",
|
||||
}
|
||||
}
|
||||
/>
|
||||
|
||||
@@ -37,6 +37,7 @@ interface Props {
|
||||
username: string;
|
||||
createdAt: string;
|
||||
body: string;
|
||||
highlight?: boolean;
|
||||
inReplyTo?: {
|
||||
id: string;
|
||||
username: string | null;
|
||||
@@ -82,6 +83,7 @@ const ModerateCard: FunctionComponent<Props> = ({
|
||||
username,
|
||||
createdAt,
|
||||
body,
|
||||
highlight = false,
|
||||
inReplyTo,
|
||||
comment,
|
||||
settings,
|
||||
@@ -222,7 +224,11 @@ const ModerateCard: FunctionComponent<Props> = ({
|
||||
)}
|
||||
</div>
|
||||
<div className={styles.contentArea}>
|
||||
<CommentContent phrases={phrases} className={styles.content}>
|
||||
<CommentContent
|
||||
highlight={highlight}
|
||||
phrases={phrases}
|
||||
className={styles.content}
|
||||
>
|
||||
{commentBody}
|
||||
</CommentContent>
|
||||
<div className={styles.viewContext}>
|
||||
|
||||
@@ -215,6 +215,16 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
|
||||
},
|
||||
[comment]
|
||||
);
|
||||
|
||||
// Only highlight comments that have been flagged for containing a banned or
|
||||
// suspect word.
|
||||
const highlight = comment.revision
|
||||
? comment.revision.actionCounts.flag.reasons.COMMENT_DETECTED_BANNED_WORD +
|
||||
comment.revision.actionCounts.flag.reasons
|
||||
.COMMENT_DETECTED_SUSPECT_WORD >
|
||||
0
|
||||
: false;
|
||||
|
||||
return (
|
||||
<>
|
||||
<FadeInTransition active={Boolean(comment.enteredLive)}>
|
||||
@@ -227,6 +237,7 @@ const ModerateCardContainer: FunctionComponent<Props> = ({
|
||||
}
|
||||
createdAt={comment.createdAt}
|
||||
body={comment.body!}
|
||||
highlight={highlight}
|
||||
inReplyTo={comment.parent && comment.parent.author}
|
||||
comment={comment}
|
||||
settings={settings}
|
||||
@@ -296,6 +307,16 @@ const enhanced = withFragmentContainer<Props>({
|
||||
statusLiveUpdated
|
||||
createdAt
|
||||
body
|
||||
revision {
|
||||
actionCounts {
|
||||
flag {
|
||||
reasons {
|
||||
COMMENT_DETECTED_BANNED_WORD
|
||||
COMMENT_DETECTED_SUSPECT_WORD
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tags {
|
||||
code
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ exports[`renders approved correctly 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
@@ -178,6 +179,7 @@ exports[`renders correctly 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
@@ -312,6 +314,7 @@ exports[`renders dangling correctly 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
@@ -446,6 +449,7 @@ exports[`renders rejected correctly 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
@@ -589,6 +593,7 @@ exports[`renders reply correctly 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
@@ -723,6 +728,7 @@ exports[`renders story info 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
@@ -895,6 +901,7 @@ exports[`renders tombstoned when comment is deleted 1`] = `
|
||||
>
|
||||
<CommentContent
|
||||
className="ModerateCard-content"
|
||||
highlight={false}
|
||||
phrases={
|
||||
Object {
|
||||
"locale": "en-US",
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import { lowerCase, uniqBy } from "lodash";
|
||||
|
||||
import { LanguageCode } from "coral-common/helpers";
|
||||
import { createWordListRegExp } from "coral-common/utils";
|
||||
import createWordListRegExp from "coral-common/utils/createWordListRegExp";
|
||||
|
||||
export interface GetPhrasesRegExpOptions {
|
||||
locale: string;
|
||||
@@ -17,20 +19,35 @@ export function getPhrasesRegExp({
|
||||
return null;
|
||||
}
|
||||
|
||||
return createWordListRegExp(locale as LanguageCode, [...banned, ...suspect]);
|
||||
// Because the banned and suspect word lists may sometimes overlap, we should
|
||||
// make this list as short as possible before compiling it into a RegExp.
|
||||
const phrases = uniqBy<string>([...banned, ...suspect], lowerCase);
|
||||
|
||||
// The locale is passed down to us from the Graph, we can cast it to a
|
||||
// LanguageCode.
|
||||
return createWordListRegExp(locale as LanguageCode, phrases);
|
||||
}
|
||||
|
||||
// cache is used as a global validator to the cached RegExp used by the
|
||||
// Cache is used as a global validator to the cached RegExp used by the
|
||||
// application. We expect that generally, there is only ever one word list used
|
||||
// by the client at a time, so this ensures that we only re-create the word list
|
||||
// if we must.
|
||||
const cache = {
|
||||
interface Cache {
|
||||
keys: {
|
||||
locale: string;
|
||||
suspect: ReadonlyArray<string>;
|
||||
banned: ReadonlyArray<string>;
|
||||
};
|
||||
value: RegExp | null;
|
||||
}
|
||||
|
||||
const cache: Cache = {
|
||||
keys: {
|
||||
locale: "",
|
||||
suspect: [] as ReadonlyArray<string>,
|
||||
banned: [] as ReadonlyArray<string>,
|
||||
suspect: [],
|
||||
banned: [],
|
||||
},
|
||||
value: null as RegExp | null,
|
||||
value: null,
|
||||
};
|
||||
|
||||
export default function(options: GetPhrasesRegExpOptions) {
|
||||
@@ -57,7 +74,12 @@ export default function(options: GetPhrasesRegExpOptions) {
|
||||
|
||||
// If the cache is expired, or the value doesn't exist, regenerate it.
|
||||
if (expired) {
|
||||
cache.value = getPhrasesRegExp(options);
|
||||
try {
|
||||
cache.value = getPhrasesRegExp(options);
|
||||
} catch (err) {
|
||||
window.console.error(err);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
return cache.value;
|
||||
|
||||
@@ -3,3 +3,4 @@ export {
|
||||
default as getPhrasesRegExp,
|
||||
GetPhrasesRegExpOptions,
|
||||
} from "./getPhrasesRegExp";
|
||||
export { default as markHTMLNode } from "./markHTMLNode";
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
// markPhrasesHTML looks for `suspect` and `banned` words inside `text` given
|
||||
// the settings applied for the locale and highlights them by returning an HTML
|
||||
// string.
|
||||
function markPhrasesHTML(text: string, expression: RegExp) {
|
||||
const tokens = text.split(expression);
|
||||
|
||||
// If there were less than two matches, then there was no matched word
|
||||
// associated with the passed in text.
|
||||
if (tokens.length < 3) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return tokens
|
||||
.map((token, i) =>
|
||||
// Using our Regexp patterns it returns tokens arranged this way:
|
||||
//
|
||||
// - STRING_WITH_NO_MATCH
|
||||
// - NEW_WORD_DELIMITER
|
||||
// - MATCHED_WORD
|
||||
// - NEW_WORD_DELIMITER
|
||||
// - ...
|
||||
//
|
||||
// This pattern repeats throughout. Next line will mark MATCHED_WORD.
|
||||
i % 4 === 2 ? "<mark>" + token + "</mark>" : token
|
||||
)
|
||||
.join("");
|
||||
}
|
||||
|
||||
// markHTMLNode manipulates the node by looking for #text nodes and adding
|
||||
// markers.
|
||||
export default function markHTMLNode(parentNode: Node, expression: RegExp) {
|
||||
parentNode.childNodes.forEach(node => {
|
||||
// Anchor links are already marked by default, skip them now.
|
||||
if (node.nodeName === "A") {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the node isn't of text type then we can't mark it directly.
|
||||
if (node.nodeName !== "#text") {
|
||||
return markHTMLNode(node, expression);
|
||||
}
|
||||
|
||||
// If the node doesn't have any text content, then we can't mark it either.
|
||||
if (!node.textContent) {
|
||||
return;
|
||||
}
|
||||
|
||||
// We've encountered a text node with text content that isn't in an anchor
|
||||
// link. We should try to mark and replace it's content.
|
||||
const replacement = markPhrasesHTML(node.textContent, expression);
|
||||
if (replacement) {
|
||||
// Create the new span node to replace the old node with.
|
||||
const newNode = document.createElement("span");
|
||||
newNode.innerHTML = replacement;
|
||||
parentNode.replaceChild(newNode, node);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -3,6 +3,10 @@ $fullscreenZIndex: 10;
|
||||
|
||||
.wrapper {
|
||||
@mixin bodyCopy;
|
||||
i
|
||||
em {
|
||||
font-style: italic;
|
||||
}
|
||||
b,
|
||||
strong {
|
||||
font-weight: var(--font-weight-medium);
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
color: var(--palette-text-dark);
|
||||
overflow-wrap: break-word;
|
||||
|
||||
& * bold,
|
||||
& * strong {
|
||||
b,
|
||||
strong {
|
||||
font-weight: var(--font-weight-medium);
|
||||
}
|
||||
& * italic,
|
||||
& * em {
|
||||
i
|
||||
em {
|
||||
font-style: italic;
|
||||
}
|
||||
blockquote {
|
||||
|
||||
@@ -0,0 +1,179 @@
|
||||
// Jest Snapshot v1, https://goo.gl/fbAQLP
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 1`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 2`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french;fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 3`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french!fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 4`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french.fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 5`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french?fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 6`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french¿fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the multi-words in a sentence correctly 7`] = `
|
||||
Array [
|
||||
"this sentence has",
|
||||
" ",
|
||||
"french:fries",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the words in a sentence correctly 1`] = `
|
||||
Array [
|
||||
"this sentence is",
|
||||
" ",
|
||||
"bad",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the words in a sentence correctly 2`] = `
|
||||
Array [
|
||||
"this sentence is",
|
||||
" ",
|
||||
"worse",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the words with unicode in a sentence correctly 1`] = `
|
||||
Array [
|
||||
"this sentence has one",
|
||||
" ",
|
||||
"jalapeño",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits the words with unicode in a sentence correctly 2`] = `
|
||||
Array [
|
||||
"this sentence has many jalapeños.",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`en-US splits words when there are repeat words 1`] = `
|
||||
Array [
|
||||
"This is",
|
||||
" ",
|
||||
"bad",
|
||||
" ",
|
||||
"bad, very",
|
||||
" ",
|
||||
"BAD",
|
||||
".",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 1`] = `
|
||||
Array [
|
||||
"biólogo se soletra com",
|
||||
": ",
|
||||
"bi",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 2`] = `
|
||||
Array [
|
||||
"",
|
||||
"",
|
||||
"m.e.r.d.a",
|
||||
"",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 3`] = `
|
||||
Array [
|
||||
"não tomo",
|
||||
" ",
|
||||
"café",
|
||||
" ",
|
||||
"pois faz mal",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 4`] = `
|
||||
Array [
|
||||
"",
|
||||
"",
|
||||
"Como fazer coisas ruins",
|
||||
"",
|
||||
"",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 5`] = `
|
||||
Array [
|
||||
"O biólogo recomenda este artigo",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 6`] = `
|
||||
Array [
|
||||
"cafe",
|
||||
]
|
||||
`;
|
||||
|
||||
exports[`pt-BR splits the words with unicode in a sentence correctly 7`] = `
|
||||
Array [
|
||||
"Ser banido é uma merda",
|
||||
]
|
||||
`;
|
||||
@@ -0,0 +1,173 @@
|
||||
import createWordListRegExp from "./createWordListRegExp";
|
||||
|
||||
const buildTester = (re: RegExp) => (str: string) => re.test(str);
|
||||
|
||||
const buildSplitter = (re: RegExp) => (str: string) => str.split(re);
|
||||
|
||||
describe("en-US", () => {
|
||||
const re = createWordListRegExp("en-US", [
|
||||
"bad",
|
||||
"french fries",
|
||||
"worse",
|
||||
"jalapeño",
|
||||
"1km",
|
||||
]);
|
||||
|
||||
const test = buildTester(re);
|
||||
|
||||
it("test words in the list", () => {
|
||||
expect(test("bad")).toBeTruthy();
|
||||
expect(test("worse")).toBeTruthy();
|
||||
expect(test("1km")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test repeated words in the list", () => {
|
||||
expect(test("bad bad bad")).toBeTruthy();
|
||||
expect(test("worse worse worse")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test words not in the list", () => {
|
||||
expect(test("fine")).toBeFalsy();
|
||||
expect(test("ok")).toBeFalsy();
|
||||
});
|
||||
|
||||
it("test words that end with a unicode character", () => {
|
||||
expect(test("I have one jalapeño.")).toBeTruthy();
|
||||
expect(test("I have two jalapeños.")).toBeFalsy();
|
||||
});
|
||||
|
||||
it("test words in the list while being case insensitive", () => {
|
||||
expect(test("Bad")).toBeTruthy();
|
||||
expect(test("BAd")).toBeTruthy();
|
||||
expect(test("BAD")).toBeTruthy();
|
||||
expect(test("bAD")).toBeTruthy();
|
||||
expect(test("baD")).toBeTruthy();
|
||||
expect(test("bAd")).toBeTruthy();
|
||||
expect(test("1KM")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test multi-words", () => {
|
||||
expect(test("french fries")).toBeTruthy();
|
||||
expect(test("french!fries")).toBeTruthy();
|
||||
expect(test("french.fries")).toBeTruthy();
|
||||
expect(test("french?fries")).toBeTruthy();
|
||||
expect(test("french¿fries")).toBeTruthy();
|
||||
expect(test("french:fries")).toBeTruthy();
|
||||
expect(test("french;fries")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test words at the end of a sentence", () => {
|
||||
expect(test("this sentence is bad.")).toBeTruthy();
|
||||
expect(test("this sentence is worse.")).toBeTruthy();
|
||||
expect(test("this sentence has french fries.")).toBeTruthy();
|
||||
expect(test("this sentence has french!fries.")).toBeTruthy();
|
||||
expect(test("this sentence has french.fries.")).toBeTruthy();
|
||||
expect(test("this sentence has french?fries.")).toBeTruthy();
|
||||
expect(test("this sentence has french¿fries.")).toBeTruthy();
|
||||
expect(test("this sentence has french:fries.")).toBeTruthy();
|
||||
expect(test("this sentence has french;fries.")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test words at the start of a sentence", () => {
|
||||
expect(test("bad is the start of the sentence.")).toBeTruthy();
|
||||
expect(test("worse is the start of the sentence.")).toBeTruthy();
|
||||
expect(test("french fries is the start of the sentence.")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test does not preserve state", () => {
|
||||
expect(test("bad 1")).toBeTruthy();
|
||||
expect(test("bad 2")).toBeTruthy();
|
||||
expect(test("more bad 3")).toBeTruthy();
|
||||
expect(test("more bad 4")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test repeated words", () => {
|
||||
expect(test("This is bad bad, very bad")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test does not match substrings", () => {
|
||||
expect(test("baddd")).toBeFalsy();
|
||||
expect(test("wwworse")).toBeFalsy();
|
||||
expect(test("fffrench fries")).toBeFalsy();
|
||||
});
|
||||
|
||||
it("test handles when there are numbers", () => {
|
||||
expect(test("bad3")).toBeTruthy();
|
||||
expect(test("3bad")).toBeTruthy();
|
||||
});
|
||||
|
||||
const split = buildSplitter(re);
|
||||
|
||||
it("splits the words in a sentence correctly", () => {
|
||||
expect(split("this sentence is bad.")).toMatchSnapshot();
|
||||
expect(split("this sentence is worse.")).toMatchSnapshot();
|
||||
});
|
||||
|
||||
it("splits words when there are repeat words", () => {
|
||||
expect(split("This is bad bad, very BAD.")).toMatchSnapshot();
|
||||
});
|
||||
|
||||
it("splits the words with unicode in a sentence correctly", () => {
|
||||
expect(split("this sentence has one jalapeño.")).toMatchSnapshot();
|
||||
expect(split("this sentence has many jalapeños.")).toMatchSnapshot();
|
||||
});
|
||||
|
||||
it("splits the multi-words in a sentence correctly", () => {
|
||||
expect(split("this sentence has french fries.")).toMatchSnapshot();
|
||||
expect(split("this sentence has french;fries.")).toMatchSnapshot();
|
||||
expect(split("this sentence has french!fries.")).toMatchSnapshot();
|
||||
expect(split("this sentence has french.fries.")).toMatchSnapshot();
|
||||
expect(split("this sentence has french?fries.")).toMatchSnapshot();
|
||||
expect(split("this sentence has french¿fries.")).toMatchSnapshot();
|
||||
expect(split("this sentence has french:fries.")).toMatchSnapshot();
|
||||
});
|
||||
});
|
||||
|
||||
describe("es", () => {
|
||||
const re = createWordListRegExp("es", ["adónde vas", "tú"]);
|
||||
|
||||
const test = buildTester(re);
|
||||
|
||||
it("test words in the list", () => {
|
||||
expect(test("Pablo, ¿adónde vas?")).toBeTruthy();
|
||||
expect(test("Estoy cansado, ¿y tú?")).toBeTruthy();
|
||||
expect(test("¿tú?")).toBeTruthy();
|
||||
});
|
||||
});
|
||||
|
||||
describe("pt-BR", () => {
|
||||
const re = createWordListRegExp("pt-BR", [
|
||||
"bi",
|
||||
"outro",
|
||||
"café",
|
||||
"m e r d a",
|
||||
"Como fazer coisas ruins",
|
||||
]);
|
||||
|
||||
const test = buildTester(re);
|
||||
|
||||
it("test words in the list", () => {
|
||||
expect(test("biólogo se soletra com: bi ")).toBeTruthy();
|
||||
expect(test("m.e.r.d.a")).toBeTruthy();
|
||||
expect(test("não tomo café pois faz mal")).toBeTruthy();
|
||||
expect(test("Como fazer coisas ruins")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("test words not in the list", () => {
|
||||
expect(test("O biólogo recomenda este artigo")).toBeFalsy();
|
||||
expect(test("cafe")).toBeFalsy();
|
||||
expect(test("Ser banido é uma merda")).toBeFalsy();
|
||||
});
|
||||
|
||||
const split = buildSplitter(re);
|
||||
|
||||
it("splits the words with unicode in a sentence correctly", () => {
|
||||
expect(split("biólogo se soletra com: bi ")).toMatchSnapshot();
|
||||
expect(split("m.e.r.d.a")).toMatchSnapshot();
|
||||
expect(split("não tomo café pois faz mal")).toMatchSnapshot();
|
||||
expect(split("Como fazer coisas ruins")).toMatchSnapshot();
|
||||
expect(split("O biólogo recomenda este artigo")).toMatchSnapshot();
|
||||
expect(split("cafe")).toMatchSnapshot();
|
||||
expect(split("Ser banido é uma merda")).toMatchSnapshot();
|
||||
});
|
||||
});
|
||||
@@ -1,18 +1,20 @@
|
||||
import { defaults } from "lodash";
|
||||
import XRegExp from "xregexp";
|
||||
|
||||
import { LanguageCode } from "coral-common/helpers";
|
||||
import { DeepPartial } from "coral-common/types";
|
||||
|
||||
interface WordListRule {
|
||||
split: string;
|
||||
boundary: string;
|
||||
punctuation: string;
|
||||
whitespace: string;
|
||||
}
|
||||
|
||||
const DefaultWordListRule: WordListRule = {
|
||||
split: "[^\\w]",
|
||||
punctuation: '[\\s"?!.¿¡`:;]+',
|
||||
whitespace: "\\s+",
|
||||
// The following symbol, \p{L} refers to any letter class within unicode.
|
||||
// Because we're adding the ^, we're also saying to exclude any from that set,
|
||||
// leaving all non-word characters from unicode available for selection.
|
||||
boundary: "[^\\p{L}]+",
|
||||
punctuation: "[\\s\"'?!.,¿¡`:;]+",
|
||||
};
|
||||
|
||||
const WordListRules: DeepPartial<Record<LanguageCode, WordListRule>> = {
|
||||
@@ -47,14 +49,12 @@ export default function createWordListRegExp(
|
||||
DefaultWordListRule
|
||||
);
|
||||
|
||||
const whitespace = new RegExp(rule.whitespace);
|
||||
|
||||
// Split up the words from the list into a regex escaped string.
|
||||
const words = phrases
|
||||
.map(phrase =>
|
||||
phrase
|
||||
// Split each phrase by whitespace.
|
||||
.split(whitespace)
|
||||
.split(/\s/)
|
||||
// Escape each phrase, we don't expect any of them to contain regex.
|
||||
.map(word => escapeRegExp(word))
|
||||
// Rejoin to ensure that any variation of the word separated by a
|
||||
@@ -64,13 +64,18 @@ export default function createWordListRegExp(
|
||||
// For each of these words, wrap a `|` or OR.
|
||||
.join("|");
|
||||
|
||||
// Wrap the pattern in split rules.
|
||||
const pattern = `(^|${rule.split})(${words})($|${rule.split})`;
|
||||
// Wrap the pattern in split rules. We want to match any word that either is
|
||||
// at the start of a string, or a word boundary. The word must also either be
|
||||
// at the end of the string or at another word boundary.
|
||||
const pattern = `(^|${rule.boundary})(${words})($|${rule.boundary})`;
|
||||
|
||||
try {
|
||||
return new RegExp(pattern, "iu");
|
||||
// Create the RegExp using xregexp to pre-process the pattern to generate
|
||||
// one with the correct unicode ranges. Including A for "astral" unicode
|
||||
// support for supporting higher character ranges.
|
||||
return XRegExp(pattern, "iuA");
|
||||
} catch {
|
||||
// IE does not support unicode support, so we'll create one without.
|
||||
return new RegExp(pattern, "i");
|
||||
return XRegExp(pattern, "i");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,4 +12,3 @@ export { default as isPromiseLike } from "./isPromiseLike";
|
||||
export { default as isPromise } from "./isPromise";
|
||||
export { default as startsWith } from "./startsWith";
|
||||
export { default as getOrigin } from "./getOrigin";
|
||||
export { default as createWordListRegExp } from "./createWordListRegExp";
|
||||
|
||||
@@ -1,82 +0,0 @@
|
||||
import {
|
||||
Options,
|
||||
WordList,
|
||||
} from "coral-server/services/comments/pipeline/wordList";
|
||||
|
||||
describe("en-US", () => {
|
||||
const list = new WordList();
|
||||
const options: Options = {
|
||||
id: "tenant_1",
|
||||
locale: "en-US",
|
||||
wordList: {
|
||||
banned: [
|
||||
"cookies",
|
||||
"how to do bad things",
|
||||
"how to do really bad things",
|
||||
"s h i t",
|
||||
"$hit",
|
||||
"p**ch",
|
||||
"p*ch",
|
||||
"banned",
|
||||
"ban",
|
||||
],
|
||||
suspect: [],
|
||||
},
|
||||
};
|
||||
|
||||
describe("containsMatchingPhrase", () => {
|
||||
it("does match on a word in the list", () => {
|
||||
[
|
||||
"how to do really bad things",
|
||||
"what is cookies",
|
||||
"cookies",
|
||||
"COOKIES.",
|
||||
"how to do bad things",
|
||||
"How To do bad things!",
|
||||
"How.To.do.bad.things!",
|
||||
"This stuff is $hit!",
|
||||
"This is a test.\nTo see if cookies are found, in the second line.",
|
||||
"That's a p**ch!",
|
||||
"Banned words should be detected",
|
||||
].forEach(word => {
|
||||
expect(list.test(options, "banned", word)).toEqual(true);
|
||||
});
|
||||
});
|
||||
|
||||
it("does not match on a word not in the list", () => {
|
||||
[
|
||||
"how to",
|
||||
"cookie",
|
||||
"how to be a great person?",
|
||||
"how to not do really bad things?",
|
||||
"i have $100 dollars.",
|
||||
"I have bad $ hit lling",
|
||||
"That's a p***ch!",
|
||||
"When bann is spelt wrong, it won't be caught.",
|
||||
].forEach(word => {
|
||||
expect(list.test(options, "banned", word)).toEqual(false);
|
||||
});
|
||||
});
|
||||
|
||||
it("allows an empty list", () => {
|
||||
expect(list.test(options, "banned", "test")).toEqual(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("containsMatchingPhraseMemoized", () => {
|
||||
it("return true for all cases after memoizing the first result", () => {
|
||||
[
|
||||
"cookies 1",
|
||||
"cookies 2",
|
||||
"cookies 4",
|
||||
"cookies 5",
|
||||
"this is for cookies 6",
|
||||
"this is for cookies 7",
|
||||
"this is for cookies 8",
|
||||
"this is for cookies 9",
|
||||
].forEach(word => {
|
||||
expect(list.test(options, "banned", word)).toEqual(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -2,7 +2,7 @@ import ms from "ms";
|
||||
import now from "performance-now";
|
||||
|
||||
import { LanguageCode } from "coral-common/helpers";
|
||||
import { createWordListRegExp } from "coral-common/utils";
|
||||
import createWordListRegExp from "coral-common/utils/createWordListRegExp";
|
||||
import logger from "coral-server/logger";
|
||||
import { Tenant } from "coral-server/models/tenant";
|
||||
|
||||
@@ -90,8 +90,8 @@ export class WordList {
|
||||
|
||||
const startedAt = now();
|
||||
const result = list.test(testString);
|
||||
logger.debug(
|
||||
{ tenantID: options.id, took: ms(now() - startedAt) },
|
||||
logger.info(
|
||||
{ tenantID: options.id, listName, took: ms(now() - startedAt) },
|
||||
"word list phrase test complete"
|
||||
);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user