Match phrases not just words

2026-07-02 00:29:44 +08:00 · 2017-09-13 22:20:32 +07:00
parent 7976aace00
commit a16f101a12
1 changed files with 81 additions and 16 deletions
@@ -1,24 +1,89 @@
 import React from 'react';
 import {matchLinks} from '../utils';

-const wordSeparator = /([.\s'"?!])/;
+const capturingWordSeparator = /([.\s'"?!])/;
+const wordSeparator = /[.\s'"?!]/;

-// markWords looks for `words` inside `body` and highlights them by returning
+// markPhrases looks for `phrases` inside `body` and highlights them by returning
 // an array of React Elements.
-function markWords(body, words, keyPrefix) {
-  const tokens = body.split(wordSeparator);
+function markPhrases(body, phrases, keyPrefix) {
+  const tokens = body.split(capturingWordSeparator);
+  const phraseWords = phrases.map((phrase) => phrase.toLowerCase().split(wordSeparator));
  const content = [];
  let tmp = [];
-  tokens.forEach((token, i) => {
-    if (words.indexOf(token.toLowerCase()) >= 0) {
-      content.push(...tmp);
-      tmp = [];
-      content.push(<mark key={`${keyPrefix}_${i}`}>{token}</mark>);
-      return;
+
+  for (let l = 0; l < tokens.length; l++) {
+
+    // matchedWords stays 0 until a phrase fully matches; it then holds the number
+    // of tokens (words and separators) from index l to the end of the match.
+    let matchedWords = 0;
+
+    // Only try to start a match on a real word, never on a separator or ''.
+    if (tokens[l] !== '' && !tokens[l].match(wordSeparator)) {
+      for (let m = 0; m < phraseWords.length; m++) {
+        const words = phraseWords[m];
+
+        // Try to match the full phrase. index keeps track of where
+        // we are in the tokens array while we walk through the words
+        // of the phrase.
+        let index = l;
+        for (let n = 0; n < words.length; n++, index++) {
+
+          // Advance past word separators and '' so index lands on the next word.
+          while (index < tokens.length && (tokens[index].match(wordSeparator) || tokens[index] === '')) {
+            index++;
+          }
+
+          // Ran out of tokens before the phrase was complete.
+          if (index >= tokens.length) {
+            break;
+          }
+
+          const token = tokens[index].toLowerCase();
+          const word = words[n];
+          if (token !== word) {
+            break;
+          }
+
+          // The last word of the phrase matched: full match!
+          if (n === words.length - 1) {
+
+            // Record how many tokens the match spans, from l through index.
+            matchedWords = index - l + 1;
+            break;
+          }
+        }
+
+        // A phrase matched, so stop checking the remaining phrases.
+        if (matchedWords) {
+          break;
+        }
+      }
    }
-    tmp.push(token);
-  });
-  content.push(...tmp);
+
+    // We have a match: emit it as a single mark element.
+    if (matchedWords) {
+      const match = tokens.slice(l, l + matchedWords).join('');
+
+      // Flush the unmatched text buffered in `tmp` as one string and clear it.
+      content.push(tmp.join(''));
+      tmp = [];
+
+      content.push(<mark key={`${keyPrefix}_${l}`}>{match}</mark>);
+
+      // Skip the tokens consumed by the match; the loop increment steps past the last one.
+      l += matchedWords - 1;
+
+      continue;
+    }
+
+    // No match: buffer this token in `tmp`.
+    tmp.push(tokens[l]);
+  }
+
+  // Append any unmatched tokens still buffered in `tmp`.
+  content.push(tmp.join(''));
+
  return content;
 }

@@ -41,7 +106,7 @@ function markLinks(body) {
 }

 export default ({suspectWords, bannedWords, body, ...rest}) => {
-  const words = [...suspectWords, ...bannedWords].map((word) => word.toLowerCase());
+  const phrases = [...suspectWords, ...bannedWords];

  // First highlight links.
  const content = markLinks(body)
@@ -52,8 +117,8 @@ export default ({suspectWords, bannedWords, body, ...rest}) => {
        return element;
      }

-      // Highlight suspect and banned words inside this part of text.
-      return markWords(element, words, index);
+      // Highlight suspect and banned phrases inside this part of text.
+      return markPhrases(element, phrases, index);
    });
  return (
    <div {...rest}>