Finalizes converting the transition words assessment to use the HTML parser
Yoast · Dec 8, 2023 · 46330fd · 46330fd
1 parent a773f48
commit 46330fd
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 70 deletions.
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/word/getWordsForHTMLParserSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/word/getWordsForHTMLParserSpec.js
@@ -73,6 +73,12 @@ const testCases = [
 		text: "[caption]test[/caption]",
 		expectedResult: [ "[", "caption", "]", "test", "[", "/caption", "]" ],
 	},
+	{
+		description: "correctly tokenizes abbreviations",
+		text: "E.g. Asia contains many large flowing streams of water (i.e., rivers).",
+		expectedResult: [ "E.g.", " ", "Asia", " ", "contains", " ", "many", " ", "large", " ", "flowing", " ",
+			"streams", " ", "of", " ", "water", " ", "(", "i.e.", ",", " ", "rivers", ")", "." ],
+	},
 	{
 		description: "doesn't match with a hashed HTML entity (in this case, '#trade;' for '™') in the beginning or the end of the word",
 		text: "one trademark#trade;, and another '#trade;trademark'",

diff --git a/packages/yoastseo/spec/languageProcessing/researches/findTransitionWordsSpec.js b/packages/yoastseo/spec/languageProcessing/researches/findTransitionWordsSpec.js
@@ -1,4 +1,3 @@
-/* eslint-disable capitalized-comments, spaced-comment */
 import transitionWordsResearch from "../../../src/languageProcessing/researches/findTransitionWords.js";
 import Paper from "../../../src/values/Paper.js";
 import EnglishResearcher from "../../../src/languageProcessing/languages/en/Researcher";
@@ -240,6 +239,7 @@ describe( "a test for finding transition words from a string", function() {
 	it( "works with normalizes quotes", function() {
 		// Transition word: what’s more.
 		mockPaper = new Paper( "what’s more", {} );
+		buildTree( mockPaper, mockResearcher );
 		result = transitionWordsResearch( mockPaper, mockResearcher );
 
 		expect( result ).toEqual( {
@@ -266,6 +266,7 @@ describe( "a test for finding transition words from a string", function() {
 			transitionWordSentences: 1,
 		};
 
+		buildTree( mockPaper, mockResearcher );
 		result = transitionWordsResearch( mockPaper, mockResearcher );
 
 		expect( result ).toEqual( expected );
@@ -280,26 +281,28 @@ describe( "a test for finding transition words from a string", function() {
 			transitionWordSentences: 0,
 		};
 
+		buildTree( mockPaper, mockResearcher );
 		result = transitionWordsResearch( mockPaper, mockResearcher );
 
 		expect( result ).toEqual( expected );
 	} );
 
 	it( "does recognize transition words with full stops, like 'e.g.'.", function() {
-		// Non-transition word: eggs.
+		// Transition words: e.g., i.e.
 		mockPaper = new Paper( "E.g. potatoes. I.e. apples." );
 		const expected = {
 			sentenceResults: [ {
 				sentence: "E.g. potatoes.",
 				transitionWords: [ "e.g." ],
 			}, {
-				sentence: "I.e. apples.",
+				sentence: " I.e. apples.",
 				transitionWords: [ "i.e." ],
 			} ],
 			totalSentences: 2,
 			transitionWordSentences: 2,
 		};
 
+		buildTree( mockPaper, mockResearcher );
 		result = transitionWordsResearch( mockPaper, mockResearcher );
 
 		expect( result ).toEqual( expected );

diff --git a/packages/yoastseo/spec/parse/build/private/tokenizeSpec.js b/packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
@@ -501,6 +501,30 @@ describe( "A test for the tokenize function",
 			expect( br2.sourceCodeRange ).toEqual( { startOffset: 38, endOffset: 39 } );
 			expect( another1.sourceCodeRange ).toEqual( { startOffset: 39, endOffset: 46 } );
 		} );
+
+		it( "should correctly tokenize a paragraph with abbreviations", function() {
+			const mockPaper = new Paper( "<p>This is e.g. a cat from 2023 A.D., nice!</p>" );
+			const mockResearcher = new EnglishResearcher( mockPaper );
+			const languageProcessor = new LanguageProcessor( mockResearcher );
+			buildTreeNoTokenize( mockPaper );
+			const result = tokenize( mockPaper.getTree(), languageProcessor );
+			const sentences = result.childNodes[ 0 ].sentences;
+			expect( sentences.length ).toEqual( 1 );
+			const firstSentence = sentences[ 0 ];
+			expect( firstSentence.text ).toEqual( "This is e.g. a cat from 2023 A.D., nice!" );
+			expect( firstSentence.sourceCodeRange ).toEqual( { startOffset: 3, endOffset: 43 } );
+			expect( firstSentence.tokens.length ).toEqual( 19 );
+			const [ this1, , is1, , eg1, , a1, , cat1, , from1, , year1, , ad1, , , nice1, , ] = firstSentence.tokens;
+			expect( this1.sourceCodeRange ).toEqual( { startOffset: 3, endOffset: 7 } );
+			expect( is1.sourceCodeRange ).toEqual( { startOffset: 8, endOffset: 10 } );
+			expect( eg1.sourceCodeRange ).toEqual( { startOffset: 11, endOffset: 15 } );
+			expect( a1.sourceCodeRange ).toEqual( { startOffset: 16, endOffset: 17 } );
+			expect( cat1.sourceCodeRange ).toEqual( { startOffset: 18, endOffset: 21 } );
+			expect( from1.sourceCodeRange ).toEqual( { startOffset: 22, endOffset: 26 } );
+			expect( year1.sourceCodeRange ).toEqual( { startOffset: 27, endOffset: 31 } );
+			expect( ad1.sourceCodeRange ).toEqual( { startOffset: 32, endOffset: 36 } );
+			expect( nice1.sourceCodeRange ).toEqual( { startOffset: 38, endOffset: 42 } );
+		} );
 	} );
 
 describe( "A test for tokenizing a Japanese sentence", function() {

diff --git a/packages/yoastseo/spec/scoring/assessments/readability/TransitionWordsAssessmentSpec.js b/packages/yoastseo/spec/scoring/assessments/readability/TransitionWordsAssessmentSpec.js
@@ -5,6 +5,7 @@ import TransitionWordsAssessment from "../../../../src/scoring/assessments/reada
 import Paper from "../../../../src/values/Paper.js";
 import Factory from "../../../specHelpers/factory.js";
 import Mark from "../../../../src/values/Mark.js";
+import buildTree from "../../../specHelpers/parse/buildTree";
 
 const shortTextJapanese = "熱".repeat( 399 );
 const longTextJapanese = "熱".repeat( 400 );
@@ -107,6 +108,7 @@ describe( "An assessment for transition word percentage", function() {
 			"However, a cat with the toy looks happier. She is given raw food. Seniors don't like it.<br></br>\n" +
 			"</p>" );
 		const researcher = new EnglishResearcher( paper );
+		buildTree( paper, researcher );
 		const result = new TransitionWordsAssessment().getResult( paper, researcher );
 
 		expect( result.getScore() ).toEqual( 9 );
@@ -217,10 +219,11 @@ describe( "A test for marking sentences containing a transition word", function(
 			"However, a cat with the toy looks happier. She is given raw food. Seniors don't like it.<br></br>\n" +
 			"</p>" );
 		const researcher = new EnglishResearcher( paper );
+		buildTree( paper, researcher );
 		const expected = [
 			new Mark( {
-				original: "However, a cat with the toy looks happier.",
-				marked: "<yoastmark class='yoast-text-mark'>However, a cat with the toy looks happier.</yoastmark>" } ),
+				original: " However, a cat with the toy looks happier.",
+				marked: "<yoastmark class='yoast-text-mark'> However, a cat with the toy looks happier.</yoastmark>" } ),
 		];
 		expect( new TransitionWordsAssessment().getMarks( paper, researcher ) ).toEqual( expected );
 	} );

diff --git a/packages/yoastseo/src/languageProcessing/helpers/word/getWordsForHTMLParser.js b/packages/yoastseo/src/languageProcessing/helpers/word/getWordsForHTMLParser.js
@@ -1,5 +1,6 @@
 import { punctuationRegexEnd, punctuationRegexStart } from "../sanitize/removePunctuation";
 import { hashedHtmlEntitiesRegexEnd, hashedHtmlEntitiesRegexStart } from "../../../helpers/htmlEntities";
+import abbreviations from "../../languages/en/config/abbreviations";
 
 /*
  * The following regex matches a word separator. A word separator is either a whitespace, a slash, a
@@ -14,6 +15,8 @@ import { hashedHtmlEntitiesRegexEnd, hashedHtmlEntitiesRegexStart } from "../../
  */
 const wordSeparatorsRegex = /([\s\t\u00A0[\]])/;
 
+const ABBREVIATIONS = abbreviations.map( abbreviation => abbreviation.toLocaleLowerCase() );
+
 /**
  * Tokenizes a text similar to getWords, but in a suitable way for the HTML parser.
  * 1. It does not normalize whitespace.
@@ -49,8 +52,10 @@ const getWordsForHTMLParser = ( text ) => {
 			token = token.slice( 1 );
 		}
 		// Add all punctuation marks that occur after the last letter of the token to the posttokens array.
-		// Also, prevent matching with a hashed HTML entity at the end of the token.
-		while ( punctuationRegexEnd.test( token ) && ! hashedHtmlEntitiesRegexEnd.test( token ) ) {
+		// Also, prevent matching with a hashed HTML entity at the end of the token, and leave a token that is an abbreviation intact.
+		while ( punctuationRegexEnd.test( token ) &&
+		! hashedHtmlEntitiesRegexEnd.test( token ) &&
+		! ABBREVIATIONS.includes( token.toLocaleLowerCase() ) ) {
 			// Using unshift here because we are iterating from the end of the string to the beginning,
 			// and we want to keep the order of the punctuation marks.
 			// Therefore, we add them to the start of the array.

diff --git a/packages/yoastseo/src/languageProcessing/researches/findTransitionWords.js b/packages/yoastseo/src/languageProcessing/researches/findTransitionWords.js
@@ -1,88 +1,60 @@
-import createRegexFromDoubleArray from "../helpers/regex/createRegexFromDoubleArray.js";
-import { normalizeSingle as normalizeSingleQuotes } from "../helpers/sanitize/quotes.js";
-import { isWordInSentence as matchWordInSentence } from "../helpers/word/matchWordInSentence.js";
-
-import { flattenDeep } from "lodash-es";
 import getSentencesFromTree from "../helpers/sentence/getSentencesFromTree";
-
-let regexFromDoubleArray = null;
-let regexFromDoubleArrayCacheKey = "";
-
-/**
- * Memoizes the createRegexFromDoubleArray with the twoPartTransitionWords.
- *
- * @param {Array} twoPartTransitionWords The array containing two-part transition words.
- *
- * @returns {RegExp} The RegExp to match text with a double array.
- */
-function getRegexFromDoubleArray( twoPartTransitionWords ) {
-	const cacheKey = flattenDeep( twoPartTransitionWords ).join( "" );
-	if ( regexFromDoubleArrayCacheKey !== cacheKey || regexFromDoubleArray === null ) {
-		regexFromDoubleArrayCacheKey = cacheKey;
-		regexFromDoubleArray = createRegexFromDoubleArray( twoPartTransitionWords );
-	}
-	return regexFromDoubleArray;
-}
-
-/**
- * Matches the sentence against two part transition words.
- *
- * @param {string} sentence The sentence to match against.
- * @param {Array} twoPartTransitionWords The array containing two-part transition words.
- * @returns {Array} The found transitional words.
- */
-const matchTwoPartTransitionWords = function( sentence, twoPartTransitionWords ) {
-	const twoPartTransitionWordsRegex = getRegexFromDoubleArray( twoPartTransitionWords );
-	return sentence.match( twoPartTransitionWordsRegex );
-};
+import { includesConsecutiveWords } from "../../scoring/assessments/inclusiveLanguage/helpers/includesConsecutiveWords";
 
 /**
  * Matches the sentence against transition words.
  *
- * @param {string} sentence The sentence to match against.
- * @param {Array} transitionWords The array containing transition words.
- * @returns {Array} The found transitional words.
+ * @param {Sentence} sentence The sentence to match against.
+ * @param {string[]} transitionWords The array containing transition words.
+ * @returns {string[]} The found transition words.
  */
 const matchTransitionWords = function( sentence, transitionWords ) {
 	return transitionWords.filter( transitionWord => {
-		// Split into [ "as", "can", "be", "seen" ]
+		// Splits "as I have noted" into [ "as", "i", "have", "noted" ]
 		transitionWord = transitionWord.toLocaleLowerCase().split( " " );
+		// Retrieve the tokens of the sentence, remove the space tokens, and lowercase the remaining ones.
+		const tokens = sentence.tokens.filter( token => token.text !== " " ).map( token => token.text.toLocaleLowerCase() );
+		// Find the transitionWord in the tokens.
+		return includesConsecutiveWords( tokens, transitionWord ).length;
+	} );
+};
 
-		// Single words we can just find in the tokens.
-		if ( transitionWord.length === 1 ) {
-			return sentence.tokens.find( token => token.text.toLocaleLowerCase() === transitionWord[ 0 ] );
-		}
+/**
+ * Matches the sentence against two-part transition words.
+ *
+ * @param {Sentence} sentence The sentence to match against.
+ * @param {Array.<Array.<string>>} twoPartTransitionWords The array containing two-part transition words.
+ * @returns {Array.<Array.<string>>} The found two-part transition words, each as a [ first, second ] pair.
+ */
+const matchTwoPartTransitionWords = function( sentence, twoPartTransitionWords ) {
+	return twoPartTransitionWords.filter( twoPartTransitionWord => {
+		const first = twoPartTransitionWord[ 0 ];
+		const second = twoPartTransitionWord[ 1 ];
+		const firstFound = matchTransitionWords( sentence, [ first ] );
+		const secondFound = matchTransitionWords( sentence, [ second ] );
 
-		// For multiple words, we need to (1) find them, (2) check if all have been found, (3) check if they are in order.
-		// Problem: what if the words appear multiple times in the sentence? We really need them to be in order.
-		const wordIndices = transitionWord.map( word => sentence.tokens.findIndex( token => {
-			return token.text.toLocaleLowerCase() === word;
-		} ) );
-		if ( wordIndices.includes( -1 ) ) {
-			return false;
-		}
-		return wordIndices.every( ( wordIndex, i ) => i === wordIndices.length - 1 || wordIndex + 2 === wordIndices[ i + 1 ] );
+		return firstFound.length && secondFound.length;
 	} );
 };
 
 /**
  * Checks the passed sentences to see if they contain transition words.
  *
- * @param {Array} sentences The sentences to match against.
- * @param {Array} transitionWords The array containing transition words.
- * @param {Array} twoPartTransitionWords The array containing two part transition words.
+ * @param {Sentence[]} sentences The sentences to match against.
+ * @param {string[]} transitionWords The array containing transition words.
+ * @param {Array.<Array.<string>>} twoPartTransitionWords The array containing two-part transition words.
  * @param {function} matchTransitionWordsHelper The language-specific helper function to match transition words in a sentence.
  *
- * @returns {Array} Array of sentence objects containing the complete sentence and the transition words.
+ * @returns {{sentence: string, transitionWords: string[]}[]} Array of sentence objects containing the complete sentence and the transition words.
  */
 const checkSentencesForTransitionWords = function( sentences, transitionWords, twoPartTransitionWords, matchTransitionWordsHelper ) {
 	const results = [];
 
 	sentences.forEach( sentence => {
 		if ( twoPartTransitionWords ) {
-			const twoPartMatches = matchTwoPartTransitionWords( sentence.text, twoPartTransitionWords );
+			const twoPartMatches = matchTwoPartTransitionWords( sentence, twoPartTransitionWords );
 
-			if ( twoPartMatches !== null ) {
+			if ( twoPartMatches.length !== 0 ) {
 				results.push( {
 					sentence: sentence.text,
 					transitionWords: twoPartMatches,
@@ -93,7 +65,7 @@ const checkSentencesForTransitionWords = function( sentences, transitionWords, t
 		}
 
 		const transitionWordMatches = matchTransitionWordsHelper
-			? matchTransitionWordsHelper( sentence, transitionWords )
+			? matchTransitionWordsHelper( sentence.text, transitionWords )
 			: matchTransitionWords( sentence, transitionWords );
 
 		if ( transitionWordMatches.length !== 0 ) {
@@ -109,13 +81,13 @@ const checkSentencesForTransitionWords = function( sentences, transitionWords, t
 
 /**
  * Checks how many sentences from a text contain at least one transition word or two-part transition word
- * that are defined in the transition words config and two part transition words config.
+ * that are defined in the transition words config and two-part transition words config.
  *
  * @param {Paper} paper The Paper object to get text from.
  * @param {Researcher} researcher The researcher.
  *
- * @returns {object} An object with the total number of sentences in the text
- *                   and the total number of sentences containing one or more transition words.
+ * @returns {{totalSentences: number, sentenceResults: {sentence: string, transitionWords: string[]}[], transitionWordSentences: number}} An object
+ * with the total number of sentences in the text and the total number of sentences containing one or more transition words.
  */
 export default function( paper, researcher ) {
 	const matchTransitionWordsHelper = researcher.getHelper( "matchTransitionWordsHelper" );