Skip to content

Commit

Permalink
Finalizes converting the transition words assessment to use the HTML …
Browse files Browse the repository at this point in the history
…parser
  • Loading branch information
mhkuu committed Dec 8, 2023
1 parent a773f48 commit 46330fd
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ const testCases = [
text: "[caption]test[/caption]",
expectedResult: [ "[", "caption", "]", "test", "[", "/caption", "]" ],
},
{
description: "correctly tokenizes abbreviations",
text: "E.g. Asia contains many large flowing streams of water (i.e., rivers).",
expectedResult: [ "E.g.", " ", "Asia", " ", "contains", " ", "many", " ", "large", " ", "flowing", " ",
"streams", " ", "of", " ", "water", " ", "(", "i.e.", ",", " ", "rivers", ")", "." ],
},
{
description: "doesn't match with a hashed HTML entity (in this case, '#trade;' for '™') in the beginning or the end of the word",
text: "one trademark#trade;, and another '#trade;trademark'",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
/* eslint-disable capitalized-comments, spaced-comment */
import transitionWordsResearch from "../../../src/languageProcessing/researches/findTransitionWords.js";
import Paper from "../../../src/values/Paper.js";
import EnglishResearcher from "../../../src/languageProcessing/languages/en/Researcher";
Expand Down Expand Up @@ -240,6 +239,7 @@ describe( "a test for finding transition words from a string", function() {
it( "works with normalizes quotes", function() {
// Transition word: what’s more.
mockPaper = new Paper( "what’s more", {} );
buildTree( mockPaper, mockResearcher );
result = transitionWordsResearch( mockPaper, mockResearcher );

expect( result ).toEqual( {
Expand All @@ -266,6 +266,7 @@ describe( "a test for finding transition words from a string", function() {
transitionWordSentences: 1,
};

buildTree( mockPaper, mockResearcher );
result = transitionWordsResearch( mockPaper, mockResearcher );

expect( result ).toEqual( expected );
Expand All @@ -280,26 +281,28 @@ describe( "a test for finding transition words from a string", function() {
transitionWordSentences: 0,
};

buildTree( mockPaper, mockResearcher );
result = transitionWordsResearch( mockPaper, mockResearcher );

expect( result ).toEqual( expected );
} );

it( "does recognize transition words with full stops, like 'e.g.'.", function() {
// Non-transition words: potatoes, apples.
// Transition words: e.g., i.e.
mockPaper = new Paper( "E.g. potatoes. I.e. apples." );
const expected = {
sentenceResults: [ {
sentence: "E.g. potatoes.",
transitionWords: [ "e.g." ],
}, {
sentence: "I.e. apples.",
sentence: " I.e. apples.",
transitionWords: [ "i.e." ],
} ],
totalSentences: 2,
transitionWordSentences: 2,
};

buildTree( mockPaper, mockResearcher );
result = transitionWordsResearch( mockPaper, mockResearcher );

expect( result ).toEqual( expected );
Expand Down
24 changes: 24 additions & 0 deletions packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,30 @@ describe( "A test for the tokenize function",
expect( br2.sourceCodeRange ).toEqual( { startOffset: 38, endOffset: 39 } );
expect( another1.sourceCodeRange ).toEqual( { startOffset: 39, endOffset: 46 } );
} );

it( "should correctly tokenize a paragraph with abbreviations", function() {
	const paper = new Paper( "<p>This is e.g. a cat from 2023 A.D., nice!</p>" );
	const researcher = new EnglishResearcher( paper );
	const languageProcessor = new LanguageProcessor( researcher );
	buildTreeNoTokenize( paper );

	const tokenizedTree = tokenize( paper.getTree(), languageProcessor );
	const paragraphSentences = tokenizedTree.childNodes[ 0 ].sentences;
	// The full stops inside "e.g." and "A.D." must not split the paragraph into several sentences.
	expect( paragraphSentences.length ).toEqual( 1 );

	const sentence = paragraphSentences[ 0 ];
	expect( sentence.text ).toEqual( "This is e.g. a cat from 2023 A.D., nice!" );
	expect( sentence.sourceCodeRange ).toEqual( { startOffset: 3, endOffset: 43 } );
	expect( sentence.tokens.length ).toEqual( 19 );

	// Each entry is [ token index, startOffset, endOffset ] for one word token.
	// The indices that are skipped hold the space and punctuation tokens.
	const expectedWordRanges = [
		// "This"
		[ 0, 3, 7 ],
		// "is"
		[ 2, 8, 10 ],
		// "e.g."
		[ 4, 11, 15 ],
		// "a"
		[ 6, 16, 17 ],
		// "cat"
		[ 8, 18, 21 ],
		// "from"
		[ 10, 22, 26 ],
		// "2023"
		[ 12, 27, 31 ],
		// "A.D."
		[ 14, 32, 36 ],
		// "nice"
		[ 17, 38, 42 ],
	];
	for ( const [ tokenIndex, startOffset, endOffset ] of expectedWordRanges ) {
		expect( sentence.tokens[ tokenIndex ].sourceCodeRange ).toEqual( { startOffset: startOffset, endOffset: endOffset } );
	}
} );
} );

describe( "A test for tokenizing a Japanese sentence", function() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import TransitionWordsAssessment from "../../../../src/scoring/assessments/reada
import Paper from "../../../../src/values/Paper.js";
import Factory from "../../../specHelpers/factory.js";
import Mark from "../../../../src/values/Mark.js";
import buildTree from "../../../specHelpers/parse/buildTree";

const shortTextJapanese = "熱".repeat( 399 );
const longTextJapanese = "熱".repeat( 400 );
Expand Down Expand Up @@ -107,6 +108,7 @@ describe( "An assessment for transition word percentage", function() {
"However, a cat with the toy looks happier. She is given raw food. Seniors don't like it.<br></br>\n" +
"</p>" );
const researcher = new EnglishResearcher( paper );
buildTree( paper, researcher );
const result = new TransitionWordsAssessment().getResult( paper, researcher );

expect( result.getScore() ).toEqual( 9 );
Expand Down Expand Up @@ -217,10 +219,11 @@ describe( "A test for marking sentences containing a transition word", function(
"However, a cat with the toy looks happier. She is given raw food. Seniors don't like it.<br></br>\n" +
"</p>" );
const researcher = new EnglishResearcher( paper );
buildTree( paper, researcher );
const expected = [
new Mark( {
original: "However, a cat with the toy looks happier.",
marked: "<yoastmark class='yoast-text-mark'>However, a cat with the toy looks happier.</yoastmark>" } ),
original: " However, a cat with the toy looks happier.",
marked: "<yoastmark class='yoast-text-mark'> However, a cat with the toy looks happier.</yoastmark>" } ),
];
expect( new TransitionWordsAssessment().getMarks( paper, researcher ) ).toEqual( expected );
} );
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { punctuationRegexEnd, punctuationRegexStart } from "../sanitize/removePunctuation";
import { hashedHtmlEntitiesRegexEnd, hashedHtmlEntitiesRegexStart } from "../../../helpers/htmlEntities";
import abbreviations from "../../languages/en/config/abbreviations";

/*
* The following regex matches a word separator. A word separator is either a whitespace, a slash, a
Expand All @@ -14,6 +15,8 @@ import { hashedHtmlEntitiesRegexEnd, hashedHtmlEntitiesRegexStart } from "../../
*/
const wordSeparatorsRegex = /([\s\t\u00A0[\]])/;

const ABBREVIATIONS = abbreviations.map( abbreviation => abbreviation.toLocaleLowerCase() );

/**
* Tokenizes a text similar to getWords, but in a suitable way for the HTML parser.
* 1. It does not normalize whitespace.
Expand Down Expand Up @@ -49,8 +52,10 @@ const getWordsForHTMLParser = ( text ) => {
token = token.slice( 1 );
}
// Add all punctuation marks that occur after the last letter of the token to the posttokens array.
// Also, prevent matching with a hashed HTML entity at the end of the token.
while ( punctuationRegexEnd.test( token ) && ! hashedHtmlEntitiesRegexEnd.test( token ) ) {
// Also, prevent matching with a hashed HTML entity at the end of the token, or an abbreviation.
while ( punctuationRegexEnd.test( token ) &&
! hashedHtmlEntitiesRegexEnd.test( token ) &&
! ABBREVIATIONS.includes( token.toLocaleLowerCase() ) ) {
// Using unshift here because we are iterating from the end of the string to the beginning,
// and we want to keep the order of the punctuation marks.
// Therefore, we add them to the start of the array.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,88 +1,60 @@
import createRegexFromDoubleArray from "../helpers/regex/createRegexFromDoubleArray.js";
import { normalizeSingle as normalizeSingleQuotes } from "../helpers/sanitize/quotes.js";
import { isWordInSentence as matchWordInSentence } from "../helpers/word/matchWordInSentence.js";

import { flattenDeep } from "lodash-es";
import getSentencesFromTree from "../helpers/sentence/getSentencesFromTree";

// Single-entry cache: the most recently built regex and the key of the word list it was built from.
let regexFromDoubleArray = null;
let regexFromDoubleArrayCacheKey = "";

/**
 * Memoizes the createRegexFromDoubleArray with the twoPartTransitionWords.
 *
 * Only the most recently used word list is cached: calling this function with a different list
 * rebuilds the regex and replaces the cached one.
 *
 * @param {Array} twoPartTransitionWords The array containing two-part transition words.
 *
 * @returns {RegExp} The RegExp to match text with a double array.
 */
function getRegexFromDoubleArray( twoPartTransitionWords ) {
	// Serialize with JSON so that word and pair boundaries are part of the key. Concatenating all words
	// without a separator would give e.g. [ [ "a", "bc" ] ] and [ [ "ab", "c" ] ] the same key.
	const cacheKey = JSON.stringify( twoPartTransitionWords );
	if ( regexFromDoubleArray === null || regexFromDoubleArrayCacheKey !== cacheKey ) {
		// Build first and store the key afterwards: if building throws, the cache keeps describing the
		// previous regex instead of pairing the new key with a stale regex.
		regexFromDoubleArray = createRegexFromDoubleArray( twoPartTransitionWords );
		regexFromDoubleArrayCacheKey = cacheKey;
	}
	return regexFromDoubleArray;
}

/**
 * Looks for two-part transition words in a sentence.
 *
 * The sentence is matched against the (memoized) regex that getRegexFromDoubleArray returns
 * for the given two-part transition words.
 *
 * @param {string} sentence The sentence text to search in.
 * @param {Array} twoPartTransitionWords The two-part transition words to look for.
 * @returns {Array} The result of String.prototype.match for that regex; null when nothing matches.
 */
const matchTwoPartTransitionWords = function( sentence, twoPartTransitionWords ) {
	return sentence.match( getRegexFromDoubleArray( twoPartTransitionWords ) );
};
import { includesConsecutiveWords } from "../../scoring/assessments/inclusiveLanguage/helpers/includesConsecutiveWords";

/**
 * Matches the sentence against transition words.
 *
 * Both the sentence tokens and the transition words are lower-cased before comparing. Tokens that
 * consist of a single space are ignored. A transition word is split on spaces and handed to
 * includesConsecutiveWords together with the remaining tokens; it counts as found when that helper
 * returns a non-empty result.
 *
 * @param {Sentence} sentence The sentence to match against.
 * @param {string[]} transitionWords The array containing transition words.
 * @returns {string[]} The found transition words.
 */
const matchTransitionWords = function( sentence, transitionWords ) {
	// Nothing to look for, so the sentence's tokens do not need to be read at all.
	if ( transitionWords.length === 0 ) {
		return [];
	}

	// Retrieve the tokens of the sentence once, remove the spaces and lower-case the rest.
	// NOTE(review): this list is now shared between all lookups below; that assumes
	// includesConsecutiveWords does not modify its arguments - confirm.
	const tokens = sentence.tokens
		.filter( token => token.text !== " " )
		.map( token => token.text.toLocaleLowerCase() );

	return transitionWords.filter( transitionWord => {
		// Splits "as I have noted" into [ "as", "i", "have", "noted" ].
		const words = transitionWord.toLocaleLowerCase().split( " " );
		// Find the transition word in the tokens.
		return includesConsecutiveWords( tokens, words ).length > 0;
	} );
};

// Single words we can just find in the tokens.
if ( transitionWord.length === 1 ) {
return sentence.tokens.find( token => token.text.toLocaleLowerCase() === transitionWord[ 0 ] );
}
/**
 * Matches the sentence against two-part transition words.
 *
 * A two-part transition word is reported when matchTransitionWords finds its first part as well as
 * its second part in the sentence. The relative order of the two parts is not checked here.
 *
 * @param {Sentence} sentence The sentence to match against.
 * @param {Array.<Array.<string>>} twoPartTransitionWords The array containing two-part transition words.
 * @returns {Array.<Array.<string>>} The two-part transition words (as [ first, second ] pairs) found in the sentence.
 */
const matchTwoPartTransitionWords = function( sentence, twoPartTransitionWords ) {
	return twoPartTransitionWords.filter( twoPartTransitionWord => {
		const first = twoPartTransitionWord[ 0 ];
		const second = twoPartTransitionWord[ 1 ];
		const firstFound = matchTransitionWords( sentence, [ first ] );
		const secondFound = matchTransitionWords( sentence, [ second ] );

		// Both parts have to be present for the pair to count.
		return firstFound.length > 0 && secondFound.length > 0;
	} );
};

/**
* Checks the passed sentences to see if they contain transition words.
*
* @param {Array} sentences The sentences to match against.
* @param {Array} transitionWords The array containing transition words.
* @param {Array} twoPartTransitionWords The array containing two part transition words.
* @param {Sentence[]} sentences The sentences to match against.
* @param {string[]} transitionWords The array containing transition words.
* @param {Array.<Array.<string>>} twoPartTransitionWords The array containing two-part transition words.
* @param {function} matchTransitionWordsHelper The language-specific helper function to match transition words in a sentence.
*
* @returns {Array} Array of sentence objects containing the complete sentence and the transition words.
* @returns {{sentence: string, transitionWords: string[]}[]} Array of sentence objects containing the complete sentence and the transition words.
*/
const checkSentencesForTransitionWords = function( sentences, transitionWords, twoPartTransitionWords, matchTransitionWordsHelper ) {
const results = [];

sentences.forEach( sentence => {
if ( twoPartTransitionWords ) {
const twoPartMatches = matchTwoPartTransitionWords( sentence.text, twoPartTransitionWords );
const twoPartMatches = matchTwoPartTransitionWords( sentence, twoPartTransitionWords );

if ( twoPartMatches !== null ) {
if ( twoPartMatches.length !== 0 ) {
results.push( {
sentence: sentence.text,
transitionWords: twoPartMatches,
Expand All @@ -93,7 +65,7 @@ const checkSentencesForTransitionWords = function( sentences, transitionWords, t
}

const transitionWordMatches = matchTransitionWordsHelper
? matchTransitionWordsHelper( sentence, transitionWords )
? matchTransitionWordsHelper( sentence.text, transitionWords )
: matchTransitionWords( sentence, transitionWords );

if ( transitionWordMatches.length !== 0 ) {
Expand All @@ -109,13 +81,13 @@ const checkSentencesForTransitionWords = function( sentences, transitionWords, t

/**
* Checks how many sentences from a text contain at least one transition word or two-part transition word
* that are defined in the transition words config and two part transition words config.
* that are defined in the transition words config and two-part transition words config.
*
* @param {Paper} paper The Paper object to get text from.
* @param {Researcher} researcher The researcher.
*
* @returns {object} An object with the total number of sentences in the text
* and the total number of sentences containing one or more transition words.
* @returns {{totalSentences: number, sentenceResults: {sentence: string, transitionWords: string[]}[], transitionWordSentences: number}} An object
* with the total number of sentences in the text and the total number of sentences containing one or more transition words.
*/
export default function( paper, researcher ) {
const matchTransitionWordsHelper = researcher.getHelper( "matchTransitionWordsHelper" );
Expand Down

0 comments on commit 46330fd

Please sign in to comment.