Skip to content

Commit

Permalink
Make the tuple length in com_finder configurable
Browse files Browse the repository at this point in the history
  • Loading branch information
Hackwar committed Jun 3, 2018
1 parent 3a13f8f commit c26f898
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 64 deletions.
11 changes: 11 additions & 0 deletions administrator/components/com_finder/config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,17 @@
description="COM_FINDER_FIELDSET_INDEX_OPTIONS_DESCRIPTION"
>

<field
name="tuplecount"
type="integer"
label="COM_FINDER_CONFIG_TUPLECOUNT_LABEL"
default="1"
validate="options"
first="1"
last="10"
step="1"
/>

<field
name="batch_size"
type="list"
Expand Down
47 changes: 24 additions & 23 deletions administrator/components/com_finder/helpers/indexer/helper.php
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public static function parse($input, $format = 'html')
*/
public static function tokenize($input, $lang, $phrase = false)
{
static $cache;
static $cache, $tuplecount;
$store = StringHelper::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

// Check if the string has been tokenized already.
Expand All @@ -81,6 +81,12 @@ public static function tokenize($input, $lang, $phrase = false)
return $cache[$store];
}

if (!$tuplecount)
{
$params = ComponentHelper::getParams('com_finder');
$tuplecount = $params->get('tuplecount', 1);
}

$tokens = array();
$quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

Expand Down Expand Up @@ -168,33 +174,28 @@ public static function tokenize($input, $lang, $phrase = false)
$tokens[] = new FinderIndexerToken($terms[$i], $lang);
}

// Create two and three word phrase tokens from the individual words.
for ($i = 0, $n = count($tokens); $i < $n; $i++)
// Create multi-word phrase tokens from the individual words.
if ($tuplecount > 1)
{
// Setup the phrase positions.
$i2 = $i + 1;
$i3 = $i + 2;

// Create the two word phrase.
if ($i2 < $n && isset($tokens[$i2]))
for ($i = 0, $n = count($tokens); $i < $n; $i++)
{
// Tokenize the two word phrase.
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' ');
$token->derived = true;
$temp = array($tokens[$i]->term);

// Add the token to the stack.
$tokens[] = $token;
}
// Create tokens for 2 to $tuplecount length phrases
for ($j = 1; $j < $tuplecount; $j++)
{
if ($i + $j >= $n || !isset($tokens[$i + $j]))
{
break;
}

// Create the three word phrase.
if ($i3 < $n && isset($tokens[$i3]))
{
// Tokenize the three word phrase.
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' ');
$token->derived = true;
$temp[] = $tokens[$i + $j]->term;
$token = new FinderIndexerToken($temp, $lang, $lang === 'zh' ? '' : ' ');
$token->derived = true;

// Add the token to the stack.
$tokens[] = $token;
// Add the token to the stack.
$tokens[] = $token;
}
}
}
}
Expand Down
66 changes: 25 additions & 41 deletions administrator/components/com_finder/helpers/indexer/query.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

defined('_JEXEC') or die;

use Joomla\CMS\Component\ComponentHelper;
use Joomla\Registry\Registry;
use Joomla\String\StringHelper;
use Joomla\Utilities\ArrayHelper;
Expand Down Expand Up @@ -731,11 +732,12 @@ protected function processDates($date1, $date2, $when1, $when2)
protected function processString($input, $lang, $mode)
{
// Clean up the input string.
$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
$input = StringHelper::strtolower($input);
$input = preg_replace('#\s+#mi', ' ', $input);
$input = trim($input);
$debug = JFactory::getConfig()->get('debug_lang');
$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
$input = StringHelper::strtolower($input);
$input = preg_replace('#\s+#mi', ' ', $input);
$input = trim($input);
$debug = JFactory::getConfig()->get('debug_lang');
$params = ComponentHelper::getParams('com_finder');

/*
* First, we need to handle string based modifiers. String based
Expand Down Expand Up @@ -893,51 +895,33 @@ protected function processString($input, $lang, $mode)

// Get the number of words in the phrase.
$parts = explode(' ', $match);
$tuplecount = $params->get('tuplecount', 1);

// Check if the phrase is longer than the configured tuple length.
if (count($parts) > 3)
if (count($parts) > $tuplecount && $tuplecount > 1)
{
$chunk = array_slice($parts, 0, $tuplecount);
$parts = array_slice($parts, $tuplecount);

// If the chunk is not empty, add it as a phrase.
if (count($chunk))
{
$phrases[] = implode(' ', $chunk);
$terms[] = implode(' ', $chunk);
}

/*
* If the phrase is longer than three words, we need to
* If the phrase is longer than $tuplecount words, we need to
* break it down into smaller chunks of phrases that
* are less than or equal to three words. We overlap
* are less than or equal to $tuplecount words. We overlap
* the chunks so that we can ensure that a match is
* found for the complete phrase and not just portions
* of it.
*/
for ($i = 0, $c = count($parts); $i < $c; $i += 2)
for ($i = 0, $c = count($parts); $i < $c; $i += 1)
{
// Set up the chunk.
$chunk = array();

// The chunk has to be assembled based on how many
// pieces are available to use.
switch ($c - $i)
{
/*
* If only one word is left, we can break from
* the switch and loop because the last word
* was already used at the end of the last
* chunk.
*/
case 1:
break 2;

// If two words are left, we use them both as
// the last chunk of the phrase and we're done.
case 2:
$chunk[] = $parts[$i];
$chunk[] = $parts[$i + 1];
break;

// If there are three or more words left, we
// build a three word chunk and continue on.
default:
$chunk[] = $parts[$i];
$chunk[] = $parts[$i + 1];
$chunk[] = $parts[$i + 2];
break;
}
array_shift($chunk);
$chunk[] = array_shift($parts);

// If the chunk is not empty, add it as a phrase.
if (count($chunk))
Expand All @@ -949,7 +933,7 @@ protected function processString($input, $lang, $mode)
}
else
{
// The phrase is <= 3 words so we can use it as is.
// The phrase is <= $tuplecount words so we can use it as is.
$phrases[] = $match;
$terms[] = $match;
}
Expand Down
1 change: 1 addition & 0 deletions administrator/language/en-GB/en-GB.com_finder.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ COM_FINDER_CONFIG_STEMMER_FR="French Only"
COM_FINDER_CONFIG_STEMMER_LABEL="Select Language Stemmer"
COM_FINDER_CONFIG_STEMMER_PORTER_EN="English Only"
COM_FINDER_CONFIG_STEMMER_SNOWBALL="Snowball"
COM_FINDER_CONFIG_TUPLECOUNT_LABEL="Length indexed tuples"
COM_FINDER_CONFIG_TEXT_MULTIPLIER_DESCRIPTION="The multiplier is used to control how much influence matching text has on the overall relevance score of a search result. A multiplier is considered in relationship to the other multipliers. The body text comes from the summary and/or body of the content."
COM_FINDER_CONFIG_TEXT_MULTIPLIER_LABEL="Body Text Weight Multiplier"
COM_FINDER_CONFIG_TITLE_MULTIPLIER_DESCRIPTION="The multiplier is used to control how much influence matching text has on the overall relevance score of a search result. A multiplier is considered in relationship to the other multipliers. The title text comes from the title of the content."
Expand Down

0 comments on commit c26f898

Please sign in to comment.