-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Lucene query parameters builder for Cloud Search (#29)
- Loading branch information
Showing
2 changed files
with
287 additions
and
0 deletions.
There are no files selected for viewing
172 changes: 172 additions & 0 deletions
172
src/Aws/CloudSearch/RequestParameter/Query/Builder/LuceneQueryParametersBuilder.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
<?php | ||
declare(strict_types=1); | ||
|
||
namespace Landingi\AwsBundle\Aws\CloudSearch\RequestParameter\Query\Builder; | ||
|
||
use Landingi\AwsBundle\Aws\CloudSearch\RequestParameter\Query\QueryParameters; | ||
|
||
/**
 * Fluent builder that turns a raw search phrase into Lucene query parameters.
 *
 * Without any matching mode enabled the raw phrase is passed through with only
 * backslashes escaped. With exact and/or wildcard matching enabled the phrase is
 * split on spaces; alternatives for a single word are joined with OR, and the
 * words themselves are joined with AND.
 */
final class LuceneQueryParametersBuilder
{
    private const SEARCH_FIELD_PATTERN = '%s:';
    private const EXACT_MATCH_PATTERN = '"%s"';
    private const WILDCARD_MATCH_PATTERN = '*%s*';
    private const OPERATOR_OR = 'OR';
    private const OPERATOR_AND = 'AND';
    private const SPECIAL_CHARACTERS = [
        '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '/', '\\',
    ];

    private string $rawQuery;
    private bool $shouldUseExactMatchingForIndividualWords = false;
    private bool $shouldUseWildcardMatching = false;
    /** @var array<string> */
    private array $searchByFields = [];

    public function __construct(string $rawQuery)
    {
        $this->rawQuery = $rawQuery;
    }

    /**
     * Named constructor, equivalent to `new self($query)`.
     */
    public static function forRawQuery(string $query): self
    {
        return new self($query);
    }

    /**
     * Makes build() emit a quoted phrase ("word") for every space-separated word.
     */
    public function useExactMatchingForIndividualWords(): self
    {
        $this->shouldUseExactMatchingForIndividualWords = true;

        return $this;
    }

    /**
     * Makes build() emit a wildcard term (*word*) for every space-separated word.
     */
    public function useWildcardMatching(): self
    {
        $this->shouldUseWildcardMatching = true;

        return $this;
    }

    /**
     * Restricts exact/wildcard expressions to the given fields (replaces any previous list).
     *
     * The field list is not applied when neither matching mode is enabled.
     */
    public function searchByFields(string ...$fields): self
    {
        $this->searchByFields = $fields;

        return $this;
    }

    /**
     * Builds the Lucene query parameters for the configured matching modes.
     */
    public function build(): QueryParameters
    {
        if (false === $this->shouldUseWildcardMatching && false === $this->shouldUseExactMatchingForIndividualWords) {
            return QueryParameters::usingLucene($this->escapeString($this->rawQuery));
        }

        $queryParts = [];
        $queryTokens = $this->filterOutEmptyElements(explode(' ', $this->rawQuery));

        foreach ($queryTokens as $queryToken) {
            $queryTokenParts = [];

            if (true === $this->shouldUseExactMatchingForIndividualWords) {
                $queryTokenParts[] = $this->buildQueryForMatchPattern(
                    $this->escapeString($queryToken),
                    self::EXACT_MATCH_PATTERN
                );
            }

            if (true === $this->shouldUseWildcardMatching) {
                $queryTokenParts[] = $this->buildQueryPartWithWildcard($queryToken);
            }

            // Alternatives for one word are OR-ed; empty alternatives are dropped by the join.
            $queryParts[] = $this->joinPartsWithOperator($queryTokenParts, self::OPERATOR_OR);
        }

        return QueryParameters::usingLucene($this->joinPartsWithOperator($queryParts, self::OPERATOR_AND));
    }

    /**
     * Applies a match pattern (exact or wildcard) to a token, once per configured field.
     *
     * @param string $queryToken   token to place into the pattern
     * @param string $matchPattern sprintf pattern with a single %s placeholder
     */
    private function buildQueryForMatchPattern(string $queryToken, string $matchPattern): string
    {
        $expression = sprintf($matchPattern, $queryToken);

        if ([] === $this->searchByFields) {
            return $expression;
        }

        return $this->buildExpressionForEachSearchField($expression);
    }

    /**
     * Prefixes an already built expression with every configured field and ORs the results.
     *
     * The expression is concatenated as a plain sprintf argument and never used as a format
     * string, so a "%" coming from user input cannot be interpreted as a conversion specifier.
     */
    private function buildExpressionForEachSearchField(string $expression): string
    {
        $queryTokenParts = array_map(
            static fn (string $searchField): string => sprintf(
                '%s %s',
                sprintf(self::SEARCH_FIELD_PATTERN, $searchField),
                $expression
            ),
            $this->searchByFields
        );

        return $this->joinPartsWithOperator($queryTokenParts, self::OPERATOR_OR);
    }

    /**
     * Joins non-empty parts with the given operator, wrapping the result in parentheses
     * only when more than one part remains. Any operator other than OR is treated as AND.
     *
     * @param array<string> $queryParts
     */
    private function joinPartsWithOperator(array $queryParts, string $operator): string
    {
        $queryParts = $this->filterOutEmptyElements($queryParts);
        $glue = self::OPERATOR_OR === $operator
            ? ' ' . self::OPERATOR_OR . ' '
            : ' ' . self::OPERATOR_AND . ' ';
        $joined = implode($glue, $queryParts);

        return count($queryParts) > 1 ? sprintf('(%s)', $joined) : $joined;
    }

    /**
     * Builds the wildcard expression for a single space-free token.
     *
     * Returns an empty string when the token consists solely of special characters.
     */
    private function buildQueryPartWithWildcard(string $queryToken): string
    {
        $hasAnyOfTheSpecialCharacters = strpbrk($queryToken, implode('', self::SPECIAL_CHARACTERS));

        if (false === $hasAnyOfTheSpecialCharacters) {
            return $this->buildQueryForMatchPattern($queryToken, self::WILDCARD_MATCH_PATTERN);
        }

        // Wildcard query is not tokenized on CloudSearch side, so we need to do it here to get accurate results.
        // Currently there is no spaces in $queryToken, so only special characters must be used to tokenize the query.
        // More details about this case:
        // https://jaygurnaniblog.wordpress.com/2017/05/03/lucene-parsing-engine-in-aws-with-special-characters/
        $wildcardQueryTokens = $this->filterOutEmptyElements(
            explode(' ', str_replace(self::SPECIAL_CHARACTERS, ' ', $queryToken))
        );

        if ([] === $wildcardQueryTokens) {
            return '';
        }

        $wildcardQuery = implode(
            ' ' . self::OPERATOR_AND . ' ',
            array_map(
                static fn (string $wildcardQueryToken): string => sprintf(
                    self::WILDCARD_MATCH_PATTERN,
                    $wildcardQueryToken
                ),
                $wildcardQueryTokens
            )
        );

        if ([] === $this->searchByFields) {
            return $wildcardQuery;
        }

        // Group the sub-tokens so the field prefix applies to all of them.
        return $this->buildExpressionForEachSearchField(sprintf('(%s)', $wildcardQuery));
    }

    /**
     * Escapes backslashes by doubling them; no other character is touched.
     *
     * NOTE(review): a double quote inside a token is not escaped, so an exact-match phrase built
     * from such a token contains an unbalanced quote - confirm whether that is acceptable.
     */
    private function escapeString(string $queryToken): string
    {
        // Backslash is more special character...
        return str_replace('\\', '\\\\', $queryToken);
    }

    /**
     * Removes elements that are empty or whitespace-only. Original array keys are preserved.
     *
     * @param array<string> $elements
     *
     * @return array<string>
     */
    private function filterOutEmptyElements(array $elements): array
    {
        return array_filter($elements, static fn (string $queryToken): bool => '' !== trim($queryToken));
    }
}
115 changes: 115 additions & 0 deletions
115
tests/Aws/CloudSearch/RequestParameter/Query/Builder/LuceneQueryParametersBuilderTest.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Landingi\AwsBundle\Aws\CloudSearch\RequestParameter\Query\Builder; | ||
|
||
use PHPUnit\Framework\TestCase; | ||
|
||
/**
 * Covers the query strings produced by LuceneQueryParametersBuilder for every
 * combination of exact matching, wildcard matching and field restriction.
 */
final class LuceneQueryParametersBuilderTest extends TestCase
{
    public function testItBuildsSimpleQuery(): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery('test query');

        self::assertSame('test query', $builder->build()->getQuery());
    }

    public function testItBuildsWithExactMatch(): void
    {
        // The doubled and trailing spaces verify that empty tokens are discarded.
        $builder = LuceneQueryParametersBuilder::forRawQuery('test query ')
            ->useExactMatchingForIndividualWords();

        self::assertSame('("test" AND "query")', $builder->build()->getQuery());
    }

    public function testItBuildsWithExactMatchAndFields(): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery('test query')
            ->useExactMatchingForIndividualWords()
            ->searchByFields('field1', 'field2');

        self::assertSame(
            '((field1: "test" OR field2: "test") AND (field1: "query" OR field2: "query"))',
            $builder->build()->getQuery()
        );
    }

    /**
     * @dataProvider getQueryForWildcardSearch
     */
    public function testItBuildsWithWildcardMatch(string $searchQuery, string $expectedQuery): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery($searchQuery)
            ->useWildcardMatching();

        self::assertSame($expectedQuery, $builder->build()->getQuery());
    }

    /**
     * Static so the provider keeps working on PHPUnit versions that reject non-static providers.
     *
     * @return iterable<string, array{string, string}>
     */
    public static function getQueryForWildcardSearch(): iterable
    {
        yield 'single word' => ['test', '*test*'];
        yield 'two words' => ['test query', '(*test* AND *query*)'];
        yield 'url with special characters' => [
            'https://landingi.example/test?query-string',
            '*https* AND *landingi.example* AND *test* AND *query* AND *string*',
        ];
        yield 'mostly special characters' => [
            '!~@#^$ \\',
            '*@#* AND *$*',
        ];
    }

    /**
     * @dataProvider getQueryForWildcardSearchWithFields
     */
    public function testItBuildsWithWildcardMatchAndFields(string $searchQuery, string $expectedQuery): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery($searchQuery)
            ->useWildcardMatching()
            ->searchByFields('field1', 'field2');

        self::assertSame($expectedQuery, $builder->build()->getQuery());
    }

    /**
     * @return iterable<string, array{string, string}>
     */
    public static function getQueryForWildcardSearchWithFields(): iterable
    {
        yield 'single word' => ['test', '(field1: *test* OR field2: *test*)'];
        yield 'two words' => [
            'test query',
            '((field1: *test* OR field2: *test*) AND (field1: *query* OR field2: *query*))',
        ];
        yield 'url with special characters' => [
            'https://landingi.example/test?query-string',
            '(field1: (*https* AND *landingi.example* AND *test* AND *query* AND *string*) OR field2: (*https* AND *landingi.example* AND *test* AND *query* AND *string*))',
        ];
        yield 'mostly special characters' => [
            '!~@#^$ \\',
            '(field1: (*@#* AND *$*) OR field2: (*@#* AND *$*))',
        ];
    }

    /**
     * @dataProvider getQueryForWildcardAndExactSearchWithFields
     */
    public function testItBuildsWithWildcardAndExactMatchAndOneField(string $searchQuery, string $expectedQuery): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery($searchQuery)
            ->useWildcardMatching()
            ->useExactMatchingForIndividualWords()
            ->searchByFields('field1');

        self::assertSame($expectedQuery, $builder->build()->getQuery());
    }

    /**
     * @return iterable<string, array{string, string}>
     */
    public static function getQueryForWildcardAndExactSearchWithFields(): iterable
    {
        yield 'single word' => ['test', '(field1: "test" OR field1: *test*)'];
        yield 'two words' => [
            'test query',
            '((field1: "test" OR field1: *test*) AND (field1: "query" OR field1: *query*))',
        ];
        yield 'url with special characters' => [
            'https://landingi.example/test?query-string',
            '(field1: "https://landingi.example/test?query-string" OR field1: (*https* AND *landingi.example* AND *test* AND *query* AND *string*))',
        ];
        yield 'mostly special characters' => [
            '!~@#^$ \\',
            '((field1: "!~@#^$" OR field1: (*@#* AND *$*)) AND field1: "\\\")',
        ];
    }
}