Skip to content

Commit

Permalink
Add Lucene query parameters builder for Cloud Search (#29)
Browse files Browse the repository at this point in the history
  • Loading branch information
maureis authored Aug 17, 2022
1 parent f099c59 commit b27e93d
Show file tree
Hide file tree
Showing 2 changed files with 287 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
<?php
declare(strict_types=1);

namespace Landingi\AwsBundle\Aws\CloudSearch\RequestParameter\Query\Builder;

use Landingi\AwsBundle\Aws\CloudSearch\RequestParameter\Query\QueryParameters;

/**
 * Fluent builder that converts a raw search phrase into Lucene query parameters for CloudSearch.
 *
 * Without any matching mode enabled the raw phrase is passed through with only backslashes escaped.
 * With exact and/or wildcard matching enabled the phrase is split on spaces; every word yields an
 * OR-group of its exact/wildcard variants (optionally repeated per search field) and the groups of
 * all words are joined with AND.
 */
final class LuceneQueryParametersBuilder
{
    private const SEARCH_FIELD_PATTERN = '%s:';
    private const EXACT_MATCH_PATTERN = '"%s"';
    private const WILDCARD_MATCH_PATTERN = '*%s*';
    private const OPERATOR_OR = 'OR';
    private const OPERATOR_AND = 'AND';
    private const SPECIAL_CHARACTERS = [
        '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '/', '\\'
    ];

    private string $rawQuery;
    private bool $shouldUseExactMatchingForIndividualWords = false;
    private bool $shouldUseWildcardMatching = false;
    /** @var array<string> */
    private array $searchByFields = [];

    /**
     * @param string $rawQuery search phrase exactly as it was typed in
     */
    public function __construct(string $rawQuery)
    {
        $this->rawQuery = $rawQuery;
    }

    /**
     * Named constructor, equivalent to `new self($query)`.
     */
    public static function forRawQuery(string $query): self
    {
        return new self($query);
    }

    /**
     * Makes every word of the phrase additionally match as a quoted (exact) term.
     */
    public function useExactMatchingForIndividualWords(): self
    {
        $this->shouldUseExactMatchingForIndividualWords = true;

        return $this;
    }

    /**
     * Makes every word of the phrase additionally match as a `*word*` wildcard term.
     */
    public function useWildcardMatching(): self
    {
        $this->shouldUseWildcardMatching = true;

        return $this;
    }

    /**
     * Restricts exact/wildcard terms to the given fields; every term is repeated once per field.
     * Calling it again replaces the previously configured fields.
     */
    public function searchByFields(string ...$fields): self
    {
        $this->searchByFields = $fields;

        return $this;
    }

    /**
     * Builds the query parameters according to the configured matching modes.
     */
    public function build(): QueryParameters
    {
        if (false === $this->shouldUseWildcardMatching && false === $this->shouldUseExactMatchingForIndividualWords) {
            return QueryParameters::usingLucene($this->escapeString($this->rawQuery));
        }

        $queryParts = [];
        $queryTokens = $this->filterOutEmptyElements(explode(' ', $this->rawQuery));

        foreach ($queryTokens as $queryToken) {
            $queryTokenParts = [];

            if (true === $this->shouldUseExactMatchingForIndividualWords) {
                $queryTokenParts[] = $this->buildExactMatchQueryPart($queryToken);
            }

            if (true === $this->shouldUseWildcardMatching) {
                $queryTokenParts[] = $this->buildQueryPartWithWildcard($queryToken);
            }

            // Variants of a single word are alternatives (OR); distinct words must all match (AND).
            $queryParts[] = $this->joinPartsWithOperator($queryTokenParts, self::OPERATOR_OR);
        }

        return QueryParameters::usingLucene($this->joinPartsWithOperator($queryParts, self::OPERATOR_AND));
    }

    /**
     * Builds the quoted (exact) variant of a single word.
     *
     * Backslashes are escaped first and double quotes afterwards, so a quote inside the word
     * cannot terminate the surrounding phrase early.
     */
    private function buildExactMatchQueryPart(string $queryToken): string
    {
        $escapedQueryToken = str_replace('"', '\\"', $this->escapeString($queryToken));

        return $this->buildQueryForMatchPattern($escapedQueryToken, self::EXACT_MATCH_PATTERN);
    }

    /**
     * Applies a match pattern to a token, repeating it for every configured search field.
     *
     * @param string $queryToken   token inserted into the pattern (always passed as an sprintf argument)
     * @param string $matchPattern one of the *_MATCH_PATTERN constants
     */
    private function buildQueryForMatchPattern(string $queryToken, string $matchPattern): string
    {
        if ([] === $this->searchByFields) {
            return sprintf($matchPattern, $queryToken);
        }

        // Both operands are class constants, so the combined string is a safe sprintf format.
        $matchPatternWithField = sprintf(
            '%s %s',
            self::SEARCH_FIELD_PATTERN,
            $matchPattern
        );
        $queryTokenParts = array_map(
            static fn (string $searchField): string => sprintf($matchPatternWithField, $searchField, $queryToken),
            $this->searchByFields
        );

        return $this->joinPartsWithOperator($queryTokenParts, self::OPERATOR_OR);
    }

    /**
     * Joins the non-empty parts with the given operator; parentheses are added only for 2+ parts.
     *
     * @param array<string> $queryParts
     * @param string        $operator   self::OPERATOR_OR selects OR, anything else selects AND
     */
    private function joinPartsWithOperator(array $queryParts, string $operator): string
    {
        $queryParts = $this->filterOutEmptyElements($queryParts);
        $glue = self::OPERATOR_OR === $operator ? ' OR ' : ' AND ';

        return sprintf(
            count($queryParts) > 1 ? '(%s)' : '%s',
            implode($glue, $queryParts)
        );
    }

    /**
     * Builds the wildcard variant of a single word.
     *
     * A word without Lucene special characters becomes one `*word*` term. Otherwise the word is split
     * on the special characters and the pieces are AND-ed together; an empty string is returned when
     * nothing but special characters was given.
     */
    private function buildQueryPartWithWildcard(string $queryToken): string
    {
        // Detect special characters with the very same replacement that performs the tokenisation,
        // so detection and splitting can never disagree (e.g. on a single "&" or "|").
        $tokenWithoutSpecialCharacters = str_replace(self::SPECIAL_CHARACTERS, ' ', $queryToken);

        if ($tokenWithoutSpecialCharacters === $queryToken) {
            return $this->buildQueryForMatchPattern($queryToken, self::WILDCARD_MATCH_PATTERN);
        }

        // Wildcard query is not tokenized on CloudSearch side, so we need to do it here to get accurate results.
        // Currently there is no spaces in $queryToken, so only special characters must be used to tokenize the query.
        // More details about this case:
        // https://jaygurnaniblog.wordpress.com/2017/05/03/lucene-parsing-engine-in-aws-with-special-characters/
        $wildcardQueryTokens = $this->filterOutEmptyElements(explode(' ', $tokenWithoutSpecialCharacters));

        if ([] === $wildcardQueryTokens) {
            return '';
        }

        $wildcardQuery = implode(
            ' AND ',
            array_map(
                static fn (string $wildcardQueryToken): string => sprintf(
                    self::WILDCARD_MATCH_PATTERN,
                    $wildcardQueryToken
                ),
                $wildcardQueryTokens
            )
        );

        if ([] === $this->searchByFields) {
            // NOTE(review): the AND-group is returned without parentheses here; when it is OR-ed with an
            // exact-match part the outcome relies on the parser's operator precedence - confirm.
            return $wildcardQuery;
        }

        // The wildcard query contains user input, so it must only ever be an sprintf *argument*:
        // a "%" inside it would otherwise be interpreted as a conversion specification.
        $queryTokenParts = array_map(
            static fn (string $searchField): string => sprintf(
                '%s (%s)',
                sprintf(self::SEARCH_FIELD_PATTERN, $searchField),
                $wildcardQuery
            ),
            $this->searchByFields
        );

        return $this->joinPartsWithOperator($queryTokenParts, self::OPERATOR_OR);
    }

    /**
     * Escapes backslashes so they are treated literally instead of starting an escape sequence.
     */
    private function escapeString(string $queryToken): string
    {
        // Backslash is more special character...
        return str_replace('\\', '\\\\', $queryToken);
    }

    /**
     * Drops elements that are empty or whitespace-only; original keys are preserved.
     *
     * @param array<string> $elements
     *
     * @return array<string>
     */
    private function filterOutEmptyElements(array $elements): array
    {
        return array_filter($elements, static fn (string $queryToken): bool => '' !== trim($queryToken));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
<?php

declare(strict_types=1);

namespace Landingi\AwsBundle\Aws\CloudSearch\RequestParameter\Query\Builder;

use PHPUnit\Framework\TestCase;

/**
 * Covers the query strings produced by LuceneQueryParametersBuilder for every matching mode,
 * with and without a list of search fields.
 */
final class LuceneQueryParametersBuilderTest extends TestCase
{
    public function testItBuildsSimpleQuery(): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery('test query');

        self::assertSame('test query', $builder->build()->getQuery());
    }

    public function testItBuildsWithExactMatch(): void
    {
        // The trailing space checks that empty tokens are dropped.
        $builder = LuceneQueryParametersBuilder::forRawQuery('test query ')
            ->useExactMatchingForIndividualWords();

        self::assertSame('("test" AND "query")', $builder->build()->getQuery());
    }

    public function testItBuildsWithExactMatchAndFields(): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery('test query')
            ->useExactMatchingForIndividualWords()
            ->searchByFields('field1', 'field2');

        self::assertSame(
            '((field1: "test" OR field2: "test") AND (field1: "query" OR field2: "query"))',
            $builder->build()->getQuery()
        );
    }

    /**
     * @dataProvider getQueryForWildcardSearch
     */
    public function testItBuildsWithWildcardMatch(string $searchQuery, string $expectedQuery): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery($searchQuery)
            ->useWildcardMatching();

        self::assertSame($expectedQuery, $builder->build()->getQuery());
    }

    /**
     * Static, because non-static data providers are deprecated by newer PHPUnit versions.
     *
     * @return iterable<string, array{string, string}> search query and the expected Lucene query
     */
    public static function getQueryForWildcardSearch(): iterable
    {
        yield 'single word' => ['test', '*test*'];
        yield 'two words' => ['test query', '(*test* AND *query*)'];
        yield 'url with special characters' => [
            'https://landingi.example/test?query-string',
            '*https* AND *landingi.example* AND *test* AND *query* AND *string*',
        ];
        yield 'mostly special characters' => [
            '!~@#^$ \\',
            '*@#* AND *$*',
        ];
    }

    /**
     * @dataProvider getQueryForWildcardSearchWithFields
     */
    public function testItBuildsWithWildcardMatchAndFields(string $searchQuery, string $expectedQuery): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery($searchQuery)
            ->useWildcardMatching()
            ->searchByFields('field1', 'field2');

        self::assertSame($expectedQuery, $builder->build()->getQuery());
    }

    /**
     * @return iterable<string, array{string, string}> search query and the expected Lucene query
     */
    public static function getQueryForWildcardSearchWithFields(): iterable
    {
        yield 'single word' => ['test', '(field1: *test* OR field2: *test*)'];
        yield 'two words' => [
            'test query',
            '((field1: *test* OR field2: *test*) AND (field1: *query* OR field2: *query*))',
        ];
        yield 'url with special characters' => [
            'https://landingi.example/test?query-string',
            '(field1: (*https* AND *landingi.example* AND *test* AND *query* AND *string*) OR field2: (*https* AND *landingi.example* AND *test* AND *query* AND *string*))',
        ];
        yield 'mostly special characters' => [
            '!~@#^$ \\',
            '(field1: (*@#* AND *$*) OR field2: (*@#* AND *$*))',
        ];
    }

    /**
     * @dataProvider getQueryForWildcardAndExactSearchWithFields
     */
    public function testItBuildsWithWildcardAndExactMatchAndOneField(string $searchQuery, string $expectedQuery): void
    {
        $builder = LuceneQueryParametersBuilder::forRawQuery($searchQuery)
            ->useWildcardMatching()
            ->useExactMatchingForIndividualWords()
            ->searchByFields('field1');

        self::assertSame($expectedQuery, $builder->build()->getQuery());
    }

    /**
     * @return iterable<string, array{string, string}> search query and the expected Lucene query
     */
    public static function getQueryForWildcardAndExactSearchWithFields(): iterable
    {
        yield 'single word' => ['test', '(field1: "test" OR field1: *test*)'];
        yield 'two words' => [
            'test query',
            '((field1: "test" OR field1: *test*) AND (field1: "query" OR field1: *query*))',
        ];
        yield 'url with special characters' => [
            'https://landingi.example/test?query-string',
            '(field1: "https://landingi.example/test?query-string" OR field1: (*https* AND *landingi.example* AND *test* AND *query* AND *string*))',
        ];
        yield 'mostly special characters' => [
            '!~@#^$ \\',
            '((field1: "!~@#^$" OR field1: (*@#* AND *$*)) AND field1: "\\\")',
        ];
    }
}

0 comments on commit b27e93d

Please sign in to comment.