Skip to content

Commit

Permalink
Updated search query cleaning algorithm to be closer to indexing clea…
Browse files Browse the repository at this point in the history
…ning algorithm.
parpalak committed Aug 21, 2024
1 parent 468d5e7 commit 251778d
Showing 4 changed files with 76 additions and 34 deletions.
6 changes: 3 additions & 3 deletions src/S2/Rose/Entity/Metadata/SentenceCollection.php
Original file line number Diff line number Diff line change
@@ -101,10 +101,10 @@ private function buildWordsInfo(): void
*/
public static function breakIntoWords(string $content): array
{
// Replace comma as decimal separator to dot
$content = preg_replace('/[\s()]\d+\K,(?=\d+(?:[\s()]|\.\s))/', '.', $content);
// Replace decimal separator: ',' -> '.'
$content = preg_replace('#(?:^|[\s()])-?\d+\K,(?=\d+(?:$|[\s()]|\.\s))#', '.', $content);

// We allow letters, digits and some punctuation: ".,-"
// We allow letters, digits and some punctuation: ".,-^_"
$content = str_replace(',', ', ', $content);
$content = preg_replace('#[^\\-.,0-9\\p{L}^_]+#u', ' ', $content);
$content = mb_strtolower($content);
54 changes: 26 additions & 28 deletions src/S2/Rose/Entity/Query.php
Original file line number Diff line number Diff line change
@@ -115,46 +115,44 @@ public function valueToArray()

// Normalize
$content = str_replace(['«', '»', '', '', '', ''], '"', $content);
$content = str_replace(['---', '--', '', ''], '', $content);
$content = str_replace('', '-', $content); // Replace minus sign to a hyphen
$content = str_replace(['---', '', ''], '', $content); // Normalize dashes
$content = preg_replace('#,\\s+,#u', ',,', $content);
$content = preg_replace('#[^\\-\\p{L}0-9^_.,()";?!…:—]+#iu', ' ', $content);
$content = preg_replace('#\\n+#', ' ', $content);
$content = preg_replace('#\\s+#u', ' ', $content);
$content = mb_strtolower($content);

$content = preg_replace('#(,+)#u', '\\1 ', $content);
// Replace decimal separators: ',' -> '.'
$content = preg_replace('#(?<=^|\\s)(\\-?\\d+),(\\d+)(?=\\s|$)#u', '\\1.\\2', $content);

$content = preg_replace('#[ |\\/]+#', ' ', $content);

$words = explode(' ', $content);
foreach ($words as $k => $v) {
// Separate special chars from the letter combination
if (strlen($v) > 1) {
foreach (['', '^', '(', ')', '"', ':', '?', '!'] as $specialChar) {
if (mb_substr($v, 0, 1) == $specialChar || mb_substr($v, -1) == $specialChar) {
$words[$k] = str_replace($specialChar, '', $v);
$words[] = $specialChar;
}
}
// Separate special chars at the beginning of the word
while (true) {
$content = preg_replace('#(?:^|\\s)\K([—^()"?:!])(?=[^\s])#u', '\\1 ', $content, -1, $count);
if ($count === 0) {
break;
}
}

// Separate hyphen from the letter combination
if (strlen($v) > 1 && (substr($v, 0, 1) == '-' || substr($v, -1) == '-')) {
$words[$k] = str_replace('-', '', $v);
$words[] = '-';
// Separate special chars at the end of the word
while (true) {
$content = preg_replace('#(?<=[^\s])([—^()"?:!])(?=\\s|$)#u', ' \\1', $content, -1, $count);
if ($count === 0) {
break;
}
}

// Replace 'ё' inside words
if (false !== strpos($v, 'ё') && $v != 'ё') {
$words[$k] = str_replace('ё', 'е', $v);
}
// Separate groups of commas
$content = preg_replace('#(,+)#u', ' \\1 ', $content);

// Remove ','
if (preg_match('#^[^,]+,$#u', $v) || preg_match('#^,[^,]+$#u', $v)) {
$words[$k] = str_replace(',', '', $v);
$words[] = ',';
$words = preg_split('#\\s+#', $content);
foreach ($words as $k => &$v) {
// Replace 'ё' inside words
if ($v !== 'ё' && false !== strpos($v, 'ё')) {
$v = str_replace('ё', 'е', $v);
}
}
unset($v);

$words = array_unique($words);

StringHelper::removeLongWords($words);

7 changes: 4 additions & 3 deletions src/S2/Rose/Snippet/WordsByStemsExtractor.php
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<?php
/**
* @copyright 2024 Roman Parpalak
* @license MIT
* @package Rose
* @license MIT
* @package Rose
*/

declare(strict_types=1);
@@ -85,6 +85,7 @@ public function extract(string $text): array
* check each fragment for a match with the searched stem.
*
* @param string $text
*
* @return string[]
*/
private function getWords(string $text): array
@@ -93,6 +94,6 @@ private function getWords(string $text): array
return [$text];
}

return array_merge(explode('-', $text), [$text]);
return array_merge(array_filter(explode('-', $text), static fn(string $word) => $word !== ''), [$text]);
}
}
43 changes: 43 additions & 0 deletions tests/unit/Rose/Entity/QueryTest.php
Original file line number Diff line number Diff line change
@@ -11,17 +11,60 @@

/**
 * Checks how Query::valueToArray() turns a raw search string into a list of tokens.
 *
 * @group entity
 * @group query
 */
class QueryTest extends Unit
{
    /**
     * Runs a series of raw query strings through Query::valueToArray() and compares
     * the produced token list with the expected one. Every assertion stays on its own
     * line so that a failure points directly at the offending input.
     */
    public function testFilterInput(): void
    {
        // Shortcut: build a Query from the raw string and return its token list.
        $tokenize = static fn (string $rawQuery): array => (new Query($rawQuery))->valueToArray();

        // Delimiters: pipes, backslashes, slashes and surrounding whitespace split words; dots do not
        self::assertEquals([1, 2], $tokenize('1|||2'));
        self::assertEquals([1, 2], $tokenize('1\\\\\\2'));
        self::assertEquals(['a', 'b'], $tokenize('a/b'));
        self::assertEquals(['a', 'b'], $tokenize(' a b '));
        self::assertEquals(['..'], $tokenize('..'));
        self::assertEquals(['...'], $tokenize('...'));
        self::assertEquals(['a..b'], $tokenize('a..b'));

        // Numbers: a decimal comma is turned into a dot
        self::assertEquals(['1.2'], $tokenize('1,2'));
//        self::assertEquals(['-1.2'], $tokenize('-1,2'));
        self::assertEquals(['1.2'], $tokenize('1.2'));

        // Typographic quotes become '"'; the opening and closing quote collapse into one token
        self::assertEquals(['"', 'text'], $tokenize('«text»'));
        self::assertEquals(['"', 'text'], $tokenize('“text”'));

        // Dashes
        self::assertEquals(['a--b'], $tokenize('a--b'));
        self::assertEquals(['a—b'], $tokenize('a---b')); // --- to mdash
        self::assertEquals(['a—b'], $tokenize('a–b')); // ndash to mdash
        self::assertEquals(['a-b'], $tokenize('a−b')); // Minus to hyphen

        // Line breaks and runs of whitespace act as a single separator
        self::assertEquals(['a', 'b'], $tokenize("a\n\nb"));
        self::assertEquals(['a', 'b'], $tokenize("a \t b"));

        // Special characters are detached at word boundaries only, never inside a word
        self::assertEquals(['a!b'], $tokenize('a!b'));
        self::assertEquals(['!', 'ab'], $tokenize('!ab'));
        self::assertEquals(['!', 'a!b'], $tokenize('!a!b'));
        self::assertEquals(['(', 'word', ')'], $tokenize('(word)'));
        self::assertEquals(['mysql', '--all-databases'], $tokenize('mysql --all-databases'));

        // "ё" is replaced with "е" inside words, but a standalone "ё" is kept
        self::assertEquals(['ё', 'полет', 'field'], $tokenize('ё полёт field'));

        // Commas: each group of consecutive commas becomes a token of its own
        self::assertEquals(['a', ',', 'b'], $tokenize('a,b'));
        self::assertEquals(['a', ',,', 'b'], $tokenize('a,,b'));
        self::assertEquals(['a', ',,,', 'b'], $tokenize('a,,,b'));

        // Overly long words are dropped
        self::assertEquals(['a', 'c'], $tokenize('a ' . str_repeat('b', 101) . ' c'));

        // Several rules applied to the same input
        self::assertEquals(['a—b', '"', 'text'], $tokenize('a–b «text»'));
        self::assertEquals(['a', ',', 'b'], $tokenize(" a, \n b "));
    }
}

0 comments on commit 251778d

Please sign in to comment.