diff --git a/bin/stem b/bin/stem index 290c444..049cb1b 100755 --- a/bin/stem +++ b/bin/stem @@ -13,16 +13,16 @@ if (PHP_SAPI !== 'cli') { require __DIR__ . '/../vendor/autoload.php'; if (count($argv) < 3) { - die('Usage: ' . $argv[0] . ' ' . PHP_EOL); + die('Usage: ' . $argv[0] . ' ' . PHP_EOL); } -$kind = $argv[1]; +$language = $argv[1]; $argument = $argv[2]; -$stemmer = match ($kind) { +$stemmer = match ($language) { 'russian' => new \S2\Rose\Stemmer\PorterStemmerRussian(), 'english' => new \S2\Rose\Stemmer\PorterStemmerEnglish(), - default => throw new \Exception('Unknown stemmer kind: ' . $kind), + default => throw new \Exception('Unknown stemmer language: ' . $language), }; echo $stemmer->stemWord($argument), PHP_EOL; diff --git a/src/S2/Rose/Stemmer/PorterStemmerEnglish.php b/src/S2/Rose/Stemmer/PorterStemmerEnglish.php index 4ba962b..e44521f 100644 --- a/src/S2/Rose/Stemmer/PorterStemmerEnglish.php +++ b/src/S2/Rose/Stemmer/PorterStemmerEnglish.php @@ -10,7 +10,7 @@ */ class PorterStemmerEnglish extends AbstractStemmer implements StemmerInterface { - private const SUPPORTS_REGEX = '#^[a-zA-Z\-0-9\'’]*$#Su'; + private const SUPPORTS_REGEX = '#(?:^|-|\d)[a-zA-Z\'’]+$#Su'; protected static array $irregularWords = [ 'skis' => 'ski', diff --git a/src/S2/Rose/Stemmer/PorterStemmerRussian.php b/src/S2/Rose/Stemmer/PorterStemmerRussian.php index ffed9c8..580fcca 100644 --- a/src/S2/Rose/Stemmer/PorterStemmerRussian.php +++ b/src/S2/Rose/Stemmer/PorterStemmerRussian.php @@ -7,7 +7,7 @@ */ class PorterStemmerRussian extends AbstractStemmer implements StemmerInterface { - const SUPPORTS_REGEX = '#^[а-яА-ЯёЁ\-0-9]*$#Su'; + const SUPPORTS_REGEX = '#(?:^|-|\d)[а-яА-ЯёЁ]+$#Su'; const VOWEL = '/аеиоуыэюя/Su'; const PERFECTIVEGROUND = '/((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$/Su'; @@ -296,6 +296,9 @@ class PorterStemmerRussian extends AbstractStemmer implements StemmerInterface 'модема' => 'модем', 'модему' => 'модем', + 'токен' => '', + 'токена' => 'токен', + 'ищу' => 'иска', 'ищешь' => 'иска', 'ищет' => 'иска',