From 01f0c32cba6d3684e64637068de458b1bb3427a1 Mon Sep 17 00:00:00 2001
From: fomichev
Date: Fri, 17 Apr 2026 16:43:38 +0300
Subject: [PATCH] =?utf8?q?feat(ERP-292):=20SimilarityMatcher=20=E2=80=94?=
 =?utf8?q?=20TF-IDF=20cosine=20similarity?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6
---
 erp24/services/automark/SimilarityMatcher.php | 119 ++++++++++++++++++++++++
 .../automark/SimilarityMatcherTest.php        |  74 +++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 erp24/services/automark/SimilarityMatcher.php
 create mode 100644 erp24/tests/unit/services/automark/SimilarityMatcherTest.php

diff --git a/erp24/services/automark/SimilarityMatcher.php b/erp24/services/automark/SimilarityMatcher.php
new file mode 100644
index 00000000..4de5e2c5
--- /dev/null
+++ b/erp24/services/automark/SimilarityMatcher.php
@@ -0,0 +1,119 @@
+..., 'category'=>..., 'species'=>..., ...]
+     */
+    public function findBestMatch(string $name, array $corpus): ?ParseResult
+    {
+        if ($corpus === []) {
+            return null;
+        }
+
+        $queryTokens = self::tokenize($name);
+        if ($queryTokens === []) {
+            return null;
+        }
+
+        // Start at 0.0 and compare with a strict ">" so that only entries sharing
+        // at least one token with the query are eligible; ties keep the earliest.
+        $bestScore = 0.0;
+        $bestItem = null;
+
+        foreach ($corpus as $item) {
+            $corpusTokens = self::tokenize($item['name'] ?? '');
+            $score = $this->cosineSimilarity($queryTokens, $corpusTokens);
+            if ($score > $bestScore) {
+                $bestScore = $score;
+                $bestItem = $item;
+            }
+        }
+
+        if ($bestItem === null) {
+            return null;
+        }
+
+        return new ParseResult(
+            category: $bestItem['category'] ?? null,
+            subcategory: $bestItem['subcategory'] ?? null,
+            species: $bestItem['species'] ?? null,
+            sort: $bestItem['sort'] ?? null,
+            type: $bestItem['type'] ?? null,
+            size: isset($bestItem['size']) ? (int) $bestItem['size'] : null,
+            color: $bestItem['color'] ?? null,
+            confidence: round($bestScore, 4),
+            method: 'similarity',
+        );
+    }
+
+    /**
+     * Tokenizes a product name for similarity comparison.
+     *
+     * Lowercases the text, drops size markers such as "50см" or "70 cm", keeps
+     * runs of two or more Cyrillic/Latin letters and removes stop words.
+     *
+     * @return string[] tokens in order of appearance, duplicates preserved
+     */
+    public static function tokenize(string $text): array
+    {
+        $text = mb_strtolower($text);
+
+        // The lookahead stops the unit from matching the start of a longer word:
+        // without it "5 смородина" would be cut down to "ородина".
+        $text = preg_replace('/\d+\s*(?:см|cm)(?!\p{L})/iu', '', $text);
+        if ($text === null) {
+            // preg_replace() returns null when the regex engine reports an error.
+            return [];
+        }
+
+        preg_match_all('/[а-яёa-z]{2,}/u', $text, $matches);
+        $tokens = $matches[0] ?? [];
+
+        return array_values(array_filter(
+            $tokens,
+            static fn (string $token): bool => !in_array($token, self::STOP_WORDS, true),
+        ));
+    }
+
+    /**
+     * Cosine similarity of two token lists, weighting each token by its raw
+     * occurrence count (no IDF factor is applied).
+     *
+     * @param string[] $a
+     * @param string[] $b
+     *
+     * @return float 0.0 when either list is empty or no token is shared
+     */
+    private function cosineSimilarity(array $a, array $b): float
+    {
+        // An empty list has zero magnitude; bail out before dividing by it.
+        if ($a === [] || $b === []) {
+            return 0.0;
+        }
+
+        $vecA = array_count_values($a);
+        $vecB = array_count_values($b);
+
+        // Tokens missing from either side contribute nothing to the dot product.
+        $dotProduct = 0.0;
+        foreach (array_intersect_key($vecA, $vecB) as $token => $countA) {
+            $dotProduct += $countA * $vecB[$token];
+        }
+
+        $square = static fn (int $count): int => $count * $count;
+        $normA = array_sum(array_map($square, $vecA));
+        $normB = array_sum(array_map($square, $vecB));
+
+        return $dotProduct / (sqrt($normA) * sqrt($normB));
+    }
+}
diff --git a/erp24/tests/unit/services/automark/SimilarityMatcherTest.php b/erp24/tests/unit/services/automark/SimilarityMatcherTest.php
new file mode 100644
index 00000000..be26d37d
--- /dev/null
+++ b/erp24/tests/unit/services/automark/SimilarityMatcherTest.php
@@ -0,0 +1,74 @@
+matcher = new SimilarityMatcher();
+    }
+
+    private function makeCorpus(): array
+    {
+        return [
+            ['name' => 'Роза красная 50см Premium', 'category' => 'Срезы', 'subcategory' => null, 'species' => 'Роза', 'sort' => 'Premium', 'type' => null, 'size' => 50, 'color' => 'Красная'],
+            ['name' => 'Роза белая 60см Экстра', 'category' => 'Срезы', 'subcategory' => null, 'species' => 'Роза', 'sort' => 'Экстра', 'type' => null, 'size' => 60, 'color' => 'Белая'],
+            ['name' => 'Хризантема белая 70 см', 'category' => 'Срезы', 'subcategory' => null, 'species' => 'Хризантема', 'sort' => null, 'type' => null, 'size' => 70, 'color' => 'Белая'],
+        ];
+    }
+
+    public function testFindsBestMatchForSimilarName(): void
+    {
+        $result = $this->matcher->findBestMatch('Роза красная 50 Premium', $this->makeCorpus());
+
+        $this->assertInstanceOf(ParseResult::class, $result);
+        $this->assertSame('Срезы', $result->category);
+        $this->assertSame('Роза', $result->species);
+        $this->assertSame('similarity', $result->method);
+        $this->assertGreaterThan(0.0, $result->confidence);
+    }
+
+    public function testReturnsNullForEmptyCorpus(): void
+    {
+        $result = $this->matcher->findBestMatch('Роза красная', []);
+
+        $this->assertNull($result);
+    }
+
+    public function testConfidenceIsHigherForCloseMatch(): void
+    {
+        $corpus = $this->makeCorpus();
+        $closeMatch = $this->matcher->findBestMatch('Роза красная 50 Premium', $corpus);
+        $farMatch = $this->matcher->findBestMatch('Нечто совсем другое', $corpus);
+
+        $this->assertGreaterThan($farMatch?->confidence ?? 0.0, $closeMatch->confidence);
+    }
+
+    public function testTokenizePublicMethod(): void
+    {
+        $tokens = SimilarityMatcher::tokenize('Роза Premium 50см');
+
+        $this->assertContains('роза', $tokens);
+        $this->assertContains('premium', $tokens);
+    }
+
+    public function testTokenizeKeepsWordsStartingWithUnitLetters(): void
+    {
+        $tokens = SimilarityMatcher::tokenize('Ветка 5 смородина 40см');
+
+        $this->assertContains('смородина', $tokens);
+        $this->assertNotContains('ородина', $tokens);
+    }
+}
-- 
2.39.5