From: Vladimir Fomichev Date: Mon, 29 Sep 2025 15:18:45 +0000 (+0300) Subject: исправление парсера X-Git-Url: https://gitweb.erp-flowers.ru/?a=commitdiff_plain;h=da80d327962d201520f7618c9a2c53ace0428d12;p=erp24_rep%2Fyii-erp24%2F.git исправление парсера --- diff --git a/erp24/services/ProductParserService.php b/erp24/services/ProductParserService.php index 29bd650b..6bb84976 100644 --- a/erp24/services/ProductParserService.php +++ b/erp24/services/ProductParserService.php @@ -7,7 +7,6 @@ use DOMXPath; class ProductParserService { public function parseProductHtml(string $html): array { - $dom = new DOMDocument(); libxml_use_internal_errors(true); $dom->loadHTML($html); @@ -15,14 +14,23 @@ class ProductParserService { $xpath = new DOMXPath($dom); + $images = $this->extractImageUrls($xpath); + $ogImage = $this->extractOgImage($xpath); + $mainImage = $images[0] ?? $ogImage ?? ''; + + + if ($mainImage && (empty($images) || $images[0] !== $mainImage)) { + array_unshift($images, $mainImage); + $images = array_values(array_unique($images)); + } return [ - 'name' => $this->extractName($xpath), - 'image_url' => ($this->extractImageUrls($xpath))[0], + 'name' => $this->extractName($xpath) ?: $this->extractOgTitle($xpath), // мягкий фолбэк + 'image_url' => $mainImage, 'description' => $this->extractDescription($xpath), - 'properties' => $this->extractProperties($xpath), - 'video_url' => $this->extractVideoUrl($xpath), - 'image_urls' => $this->extractImageUrls($xpath), + 'properties' => $this->extractProperties($xpath), + 'video_url' => $this->extractVideoUrl($xpath), + 'image_urls' => $images, ]; } @@ -44,28 +52,68 @@ class ProductParserService { } return $src; } + private function extractOgImage(DOMXPath $xpath): ?string + { + $node = $xpath->query("//meta[@property='og:image' or @name='og:image']/@content")->item(0); + return $node ? trim($node->nodeValue) : null; + } - private function extractImageUrls(\DOMXPath $xpath): array + private function extractOgTitle(DOMXPath $xpath): string + { + $node = $xpath->query("//meta[@property='og:title' or @name='og:title']/@content")->item(0); + return $node ? trim($node->nodeValue) : ''; + } + private function extractImageUrls(DOMXPath $xpath): array { $urls = []; - $queryImages = "( - //div[starts-with(@id,'mainslider_') or contains(concat(' ', normalize-space(@class), ' '), ' swiper-container ')] - //img[contains(concat(' ', normalize-space(@class), ' '), ' main-image-content ')] - )"; - foreach ($xpath->query($queryImages) as $img) { - /** @var \DOMElement $img */ - $src = $img->getAttribute('src'); - if (!$src) { - $src = $img->getAttribute('data-src'); + $candidates = $xpath->query(" + //img[ + contains(@class,'main-image') or + contains(@class,'main-image-content') or + contains(@class,'swiper') or + contains(@class,'slide') or + contains(@class,'gallery') or + contains(@class,'product') or + contains(@class,'image') + ] | //source[@type='image/jpeg' or @type='image/webp'] + "); + + foreach ($candidates as $el) { + /** @var DOMElement $el */ + $src = $el->getAttribute('src') ?: $el->getAttribute('data-src'); + $srcset = $el->getAttribute('srcset') ?: $el->getAttribute('data-srcset'); + + if ($src) $urls[] = $src; + + if ($srcset) { + foreach (explode(',', $srcset) as $part) { + $u = trim(preg_replace('~\s+\d+[wx]$~', '', trim($part))); // отрезать " 524w" + if ($u) $urls[] = $u; + } } - if ($src) { - $urls[] = $src; + } + + foreach ($xpath->query("//script[@type='application/ld+json']") as $script) { + $json = trim($script->nodeValue ?? ''); + if (!$json) continue; + $data = json_decode($json, true); + if (!is_array($data)) continue; + + $graphs = isset($data['@graph']) && is_array($data['@graph']) ? $data['@graph'] : [$data]; + foreach ($graphs as $node) { + if (!is_array($node)) continue; + if (($node['@type'] ?? '') === 'Product' && !empty($node['image'])) { + if (is_string($node['image'])) $urls[] = $node['image']; + if (is_array($node['image'])) $urls = array_merge($urls, array_values($node['image'])); + } } } - $urls = array_values(array_unique($urls)); - $urls = array_values(array_filter($urls, fn($u) => preg_match('~flowwow-images\.com/data/flowers/~', $u))); + $urls = array_values(array_unique(array_filter(array_map('trim', $urls)))); + $urls = array_values(array_filter($urls, function ($u) { + return (bool)preg_match('~^https?://content\d*\.flowwow-images\.com/.+\.(?:jpe?g|png|webp|gif)(?:\?.*)?$~i', $u); + })); return $urls; }