class ProductParserService {
/**
 * Parses a product page and returns its normalized attributes.
 *
 * The main image is the first gallery image found by extractImageUrls().
 * When the gallery is empty, the Open Graph image (if any) is used instead
 * and also becomes the only entry of `image_urls`.
 *
 * @param string $html Raw HTML of the product page; may be empty.
 *
 * @return array Map with the keys `name`, `image_url`, `description`,
 *               `properties`, `video_url` and `image_urls`. `image_url` is
 *               an empty string when no image could be found.
 */
public function parseProductHtml(string $html): array
{
    $dom = new \DOMDocument();

    // Real-world markup is rarely valid: collect libxml errors silently, then
    // drop them and restore the caller's setting so the buffer does not grow
    // and the global libxml state is not left modified.
    $previousSetting = libxml_use_internal_errors(true);
    try {
        // loadHTML() throws ValueError on an empty string (PHP 8+); an empty
        // document simply yields empty results from every extractor.
        if (trim($html) !== '') {
            $dom->loadHTML($html);
        }
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }

    $xpath = new \DOMXPath($dom);

    $images = $this->extractImageUrls($xpath);
    if ($images === []) {
        // No gallery images: fall back to the Open Graph image.
        $ogImage = (string) $this->extractOgImage($xpath);
        if ($ogImage !== '') {
            $images = [$ogImage];
        }
    }

    return [
        // Soft fallback: use og:title when the page markup yields no name.
        'name'        => $this->extractName($xpath) ?: $this->extractOgTitle($xpath),
        'image_url'   => $images[0] ?? '',
        'description' => $this->extractDescription($xpath),
        'properties'  => $this->extractProperties($xpath),
        'video_url'   => $this->extractVideoUrl($xpath),
        'image_urls'  => $images,
    ];
}
}
return $src;
}
/**
 * Returns the image URL advertised through Open Graph metadata.
 *
 * Looks at every `<meta property="og:image">` / `<meta name="og:image">`
 * tag in document order and returns the first non-blank `content` value.
 *
 * @param \DOMXPath $xpath XPath handle bound to the parsed page.
 *
 * @return string|null Trimmed URL, or null when no tag carries a usable value.
 */
private function extractOgImage(\DOMXPath $xpath): ?string
{
    $contents = $xpath->query("//meta[@property='og:image' or @name='og:image']/@content");
    if ($contents === false) {
        return null;
    }

    foreach ($contents as $attribute) {
        $url = trim((string) $attribute->nodeValue);
        // A blank content attribute is treated as "no image" so callers can
        // rely on null-coalescing instead of receiving an empty string.
        if ($url !== '') {
            return $url;
        }
    }

    return null;
}
/**
 * Returns the page title advertised through Open Graph metadata.
 *
 * @param \DOMXPath $xpath XPath handle bound to the parsed page.
 *
 * @return string Trimmed `content` of the first og:title meta tag, or an
 *                empty string when the page has none.
 */
private function extractOgTitle(\DOMXPath $xpath): string
{
    $contents = $xpath->query("//meta[@property='og:title' or @name='og:title']/@content");
    $attribute = $contents === false ? null : $contents->item(0);

    return $attribute ? trim((string) $attribute->nodeValue) : '';
}

/**
 * Collects the product image URLs found on the page.
 *
 * Candidates come from three places:
 *  - `src` / `data-src` and `srcset` / `data-srcset` of `<img>` elements
 *    whose class attribute contains one of the known gallery markers;
 *  - the same attributes of `<source>` elements typed as JPEG or WebP;
 *  - the `image` property of JSON-LD nodes whose `@type` includes `Product`.
 *
 * Only absolute http(s) URLs on a `content*.flowwow-images.com` host that
 * end in a known image extension (optionally followed by a query string)
 * are kept.
 *
 * @param \DOMXPath $xpath XPath handle bound to the parsed page.
 *
 * @return array List of unique URLs in order of first appearance.
 */
private function extractImageUrls(\DOMXPath $xpath): array
{
    $urls = [];

    // contains(@class,'image') also covers the 'main-image' and
    // 'main-image-content' markers, so they need no separate terms.
    $candidates = $xpath->query("
        //img[
            contains(@class,'swiper') or
            contains(@class,'slide') or
            contains(@class,'gallery') or
            contains(@class,'product') or
            contains(@class,'image')
        ] | //source[@type='image/jpeg' or @type='image/webp']
    ");

    foreach ($candidates as $el) {
        /** @var \DOMElement $el */
        $src = $el->getAttribute('src') ?: $el->getAttribute('data-src');
        if ($src !== '') {
            $urls[] = $src;
        }

        $srcset = $el->getAttribute('srcset') ?: $el->getAttribute('data-srcset');
        foreach (explode(',', $srcset) as $candidate) {
            // A srcset candidate is "<url> [descriptor]"; keeping only the
            // first whitespace-delimited token drops " 524w", " 2x" and
            // fractional densities such as " 1.5x" alike.
            $parts = preg_split('~\s+~', trim($candidate), 2);
            $url = is_array($parts) ? $parts[0] : '';
            if ($url !== '') {
                $urls[] = $url;
            }
        }
    }

    foreach ($xpath->query("//script[@type='application/ld+json']") as $script) {
        $json = trim((string) $script->nodeValue);
        if ($json === '') {
            continue;
        }

        $data = json_decode($json, true);
        if (!is_array($data)) {
            continue;
        }

        // JSON-LD may be a single node, a {"@graph": [...]} wrapper, or a
        // plain top-level list of nodes.
        if (isset($data['@graph']) && is_array($data['@graph'])) {
            $nodes = $data['@graph'];
        } elseif (array_key_exists(0, $data)) {
            $nodes = $data;
        } else {
            $nodes = [$data];
        }

        foreach ($nodes as $node) {
            if (!is_array($node) || empty($node['image'])) {
                continue;
            }

            // "@type" may be a single string or a list of types.
            if (!in_array('Product', (array) ($node['@type'] ?? []), true)) {
                continue;
            }

            // "image" may be a URL, a list of URLs, an ImageObject or a list
            // of ImageObjects. Gather every string leaf and let the URL
            // filter below discard non-URL values such as "ImageObject";
            // non-string values are never collected.
            $images = (array) $node['image'];
            array_walk_recursive($images, static function ($value) use (&$urls): void {
                if (is_string($value)) {
                    $urls[] = $value;
                }
            });
        }
    }

    $pattern = '~^https?://content\d*\.flowwow-images\.com/.+\.(?:jpe?g|png|webp|gif)(?:\?.*)?$~i';

    // Keyed by URL to de-duplicate while preserving first-seen order.
    $accepted = [];
    foreach ($urls as $url) {
        $url = trim($url);
        if (preg_match($pattern, $url) === 1) {
            $accepted[$url] = true;
        }
    }

    return array_keys($accepted);
}