gitweb.erp-flowers.ru Git - erp24_rep/yii-erp24/.git/commitdiff
исправление парсера
author Vladimir Fomichev <vladimir.fomichev@erp-flowers.ru>
Mon, 29 Sep 2025 15:18:45 +0000 (18:18 +0300)
committer Vladimir Fomichev <vladimir.fomichev@erp-flowers.ru>
Mon, 29 Sep 2025 15:18:45 +0000 (18:18 +0300)
erp24/services/ProductParserService.php

index 29bd650b84f974a4dce35f953899e8dae691b998..6bb84976fa6d1cbdb0ee16f48b248ee19379d9f2 100644 (file)
@@ -7,7 +7,6 @@ use DOMXPath;
 class ProductParserService {
     public function parseProductHtml(string $html): array
     {
-
         $dom = new DOMDocument();
         libxml_use_internal_errors(true);
         $dom->loadHTML($html);
@@ -15,14 +14,23 @@ class ProductParserService {
 
         $xpath = new DOMXPath($dom);
 
+        $images = $this->extractImageUrls($xpath);
+        $ogImage = $this->extractOgImage($xpath);
+        $mainImage = $images[0] ?? $ogImage ?? '';
+
+
+        if ($mainImage && (empty($images) || $images[0] !== $mainImage)) {
+            array_unshift($images, $mainImage);
+            $images = array_values(array_unique($images));
+        }
 
         return [
-            'name' => $this->extractName($xpath),
-            'image_url' => ($this->extractImageUrls($xpath))[0],
+            'name'        => $this->extractName($xpath) ?: $this->extractOgTitle($xpath), // мягкий фолбэк
+            'image_url'   => $mainImage,
             'description' => $this->extractDescription($xpath),
-            'properties' => $this->extractProperties($xpath),
-            'video_url' => $this->extractVideoUrl($xpath),
-            'image_urls' => $this->extractImageUrls($xpath),
+            'properties'  => $this->extractProperties($xpath),
+            'video_url'   => $this->extractVideoUrl($xpath),
+            'image_urls'  => $images,
         ];
     }
 
@@ -44,28 +52,68 @@ class ProductParserService {
         }
         return $src;
     }
+    private function extractOgImage(DOMXPath $xpath): ?string
+    {
+        $node = $xpath->query("//meta[@property='og:image' or @name='og:image']/@content")->item(0);
+        return $node ? trim($node->nodeValue) : null;
+    }
 
-    private function extractImageUrls(\DOMXPath $xpath): array
+    private function extractOgTitle(DOMXPath $xpath): string
+    {
+        $node = $xpath->query("//meta[@property='og:title' or @name='og:title']/@content")->item(0);
+        return $node ? trim($node->nodeValue) : '';
+    }
+    private function extractImageUrls(DOMXPath $xpath): array
     {
         $urls = [];
 
-        $queryImages = "(
-        //div[starts-with(@id,'mainslider_') or contains(concat(' ', normalize-space(@class), ' '), ' swiper-container ')]
-        //img[contains(concat(' ', normalize-space(@class), ' '), ' main-image-content ')]
-    )";
-        foreach ($xpath->query($queryImages) as $img) {
-            /** @var \DOMElement $img */
-            $src = $img->getAttribute('src');
-            if (!$src) {
-                $src = $img->getAttribute('data-src');
+        $candidates = $xpath->query("
+        //img[
+            contains(@class,'main-image') or
+            contains(@class,'main-image-content') or
+            contains(@class,'swiper') or
+            contains(@class,'slide') or
+            contains(@class,'gallery') or
+            contains(@class,'product') or
+            contains(@class,'image')
+        ] | //source[@type='image/jpeg' or @type='image/webp']
+    ");
+
+        foreach ($candidates as $el) {
+            /** @var DOMElement $el */
+            $src = $el->getAttribute('src') ?: $el->getAttribute('data-src');
+            $srcset = $el->getAttribute('srcset') ?: $el->getAttribute('data-srcset');
+
+            if ($src) $urls[] = $src;
+
+            if ($srcset) {
+                foreach (explode(',', $srcset) as $part) {
+                    $u = trim(preg_replace('~\s+\d+[wx]$~', '', trim($part))); // отрезать " 524w"
+                    if ($u) $urls[] = $u;
+                }
             }
-            if ($src) {
-                $urls[] = $src;
+        }
+
+        foreach ($xpath->query("//script[@type='application/ld+json']") as $script) {
+            $json = trim($script->nodeValue ?? '');
+            if (!$json) continue;
+            $data = json_decode($json, true);
+            if (!is_array($data)) continue;
+
+            $graphs = isset($data['@graph']) && is_array($data['@graph']) ? $data['@graph'] : [$data];
+            foreach ($graphs as $node) {
+                if (!is_array($node)) continue;
+                if (($node['@type'] ?? '') === 'Product' && !empty($node['image'])) {
+                    if (is_string($node['image'])) $urls[] = $node['image'];
+                    if (is_array($node['image']))  $urls = array_merge($urls, array_values($node['image']));
+                }
             }
         }
 
-        $urls = array_values(array_unique($urls));
-        $urls = array_values(array_filter($urls, fn($u) => preg_match('~flowwow-images\.com/data/flowers/~', $u)));
+        $urls = array_values(array_unique(array_filter(array_map('trim', $urls))));
+        $urls = array_values(array_filter($urls, function ($u) {
+            return (bool)preg_match('~^https?://content\d*\.flowwow-images\.com/.+\.(?:jpe?g|png|webp|gif)(?:\?.*)?$~i', $u);
+        }));
 
         return $urls;
     }