From ab875cd4d92860dec630922554d5c9d8d53c20bf Mon Sep 17 00:00:00 2001
From: Omair Saleh <omair@quikcue.com>
Date: Mon, 2 Mar 2026 22:43:38 +0800
Subject: [PATCH] fix: JV scraper broken by brotli encoding + improved
 robustness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ROOT CAUSE: _browser_headers() included 'Accept-Encoding: gzip, deflate, br'
but the container has no brotli decoder. The server sent a brotli-compressed
response that requests couldn't decode → garbled HTML → empty title → 'Could
not find product' error on Demo A and Demo C.

FIXES:
- Remove 'br' from Accept-Encoding (use 'gzip, deflate' only)
- Price extraction: try itemprop on any element, then price classes
  (.pricec, .product-price, [class*='Price']), then a full-page regex
- Image extraction: multi-strategy (itemprop, gallery links, CDN pattern, OG)
- Detect homepage redirect (product removed/renamed) → raise RuntimeError
  with a clear error message
- Increase timeout from 15s to 20s for JV product scraping

TESTED:
- D3+K2: Title ✓, Price £12.95 ✓, 10 benefits ✓, 3 images ✓
- Vitamin D3 4000iu: Title ✓, £8.95 ✓, 6 benefits ✓, 7 images ✓
- B12: Title ✓, £11.95 ✓, 10 benefits ✓, 7 images ✓
- Removed product: clean error 'redirected to homepage'
---
 scraper.py | 65 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/scraper.py b/scraper.py
index 51f6eb5..a96be45 100644
--- a/scraper.py
+++ b/scraper.py
@@ -39,7 +39,7 @@ def _browser_headers(ua=_UA_CHROME):
         "User-Agent": ua,
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
         "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8",
-        "Accept-Encoding": "gzip, deflate, br",
+        "Accept-Encoding": "gzip, deflate",  # no brotli — needs pip brotli
         "Connection": "keep-alive",
         "Upgrade-Insecure-Requests": "1",
         "Sec-Fetch-Dest": "document",
@@ -273,8 +273,14 @@ Be thorough and specific — this is for competitive analysis."""
 def scrape_product(url: str) -> dict:
     """Scrape a JV product URL and return structured product data."""
     sess = _make_session()
-    r = sess.get(url, timeout=15, allow_redirects=True)
+    r = sess.get(url, timeout=20, allow_redirects=True)
     r.raise_for_status()
+
+    # Detect redirect to homepage (product removed/renamed)
+    final = r.url.rstrip("/")
+    if final.endswith("justvitamins.co.uk") or final.endswith("justvitamins.co.uk/"):
+        raise RuntimeError(f"Product page redirected to homepage — URL may be outdated: {url}")
+
     soup = BeautifulSoup(r.text, "html.parser")
 
     data = {}
@@ -287,27 +293,41 @@ def scrape_product(url: str) -> dict:
     h2 = soup.select_one(".ProdDet h2")
     data["subtitle"] = h2.get_text(strip=True) if h2 else ""
 
-    # Price from offer microdata
-    offer_price = soup.select_one("meta[itemprop='price']")
-    if offer_price:
-        data["price"] = f"£{offer_price.get('content', '')}"
+    # Price — try itemprop (any element), then class, then regex
+    price_el = soup.select_one("[itemprop='price']")
+    if price_el:
+        price_val = price_el.get("content") or price_el.get_text(strip=True)
+        price_val = re.sub(r'[^\d.]', '', price_val)
+        data["price"] = f"£{price_val}" if price_val else ""
     else:
-        price_match = re.search(r'£[\d.]+', soup.get_text())
-        data["price"] = price_match.group(0) if price_match else ""
+        # Try common price selectors
+        for sel in [".pricec", ".product-price", "[class*='Price']"]:
+            el = soup.select_one(sel)
+            if el:
+                pm = re.search(r'£[\d.]+', el.get_text())
+                if pm:
+                    data["price"] = pm.group(0)
+                    break
+        else:
+            price_match = re.search(r'£[\d.]+', soup.get_text())
+            data["price"] = price_match.group(0) if price_match else ""
 
     # SKU
     sku = soup.select_one("meta[itemprop='sku']")
     data["sku"] = sku.get("content", "") if sku else ""
 
-    # Images
+    # Images — try multiple strategies
     images = []
-    main_img = soup.select_one("img[itemprop='image']")
-    if main_img:
-        src = main_img.get("src", "")
-        if src and not src.startswith("http"):
-            src = "https://images.justvitamins.co.uk" + src
-        images.append(src)
-    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"):
+    # 1. itemprop image
+    for img in soup.select("img[itemprop='image']"):
+        src = img.get("src", "")
+        if src:
+            if not src.startswith("http"):
+                src = "https://images.justvitamins.co.uk" + src
+            if src not in images:
+                images.append(src)
+    # 2. Product gallery links
+    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a, .ProdImg a"):
         href = a.get("href", "")
         if href:
             if not href.startswith("http"):
@@ -315,6 +335,19 @@ def scrape_product(url: str) -> dict:
             full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
             if full not in images and href not in images:
                 images.append(full if "Normal" in full else href)
+    # 3. Product image in img tags with justvitamins CDN
+    if not images:
+        for img in soup.select("img[src*='justvitamins'], img[src*='product-images']"):
+            src = img.get("src", "")
+            if src and src not in images:
+                if not src.startswith("http"):
+                    src = "https://images.justvitamins.co.uk" + src
+                images.append(src)
+    # 4. OG image fallback
+    if not images:
+        og = soup.select_one("meta[property='og:image']")
+        if og and og.get("content"):
+            images.append(og["content"])
     data["images"] = images
 
     # Key benefits