From ab875cd4d92860dec630922554d5c9d8d53c20bf Mon Sep 17 00:00:00 2001 From: Omair Saleh Date: Mon, 2 Mar 2026 22:43:38 +0800 Subject: [PATCH] fix: JV scraper broken by brotli encoding + improved robustness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROOT CAUSE: _browser_headers() included 'Accept-Encoding: gzip, deflate, br' but the container has no brotli decoder. Server sent compressed response that requests couldn't decode → garbled HTML → empty title → 'Could not find product' error on Demo A and Demo C. FIXES: - Remove 'br' from Accept-Encoding (use 'gzip, deflate' only) - Price extraction: try itemprop on any element, then .pricec class, then regex - Image extraction: multi-strategy (itemprop, gallery links, CDN pattern, OG) - Detect homepage redirect (product removed/renamed) → clear error message - Increase timeout from 15s to 20s for JV product scraping TESTED: - D3+K2: Title ✓, Price £12.95 ✓, 10 benefits ✓, 3 images ✓ - Vitamin D3 4000iu: Title ✓, £8.95 ✓, 6 benefits ✓, 7 images ✓ - B12: Title ✓, £11.95 ✓, 10 benefits ✓, 7 images ✓ - Removed product: clean error 'redirected to homepage' --- scraper.py | 65 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/scraper.py b/scraper.py index 51f6eb5..a96be45 100644 --- a/scraper.py +++ b/scraper.py @@ -39,7 +39,7 @@ def _browser_headers(ua=_UA_CHROME): "User-Agent": ua, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8", - "Accept-Encoding": "gzip, deflate, br", + "Accept-Encoding": "gzip, deflate", # no brotli — needs pip brotli "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-Dest": "document", @@ -273,8 +273,14 @@ Be thorough and specific — this is for competitive analysis.""" def scrape_product(url: str) -> dict: """Scrape a JV product URL and return structured product data.""" sess = _make_session() - r = sess.get(url, timeout=15, allow_redirects=True) + r = sess.get(url, timeout=20, allow_redirects=True) r.raise_for_status() + + # Detect redirect to homepage (product removed/renamed) + final = r.url.rstrip("/") + if final.endswith("justvitamins.co.uk") or final.endswith("justvitamins.co.uk/"): + raise RuntimeError(f"Product page redirected to homepage — URL may be outdated: {url}") + soup = BeautifulSoup(r.text, "html.parser") data = {} @@ -287,27 +293,41 @@ def scrape_product(url: str) -> dict: h2 = soup.select_one(".ProdDet h2") data["subtitle"] = h2.get_text(strip=True) if h2 else "" - # Price from offer microdata - offer_price = soup.select_one("meta[itemprop='price']") - if offer_price: - data["price"] = f"£{offer_price.get('content', '')}" + # Price — try itemprop (any element), then class, then regex + price_el = soup.select_one("[itemprop='price']") + if price_el: + price_val = price_el.get("content") or price_el.get_text(strip=True) + price_val = re.sub(r'[^\d.]', '', price_val) + data["price"] = f"£{price_val}" if price_val else "" else: - price_match = re.search(r'£[\d.]+', soup.get_text()) - data["price"] = price_match.group(0) if price_match else "" + # Try common price selectors + for sel in [".pricec", ".product-price", "[class*='Price']"]: + el = soup.select_one(sel) + if el: + pm = re.search(r'£[\d.]+', el.get_text()) + if pm: + data["price"] = pm.group(0) + break + else: + price_match = re.search(r'£[\d.]+', soup.get_text()) + data["price"] = price_match.group(0) if price_match else "" # SKU sku = soup.select_one("meta[itemprop='sku']") data["sku"] = sku.get("content", "") if sku else "" - # Images + # Images — try multiple strategies images = [] - main_img = soup.select_one("img[itemprop='image']") - if main_img: - src = main_img.get("src", "") - if src and not src.startswith("http"): - src = "https://images.justvitamins.co.uk" + src - images.append(src) - for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"): + # 1. itemprop image + for img in soup.select("img[itemprop='image']"): + src = img.get("src", "") + if src: + if not src.startswith("http"): + src = "https://images.justvitamins.co.uk" + src + if src not in images: + images.append(src) + # 2. Product gallery links + for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a, .ProdImg a"): href = a.get("href", "") if href: if not href.startswith("http"): @@ -315,6 +335,19 @@ def scrape_product(url: str) -> dict: full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/") if full not in images and href not in images: images.append(full if "Normal" in full else href) + # 3. Product image in img tags with justvitamins CDN + if not images: + for img in soup.select("img[src*='justvitamins'], img[src*='product-images']"): + src = img.get("src", "") + if src and src not in images: + if not src.startswith("http"): + src = "https://images.justvitamins.co.uk" + src + images.append(src) + # 4. OG image fallback + if not images: + og = soup.select_one("meta[property='og:image']") + if og and og.get("content"): + images.append(og["content"]) data["images"] = images # Key benefits