diff --git a/scraper.py b/scraper.py index 51f6eb5..a96be45 100644 --- a/scraper.py +++ b/scraper.py @@ -39,7 +39,7 @@ def _browser_headers(ua=_UA_CHROME): "User-Agent": ua, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8", - "Accept-Encoding": "gzip, deflate, br", + "Accept-Encoding": "gzip, deflate", # no brotli — needs pip brotli "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-Dest": "document", @@ -273,8 +273,14 @@ Be thorough and specific — this is for competitive analysis.""" def scrape_product(url: str) -> dict: """Scrape a JV product URL and return structured product data.""" sess = _make_session() - r = sess.get(url, timeout=15, allow_redirects=True) + r = sess.get(url, timeout=20, allow_redirects=True) r.raise_for_status() + + # Detect redirect to homepage (product removed/renamed) + final = r.url.rstrip("/") + if final.endswith("justvitamins.co.uk") or final.endswith("justvitamins.co.uk/"): + raise RuntimeError(f"Product page redirected to homepage — URL may be outdated: {url}") + soup = BeautifulSoup(r.text, "html.parser") data = {} @@ -287,27 +293,41 @@ def scrape_product(url: str) -> dict: h2 = soup.select_one(".ProdDet h2") data["subtitle"] = h2.get_text(strip=True) if h2 else "" - # Price from offer microdata - offer_price = soup.select_one("meta[itemprop='price']") - if offer_price: - data["price"] = f"£{offer_price.get('content', '')}" + # Price — try itemprop (any element), then class, then regex + price_el = soup.select_one("[itemprop='price']") + if price_el: + price_val = price_el.get("content") or price_el.get_text(strip=True) + price_val = re.sub(r'[^\d.]', '', price_val) + data["price"] = f"£{price_val}" if price_val else "" else: - price_match = re.search(r'£[\d.]+', soup.get_text()) - data["price"] = price_match.group(0) if price_match else "" + # Try common price selectors + for sel in [".pricec", ".product-price", "[class*='Price']"]: + el = soup.select_one(sel) + if el: + pm = re.search(r'£[\d.]+', el.get_text()) + if pm: + data["price"] = pm.group(0) + break + else: + price_match = re.search(r'£[\d.]+', soup.get_text()) + data["price"] = price_match.group(0) if price_match else "" # SKU sku = soup.select_one("meta[itemprop='sku']") data["sku"] = sku.get("content", "") if sku else "" - # Images + # Images — try multiple strategies images = [] - main_img = soup.select_one("img[itemprop='image']") - if main_img: - src = main_img.get("src", "") - if src and not src.startswith("http"): - src = "https://images.justvitamins.co.uk" + src - images.append(src) - for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"): + # 1. itemprop image + for img in soup.select("img[itemprop='image']"): + src = img.get("src", "") + if src: + if not src.startswith("http"): + src = "https://images.justvitamins.co.uk" + src + if src not in images: + images.append(src) + # 2. Product gallery links + for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a, .ProdImg a"): href = a.get("href", "") if href: if not href.startswith("http"): @@ -315,6 +335,19 @@ def scrape_product(url: str) -> dict: full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/") if full not in images and href not in images: images.append(full if "Normal" in full else href) + # 3. Product image in img tags with justvitamins CDN + if not images: + for img in soup.select("img[src*='justvitamins'], img[src*='product-images']"): + src = img.get("src", "") + if src and src not in images: + if not src.startswith("http"): + src = "https://images.justvitamins.co.uk" + src + images.append(src) + # 4. OG image fallback + if not images: + og = soup.select_one("meta[property='og:image']") + if og and og.get("content"): + images.append(og["content"]) data["images"] = images # Key benefits