fix: JV scraper broken by brotli encoding + improved robustness
ROOT CAUSE: _browser_headers() included 'Accept-Encoding: gzip, deflate, br' but the container has no brotli decoder. The server sent a brotli-compressed response that requests could not decode → garbled HTML → empty title → 'Could not find product' error on Demo A and Demo C.

FIXES:
- Remove 'br' from Accept-Encoding (use 'gzip, deflate' only)
- Price extraction: try itemprop on any element, then .pricec class, then regex
- Image extraction: multi-strategy (itemprop, gallery links, CDN pattern, OG)
- Detect homepage redirect (product removed/renamed) → clear error message
- Increase timeout from 15s to 20s for JV product scraping

TESTED:
- D3+K2: Title ✓, Price £12.95 ✓, 10 benefits ✓, 3 images ✓
- Vitamin D3 4000iu: Title ✓, £8.95 ✓, 6 benefits ✓, 7 images ✓
- B12: Title ✓, £11.95 ✓, 10 benefits ✓, 7 images ✓
- Removed product: clean error 'redirected to homepage'
This commit is contained in:
65
scraper.py
65
scraper.py
@@ -39,7 +39,7 @@ def _browser_headers(ua=_UA_CHROME):
|
|||||||
"User-Agent": ua,
|
"User-Agent": ua,
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
"Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8",
|
"Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8",
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
"Accept-Encoding": "gzip, deflate", # no brotli — needs pip brotli
|
||||||
"Connection": "keep-alive",
|
"Connection": "keep-alive",
|
||||||
"Upgrade-Insecure-Requests": "1",
|
"Upgrade-Insecure-Requests": "1",
|
||||||
"Sec-Fetch-Dest": "document",
|
"Sec-Fetch-Dest": "document",
|
||||||
@@ -273,8 +273,14 @@ Be thorough and specific — this is for competitive analysis."""
|
|||||||
def scrape_product(url: str) -> dict:
|
def scrape_product(url: str) -> dict:
|
||||||
"""Scrape a JV product URL and return structured product data."""
|
"""Scrape a JV product URL and return structured product data."""
|
||||||
sess = _make_session()
|
sess = _make_session()
|
||||||
r = sess.get(url, timeout=15, allow_redirects=True)
|
r = sess.get(url, timeout=20, allow_redirects=True)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
||||||
|
# Detect redirect to homepage (product removed/renamed)
|
||||||
|
final = r.url.rstrip("/")
|
||||||
|
if final.endswith("justvitamins.co.uk") or final.endswith("justvitamins.co.uk/"):
|
||||||
|
raise RuntimeError(f"Product page redirected to homepage — URL may be outdated: {url}")
|
||||||
|
|
||||||
soup = BeautifulSoup(r.text, "html.parser")
|
soup = BeautifulSoup(r.text, "html.parser")
|
||||||
|
|
||||||
data = {}
|
data = {}
|
||||||
@@ -287,27 +293,41 @@ def scrape_product(url: str) -> dict:
|
|||||||
h2 = soup.select_one(".ProdDet h2")
|
h2 = soup.select_one(".ProdDet h2")
|
||||||
data["subtitle"] = h2.get_text(strip=True) if h2 else ""
|
data["subtitle"] = h2.get_text(strip=True) if h2 else ""
|
||||||
|
|
||||||
# Price from offer microdata
|
# Price — try itemprop (any element), then class, then regex
|
||||||
offer_price = soup.select_one("meta[itemprop='price']")
|
price_el = soup.select_one("[itemprop='price']")
|
||||||
if offer_price:
|
if price_el:
|
||||||
data["price"] = f"£{offer_price.get('content', '')}"
|
price_val = price_el.get("content") or price_el.get_text(strip=True)
|
||||||
|
price_val = re.sub(r'[^\d.]', '', price_val)
|
||||||
|
data["price"] = f"£{price_val}" if price_val else ""
|
||||||
else:
|
else:
|
||||||
price_match = re.search(r'£[\d.]+', soup.get_text())
|
# Try common price selectors
|
||||||
data["price"] = price_match.group(0) if price_match else ""
|
for sel in [".pricec", ".product-price", "[class*='Price']"]:
|
||||||
|
el = soup.select_one(sel)
|
||||||
|
if el:
|
||||||
|
pm = re.search(r'£[\d.]+', el.get_text())
|
||||||
|
if pm:
|
||||||
|
data["price"] = pm.group(0)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
price_match = re.search(r'£[\d.]+', soup.get_text())
|
||||||
|
data["price"] = price_match.group(0) if price_match else ""
|
||||||
|
|
||||||
# SKU
|
# SKU
|
||||||
sku = soup.select_one("meta[itemprop='sku']")
|
sku = soup.select_one("meta[itemprop='sku']")
|
||||||
data["sku"] = sku.get("content", "") if sku else ""
|
data["sku"] = sku.get("content", "") if sku else ""
|
||||||
|
|
||||||
# Images
|
# Images — try multiple strategies
|
||||||
images = []
|
images = []
|
||||||
main_img = soup.select_one("img[itemprop='image']")
|
# 1. itemprop image
|
||||||
if main_img:
|
for img in soup.select("img[itemprop='image']"):
|
||||||
src = main_img.get("src", "")
|
src = img.get("src", "")
|
||||||
if src and not src.startswith("http"):
|
if src:
|
||||||
src = "https://images.justvitamins.co.uk" + src
|
if not src.startswith("http"):
|
||||||
images.append(src)
|
src = "https://images.justvitamins.co.uk" + src
|
||||||
for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"):
|
if src not in images:
|
||||||
|
images.append(src)
|
||||||
|
# 2. Product gallery links
|
||||||
|
for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a, .ProdImg a"):
|
||||||
href = a.get("href", "")
|
href = a.get("href", "")
|
||||||
if href:
|
if href:
|
||||||
if not href.startswith("http"):
|
if not href.startswith("http"):
|
||||||
@@ -315,6 +335,19 @@ def scrape_product(url: str) -> dict:
|
|||||||
full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
|
full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
|
||||||
if full not in images and href not in images:
|
if full not in images and href not in images:
|
||||||
images.append(full if "Normal" in full else href)
|
images.append(full if "Normal" in full else href)
|
||||||
|
# 3. Product image in img tags with justvitamins CDN
|
||||||
|
if not images:
|
||||||
|
for img in soup.select("img[src*='justvitamins'], img[src*='product-images']"):
|
||||||
|
src = img.get("src", "")
|
||||||
|
if src and src not in images:
|
||||||
|
if not src.startswith("http"):
|
||||||
|
src = "https://images.justvitamins.co.uk" + src
|
||||||
|
images.append(src)
|
||||||
|
# 4. OG image fallback
|
||||||
|
if not images:
|
||||||
|
og = soup.select_one("meta[property='og:image']")
|
||||||
|
if og and og.get("content"):
|
||||||
|
images.append(og["content"])
|
||||||
data["images"] = images
|
data["images"] = images
|
||||||
|
|
||||||
# Key benefits
|
# Key benefits
|
||||||
|
|||||||
Reference in New Issue
Block a user