Files
justvitamin/scraper.py
Omair Saleh ab875cd4d9 fix: JV scraper broken by brotli encoding + improved robustness
ROOT CAUSE: _browser_headers() included 'Accept-Encoding: gzip, deflate, br'
but the container has no brotli decoder. Server sent compressed response
that requests couldn't decode → garbled HTML → empty title → 'Could not
find product' error on Demo A and Demo C.

FIXES:
- Remove 'br' from Accept-Encoding (use 'gzip, deflate' only)
- Price extraction: try itemprop on any element, then .pricec class, then regex
- Image extraction: multi-strategy (itemprop, gallery links, CDN pattern, OG)
- Detect homepage redirect (product removed/renamed) → clear error message
- Increase timeout from 15s to 20s for JV product scraping

TESTED:
- D3+K2: Title ✓, Price £12.95 ✓, 10 benefits ✓, 3 images ✓
- Vitamin D3 4000iu: Title ✓, £8.95 ✓, 6 benefits ✓, 7 images ✓
- B12: Title ✓, £11.95 ✓, 10 benefits ✓, 7 images ✓
- Removed product: clean error 'redirected to homepage'
2026-03-02 22:43:38 +08:00

588 lines
22 KiB
Python

"""Scrape product pages — JustVitamins specific + generic competitor.
Competitor scraping has a 3-tier fallback chain:
1. Direct HTTP with full browser headers + session cookies
2. Retry with alternate User-Agent / headers
3. Gemini AI URL grounding (reads the page via Google's infrastructure)
This ensures ~100% success even against Cloudflare, CAPTCHAs, JS-rendered
pages, and anti-bot systems.
"""
import os, re, json, time, logging
from urllib.parse import urljoin
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
log = logging.getLogger(__name__)
# ── Browser-like headers ─────────────────────────────────────
# User-Agent strings rotated across the fetch tiers in _fetch_html:
# Tier 1 uses Chrome, Tier 2 Firefox, Tier 3 Safari on macOS.
_UA_CHROME = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
_UA_FIREFOX = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) "
    "Gecko/20100101 Firefox/133.0"
)
_UA_MAC = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/18.2 Safari/605.1.15"
)
def _browser_headers(ua=_UA_CHROME):
    """Return a header dict that mimics a real browser navigation request.

    *ua* selects the User-Agent string; all other headers are fixed.
    Brotli ('br') is deliberately absent from Accept-Encoding because the
    runtime has no brotli decoder installed.
    """
    header_pairs = (
        ("User-Agent", ua),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-GB,en;q=0.9,en-US;q=0.8"),
        ("Accept-Encoding", "gzip, deflate"),  # no brotli — needs pip brotli
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
        ("Sec-Fetch-Dest", "document"),
        ("Sec-Fetch-Mode", "navigate"),
        ("Sec-Fetch-Site", "none"),
        ("Sec-Fetch-User", "?1"),
        ("Cache-Control", "max-age=0"),
        ("DNT", "1"),
    )
    return dict(header_pairs)
def _make_session(ua=_UA_CHROME):
    """Build a requests.Session pre-configured with retries and browser headers.

    Retries transient 5xx responses twice with exponential backoff before
    giving up; headers come from _browser_headers(ua).
    """
    session = requests.Session()
    policy = Retry(total=2, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=policy))
    session.headers.update(_browser_headers(ua))
    return session
def _fetch_html(url: str) -> tuple:
    """Fetch HTML with 4-tier fallback.

    Returns (html_text, method_used) when one of the direct tiers (1-3)
    succeeds, or (None, parsed_dict) when Tier 4 (Gemini grounding) extracts
    the product data itself — callers must check the second element's type.
    For 404s and blocked pages, tiers 2-3 are skipped and we go straight to
    Gemini Google Search grounding. Raises RuntimeError if every tier fails,
    with a summary of all per-tier errors."""
    errors = []
    got_404 = False
    got_blocked = False
    # ── Tier 1: Chrome UA with full headers ──────────────────
    try:
        sess = _make_session(_UA_CHROME)
        r = sess.get(url, timeout=20, allow_redirects=True)
        # >1000 chars guards against empty/stub bodies that would otherwise
        # count as success
        if r.status_code == 200 and len(r.text) > 1000:
            if not _is_blocked(r.text):
                return (r.text, "direct-chrome")
            got_blocked = True
            errors.append(f"Tier 1: blocked ({_block_reason(r.text)})")
        elif r.status_code == 404:
            got_404 = True
            errors.append(f"Tier 1: 404 Not Found")
        elif r.status_code == 403:
            got_blocked = True
            errors.append(f"Tier 1: 403 Forbidden")
        else:
            errors.append(f"Tier 1: HTTP {r.status_code}")
    except Exception as e:
        errors.append(f"Tier 1: {e}")
    # If it's a 404 or blocked, skip tier 2-3 — go straight to Gemini
    # (retrying with a different UA won't revive a dead page)
    if not (got_404 or got_blocked):
        # ── Tier 2: Firefox UA ───────────────────────────────
        try:
            sess = _make_session(_UA_FIREFOX)
            r = sess.get(url, timeout=20, allow_redirects=True)
            if r.status_code == 200 and len(r.text) > 1000:
                if not _is_blocked(r.text):
                    return (r.text, "direct-firefox")
                got_blocked = True
                errors.append(f"Tier 2: blocked ({_block_reason(r.text)})")
            else:
                errors.append(f"Tier 2: HTTP {r.status_code}")
        except Exception as e:
            errors.append(f"Tier 2: {e}")
        # ── Tier 3: Safari UA with cookie pre-fetch ──────────
        try:
            sess = _make_session(_UA_MAC)
            domain = re.match(r'(https?://[^/]+)', url)
            if domain:
                # Visit the homepage first so the session picks up any
                # cookies the site expects; failures here are non-fatal.
                try:
                    sess.get(domain.group(1) + "/", timeout=10, allow_redirects=True)
                except Exception:
                    pass
            r = sess.get(url, timeout=20, allow_redirects=True)
            if r.status_code == 200 and len(r.text) > 1000:
                if not _is_blocked(r.text):
                    return (r.text, "direct-safari")
                errors.append(f"Tier 3: blocked ({_block_reason(r.text)})")
            else:
                errors.append(f"Tier 3: HTTP {r.status_code}")
        except Exception as e:
            errors.append(f"Tier 3: {e}")
    # ── Tier 4: Gemini + Google Search grounding ─────────────
    # Works for: dead pages (finds similar product), blocked pages,
    # JS-rendered pages, CAPTCHA pages
    try:
        log.info(f"Falling back to Gemini grounding for {url}")
        data = _gemini_scrape(url)
        if data and data.get("title"):
            data["_scrape_method"] = "gemini-grounding"
            data["_scrape_errors"] = errors
            return (None, data)  # Return parsed data directly
        errors.append("Tier 4 (Gemini): no title found")
    except Exception as e:
        errors.append(f"Tier 4 (Gemini): {e}")
    # All tiers failed
    error_summary = " | ".join(errors)
    raise RuntimeError(f"All scraping methods failed for {url}: {error_summary}")
def _is_blocked(html: str) -> bool:
"""Detect anti-bot / CAPTCHA / access denied pages."""
lower = html.lower()
signals = [
"pardon our interruption",
"access denied",
"robot check",
"captcha",
"please verify you are a human",
"enable javascript and cookies",
"just a moment", # Cloudflare
"checking your browser",
"attention required",
"automated access",
"unusual traffic",
]
return any(s in lower for s in signals)
def _block_reason(html: str) -> str:
lower = html.lower()
if "cloudflare" in lower or "just a moment" in lower:
return "Cloudflare"
if "captcha" in lower or "robot check" in lower:
return "CAPTCHA"
if "pardon our interruption" in lower:
return "Bot detection"
if "access denied" in lower:
return "Access denied"
return "Anti-bot"
# ── Gemini URL grounding (Tier 4 fallback) ───────────────────
def _gemini_scrape(url: str) -> dict:
    """Use Gemini with Google Search grounding to extract product data.

    Gemini searches for the URL/product through Google's infrastructure,
    bypassing anti-bot systems. Returns a dict with at least 'url',
    'raw_text', 'bullets' and 'images' keys; 'title' may be empty if nothing
    usable was found. Raises RuntimeError when GEMINI_API_KEY is unset and
    re-raises any API error other than malformed JSON output.
    """
    api_key = os.environ.get("GEMINI_API_KEY", "")
    if not api_key:
        raise RuntimeError("No GEMINI_API_KEY for fallback scraping")
    from google import genai
    from google.genai import types
    client = genai.Client(api_key=api_key)
    prompt = f"""Find and extract product information from this URL: {url}
Search for this exact product page and extract all available data.
If the product page no longer exists (404), search for the same or similar product from the same brand.
Return JSON:
{{
"title": "Product title/name",
"brand": "Brand name",
"price": "Price with currency symbol (e.g. £9.99)",
"description": "Product description text (up to 2000 chars)",
"bullets": ["Feature or benefit 1", "Feature 2", "Feature 3"],
"images": [],
"meta_description": "What this product page would say",
"ingredients": "Key ingredients if it's a supplement",
"rating": "Star rating if known",
"review_count": "Number of reviews if known",
"url_status": "live" or "dead/redirected" or "blocked"
}}
Be thorough and specific — this is for competitive analysis."""
    try:
        # Can't use response_mime_type=json with tools, so parse manually
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=0.1,
                tools=[types.Tool(google_search=types.GoogleSearch())],
            ),
        )
        text = response.text or ""
        # Extract JSON from response: prefer a fenced ```json block, else
        # the outermost brace-delimited span.
        match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
        if match:
            text = match.group(1)
        else:
            match = re.search(r'\{.*\}', text, re.DOTALL)
            if match:
                text = match.group(0)
        data = json.loads(text)
        data["url"] = url
        # NB: `or ""` (not a .get default) — the model may return an explicit
        # JSON null for "description", which .get's default would not replace
        # and slicing None would raise TypeError.
        data["raw_text"] = (data.get("description") or "")[:5000]
        if not data.get("bullets"):
            data["bullets"] = []
        if not data.get("images"):
            data["images"] = []
        return data
    except json.JSONDecodeError:
        # Gemini returned text but not valid JSON — extract what we can.
        # `response` is guaranteed bound here: JSONDecodeError can only come
        # from json.loads, which runs after the API call succeeded.
        raw = response.text or ""
        log.warning("Gemini returned non-JSON, extracting manually")
        data = {
            "url": url,
            "title": "",
            "brand": "",
            "price": "",
            "description": raw[:3000],
            "raw_text": raw[:5000],
            "bullets": [],
            "images": [],
        }
        # Try to extract a title from the free-form text: an explicit
        # "title: ..." line wins; otherwise the first plausible short line.
        for line in raw.split("\n"):
            line = line.strip().strip("*#- ")
            if "title" in line.lower() and ":" in line:
                data["title"] = line.split(":", 1)[1].strip().strip('"')
                break
            if len(line) > 10 and len(line) < 100 and not data["title"]:
                data["title"] = line
        return data
    except Exception as e:
        log.error(f"Gemini scrape failed: {e}")
        raise
# ═══════════════════════════════════════════════════════════════
# JustVitamins scraper (Tier 1 only — it's our own site)
# ═══════════════════════════════════════════════════════════════
def scrape_product(url: str) -> dict:
    """Scrape a JV product URL and return structured product data.

    Returns a dict with keys: title, subtitle, price, sku, images, benefits,
    quantity, per_unit_cost, description, health_claims, category, url.
    Raises requests.HTTPError on non-2xx responses and RuntimeError when the
    URL redirects to the homepage (product removed/renamed).
    Selectors (.ProdDet, .pricec, .breadC, #lightboxGallery, ...) are
    specific to the justvitamins.co.uk page templates.
    """
    sess = _make_session()
    r = sess.get(url, timeout=20, allow_redirects=True)
    r.raise_for_status()
    # Detect redirect to homepage (product removed/renamed): after rstrip("/")
    # the final URL of a homepage redirect ends with the bare domain.
    final = r.url.rstrip("/")
    if final.endswith("justvitamins.co.uk") or final.endswith("justvitamins.co.uk/"):
        raise RuntimeError(f"Product page redirected to homepage — URL may be outdated: {url}")
    soup = BeautifulSoup(r.text, "html.parser")
    data = {}
    # Title — prefer the schema.org-annotated h1, fall back to any h1
    h1 = soup.select_one("h1[itemprop='name']") or soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""
    # Subtitle
    h2 = soup.select_one(".ProdDet h2")
    data["subtitle"] = h2.get_text(strip=True) if h2 else ""
    # Price — try itemprop (any element), then class, then regex
    price_el = soup.select_one("[itemprop='price']")
    if price_el:
        # meta tags carry the price in @content; visible elements in text
        price_val = price_el.get("content") or price_el.get_text(strip=True)
        price_val = re.sub(r'[^\d.]', '', price_val)
        data["price"] = f"£{price_val}" if price_val else ""
    else:
        # Try common price selectors; for/else falls through to a
        # whole-page regex only when no selector yielded a £ amount
        for sel in [".pricec", ".product-price", "[class*='Price']"]:
            el = soup.select_one(sel)
            if el:
                pm = re.search(r'£[\d.]+', el.get_text())
                if pm:
                    data["price"] = pm.group(0)
                    break
        else:
            price_match = re.search(r'£[\d.]+', soup.get_text())
            data["price"] = price_match.group(0) if price_match else ""
    # SKU
    sku = soup.select_one("meta[itemprop='sku']")
    data["sku"] = sku.get("content", "") if sku else ""
    # Images — try multiple strategies; strategies 1 and 2 accumulate,
    # 3 and 4 only run if nothing was found before them
    images = []
    # 1. itemprop image
    for img in soup.select("img[itemprop='image']"):
        src = img.get("src", "")
        if src:
            if not src.startswith("http"):
                src = "https://images.justvitamins.co.uk" + src
            if src not in images:
                images.append(src)
    # 2. Product gallery links (prefer the /Normal/ size over /Fullsize/)
    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a, .ProdImg a"):
        href = a.get("href", "")
        if href:
            if not href.startswith("http"):
                href = "https://www.justvitamins.co.uk" + href
            full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
            if full not in images and href not in images:
                images.append(full if "Normal" in full else href)
    # 3. Product image in img tags with justvitamins CDN
    if not images:
        for img in soup.select("img[src*='justvitamins'], img[src*='product-images']"):
            src = img.get("src", "")
            if src and src not in images:
                if not src.startswith("http"):
                    src = "https://images.justvitamins.co.uk" + src
                images.append(src)
    # 4. OG image fallback
    if not images:
        og = soup.select_one("meta[property='og:image']")
        if og and og.get("content"):
            images.append(og["content"])
    data["images"] = images
    # Key benefits — short list items, excluding marketing/upsell copy
    benefits = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if txt and 10 < len(txt) < 120:
            skip = ["subscribe", "save", "free delivery", "pause", "never run out"]
            if not any(s in txt.lower() for s in skip):
                benefits.append(txt)
    # Order-preserving dedupe, capped at 10
    seen = set()
    unique = []
    for b in benefits:
        if b not in seen:
            seen.add(b)
            unique.append(b)
    data["benefits"] = unique[:10]
    # Quantity — first text node like "120 tablets" / "60 capsules"
    qty = ""
    for text in soup.stripped_strings:
        m = re.match(r'(\d+)\s*(tablets?|capsules?|softgels?)', text, re.I)
        if m:
            qty = text.strip()
            break
    data["quantity"] = qty
    # Per unit cost — e.g. "only 7.5p per tablet"
    per_unit = ""
    for text in soup.stripped_strings:
        if re.search(r'only\s+[\d.]+p\s+per', text, re.I):
            per_unit = text.strip()
            break
    data["per_unit_cost"] = per_unit
    # Description — text between the "About this..." heading and the
    # "Product information" heading
    desc_parts = []
    found_about = False
    for el in soup.select(".ProdDet h2, .ProdDet h3, .ProdDet p"):
        txt = el.get_text(strip=True)
        if "about this" in txt.lower():
            found_about = True
            continue
        if "product information" in txt.lower():
            break
        if found_about and txt:
            desc_parts.append(txt)
    data["description"] = "\n".join(desc_parts)
    # EFSA health claims — list items with regulatory claim wording
    claims = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if any(k in txt.lower() for k in ["contributes", "maintenance of normal",
                                          "normal function", "normal absorption"]):
            claims.append(txt)
    # dict.fromkeys dedupes while preserving order
    data["health_claims"] = list(dict.fromkeys(claims))
    # Category from breadcrumbs (index 1 skips the "Home" crumb)
    crumbs = [a.get_text(strip=True) for a in soup.select(".breadC a")]
    data["category"] = crumbs[1] if len(crumbs) >= 2 else ""
    data["url"] = url
    return data
# ═══════════════════════════════════════════════════════════════
# Competitor scraper — bulletproof with fallback chain
# ═══════════════════════════════════════════════════════════════
def scrape_competitor(url: str) -> dict:
    """Scrape any competitor product page. Uses 4-tier fallback to ensure success.

    Returns a dict with url, raw_text, title, meta_description, og_title,
    og_description, price, bullets, images, brand, description and a
    _scrape_method marker. If the HTML yielded too little, the result is
    enriched via Gemini grounding (best-effort; failures are logged).
    Raises RuntimeError when every fetch tier in _fetch_html fails.
    """
    result = _fetch_html(url)
    # If Gemini grounding returned parsed data directly (Tier 4),
    # _fetch_html gives (None, dict) instead of (html, method)
    if isinstance(result[1], dict):
        return result[1]
    html_text, method = result
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text(" ", strip=True)
    data = {"url": url, "raw_text": text[:5000], "_scrape_method": method}
    # Title — try multiple selectors, most specific first; accept a short
    # match only from the last resort plain h1
    title = ""
    for sel in ["h1[itemprop='name']", "h1.product-title", "h1.product-name",
                "h1.pdp__title", "[data-testid='product-title']", "h1"]:
        el = soup.select_one(sel)
        if el:
            title = el.get_text(strip=True)
            if len(title) > 5:
                break
    if not title:
        og = soup.select_one("meta[property='og:title']")
        title = og.get("content", "") if og else ""
    data["title"] = title
    # Meta description
    meta = soup.select_one("meta[name='description']")
    data["meta_description"] = meta.get("content", "") if meta else ""
    # OG data
    og_title = soup.select_one("meta[property='og:title']")
    og_desc = soup.select_one("meta[property='og:description']")
    og_img = soup.select_one("meta[property='og:image']")
    data["og_title"] = og_title.get("content", "") if og_title else ""
    data["og_description"] = og_desc.get("content", "") if og_desc else ""
    # Price — multiple strategies
    price = ""
    # Schema.org microdata / OpenGraph product meta
    for sel in ["meta[itemprop='price']", "[itemprop='price']",
                "meta[property='product:price:amount']"]:
        el = soup.select_one(sel)
        if el:
            price = el.get("content", "") or el.get_text(strip=True)
            if price:
                break
    # JSON-LD structured data
    if not price:
        for script in soup.select("script[type='application/ld+json']"):
            try:
                ld = json.loads(script.string)
                if isinstance(ld, list):
                    ld = ld[0]
                offers = ld.get("offers", {})
                if isinstance(offers, list):
                    offers = offers[0]
                price = str(offers.get("price", ""))
                if price:
                    currency = offers.get("priceCurrency", "GBP")
                    # FIX: EUR previously mapped to "" — euro prices lost
                    # their currency symbol entirely
                    sym = {"GBP": "£", "USD": "$", "EUR": "€"}.get(currency, "")
                    price = f"{sym}{price}"
                    break
            except Exception:
                # Malformed / non-product JSON-LD block — try the next one
                continue
    # Regex fallback over the whole page text
    if not price:
        price_match = re.search(r'[$£€][\d,.]+', text)
        price = price_match.group(0) if price_match else ""
    data["price"] = price
    # Bullets / features — mid-length list items, skipping nav/menu lists
    bullets = []
    for li in soup.select("li"):
        txt = li.get_text(strip=True)
        if 15 < len(txt) < 300:
            # Skip nav/menu items
            parent = li.parent
            if parent and parent.name in ("ul", "ol"):
                parent_class = " ".join(parent.get("class", []))
                if any(skip in parent_class.lower() for skip in ["nav", "menu", "footer", "breadcrumb"]):
                    continue
            bullets.append(txt)
    data["bullets"] = bullets[:15]
    # Images — try OG first, then product images
    images = []
    # FIX: only append the og:image when it actually has a non-empty
    # content attribute (previously "" could be appended)
    og_content = og_img.get("content", "") if og_img else ""
    if og_content:
        images.append(og_content)
    for img in soup.select("img[src]"):
        src = img.get("src", "")
        alt = (img.get("alt", "") or "").lower()
        # Prioritise product images
        if src and any(ext in src.lower() for ext in [".jpg", ".png", ".webp"]):
            if not src.startswith("http"):
                src = urljoin(url, src)
            # Skip tiny icons, tracking pixels, logos
            width = img.get("width", "")
            if width and width.isdigit() and int(width) < 50:
                continue
            if any(skip in src.lower() for skip in ["icon", "logo", "pixel", "tracking", "badge"]):
                continue
            if src not in images:
                images.append(src)
    data["images"] = images[:8]
    # Brand from schema or common selectors
    brand = ""
    for sel in ["[itemprop='brand']", ".product-brand", "[data-testid='brand']"]:
        el = soup.select_one(sel)
        if el:
            brand = el.get_text(strip=True)
            if brand:
                break
    if not brand:
        # Try JSON-LD — brand may be a plain string or a {"name": ...} object
        for script in soup.select("script[type='application/ld+json']"):
            try:
                ld = json.loads(script.string)
                if isinstance(ld, list):
                    ld = ld[0]
                b = ld.get("brand", {})
                brand = b.get("name", "") if isinstance(b, dict) else str(b)
                if brand:
                    break
            except Exception:
                continue
    data["brand"] = brand
    # Description — first few substantial paragraphs
    paras = []
    for p in soup.select("p"):
        txt = p.get_text(strip=True)
        if 30 < len(txt) < 500:
            paras.append(txt)
    data["description"] = "\n".join(paras[:8])
    # If we got very little from HTML, enrich with Gemini (fills only the
    # gaps — existing non-empty fields are kept)
    if not data["title"] or (len(data.get("description", "")) < 50 and not data.get("bullets")):
        try:
            gemini_data = _gemini_scrape(url)
            if gemini_data.get("title"):
                # Merge — Gemini fills gaps
                for k, v in gemini_data.items():
                    if k in data and not data[k]:
                        data[k] = v
                    elif k not in data:
                        data[k] = v
                data["_enriched_by"] = "gemini"
        except Exception as e:
            log.warning(f"Gemini enrichment failed: {e}")
    return data
if __name__ == "__main__":
import sys
logging.basicConfig(level=logging.INFO)
url = sys.argv[1] if len(sys.argv) > 1 else \
"https://www.justvitamins.co.uk/Bone-Health/Super-Strength-Vitamin-D3-4000iu-K2-MK-7-100mcg.aspx"
if "justvitamins" in url:
d = scrape_product(url)
else:
d = scrape_competitor(url)
print(json.dumps(d, indent=2, default=str))