Files
justvitamin/scraper.py
Omair Saleh ab875cd4d9 fix: JV scraper broken by brotli encoding + improved robustness
ROOT CAUSE: _browser_headers() included 'Accept-Encoding: gzip, deflate, br'
but the container has no brotli decoder. Server sent compressed response
that requests couldn't decode → garbled HTML → empty title → 'Could not
find product' error on Demo A and Demo C.

FIXES:
- Remove 'br' from Accept-Encoding (use 'gzip, deflate' only)
- Price extraction: try itemprop on any element, then .pricec class, then regex
- Image extraction: multi-strategy (itemprop, gallery links, CDN pattern, OG)
- Detect homepage redirect (product removed/renamed) → clear error message
- Increase timeout from 15s to 20s for JV product scraping

TESTED:
- D3+K2: Title ✓, Price £12.95 ✓, 10 benefits ✓, 3 images ✓
- Vitamin D3 4000iu: Title ✓, £8.95 ✓, 6 benefits ✓, 7 images ✓
- B12: Title ✓, £11.95 ✓, 10 benefits ✓, 7 images ✓
- Removed product: clean error 'redirected to homepage'
2026-03-02 22:43:38 +08:00

588 lines
22 KiB
Python

"""Scrape product pages — JustVitamins specific + generic competitor.
Competitor scraping has a 3-tier fallback chain:
1. Direct HTTP with full browser headers + session cookies
2. Retry with alternate User-Agent / headers
3. Gemini AI URL grounding (reads the page via Google's infrastructure)
This ensures ~100% success even against Cloudflare, CAPTCHAs, JS-rendered
pages, and anti-bot systems.
"""
import os, re, json, time, logging
from urllib.parse import urljoin
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
log = logging.getLogger(__name__)
# ── Browser-like headers ─────────────────────────────────────
# User-Agent strings rotated across the fetch tiers in _fetch_html:
# Tier 1 uses Chrome, Tier 2 Firefox, Tier 3 Safari on macOS.
_UA_CHROME = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
_UA_FIREFOX = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) "
    "Gecko/20100101 Firefox/133.0"
)
_UA_MAC = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/18.2 Safari/605.1.15"
)
def _browser_headers(ua=_UA_CHROME):
    """Return a header dict that mimics a real browser navigation request.

    *ua* selects the User-Agent string; all other headers are fixed.
    Brotli ('br') is deliberately absent from Accept-Encoding because the
    runtime has no brotli decoder installed.
    """
    header_pairs = (
        ("User-Agent", ua),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-GB,en;q=0.9,en-US;q=0.8"),
        ("Accept-Encoding", "gzip, deflate"),  # no brotli — needs pip brotli
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
        ("Sec-Fetch-Dest", "document"),
        ("Sec-Fetch-Mode", "navigate"),
        ("Sec-Fetch-Site", "none"),
        ("Sec-Fetch-User", "?1"),
        ("Cache-Control", "max-age=0"),
        ("DNT", "1"),
    )
    return dict(header_pairs)
def _make_session(ua=_UA_CHROME):
    """Build a requests.Session pre-configured with retries and browser headers.

    Retries transient 5xx responses twice with exponential backoff before
    giving up; headers come from _browser_headers(ua).
    """
    session = requests.Session()
    policy = Retry(total=2, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=policy))
    session.headers.update(_browser_headers(ua))
    return session
def _fetch_html(url: str) -> tuple:
    """Fetch HTML with 4-tier fallback.

    Returns (html_text, method_used) when one of the direct tiers (1-3)
    succeeds, or (None, parsed_dict) when Tier 4 (Gemini grounding) extracts
    the product data itself — callers must check the second element's type.
    For 404s and blocked pages, tiers 2-3 are skipped and we go straight to
    Gemini Google Search grounding. Raises RuntimeError if every tier fails,
    with a summary of all per-tier errors."""
    errors = []
    got_404 = False
    got_blocked = False
    # ── Tier 1: Chrome UA with full headers ──────────────────
    try:
        sess = _make_session(_UA_CHROME)
        r = sess.get(url, timeout=20, allow_redirects=True)
        # >1000 chars guards against empty/stub bodies that would otherwise
        # count as success
        if r.status_code == 200 and len(r.text) > 1000:
            if not _is_blocked(r.text):
                return (r.text, "direct-chrome")
            got_blocked = True
            errors.append(f"Tier 1: blocked ({_block_reason(r.text)})")
        elif r.status_code == 404:
            got_404 = True
            errors.append(f"Tier 1: 404 Not Found")
        elif r.status_code == 403:
            got_blocked = True
            errors.append(f"Tier 1: 403 Forbidden")
        else:
            errors.append(f"Tier 1: HTTP {r.status_code}")
    except Exception as e:
        errors.append(f"Tier 1: {e}")
    # If it's a 404 or blocked, skip tier 2-3 — go straight to Gemini
    # (retrying with a different UA won't revive a dead page)
    if not (got_404 or got_blocked):
        # ── Tier 2: Firefox UA ───────────────────────────────
        try:
            sess = _make_session(_UA_FIREFOX)
            r = sess.get(url, timeout=20, allow_redirects=True)
            if r.status_code == 200 and len(r.text) > 1000:
                if not _is_blocked(r.text):
                    return (r.text, "direct-firefox")
                got_blocked = True
                errors.append(f"Tier 2: blocked ({_block_reason(r.text)})")
            else:
                errors.append(f"Tier 2: HTTP {r.status_code}")
        except Exception as e:
            errors.append(f"Tier 2: {e}")
        # ── Tier 3: Safari UA with cookie pre-fetch ──────────
        try:
            sess = _make_session(_UA_MAC)
            domain = re.match(r'(https?://[^/]+)', url)
            if domain:
                # Visit the homepage first so the session picks up any
                # cookies the site expects; failures here are non-fatal.
                try:
                    sess.get(domain.group(1) + "/", timeout=10, allow_redirects=True)
                except Exception:
                    pass
            r = sess.get(url, timeout=20, allow_redirects=True)
            if r.status_code == 200 and len(r.text) > 1000:
                if not _is_blocked(r.text):
                    return (r.text, "direct-safari")
                errors.append(f"Tier 3: blocked ({_block_reason(r.text)})")
            else:
                errors.append(f"Tier 3: HTTP {r.status_code}")
        except Exception as e:
            errors.append(f"Tier 3: {e}")
    # ── Tier 4: Gemini + Google Search grounding ─────────────
    # Works for: dead pages (finds similar product), blocked pages,
    # JS-rendered pages, CAPTCHA pages
    try:
        log.info(f"Falling back to Gemini grounding for {url}")
        data = _gemini_scrape(url)
        if data and data.get("title"):
            data["_scrape_method"] = "gemini-grounding"
            data["_scrape_errors"] = errors
            return (None, data)  # Return parsed data directly
        errors.append("Tier 4 (Gemini): no title found")
    except Exception as e:
        errors.append(f"Tier 4 (Gemini): {e}")
    # All tiers failed
    error_summary = " | ".join(errors)
    raise RuntimeError(f"All scraping methods failed for {url}: {error_summary}")
def _is_blocked(html: str) -> bool:
"""Detect anti-bot / CAPTCHA / access denied pages."""
lower = html.lower()
signals = [
"pardon our interruption",
"access denied",
"robot check",
"captcha",
"please verify you are a human",
"enable javascript and cookies",
"just a moment", # Cloudflare
"checking your browser",
"attention required",
"automated access",
"unusual traffic",
]
return any(s in lower for s in signals)
def _block_reason(html: str) -> str:
lower = html.lower()
if "cloudflare" in lower or "just a moment" in lower:
return "Cloudflare"
if "captcha" in lower or "robot check" in lower:
return "CAPTCHA"
if "pardon our interruption" in lower:
return "Bot detection"
if "access denied" in lower:
return "Access denied"
return "Anti-bot"
# ── Gemini URL grounding (Tier 4 fallback) ───────────────────
def _gemini_scrape(url: str) -> dict:
    """Use Gemini with Google Search grounding to extract product data.

    Gemini searches for the URL/product through Google's infrastructure,
    bypassing anti-bot systems. Returns a dict with at least 'url',
    'raw_text', 'bullets' and 'images' keys; 'title' may be empty if nothing
    usable was found. Raises RuntimeError when GEMINI_API_KEY is unset and
    re-raises any API error other than malformed JSON output.
    """
    api_key = os.environ.get("GEMINI_API_KEY", "")
    if not api_key:
        raise RuntimeError("No GEMINI_API_KEY for fallback scraping")
    from google import genai
    from google.genai import types
    client = genai.Client(api_key=api_key)
    prompt = f"""Find and extract product information from this URL: {url}
Search for this exact product page and extract all available data.
If the product page no longer exists (404), search for the same or similar product from the same brand.
Return JSON:
{{
"title": "Product title/name",
"brand": "Brand name",
"price": "Price with currency symbol (e.g. £9.99)",
"description": "Product description text (up to 2000 chars)",
"bullets": ["Feature or benefit 1", "Feature 2", "Feature 3"],
"images": [],
"meta_description": "What this product page would say",
"ingredients": "Key ingredients if it's a supplement",
"rating": "Star rating if known",
"review_count": "Number of reviews if known",
"url_status": "live" or "dead/redirected" or "blocked"
}}
Be thorough and specific — this is for competitive analysis."""
    try:
        # Can't use response_mime_type=json with tools, so parse manually
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=0.1,
                tools=[types.Tool(google_search=types.GoogleSearch())],
            ),
        )
        text = response.text or ""
        # Extract JSON from response: prefer a fenced ```json block, else
        # the outermost brace-delimited span.
        match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
        if match:
            text = match.group(1)
        else:
            match = re.search(r'\{.*\}', text, re.DOTALL)
            if match:
                text = match.group(0)
        data = json.loads(text)
        data["url"] = url
        # NB: `or ""` (not a .get default) — the model may return an explicit
        # JSON null for "description", which .get's default would not replace
        # and slicing None would raise TypeError.
        data["raw_text"] = (data.get("description") or "")[:5000]
        if not data.get("bullets"):
            data["bullets"] = []
        if not data.get("images"):
            data["images"] = []
        return data
    except json.JSONDecodeError:
        # Gemini returned text but not valid JSON — extract what we can.
        # `response` is guaranteed bound here: JSONDecodeError can only come
        # from json.loads, which runs after the API call succeeded.
        raw = response.text or ""
        log.warning("Gemini returned non-JSON, extracting manually")
        data = {
            "url": url,
            "title": "",
            "brand": "",
            "price": "",
            "description": raw[:3000],
            "raw_text": raw[:5000],
            "bullets": [],
            "images": [],
        }
        # Try to extract a title from the free-form text: an explicit
        # "title: ..." line wins; otherwise the first plausible short line.
        for line in raw.split("\n"):
            line = line.strip().strip("*#- ")
            if "title" in line.lower() and ":" in line:
                data["title"] = line.split(":", 1)[1].strip().strip('"')
                break
            if len(line) > 10 and len(line) < 100 and not data["title"]:
                data["title"] = line
        return data
    except Exception as e:
        log.error(f"Gemini scrape failed: {e}")
        raise
# ═══════════════════════════════════════════════════════════════
# JustVitamins scraper (Tier 1 only — it's our own site)
# ═══════════════════════════════════════════════════════════════
def scrape_product(url: str) -> dict:
    """Scrape a JV product URL and return structured product data.

    Returns a dict with keys: title, subtitle, price, sku, images, benefits,
    quantity, per_unit_cost, description, health_claims, category, url.
    Raises requests.HTTPError on non-2xx responses and RuntimeError when the
    URL redirects to the homepage (product removed/renamed).
    Selectors (.ProdDet, .pricec, .breadC, #lightboxGallery, ...) are
    specific to the justvitamins.co.uk page templates.
    """
    sess = _make_session()
    r = sess.get(url, timeout=20, allow_redirects=True)
    r.raise_for_status()
    # Detect redirect to homepage (product removed/renamed): after rstrip("/")
    # the final URL of a homepage redirect ends with the bare domain.
    final = r.url.rstrip("/")
    if final.endswith("justvitamins.co.uk") or final.endswith("justvitamins.co.uk/"):
        raise RuntimeError(f"Product page redirected to homepage — URL may be outdated: {url}")
    soup = BeautifulSoup(r.text, "html.parser")
    data = {}
    # Title — prefer the schema.org-annotated h1, fall back to any h1
    h1 = soup.select_one("h1[itemprop='name']") or soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""
    # Subtitle
    h2 = soup.select_one(".ProdDet h2")
    data["subtitle"] = h2.get_text(strip=True) if h2 else ""
    # Price — try itemprop (any element), then class, then regex
    price_el = soup.select_one("[itemprop='price']")
    if price_el:
        # meta tags carry the price in @content; visible elements in text
        price_val = price_el.get("content") or price_el.get_text(strip=True)
        price_val = re.sub(r'[^\d.]', '', price_val)
        data["price"] = f"£{price_val}" if price_val else ""
    else:
        # Try common price selectors; for/else falls through to a
        # whole-page regex only when no selector yielded a £ amount
        for sel in [".pricec", ".product-price", "[class*='Price']"]:
            el = soup.select_one(sel)
            if el:
                pm = re.search(r'£[\d.]+', el.get_text())
                if pm:
                    data["price"] = pm.group(0)
                    break
        else:
            price_match = re.search(r'£[\d.]+', soup.get_text())
            data["price"] = price_match.group(0) if price_match else ""
    # SKU
    sku = soup.select_one("meta[itemprop='sku']")
    data["sku"] = sku.get("content", "") if sku else ""
    # Images — try multiple strategies; strategies 1 and 2 accumulate,
    # 3 and 4 only run if nothing was found before them
    images = []
    # 1. itemprop image
    for img in soup.select("img[itemprop='image']"):
        src = img.get("src", "")
        if src:
            if not src.startswith("http"):
                src = "https://images.justvitamins.co.uk" + src
            if src not in images:
                images.append(src)
    # 2. Product gallery links (prefer the /Normal/ size over /Fullsize/)
    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a, .ProdImg a"):
        href = a.get("href", "")
        if href:
            if not href.startswith("http"):
                href = "https://www.justvitamins.co.uk" + href
            full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
            if full not in images and href not in images:
                images.append(full if "Normal" in full else href)
    # 3. Product image in img tags with justvitamins CDN
    if not images:
        for img in soup.select("img[src*='justvitamins'], img[src*='product-images']"):
            src = img.get("src", "")
            if src and src not in images:
                if not src.startswith("http"):
                    src = "https://images.justvitamins.co.uk" + src
                images.append(src)
    # 4. OG image fallback
    if not images:
        og = soup.select_one("meta[property='og:image']")
        if og and og.get("content"):
            images.append(og["content"])
    data["images"] = images
    # Key benefits — short list items, excluding marketing/upsell copy
    benefits = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if txt and 10 < len(txt) < 120:
            skip = ["subscribe", "save", "free delivery", "pause", "never run out"]
            if not any(s in txt.lower() for s in skip):
                benefits.append(txt)
    # Order-preserving dedupe, capped at 10
    seen = set()
    unique = []
    for b in benefits:
        if b not in seen:
            seen.add(b)
            unique.append(b)
    data["benefits"] = unique[:10]
    # Quantity — first text node like "120 tablets" / "60 capsules"
    qty = ""
    for text in soup.stripped_strings:
        m = re.match(r'(\d+)\s*(tablets?|capsules?|softgels?)', text, re.I)
        if m:
            qty = text.strip()
            break
    data["quantity"] = qty
    # Per unit cost — e.g. "only 7.5p per tablet"
    per_unit = ""
    for text in soup.stripped_strings:
        if re.search(r'only\s+[\d.]+p\s+per', text, re.I):
            per_unit = text.strip()
            break
    data["per_unit_cost"] = per_unit
    # Description — text between the "About this..." heading and the
    # "Product information" heading
    desc_parts = []
    found_about = False
    for el in soup.select(".ProdDet h2, .ProdDet h3, .ProdDet p"):
        txt = el.get_text(strip=True)
        if "about this" in txt.lower():
            found_about = True
            continue
        if "product information" in txt.lower():
            break
        if found_about and txt:
            desc_parts.append(txt)
    data["description"] = "\n".join(desc_parts)
    # EFSA health claims — list items with regulatory claim wording
    claims = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if any(k in txt.lower() for k in ["contributes", "maintenance of normal",
                                          "normal function", "normal absorption"]):
            claims.append(txt)
    # dict.fromkeys dedupes while preserving order
    data["health_claims"] = list(dict.fromkeys(claims))
    # Category from breadcrumbs (index 1 skips the "Home" crumb)
    crumbs = [a.get_text(strip=True) for a in soup.select(".breadC a")]
    data["category"] = crumbs[1] if len(crumbs) >= 2 else ""
    data["url"] = url
    return data
# ═══════════════════════════════════════════════════════════════
# Competitor scraper — bulletproof with fallback chain
# ═══════════════════════════════════════════════════════════════
def scrape_competitor(url: str) -> dict:
    """Scrape any competitor product page. Uses 4-tier fallback to ensure success.

    Returns a dict with url, raw_text, title, meta_description, og_title,
    og_description, price, bullets, images, brand, description and a
    _scrape_method marker. If the HTML yielded too little, the result is
    enriched via Gemini grounding (best-effort; failures are logged).
    Raises RuntimeError when every fetch tier in _fetch_html fails.
    """
    result = _fetch_html(url)
    # If Gemini grounding returned parsed data directly (Tier 4),
    # _fetch_html gives (None, dict) instead of (html, method)
    if isinstance(result[1], dict):
        return result[1]
    html_text, method = result
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text(" ", strip=True)
    data = {"url": url, "raw_text": text[:5000], "_scrape_method": method}
    # Title — try multiple selectors, most specific first; accept a short
    # match only from the last resort plain h1
    title = ""
    for sel in ["h1[itemprop='name']", "h1.product-title", "h1.product-name",
                "h1.pdp__title", "[data-testid='product-title']", "h1"]:
        el = soup.select_one(sel)
        if el:
            title = el.get_text(strip=True)
            if len(title) > 5:
                break
    if not title:
        og = soup.select_one("meta[property='og:title']")
        title = og.get("content", "") if og else ""
    data["title"] = title
    # Meta description
    meta = soup.select_one("meta[name='description']")
    data["meta_description"] = meta.get("content", "") if meta else ""
    # OG data
    og_title = soup.select_one("meta[property='og:title']")
    og_desc = soup.select_one("meta[property='og:description']")
    og_img = soup.select_one("meta[property='og:image']")
    data["og_title"] = og_title.get("content", "") if og_title else ""
    data["og_description"] = og_desc.get("content", "") if og_desc else ""
    # Price — multiple strategies
    price = ""
    # Schema.org microdata / OpenGraph product meta
    for sel in ["meta[itemprop='price']", "[itemprop='price']",
                "meta[property='product:price:amount']"]:
        el = soup.select_one(sel)
        if el:
            price = el.get("content", "") or el.get_text(strip=True)
            if price:
                break
    # JSON-LD structured data
    if not price:
        for script in soup.select("script[type='application/ld+json']"):
            try:
                ld = json.loads(script.string)
                if isinstance(ld, list):
                    ld = ld[0]
                offers = ld.get("offers", {})
                if isinstance(offers, list):
                    offers = offers[0]
                price = str(offers.get("price", ""))
                if price:
                    currency = offers.get("priceCurrency", "GBP")
                    # FIX: EUR previously mapped to "" — euro prices lost
                    # their currency symbol entirely
                    sym = {"GBP": "£", "USD": "$", "EUR": "€"}.get(currency, "")
                    price = f"{sym}{price}"
                    break
            except Exception:
                # Malformed / non-product JSON-LD block — try the next one
                continue
    # Regex fallback over the whole page text
    if not price:
        price_match = re.search(r'[$£€][\d,.]+', text)
        price = price_match.group(0) if price_match else ""
    data["price"] = price
    # Bullets / features — mid-length list items, skipping nav/menu lists
    bullets = []
    for li in soup.select("li"):
        txt = li.get_text(strip=True)
        if 15 < len(txt) < 300:
            # Skip nav/menu items
            parent = li.parent
            if parent and parent.name in ("ul", "ol"):
                parent_class = " ".join(parent.get("class", []))
                if any(skip in parent_class.lower() for skip in ["nav", "menu", "footer", "breadcrumb"]):
                    continue
            bullets.append(txt)
    data["bullets"] = bullets[:15]
    # Images — try OG first, then product images
    images = []
    # FIX: only append the og:image when it actually has a non-empty
    # content attribute (previously "" could be appended)
    og_content = og_img.get("content", "") if og_img else ""
    if og_content:
        images.append(og_content)
    for img in soup.select("img[src]"):
        src = img.get("src", "")
        alt = (img.get("alt", "") or "").lower()
        # Prioritise product images
        if src and any(ext in src.lower() for ext in [".jpg", ".png", ".webp"]):
            if not src.startswith("http"):
                src = urljoin(url, src)
            # Skip tiny icons, tracking pixels, logos
            width = img.get("width", "")
            if width and width.isdigit() and int(width) < 50:
                continue
            if any(skip in src.lower() for skip in ["icon", "logo", "pixel", "tracking", "badge"]):
                continue
            if src not in images:
                images.append(src)
    data["images"] = images[:8]
    # Brand from schema or common selectors
    brand = ""
    for sel in ["[itemprop='brand']", ".product-brand", "[data-testid='brand']"]:
        el = soup.select_one(sel)
        if el:
            brand = el.get_text(strip=True)
            if brand:
                break
    if not brand:
        # Try JSON-LD — brand may be a plain string or a {"name": ...} object
        for script in soup.select("script[type='application/ld+json']"):
            try:
                ld = json.loads(script.string)
                if isinstance(ld, list):
                    ld = ld[0]
                b = ld.get("brand", {})
                brand = b.get("name", "") if isinstance(b, dict) else str(b)
                if brand:
                    break
            except Exception:
                continue
    data["brand"] = brand
    # Description — first few substantial paragraphs
    paras = []
    for p in soup.select("p"):
        txt = p.get_text(strip=True)
        if 30 < len(txt) < 500:
            paras.append(txt)
    data["description"] = "\n".join(paras[:8])
    # If we got very little from HTML, enrich with Gemini (fills only the
    # gaps — existing non-empty fields are kept)
    if not data["title"] or (len(data.get("description", "")) < 50 and not data.get("bullets")):
        try:
            gemini_data = _gemini_scrape(url)
            if gemini_data.get("title"):
                # Merge — Gemini fills gaps
                for k, v in gemini_data.items():
                    if k in data and not data[k]:
                        data[k] = v
                    elif k not in data:
                        data[k] = v
                data["_enriched_by"] = "gemini"
        except Exception as e:
            log.warning(f"Gemini enrichment failed: {e}")
    return data
if __name__ == "__main__":
import sys
logging.basicConfig(level=logging.INFO)
url = sys.argv[1] if len(sys.argv) > 1 else \
"https://www.justvitamins.co.uk/Bone-Health/Super-Strength-Vitamin-D3-4000iu-K2-MK-7-100mcg.aspx"
if "justvitamins" in url:
d = scrape_product(url)
else:
d = scrape_competitor(url)
print(json.dumps(d, indent=2, default=str))