# justvitamin/scraper.py

"""Scrape product pages — JustVitamins specific + generic competitor."""

import requests
from bs4 import BeautifulSoup
import re, json

# Desktop-Chrome User-Agent string sent with every page fetch in this module.
_DESKTOP_CHROME_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)

# Request headers shared by scrape_product() and scrape_competitor().
HEADERS = {"User-Agent": _DESKTOP_CHROME_UA}


def scrape_product(url: str) -> dict:
    """Scrape a JV product URL and return structured product data.

    Args:
        url: Absolute URL of the product page to fetch.

    Returns:
        A dict with the keys ``title``, ``subtitle``, ``price``, ``sku``,
        ``images``, ``benefits``, ``quantity``, ``per_unit_cost``,
        ``description``, ``health_claims``, ``category`` and ``url``.
        ``images``, ``benefits`` and ``health_claims`` are lists of strings;
        every other value is a string, empty when the page lacks the data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On any other fetch failure, including
            exceeding the 15 second timeout.
    """
    from urllib.parse import urljoin

    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    data = {}

    # Title: prefer the microdata-tagged heading, fall back to any <h1>.
    h1 = soup.select_one("h1[itemprop='name']") or soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""

    # Subtitle
    h2 = soup.select_one(".ProdDet h2")
    data["subtitle"] = h2.get_text(strip=True) if h2 else ""

    # Price from offer microdata; if the tag is missing or carries no amount,
    # fall back to the first £-amount in the page text rather than emitting a
    # bare "£".
    price_meta = soup.select_one("meta[itemprop='price']")
    amount = (price_meta.get("content") or "").strip() if price_meta else ""
    if amount:
        data["price"] = f"£{amount}"
    else:
        # Separators only count when followed by digits, so sentence
        # punctuation after the amount ("£12.99.") is not captured.
        price_match = re.search(r"£\d+(?:[.,]\d+)*", soup.get_text())
        data["price"] = price_match.group(0) if price_match else ""

    # SKU
    sku = soup.select_one("meta[itemprop='sku']")
    data["sku"] = sku.get("content", "") if sku else ""

    # Images: main image first, then gallery/thumbnail links.
    # urljoin leaves absolute URLs alone and correctly resolves root-relative,
    # protocol-relative ("//host/...") and slash-less relative paths.
    images = []
    main_img = soup.select_one("img[itemprop='image']")
    main_src = main_img.get("src", "") if main_img else ""
    if main_src:  # never record an empty entry for a src-less <img>
        images.append(urljoin("https://images.justvitamins.co.uk", main_src))
    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"):
        href = a.get("href", "")
        if not href:
            continue
        href = urljoin("https://www.justvitamins.co.uk", href)
        # Swap the full-size path segment for the "Normal" one; when there is
        # nothing to swap, `full` is simply `href`.
        full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
        if full not in images and href not in images:
            images.append(full)
    data["images"] = images

    # Text of every list item in the product-detail area (used for both the
    # benefits and the health-claims extraction below).
    item_texts = [li.get_text(strip=True) for li in soup.select(".ProdDet li")]

    # Key benefits: mid-length bullets that are not subscription/delivery
    # marketing, de-duplicated in page order, capped at 10.
    skip = ("subscribe", "save", "free delivery", "pause", "never run out")
    benefits = [
        txt
        for txt in item_texts
        if 10 < len(txt) < 120 and not any(s in txt.lower() for s in skip)
    ]
    data["benefits"] = list(dict.fromkeys(benefits))[:10]

    page_strings = list(soup.stripped_strings)

    # Quantity: first text node that starts with e.g. "90 tablets".
    quantity_re = re.compile(r"(\d+)\s*(tablets?|capsules?|softgels?)", re.I)
    data["quantity"] = next((s for s in page_strings if quantity_re.match(s)), "")

    # Per unit cost: first text node containing e.g. "only 5.5p per".
    per_unit_re = re.compile(r"only\s+[\d.]+p\s+per", re.I)
    data["per_unit_cost"] = next(
        (s for s in page_strings if per_unit_re.search(s)), ""
    )

    # Description: everything between the "about this" heading and the
    # "product information" heading.
    desc_parts = []
    found_about = False
    for el in soup.select(".ProdDet h2, .ProdDet h3, .ProdDet p"):
        txt = el.get_text(strip=True)
        lowered = txt.lower()
        if "about this" in lowered:
            found_about = True
            continue
        if "product information" in lowered:
            break
        if found_about and txt:
            desc_parts.append(txt)
    data["description"] = "\n".join(desc_parts)

    # EFSA health claims: list items using the typical claim wording.
    claim_markers = (
        "contributes",
        "maintenance of normal",
        "normal function",
        "normal absorption",
    )
    claims = [
        txt for txt in item_texts if any(k in txt.lower() for k in claim_markers)
    ]
    data["health_claims"] = list(dict.fromkeys(claims))

    # Category from breadcrumbs: the second breadcrumb link, when present.
    crumbs = [a.get_text(strip=True) for a in soup.select(".breadC a")]
    data["category"] = crumbs[1] if len(crumbs) >= 2 else ""

    data["url"] = url
    return data


def scrape_competitor(url: str) -> dict:
    """Scrape any ecommerce product page and extract what we can.

    Args:
        url: Absolute URL of the page to fetch. Relative image paths are
            resolved against it.

    Returns:
        A dict with the keys ``url``, ``raw_text`` (first 5000 characters of
        the page text), ``title``, ``meta_description``, ``og_title``,
        ``og_description``, ``price``, ``bullets`` (at most 15), ``images``
        (at most 5 absolute URLs), ``brand`` and ``description``. String
        values are empty when the page lacks the data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On any other fetch failure, including
            exceeding the 15 second timeout.
    """
    from urllib.parse import urljoin

    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    text = soup.get_text(" ", strip=True)

    def meta_content(selector):
        """Return the ``content`` attribute of the first match, or ""."""
        tag = soup.select_one(selector)
        return tag.get("content", "") if tag else ""

    data = {"url": url, "raw_text": text[:5000]}

    # Title
    h1 = soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""

    # Meta description
    data["meta_description"] = meta_content("meta[name='description']")

    # OG data
    data["og_title"] = meta_content("meta[property='og:title']")
    data["og_description"] = meta_content("meta[property='og:description']")

    # Price — try schema.org microdata; if it is absent or empty, fall back to
    # the first currency amount in the page text.
    price = meta_content("meta[itemprop='price']").strip()
    if not price:
        # Separators only count when followed by digits, so a lone symbol or
        # trailing punctuation ("$19.99.") is not captured.
        price_match = re.search(r"[$£€]\d+(?:[.,]\d+)*", text)
        price = price_match.group(0) if price_match else ""
    data["price"] = price

    # Bullets / features: mid-length list items, first 15 in page order.
    item_texts = (li.get_text(strip=True) for li in soup.select("li"))
    data["bullets"] = [txt for txt in item_texts if 15 < len(txt) < 200][:15]

    # Images: first 5 distinct raster-image URLs, made absolute.
    image_exts = (".jpg", ".jpeg", ".png", ".webp")
    images = []
    for img in soup.select("img[src]"):
        src = img.get("src", "")
        if not src or not any(ext in src.lower() for ext in image_exts):
            continue
        # urljoin leaves absolute URLs alone and resolves relative and
        # protocol-relative ones against the page URL.
        src = urljoin(url, src)
        if src not in images:
            images.append(src)
            if len(images) == 5:
                break  # only the first five are kept
    data["images"] = images

    # Brand from schema: visible text, or the content attribute when the
    # brand is declared with a <meta itemprop="brand" content="..."> tag.
    brand = soup.select_one("[itemprop='brand']")
    data["brand"] = (
        (brand.get_text(strip=True) or brand.get("content", "")) if brand else ""
    )

    # Description paragraphs: first 8 mid-length <p> texts.
    para_texts = (p.get_text(strip=True) for p in soup.select("p"))
    paras = [txt for txt in para_texts if 30 < len(txt) < 500]
    data["description"] = "\n".join(paras[:8])

    return data


# Command-line entry point: scrape the URL given as the first argument (or a
# sample JustVitamins product page) and print the result as indented JSON.
if __name__ == "__main__":
    import sys
    from urllib.parse import urlparse

    url = sys.argv[1] if len(sys.argv) > 1 else (
        "https://www.justvitamins.co.uk/Bone-Health/Super-Strength-Vitamin-D3-4000iu-K2-MK-7-100mcg.aspx"
    )
    # Route on the hostname only, so a competitor URL that merely mentions
    # "justvitamins" in its path or query is not sent to the JV scraper.
    host = urlparse(url).hostname or ""
    if "justvitamins" in host:
        d = scrape_product(url)
    else:
        d = scrape_competitor(url)
    print(json.dumps(d, indent=2))