"""Scrape product pages — JustVitamins specific + generic competitor.""" import requests from bs4 import BeautifulSoup import re, json HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" } def scrape_product(url: str) -> dict: """Scrape a JV product URL and return structured product data.""" r = requests.get(url, headers=HEADERS, timeout=15) r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") data = {} # Title h1 = soup.select_one("h1[itemprop='name']") or soup.select_one("h1") data["title"] = h1.get_text(strip=True) if h1 else "" # Subtitle h2 = soup.select_one(".ProdDet h2") data["subtitle"] = h2.get_text(strip=True) if h2 else "" # Price from offer microdata offer_price = soup.select_one("meta[itemprop='price']") if offer_price: data["price"] = f"£{offer_price.get('content', '')}" else: price_match = re.search(r'£[\d.]+', soup.get_text()) data["price"] = price_match.group(0) if price_match else "" # SKU sku = soup.select_one("meta[itemprop='sku']") data["sku"] = sku.get("content", "") if sku else "" # Images images = [] main_img = soup.select_one("img[itemprop='image']") if main_img: src = main_img.get("src", "") if src and not src.startswith("http"): src = "https://images.justvitamins.co.uk" + src images.append(src) for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"): href = a.get("href", "") if href: if not href.startswith("http"): href = "https://www.justvitamins.co.uk" + href full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/") if full not in images and href not in images: images.append(full if "Normal" in full else href) data["images"] = images # Key benefits benefits = [] for li in soup.select(".ProdDet li"): txt = li.get_text(strip=True) if txt and 10 < len(txt) < 120: skip = ["subscribe", "save", "free delivery", "pause", "never run out"] if not any(s in txt.lower() for s in skip): benefits.append(txt) seen = set() unique = [] for b in benefits: if b not in seen: seen.add(b) unique.append(b) data["benefits"] = unique[:10] # Quantity qty = "" for text in soup.stripped_strings: m = re.match(r'(\d+)\s*(tablets?|capsules?|softgels?)', text, re.I) if m: qty = text.strip() break data["quantity"] = qty # Per unit cost per_unit = "" for text in soup.stripped_strings: if re.search(r'only\s+[\d.]+p\s+per', text, re.I): per_unit = text.strip() break data["per_unit_cost"] = per_unit # Description desc_parts = [] found_about = False for el in soup.select(".ProdDet h2, .ProdDet h3, .ProdDet p"): txt = el.get_text(strip=True) if "about this" in txt.lower(): found_about = True continue if "product information" in txt.lower(): break if found_about and txt: desc_parts.append(txt) data["description"] = "\n".join(desc_parts) # EFSA health claims claims = [] for li in soup.select(".ProdDet li"): txt = li.get_text(strip=True) if any(k in txt.lower() for k in ["contributes", "maintenance of normal", "normal function", "normal absorption"]): claims.append(txt) data["health_claims"] = list(dict.fromkeys(claims)) # Category from breadcrumbs crumbs = [a.get_text(strip=True) for a in soup.select(".breadC a")] data["category"] = crumbs[1] if len(crumbs) >= 2 else "" data["url"] = url return data def scrape_competitor(url: str) -> dict: """Scrape any ecommerce product page and extract what we can.""" r = requests.get(url, headers=HEADERS, timeout=15) r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") text = soup.get_text(" ", strip=True) data = {"url": url, "raw_text": text[:5000]} # Title h1 = soup.select_one("h1") data["title"] = h1.get_text(strip=True) if h1 else "" # Meta description meta = soup.select_one("meta[name='description']") data["meta_description"] = meta.get("content", "") if meta else "" # OG data og_title = soup.select_one("meta[property='og:title']") og_desc = soup.select_one("meta[property='og:description']") data["og_title"] = og_title.get("content", "") if og_title else "" data["og_description"] = og_desc.get("content", "") if og_desc else "" # Price — try schema.org, then regex price_meta = soup.select_one("meta[itemprop='price']") if price_meta: data["price"] = price_meta.get("content", "") else: price_match = re.search(r'[$£€][\d,.]+', text) data["price"] = price_match.group(0) if price_match else "" # Bullets / features bullets = [] for li in soup.select("li"): txt = li.get_text(strip=True) if 15 < len(txt) < 200: bullets.append(txt) data["bullets"] = bullets[:15] # Images images = [] for img in soup.select("img[src]"): src = img.get("src", "") if src and any(ext in src.lower() for ext in [".jpg", ".png", ".webp"]): if not src.startswith("http"): from urllib.parse import urljoin src = urljoin(url, src) if src not in images: images.append(src) data["images"] = images[:5] # Brand from schema brand = soup.select_one("[itemprop='brand']") data["brand"] = brand.get_text(strip=True) if brand else "" # Description paragraphs paras = [] for p in soup.select("p"): txt = p.get_text(strip=True) if 30 < len(txt) < 500: paras.append(txt) data["description"] = "\n".join(paras[:8]) return data if __name__ == "__main__": import sys url = sys.argv[1] if len(sys.argv) > 1 else \ "https://www.justvitamins.co.uk/Bone-Health/Super-Strength-Vitamin-D3-4000iu-K2-MK-7-100mcg.aspx" if "justvitamins" in url: d = scrape_product(url) else: d = scrape_competitor(url) print(json.dumps(d, indent=2))