v2: Live Flask app — real Gemini AI demos, Nano Banana image gen, real £19.4M data dashboard
- Flask + gunicorn backend replacing static nginx - 3 live AI demos powered by Gemini 2.5 Flash - Nano Banana + Nano Banana Pro for product image generation - Real JV ecommerce dashboard (728K orders, 230K customers, 4MB data) - AI Infrastructure Proposal + Offer pages - Live product scraper for justvitamins.co.uk + competitor pages - API: /api/scrape, /api/generate-pack, /api/competitor-xray, /api/pdp-surgeon, /api/generate-images
This commit is contained in:
197
scraper.py
Normal file
197
scraper.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""Scrape product pages — JustVitamins specific + generic competitor."""
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import re, json
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
}
|
||||
|
||||
|
||||
def scrape_product(url: str) -> dict:
    """Scrape a justvitamins.co.uk product URL and return structured data.

    Fetches the page and pulls fields from the site's markup and
    schema.org microdata. Every field degrades gracefully: missing
    elements yield "" (or [] for list fields) rather than raising.

    Args:
        url: Full product-page URL on justvitamins.co.uk.

    Returns:
        dict with keys: title, subtitle, price, sku, images, benefits,
        quantity, per_unit_cost, description, health_claims, category, url.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.RequestException: on network failure or timeout.
    """
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    data = {}

    # Title — prefer schema.org microdata, fall back to the first <h1>.
    h1 = soup.select_one("h1[itemprop='name']") or soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""

    # Subtitle
    h2 = soup.select_one(".ProdDet h2")
    data["subtitle"] = h2.get_text(strip=True) if h2 else ""

    # Price from offer microdata; fall back to the first £-amount in the text.
    offer_price = soup.select_one("meta[itemprop='price']")
    if offer_price:
        data["price"] = f"£{offer_price.get('content', '')}"
    else:
        # Require a leading digit so a stray "£." can never match.
        price_match = re.search(r'£\d[\d.]*', soup.get_text())
        data["price"] = price_match.group(0) if price_match else ""

    # SKU
    sku = soup.select_one("meta[itemprop='sku']")
    data["sku"] = sku.get("content", "") if sku else ""

    # Images: main microdata image first, then gallery thumbnails.
    images = []
    main_img = soup.select_one("img[itemprop='image']")
    if main_img:
        src = main_img.get("src", "")
        if src and not src.startswith("http"):
            # Relative main-image paths live on the images subdomain.
            src = "https://images.justvitamins.co.uk" + src
        images.append(src)
    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"):
        href = a.get("href", "")
        if not href:
            continue
        if not href.startswith("http"):
            href = "https://www.justvitamins.co.uk" + href
        # Prefer the mid-size "Normal" rendition over the full-size one.
        full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
        if full not in images and href not in images:
            images.append(full if "Normal" in full else href)
    data["images"] = images

    # Key benefits: short bullets, minus subscription/delivery upsells.
    skip = ("subscribe", "save", "free delivery", "pause", "never run out")
    benefits = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if txt and 10 < len(txt) < 120 and not any(s in txt.lower() for s in skip):
            benefits.append(txt)
    # Order-preserving de-dupe, consistent with health_claims below.
    data["benefits"] = list(dict.fromkeys(benefits))[:10]

    # Quantity, e.g. "90 tablets".
    qty = ""
    for text in soup.stripped_strings:
        if re.match(r'(\d+)\s*(tablets?|capsules?|softgels?)', text, re.I):
            qty = text.strip()
            break
    data["quantity"] = qty

    # Per unit cost, e.g. "only 4.5p per tablet".
    per_unit = ""
    for text in soup.stripped_strings:
        if re.search(r'only\s+[\d.]+p\s+per', text, re.I):
            per_unit = text.strip()
            break
    data["per_unit_cost"] = per_unit

    # Description: everything between the "about this ..." heading and
    # the "product information" heading.
    desc_parts = []
    found_about = False
    for el in soup.select(".ProdDet h2, .ProdDet h3, .ProdDet p"):
        txt = el.get_text(strip=True)
        if "about this" in txt.lower():
            found_about = True
            continue
        if "product information" in txt.lower():
            break
        if found_about and txt:
            desc_parts.append(txt)
    data["description"] = "\n".join(desc_parts)

    # EFSA health claims: bullets using regulated claim wording.
    claims = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if any(k in txt.lower() for k in ["contributes", "maintenance of normal",
                                          "normal function", "normal absorption"]):
            claims.append(txt)
    data["health_claims"] = list(dict.fromkeys(claims))

    # Category from breadcrumbs (index 0 is usually "Home").
    crumbs = [a.get_text(strip=True) for a in soup.select(".breadC a")]
    data["category"] = crumbs[1] if len(crumbs) >= 2 else ""

    data["url"] = url
    return data
def scrape_competitor(url: str) -> dict:
    """Scrape any ecommerce product page and extract what we can.

    Site-agnostic best-effort extraction: headings, meta/OG tags,
    schema.org price and brand, bullet lists, images and description
    paragraphs. Fields that can't be found come back as "" or [].

    Args:
        url: Full product-page URL on any site.

    Returns:
        dict with keys: url, raw_text (first 5000 chars of page text),
        title, meta_description, og_title, og_description, price,
        bullets, images, brand, description.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.RequestException: on network failure or timeout.
    """
    # Hoisted out of the image loop below, where it re-ran per image.
    from urllib.parse import urljoin

    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    text = soup.get_text(" ", strip=True)

    data = {"url": url, "raw_text": text[:5000]}

    # Title
    h1 = soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""

    # Meta description
    meta = soup.select_one("meta[name='description']")
    data["meta_description"] = meta.get("content", "") if meta else ""

    # Open Graph data
    og_title = soup.select_one("meta[property='og:title']")
    og_desc = soup.select_one("meta[property='og:description']")
    data["og_title"] = og_title.get("content", "") if og_title else ""
    data["og_description"] = og_desc.get("content", "") if og_desc else ""

    # Price — try schema.org microdata, then a currency-symbol regex.
    price_meta = soup.select_one("meta[itemprop='price']")
    if price_meta:
        data["price"] = price_meta.get("content", "")
    else:
        price_match = re.search(r'[$£€][\d,.]+', text)
        data["price"] = price_match.group(0) if price_match else ""

    # Bullets / features: any list item of plausible feature length.
    bullets = []
    for li in soup.select("li"):
        txt = li.get_text(strip=True)
        if 15 < len(txt) < 200:
            bullets.append(txt)
    data["bullets"] = bullets[:15]

    # Images: absolute-ize relative srcs; keep the first few unique ones.
    images = []
    for img in soup.select("img[src]"):
        src = img.get("src", "")
        if src and any(ext in src.lower() for ext in (".jpg", ".png", ".webp")):
            if not src.startswith("http"):
                src = urljoin(url, src)
            if src not in images:
                images.append(src)
    data["images"] = images[:5]

    # Brand from schema.org microdata
    brand = soup.select_one("[itemprop='brand']")
    data["brand"] = brand.get_text(strip=True) if brand else ""

    # Description: mid-length paragraphs, first 8 joined.
    paras = []
    for p in soup.select("p"):
        txt = p.get_text(strip=True)
        if 30 < len(txt) < 500:
            paras.append(txt)
    data["description"] = "\n".join(paras[:8])

    return data
if __name__ == "__main__":
|
||||
import sys
|
||||
url = sys.argv[1] if len(sys.argv) > 1 else \
|
||||
"https://www.justvitamins.co.uk/Bone-Health/Super-Strength-Vitamin-D3-4000iu-K2-MK-7-100mcg.aspx"
|
||||
if "justvitamins" in url:
|
||||
d = scrape_product(url)
|
||||
else:
|
||||
d = scrape_competitor(url)
|
||||
print(json.dumps(d, indent=2))
|
||||
Reference in New Issue
Block a user