v2: Live Flask app — real Gemini AI demos, Nano Banana image gen, real £19.4M data dashboard
- Flask + gunicorn backend replacing static nginx - 3 live AI demos powered by Gemini 2.5 Flash - Nano Banana + Nano Banana Pro for product image generation - Real JV ecommerce dashboard (728K orders, 230K customers, 4MB data) - AI Infrastructure Proposal + Offer pages - Live product scraper for justvitamins.co.uk + competitor pages - API: /api/scrape, /api/generate-pack, /api/competitor-xray, /api/pdp-surgeon, /api/generate-images
This commit is contained in:
197
scraper.py
Normal file
197
scraper.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""Scrape product pages — JustVitamins specific + generic competitor."""
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import re, json
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
}
|
||||
|
||||
|
||||
def scrape_product(url: str) -> dict:
    """Scrape a justvitamins.co.uk product URL and return structured data.

    Fetches the page and pulls fields from the site's markup and
    schema.org microdata. Every field degrades gracefully: missing
    elements yield "" (or [] for list fields) rather than raising.

    Args:
        url: Full product-page URL on justvitamins.co.uk.

    Returns:
        dict with keys: title, subtitle, price, sku, images, benefits,
        quantity, per_unit_cost, description, health_claims, category, url.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.RequestException: on network failure or timeout.
    """
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    data = {}

    # Title — prefer schema.org microdata, fall back to the first <h1>.
    h1 = soup.select_one("h1[itemprop='name']") or soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""

    # Subtitle
    h2 = soup.select_one(".ProdDet h2")
    data["subtitle"] = h2.get_text(strip=True) if h2 else ""

    # Price from offer microdata; fall back to the first £-amount in the text.
    offer_price = soup.select_one("meta[itemprop='price']")
    if offer_price:
        data["price"] = f"£{offer_price.get('content', '')}"
    else:
        # Require a leading digit so a stray "£." can never match.
        price_match = re.search(r'£\d[\d.]*', soup.get_text())
        data["price"] = price_match.group(0) if price_match else ""

    # SKU
    sku = soup.select_one("meta[itemprop='sku']")
    data["sku"] = sku.get("content", "") if sku else ""

    # Images: main microdata image first, then gallery thumbnails.
    images = []
    main_img = soup.select_one("img[itemprop='image']")
    if main_img:
        src = main_img.get("src", "")
        if src and not src.startswith("http"):
            # Relative main-image paths live on the images subdomain.
            src = "https://images.justvitamins.co.uk" + src
        images.append(src)
    for a in soup.select("#lightboxGallery a, .ThumbnailPhoto a"):
        href = a.get("href", "")
        if not href:
            continue
        if not href.startswith("http"):
            href = "https://www.justvitamins.co.uk" + href
        # Prefer the mid-size "Normal" rendition over the full-size one.
        full = href.replace("/Fullsize/", "/Normal/").replace("/fullsize/", "/Normal/")
        if full not in images and href not in images:
            images.append(full if "Normal" in full else href)
    data["images"] = images

    # Key benefits: short bullets, minus subscription/delivery upsells.
    skip = ("subscribe", "save", "free delivery", "pause", "never run out")
    benefits = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if txt and 10 < len(txt) < 120 and not any(s in txt.lower() for s in skip):
            benefits.append(txt)
    # Order-preserving de-dupe, consistent with health_claims below.
    data["benefits"] = list(dict.fromkeys(benefits))[:10]

    # Quantity, e.g. "90 tablets".
    qty = ""
    for text in soup.stripped_strings:
        if re.match(r'(\d+)\s*(tablets?|capsules?|softgels?)', text, re.I):
            qty = text.strip()
            break
    data["quantity"] = qty

    # Per unit cost, e.g. "only 4.5p per tablet".
    per_unit = ""
    for text in soup.stripped_strings:
        if re.search(r'only\s+[\d.]+p\s+per', text, re.I):
            per_unit = text.strip()
            break
    data["per_unit_cost"] = per_unit

    # Description: everything between the "about this ..." heading and
    # the "product information" heading.
    desc_parts = []
    found_about = False
    for el in soup.select(".ProdDet h2, .ProdDet h3, .ProdDet p"):
        txt = el.get_text(strip=True)
        if "about this" in txt.lower():
            found_about = True
            continue
        if "product information" in txt.lower():
            break
        if found_about and txt:
            desc_parts.append(txt)
    data["description"] = "\n".join(desc_parts)

    # EFSA health claims: bullets using regulated claim wording.
    claims = []
    for li in soup.select(".ProdDet li"):
        txt = li.get_text(strip=True)
        if any(k in txt.lower() for k in ["contributes", "maintenance of normal",
                                          "normal function", "normal absorption"]):
            claims.append(txt)
    data["health_claims"] = list(dict.fromkeys(claims))

    # Category from breadcrumbs (index 0 is usually "Home").
    crumbs = [a.get_text(strip=True) for a in soup.select(".breadC a")]
    data["category"] = crumbs[1] if len(crumbs) >= 2 else ""

    data["url"] = url
    return data
def scrape_competitor(url: str) -> dict:
    """Scrape any ecommerce product page and extract what we can.

    Site-agnostic best-effort extraction: headings, meta/OG tags,
    schema.org price and brand, bullet lists, images and description
    paragraphs. Fields that can't be found come back as "" or [].

    Args:
        url: Full product-page URL on any site.

    Returns:
        dict with keys: url, raw_text (first 5000 chars of page text),
        title, meta_description, og_title, og_description, price,
        bullets, images, brand, description.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.RequestException: on network failure or timeout.
    """
    # Hoisted out of the image loop below, where it re-ran per image.
    from urllib.parse import urljoin

    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    text = soup.get_text(" ", strip=True)

    data = {"url": url, "raw_text": text[:5000]}

    # Title
    h1 = soup.select_one("h1")
    data["title"] = h1.get_text(strip=True) if h1 else ""

    # Meta description
    meta = soup.select_one("meta[name='description']")
    data["meta_description"] = meta.get("content", "") if meta else ""

    # Open Graph data
    og_title = soup.select_one("meta[property='og:title']")
    og_desc = soup.select_one("meta[property='og:description']")
    data["og_title"] = og_title.get("content", "") if og_title else ""
    data["og_description"] = og_desc.get("content", "") if og_desc else ""

    # Price — try schema.org microdata, then a currency-symbol regex.
    price_meta = soup.select_one("meta[itemprop='price']")
    if price_meta:
        data["price"] = price_meta.get("content", "")
    else:
        price_match = re.search(r'[$£€][\d,.]+', text)
        data["price"] = price_match.group(0) if price_match else ""

    # Bullets / features: any list item of plausible feature length.
    bullets = []
    for li in soup.select("li"):
        txt = li.get_text(strip=True)
        if 15 < len(txt) < 200:
            bullets.append(txt)
    data["bullets"] = bullets[:15]

    # Images: absolute-ize relative srcs; keep the first few unique ones.
    images = []
    for img in soup.select("img[src]"):
        src = img.get("src", "")
        if src and any(ext in src.lower() for ext in (".jpg", ".png", ".webp")):
            if not src.startswith("http"):
                src = urljoin(url, src)
            if src not in images:
                images.append(src)
    data["images"] = images[:5]

    # Brand from schema.org microdata
    brand = soup.select_one("[itemprop='brand']")
    data["brand"] = brand.get_text(strip=True) if brand else ""

    # Description: mid-length paragraphs, first 8 joined.
    paras = []
    for p in soup.select("p"):
        txt = p.get_text(strip=True)
        if 30 < len(txt) < 500:
            paras.append(txt)
    data["description"] = "\n".join(paras[:8])

    return data
if __name__ == "__main__":
|
||||
import sys
|
||||
url = sys.argv[1] if len(sys.argv) > 1 else \
|
||||
"https://www.justvitamins.co.uk/Bone-Health/Super-Strength-Vitamin-D3-4000iu-K2-MK-7-100mcg.aspx"
|
||||
if "justvitamins" in url:
|
||||
d = scrape_product(url)
|
||||
else:
|
||||
d = scrape_competitor(url)
|
||||
print(json.dumps(d, indent=2))
|
||||
Reference in New Issue
Block a user