# calvana/ayn-antivirus/ayn_antivirus/signatures/feeds/urlhaus.py

"""URLhaus feed for AYN Antivirus.

Fetches malicious URLs and payload hashes from the abuse.ch URLhaus
CSV/text exports (free, no API key required).
"""

from __future__ import annotations

import csv
import io
import logging
from typing import Any, Dict, List

import requests

from ayn_antivirus.signatures.feeds.base_feed import BaseFeed

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# abuse.ch URLhaus download endpoints used by URLHausFeed.
_CSV_RECENT_URL = "https://urlhaus.abuse.ch/downloads/csv_recent/"  # used by fetch_recent()
_TEXT_ONLINE_URL = "https://urlhaus.abuse.ch/downloads/text_online/"  # used by fetch_active()
_PAYLOAD_RECENT_URL = "https://urlhaus.abuse.ch/downloads/payloads_recent/"  # used by fetch_payloads()
# Passed as ``timeout=`` to every requests.get() call below (requests treats it as seconds).
_TIMEOUT = 60


class URLHausFeed(BaseFeed):
    """Fetch malware URLs and payload hashes from URLhaus.

    Three exports are consumed:

    * ``fetch_recent``   -- recently reported URLs (CSV export)
    * ``fetch_payloads`` -- recently seen payload SHA-256 hashes (CSV export)
    * ``fetch_active``   -- URLs listed in the "online" plain-text export

    ``fetch`` combines the first two.  Every method returns a list of plain
    dicts.  A failed download (``requests.RequestException``) is reported
    through the inherited ``_error`` helper and yields ``[]``.

    Column positions in the CSV exports are read from the commented header
    line (e.g. ``# id,dateadded,url,...``) when one is present and fall back
    to fixed positions otherwise.
    NOTE(review): the column names looked up below ("url", "threat",
    "sha256", "signature", "filetype") should be confirmed against the live
    exports.
    """

    # Characters accepted in a SHA-256 hex digest (either case).
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def get_name(self) -> str:
        """Return the feed identifier, ``"urlhaus"``."""
        return "urlhaus"

    def fetch(self) -> List[Dict[str, Any]]:
        """Return recent URL records followed by recent payload-hash records.

        The two record kinds use different keys; see ``fetch_recent`` and
        ``fetch_payloads``.  Either part is simply empty when its download
        fails.
        """
        results = self.fetch_recent()
        results.extend(self.fetch_payloads())
        return results

    def fetch_recent(self) -> List[Dict[str, Any]]:
        """Fetch recent malicious URLs from the CSV export.

        Returns:
            One dict per accepted row with the keys ``ioc_type`` (``"url"``),
            ``value`` (the URL), ``threat_name``, ``type`` and ``source``.
            Rows without a URL column or whose URL does not start with
            ``"http"`` are skipped.  Returns ``[]`` if the download fails.
        """
        self._rate_limit_wait()
        self._log("Fetching recent URLs from CSV export")

        text = self._fetch_export_text(_CSV_RECENT_URL, "CSV download failed: %s")
        if text is None:
            return []

        # Prefer positions announced by the export itself; the defaults are
        # the positions this parser has always assumed.
        columns = self._export_columns(text)
        url_col = columns.get("url", 2)
        threat_col = columns.get("threat", 4)

        results: List[Dict[str, Any]] = []
        for row in self._export_rows(text):
            if len(row) <= url_col:
                continue
            url = row[url_col].strip().strip('"')
            if not url.startswith("http"):  # also rejects an empty URL
                continue
            threat = row[threat_col].strip().strip('"') if len(row) > threat_col else ""
            results.append({
                "ioc_type": "url",
                "value": url,
                # The literal string "None" is treated the same as a missing threat.
                "threat_name": threat if threat and threat != "None" else "Malware.Distribution",
                "type": "malware_distribution",
                "source": "urlhaus",
            })

        self._log("Fetched %d URL(s)", len(results))
        self._mark_updated()
        return results

    def fetch_payloads(self) -> List[Dict[str, Any]]:
        """Fetch recent payload hashes (SHA256) from URLhaus.

        Returns:
            One dict per accepted row with the keys ``hash`` (lower-case
            SHA-256), ``threat_name``, ``threat_type``, ``severity``,
            ``source`` and ``details``.  Rows whose SHA-256 column is
            missing, is not 64 characters long, or contains non-hex
            characters are skipped.  Returns ``[]`` if the download fails.
        """
        self._rate_limit_wait()
        self._log("Fetching payload hashes")

        text = self._fetch_export_text(_PAYLOAD_RECENT_URL, "Payload download failed: %s")
        if text is None:
            return []

        # Fallback layout: 0:first_seen, 1:url, 2:file_type, 3:md5, 4:sha256, 5:signature
        columns = self._export_columns(text)
        filetype_col = columns.get("filetype", 2)
        sha256_col = columns.get("sha256", 4)
        signature_col = columns.get("signature", 5)

        results: List[Dict[str, Any]] = []
        for row in self._export_rows(text):
            # Only the hash column is mandatory.  The previous ``len(row) < 7``
            # guard rejected every row of the six-column layout above.
            if len(row) <= sha256_col:
                continue
            sha256 = row[sha256_col].strip().strip('"')
            if len(sha256) != 64 or not self._HEX_DIGITS.issuperset(sha256):
                continue
            sig = row[signature_col].strip().strip('"') if len(row) > signature_col else ""
            results.append({
                "hash": sha256.lower(),
                "threat_name": sig if sig and sig != "None" else "Malware.URLhaus.Payload",
                "threat_type": "MALWARE",
                "severity": "HIGH",
                "source": "urlhaus",
                "details": (
                    f"file_type={row[filetype_col].strip()}"
                    if len(row) > filetype_col
                    else ""
                ),
            })

        self._log("Fetched %d payload hash(es)", len(results))
        # NOTE(review): unlike fetch_recent/fetch_active this method has never
        # called _mark_updated(); left unchanged -- confirm that is intended.
        return results

    def fetch_active(self) -> List[Dict[str, Any]]:
        """Fetch currently-active malware URLs.

        Every non-blank line of the plain-text export that does not start
        with ``#`` is taken verbatim (after stripping whitespace) as a URL.

        Returns:
            One dict per line with the keys ``ioc_type`` (``"url"``),
            ``value``, ``threat_name``, ``type`` and ``source``.  Returns
            ``[]`` if the download fails.
        """
        self._rate_limit_wait()
        self._log("Fetching active URLs from text export")

        text = self._fetch_export_text(_TEXT_ONLINE_URL, "Download failed: %s")
        if text is None:
            return []

        candidates = (raw.strip() for raw in text.splitlines())
        results: List[Dict[str, Any]] = [
            {
                "ioc_type": "url",
                "value": line,
                "threat_name": "Malware.Distribution.Active",
                "type": "malware_distribution",
                "source": "urlhaus",
            }
            for line in candidates
            if line and not line.startswith("#")
        ]
        self._log("Fetched %d active URL(s)", len(results))
        self._mark_updated()
        return results

    def _fetch_export_text(self, url: str, failure_msg: str) -> str | None:
        """Download *url* and return the response body as text.

        On any ``requests.RequestException`` (including a non-2xx status
        raised by ``raise_for_status``) the error is reported through
        ``_error`` using *failure_msg* -- a ``%s`` format string that receives
        the exception -- and ``None`` is returned.
        """
        try:
            resp = requests.get(url, timeout=_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as exc:
            self._error(failure_msg, exc)
            return None
        return resp.text

    @staticmethod
    def _export_columns(text: str) -> Dict[str, int]:
        """Map column names from the commented header line to positions.

        Scans the ``#``-prefixed lines that precede the first data row and
        returns ``{name: index}`` for the first one that, split on commas,
        contains a field named exactly ``url``.  Names are lower-cased.
        Returns an empty dict when no such line exists, in which case callers
        use their fixed fallback positions.
        """
        for raw in text.splitlines():
            line = raw.strip()
            if not line:
                continue
            if not line.startswith("#"):
                break  # first data row reached; the header can only precede it
            names = [part.strip().strip('"').lower() for part in line.lstrip("#").split(",")]
            if "url" in names:
                return {name: index for index, name in enumerate(names)}
        return {}

    def _export_rows(self, text: str) -> List[List[str]]:
        """Parse the data rows of a CSV export, skipping blanks and comments.

        Lines that are blank or start with ``#`` are dropped before parsing.
        If the ``csv`` module rejects the data, the problem is reported
        through ``_error`` and the rows parsed up to that point are returned,
        so one malformed line does not abort the whole fetch.
        """
        data_lines = [
            line for line in text.splitlines()
            if line.strip() and not line.startswith("#")
        ]
        reader = csv.reader(io.StringIO("\n".join(data_lines)))
        rows: List[List[str]] = []
        try:
            for row in reader:
                rows.append(row)
        except csv.Error as exc:
            self._error("CSV parsing stopped at line %d: %s", reader.line_num, exc)
        return rows