# calvana/ayn-antivirus/ayn_antivirus/detectors/ai_analyzer.py
"""AYN Antivirus — AI-Powered Threat Analyzer.
Uses Claude to analyze suspicious files and filter false positives.
Each detection from heuristic/signature scanners is verified by AI
before being reported as a real threat.
"""
from __future__ import annotations
import json
import logging
import os
import platform
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
# Module-level logger; configuration is left to the application.
logger = logging.getLogger(__name__)
# System prompt sent once per API call. Formatted with {environment}
# (host context built by AIAnalyzer._detect_environment). The literal
# braces in the JSON schema example are escaped as {{ }} so .format()
# leaves them intact.
SYSTEM_PROMPT = """Linux VPS antivirus analyst. {environment}
Normal: pip/npm scripts in /usr/local/bin, Docker hex IDs, cron jobs (fstrim/certbot/logrotate), high-entropy archives, curl/wget in deploy scripts, recently-modified files after apt/pip.
Reply ONLY JSON: {{"verdict":"threat"|"safe"|"suspicious","confidence":0-100,"reason":"short","recommended_action":"quarantine"|"delete"|"ignore"|"monitor"}}"""
# Per-detection user message. Formatted in AIAnalyzer.analyze with the
# detection metadata, file stat info, and a short content preview.
ANALYSIS_PROMPT = """FILE:{file_path} DETECT:{threat_name}({threat_type}) SEV:{severity} DET:{detector} CONF:{original_confidence}% SIZE:{file_size} PERM:{permissions} OWN:{owner} MOD:{mtime}
PREVIEW:
{content_preview}
JSON verdict:"""
@dataclass
class AIVerdict:
    """Outcome of a single AI analysis pass over one detection."""

    verdict: str              # one of: "threat", "safe", "suspicious"
    confidence: int           # model-reported certainty, 0-100
    reason: str               # short human-readable justification
    recommended_action: str   # one of: "quarantine", "delete", "ignore", "monitor"
    raw_response: str = ""    # raw model text, kept for auditing/debugging

    @property
    def is_threat(self) -> bool:
        """True when the AI confirmed the detection as a real threat."""
        return self.verdict == "threat"

    @property
    def is_safe(self) -> bool:
        """True when the AI judged the detection benign (false positive)."""
        return self.verdict == "safe"
class AIAnalyzer:
"""AI-powered threat analysis using Claude."""
def __init__(self, api_key: Optional[str] = None, model: str = "claude-sonnet-4-20250514"):
self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "") or self._load_key_from_env_file()
self._model = model
self._client = None
self._environment = self._detect_environment()
@staticmethod
def _load_key_from_env_file() -> str:
for p in ["/opt/ayn-antivirus/.env", Path.home() / ".ayn-antivirus" / ".env"]:
try:
for line in Path(p).read_text().splitlines():
line = line.strip()
if line.startswith("ANTHROPIC_API_KEY=") and not line.endswith("="):
return line.split("=", 1)[1].strip().strip("'\"")
except Exception:
pass
return ""
@property
def available(self) -> bool:
return bool(self._api_key)
def _get_client(self):
if not self._client:
try:
import anthropic
self._client = anthropic.Anthropic(api_key=self._api_key)
except Exception as exc:
logger.error("Failed to init Anthropic client: %s", exc)
return None
return self._client
@staticmethod
def _detect_environment() -> str:
"""Gather environment context for the AI."""
import shutil
parts = [
f"OS: {platform.system()} {platform.release()}",
f"Hostname: {platform.node()}",
f"Arch: {platform.machine()}",
]
if shutil.which("incus"):
parts.append("Container runtime: Incus/LXC (containers run Docker inside)")
if shutil.which("docker"):
parts.append("Docker: available")
if Path("/etc/dokploy").exists() or shutil.which("dokploy"):
parts.append("Platform: Dokploy (Docker deployment platform)")
# Check if we're inside a container
if Path("/run/host/container-manager").exists():
parts.append("Running inside: managed container")
return "\n".join(parts)
def _get_file_context(self, file_path: str) -> Dict[str, Any]:
"""Gather file metadata and content preview."""
p = Path(file_path)
ctx = {
"file_size": 0,
"permissions": "",
"owner": "",
"mtime": "",
"content_preview": "[file not readable]",
}
try:
st = p.stat()
ctx["file_size"] = st.st_size
ctx["permissions"] = oct(st.st_mode)[-4:]
ctx["mtime"] = str(st.st_mtime)
try:
import pwd
ctx["owner"] = pwd.getpwuid(st.st_uid).pw_name
except Exception:
ctx["owner"] = str(st.st_uid)
except OSError:
pass
try:
with open(file_path, "rb") as f:
raw = f.read(512)
# Try text decode, fall back to hex
try:
ctx["content_preview"] = raw.decode("utf-8", errors="replace")
except Exception:
ctx["content_preview"] = raw.hex()[:512]
except Exception:
pass
return ctx
def analyze(
self,
file_path: str,
threat_name: str,
threat_type: str,
severity: str,
detector: str,
confidence: int = 50,
) -> AIVerdict:
"""Analyze a single detection with AI."""
if not self.available:
# No API key — pass through as-is
return AIVerdict(
verdict="suspicious",
confidence=confidence,
reason="AI analysis unavailable (no API key)",
recommended_action="quarantine",
)
client = self._get_client()
if not client:
return AIVerdict(
verdict="suspicious",
confidence=confidence,
reason="AI client init failed",
recommended_action="quarantine",
)
ctx = self._get_file_context(file_path)
# Sanitize content preview to avoid format string issues
preview = ctx.get("content_preview", "")
if len(preview) > 500:
preview = preview[:500] + "..."
# Replace curly braces to avoid format() issues
preview = preview.replace("{", "{{").replace("}", "}}")
user_msg = ANALYSIS_PROMPT.format(
file_path=file_path,
threat_name=threat_name,
threat_type=threat_type,
severity=severity,
detector=detector,
original_confidence=confidence,
file_size=ctx.get("file_size", 0),
permissions=ctx.get("permissions", ""),
owner=ctx.get("owner", ""),
mtime=ctx.get("mtime", ""),
content_preview=preview,
)
text = ""
try:
response = client.messages.create(
model=self._model,
max_tokens=150,
system=SYSTEM_PROMPT.format(environment=self._environment),
messages=[{"role": "user", "content": user_msg}],
)
text = response.content[0].text.strip()
# Parse JSON from response (handle markdown code blocks)
if "```" in text:
parts = text.split("```")
for part in parts[1:]:
cleaned = part.strip()
if cleaned.startswith("json"):
cleaned = cleaned[4:].strip()
if cleaned.startswith("{"):
text = cleaned
break
# Find the JSON object in the response
start = text.find("{")
end = text.rfind("}") + 1
if start >= 0 and end > start:
text = text[start:end]
data = json.loads(text)
return AIVerdict(
verdict=data.get("verdict", "suspicious"),
confidence=data.get("confidence", 50),
reason=data.get("reason", ""),
recommended_action=data.get("recommended_action", "quarantine"),
raw_response=text,
)
except json.JSONDecodeError as exc:
logger.warning("AI returned non-JSON: %s — raw: %s", exc, text[:200])
return AIVerdict(
verdict="suspicious",
confidence=confidence,
reason=f"AI parse error: {text[:100]}",
recommended_action="quarantine",
raw_response=text,
)
except Exception as exc:
logger.error("AI analysis failed: %s", exc)
return AIVerdict(
verdict="suspicious",
confidence=confidence,
reason=f"AI error: {exc}",
recommended_action="quarantine",
)
def analyze_batch(
self,
detections: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Analyze a batch of detections. Returns enriched detections with AI verdicts.
Each detection dict should have: file_path, threat_name, threat_type, severity, detector
"""
results = []
for d in detections:
verdict = self.analyze(
file_path=d.get("file_path", ""),
threat_name=d.get("threat_name", ""),
threat_type=d.get("threat_type", ""),
severity=d.get("severity", "MEDIUM"),
detector=d.get("detector", ""),
confidence=d.get("confidence", 50),
)
enriched = dict(d)
enriched["ai_verdict"] = verdict.verdict
enriched["ai_confidence"] = verdict.confidence
enriched["ai_reason"] = verdict.reason
enriched["ai_action"] = verdict.recommended_action
results.append(enriched)
return results