"""SQLite-backed malware hash database for AYN Antivirus. Stores SHA-256 / MD5 hashes of known threats with associated metadata (threat name, type, severity, source feed) and provides efficient lookup, bulk-insert, search, and export operations. """ from __future__ import annotations import csv import logging import sqlite3 from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple from ayn_antivirus.constants import DEFAULT_DB_PATH logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Schema # --------------------------------------------------------------------------- _SCHEMA = """ CREATE TABLE IF NOT EXISTS threats ( hash TEXT PRIMARY KEY, threat_name TEXT NOT NULL, threat_type TEXT NOT NULL DEFAULT 'MALWARE', severity TEXT NOT NULL DEFAULT 'HIGH', source TEXT NOT NULL DEFAULT '', added_date TEXT NOT NULL DEFAULT (datetime('now')), details TEXT NOT NULL DEFAULT '' ); CREATE INDEX IF NOT EXISTS idx_threats_type ON threats(threat_type); CREATE INDEX IF NOT EXISTS idx_threats_source ON threats(source); CREATE INDEX IF NOT EXISTS idx_threats_name ON threats(threat_name); CREATE TABLE IF NOT EXISTS meta ( key TEXT PRIMARY KEY, value TEXT ); """ class HashDatabase: """Manage a local SQLite database of known-malicious file hashes. Parameters ---------- db_path: Path to the SQLite file. Created automatically (with parent dirs) if it doesn't exist. """ def __init__(self, db_path: str | Path = DEFAULT_DB_PATH) -> None: self.db_path = Path(db_path) self._conn: Optional[sqlite3.Connection] = None # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ def initialize(self) -> None: """Open the database and create tables if necessary.""" self.db_path.parent.mkdir(parents=True, exist_ok=True) self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False) self._conn.row_factory = sqlite3.Row self._conn.execute("PRAGMA journal_mode=WAL") self._conn.executescript(_SCHEMA) self._conn.commit() logger.info("HashDatabase opened: %s (%d hashes)", self.db_path, self.count()) def close(self) -> None: """Flush and close the database.""" if self._conn: self._conn.close() self._conn = None @property def conn(self) -> sqlite3.Connection: if self._conn is None: self.initialize() assert self._conn is not None return self._conn # ------------------------------------------------------------------ # Single-record operations # ------------------------------------------------------------------ def add_hash( self, hash_str: str, threat_name: str, threat_type: str = "MALWARE", severity: str = "HIGH", source: str = "", details: str = "", ) -> None: """Insert or replace a single hash record.""" self.conn.execute( "INSERT OR REPLACE INTO threats " "(hash, threat_name, threat_type, severity, source, added_date, details) " "VALUES (?, ?, ?, ?, ?, ?, ?)", ( hash_str.lower(), threat_name, threat_type, severity, source, datetime.utcnow().isoformat(), details, ), ) self.conn.commit() def lookup(self, hash_str: str) -> Optional[Dict[str, Any]]: """Look up a hash and return its metadata, or ``None``.""" row = self.conn.execute( "SELECT * FROM threats WHERE hash = ?", (hash_str.lower(),) ).fetchone() if row is None: return None return dict(row) def remove(self, hash_str: str) -> bool: """Delete a hash record. Returns ``True`` if a row was deleted.""" cur = self.conn.execute( "DELETE FROM threats WHERE hash = ?", (hash_str.lower(),) ) self.conn.commit() return cur.rowcount > 0 # ------------------------------------------------------------------ # Bulk operations # ------------------------------------------------------------------ def bulk_add( self, records: Sequence[Tuple[str, str, str, str, str, str]], ) -> int: """Efficiently insert new hashes in a single transaction. Uses ``INSERT OR IGNORE`` so existing entries are preserved and only genuinely new hashes are counted. Parameters ---------- records: Sequence of ``(hash, threat_name, threat_type, severity, source, details)`` tuples. Returns ------- int Number of **new** rows actually inserted. """ if not records: return 0 now = datetime.utcnow().isoformat() rows = [ (h.lower(), name, ttype, sev, src, now, det) for h, name, ttype, sev, src, det in records ] before = self.count() self.conn.executemany( "INSERT OR IGNORE INTO threats " "(hash, threat_name, threat_type, severity, source, added_date, details) " "VALUES (?, ?, ?, ?, ?, ?, ?)", rows, ) self.conn.commit() return self.count() - before # ------------------------------------------------------------------ # Query helpers # ------------------------------------------------------------------ def count(self) -> int: """Total number of hashes in the database.""" return self.conn.execute("SELECT COUNT(*) FROM threats").fetchone()[0] def get_stats(self) -> Dict[str, Any]: """Return aggregate statistics about the database.""" c = self.conn by_type = { row[0]: row[1] for row in c.execute( "SELECT threat_type, COUNT(*) FROM threats GROUP BY threat_type" ).fetchall() } by_source = { row[0]: row[1] for row in c.execute( "SELECT source, COUNT(*) FROM threats GROUP BY source" ).fetchall() } latest = c.execute( "SELECT MAX(added_date) FROM threats" ).fetchone()[0] return { "total": self.count(), "by_type": by_type, "by_source": by_source, "latest_update": latest, } def search(self, query: str) -> List[Dict[str, Any]]: """Search threat names with a SQL LIKE pattern. Example: ``search("%Trojan%")`` """ rows = self.conn.execute( "SELECT * FROM threats WHERE threat_name LIKE ? ORDER BY added_date DESC LIMIT 500", (query,), ).fetchall() return [dict(r) for r in rows] # ------------------------------------------------------------------ # Export # ------------------------------------------------------------------ def export_hashes(self, filepath: str | Path) -> int: """Export all hashes to a CSV file. Returns the row count.""" filepath = Path(filepath) filepath.parent.mkdir(parents=True, exist_ok=True) rows = self.conn.execute( "SELECT hash, threat_name, threat_type, severity, source, added_date, details " "FROM threats ORDER BY added_date DESC" ).fetchall() with open(filepath, "w", newline="") as fh: writer = csv.writer(fh) writer.writerow(["hash", "threat_name", "threat_type", "severity", "source", "added_date", "details"]) for row in rows: writer.writerow(list(row)) return len(rows) # ------------------------------------------------------------------ # Meta helpers (used by manager to track feed state) # ------------------------------------------------------------------ def set_meta(self, key: str, value: str) -> None: self.conn.execute( "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", (key, value) ) self.conn.commit() def get_meta(self, key: str) -> Optional[str]: row = self.conn.execute( "SELECT value FROM meta WHERE key = ?", (key,) ).fetchone() return row[0] if row else None