# File: calvana/ayn-antivirus/ayn_antivirus/signatures/db/hash_db.py
# (252 lines, 8.4 KiB, Python)

"""SQLite-backed malware hash database for AYN Antivirus.
Stores SHA-256 / MD5 hashes of known threats with associated metadata
(threat name, type, severity, source feed) and provides efficient lookup,
bulk-insert, search, and export operations.
"""
from __future__ import annotations

import csv
import logging
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

from ayn_antivirus.constants import DEFAULT_DB_PATH

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Schema
# ---------------------------------------------------------------------------
# DDL script run through ``Connection.executescript`` when the database is
# opened.  Every statement uses ``IF NOT EXISTS``, so re-running the script
# against an already-populated database leaves existing tables and indexes
# untouched.
#
# Tables:
#   threats - one row per hash (the primary key) carrying the threat name,
#             type, severity, source, insertion timestamp and free-form
#             details.  Secondary indexes cover threat_type, source and
#             threat_name.
#   meta    - plain key/value store.
_SCHEMA = """
CREATE TABLE IF NOT EXISTS threats (
hash TEXT PRIMARY KEY,
threat_name TEXT NOT NULL,
threat_type TEXT NOT NULL DEFAULT 'MALWARE',
severity TEXT NOT NULL DEFAULT 'HIGH',
source TEXT NOT NULL DEFAULT '',
added_date TEXT NOT NULL DEFAULT (datetime('now')),
details TEXT NOT NULL DEFAULT ''
);
CREATE INDEX IF NOT EXISTS idx_threats_type ON threats(threat_type);
CREATE INDEX IF NOT EXISTS idx_threats_source ON threats(source);
CREATE INDEX IF NOT EXISTS idx_threats_name ON threats(threat_name);
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT
);
"""
class HashDatabase:
    """Manage a local SQLite database of known-malicious file hashes.

    Hashes are normalised (surrounding whitespace stripped, lower-cased)
    before they are stored or queried, so lookups are case-insensitive.
    The connection is opened lazily on first use of :attr:`conn`; the
    instance can also be used as a context manager, which closes the
    connection on exit.

    Parameters
    ----------
    db_path:
        Path to the SQLite file.  Created automatically (with parent dirs)
        if it doesn't exist.
    """

    def __init__(self, db_path: str | Path = DEFAULT_DB_PATH) -> None:
        self.db_path = Path(db_path)
        self._conn: Optional[sqlite3.Connection] = None

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def initialize(self) -> None:
        """Open the database and create tables if necessary.

        If a connection is already open it is closed first, so calling
        this method repeatedly never leaks a handle.  When set-up fails,
        the freshly opened connection is closed and the exception is
        re-raised; the instance is left in the "not open" state.
        """
        self.close()
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
        try:
            conn.row_factory = sqlite3.Row
            conn.execute("PRAGMA journal_mode=WAL")
            conn.executescript(_SCHEMA)
            conn.commit()
        except Exception:
            # Never cache a connection whose schema may be incomplete.
            conn.close()
            raise
        self._conn = conn
        logger.info("HashDatabase opened: %s (%d hashes)", self.db_path, self.count())

    def close(self) -> None:
        """Close the database connection.  Safe to call more than once."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def __enter__(self) -> HashDatabase:
        """Open the database (if not already open) and return ``self``."""
        if self._conn is None:
            self.initialize()
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        """Close the database when leaving a ``with`` block."""
        self.close()

    @property
    def conn(self) -> sqlite3.Connection:
        """The live connection, opened on first access."""
        if self._conn is None:
            self.initialize()
        # Narrowing for type checkers; ``initialize`` either sets the
        # attribute or raises.
        assert self._conn is not None
        return self._conn

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize_hash(hash_str: str) -> str:
        """Return the canonical stored form of *hash_str*."""
        return hash_str.strip().lower()

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same format as the deprecated ``datetime.utcnow()``
        so new rows sort consistently with rows written earlier.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # ------------------------------------------------------------------
    # Single-record operations
    # ------------------------------------------------------------------
    def add_hash(
        self,
        hash_str: str,
        threat_name: str,
        threat_type: str = "MALWARE",
        severity: str = "HIGH",
        source: str = "",
        details: str = "",
    ) -> None:
        """Insert or replace a single hash record.

        An existing row with the same hash is overwritten, including its
        ``added_date``.
        """
        conn = self.conn
        # ``with conn`` commits on success and rolls back on error.
        with conn:
            conn.execute(
                "INSERT OR REPLACE INTO threats "
                "(hash, threat_name, threat_type, severity, source, added_date, details) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    self._normalize_hash(hash_str),
                    threat_name,
                    threat_type,
                    severity,
                    source,
                    self._utc_timestamp(),
                    details,
                ),
            )

    def lookup(self, hash_str: str) -> Optional[Dict[str, Any]]:
        """Look up a hash and return its metadata, or ``None``.

        The returned dict is keyed by the ``threats`` column names.
        """
        row = self.conn.execute(
            "SELECT * FROM threats WHERE hash = ?",
            (self._normalize_hash(hash_str),),
        ).fetchone()
        return None if row is None else dict(row)

    def remove(self, hash_str: str) -> bool:
        """Delete a hash record.  Returns ``True`` if a row was deleted."""
        conn = self.conn
        with conn:
            cur = conn.execute(
                "DELETE FROM threats WHERE hash = ?",
                (self._normalize_hash(hash_str),),
            )
        return cur.rowcount > 0

    # ------------------------------------------------------------------
    # Bulk operations
    # ------------------------------------------------------------------
    def bulk_add(
        self,
        records: Sequence[Tuple[str, str, str, str, str, str]],
    ) -> int:
        """Efficiently insert new hashes in a single transaction.

        Uses ``INSERT OR IGNORE`` so existing entries are preserved and
        only genuinely new hashes are counted.  If any record fails to
        insert, the whole batch is rolled back and the error propagates.

        Parameters
        ----------
        records:
            Sequence of ``(hash, threat_name, threat_type, severity, source, details)``
            tuples.

        Returns
        -------
        int
            Number of **new** rows actually inserted.
        """
        if not records:
            return 0
        now = self._utc_timestamp()
        normalize = self._normalize_hash
        rows = [
            (normalize(h), name, ttype, sev, src, now, det)
            for h, name, ttype, sev, src, det in records
        ]
        conn = self.conn
        # ``total_changes`` counts rows modified through *this* connection
        # only, so the delta is exact without scanning the table and is
        # unaffected by writes made through other connections.
        changes_before = conn.total_changes
        with conn:
            conn.executemany(
                "INSERT OR IGNORE INTO threats "
                "(hash, threat_name, threat_type, severity, source, added_date, details) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                rows,
            )
        return conn.total_changes - changes_before

    # ------------------------------------------------------------------
    # Query helpers
    # ------------------------------------------------------------------
    def count(self) -> int:
        """Total number of hashes in the database."""
        return self.conn.execute("SELECT COUNT(*) FROM threats").fetchone()[0]

    def get_stats(self) -> Dict[str, Any]:
        """Return aggregate statistics about the database.

        Returns
        -------
        dict
            ``total`` (row count), ``by_type`` and ``by_source`` (value ->
            row count mappings) and ``latest_update`` (largest
            ``added_date`` string, or ``None`` when the table is empty).
        """
        conn = self.conn
        by_type = {
            row[0]: row[1]
            for row in conn.execute(
                "SELECT threat_type, COUNT(*) FROM threats GROUP BY threat_type"
            )
        }
        by_source = {
            row[0]: row[1]
            for row in conn.execute(
                "SELECT source, COUNT(*) FROM threats GROUP BY source"
            )
        }
        latest = conn.execute("SELECT MAX(added_date) FROM threats").fetchone()[0]
        return {
            "total": self.count(),
            "by_type": by_type,
            "by_source": by_source,
            "latest_update": latest,
        }

    def search(self, query: str) -> List[Dict[str, Any]]:
        """Search threat names with a SQL LIKE pattern.

        The pattern is passed to ``LIKE`` as-is, so the caller supplies
        any ``%`` / ``_`` wildcards.  At most 500 rows are returned,
        newest ``added_date`` first.

        Example: ``search("%Trojan%")``
        """
        rows = self.conn.execute(
            "SELECT * FROM threats WHERE threat_name LIKE ? ORDER BY added_date DESC LIMIT 500",
            (query,),
        ).fetchall()
        return [dict(r) for r in rows]

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    def export_hashes(self, filepath: str | Path) -> int:
        """Export all hashes to a CSV file.  Returns the row count.

        The file is written as UTF-8 with a header row; parent
        directories are created as needed.  Rows are streamed from the
        database rather than loaded into memory at once.
        """
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        cursor = self.conn.execute(
            "SELECT hash, threat_name, threat_type, severity, source, added_date, details "
            "FROM threats ORDER BY added_date DESC"
        )
        exported = 0
        # Explicit encoding: threat names and details may contain
        # non-ASCII text and the platform default is not always UTF-8.
        with open(filepath, "w", newline="", encoding="utf-8") as fh:
            writer = csv.writer(fh)
            writer.writerow(
                ["hash", "threat_name", "threat_type", "severity", "source", "added_date", "details"]
            )
            for row in cursor:
                writer.writerow(tuple(row))
                exported += 1
        return exported

    # ------------------------------------------------------------------
    # Meta helpers (used by manager to track feed state)
    # ------------------------------------------------------------------
    def set_meta(self, key: str, value: str) -> None:
        """Store *value* under *key* in the ``meta`` table, replacing any previous value."""
        conn = self.conn
        with conn:
            conn.execute(
                "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)", (key, value)
            )

    def get_meta(self, key: str) -> Optional[str]:
        """Return the value stored under *key* in ``meta``, or ``None`` if absent."""
        row = self.conn.execute(
            "SELECT value FROM meta WHERE key = ?", (key,)
        ).fetchone()
        return row[0] if row else None