# API reference
Core Python classes and their interfaces.
## HYDRA components
### SessionClassifier
Classifies sessions as signal or noise.
```python
from src.session_classifier import SessionClassifier

classifier = SessionClassifier(config={
    "ephemeral_threshold_s": 5,   # Max duration for bot_ephemeral
    "recon_threshold_s": 20,      # Max duration for bot_recon
    "recon_max_cmds": 3,          # Max commands for bot_recon
    "human_min_duration_s": 20,   # Min duration for likely_human
    "human_min_non_disc": 1,      # Min non-discovery commands
})

result = classifier.classify_session(session_data)
# result.label: "bot_ephemeral" | "bot_exec_scanner" | "bot_recon" | "likely_human"
# result.confidence: 0.0-1.0
# result.is_signal: bool
```
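For context, a minimal sketch of filtering sessions with the result fields above, reusing the `classifier` instance from the previous block. The `sessions` iterable and the 0.7 confidence cut-off are assumptions for illustration, not part of the API.

```python
# Sketch: keep only high-confidence signal sessions for downstream use.
signal_sessions = []
for session_data in sessions:  # `sessions`: hypothetical iterable of parsed session dicts
    result = classifier.classify_session(session_data)
    if result.is_signal and result.confidence >= 0.7:  # example threshold
        signal_sessions.append(session_data)
```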
### PromptGuard
Scores commands for injection likelihood.
```python
from src.prompt_guard import PromptGuard

guard = PromptGuard()
result = guard.score(command="ignore previous instructions")
# result.score: 0.95
# result.patterns: ["ignore_instructions"]
# result.action: "log_only"
```
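As a rough illustration of acting on the verdict, the sketch below gates a command on the documented `score`, `action`, and `patterns` fields; the 0.8 threshold and the triage wording are assumptions, not PromptGuard behaviour.

```python
# Sketch: triage a command based on PromptGuard's verdict (threshold is illustrative).
def triage(command: str) -> str:
    result = guard.score(command=command)
    if result.action == "log_only" or result.score >= 0.8:
        return "flag: " + ", ".join(result.patterns)
    return "pass"

print(triage("ignore previous instructions"))  # e.g. "flag: ignore_instructions"
```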
### PersonaEngine
Manages persona selection and injection.
```python
from src.persona_engine import PersonaEngine

engine = PersonaEngine()
persona = engine.select()
# persona.name: "fintech_trading"
# persona.hostname: "srv-prod-01"
# persona.llm_context: "You are simulating a fintech trading server..."
# persona.vfs_overlay: dict of files
```
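A sketch of wiring those fields into session setup follows; the prompt assembly and the overlay walk are illustrative only, and assume `vfs_overlay` maps file paths to string contents.

```python
# Sketch: seed an emulated session from the selected persona (illustrative only).
system_prompt = f"{persona.llm_context}\nHostname: {persona.hostname}\n"

# Assumed: vfs_overlay maps file paths to string contents. A real deployment would
# merge these into the emulated filesystem rather than print them.
for path, contents in persona.vfs_overlay.items():
    print(path, len(contents), "chars")
```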
## PDX components
### DataRouter
Routes HYDRA events into defensive and offensive training-data streams.
```python
from pdx.training.data_router import DataRouter

router = DataRouter(
    hydra_logs_dir="path/to/logs",
    output_dir="training_output/data_router",
)

# Split raw logs
stats = router.split_sessions()
# stats: {"sessions_processed": 3337, "defensive_events": 8668, "offensive_events": 4910}

# Generate training data
router.generate_defensive()  # → sft_detection_patterns.jsonl + dpo_lure_quality.jsonl
router.generate_offensive()  # → sft_attack_chains.jsonl + raft_kill_chains.jsonl
router.generate_combined()   # → react_dual_perspective.jsonl
```
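The generated files are plain JSONL, so they can be inspected with the standard library alone; the path below assumes the `output_dir` shown above.

```python
import json
from pathlib import Path

# Sketch: count entries in one of the generated files (path assumes output_dir above).
out = Path("training_output/data_router") / "sft_detection_patterns.jsonl"
with out.open() as fh:
    entries = [json.loads(line) for line in fh if line.strip()]
print(f"{len(entries)} defensive SFT entries")
```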
### RouterEngine
Multi-model analysis cascade.
```python
from engines.router import RouterEngine

router = RouterEngine(config)
# Tier 1: LocalCopilotEngine (7B)
# Tier 2: LocalTeacherEngine (32B)
# Tier 3: AnthropicEngine (API)
# Tier 4: WebChatEngine (fallback)
```
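The tier comments above describe the cascade's shape but not its control flow. The sketch below shows one generic escalation pattern under stated assumptions: the `analyze` method, the per-result `confidence` field, and the threshold are hypothetical, not the documented RouterEngine interface.

```python
# Hypothetical escalation loop; RouterEngine's real interface may differ.
def cascade(engines, event, min_confidence=0.75):
    """Try cheaper tiers first, escalating only while confidence stays low."""
    result = None
    for engine in engines:  # ordered: 7B copilot → 32B teacher → API → web fallback
        result = engine.analyze(event)
        if result.confidence >= min_confidence:
            break
    return result
```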
### QualityPipeline
Deduplication, filtering, and curriculum ordering.
```python
from pdx.training.quality.pipeline import QualityPipeline

qp = QualityPipeline(
    min_quality=0.3,
    dedup_threshold=0.85,
    min_tokens=50,
    max_tokens=2000,
)
clean = qp.run(entries, dedup=True, quality_filter=True, curriculum=True)
```
### TrainingPipeline
Processes `.pdx` binary files into training entries.
```python
from pdx.training.pipeline import TrainingPipeline

pipeline = TrainingPipeline(hmac_key=None, max_age_days=90)
pipeline.load_session("path/to/session/")
entries = pipeline.entries
# entries: list of training-ready dicts
```
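Assuming QualityPipeline accepts the same entry dicts that TrainingPipeline produces, the two stages can be chained; the session path below is a placeholder.

```python
from pdx.training.pipeline import TrainingPipeline
from pdx.training.quality.pipeline import QualityPipeline

# Decode .pdx sessions, then dedupe, filter, and curriculum-order the entries.
pipeline = TrainingPipeline(hmac_key=None, max_age_days=90)
pipeline.load_session("path/to/session/")  # placeholder path

qp = QualityPipeline(min_quality=0.3, dedup_threshold=0.85, min_tokens=50, max_tokens=2000)
clean = qp.run(pipeline.entries, dedup=True, quality_filter=True, curriculum=True)
print(f"kept {len(clean)} of {len(pipeline.entries)} entries")
```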
## Data formats
### JSONL event (HYDRA output)
```json
{
  "timestamp": "2026-04-06T13:37:38.000Z",
  "session_id": "a92f516c",
  "event_type": "command_executed",
  "client_ip": "185.213.154.248",
  "data": {
    "command": "cat /etc/shadow",
    "output_preview": "root:$6$...",
    "source": "llm",
    "latency_ms": 342,
    "exit_code": 0,
    "cwd": "/root",
    "mitre_tags": [{"tactic": "credential-access", "technique_id": "T1003", "confidence": 0.95}],
    "prompt_guard": {"score": 0.1, "action": "none"}
  }
}
```
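Because each log line is a standalone JSON object, events can be filtered without any HYDRA-specific tooling; the log filename and the credential-access filter below are illustrative.

```python
import json
from pathlib import Path

# Sketch: pull credential-access command events from a HYDRA log (illustrative path).
hits = []
with Path("path/to/logs/session.jsonl").open() as fh:
    for line in fh:
        event = json.loads(line)
        data = event.get("data", {})
        tactics = {tag["tactic"] for tag in data.get("mitre_tags", [])}
        if event["event_type"] == "command_executed" and "credential-access" in tactics:
            hits.append((event["session_id"], data["command"]))
```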