{
  "id": "anthropic-sitemap:research:probes-catch-sleeper-agents",
  "type": "article",
  "title": "Simple probes can catch sleeper agents",
  "abstract": "This “Alignment Note” presents some early-stage research from the Anthropic Alignment Science team following up on our recent “Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training” paper. It should be treated as a work-in-progress update, and is intended for a more technical audience than our typical blog post. This research makes use of some simple interpretability techniques, and we expect to share more results from collaborations between our Alignment and Interpretability teams soon.",
  "issued": {
    "date-parts": [
      [
        2024,
        4,
        23
      ]
    ]
  },
  "URL": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
  "publisher": "Anthropic",
  "source": "vendor/anthropic-sitemap/research/probes-catch-sleeper-agents.md"
}
