{
  "id": "anthropic-sitemap:research:auditing-hidden-objectives",
  "type": "article",
  "title": "Auditing language models for hidden objectives",
  "abstract": "_A new paper from the Anthropic Alignment Science and Interpretability teams studies **alignment audits**—systematic investigations into whether models are pursuing hidden objectives. We practice alignment audits by deliberately training a language model with a hidden misaligned objective and asking teams of blinded researchers to investigate it. This exercise built practical experience conducting alignment audits and served as a testbed for developing auditing techniques for future study._",
  "issued": {
    "date-parts": [
      [
        2025,
        3,
        13
      ]
    ]
  },
  "URL": "https://www.anthropic.com/research/auditing-hidden-objectives",
  "publisher": "Anthropic",
  "source": "vendor/anthropic-sitemap/research/auditing-hidden-objectives.md"
}