{
  "id": "anthropic-sitemap:research:diff-tool",
  "type": "article",
  "title": "A “diff” tool for AI: Finding behavioral differences in new models",
  "abstract": "Every time a new AI model is released, its developers run a suite of evaluations to measure its performance and safety. These tests are essential, but they are somewhat limited. Because these benchmarks are human-authored, they can only test for risks we have already conceptualized and learned to measure.",
  "issued": {
    "date-parts": [
      [
        2026,
        3,
        13
      ]
    ]
  },
  "URL": "https://www.anthropic.com/research/diff-tool",
  "publisher": "Anthropic",
  "source": "vendor/anthropic-sitemap/research/diff-tool.md"
}