{
  "id": "anthropic-sitemap:research:influence-functions",
  "type": "article",
  "title": "Tracing Model Outputs to the Training Data",
  "abstract": "As large language models become more powerful and their risks become clearer, there is increasing value to figuring out what makes them tick. In our previous work, we have found that large language models change along many personality and behavioral dimensions as a function of both scale and the amount of fine-tuning. Understanding these changes requires seeing how models work, for instance to determine if a model’s outputs rely on memorization or more sophisticated processing. Understanding the inner workings of language models will have substantial implications for forecasting AI capabilities as well as for approaches to aligning AI systems with human preferences.",
  "issued": {
    "date-parts": [
      [
        2023,
        8,
        8
      ]
    ]
  },
  "URL": "https://www.anthropic.com/research/influence-functions",
  "publisher": "Anthropic",
  "source": "vendor/anthropic-sitemap/research/influence-functions.md"
}
