{
  "id": "anthropic-sitemap:research:evaluating-feature-steering",
  "type": "article",
  "title": "Evaluating feature steering: A case study in mitigating social biases",
  "abstract": "A few months ago, we published an interpretability paper demonstrating our ability to learn interpretable features that correspond to various concepts (e.g., famous individuals, types of computer code, etc.) represented in Claude 3 Sonnet. To verify our feature interpretations, we ran qualitative feature steering experiments, where we artificially dialed up and down various features to see if they changed model outputs in intuitive ways. The results were promising – for example, turning up a feature that responded to mentions of the Golden Gate Bridge made the model talk about the Golden Gate Bridge. Such examples led us to hypothesize that feature steering might be a promising way to modify model outputs in specific interpretable ways.",
  "issued": {
    "date-parts": [
      [
        2024,
        10,
        25
      ]
    ]
  },
  "URL": "https://www.anthropic.com/research/evaluating-feature-steering",
  "publisher": "Anthropic",
  "source": "vendor/anthropic-sitemap/research/evaluating-feature-steering.md"
}
