{
  "kind": "answer",
  "studySlug": "model-divergence",
  "slug": "which-metrics-best-summarize-cross-model-disagreement",
  "title": "Which metrics best summarize cross-model disagreement?",
  "description": "The clearest summary metrics are average agreement, perfect agreement, and the share of high-divergence prompts. In this study those land at 43.3%, 4.0%, and 14.6% respectively.",
  "lastUpdated": "2026-03-11",
  "lastTested": "2026-03-11",
  "sourceStudyUrl": "/trakkr-research/model-divergence",
  "sourceStudyTitle": "Same Question, Different AI, Different Answers",
  "claimIds": [
    "model-divergence:avg-agreement",
    "model-divergence:perfect-agreement",
    "model-divergence:high-divergence"
  ],
  "relatedSlugs": [
    "answer:what-should-brands-do-when-models-disagree",
    "answer:why-are-comparison-queries-the-most-stable-query-class",
    "fact:only-four-percent-of-prompts-produce-perfect-consensus",
    "tracker:cross-model-consensus-tracker"
  ],
  "methodologySummary": "Built from 797,644 valid comparisons across 44,088 reports and 8 models, covering 6,439,133 model responses in the observed window.",
  "limitations": [
    "Agreement is measured across recommendation outputs, not across hidden reasoning or retrieval context.",
    "Average agreement can hide large differences between query classes and model pairs.",
    "The study measures overlap, not which answer was objectively “right”."
  ],
  "keywords": [
    "model divergence",
    "AI agreement",
    "ChatGPT vs Claude",
    "Gemini vs Perplexity",
    "divergence metrics",
    "AI agreement benchmark"
  ],
  "schemaHints": {
    "pageType": "Article",
    "includeDataset": true
  },
  "question": "Which metrics best summarize cross-model disagreement?",
  "directAnswer": "The clearest summary metrics are average agreement, perfect agreement, and the share of high-divergence prompts. In this study those land at 43.3%, 4.0%, and 14.6% respectively.",
  "answerSummary": "Together, these three metrics show the center (average agreement), the consensus extreme (perfect agreement), and the risk tail (high-divergence prompts) of cross-model recommendation behavior.",
  "keyFacts": [
    {
      "label": "Average agreement",
      "value": "43.3%",
      "detail": "Mean cross-model agreement rate.",
      "claimId": "model-divergence:avg-agreement"
    },
    {
      "label": "Perfect agreement",
      "value": "4.0%",
      "detail": "Only a small share of prompts produce unanimous outcomes.",
      "claimId": "model-divergence:perfect-agreement"
    },
    {
      "label": "High divergence rate",
      "value": "14.6%",
      "detail": "Prompts in the 0-25% agreement bucket.",
      "claimId": "model-divergence:high-divergence"
    }
  ],
  "evidenceTable": [
    {
      "label": "Average agreement",
      "value": "43.3%",
      "note": "Mean cross-model agreement rate."
    },
    {
      "label": "Perfect agreement",
      "value": "4.0%",
      "note": "Only a small share of prompts produce unanimous outcomes."
    },
    {
      "label": "High divergence rate",
      "value": "14.6%",
      "note": "Prompts in the 0-25% agreement bucket."
    }
  ],
  "whyItMatters": "This answer matters because it turns a study finding into an operating rule teams can use when they decide what to publish, refresh, or measure next.",
  "whatToDo": [
    "Track visibility across multiple models instead of using one platform as a proxy for the whole market.",
    "Prioritize query classes where disagreement is highest because that is where share can move fastest.",
    "Treat consensus as a benchmark, but treat divergence as the operating reality."
  ],
  "faqs": [
    {
      "question": "Which metrics best summarize cross-model disagreement?",
      "answer": "The clearest summary metrics are average agreement, perfect agreement, and the share of high-divergence prompts. In this study those land at 43.3%, 4.0%, and 14.6% respectively."
    },
    {
      "question": "Which numbers from Same Question, Different AI, Different Answers matter most here?",
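      "answer": "Three numbers matter most. Average agreement is 43.3%, the mean cross-model agreement rate. Perfect agreement is 4.0%: only a small share of prompts produce unanimous outcomes. The high-divergence rate is 14.6%, the share of prompts in the 0-25% agreement bucket."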
    },
    {
      "question": "What should a team do next?",
      "answer": "Track visibility across multiple models instead of using one platform as a proxy for the whole market. Prioritize query classes where disagreement is highest because that is where share can move fastest. Treat consensus as a benchmark, but treat divergence as the operating reality."
    }
  ]
}
