// /benchmark/lib/methodology.jsx — two flavours:
//   HubMethodology   — general rules that apply to every batch.
//   BatchMethodology — this-batch fingerprint + raw-data download.
// FAQ component is shared.
//
// Framing note: we don't ship a one-command repro harness. Verification is
// (a) read the methodology, (b) download the raw JSON and re-run your own
// cost calc, (c) run WebArena upstream against your own agent. That's
// what these cards point at. Nothing claims "git clone, make bench."

function HubMethodology() {
  const latest = window.BATCHES.list[0];
  // Point per-run JSON at the latest batch's check-our-work anchor.
  // Until a published batch has its raw data hosted, fall back to the
  // detail page root.
  const perRunHref = latest ? (latest.detailPath + '#reproduce') : '#';
  const upstreamHref = 'https://github.com/web-arena-x/webarena';
  return (
    <div className="method-grid" id="methodology">
      <div className="method-card">
        <h3>The rules we apply on every batch</h3>
        <dl>
          <dt>Suite</dt>
          <dd>WebArena <span className="muted">(public agent benchmark using a real Magento admin)</span></dd>

          <dt>Tasks</dt>
          <dd>A handful of lookup tasks per batch. <span className="muted">A deliberate mix of easy and hard so model variants span the full pass-rate range.</span></dd>

          <dt>Reps</dt>
          <dd>3 runs per task per model variant. <span className="muted">Models are non-deterministic. One run isn’t enough.</span></dd>

          <dt>Matched pair</dt>
          <dd>Every model is run twice. Once with the codec on, once with it off. <span className="muted">Relative claims are apples-to-apples.</span></dd>

          <dt>Token accounting</dt>
          <dd>Provider-native counts. <span className="muted">What the model provider sends back with each response, and what they bill on. Not an estimator.</span></dd>

          <dt>Cost basis</dt>
          <dd>Public list price for the model used, as of the batch’s pricing date. <span className="muted">Vertex for Gemini, Anthropic for Sonnet.</span></dd>

          <dt>Codec cost</dt>
          <dd>Codec runtime is added to the codec-on rows. <span className="muted">It never disappears into the headline number.</span></dd>

          <dt>Scoring</dt>
          <dd>Benchmark’s built-in deterministic checker. <span className="muted">No human judgment. No LLM-as-judge.</span></dd>

          <dt>Privacy</dt>
          <dd>Personal data is redacted client-side before snapshots leave the network. <span className="muted">Private by default. A deliberate per-task opt-out is on the roadmap.</span></dd>

          <dt>Publication</dt>
          <dd>Every batch keeps its own page. <span className="muted">Older claims stay verifiable when the codec ships a new version.</span></dd>
        </dl>
      </div>

      <div className="method-card reproduce-card">
        <h3>How to check our work</h3>
        <p style={{margin:'0 0 16px', color:'var(--fg-1)', fontSize:15, lineHeight:1.7, fontWeight:300}}>
          We don’t ship a one-command repro harness. We ship the raw output
          of every run so you can re-add the numbers yourself, and we point at
          the public benchmark so you can run it against your own agent.
        </p>
        <div className="files">
          <a className="file" href={perRunHref} style={{textDecoration:'none', color:'inherit'}}>
            <span>Per-run JSON for every batch <span style={{color:'var(--fg-3)'}}>(on each batch page)</span></span>
            <span className="size" style={{color:'var(--accent)'}}>open latest →</span>
          </a>
          <a className="file" href={upstreamHref} target="_blank" rel="noopener noreferrer" style={{textDecoration:'none', color:'inherit'}}>
            <span>WebArena, upstream <span style={{color:'var(--fg-3)'}}>(public benchmark)</span></span>
            <span className="size" style={{color:'var(--accent)'}}>github →</span>
          </a>
          <a className="file" href="/#bottom-cta" style={{textDecoration:'none', color:'inherit'}}>
            <span>The codec runs behind an API key <span style={{color:'var(--fg-3)'}}>(request access from the homepage)</span></span>
            <span className="size" style={{color:'var(--accent)'}}>request key →</span>
          </a>
        </div>
        <p style={{margin:'18px 0 0', color:'var(--fg-2)', fontSize:13, lineHeight:1.6}}>
          The most meaningful check is the one that doesn’t need anything from
          us. Install WebArena upstream, run it against your agent twice.
          Once bare, once through the codec. The shape of the results should
          match.
        </p>
      </div>
    </div>
  );
}

function BatchMethodology() {
  const m = window.BENCH.meta;
  const batch = window.BATCHES.bySlug(window.BATCH_SLUG);
  const ca = m.costAggregate;
  const rawDataUrl = batch && batch.rawDataUrl;
  // Derive the displayed filename from the URL itself so whatever the file
  // is called on the host is what appears on the page. Falls back to the
  // canonical name when no URL is set yet (upload-pending state).
  const fallbackName = `jdc-benchmark-results-${m.capturedDate}.tar.gz`;
  const displayName = rawDataUrl
    ? (rawDataUrl.split('?')[0].split('/').pop() || fallbackName)
    : fallbackName;
  return (
    <div className="method-grid">
      <div className="method-card">
        <h3>This batch’s fingerprint</h3>
        <dl>
          <dt>Captured</dt>
          <dd>{m.capturedDate}</dd>

          <dt>JDC version</dt>
          <dd>v{m.codecVersion}</dd>

          <dt>Suite preset</dt>
          <dd>{batch ? batch.preset : m.preset}</dd>

          <dt>Models</dt>
          <dd>{batch ? batch.results.length : m.nModels} tested</dd>

          <dt>Tasks</dt>
          <dd>{m.nTasksTotal} attempted <span className="muted">({ca.nTasks} in the cost aggregate, 1 privacy probe reported separately)</span></dd>

          <dt>Attempts per variant</dt>
          <dd>{ca.nPerCell} on the cost aggregate <span className="muted">({ca.nTasks} tasks × {m.nReps} reps)</span></dd>

          <dt>Total runs</dt>
          <dd>{m.totalAttempts} <span className="muted">({ca.totalAttempts} cost-aggregate, {m.privacySlice.totalAttempts} privacy-probe)</span></dd>

          <dt>Total spend</dt>
          <dd>${m.totalSpend.toFixed(2)} <span className="muted">across all runs</span></dd>

          <dt>Pricing applied</dt>
          <dd>Vertex AI list price, {m.pricingDate}</dd>

          <dt>Upstream commit</dt>
          <dd className="muted">WebArena {m.upstreamWebArenaSha.slice(0,8)}</dd>
        </dl>
      </div>

      <div className="method-card reproduce-card" id="reproduce">
        <h3>Check our work on this batch</h3>
        <p style={{margin:'0 0 16px', color:'var(--fg-1)', fontSize:15, lineHeight:1.7, fontWeight:300}}>
          The raw JSON for every run in this batch is downloadable below.
          Re-add the numbers with your own pricing assumptions, or diff against
          our published table. Both should line up.
        </p>
        <div className="files">
          {rawDataUrl ? (
            <a className="file" href={rawDataUrl} download style={{textDecoration:'none', color:'inherit'}}>
              <span>{displayName} <span style={{color:'var(--fg-3)'}}>({m.totalAttempts} per-attempt JSON, plus per-cell summaries)</span></span>
              <span className="size" style={{color:'var(--accent)'}}>download →</span>
            </a>
          ) : (
            <div className="file">
              <span>{displayName} <span style={{color:'var(--fg-3)'}}>({m.totalAttempts} per-attempt JSON, plus per-cell summaries)</span></span>
              <span className="size" style={{color:'var(--fg-3)'}}>upload pending</span>
            </div>
          )}
          <a className="file" href="/benchmark/#methodology" style={{textDecoration:'none', color:'inherit'}}>
            <span>How we ran it <span style={{color:'var(--fg-3)'}}>(methodology on the hub)</span></span>
            <span className="size" style={{color:'var(--accent)'}}>read →</span>
          </a>
          <a className="file" href="/#bottom-cta" style={{textDecoration:'none', color:'inherit'}}>
            <span>Run it against your own agent <span style={{color:'var(--fg-3)'}}>(API key on request)</span></span>
            <span className="size" style={{color:'var(--accent)'}}>request key →</span>
          </a>
        </div>
      </div>
    </div>
  );
}

function FAQItem({ item, open, onToggle }) {
  return (
    <div className={`faq-item ${open ? 'open' : ''}`}>
      <button className="faq-q" onClick={onToggle}>
        <span>{item.q}</span>
        <span className="chev">+</span>
      </button>
      <div className="faq-a"><div><p style={{margin:0}}>{item.a}</p></div></div>
    </div>
  );
}

function FAQ() {
  const [openIdx, setOpenIdx] = React.useState(0);
  const items = window.BENCH.faq;
  return (
    <div className="faq-list">
      {items.map((item, i) => (
        <FAQItem
          key={i}
          item={item}
          open={openIdx === i}
          onToggle={() => setOpenIdx(openIdx === i ? -1 : i)}
        />
      ))}
    </div>
  );
}

// Publish the three public components on `window`.
Object.assign(window, { HubMethodology, BatchMethodology, FAQ });
