Compare commits

...

27 Commits

Author SHA1 Message Date
Bryan Thompson
34dfeb7236 Add box plugin (box/box-for-ai) — first-party skills plugin for Box Platform integrations.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 15:05:19 -05:00
Bryan Thompson
b32879bf76 Add SAP CAP MCP Server plugin (cds-mcp) (#1328)
URL-source plugin pointing to cap-js/mcp-server, which already has
.claude-plugin/plugin.json and .mcp.json at repo root.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:30:37 +01:00
Tobin South
98c01d3fbf Add zoom-plugin to official marketplace (#1313)
Promotes zoom-plugin from community-internal (PR #1567, merged
2026-04-07) to official. Uses url source pointing at
github.com/zoom/zoom-plugin without a SHA pin to track upstream.

Ref: DIR-77, DIR-79
2026-04-08 23:42:03 +01:00
Tobin South
ce0166dde2 Add expo plugin to official marketplace (#1312)
Promotes expo from community to official. Uses git-subdir source
(expo/skills @ plugins/expo, ref main) without a SHA pin so it
tracks upstream. Description cleaned to remove the reviewer-note
preamble that leaked into the community entry.

Ref: DIR-77, DIR-79
2026-04-08 14:31:55 -07:00
Thariq Shihipar
62f2063abc Merge pull request #1293 from anthropics/thariq/session-report-plugin
Add session-report plugin
2026-04-08 09:35:14 -07:00
Thariq Shihipar
66ca8fc540 Sort session-report plugin into marketplace order 2026-04-08 09:31:35 -07:00
Thariq Shihipar
147ddf8ee3 Add session-report plugin
Generates an explorable HTML report of Claude Code session usage from
local ~/.claude/projects transcripts: total tokens, cache efficiency,
per-project/subagent/skill breakdowns, most expensive prompts with
transcript context, and cache breaks. Terminal-styled, single-file
output with sortable tables and expandable drill-downs.
2026-04-07 17:40:13 -07:00
Sarah Deaton
104d39be10 Merge pull request #885 from anthropics/sarah/restore-reload-plugins-readme
Restore /reload-plugins step in telegram/discord READMEs
2026-04-03 11:44:30 -07:00
Bryan Thompson
decc737a56 Add sonarqube-agent-plugins plugin (#1085)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 10:13:39 -05:00
Bryan Thompson
0484869680 remove(plugin-json): zoominfo — Cowork-only plugin (#1204)
ZoomInfo submitted with platform "Claude Cowork" on their form (2/24/2026)
and is correctly listed on knowledge-work-plugins. This entry was swept
into -official via staging merge PR #730 but should not be on the Claude
Code plugin marketplace.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 08:06:01 -05:00
Bryan Thompson
b091cb4179 Merge pull request #1188 from anthropics/tobinsouth-patch-1
Removing posthog pin
2026-03-31 17:15:14 -05:00
Tobin South
a54e5292a6 Removing posthog pin 2026-03-31 22:04:18 +01:00
Bryan Thompson
52e95f6756 Add mongodb plugin (#1095)
Official MongoDB plugin (MCP Server + Skills) from mongodb/agent-skills.
Partner escalation — submitted via PR #158, Forge, and Slack.
Already merged on -internal (PR #667).
2026-03-31 01:04:48 +01:00
Bryan Thompson
9ed16511d1 Add UI5 plugins from SAP (ui5 + ui5-typescript-conversion) (#1086)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 01:04:27 +01:00
Bryan Thompson
92e3c1ce6e Update postman plugin to latest version (#1080)
Bumps pinned SHA from 0714280 (Feb 20) to 40b11ac (Mar 26).
New commit adds private network search support.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 01:03:09 +01:00
Kenneth Lien
31e7200b33 Merge pull request #1055 from anthropics/kenneth/imessage-permission-selfchat-only
imessage: restrict permission relay to self-chat, fix echo filter & tapback noise
2026-03-30 14:08:48 -07:00
Tobin South
9d468adfb8 math-olympiad: housekeeping (#1172)
* math-olympiad: add LICENSE, marketplace entry, and prettier formatting

- Add Apache 2.0 LICENSE file
- Register plugin in marketplace.json
- Run prettier (prose-wrap=always, 80 cols) over all plugin markdown
- Simplify model tier naming in reference docs

🏠 Remote-Dev: homespace

* Update .claude-plugin/marketplace.json
2026-03-30 20:56:21 +01:00
Dickson Tsai
183a6ca35d Merge pull request #1115 from anthropics/dickson/bash-prefix-hooks
fix: invoke .sh hooks via bash prefix (fixes #993)
2026-03-28 22:21:14 -07:00
Dickson Tsai
986deab6a1 fix: invoke .sh hooks via bash prefix; add version for cache invalidation
Fixes #993 (Permission denied on hook scripts) without relying on
client-side +x preservation.

The hook executor spawns commands via /bin/sh -c, which requires +x
to execute a script directly. Prefixing with bash reads the script
as data — mode bits are irrelevant. This works on all Claude Code
versions, whereas the client-side chmod fix (claude-cli #24666) only
shipped in v2.1.86.

All 3 scripts declare #!/bin/bash and use bashisms ([[ ]], =~), so
bash (not sh) is the correct interpreter.

The version field forces a fresh cache path (1.0.0/ instead of
unknown/), ensuring the new hooks.json reaches users with stale
caches.
2026-03-28 08:40:34 -07:00
Kenneth Lien
22bd61d01f imessage: bump to 0.1.0 2026-03-26 23:43:51 -07:00
Kenneth Lien
60c3fc36ed imessage: drop SMS/RCS by default, opt-in via IMESSAGE_ALLOW_SMS
SMS sender IDs are spoofable; iMessage is Apple-ID-authenticated and
end-to-end encrypted. The plugin previously treated both identically,
so a forged SMS from the owner's own number would match SELF, bypass
the access gate, and inherit owner-level trust — including permission
approval.

handleInbound now drops anything with service != 'iMessage' unless
IMESSAGE_ALLOW_SMS=true. Default is the safe path; users who want SMS
can opt in after reading the warning in README.
2026-03-26 23:41:39 -07:00
Kenneth Lien
c4274521de imessage: trim comment cruft 2026-03-26 23:16:27 -07:00
Kenneth Lien
8dfc279258 imessage: harden echo filter normalization
The self-chat echo filter matches outbound text against what chat.db
stores on round-trip. Three divergence sources caused false negatives
and duplicate bubbles:

- Signature suffix: "\nSent by Claude" is appended on send, but the
  \n may not round-trip identically through attributedBody
- Emoji variation selectors (U+FE00-FE0F) and ZWJ (U+200D): chat.db
  can add or drop these on emoji characters
- Smart quotes: macOS auto-substitutes straight quotes on the way in

Strip/normalize all three in echoKey() before the existing whitespace
collapse.

Fixes #1024
2026-03-26 23:12:19 -07:00
Kenneth Lien
c29338f276 imessage: drop whitespace-only messages from tapbacks/receipts
Tapback reactions and read receipts synced from linked devices arrive
as chat.db rows with whitespace-only text. The existing empty-check
used a falsy comparison, which doesn't catch ' ' or invisible chars,
causing unsolicited replies to reaction taps.

Fixes #1041
2026-03-26 23:11:49 -07:00
Kenneth Lien
03a685d5f6 imessage: restrict permission relay to self-chat only
Permission prompts were being broadcast to all allowlisted contacts plus
every DM resolvable from the SELF address set. Two compounding bugs:

1. SELF was polluted by chat.last_addressed_handle, which on machines
   with SMS history returns short codes, business handles, and other
   contacts' numbers — not just the owner's addresses. One reporter's
   query returned 50 addresses (2 actually theirs) resolving to 148 DM
   chats, all of which received permission prompts.

2. Even with a clean SELF, the handler sent to allowFrom + SELF, so
   every allowlisted contact received the prompt and could reply to
   approve tool execution on the owner's machine.

Fix:
- Build SELF from message.account WHERE is_from_me=1 only
- Send permission prompts to self-chat only, not allowFrom
- Accept permission replies from self-chat only

Fixes #1048
Fixes #1010
2026-03-26 23:11:29 -07:00
Sarah Deaton
92d061553f Drop tab-complete check, keep just /reload-plugins line 2026-03-21 20:18:35 -07:00
Sarah Deaton
b4f0bdd93a Restore /reload-plugins step in telegram/discord READMEs
Partially reverts #758. The reload step is not redundant: the configure
skill runs before the restart step, so it is not loaded yet when the user
types /telegram:configure. CLI prints 'Run /reload-plugins to activate.'
after install (pluginInstallationHelpers.ts:529). Mintlify reports
confirm users hit 'Unknown skill: discord:configure' at step 3.
2026-03-21 20:17:33 -07:00
26 changed files with 2404 additions and 305 deletions

View File

@@ -147,6 +147,17 @@
},
"homepage": "https://github.com/awslabs/agent-plugins"
},
{
"name": "box",
"description": "Work with your Box content directly from Claude Code — search files, organize folders, collaborate with your team, and use Box AI to answer questions, summarize documents, and extract data without leaving your workflow.",
"category": "productivity",
"source": {
"source": "url",
"url": "https://github.com/box/box-for-ai.git",
"sha": "6f4ec3549f3e869b115628403555b1c9220b2b34"
},
"homepage": "https://github.com/box/box-for-ai"
},
{
"name": "brightdata-plugin",
"description": "Web scraping, Google search, structured data extraction, and MCP server integration powered by Bright Data. Includes 7 skills: scrape any webpage as markdown (with bot detection/CAPTCHA bypass), search Google with structured JSON results, extract data from 40+ websites (Amazon, LinkedIn, Instagram, TikTok, YouTube, and more), orchestrate Bright Data's 60+ MCP tools, built-in best practices for Web Unlocker, SERP API, Web Scraper API, and Browser API, Python SDK best practices for the brightda...",
@@ -157,6 +168,17 @@
},
"homepage": "https://docs.brightdata.com"
},
{
"name": "cds-mcp",
"description": "AI-assisted development of SAP Cloud Application Programming Model (CAP) projects. Search CDS models and CAP documentation.",
"category": "development",
"source": {
"source": "url",
"url": "https://github.com/cap-js/mcp-server.git",
"sha": "4d59d7070a52761a9b8028cbe710c8d7477cbc92"
},
"homepage": "https://cap.cloud.sap/"
},
{
"name": "chrome-devtools-mcp",
"description": "Control and inspect a live Chrome browser from your coding agent. Record performance traces, analyze network requests, check console messages with source-mapped stack traces, and automate browser actions with Puppeteer.",
@@ -384,6 +406,18 @@
"category": "learning",
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/plugins/explanatory-output-style"
},
{
"name": "expo",
"description": "Official Expo skills for building, deploying, upgrading, and debugging React Native apps with Expo. Covers UI development with Expo Router, SwiftUI and Jetpack Compose components, Tailwind CSS setup, API routes, data fetching, CI/CD workflows, App Store and Play Store deployment, SDK upgrades, DOM components, and dev client distribution.",
"category": "development",
"source": {
"source": "git-subdir",
"url": "expo/skills",
"path": "plugins/expo",
"ref": "main"
},
"homepage": "https://github.com/expo/skills/blob/main/plugins/expo/README.md"
},
{
"name": "fakechat",
"description": "Localhost web chat for testing the channel notification flow. No tokens, no access control, no third-party service.",
@@ -694,6 +728,17 @@
}
}
},
{
"name": "math-olympiad",
"description": "Solve competition math (IMO, Putnam, USAMO) with adversarial verification that catches what self-verification misses. Fresh-context verifiers attack proofs with specific failure patterns. Calibrated abstention over bluffing.",
"author": {
"name": "Anthropic",
"email": "support@anthropic.com"
},
"source": "./plugins/math-olympiad",
"category": "math",
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/math-olympiad"
},
{
"name": "mcp-server-dev",
"description": "Skills for designing and building MCP servers that work seamlessly with Claude. Guides you through deployment models (remote HTTP, MCPB, local), tool design patterns, auth, and interactive MCP apps.",
@@ -738,6 +783,17 @@
},
"homepage": "https://www.mintlify.com/"
},
{
"name": "mongodb",
"description": "Official Claude plugin for MongoDB (MCP Server + Skills). Connect to databases, explore data, manage collections, optimize queries, generate reliable code, implement best practices, develop advanced features, and more.",
"category": "database",
"source": {
"source": "url",
"url": "https://github.com/mongodb/agent-skills.git",
"sha": "c47079f65e88a113c52d1ce0618684cef300246c"
},
"homepage": "https://www.mongodb.com/docs/mcp-server/overview/"
},
{
"name": "neon",
"description": "Manage your Neon projects and databases with the neon-postgres agent skill and the Neon MCP Server.",
@@ -901,8 +957,7 @@
"category": "monitoring",
"source": {
"source": "url",
"url": "https://github.com/PostHog/ai-plugin.git",
"sha": "f2f37954ecef9f1afce4fa81b6a612454a96c410"
"url": "https://github.com/PostHog/ai-plugin.git"
},
"homepage": "https://posthog.com/docs/model-context-protocol"
},
@@ -923,7 +978,7 @@
"source": {
"source": "url",
"url": "https://github.com/Postman-Devrel/postman-claude-code-plugin.git",
"sha": "0714280351c1a137e79aad465a66730511ffbd57"
"sha": "40b11ac3466c500cf4625ac016d5c01cd00046f4"
},
"homepage": "https://learning.postman.com/docs/developer/postman-mcp-server/"
},
@@ -1158,6 +1213,17 @@
"community-managed"
]
},
{
"name": "session-report",
"description": "Generate an explorable HTML report of Claude Code session usage — tokens, cache efficiency, subagents, skills, and the most expensive prompts — from local ~/.claude/projects transcripts.",
"author": {
"name": "Anthropic",
"email": "support@anthropic.com"
},
"source": "./plugins/session-report",
"category": "productivity",
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/session-report"
},
{
"name": "skill-creator",
"description": "Create new skills, improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, or benchmark skill performance with variance analysis.",
@@ -1179,6 +1245,17 @@
},
"homepage": "https://github.com/slackapi/slack-mcp-plugin/tree/main"
},
{
"name": "sonarqube-agent-plugins",
"description": "Integrate SonarQube code quality and security analysis into Claude Code: namespaced slash commands, a guided skill to setup the SonarQube CLI, and a startup check for CLI wiring. MCP server registration and secrets-scanning hooks are installed by the SonarQube CLI as part of setup.",
"category": "security",
"source": {
"source": "url",
"url": "https://github.com/SonarSource/sonarqube-agent-plugins.git",
"sha": "0cae644cee9318e6245b62ca779abdc60e6daa49"
},
"homepage": "https://github.com/SonarSource/sonarqube-agent-plugins"
},
{
"name": "sonatype-guide",
"description": "Sonatype Guide MCP server for software supply chain intelligence and dependency security. Analyze dependencies for vulnerabilities, get secure version recommendations, and check component quality metrics.",
@@ -1331,6 +1408,32 @@
}
}
},
{
"name": "ui5",
"description": "SAPUI5 / OpenUI5 plugin for Claude. Create and validate UI5 projects, access API documentation, run UI5 linter, get development guidelines and best practices for UI5 development.",
"category": "development",
"source": {
"source": "git-subdir",
"url": "UI5/plugins-claude",
"path": "plugins/ui5",
"ref": "main",
"sha": "5070dfc1cef711d6efad40beb43750027039d71f"
},
"homepage": "https://github.com/UI5/plugins-claude"
},
{
"name": "ui5-typescript-conversion",
"description": "SAPUI5 / OpenUI5 plugin for Claude. Convert JavaScript based UI5 projects to TypeScript.",
"category": "development",
"source": {
"source": "git-subdir",
"url": "UI5/plugins-claude",
"path": "plugins/ui5-typescript-conversion",
"ref": "main",
"sha": "5070dfc1cef711d6efad40beb43750027039d71f"
},
"homepage": "https://github.com/UI5/plugins-claude"
},
{
"name": "vercel",
"description": "Vercel deployment platform integration. Manage deployments, check build status, access logs, configure domains, and control your frontend infrastructure directly from Claude Code.",
@@ -1386,15 +1489,14 @@
"homepage": "https://github.com/zapier/zapier-mcp/tree/main/plugins/zapier"
},
{
"name": "zoominfo",
"description": "Search companies and contacts, enrich leads, find lookalikes, and get AI-ranked contact recommendations. Pre-built skills chain multiple ZoomInfo tools into complete B2B sales workflows.",
"category": "productivity",
"name": "zoom-plugin",
"description": "Claude plugin for planning, building, and debugging Zoom integrations across REST APIs, SDKs, webhooks, bots, and MCP workflows.",
"category": "development",
"source": {
"source": "url",
"url": "https://github.com/Zoominfo/zoominfo-mcp-plugin.git",
"sha": "0705316ef8a2d0c64f81e50d4612ccc6a74edf03"
"url": "https://github.com/zoom/zoom-plugin.git"
},
"homepage": "https://zoominfo.com"
"homepage": "https://developers.zoom.us/"
}
]
}

View File

@@ -47,6 +47,7 @@ These are Claude Code commands — run `claude` to start a session first.
Install the plugin:
```
/plugin install discord@claude-plugins-official
/reload-plugins
```
**5. Give the server the token.**

View File

@@ -1,7 +1,7 @@
{
"name": "imessage",
"description": "iMessage channel for Claude Code \u2014 reads chat.db directly, sends via AppleScript. Built-in access control; manage pairing, allowlists, and policy via /imessage:access.",
"version": "0.0.1",
"version": "0.1.0",
"keywords": [
"imessage",
"messaging",

View File

@@ -62,6 +62,7 @@ Handles are phone numbers (`+15551234567`) or Apple ID emails (`them@icloud.com`
| Variable | Default | Effect |
| --- | --- | --- |
| `IMESSAGE_APPEND_SIGNATURE` | `true` | Appends `\nSent by Claude` to outbound messages. Set to `false` to disable. |
| `IMESSAGE_ALLOW_SMS` | `false` | Accept inbound SMS/RCS in addition to iMessage. **Off by default because SMS sender IDs are spoofable** — a forged SMS from your own number would otherwise bypass access control. Only enable if you understand the risk. |
| `IMESSAGE_ACCESS_MODE` | — | Set to `static` to disable runtime pairing and read `access.json` only. |
| `IMESSAGE_STATE_DIR` | `~/.claude/channels/imessage` | Override where `access.json` and pairing state live. |

View File

@@ -1,6 +1,6 @@
{
"name": "claude-channel-imessage",
"version": "0.0.1",
"version": "0.1.0",
"license": "Apache-2.0",
"type": "module",
"bin": "./server.ts",

View File

@@ -32,6 +32,10 @@ import { join, basename, sep } from 'path'
const STATIC = process.env.IMESSAGE_ACCESS_MODE === 'static'
const APPEND_SIGNATURE = process.env.IMESSAGE_APPEND_SIGNATURE !== 'false'
// SMS sender IDs are spoofable; iMessage is Apple-ID-authenticated. Default
// drops SMS/RCS so a forged sender can't reach the gate. Opt in only if you
// understand the risk.
const ALLOW_SMS = process.env.IMESSAGE_ALLOW_SMS === 'true'
const SIGNATURE = '\nSent by Claude'
const CHAT_DB =
process.env.IMESSAGE_DB_PATH ?? join(homedir(), 'Library', 'Messages', 'chat.db')
@@ -105,6 +109,7 @@ type Row = {
date: number
is_from_me: number
cache_has_attachments: number
service: string | null
handle_id: string | null
chat_guid: string
chat_style: number | null
@@ -114,7 +119,7 @@ const qWatermark = db.query<{ max: number | null }, []>('SELECT MAX(ROWID) AS ma
const qPoll = db.query<Row, [number]>(`
SELECT m.ROWID AS rowid, m.guid, m.text, m.attributedBody, m.date, m.is_from_me,
m.cache_has_attachments, h.id AS handle_id, c.guid AS chat_guid, c.style AS chat_style
m.cache_has_attachments, m.service, h.id AS handle_id, c.guid AS chat_guid, c.style AS chat_style
FROM message m
JOIN chat_message_join cmj ON cmj.message_id = m.ROWID
JOIN chat c ON c.ROWID = cmj.chat_id
@@ -125,7 +130,7 @@ const qPoll = db.query<Row, [number]>(`
const qHistory = db.query<Row, [string, number]>(`
SELECT m.ROWID AS rowid, m.guid, m.text, m.attributedBody, m.date, m.is_from_me,
m.cache_has_attachments, h.id AS handle_id, c.guid AS chat_guid, c.style AS chat_style
m.cache_has_attachments, m.service, h.id AS handle_id, c.guid AS chat_guid, c.style AS chat_style
FROM message m
JOIN chat_message_join cmj ON cmj.message_id = m.ROWID
JOIN chat c ON c.ROWID = cmj.chat_id
@@ -165,12 +170,10 @@ const qAttachments = db.query<AttRow, [number]>(`
WHERE maj.message_id = ?
`)
// Your own addresses. message.account ("E:you@icloud.com" / "p:+1555...") is
// the identity you sent *from* on each row — but an Apple ID can be reachable
// at both an email and a phone, and account only shows whichever you sent
// from. chat.last_addressed_handle covers the rest: it's the per-chat "which
// of your addresses reaches this person" field, so it accumulates every
// identity you've actually used. Union both.
// Your own addresses, from message.account ("E:you@icloud.com" / "p:+1555...")
// on rows you sent. Don't supplement with chat.last_addressed_handle — on
// machines with SMS history that column is polluted with short codes and
// other people's numbers, not just your own identities.
const SELF = new Set<string>()
{
type R = { addr: string }
@@ -178,9 +181,6 @@ const SELF = new Set<string>()
for (const { addr } of db.query<R, []>(
`SELECT DISTINCT account AS addr FROM message WHERE is_from_me = 1 AND account IS NOT NULL AND account != '' LIMIT 50`,
).all()) SELF.add(norm(addr))
for (const { addr } of db.query<R, []>(
`SELECT DISTINCT last_addressed_handle AS addr FROM chat WHERE last_addressed_handle IS NOT NULL AND last_addressed_handle != '' LIMIT 50`,
).all()) SELF.add(norm(addr))
}
process.stderr.write(`imessage channel: self-chat addresses: ${[...SELF].join(', ') || '(none)'}\n`)
@@ -432,7 +432,14 @@ const ECHO_WINDOW_MS = 15000
const echo = new Map<string, number>()
function echoKey(raw: string): string {
return raw.trim().replace(/\s+/g, ' ').slice(0, 120)
return raw
.replace(/\s*Sent by Claude\s*$/, '')
.replace(/[\u200d\ufe00-\ufe0f]/g, '') // ZWJ + variation selectors — chat.db is inconsistent about these
.replace(/[\u2018\u2019]/g, "'")
.replace(/[\u201c\u201d]/g, '"')
.trim()
.replace(/\s+/g, ' ')
.slice(0, 120)
}
function trackEcho(chatGuid: string, key: string): void {
@@ -540,11 +547,10 @@ const mcp = new Server(
tools: {},
experimental: {
'claude/channel': {},
// Permission-relay opt-in (anthropics/claude-cli-internal#23061).
// Declaring this asserts we authenticate the replier — which we do:
// gate()/access.allowFrom already drops non-allowlisted senders before
// handleInbound delivers. Self-chat is the owner by definition. A
// server that can't authenticate the replier should NOT declare this.
// Permission-relay opt-in. Declaring this asserts we authenticate the
// replier — which we do: prompts go to self-chat only and replies are
// accepted from self-chat only (see handleInbound). A server that
// can't authenticate the replier should NOT declare this.
'claude/channel/permission': {},
},
},
@@ -562,11 +568,9 @@ const mcp = new Server(
},
)
// Receive permission_request from CC → format → send to all allowlisted DMs.
// Groups are intentionally excluded — the security thread resolution was
// "single-user mode for official plugins." Anyone in access.allowFrom
// already passed explicit pairing; group members haven't. Self-chat is
// always included (owner).
// Permission prompts go to self-chat only. A "yes" grants tool execution on
// this machine — that authority is the owner's alone, not allowlisted
// contacts'.
mcp.setNotificationHandler(
z.object({
method: z.literal('notifications/claude/channel/permission_request'),
@@ -579,7 +583,6 @@ mcp.setNotificationHandler(
}),
async ({ params }) => {
const { request_id, tool_name, description, input_preview } = params
const access = loadAccess()
// input_preview is unbearably long for Write/Edit; show only for Bash
// where the command itself is the dangerous part.
const preview = tool_name === 'Bash' ? `${input_preview}\n\n` : '\n'
@@ -588,14 +591,17 @@ mcp.setNotificationHandler(
`${tool_name}: ${description}\n` +
preview +
`Reply "yes ${request_id}" to allow or "no ${request_id}" to deny.`
// allowFrom holds handle IDs, not chat GUIDs — resolve via qChatsForHandle.
// Include SELF addresses so the owner's self-chat gets the prompt even
// when allowFrom is empty (default config).
const handles = new Set([...access.allowFrom.map(h => h.toLowerCase()), ...SELF])
const targets = new Set<string>()
for (const h of handles) {
for (const h of SELF) {
for (const { guid } of qChatsForHandle.all(h)) targets.add(guid)
}
if (targets.size === 0) {
process.stderr.write(
`imessage channel: permission_request ${request_id} not relayed — no self-chat found. ` +
`Send yourself an iMessage to create one.\n`,
)
return
}
for (const guid of targets) {
const err = sendText(guid, text)
if (err) {
@@ -770,6 +776,7 @@ function expandTilde(p: string): string {
function handleInbound(r: Row): void {
if (!r.chat_guid) return
if (!ALLOW_SMS && r.service !== 'iMessage') return
// style 45 = DM, 43 = group. Drop unknowns rather than risk routing a
// group message through the DM gate and leaking a pairing code.
@@ -781,7 +788,9 @@ function handleInbound(r: Row): void {
const text = messageText(r)
const hasAttachments = r.cache_has_attachments === 1
if (!text && !hasAttachments) return
// trim() catches tapbacks/receipts synced from other devices — those land
// as whitespace-only rows.
if (!text.trim() && !hasAttachments) return
// Never deliver our own sends. In self-chat the is_from_me=1 rows are empty
// sent-receipts anyway — the content lands on the is_from_me=0 copy below.
@@ -817,12 +826,9 @@ function handleInbound(r: Row): void {
}
}
// Permission-reply intercept: if this looks like "yes xxxxx" for a
// pending permission request, emit the structured event instead of
// relaying as chat. The sender is already gate()-approved at this point
// (non-allowlisted senders were dropped above; self-chat is the owner),
// so we trust the reply.
const permMatch = PERMISSION_REPLY_RE.exec(text)
// Permission replies: emit the structured event instead of relaying as
// chat. Owner-only — same gate as the send side.
const permMatch = isSelfChat ? PERMISSION_REPLY_RE.exec(text) : null
if (permMatch) {
void mcp.notification({
method: 'notifications/claude/channel/permission',

View File

@@ -27,6 +27,7 @@ These are Claude Code commands — run `claude` to start a session first.
Install the plugin:
```
/plugin install telegram@claude-plugins-official
/reload-plugins
```
**3. Give the server the token.**

View File

@@ -1,5 +1,6 @@
{
"name": "explanatory-output-style",
"version": "1.0.0",
"description": "Adds educational insights about implementation choices and codebase patterns (mimics the deprecated Explanatory output style)",
"author": {
"name": "Anthropic",

View File

@@ -6,7 +6,7 @@
"hooks": [
{
"type": "command",
"command": "${CLAUDE_PLUGIN_ROOT}/hooks-handlers/session-start.sh"
"command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks-handlers/session-start.sh\""
}
]
}

View File

@@ -1,5 +1,6 @@
{
"name": "learning-output-style",
"version": "1.0.0",
"description": "Interactive learning mode that requests meaningful code contributions at decision points (mimics the unshipped Learning output style)",
"author": {
"name": "Anthropic",

View File

@@ -6,7 +6,7 @@
"hooks": [
{
"type": "command",
"command": "${CLAUDE_PLUGIN_ROOT}/hooks-handlers/session-start.sh"
"command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks-handlers/session-start.sh\""
}
]
}

View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -4,18 +4,25 @@ Competition math solver with adversarial verification.
## The problem
Self-verification gets fooled. A verifier that sees the reasoning is biased toward agreement. arXiv:2503.21934 ("Proof or Bluff") showed 85.7% self-verified IMO success drops to <5% under human grading.
Self-verification gets fooled. A verifier that sees the reasoning is biased
toward agreement. arXiv:2503.21934 ("Proof or Bluff") showed 85.7% self-verified
IMO success drops to <5% under human grading.
## The approach
- **Context-isolated verification**: verifier sees only the clean proof, never the reasoning trace
- **Pattern-armed adversarial checks**: not "is this correct?" but "does this accidentally prove RH?" / "extract the general lemma, find a 2×2 counterexample"
- **Context-isolated verification**: verifier sees only the clean proof, never
the reasoning trace
- **Pattern-armed adversarial checks**: not "is this correct?" but "does this
accidentally prove RH?" / "extract the general lemma, find a 2×2
counterexample"
- **Calibrated abstention**: says "no confident solution" rather than bluff
- **Presentation pass**: produces clean LaTeX/PDF after verification passes
## Validation
17/18 IMO+Putnam 2025 problems solved, 0 false positives, 2 novel proofs found. See the skill's eval data in the [anthropic monorepo](https://github.com/anthropics/anthropic/tree/staging/sandbox/sandbox/ralph/math_skills/eval_harness).
17/18 IMO+Putnam 2025 problems solved, 0 false positives, 2 novel proofs found.
See the skill's eval data in the
[anthropic monorepo](https://github.com/anthropics/anthropic/tree/staging/sandbox/sandbox/ralph/math_skills/eval_harness).
## Install
@@ -29,4 +36,5 @@ Self-verification gets fooled. A verifier that sees the reasoning is biased towa
> Solve this IMO problem: [statement]
```
The skill auto-triggers on "IMO", "Putnam", "olympiad", "verify this proof", etc.
The skill auto-triggers on "IMO", "Putnam", "olympiad", "verify this proof",
etc.

View File

@@ -1,6 +1,16 @@
---
name: math-olympiad
description: "Solve competition math problems (IMO, Putnam, USAMO, AIME) with adversarial verification that catches the errors self-verification misses. Activates when asked to 'solve this IMO problem', 'prove this olympiad inequality', 'verify this competition proof', 'find a counterexample', 'is this proof correct', or for any problem with 'IMO', 'Putnam', 'USAMO', 'olympiad', or 'competition math' in it. Uses pure reasoning (no tools) — then a fresh-context adversarial verifier attacks the proof using specific failure patterns, not generic 'check logic'. Outputs calibrated confidence — will say 'no confident solution' rather than bluff. If LaTeX is available, produces a clean PDF after verification passes."
description:
"Solve competition math problems (IMO, Putnam, USAMO, AIME) with adversarial
verification that catches the errors self-verification misses. Activates when
asked to 'solve this IMO problem', 'prove this olympiad inequality', 'verify
this competition proof', 'find a counterexample', 'is this proof correct', or
for any problem with 'IMO', 'Putnam', 'USAMO', 'olympiad', or 'competition
math' in it. Uses pure reasoning (no tools) — then a fresh-context adversarial
verifier attacks the proof using specific failure patterns, not generic 'check
logic'. Outputs calibrated confidence — will say 'no confident solution'
rather than bluff. If LaTeX is available, produces a clean PDF after
verification passes."
version: 0.1.0
---
@@ -8,35 +18,51 @@ version: 0.1.0
## The five things that change outcomes
1. **Strip thinking before verifying** — a verifier that sees the reasoning is biased toward agreement. Fresh context, cleaned proof only.
2. **"Does this prove RH?"** — if your theorem's specialization to ζ is a famous open problem, you have a gap. Most reliable red flag.
3. **Short proof → extract the general lemma** — try 2×2 counterexamples. If general form is false, find what's special about THIS instance.
4. **Same gap twice → step back** — the case split may be obscuring a unified argument. Three lines sometimes does what twelve pages couldn't.
5. **Say "no confident solution"** — wrong-and-confident is worse than honest abstain.
1. **Strip thinking before verifying** — a verifier that sees the reasoning is
biased toward agreement. Fresh context, cleaned proof only.
2. **"Does this prove RH?"** — if your theorem's specialization to ζ is a famous
open problem, you have a gap. Most reliable red flag.
3. **Short proof → extract the general lemma** — try 2×2 counterexamples. If
general form is false, find what's special about THIS instance.
4. **Same gap twice → step back** — the case split may be obscuring a unified
argument. Three lines sometimes does what twelve pages couldn't.
5. **Say "no confident solution"** — wrong-and-confident is worse than honest
abstain.
---
**Tool policy**: Solvers and verifiers use THINKING ONLY in the tight-budget workflow. Competition math is reasoning. Computation is for deep mode (§6c), and even then bounded — a recurrence that's doubly-exponential can't be computed past n~30, work mod 2^m instead.
**Tool policy**: Solvers and verifiers use THINKING ONLY in the tight-budget
workflow. Competition math is reasoning. Computation is for deep mode (§6c), and
even then bounded — a recurrence that's doubly-exponential can't be computed
past n~30, work mod 2^m instead.
---
## When to use which approach
| Problem | Approach | Verification |
|---|---|---|
| AIME numeric answer | Best-of-N → majority vote | Answer check only |
| Olympiad proof (IMO/Putnam/USAMO) | Full workflow below | 5-pass adversarial |
| "Is this proof correct?" | Skip to verification (step 4) | Adversarial + spec-gaming |
| **Full problem set** (e.g. all 6 from a competition) | Sequential: one full workflow per problem, collect results, compile single PDF | Per-problem adversarial |
| Problem | Approach | Verification |
| ---------------------------------------------------- | ------------------------------------------------------------------------------ | ------------------------- |
| AIME numeric answer | Best-of-N → majority vote | Answer check only |
| Olympiad proof (IMO/Putnam/USAMO) | Full workflow below | 5-pass adversarial |
| "Is this proof correct?" | Skip to verification (step 4) | Adversarial + spec-gaming |
| **Full problem set** (e.g. all 6 from a competition) | Sequential: one full workflow per problem, collect results, compile single PDF | Per-problem adversarial |
**Batch in one Workflow**: Set `opts.label` on every `agent()` call to include the problem ID (e.g., `label: "P3:solver:2"`). Without labels, 36 results come back with no problem association. Run problems in parallel — the label is what matters, not ordering.
**Batch in one Workflow**: Set `opts.label` on every `agent()` call to include
the problem ID (e.g., `label: "P3:solver:2"`). Without labels, 36 results come
back with no problem association. Run problems in parallel — the label is what
matters, not ordering.
### For a full problem set
Launch one solver workflow per problem (same VERBATIM prompt, different statement). Run them in parallel. When all return, run adversarial verification per problem. Problems that pass get their proof in the PDF; problems that abstain get "No confident solution" with partial notes.
Launch one solver workflow per problem (same VERBATIM prompt, different
statement). Run them in parallel. When all return, run adversarial verification
per problem. Problems that pass get their proof in the PDF; problems that
abstain get "No confident solution" with partial notes.
Don't try to solve all N problems in one agent's context — each problem needs its own thinking budget and its own fresh-context verifier. The composition is mechanical: collect the per-problem outputs, fill in LaTeX sections, compile once.
| "Simplify this proof" | Skip to presentation (step 8) | — |
Don't try to solve all N problems in one agent's context — each problem needs
its own thinking budget and its own fresh-context verifier. The composition is
mechanical: collect the per-problem outputs, fill in LaTeX sections, compile
once. | "Simplify this proof" | Skip to presentation (step 8) | — |
---
@@ -46,15 +72,24 @@ Don't try to solve all N problems in one agent's context — each problem needs
Before solving anything, identify the interpretation.
> Read the problem statement. List 2-3 ways it could be interpreted. For each: is this reading TRIVIAL? If one reading makes the problem easy and another makes it hard, the hard one is almost certainly intended. State which interpretation you're solving and WHY you believe it's the intended one.
> Read the problem statement. List 2-3 ways it could be interpreted. For each:
> is this reading TRIVIAL? If one reading makes the problem easy and another
> makes it hard, the hard one is almost certainly intended. State which
> interpretation you're solving and WHY you believe it's the intended one.
The Aletheia case study found 50 of 63 "technically correct" solutions were for the wrong interpretation. Olympiad problems often have a trap easy reading.
The Aletheia case study found 50 of 63 "technically correct" solutions were for
the wrong interpretation. Olympiad problems often have a trap easy reading.
### 2. Generate candidates with internal refinement (parallel, thinking only)
Launch 8-12 attempt agents in parallel. **Each agent internally iterates** solve → self-improve → self-verify → correct → repeat. This is the Yang-Huang structure that achieves 85.7% on IMO: one-shot solving isn't enough; per-attempt refinement matters.
Launch 8-12 attempt agents in parallel. **Each agent internally iterates**
solve → self-improve → self-verify → correct → repeat. This is the Yang-Huang
structure that achieves 85.7% on IMO: one-shot solving isn't enough; per-attempt
refinement matters.
**The Agent tool cannot enforce tool restriction.** Subagents get the full tool set. The only mechanism is the prompt. Use this prompt VERBATIM — do not summarize, do not synthesize your own:
**The Agent tool cannot enforce tool restriction.** Subagents get the full tool
set. The only mechanism is the prompt. Use this prompt VERBATIM — do not
summarize, do not synthesize your own:
```
NO COMPUTATION. Do not use Bash, Python, WebSearch, Read, Write, or any tool that runs code or fetches data. Numerical verification is not a proof step. "I computed n=1..10 and the pattern holds" is not a proof.
@@ -74,9 +109,13 @@ PROBLEM: <insert the problem statement here>
ANGLE: <insert one starting angle here>
```
The first two paragraphs are load-bearing. A session that writes its own prompt and omits them will produce subagents that grind Python for 30 iterations and confidently get wrong answers — a pattern that fits n≤10 but fails at n=100 is not a proof.
The first two paragraphs are load-bearing. A session that writes its own prompt
and omits them will produce subagents that grind Python for 30 iterations and
confidently get wrong answers — a pattern that fits n≤10 but fails at n=100 is
not a proof.
Starting angles (vary across agents — see `references/solver_heuristics.md`):
- Work out small cases (test past n=3)
- Look for an invariant or monovariant
- Consider the extremal case
@@ -97,36 +136,44 @@ Each returns its FINAL state (not intermediate rounds):
**Self-verification notes**: [what you caught and fixed; remaining concerns]
```
**Retry policy**: If an agent fails or times out, retry once. Transient failures happen.
**Retry policy**: If an agent fails or times out, retry once. Transient failures
happen.
### 3. Clean the solution (context isolation — the #1 lever)
The thinking trace biases the verifier toward agreement — a long chain of reasoning reads as supporting evidence even when the conclusion is wrong. Before any verification, strip:
The thinking trace biases the verifier toward agreement — a long chain of
reasoning reads as supporting evidence even when the conclusion is wrong. Before
any verification, strip:
- All thinking-block content
- All "Let me try..." / "Actually wait..." / "Hmm" prose
- All false starts and backtracking
What remains: problem statement + clean final argument only.
Extract only the **Method** + **Proof** + **Answer** sections from each solver's output. The verifier never sees how the solver got there.
Extract only the **Method** + **Proof** + **Answer** sections from each solver's
output. The verifier never sees how the solver got there.
### 4. Adversarial verify (fresh context, pattern-armed)
For each cleaned solution, launch a fresh verifier agent. **Fresh context**: it sees only (problem statement + cleaned solution). **No tools.**
For each cleaned solution, launch a fresh verifier agent. **Fresh context**: it
sees only (problem statement + cleaned solution). **No tools.**
The verifier's job is to ATTACK, not grade. Load `references/adversarial_prompts.md` for the prompts. The key patterns it runs:
The verifier's job is to ATTACK, not grade. Load
`references/adversarial_prompts.md` for the prompts. The key patterns it runs:
| Pattern | The check |
|---|---|
| **#4** | Does this theorem specialize to a famous object (ζ, quadratic reciprocity, etc.) and prove something open about it? → gap |
| **#18** | Substitute the proof's own intermediate identities into any "remaining gap." Recover the original claim? → tautological |
| Pattern | The check |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **#4** | Does this theorem specialize to a famous object (ζ, quadratic reciprocity, etc.) and prove something open about it? → gap |
| **#18** | Substitute the proof's own intermediate identities into any "remaining gap." Recover the original claim? → tautological |
| **#40** | Is any step a "one-line lemma"? Extract the GENERAL form. Find a 2×2 counterexample. If the general form is false, find what special structure saves THIS instance |
| **#5** | For each invoked theorem: re-check hypotheses FROM SCRATCH. "Continuous on [0,1]" ≠ "continuous on ℝ" |
| **#6** | Any infinite sum "bounded" via a regularized value? Check the boundary — if there's a pole there, the sum diverges |
| **#5**  | For each invoked theorem: re-check hypotheses FROM SCRATCH. "Continuous on [0,1]" ≠ "continuous on ℝ" |
| **#6** | Any infinite sum "bounded" via a regularized value? Check the boundary — if there's a pole there, the sum diverges |
Full pattern list: `references/verifier_patterns.md`
Verifier returns:
```
**Verdict**: HOLDS | HOLE FOUND | UNCLEAR
@@ -139,37 +186,60 @@ Verifier returns:
### 5. Rank and vote-verify (asymmetric + early exit)
Rank solutions by (verdict, verifier confidence). Take the top one. Run up to 5 fresh verifier agents.
Rank solutions by (verdict, verifier confidence). Take the top one. Run up to 5
fresh verifier agents.
**Asymmetric thresholds**: 4 HOLDS to confirm, 2 HOLE FOUND to refute. Why asymmetric: one flaky verifier shouldn't kill a correct proof; but two independent dissents is a real signal.
**Asymmetric thresholds**: 4 HOLDS to confirm, 2 HOLE FOUND to refute. Why
asymmetric: one flaky verifier shouldn't kill a correct proof; but two
independent dissents is a real signal.
**Pigeonhole early exit**: stop launching verifiers once the outcome is decided.
- 2 say HOLE FOUND → refuted, stop (save the remaining 3 calls)
- 4 say HOLDS → confirmed, stop (save the 5th)
- After 3 verifiers: if 2 HOLDS + 1 HOLE, launch 2 more (outcome undecided). If 3 HOLDS + 0 HOLE, launch 1 more (could still hit 4-1).
- After 3 verifiers: if 2 HOLDS + 1 HOLE, launch 2 more (outcome undecided). If
3 HOLDS + 0 HOLE, launch 1 more (could still hit 4-1).
**Dual context-isolation**: each verifier is blind to (a) the solver's thinking trace — already stripped in step 3 — AND (b) other verifiers' verdicts. Each verifier thinks it's the first. No "3 agents already confirmed this" social proof.
**Dual context-isolation**: each verifier is blind to (a) the solver's thinking
trace — already stripped in step 3 — AND (b) other verifiers' verdicts. Each
verifier thinks it's the first. No "3 agents already confirmed this" social
proof.
**A solver cannot verify its own solution.** Different agent, fresh context.
### 5b. When one case won't close — step back before grinding
If a proof splits into cases and one case proves easily but the other resists: **before grinding through the hard case, ask whether there's a route that makes the split disappear.**
If a proof splits into cases and one case proves easily but the other resists:
**before grinding through the hard case, ask whether there's a route that makes
the split disappear.**
The pattern that saves you: the hard case's very hypothesis often implies something strong about an *intermediate object* you haven't looked at. Use that implication directly instead of the original chain.
The pattern that saves you: the hard case's very hypothesis often implies
something strong about an _intermediate object_ you haven't looked at. Use that
implication directly instead of the original chain.
Concrete shape: proving f(n) ≤ cn for a constrained function f, with a case split on a prime p dividing f(n). One branch closes by index arguments in (ℤ/p^eℤ)*. The other branch resists — same group structure, but the arithmetic doesn't contradict. The fix: the hypothesis "p | f(n)" plugged back into the governing equation implies **f(p) = p itself**. Once you have that, a Fermat+Dirichlet argument kills both branches in three lines. The case split was a detour — it was splitting on a variable that, under the hypothesis, takes a known value.
Concrete shape: proving f(n) ≤ cn for a constrained function f, with a case
split on a prime p dividing f(n). One branch closes by index arguments in
(/p^e)\*. The other branch resists — same group structure, but the arithmetic
doesn't contradict. The fix: the hypothesis "p | f(n)" plugged back into the
governing equation implies **f(p) = p itself**. Once you have that, a
Fermat+Dirichlet argument kills both branches in three lines. The case split was
a detour — it was splitting on a variable that, under the hypothesis, takes a
known value.
Check when stuck on case B:
- What does case B's hypothesis imply about f at *other* inputs?
- What does case B's hypothesis imply about f at _other_ inputs?
- Is there a different pair (a,b) to plug into the governing equation?
- Are you proving too much? (A cleaner contradiction needs less machinery.)
This is also a presentation-pass win: the split-free proof is shorter AND more general.
This is also a presentation-pass win: the split-free proof is shorter AND more
general.
### 6. Revise (if needed)
If verification finds a hole: launch a reviser agent. It gets (cleaned solution + verifier's hole report). STILL no access to the original thinking — the reviser works from the hole, not by rereading how you got there.
If verification finds a hole: launch a reviser agent. It gets (cleaned
solution + verifier's hole report). STILL no access to the original thinking —
the reviser works from the hole, not by rereading how you got there.
```
A verifier found this issue in the proof:
@@ -182,38 +252,75 @@ For any step you cannot fully close, mark it inline: [GAP: specific description
Up to 3 revise cycles. Then re-run the vote on the revised proof.
**If pattern #40 fired** (one-line-proof-too-clean), the reviser gets a stronger brief — the Adversarial Brief template from `references/adversarial_prompts.md` §7. It forces a binary: "the general lemma is obviously false (here's a 2×2 counterexample) — so either find what's special about THIS case, or find where the proof breaks." Can't return "looks fine."
**If pattern #40 fired** (one-line-proof-too-clean), the reviser gets a stronger
brief — the Adversarial Brief template from `references/adversarial_prompts.md`
§7. It forces a binary: "the general lemma is obviously false (here's a 2×2
counterexample) — so either find what's special about THIS case, or find where
the proof breaks." Can't return "looks fine."
### 6c. Deep mode (when tight-budget abstains)
The standard workflow is tight-budget: 8 solvers, ~15 min, pure reasoning. When it abstains, the problem may need more time, not more capability.
The standard workflow is tight-budget: 8 solvers, ~15 min, pure reasoning. When
it abstains, the problem may need more time, not more capability.
**Deep mode** is a single focused agent with:
- **Unlimited time** — no wall-clock pressure
- **Targeted computation allowed** — modular arithmetic checks, small-case enumeration, symbolic verification of identities. NOT exploratory brute force or unbounded recursion.
- **The abstention reason as starting point** — if verifiers found a specific gap, start there. If solvers never claimed complete, start from what they partially proved.
- **Targeted computation allowed** — modular arithmetic checks, small-case
enumeration, symbolic verification of identities. NOT exploratory brute force
or unbounded recursion.
- **The abstention reason as starting point** — if verifiers found a specific
gap, start there. If solvers never claimed complete, start from what they
partially proved.
The archetype: a focused agent that gets the proven-so-far state plus "one case of Lemma 5 is open" — and finds a 3-line argument the case split was obscuring. Often under 10 minutes with almost no computation. Deep mode is about giving the problem sustained attention, not throwing compute at it.
The archetype: a focused agent that gets the proven-so-far state plus "one case
of Lemma 5 is open" — and finds a 3-line argument the case split was obscuring.
Often under 10 minutes with almost no computation. Deep mode is about giving the
problem sustained attention, not throwing compute at it.
**What deep mode is NOT**: open-ended exploration, literature search, looking up solutions, multi-day investigation. That's a different workflow (`math-research`). Deep mode is still "solve THIS problem yourself" — just without the clock.
**What deep mode is NOT**: open-ended exploration, literature search, looking up
solutions, multi-day investigation. That's a different workflow
(`math-research`). Deep mode is still "solve THIS problem yourself" — just
without the clock.
**NO WEB. NO LOOKUP.** Deep mode may use Bash/Python for bounded computation, but NEVER WebFetch, WebSearch, or any network access. Finding the solution on AoPS or a blog is not solving the problem — it's cheating on an olympiad, and it teaches us nothing about the skill's actual capability. Put this at the TOP of the deep-mode prompt:
**NO WEB. NO LOOKUP.** Deep mode may use Bash/Python for bounded computation,
but NEVER WebFetch, WebSearch, or any network access. Finding the solution on
AoPS or a blog is not solving the problem — it's cheating on an olympiad, and it
teaches us nothing about the skill's actual capability. Put this at the TOP of
the deep-mode prompt:
```
NO WEB ACCESS. Do not use WebFetch, WebSearch, or any tool that touches the internet. Do not look up this problem, its solution, or related problems. You are solving this yourself — the only allowed computation is local (Bash/Python for mod-k arithmetic, small-case enumeration n≤10, symbolic identity checks). If you invoke a web tool, the proof is void.
```
**Computation bounds in deep mode** (bug #8 lesson): A6's b_{n+1}=2b_n²+b_n+1 is doubly-exponential; b_99 has ~10^{2^98} digits. Never compute such objects exactly — work in ℤ/2^mℤ, or track only v_p(·), or prove the recursion mod the quantity you care about. If a computation is running longer than 60 seconds, it's probably unbounded. Kill it and work symbolically.
**Computation bounds in deep mode** (bug #8 lesson): A6's b\_{n+1}=2b_n²+b_n+1
is doubly-exponential; b_99 has ~10^{2^98} digits. Never compute such objects
exactly — work in ℤ/2^mℤ, or track only v_p(·), or prove the recursion mod the
quantity you care about. If a computation is running longer than 60 seconds,
it's probably unbounded. Kill it and work symbolically.
**Step 6d (not optional)**: After any ABSTAIN at the verify stage, automatically
launch one deep-mode agent before writing the abstention into the output. Give
it:
**Step 6d (not optional)**: After any ABSTAIN at the verify stage, automatically launch one deep-mode agent before writing the abstention into the output. Give it:
- The problem statement
- The best partial proof from tight-budget solvers
- The verifier gap descriptions (what specifically didn't close)
- The instruction: "NO WEB ACCESS — do not look up this problem or its solution. Bounded local computation allowed (mod 2^k, small cases n≤10, symbolic identity checks via Bash/Python only). 60-second computation limit. If n≤10 brute force reveals a pattern the tight-budget solvers missed, that pattern IS the proof structure."
- The instruction: "NO WEB ACCESS — do not look up this problem or its solution.
Bounded local computation allowed (mod 2^k, small cases n≤10, symbolic
identity checks via Bash/Python only). 60-second computation limit. If n≤10
brute force reveals a pattern the tight-budget solvers missed, that pattern IS
the proof structure."
The deep agent may find the construction the pure-reasoning solvers couldn't see. If it also abstains, THEN write the abstention. Do not skip this step — problems with √n or log n answers are often invisible to pure reasoning because the optimal structure is the asymmetric one.
The deep agent may find the construction the pure-reasoning solvers couldn't
see. If it also abstains, THEN write the abstention. Do not skip this step —
problems with √n or log n answers are often invisible to pure reasoning because
the optimal structure is the asymmetric one.
**Orchestrator self-restraint**: The orchestrator itself must not web-search the problem "to help" the deep agent. If you're tempted to Fetch an AoPS thread "just to check the answer," don't — that contaminates the skill's output and misrepresents its capability.
**Orchestrator self-restraint**: The orchestrator itself must not web-search the
problem "to help" the deep agent. If you're tempted to Fetch an AoPS thread
"just to check the answer," don't — that contaminates the skill's output and
misrepresents its capability.
### 7. Calibrated abstention
@@ -227,19 +334,26 @@ If 3 revise cycles all fail: **stop and admit it.**
**Where it breaks**: [the unfixed hole]
```
Do NOT guess. A wrong confident answer is worse than an honest "couldn't solve it." The metric that matters is CONDITIONAL accuracy — when you say "solved," are you right?
Do NOT guess. A wrong confident answer is worse than an honest "couldn't solve
it." The metric that matters is CONDITIONAL accuracy — when you say "solved,"
are you right?
### 8. Presentation pass (after correctness is established)
A VERIFIED-CORRECT proof is often not a BEAUTIFUL proof. The order you discovered it is rarely the best order to present it. Launch a fresh presentation agent with the verified proof.
A VERIFIED-CORRECT proof is often not a BEAUTIFUL proof. The order you
discovered it is rarely the best order to present it. Launch a fresh
presentation agent with the verified proof.
Load `references/presentation_prompts.md`. The agent asks:
- What's the simplest way to say this?
- Which lemmas should be inlined? Which deserve to stand alone?
- Is anything OVERKILL? (constructing a double exponential when linear suffices)
- Now that we know the answer, is there a 3-line hindsight proof?
Output: LaTeX-formatted proof. If `pdflatex` is available (`scripts/check_latex.sh` returns 0), also compile to PDF via `scripts/compile_pdf.sh`.
Output: LaTeX-formatted proof. If `pdflatex` is available
(`scripts/check_latex.sh` returns 0), also compile to PDF via
`scripts/compile_pdf.sh`.
---
@@ -247,19 +361,22 @@ Output: LaTeX-formatted proof. If `pdflatex` is available (`scripts/check_latex.
Read `references/model_tier_defaults.md` for full details. Summary:
| Model | Solvers | Verify passes | Abstain after | Presentation |
|---|---|---|---|---|
| Haiku 4.5 | 8 | 3 | 2 revise fails | skip |
| Sonnet 4.6 | 4 | 5 | 3 revise fails | yes |
| Opus 4.6 / Capybara | 3 | 5 + full pattern sweep | 4 revise fails | 2 drafts, pick cleaner |
| Model | Solvers | Verify passes | Abstain after | Presentation |
| ------ | ------- | ---------------------- | -------------- | ---------------------- |
| Haiku | 8 | 3 | 2 revise fails | skip |
| Sonnet | 4 | 5 | 3 revise fails | yes |
| Opus | 3 | 5 + full pattern sweep | 4 revise fails | 2 drafts, pick cleaner |
Weaker models: more parallel attempts, faster abstention. Stronger models: deeper verification, more presentation effort.
Weaker models: more parallel attempts, faster abstention. Stronger models:
deeper verification, more presentation effort.
---
## For numeric-answer problems (AIME-style)
Skip the proof machinery. Run 5-7 solvers with varied approaches, take majority vote on the numeric answer. If no majority: verify the top 2 candidates by substitution.
Skip the proof machinery. Run 5-7 solvers with varied approaches, take majority
vote on the numeric answer. If no majority: verify the top 2 candidates by
substitution.
---
@@ -274,9 +391,21 @@ Skip the proof machinery. Run 5-7 solvers with varied approaches, take majority
## What makes this different from generic verify-and-refine
1. **Dual context isolation**: verifier is blind to (a) the solver's thinking trace — which biases toward agreement — and (b) other verifiers' verdicts — social proof also biases. Each verifier thinks it's first.
2. **Pattern-specific attacks**: not "is this correct?" but "does this make the #40 mistake? the #4 mistake?" Specific beats generic. The 7-category refutation taxonomy gives the verifier a checklist.
3. **Asymmetric vote + pigeonhole exit**: 4-to-confirm, 2-to-refute. One flaky verifier doesn't kill a correct proof; two dissents do. Stop launching verifiers once the outcome is decided — saves ~30% of verification cost on clear cases.
4. **Specification-gaming check first**: explicitly asks "is this the intended interpretation?" before solving. The #1 failure mode in prior work (50/63 "correct" answers solved the wrong reading).
5. **Calibrated abstention**: will say "no confident solution" with partial results. Optimizes conditional accuracy, not coverage.
6. **Presentation pass**: correctness and elegance are separate steps. The presentation agent gets the VERIFIED proof and finds the cleanest way to say it.
1. **Dual context isolation**: verifier is blind to (a) the solver's thinking
trace — which biases toward agreement — and (b) other verifiers' verdicts —
social proof also biases. Each verifier thinks it's first.
2. **Pattern-specific attacks**: not "is this correct?" but "does this make the
#40 mistake? the #4 mistake?" Specific beats generic. The 7-category
refutation taxonomy gives the verifier a checklist.
3. **Asymmetric vote + pigeonhole exit**: 4-to-confirm, 2-to-refute. One flaky
   verifier doesn't kill a correct proof; two dissents do. Stop launching
verifiers once the outcome is decided — saves ~30% of verification cost on
clear cases.
4. **Specification-gaming check first**: explicitly asks "is this the intended
interpretation?" before solving. The #1 failure mode in prior work (50/63
"correct" answers solved the wrong reading).
5. **Calibrated abstention**: will say "no confident solution" with partial
results. Optimizes conditional accuracy, not coverage.
6. **Presentation pass**: correctness and elegance are separate steps. The
presentation agent gets the VERIFIED proof and finds the cleanest way to say
it.

View File

@@ -1,26 +1,42 @@
# Adversarial Verifier Prompts — Math Olympiad
Prompt bank for the verifier subagent. Fresh context: problem statement + cleaned solution, NO thinking trace. Agent has NO tools — pure reasoning only.
Prompt bank for the verifier subagent. Fresh context: problem statement +
cleaned solution, NO thinking trace. Agent has NO tools — pure reasoning only.
**Source**: `shared/verifier_patterns_source.md`. Background: arXiv:2503.21934 showed self-verified 85.7% IMO success drops to <5% under human grading. These prompts are the human grader.
**Source**: `shared/verifier_patterns_source.md`. Background: arXiv:2503.21934
showed self-verified 85.7% IMO success drops to <5% under human grading. These
prompts are the human grader.
**Verifier isolation**: You do NOT know how other verifiers voted. You are not told if this proof has been confirmed or refuted by anyone else. Assume you're the first and only reviewer. (Social proof — "3 others confirmed" — biases toward agreement.)
**Verifier isolation**: You do NOT know how other verifiers voted. You are not
told if this proof has been confirmed or refuted by anyone else. Assume you're
the first and only reviewer. (Social proof — "3 others confirmed" — biases
toward agreement.)
---
## Reasons to REFUTE (the taxonomy — look for ANY one of these)
Your goal is to find ANY reason to refute. These are the seven categories a hole falls into:
Your goal is to find ANY reason to refute. These are the seven categories a hole
falls into:
1. **Step doesn't follow** — The conclusion of some step is not implied by its premises. (Includes direction errors: A>B and C>D does NOT give AC>BD.)
2. **Hypothesis not satisfied** — An invoked theorem needs a condition the proof never verified. (Pattern #5: "entire" ≠ "analytic in a disk".)
3. **Claim false in small case** — A stated identity or bound fails at n=2, n=3, or the first nontrivial block. Mentally test it.
4. **Tautological** — The "gap" at the end is the original problem in disguise. (Pattern #18: substitute the proof's own identities back in.)
5. **Proves too much** — The argument's skeleton applies to a famous object and proves something open or false about it. (Pattern #4.)
6. **Wrong interpretation** — Solves an easier reading of the problem than the intended one. (Pattern #60.)
7. **Hand-wave at the crux** — "iterating and optimizing gives the result", "by standard methods", "the details are routine" — at exactly the step that ISN'T routine.
1. **Step doesn't follow** — The conclusion of some step is not implied by its
premises. (Includes direction errors: A>B and C>D does NOT give AC>BD.)
2. **Hypothesis not satisfied** — An invoked theorem needs a condition the proof
never verified. (Pattern #5: "entire" ≠ "analytic in a disk".)
3. **Claim false in small case** — A stated identity or bound fails at n=2, n=3,
or the first nontrivial block. Mentally test it.
4. **Tautological** — The "gap" at the end is the original problem in disguise.
(Pattern #18: substitute the proof's own identities back in.)
5. **Proves too much** — The argument's skeleton applies to a famous object and
proves something open or false about it. (Pattern #4.)
6. **Wrong interpretation** — Solves an easier reading of the problem than the
intended one. (Pattern #60.)
7. **Hand-wave at the crux** — "iterating and optimizing gives the result", "by
standard methods", "the details are routine" — at exactly the step that ISN'T
routine.
If none of these fire after a genuine attempt, CONFIRM. Do not confirm because the proof *sounds* confident.
If none of these fire after a genuine attempt, CONFIRM. Do not confirm because
the proof _sounds_ confident.
---
@@ -28,18 +44,31 @@ If none of these fire after a genuine attempt, CONFIRM. Do not confirm because t
You are an adversarial verifier. Below is a problem and a proposed solution.
**You are NOT grading this. You are trying to BREAK it.** Assume the author is a strong student who made one subtle error that a sympathetic reader would gloss over. Your job is to find that error. If you cannot find one after genuinely trying, say so — but do not say so just because the solution is confidently written.
**You are NOT grading this. You are trying to BREAK it.** Assume the author is a
strong student who made one subtle error that a sympathetic reader would gloss
over. Your job is to find that error. If you cannot find one after genuinely
trying, say so — but do not say so just because the solution is confidently
written.
Attack each step:
- Is the claimed inequality actually in the claimed direction? Reason through a small case mentally.
- Is every "clearly" / "obviously" / "it follows that" actually clear? These words often mark the exact spot where the author convinced themselves of something false.
- Does every cited theorem's hypothesis actually hold? Check quantifiers: "for all" vs "there exists", pointwise vs average.
- At each "WLOG": is generality actually preserved, or does the reduction discard the hard case?
- Does the argument use a property that's true for the *generic* object but not the *specific* one in the problem?
You have no tools. Reason about small cases in your head — do not claim to have "computed" anything.
- Is the claimed inequality actually in the claimed direction? Reason through a
small case mentally.
- Is every "clearly" / "obviously" / "it follows that" actually clear? These
words often mark the exact spot where the author convinced themselves of
something false.
- Does every cited theorem's hypothesis actually hold? Check quantifiers: "for
all" vs "there exists", pointwise vs average.
- At each "WLOG": is generality actually preserved, or does the reduction
discard the hard case?
- Does the argument use a property that's true for the _generic_ object but not
the _specific_ one in the problem?
You have no tools. Reason about small cases in your head — do not claim to have
"computed" anything.
**Output format:**
```
VERDICT: CORRECT | INCORRECT | GAP
CONFIDENCE: high | medium | low
@@ -50,19 +79,29 @@ ISSUE: [if INCORRECT/GAP: one-sentence location, then one-paragraph explanation.
## 2. Pattern #4 — Would It Prove Too Much?
You are an adversarial verifier running a single check: **does this argument prove something famously open or famously false?**
You are an adversarial verifier running a single check: **does this argument
prove something famously open or famously false?**
Read the proposed solution. Ignore whether the proof is locally valid. Instead:
1. Strip the argument down to its skeleton: what properties of the given objects does it *actually use*?
2. Find the most famous object that shares exactly those properties. (If it bounds a sum using only "positive decreasing terms" — does the harmonic series have positive decreasing terms? If it uses only "multiplicative and bounded by 1" — does the Möbius function qualify?)
1. Strip the argument down to its skeleton: what properties of the given objects
does it _actually use_?
2. Find the most famous object that shares exactly those properties. (If it
bounds a sum using only "positive decreasing terms" — does the harmonic
series have positive decreasing terms? If it uses only "multiplicative and
bounded by 1" — does the Möbius function qualify?)
3. Mentally rerun the argument on that substitute. What does it now prove?
If the substitute conclusion is a known open problem or a known falsehood, the original proof has a gap. The gap is at the step where the argument stops working for the substitute — find that step. That step is silently using a property the author never stated.
If the substitute conclusion is a known open problem or a known falsehood, the
original proof has a gap. The gap is at the step where the argument stops
working for the substitute — find that step. That step is silently using a
property the author never stated.
If the argument genuinely uses a property specific to the problem's object that the famous substitute lacks, say which property and where it's used.
If the argument genuinely uses a property specific to the problem's object that
the famous substitute lacks, say which property and where it's used.
**Output format:**
```
VERDICT: CORRECT | INCORRECT
CONFIDENCE: high | medium | low
@@ -74,19 +113,32 @@ ISSUE: [if it proves too much: which step fails for the substitute, and what uns
## 3. Pattern #40 — One-Line-Proof-Too-Clean
You are an adversarial verifier targeting short proofs. The solution below contains at least one step that is suspiciously brief — one line doing a lot of work.
You are an adversarial verifier targeting short proofs. The solution below
contains at least one step that is suspiciously brief — one line doing a lot of
work.
For the shortest load-bearing step in the solution:
1. **Extract the general lemma.** Write down the most general claim the step is implicitly using. Not "for this sum" but "for any sum of this shape." Not "for the determinant" but "for any function of the matrix entries with this property."
2. **Try to break the general lemma with a 2×2 case.** Two elements, two terms, a 2×2 matrix — the smallest nontrivial instance. Reason it through in your head. Can you find values where the general lemma fails?
1. **Extract the general lemma.** Write down the most general claim the step is
implicitly using. Not "for this sum" but "for any sum of this shape." Not
"for the determinant" but "for any function of the matrix entries with this
property."
2. **Try to break the general lemma with a 2×2 case.** Two elements, two terms,
a 2×2 matrix — the smallest nontrivial instance. Reason it through in your
head. Can you find values where the general lemma fails?
3. **Judge:**
- If the general lemma survives your 2×2 attack: the step is probably fine.
- If the general lemma FAILS at 2×2 but the specific instance in the proof still seems to work: the step is **INCORRECT as written**. There is special structure in the problem that makes it true, and the proof does not invoke that structure. The author got the right answer for the wrong reason.
- If the general lemma FAILS at 2×2 but the specific instance in the proof
still seems to work: the step is **INCORRECT as written**. There is special
structure in the problem that makes it true, and the proof does not invoke
that structure. The author got the right answer for the wrong reason.
The classic failure: "rank depends only on support" — but [[1,1],[1,1]] has rank 1 and [[1,1],[1,-1]] has rank 2, same support. General lemma false; a specific instance was true because of a sign-factorization the proof never mentioned.
The classic failure: "rank depends only on support" — but [[1,1],[1,1]] has rank
1 and [[1,1],[1,-1]] has rank 2, same support. General lemma false; a specific
instance was true because of a sign-factorization the proof never mentioned.
**Output format:**
```
VERDICT: CORRECT | INCORRECT | GAP
CONFIDENCE: high | medium | low
@@ -99,18 +151,30 @@ ISSUE: [if the general lemma is false: what special structure the proof failed t
## 4. Pattern #18 — Tautological Reduction
You are an adversarial verifier checking one thing: **did the solution argue itself in a circle?**
You are an adversarial verifier checking one thing: **did the solution argue
itself in a circle?**
The solution likely proceeds through a chain of reductions or equivalent reformulations, ending at a "final estimate" or "key inequality" that it then proves directly. Your task:
The solution likely proceeds through a chain of reductions or equivalent
reformulations, ending at a "final estimate" or "key inequality" that it then
proves directly. Your task:
1. List every identity, equality, or substitution the solution establishes along the way. (Things like "A = B + C", "the sum splits as X + Y", "by the earlier lemma, P = Q".)
2. Take the FINAL claim — the one the solution presents as "and this is now easy" or "this follows from [standard fact]".
3. Substitute the chain's OWN identities (from step 1) back into that final claim. Expand. Simplify.
4. What do you get? If you recover the ORIGINAL problem — or something trivially equivalent to it — then the "reduction" is a tautology. The proof has done nothing; it renamed the problem and declared it solved.
1. List every identity, equality, or substitution the solution establishes along
the way. (Things like "A = B + C", "the sum splits as X + Y", "by the earlier
lemma, P = Q".)
2. Take the FINAL claim — the one the solution presents as "and this is now
easy" or "this follows from [standard fact]".
3. Substitute the chain's OWN identities (from step 1) back into that final
claim. Expand. Simplify.
4. What do you get? If you recover the ORIGINAL problem — or something trivially
equivalent to it — then the "reduction" is a tautology. The proof has done
nothing; it renamed the problem and declared it solved.
The trap: long chains feel like progress. "We've reduced it to bounding X!" is only progress if X is actually different from what you started with. Sometimes X is just the original, wearing a hat.
The trap: long chains feel like progress. "We've reduced it to bounding X!" is
only progress if X is actually different from what you started with. Sometimes X
is just the original, wearing a hat.
**Output format:**
```
VERDICT: CORRECT | INCORRECT | GAP
CONFIDENCE: high | medium | low
@@ -123,19 +187,31 @@ ISSUE: [is it the original problem? trivially equivalent? genuinely simpler? say
## 5. Pattern #60 — Specification-Gaming
You are an adversarial verifier checking one thing: **did the solution answer the easiest interpretation of the question instead of the intended one?**
You are an adversarial verifier checking one thing: **did the solution answer
the easiest interpretation of the question instead of the intended one?**
Read the problem statement alone. Before looking at the solution in detail:
1. Write down 2–3 plausible readings of what the problem is asking. Pay attention to: scope of quantifiers ("find all" vs "find one"), what "determine" means (a formula? a characterization? an existence proof?), boundary cases (does n=0 or n=1 count? is the empty set allowed? are degenerate configurations included?).
1. Write down 2–3 plausible readings of what the problem is asking. Pay
attention to: scope of quantifiers ("find all" vs "find one"), what
"determine" means (a formula? a characterization? an existence proof?),
boundary cases (does n=0 or n=1 count? is the empty set allowed? are
degenerate configurations included?).
2. Rank them by how hard they would be to solve.
3. Which reading did the solution actually address?
If the solution addresses the EASIEST reading — and especially if the problem under that reading would be trivially short for its stated source (an IMO problem that becomes a two-liner is a red flag) — then be suspicious. Olympiad problems are calibrated to their point values. A final-problem that falls in three lines means you're probably not solving the final problem.
If the solution addresses the EASIEST reading — and especially if the problem
under that reading would be trivially short for its stated source (an IMO
problem that becomes a two-liner is a red flag) — then be suspicious. Olympiad
problems are calibrated to their point values. A final-problem that falls in
three lines means you're probably not solving the final problem.
Also check: did the solution prove something about *an* object when the problem asked about *all* such objects? Did it show *possibility* when the problem wanted *necessity*?
Also check: did the solution prove something about _an_ object when the problem
asked about _all_ such objects? Did it show _possibility_ when the problem
wanted _necessity_?
**Output format:**
```
VERDICT: CORRECT | INCORRECT | GAP
CONFIDENCE: high | medium | low
@@ -148,17 +224,29 @@ ISSUE: [if they differ: what the solution is missing. If they match: why the eas
## 6. Consecutive-Verify (5-pass loop)
You are verifier pass {K} of 5. A solution passes only if all five independent verifiers agree.
You are verifier pass {K} of 5. A solution passes only if all five independent
verifiers agree.
**Verify INDEPENDENTLY.** You have not seen — and must not imagine — what any other verifier said. Do not reason "this probably already got checked." Your vote is the only vote you control. If you wave something through on the assumption that another pass will catch it, and the other four passes reason the same way, a wrong solution ships.
**Verify INDEPENDENTLY.** You have not seen — and must not imagine — what any
other verifier said. Do not reason "this probably already got checked." Your
vote is the only vote you control. If you wave something through on the
assumption that another pass will catch it, and the other four passes reason the
same way, a wrong solution ships.
Read the problem. Read the solution. Trace every step yourself, from scratch.
One bias to actively resist: when a solution is well-written, confident, and uses standard machinery correctly in *most* places, you will be inclined to trust the one place you can't quite follow. **Invert this.** Well-written and confident is exactly what a subtly wrong solution looks like — the author convinced themselves before they convinced the math. The place you can't quite follow is the place to press hardest.
One bias to actively resist: when a solution is well-written, confident, and
uses standard machinery correctly in _most_ places, you will be inclined to
trust the one place you can't quite follow. **Invert this.** Well-written and
confident is exactly what a subtly wrong solution looks like — the author
convinced themselves before they convinced the math. The place you can't quite
follow is the place to press hardest.
You have no tools. Reason through small cases mentally; do not claim numerical verification.
You have no tools. Reason through small cases mentally; do not claim numerical
verification.
**Output format:**
```
VERDICT: CORRECT | INCORRECT | GAP
CONFIDENCE: high | medium | low
@@ -170,21 +258,34 @@ ISSUE: [if INCORRECT/GAP: exact step and why. If CORRECT: the step you found har
## 7. Adversarial Brief (for the reviser when pattern #40 fires)
Use this instead of a general "fix the hole" prompt when a verifier flagged a one-line lemma whose general form is false. This framing forces a binary — the reviser cannot return "looks fine."
Use this instead of a general "fix the hole" prompt when a verifier flagged a
one-line lemma whose general form is false. This framing forces a binary — the
reviser cannot return "looks fine."
> **Adversarial brief**: The principle "[extracted general lemma]" is obviously false in general — [trivial counterexample, e.g., [[1,1],[1,1]] has rank 1 and [[1,1],[1,2]] has rank 2, same support].
> **Adversarial brief**: The principle "[extracted general lemma]" is obviously
> false in general — [trivial counterexample, e.g., [[1,1],[1,1]] has rank 1 and
> [[1,1],[1,2]] has rank 2, same support].
>
> So exactly one of these is true, and your job is to determine which:
>
> **(A)** The conclusion holds for a DIFFERENT reason specific to this case. Find that reason. What structure does [the specific object in the problem] have that [the counterexample] lacks? That structure is the real proof.
> **(A)** The conclusion holds for a DIFFERENT reason specific to this case.
> Find that reason. What structure does [the specific object in the problem]
> have that [the counterexample] lacks? That structure is the real proof.
>
> **(B)** The proof is wrong and the conclusion fails at [concrete prediction of where it diverges — e.g., "the first case where the block is ≥2×2, which is m=4"].
> **(B)** The proof is wrong and the conclusion fails at [concrete prediction of
> where it diverges — e.g., "the first case where the block is ≥2×2, which is
> m=4"].
>
> Return (A) with the special structure identified, or (B) with the failure point. "The original proof is actually fine" is not an available answer — the general lemma is false, so either something saves this instance or nothing does.
> Return (A) with the special structure identified, or (B) with the failure
> point. "The original proof is actually fine" is not an available answer — the
> general lemma is false, so either something saves this instance or nothing
> does.
The best outcome is (A) — the thesis survives AND you learn why. The corrected proof is more informative than the false one.
The best outcome is (A) — the thesis survives AND you learn why. The corrected
proof is more informative than the false one.
**Output format:**
```
RESOLUTION: (A) SPECIAL_STRUCTURE | (B) CONCLUSION_FALSE
IF (A): The structure [specific object] has that [counterexample] lacks: [...]. Revised proof: [...]

View File

@@ -1,6 +1,7 @@
# Solver-Refiner Agent Prompt
You are solving a competition math problem. You have NO tools — pure reasoning only.
You are solving a competition math problem. You have NO tools — pure reasoning
only.
## Your process (iterate internally until done)
@@ -10,22 +11,32 @@ Think deeply. Produce a complete solution.
**Round 2: Self-improve**
Reread your solution. Fix any errors or gaps you find. This is your chance to catch your own mistakes before a grader does.
Reread your solution. Fix any errors or gaps you find. This is your chance to
catch your own mistakes before a grader does.
**Round 3: Self-verify**
Switch roles. You are now a strict IMO grader. Check every step. Classify each issue as:
- **Critical Error**: breaks the logical chain (e.g., claiming A>B and C>D implies A-C>B-D)
Switch roles. You are now a strict IMO grader. Check every step. Classify each
issue as:
- **Critical Error**: breaks the logical chain (e.g., claiming A>B and C>D
implies A-C>B-D)
- **Justification Gap**: conclusion may be correct but argument incomplete
If you find issues: note them, go back to your solver role, correct the solution, verify again. Repeat up to 5 times.
If you find issues: note them, go back to your solver role, correct the
solution, verify again. Repeat up to 5 times.
**Stop when**: Either your self-verification passes cleanly 2 times in a row, OR you've done 5 correction rounds, OR you're certain the approach is fundamentally wrong.
**Stop when**: Either your self-verification passes cleanly 2 times in a row, OR
you've done 5 correction rounds, OR you're certain the approach is fundamentally
wrong.
## Core principles (from Yang-Huang IMO25)
- **Rigor is paramount**: A correct final answer from flawed reasoning is a failure.
- **Honesty about completeness**: If you cannot find a complete solution, say so. Present significant partial results (key lemma proven, one case resolved, a bound without achievability). Do NOT guess or hide gaps.
- **Rigor is paramount**: A correct final answer from flawed reasoning is a
failure.
- **Honesty about completeness**: If you cannot find a complete solution, say
so. Present significant partial results (key lemma proven, one case resolved,
a bound without achievability). Do NOT guess or hide gaps.
- **Use TeX**: All mathematics in `$...$` or `$$...$$`.
## Output format (ONLY your FINAL state after all rounds — not the intermediate iterations)
@@ -47,7 +58,6 @@ If you find issues: note them, go back to your solver role, correct the solution
---
PROBLEM:
{statement}
PROBLEM: {statement}
HINT: {angle}

View File

@@ -1,25 +1,46 @@
# Construction Patterns
Methodological patterns for finding optimal constructions. No specific problem answers.
Methodological patterns for finding optimal constructions. No specific problem
answers.
## Spread vs cluster
For optimization problems over permutations/configurations: the **symmetric choice (identity, diagonal, regular spacing) is often the worst case, not the best**. The intuition "symmetric = optimal" fails when the objective rewards *large substructures* that symmetry prevents.
For optimization problems over permutations/configurations: the **symmetric
choice (identity, diagonal, regular spacing) is often the worst case, not the
best**. The intuition "symmetric = optimal" fails when the objective rewards
_large substructures_ that symmetry prevents.
**When to suspect this**: The problem asks to maximize the size of something (tiles, intervals, independent sets) subject to a one-per-row/one-per-column constraint. The symmetric placement makes the forbidden region a contiguous band, leaving only thin slivers. Spreading the forbidden positions leaves fat windows.
**When to suspect this**: The problem asks to maximize the size of something
(tiles, intervals, independent sets) subject to a one-per-row/one-per-column
constraint. The symmetric placement makes the forbidden region a contiguous
band, leaving only thin slivers. Spreading the forbidden positions leaves fat
windows.
**What to try**: Partition into √n groups, assign each group to a residue class mod √n. Within a group, place in reverse order. This makes any contiguous block of √n rows/columns have its forbidden positions spread across all residue classes.
**What to try**: Partition into √n groups, assign each group to a residue class
mod √n. Within a group, place in reverse order. This makes any contiguous block
of √n rows/columns have its forbidden positions spread across all residue
classes.
## Moment curve for distinctness
When you need n objects in ℝ^k where "any k are independent" (or similar genericity), the moment curve `(1, t, t², ..., t^{k-1})` at n distinct parameter values gives this for free. Vandermonde determinants are nonzero, so any k of the vectors are linearly independent.
When you need n objects in ℝ^k where "any k are independent" (or similar
genericity), the moment curve `(1, t, t², ..., t^{k-1})` at n distinct parameter
values gives this for free. Vandermonde determinants are nonzero, so any k of
the vectors are linearly independent.
**Rank-1 from vectors**: If you need matrices instead of vectors, rank-1 idempotents `A_i = v_i w_i^T` (projection onto `span(v_i)` along a complementary hyperplane) turn vector genericity into commutator conditions. `[A_i, A_j] = 0` iff a specific determinant vanishes.
**Rank-1 from vectors**: If you need matrices instead of vectors, rank-1
idempotents `A_i = v_i w_i^T` (projection onto `span(v_i)` along a complementary
hyperplane) turn vector genericity into commutator conditions. `[A_i, A_j] = 0`
iff a specific determinant vanishes.
## When brute-force reveals √n
If brute-forcing n=2..8 gives a sequence that fits `an + b√n + c` better than `an + b`, the optimal structure has √n-sized blocks. Look for a construction parameterized by k where k=√n balances two competing costs (e.g., k things each of size n/k).
If brute-forcing n=2..8 gives a sequence that fits `an + b√n + c` better than
`an + b`, the optimal structure has √n-sized blocks. Look for a construction
parameterized by k where k=√n balances two competing costs (e.g., k things each
of size n/k).
## Avoid: storing specific answers here
This file is for construction *techniques*, not solutions. If you find yourself writing "the answer to Problem X is Y," delete it.
This file is for construction _techniques_, not solutions. If you find yourself
writing "the answer to Problem X is Y," delete it.

View File

@@ -1,19 +1,29 @@
# Model Tier Defaults
Parameters scale with model capability. Budget is not the constraint — the constraints are diminishing returns (more voters stop helping past a point) and the asymmetric noise floor (Haiku verifiers are individually less reliable, so the right response is width not depth).
Parameters scale with model capability. Budget is not the constraint — the
constraints are diminishing returns (more voters stop helping past a point) and
the asymmetric noise floor (Haiku verifiers are individually less reliable, so
the right response is width not depth).
## Haiku 4.5
## Haiku
Width compensates for per-sample noise. Scaffolding is where the leverage is.
- **Parallel solvers**: 12 (wide fan — each individual solve is weaker, so cast a wider net)
- **Vote budget**: 7 verifiers, need 5-confirm / 3-refute (pigeonhole exit: stop when outcome decided)
- **Parallel solvers**: 12 (wide fan — each individual solve is weaker, so cast
a wider net)
- **Vote budget**: 7 verifiers, need 5-confirm / 3-refute (pigeonhole exit: stop
when outcome decided)
- **Abstain threshold**: 3 consecutive revise cycles fail
- **Pattern sweep**: all 12 patterns — Haiku can follow a checklist, the patterns are the scaffold
- **Presentation pass**: yes, 3 drafts, comparator picks cleanest. Haiku's raw output is rougher, so this matters MORE not less.
- **Rationale**: The skill's value is highest where the base model is weakest. Give Haiku the full harness. The 3-refute threshold (higher than Sonnet's 2) accounts for Haiku verifiers being individually noisier — don't let 2 confused Haikus kill a correct proof.
- **Pattern sweep**: all 12 patterns — Haiku can follow a checklist, the
patterns are the scaffold
- **Presentation pass**: yes, 3 drafts, comparator picks cleanest. Haiku's raw
output is rougher, so this matters MORE not less.
- **Rationale**: The skill's value is highest where the base model is weakest.
Give Haiku the full harness. The 3-refute threshold (higher than Sonnet's 2)
accounts for Haiku verifiers being individually noisier — don't let 2 confused
Haikus kill a correct proof.
## Sonnet 4.6
## Sonnet
Balanced.
@@ -24,21 +34,33 @@ Balanced.
- **Presentation pass**: 2 drafts, comparator picks cleaner
- **Rationale**: 4-of-5 tolerates one flake. 2 dissents is signal.
## Opus 4.6 / Capybara
## Opus
Depth. Each sample is strong, so invest in making the adversarial pass harder.
- **Parallel solvers**: 4
- **Vote budget**: 5 general verifiers (4-confirm / 2-refute) PLUS one dedicated verifier per pattern in `verifier_patterns.md` (12 targeted attacks). Any pattern-specific HOLE FOUND counts toward refute.
- **Abstain threshold**: 5 consecutive revise cycles fail (trust the model's ability to eventually fix)
- **Vote budget**: 5 general verifiers (4-confirm / 2-refute) PLUS one dedicated
verifier per pattern in `verifier_patterns.md` (12 targeted attacks). Any
pattern-specific HOLE FOUND counts toward refute.
- **Abstain threshold**: 5 consecutive revise cycles fail (trust the model's
ability to eventually fix)
- **Pattern sweep**: all 12, each with its own dedicated agent
- **Presentation pass**: 3 drafts with different instructions ("most elegant," "most elementary," "shortest"), comparator picks the best. Strong models can genuinely produce different *styles* of proof.
- **Rationale**: Opus/Capybara can execute the deep patterns (#19 base-vs-derived, #22 mean-first) that need real mathematical judgment. The 12 dedicated pattern passes are where the model's capability is best spent — it's the difference between "be skeptical" and "check THIS specific thing."
- **Presentation pass**: 3 drafts with different instructions ("most elegant,"
"most elementary," "shortest"), comparator picks the best. Strong models can
genuinely produce different _styles_ of proof.
- **Rationale**: Opus can execute the deep patterns (#19 base-vs-derived, #22
mean-first) that need real mathematical judgment. The 12 dedicated pattern
passes are where the model's capability is best spent — it's the difference
between "be skeptical" and "check THIS specific thing."
## On the pigeonhole exit
Kept at all tiers — not because of cost, but because once `inflight >= confirm_needed + refute_needed - 1`, the remaining votes carry no information regardless of how they land. Launching them anyway is pure latency.
Kept at all tiers — not because of cost, but because once
`inflight >= confirm_needed + refute_needed - 1`, the remaining votes carry no
information regardless of how they land. Launching them anyway is pure latency.
## Identifying the tier
If the orchestrating session doesn't know which model it is, default to Sonnet configuration. A reasonable heuristic: ask the model to self-identify in its first response and match against `haiku`/`sonnet`/`opus`/`capybara` in the output.
If the orchestrating session doesn't know which model it is, default to Sonnet
configuration. A reasonable heuristic: ask the model to self-identify in its
first response and match against `haiku`/`sonnet`/`opus` in the output.

View File

@@ -1,38 +1,68 @@
# Presentation Pass — Prompts and Templates
**Premise**: Aletheia's PDFs are beautiful; raw IMO output is not. The difference is a *presentation pass*: after a proof is **verified correct**, a fresh agent — one who didn't sweat through the discovery — finds the cleanest way to say it. The discoverer is too attached to the scaffolding.
**Premise**: Aletheia's PDFs are beautiful; raw IMO output is not. The
difference is a _presentation pass_: after a proof is **verified correct**, a
fresh agent — one who didn't sweat through the discovery — finds the cleanest
way to say it. The discoverer is too attached to the scaffolding.
The Erdős paper even criticizes Aletheia's *own* output: *"somewhat overkill; any f whose inverse is at most [X] would suffice, no need to take the double exponential."* The presentation pass is where overkill goes to die.
The Erdős paper even criticizes Aletheia's _own_ output: _"somewhat overkill;
any f whose inverse is at most [X] would suffice, no need to take the double
exponential."_ The presentation pass is where overkill goes to die.
---
## 1. The Presentation Pass Prompt
Paste this to a **fresh subagent** along with the verified proof. The agent must not have discovery-context; that's the point.
Paste this to a **fresh subagent** along with the verified proof. The agent must
not have discovery-context; that's the point.
> You are given a **verified, correct proof**. Your job is not to check it — it is correct. Your job is to find the **cleanest presentation**. The order it was discovered in is almost never the order it should be read in.
> You are given a **verified, correct proof**. Your job is not to check it — it
> is correct. Your job is to find the **cleanest presentation**. The order it
> was discovered in is almost never the order it should be read in.
>
> Work through these questions in order:
>
> **Hindsight shortcuts.** Now that you know the answer, is there a 3-line argument? The discoverer built machinery to *find* the key step; you already *have* the key step. Can the machinery be discarded? (Classic: a long case-bash that, in hindsight, collapses once you spot the invariant.)
> **Hindsight shortcuts.** Now that you know the answer, is there a 3-line
> argument? The discoverer built machinery to _find_ the key step; you already
> _have_ the key step. Can the machinery be discarded? (Classic: a long
> case-bash that, in hindsight, collapses once you spot the invariant.)
>
> **Overkill.** Is any bound stronger than needed? Any construction more general than the problem requires? If a double exponential works but a linear function also works, use the linear one — the reader will wonder what the double exponential is hiding. Match the strength of each tool to the strength of what it's proving.
> **Overkill.** Is any bound stronger than needed? Any construction more general
> than the problem requires? If a double exponential works but a linear function
> also works, use the linear one — the reader will wonder what the double
> exponential is hiding. Match the strength of each tool to the strength of what
> it's proving.
>
> **What to cut.** Which steps *verify* without *illuminating*? Discovery leaves a debris field: sanity checks, dead ends backed out of, "note that X (we won't use this)". Delete them. If a paragraph can be removed and the proof still compiles in the reader's head, remove it.
> **What to cut.** Which steps _verify_ without _illuminating_? Discovery leaves
> a debris field: sanity checks, dead ends backed out of, "note that X (we won't
> use this)". Delete them. If a paragraph can be removed and the proof still
> compiles in the reader's head, remove it.
>
> **Lemma granularity.** Inline a lemma if it's used once and the proof is ≤3 lines. Keep it standalone if it's used twice, or if its *statement alone* clarifies the structure (even with a 1-line proof). Name standalone lemmas descriptively — "Combinatorial dimension bound", not "Lemma 2".
> **Lemma granularity.** Inline a lemma if it's used once and the proof is ≤3
> lines. Keep it standalone if it's used twice, or if its _statement alone_
> clarifies the structure (even with a 1-line proof). Name standalone lemmas
> descriptively — "Combinatorial dimension bound", not "Lemma 2".
>
> **Order.** Lead with the main statement. Then the one idea that makes it work. Then the details. Isolate the one genuinely clever step — there's almost always exactly one — and let everything else be obviously routine *by contrast*.
> **Order.** Lead with the main statement. Then the one idea that makes it work.
> Then the details. Isolate the one genuinely clever step — there's almost
> always exactly one — and let everything else be obviously routine _by
> contrast_.
>
> **Step names.** Number steps *and* name them: "**Step 3: Fourier inversion and translation invariance.**" The name is a promise to the reader about what this block accomplishes. Signpost reductions explicitly: "We are reduced to showing that…"
> **Step names.** Number steps _and_ name them: "**Step 3: Fourier inversion and
> translation invariance.**" The name is a promise to the reader about what this
> block accomplishes. Signpost reductions explicitly: "We are reduced to showing
> that…"
>
> Output clean LaTeX using the template below. Aim for: a strong grad student could reconstruct every suppressed detail, a professor could skim the step names alone and nod.
> Output clean LaTeX using the template below. Aim for: a strong grad student
> could reconstruct every suppressed detail, a professor could skim the step
> names alone and nod.
---
## 2. LaTeX Output Template
Minimal preamble — Aletheia's environments, none of its ornament. No `tcolorbox`, no custom colors.
Minimal preamble — Aletheia's environments, none of its ornament. No
`tcolorbox`, no custom colors.
```latex
\documentclass[11pt]{article}
@@ -89,10 +119,16 @@ Minimal preamble — Aletheia's environments, none of its ornament. No `tcolorbo
```
**Style conventions lifted from the Aletheia samples:**
- Display math for the equation a step *produces*; inline math for the algebra getting there.
- Cite precisely when invoking a named result: *(Jacquet–Piatetski-Shapiro–Shalika, 1981)* — not "by a well-known theorem".
- In contradiction proofs: state the false assumption plainly ("Suppose, for contradiction, that…"), and flag the collision plainly ("We are led to the contradiction $0 > 0$.").
- Integer bounds earn the ceiling: if $d \ge n/k$ and $d \in \mathbb{Z}$, write $d \ge \lceil n/k \rceil$. Free sharpness.
- Display math for the equation a step _produces_; inline math for the algebra
getting there.
- Cite precisely when invoking a named result:
  _(Jacquet–Piatetski-Shapiro–Shalika, 1981)_ — not "by a well-known theorem".
- In contradiction proofs: state the false assumption plainly ("Suppose, for
contradiction, that…"), and flag the collision plainly ("We are led to the
contradiction $0 > 0$.").
- Integer bounds earn the ceiling: if $d \ge n/k$ and $d \in \mathbb{Z}$, write
$d \ge \lceil n/k \rceil$. Free sharpness.
---
@@ -100,10 +136,18 @@ Minimal preamble — Aletheia's environments, none of its ornament. No `tcolorbo
The presentation agent should flag and fix these:
- **Discovery-order exposition.** "First I tried X, which led me to notice Y…" — the reader doesn't care. State Y.
- **Overkill constructions.** The tell: the bound you prove is parametrically stronger than what the next line consumes. Weaken it until it's tight.
- **Proof by intimidation.** *"It is trivial to see that…"*, *"Obviously…"*, *"A standard argument shows…"* — if it's trivial, one sentence suffices. Write the sentence.
- **Unnecessary generality.** Proving it for all $n$ when the problem asks about $n=3$ and the general case adds no insight, only indices.
- **Discovery-order exposition.** "First I tried X, which led me to notice Y…" —
the reader doesn't care. State Y.
- **Overkill constructions.** The tell: the bound you prove is parametrically
stronger than what the next line consumes. Weaken it until it's tight.
- **Proof by intimidation.** _"It is trivial to see that…"_, _"Obviously…"_, _"A
standard argument shows…"_ — if it's trivial, one sentence suffices. Write the
sentence.
- **Unnecessary generality.** Proving it for all $n$ when the problem asks about
$n=3$ and the general case adds no insight, only indices.
- **Orphan lemmas.** Stated, proved, cited once, three lines long. Inline it.
- **Unlabeled case splits.** Five cases, no indication of why five or what distinguishes them. Name the cases; say upfront which one carries the content.
- **Missing signposts.** A page of computation with no "we are reduced to" / "it suffices to show" markers. The reader shouldn't have to reverse-engineer your strategy.
- **Unlabeled case splits.** Five cases, no indication of why five or what
distinguishes them. Name the cases; say upfront which one carries the content.
- **Missing signposts.** A page of computation with no "we are reduced to" / "it
suffices to show" markers. The reader shouldn't have to reverse-engineer your
strategy.

View File

@@ -1,80 +1,135 @@
# Solver Heuristics (Pólya + Olympiad Practice)
For solver subagents. These are the moves to try when the direct approach stalls.
For solver subagents. These are the moves to try when the direct approach
stalls.
## Pólya's core moves (from "How to Solve It")
**Have you seen a related problem?** Not the same problem — one with the same UNKNOWN, or the same STRUCTURE. A problem about covering points with lines has the same shape as one about covering lattice points with arithmetic progressions.
**Have you seen a related problem?** Not the same problem — one with the same
UNKNOWN, or the same STRUCTURE. A problem about covering points with lines has
the same shape as one about covering lattice points with arithmetic
progressions.
**Specialize.** If you can't solve the given problem, solve n=3, n=4, n=5 by hand. The pattern is often the proof. (But: test past the first nontrivial case — n≤3 may be degenerate.)
**Specialize.** If you can't solve the given problem, solve n=3, n=4, n=5 by
hand. The pattern is often the proof. (But: test past the first nontrivial case
— n≤3 may be degenerate.)
**Generalize (inventor's paradox).** The more ambitious problem sometimes has MORE structure and is easier. "Prove for all primes" might be harder than "prove for all integers" if the integer case has a clean induction.
**Generalize (inventor's paradox).** The more ambitious problem sometimes has
MORE structure and is easier. "Prove for all primes" might be harder than "prove
for all integers" if the integer case has a clean induction.
**Drop a condition.** What happens if you relax one hypothesis? Does the result become trivially false? Where? That WHERE is often the key step — the point where the condition is load-bearing.
**Drop a condition.** What happens if you relax one hypothesis? Does the result
become trivially false? Where? That WHERE is often the key step — the point
where the condition is load-bearing.
**Work backwards.** Start from what you want to prove. What would imply it? What would imply THAT? If this chain meets something you can prove directly, you have the proof (reversed).
**Work backwards.** Start from what you want to prove. What would imply it? What
would imply THAT? If this chain meets something you can prove directly, you have
the proof (reversed).
**Auxiliary element.** Introduce something not in the problem — a new variable, a reflection, a well-chosen function. Olympiad geometry lives on this (auxiliary points, circles).
**Auxiliary element.** Introduce something not in the problem — a new variable,
a reflection, a well-chosen function. Olympiad geometry lives on this (auxiliary
points, circles).
## Olympiad-specific moves
**Find the invariant.** If there's a process (game, transformation, iteration), what quantity is preserved? Parity, sum, product modulo something.
**Find the invariant.** If there's a process (game, transformation, iteration),
what quantity is preserved? Parity, sum, product modulo something.
**Find the extremal.** Take the LARGEST, or SMALLEST, or LEFTMOST object. Extremal choices often have extra properties that generic choices don't.
**Find the extremal.** Take the LARGEST, or SMALLEST, or LEFTMOST object.
Extremal choices often have extra properties that generic choices don't.
**Double count.** Count the same thing two ways. Incidences, edges, sums over pairs.
**Double count.** Count the same thing two ways. Incidences, edges, sums over
pairs.
**Coloring / parity.** Can you 2-color the objects so the claim becomes a parity statement?
**Coloring / parity.** Can you 2-color the objects so the claim becomes a parity
statement?
**Smoothing / adjusting.** For inequalities: if you perturb two variables closer together (or further apart), does the expression increase or decrease? Extremize.
**Smoothing / adjusting.** For inequalities: if you perturb two variables closer
together (or further apart), does the expression increase or decrease?
Extremize.
**Symmetry → WLOG.** If the problem is symmetric in x,y,z, you can assume x≤y≤z. But only if the conclusion is ALSO symmetric.
**Symmetry → WLOG.** If the problem is symmetric in x,y,z, you can assume x≤y≤z.
But only if the conclusion is ALSO symmetric.
## Geometry-specific moves
Standard angles (induction, invariants, extremal) are often wrong-shaped for olympiad geometry. Use these instead:
Standard angles (induction, invariants, extremal) are often wrong-shaped for
olympiad geometry. Use these instead:
**Coordinate bash.** Place the configuration in coordinates. Choose them to kill degrees of freedom (origin at a center, axis along a line). Grind out the algebra. Ugly but reliable.
**Coordinate bash.** Place the configuration in coordinates. Choose them to kill
degrees of freedom (origin at a center, axis along a line). Grind out the
algebra. Ugly but reliable.
**Auxiliary point.** Introduce a point not in the problem — a reflection, a second intersection, the point where two lines "should" meet. Often the key construction is finding the right extra point.
**Auxiliary point.** Introduce a point not in the problem — a reflection, a
second intersection, the point where two lines "should" meet. Often the key
construction is finding the right extra point.
**Power of a point.** For any point P and circle ω, PA·PB is the same for every line through P meeting ω at A, B. Use it to turn ratios into equalities.
**Power of a point.** For any point P and circle ω, PA·PB is the same for every
line through P meeting ω at A, B. Use it to turn ratios into equalities.
**Spiral similarity / rotation.** Two directly similar triangles are related by a spiral similarity (rotation + scaling about a fixed point). Find that point — it often lies on a circle you already have.
**Spiral similarity / rotation.** Two directly similar triangles are related by
a spiral similarity (rotation + scaling about a fixed point). Find that point —
it often lies on a circle you already have.
**Inversion.** When there are many circles or tangencies, invert about a well-chosen center. Circles through the center become lines; tangencies become simpler tangencies.
**Inversion.** When there are many circles or tangencies, invert about a
well-chosen center. Circles through the center become lines; tangencies become
simpler tangencies.
**Angle chase.** Cyclic quadrilaterals give equal angles. Tangent-chord gives an angle equal to the inscribed angle. Chase around the figure.
**Angle chase.** Cyclic quadrilaterals give equal angles. Tangent-chord gives an
angle equal to the inscribed angle. Chase around the figure.
## Geometry-specific moves (these are DIFFERENT)
The standard angles (invariant, extremal, induction) don't fit circles/circumcenters/orthocenters. Geometry needs:
The standard angles (invariant, extremal, induction) don't fit
circles/circumcenters/orthocenters. Geometry needs:
**Coordinate bash.** Place one point at origin, another on the x-axis. Compute everything explicitly. The algebra is heavy but mechanical. For two circles with centers M, N and radii r, R: set M=(0,0), N=(d,0), then the intersection points have x-coordinate (r²+d²−R²)/2d and everything follows.
**Coordinate bash.** Place one point at origin, another on the x-axis. Compute
everything explicitly. The algebra is heavy but mechanical. For two circles with
centers M, N and radii r, R: set M=(0,0), N=(d,0), then the intersection points
have x-coordinate (r²+d²−R²)/2d and everything follows.
**Auxiliary point.** Introduce a point not in the problem — the reflection, the foot of a perpendicular, the second intersection. Olympiad geometry lives on finding the right extra point.
**Auxiliary point.** Introduce a point not in the problem — the reflection, the
foot of a perpendicular, the second intersection. Olympiad geometry lives on
finding the right extra point.
**Power of a point.** For point P and circle Γ: PA·PB is constant for any line through P meeting Γ at A,B. This converts circles to products.
**Power of a point.** For point P and circle Γ: PA·PB is constant for any line
through P meeting Γ at A,B. This converts circles to products.
**Inversion.** Circles through the center become lines. Sometimes the inverted problem is trivial.
**Inversion.** Circles through the center become lines. Sometimes the inverted
problem is trivial.
**Angle chasing / cyclic quads.** Four points are concyclic iff opposite angles sum to π. Chase angles until enough equalities force concyclicity.
**Angle chasing / cyclic quads.** Four points are concyclic iff opposite angles
sum to π. Chase angles until enough equalities force concyclicity.
## Recurrence-specific trap
For recurrences like b_{n+1} = P(b_n) where P is polynomial degree ≥ 2: **b_n grows doubly-exponentially**. You cannot compute b_30 exactly — it has trillions of digits. Work in ℤ/2^m (or ℤ/p^m) from the start. Prove b_n ≡ r_n (mod 2^m) by induction on n, NOT by computing b_n.
For recurrences like b\_{n+1} = P(b_n) where P is polynomial degree ≥ 2: **b_n
grows doubly-exponentially**. You cannot compute b_30 exactly — it has trillions
of digits. Work in ℤ/2^m (or ℤ/p^m) from the start. Prove b_n ≡ r_n (mod 2^m) by
induction on n, NOT by computing b_n.
## When the answer involves √n or log n
These answers often come from a structure that is NOT the obvious/symmetric one. The diagonal, the identity, the "natural" choice frequently gives the WORST case, not the best — it clusters the constraint in a way that prevents large substructures.
These answers often come from a structure that is NOT the obvious/symmetric one.
The diagonal, the identity, the "natural" choice frequently gives the WORST
case, not the best — it clusters the constraint in a way that prevents large
substructures.
**For pure-reasoning solvers**: Before claiming the symmetric choice is optimal, ask "what if I deliberately break the symmetry?" For grid/covering problems: what if the gaps are SPREAD OUT instead of clustered? For sequences: what if the extremal sequence is NOT constant or linear?
**For pure-reasoning solvers**: Before claiming the symmetric choice is optimal,
ask "what if I deliberately break the symmetry?" For grid/covering problems:
what if the gaps are SPREAD OUT instead of clustered? For sequences: what if the
extremal sequence is NOT constant or linear?
**For deep-mode agents**: Brute-force n=3..8 before theorizing. If the formula that fits is n+c√n instead of cn, the structure has √n-sized blocks.
**For deep-mode agents**: Brute-force n=3..8 before theorizing. If the formula
that fits is n+c√n instead of cn, the structure has √n-sized blocks.
## The Look Back phase (after you have a proof)
- **Can you check it?** Plug in small cases. Does n=3 give what your formula says?
- **Can you prove it differently?** A second proof is a verification. And often shorter.
- **Is your bound tight?** If you proved ≤ N and the answer is exactly N, find the extremal case. If you can't, your bound might be loose.
- **What did you actually use?** Sometimes you used less than all the hypotheses — the real theorem is stronger.
- **Can you check it?** Plug in small cases. Does n=3 give what your formula
says?
- **Can you prove it differently?** A second proof is a verification. And often
shorter.
- **Is your bound tight?** If you proved ≤ N and the answer is exactly N, find
the extremal case. If you can't, your bound might be loose.
- **What did you actually use?** Sometimes you used less than all the hypotheses
— the real theorem is stronger.

View File

@@ -1,135 +1,206 @@
# Verifier Patterns — Olympiad Subset
For a verifier with **no tools, only reasoning**. Each pattern is a mental check you can run on a candidate proof. These are the specific ways proofs go wrong that self-verification misses. (Source: 59 patterns from real research sessions; these 13 need no grep/fetch/compute.)
For a verifier with **no tools, only reasoning**. Each pattern is a mental check
you can run on a candidate proof. These are the specific ways proofs go wrong
that self-verification misses. (Source: 59 patterns from real research sessions;
these 13 need no grep/fetch/compute.)
Run #18 and #19 after any positive finding. Run #40 on any proof that feels too short.
Run #18 and #19 after any positive finding. Run #40 on any proof that feels too
short.
---
### Pattern 4: Would it prove a famous open problem?
**The check**: Specialize the claimed theorem to the most famous object in its class (ζ(s), the Ramsey number, the Collatz map). Does the specialization settle a known open problem?
**The check**: Specialize the claimed theorem to the most famous object in its
class (ζ(s), the Ramsey number, the Collatz map). Does the specialization settle
a known open problem?
**What it catches**: A bound "for all Dirichlet series with property P" that, applied to ζ, would prove Lindelöf — the proof treated arithmetic input as generic.
**What it catches**: A bound "for all Dirichlet series with property P" that,
applied to ζ, would prove Lindelöf — the proof treated arithmetic input as
generic.
**How to run it**: Find the step where the argument uses a generic property. Ask: does ζ (or the canonical hard instance) actually have this property? The gap is always where it doesn't.
**How to run it**: Find the step where the argument uses a generic property.
Ask: does ζ (or the canonical hard instance) actually have this property? The
gap is always where it doesn't.
---
### Pattern 5: Outside the hypothesis class
**The check**: For each example claimed to satisfy a theorem, re-derive the hypotheses from the definition — don't trust the label.
**The check**: For each example claimed to satisfy a theorem, re-derive the
hypotheses from the definition — don't trust the label.
**What it catches**: "f is entire of order ≤1, so by Thm 3.1…" — but Thm 3.1 needs f analytic in a *full disk* around 0; a natural boundary on the imaginary axis blocks it.
**What it catches**: "f is entire of order ≤1, so by Thm 3.1…" — but Thm 3.1
needs f analytic in a _full disk_ around 0; a natural boundary on the imaginary
axis blocks it.
**How to run it**: Write out the theorem's hypothesis verbatim. For each claimed instance, check inclusion from scratch. Watch for near-synonyms ("bounded" vs "bounded on the line"; "entire" vs "analytic on a domain").
**How to run it**: Write out the theorem's hypothesis verbatim. For each claimed
instance, check inclusion from scratch. Watch for near-synonyms ("bounded" vs
"bounded on the line"; "entire" vs "analytic on a domain").
---
### Pattern 6: Divergent sum behind analytic continuation
**The check**: When a divergent-looking sum is "bounded by ζ(s)" or similar, evaluate the bounding function at the boundary of the claimed range.
**The check**: When a divergent-looking sum is "bounded by ζ(s)" or similar,
evaluate the bounding function at the boundary of the claimed range.
**What it catches**: "Σ 1/n ≤ ζ(1)" — but ζ(1) is a pole. The analytic continuation of a sum is not the sum.
**What it catches**: "Σ 1/n ≤ ζ(1)" — but ζ(1) is a pole. The analytic
continuation of a sum is not the sum.
**How to run it**: Mentally substitute the boundary value of the parameter into the bounding expression. A pole or ∞ there means the original sum diverges, regardless of what the continued function says elsewhere.
**How to run it**: Mentally substitute the boundary value of the parameter into
the bounding expression. A pole or ∞ there means the original sum diverges,
regardless of what the continued function says elsewhere.
---
### Pattern 10: Same keywords, different theorem
**The check**: When a cited theorem has the right *words* but the fit feels off — check pointwise vs averaged, uniform vs a.e., finite vs asymptotic.
**The check**: When a cited theorem has the right _words_ but the fit feels off
— check pointwise vs averaged, uniform vs a.e., finite vs asymptotic.
**What it catches**: Invoking "Fourier decay ⇒ bound" for a pointwise estimate, when the cited decay theorem needs curvature and you only have it on average.
**What it catches**: Invoking "Fourier decay ⇒ bound" for a pointwise estimate,
when the cited decay theorem needs curvature and you only have it on average.
**How to run it**: State precisely what the proof *needs* (pointwise? for all x? with what uniformity?). State what the cited theorem *gives*. Sometimes the weaker version is enough and this *closes* a gap; sometimes the gap is real.
**How to run it**: State precisely what the proof _needs_ (pointwise? for all x?
with what uniformity?). State what the cited theorem _gives_. Sometimes the
weaker version is enough and this _closes_ a gap; sometimes the gap is real.
---
### Pattern 17: Test past the first nontrivial block
**The check**: Before accepting a pattern from small cases, identify where the structure first becomes nontrivial. Confirm the pattern holds *past* that threshold.
**The check**: Before accepting a pattern from small cases, identify where the
structure first becomes nontrivial. Confirm the pattern holds _past_ that
threshold.
**What it catches**: "Checked m = 1, 2, 3: all blocks have rank 1." But m ≤ 3 gives only 1×2 blocks — rank 1 is forced. First 2×2 appears at m = 4, and there the claim fails.
**What it catches**: "Checked m = 1, 2, 3: all blocks have rank 1." But m ≤ 3
gives only 1×2 blocks — rank 1 is forced. First 2×2 appears at m = 4, and there
the claim fails.
**How to run it**: Ask "what makes the small cases easy?" Find the parameter value where that degeneracy disappears. The claim must survive at least one case beyond it.
**How to run it**: Ask "what makes the small cases easy?" Find the parameter
value where that degeneracy disappears. The claim must survive at least one case
beyond it.
---
### Pattern 18: Tautological reduction
**The check**: When a reduction chain ends at "estimate X would finish it," substitute the chain's own already-proven identities into X.
**The check**: When a reduction chain ends at "estimate X would finish it,"
substitute the chain's own already-proven identities into X.
**What it catches**: "Suffices to show ∫|P|² ≤ C·H." But the chain itself proved ∫|P|² = H + 2Re(OD') *exactly*. So X is just the original conjecture plus a cosmetic shift — not a reduction.
**What it catches**: "Suffices to show ∫|P|² ≤ C·H." But the chain itself proved
∫|P|² = H + 2Re(OD') _exactly_. So X is just the original conjecture plus a
cosmetic shift — not a reduction.
**How to run it**: Take each identity the chain proved along the way and plug it into the "final gap." If you recover the starting conjecture (or something at least as strong), the chain went in a circle.
**How to run it**: Take each identity the chain proved along the way and plug it
into the "final gap." If you recover the starting conjecture (or something at
least as strong), the chain went in a circle.
---
### Pattern 19: Derived obstruction vs base obstruction
**The check**: When the same obstruction kills 3+ independent approaches, compute the disputed property on the *original* object — before any reduction.
**The check**: When the same obstruction kills 3+ independent approaches,
compute the disputed property on the _original_ object — before any reduction.
**What it catches**: "det(Hessian) = 0, ruled surface, decoupling fails" — for the phase log(2πmθ). But the *base* phase is nθ t·log(n), and *its* Hessian has det = 1. The obstruction lived in the proxy.
**What it catches**: "det(Hessian) = 0, ruled surface, decoupling fails" — for
the phase log(2πmθ). But the _base_ phase is nθ t·log(n), and _its_ Hessian
has det = 1. The obstruction lived in the proxy.
**How to run it**: Name the object the obstruction is *about*. Is it the thing you started with, or something a reduction produced? Go back to the start and check directly.
**How to run it**: Name the object the obstruction is _about_. Is it the thing
you started with, or something a reduction produced? Go back to the start and
check directly.
---
### Pattern 22: Absolute-sum gives O(K); compute the mean first
**The check**: Before accepting that Σₖ Xₖ = O(1) is "too hard because |Xₖ| summed gives O(K)," compute the mean of Xₖ over the varying parameter.
**The check**: Before accepting that Σₖ Xₖ = O(1) is "too hard because |Xₖ|
summed gives O(K)," compute the mean of Xₖ over the varying parameter.
**What it catches**: Weyl equidistribution gives mean(Xₖ) = 0 *exactly*. So Σ Xₖ is a fluctuation sum — the target is Var = O(1), and half the conjecture falls in one line.
**What it catches**: Weyl equidistribution gives mean(Xₖ) = 0 _exactly_. So Σ Xₖ
is a fluctuation sum — the target is Var = O(1), and half the conjecture falls
in one line.
**How to run it**: Separate Xₖ into mean + fluctuation. If orthogonality/equidistribution forces the mean to zero, you were never fighting K terms of size 1 — you were fighting √K terms (or better). Rewrite the target.
**How to run it**: Separate Xₖ into mean + fluctuation. If
orthogonality/equidistribution forces the mean to zero, you were never fighting
K terms of size 1 — you were fighting √K terms (or better). Rewrite the target.
---
### Pattern 23: Formula's scope never stated
**The check**: For any identity used in the proof, ask: was this proved for the general case, or for a special case that the author silently generalized?
**The check**: For any identity used in the proof, ask: was this proved for the
general case, or for a special case that the author silently generalized?
**What it catches**: "κ₄ = 3d − 1" was derived for 2-piece Cantor sets. The proof applies it to an m-piece set, where the real formula involves additive energy and can differ by a constant factor.
**What it catches**: "κ₄ = 3d − 1" was derived for 2-piece Cantor sets. The
proof applies it to an m-piece set, where the real formula involves additive
energy and can differ by a constant factor.
**How to run it**: Trace the identity to where it was first introduced. What were the standing assumptions *there*? Check that those assumptions still hold at the point of use.
**How to run it**: Trace the identity to where it was first introduced. What
were the standing assumptions _there_? Check that those assumptions still hold
at the point of use.
---
### Pattern 35: Count quantifiers before diagonalizing
**The check**: Before "diagonalize against class C using property P," ask whether *certifying* P is an ∃-statement or a ∀-statement.
**The check**: Before "diagonalize against class C using property P," ask
whether _certifying_ P is an ∃-statement or a ∀-statement.
**What it catches**: "Find an x not computed by any small circuit" — but verifying "no small circuit computes x" is a ∀ over circuits. Your diagonalizer is in Σ₂, not NP. (This is *why* Kannan gives Σ₂ᴾ ⊄ SIZE, not NP ⊄ SIZE.)
**What it catches**: "Find an x not computed by any small circuit" — but
verifying "no small circuit computes x" is a ∀ over circuits. Your diagonalizer
is in Σ₂, not NP. (This is _why_ Kannan gives Σ₂ᴾ ⊄ SIZE, not NP ⊄ SIZE.)
**How to run it**: Write the diagonalization as a formula. Count alternations. If you need ∀∃ to describe the witness, you've jumped a level in the hierarchy.
**How to run it**: Write the diagonalization as a formula. Count alternations.
If you need ∀∃ to describe the witness, you've jumped a level in the hierarchy.
---
### Pattern 40: One-line-proof-too-clean
**The check**: Extract the proof's key step as a lemma in *full generality* not specialized to the objects at hand. Try a 2×2 counterexample to the general lemma.
**The check**: Extract the proof's key step as a lemma in _full generality_
not specialized to the objects at hand. Try a 2×2 counterexample to the general
lemma.
**What it catches**: "rank depends only on monomial support" — but [[1,1],[1,1]] has rank 1 and [[1,1],[1,1]] has rank 2 with the same support. The general lemma is false; the specific case holds because sgn(π) = f(S)·g(T) factors. *That's* the real proof.
**What it catches**: "rank depends only on monomial support" — but [[1,1],[1,1]]
has rank 1 and [[1,1],[1,1]] has rank 2 with the same support. The general
lemma is false; the specific case holds because sgn(π) = f(S)·g(T) factors.
_That's_ the real proof.
**How to run it**: If the general lemma dies but the specific conclusion survives numerically, there's hidden structure. Find it. The real proof goes through *that*, not the false lemma.
**How to run it**: If the general lemma dies but the specific conclusion
survives numerically, there's hidden structure. Find it. The real proof goes
through _that_, not the false lemma.
---
### Pattern 58: Quantifier direction on domain size
**The check**: Before claiming one statement is "strictly stronger" than another because its domain is smaller — check whether the quantifier is ∀ or ∃.
**The check**: Before claiming one statement is "strictly stronger" than another
because its domain is smaller — check whether the quantifier is ∀ or ∃.
**What it catches**: "∀ S ∈ D, φ(S)" over a *smaller* D is *weaker* (fewer obligations). "∃ S ∈ D, φ(S)" over smaller D is *stronger* (fewer candidates). Backwards strength claims swap these.
**What it catches**: "∀ S ∈ D, φ(S)" over a _smaller_ D is _weaker_ (fewer
obligations). "∃ S ∈ D, φ(S)" over smaller D is _stronger_ (fewer candidates).
Backwards strength claims swap these.
**How to run it**: Say the statement out loud with the quantifier explicit. Shrinking the domain under ∀ drops requirements. Shrinking under ∃ drops witnesses. Only one direction is "harder."
**How to run it**: Say the statement out loud with the quantifier explicit.
Shrinking the domain under ∀ drops requirements. Shrinking under ∃ drops
witnesses. Only one direction is "harder."
---
### Pattern 60: Easiest-interpretation trap
**The check**: Before solving, write down 2–3 readings of the problem statement. Flag whichever one makes the problem trivial.
**The check**: Before solving, write down 2–3 readings of the problem statement.
Flag whichever one makes the problem trivial.
**What it catches**: 63 "technically correct" solutions; only 13 "meaningfully correct." The gap: solving the easiest grammatically-valid reading instead of the intended one. Olympiad problems often *plant* an easy misreading.
**What it catches**: 63 "technically correct" solutions; only 13 "meaningfully
correct." The gap: solving the easiest grammatically-valid reading instead of
the intended one. Olympiad problems often _plant_ an easy misreading.
**How to run it**: Ask "under which reading is this a real problem?" If your interpretation makes it a one-liner and the problem is worth 7 points, you've probably chosen wrong. Solve the hard reading; note the easy one only as a remark.
**How to run it**: Ask "under which reading is this a real problem?" If your
interpretation makes it a one-liner and the problem is worth 7 points, you've
probably chosen wrong. Solve the hard reading; note the easy one only as a
remark.

View File

@@ -1,5 +1,6 @@
{
"name": "ralph-loop",
"version": "1.0.0",
"description": "Continuous self-referential AI loops for interactive iterative development, implementing the Ralph Wiggum technique. Run Claude in a while-true loop with the same prompt until task completion.",
"author": {
"name": "Anthropic",

View File

@@ -6,7 +6,7 @@
"hooks": [
{
"type": "command",
"command": "${CLAUDE_PLUGIN_ROOT}/hooks/stop-hook.sh"
"command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/stop-hook.sh\""
}
]
}

View File

@@ -0,0 +1,42 @@
---
name: session-report
description: Generate an explorable HTML report of Claude Code session usage (tokens, cache, subagents, skills, expensive prompts) from ~/.claude/projects transcripts.
---
# Session Report
Produce a self-contained HTML report of Claude Code usage and save it to the current working directory.
## Steps
1. **Get data.** Run the bundled analyzer (default window: last 7 days; honor a different range if the user passed one, e.g. `24h`, `30d`, or `all`). The script `analyze-sessions.mjs` lives in the same directory as this SKILL.md — use its absolute path:
```sh
node <skill-dir>/analyze-sessions.mjs --json --since 7d > /tmp/session-report.json
```
For all-time, omit `--since`.
2. **Read** `/tmp/session-report.json`. Skim `overall`, `by_project`, `by_subagent_type`, `by_skill`, `cache_breaks`, `top_prompts`.
3. **Copy the template** (also bundled alongside this SKILL.md) to the output path in the current working directory:
```sh
cp <skill-dir>/template.html ./session-report-$(date +%Y%m%d-%H%M).html
```
4. **Edit the output file** (use Edit, not Write — preserve the template's JS/CSS):
- Replace the contents of `<script id="report-data" type="application/json">` with the full JSON from step 1. The page's JS renders the hero total, all tables, bars, and drill-downs from this blob automatically.
- Fill the `<!-- AGENT: anomalies -->` block with **3–5 one-line findings**. Express figures as a **% of total tokens** wherever possible (total = `overall.input_tokens.total + overall.output_tokens`). One line per finding, exact markup:
```html
<div class="take bad"><div class="fig">41.2%</div><div class="txt"><b>cc-monitor</b> consumed 41% of the week across just 3 sessions</div></div>
```
Classes: `.take bad` for waste/anomalies (red), `.take good` for healthy signals (green), `.take info` for neutral facts (blue). The `.fig` is one short number (a %, a count, or a multiplier like `12×`). The `.txt` is one plain-English sentence naming the project/skill/prompt; wrap the subject in `<b>`. Look for: a project or skill eating a disproportionate share, cache-hit <85%, a single prompt >2% of total, subagent types averaging >1M tokens/call, cache breaks clustering.
- Fill the `<!-- AGENT: optimizations -->` block (at the **bottom** of the page) with 1–4 `<div class="callout">` suggestions tied to specific rows (e.g. "`/weekly-status` spawned 7 subagents for 8.1% of total — scope it to fewer parallel agents").
- Do not restructure existing sections.
5. **Report** the saved file path to the user. Do not open it or render it.
## Notes
- The template is the source of interactivity (sorting, expand/collapse, block-char bars). Your job is data + narrative, not markup.
- Keep commentary terse and specific — reference actual project names, numbers, timestamps from the JSON.
- `top_prompts` already includes subagent tokens and rolls task-notification continuations into the originating prompt.
- If the JSON is >2MB, trim `top_prompts` to 100 entries and `cache_breaks` to 100 before embedding (they should already be capped).

View File

@@ -0,0 +1,816 @@
#!/usr/bin/env node
/* eslint-disable */
/**
* analyze-sessions.mjs
*
* Scans ~/.claude/projects/**.jsonl transcript files and reports token usage,
* message counts, runtime, cache breaks, subagent and skill activity.
*
* Output is human-readable text by default; pass --json for machine-readable.
*
* Usage:
* node scripts/analyze-sessions.mjs [--dir <projects-dir>] [--json] [--since <ISO|7d|24h>] [--top N]
*
* Notes on JSONL structure (discovered empirically):
* - One API response is split into MULTIPLE `type:"assistant"` entries (one per
* content block). They share the same `requestId` / `message.id`, and only the
* LAST one carries the final `output_tokens`. We dedupe by requestId and keep
* the max output_tokens to avoid 3-10x overcounting.
* - `type:"user"` entries include tool_result messages, interrupt markers,
* compact summaries and meta-injected text. A "human" message is one where
* isSidechain/isMeta/isCompactSummary are falsy and the content is a plain
* string (or text block) that isn't a tool_result or interrupt marker.
* - Subagent transcripts live in <project>/<sessionId>/subagents/*.jsonl with a
* sibling *.meta.json containing {agentType}. When meta is absent we fall back
* to the filename label (`agent-a<label>-<hash>.jsonl` → label) or "fork".
* - Resumed sessions can re-serialize prior entries into a new file; we dedupe
* globally by entry `uuid` so replayed history isn't double-counted.
*/
import fs from 'fs'
import os from 'os'
import path from 'path'
import readline from 'readline'
// ---------------------------------------------------------------------------
// CLI args
// ---------------------------------------------------------------------------
// Raw CLI tokens (everything after the node binary and script path).
const argv = process.argv.slice(2)

/**
 * Look up a CLI flag by name.
 * Returns `dflt` when the flag is absent, `true` when it is present with no
 * value (end of argv, or the next token is another `--flag`), otherwise the
 * value token that follows it.
 */
function flag(name, dflt) {
  const at = argv.indexOf(name)
  if (at < 0) return dflt
  const next = argv[at + 1]
  if (next === undefined) return true
  return next.startsWith('--') ? true : next
}
// Resolved CLI options (see `flag` above).
const ROOT = flag('--dir', path.join(os.homedir(), '.claude', 'projects')) // transcript root
const AS_JSON = argv.includes('--json') // machine-readable JSON instead of text report
const TOP_N = parseInt(flag('--top', '15'), 10) // presumably caps "top" listings — used in main(), not visible here
const SINCE = parseSince(flag('--since', null)) // null = no time filter; entries older than this are skipped
const CACHE_BREAK_THRESHOLD = parseInt(flag('--cache-break', '100000'), 10) // uncached input tokens above which a call is recorded as a cache break
const IDLE_GAP_MS = 5 * 60 * 1000 // gaps >5min don't count toward "active" time
/**
 * Parse a --since value into a cutoff Date, or null when absent/unparseable.
 * Accepts a relative window ("7d", "24h") or anything `new Date` understands
 * (e.g. an ISO timestamp).
 */
function parseSince(s) {
  if (!s) return null
  const rel = /^(\d+)([dh])$/.exec(s)
  if (rel) {
    const [, count, unit] = rel
    const unitMs = unit === 'd' ? 86400000 : 3600000
    return new Date(Date.now() - parseInt(count, 10) * unitMs)
  }
  const parsed = new Date(s)
  return isNaN(parsed) ? null : parsed
}
// ---------------------------------------------------------------------------
// Stats container
// ---------------------------------------------------------------------------
/**
 * Create an empty stats accumulator. One exists per aggregation bucket
 * (overall, per-project, per-subagent-type, per-skill); `addUsage` and
 * `processFile` mutate it in place.
 */
function newStats() {
  return {
    sessions: new Set(), // distinct session ids seen by this bucket
    apiCalls: 0, // deduped API responses (one per requestId)
    inputUncached: 0, // usage.input_tokens
    inputCacheCreate: 0, // usage.cache_creation_input_tokens
    inputCacheRead: 0, // usage.cache_read_input_tokens
    outputTokens: 0,
    humanMessages: 0, // plain human prompts in main (non-sidechain) transcripts
    wallClockMs: 0, // first-to-last timestamp span, summed over files
    activeMs: 0, // wall time excluding gaps >= IDLE_GAP_MS
    cacheBreaks: [], // [{ts, session, project, uncached, total}]
    subagentCalls: 0, // subagent transcript files counted
    subagentTokens: 0, // total (in+out) inside subagent transcripts
    skillInvocations: {}, // name -> count
    firstTs: null, // earliest entry timestamp (ms epoch) seen, or null
    lastTs: null, // latest entry timestamp (ms epoch) seen, or null
  }
}
/**
 * Fold one API response's `usage` payload into a stats bucket.
 * Bumps the call counter; missing usage fields count as zero.
 */
function addUsage(s, u) {
  s.apiCalls += 1
  const fieldMap = [
    ['inputUncached', 'input_tokens'],
    ['inputCacheCreate', 'cache_creation_input_tokens'],
    ['inputCacheRead', 'cache_read_input_tokens'],
    ['outputTokens', 'output_tokens'],
  ]
  for (const [statKey, usageKey] of fieldMap) {
    s[statKey] += u[usageKey] || 0
  }
}
// ---------------------------------------------------------------------------
// File discovery
// ---------------------------------------------------------------------------
/**
 * Recursively yield the path of every *.jsonl file under `dir`.
 * Unreadable directories (missing, permission denied) are skipped silently.
 */
function* walk(dir) {
  let entries
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true })
  } catch {
    return // best-effort: treat an unreadable dir as empty
  }
  for (const entry of entries) {
    const full = path.join(dir, entry.name)
    if (entry.isDirectory()) {
      yield* walk(full)
    } else if (entry.isFile() && entry.name.endsWith('.jsonl')) {
      yield full
    }
  }
}
/**
 * Classify a .jsonl path (relative to ROOT) into
 * { project, sessionId, kind: 'main'|'subagent', agentId?, agentTypeHint? }.
 * agentTypeHint comes from the sibling meta.json or the filename label; the
 * final type is resolved in main() after the parent-transcript map is built.
 */
function classifyFile(p) {
  const parts = path.relative(ROOT, p).split(path.sep)
  const project = parts[0]

  // <project>/<sessionId>/subagents/agent-*.jsonl
  const subIdx = parts.indexOf('subagents')
  if (subIdx !== -1) {
    const base = path.basename(p, '.jsonl')
    return {
      project,
      sessionId: parts[subIdx - 1],
      kind: 'subagent',
      agentId: base.replace(/^agent-/, ''),
      agentTypeHint:
        inferAgentTypeFromMeta(p) || inferAgentTypeFromFilename(base),
    }
  }

  // <project>/<sessionId>/workflows/... — counted as a 'workflow' subagent
  if (parts.includes('workflows')) {
    return {
      project,
      sessionId: parts[1],
      kind: 'subagent',
      agentTypeHint: 'workflow',
    }
  }

  // <project>/<sessionId>.jsonl — a main transcript
  return { project, sessionId: path.basename(p, '.jsonl'), kind: 'main' }
}
/**
 * Read the sibling <name>.meta.json of a subagent transcript and return its
 * string `agentType`, or null when the meta file is absent/unreadable/invalid.
 */
function inferAgentTypeFromMeta(jsonlPath) {
  const metaPath = jsonlPath.replace(/\.jsonl$/, '.meta.json')
  let meta
  try {
    meta = JSON.parse(fs.readFileSync(metaPath, 'utf8'))
  } catch {
    return null // no meta — caller falls back to the filename label
  }
  return meta && typeof meta.agentType === 'string' ? meta.agentType : null
}
/**
 * Extract a subagent label from a transcript basename.
 * agentId = 'a' + hex16 OR 'a' + <label> + '-' + hex16 (src/utils/uuid.ts),
 * so labeled files look like `agent-a<label>-<hex>`. Returns the internal
 * background-fork label, or null for unlabeled ids (those are resolved via
 * the agentIdToType map, defaulting to 'fork').
 */
function inferAgentTypeFromFilename(base) {
  const match = /^agent-a([a-zA-Z_][\w-]*?)-[0-9a-f]{6,}$/.exec(base)
  return match ? match[1] : null
}
// ---------------------------------------------------------------------------
// Per-file streaming parse
// ---------------------------------------------------------------------------
// --- Cross-file parse state (module-global; shared by every processFile call) ---
const seenUuids = new Set() // global dedupe across resumed sessions
const seenRequestIds = new Set() // global dedupe for usage accounting
const toolUseIdToType = new Map() // tool_use id -> subagent_type (from Agent/Task tool_use)
const agentIdToType = new Map() // agentId -> subagent_type (linked via tool_result)
const toolUseIdToPrompt = new Map() // tool_use id -> promptKey (Agent spawned during this prompt)
const agentIdToPrompt = new Map() // agentId -> promptKey
const prompts = new Map() // promptKey -> { text, ts, project, sessionId, ...usage }
const sessionTurns = new Map() // sessionId -> [promptKey, ...] in transcript order

/**
 * Get-or-create the accumulator record for one human prompt. Metadata fields
 * come from `init` only on first creation; the usage counters start at zero
 * and are incremented as API calls get attributed to the prompt.
 */
function promptRecord(key, init) {
  const existing = prompts.get(key)
  if (existing) return existing
  const fresh = {
    text: init.text,
    ts: init.ts,
    project: init.project,
    sessionId: init.sessionId,
    apiCalls: 0,
    subagentCalls: 0,
    inputUncached: 0,
    inputCacheCreate: 0,
    inputCacheRead: 0,
    outputTokens: 0,
  }
  prompts.set(key, fresh)
  return fresh
}
/**
 * Stream one .jsonl transcript and fold its activity into the given buckets.
 *
 * @param {string} p - path of the transcript file.
 * @param {{project: string, sessionId: string, kind: 'main'|'subagent',
 *          agentId?: string, agentType?: string}} info - classification of
 *          the file (agentType is presumably attached by the caller before
 *          this runs — main() is not fully visible here; TODO confirm).
 * @param {{overall: object, project: object, subagent?: object,
 *          skillStats?: Map}} buckets - newStats() containers to mutate.
 *
 * Side effects beyond `buckets`: updates the module-level dedupe sets
 * (seenUuids, seenRequestIds), the prompt-attribution maps
 * (toolUseIdToType/Prompt, agentIdToType/Prompt), and per-prompt usage
 * records in `prompts` / `sessionTurns`.
 */
async function processFile(p, info, buckets) {
  const rl = readline.createInterface({
    input: fs.createReadStream(p, { encoding: 'utf8' }),
    crlfDelay: Infinity,
  })
  // Per-file: dedupe API calls by requestId, keep the one with max output_tokens.
  // We collect first, then commit, because earlier blocks have stale output counts.
  const fileApiCalls = new Map() // key -> {usage, ts}
  let firstTs = null
  let lastTs = null
  let prevTs = null
  let activeMs = 0
  let currentSkill = null // skill attribution for this turn
  // Prompt attribution: in main files this is set on each human message; in
  // subagent files it's inherited from the spawning prompt (via agentIdToPrompt).
  let currentPrompt =
    info.kind === 'subagent' && info.agentId
      ? agentIdToPrompt.get(info.agentId) || null
      : null
  const project = buckets.project
  const overall = buckets.overall
  const subagent = buckets.subagent // may be null
  const skillStats = buckets.skillStats // map name -> stats
  for await (const line of rl) {
    if (!line) continue
    let e
    try {
      e = JSON.parse(line)
    } catch {
      continue // tolerate malformed lines
    }
    // global uuid dedupe (resumed sessions replay history)
    if (e.uuid) {
      if (seenUuids.has(e.uuid)) continue
      seenUuids.add(e.uuid)
    }
    // timestamp tracking
    if (e.timestamp) {
      const ts = Date.parse(e.timestamp)
      if (!isNaN(ts)) {
        // --since filter: out-of-range entries are skipped entirely
        if (SINCE && ts < SINCE.getTime()) continue
        if (firstTs === null) firstTs = ts
        if (prevTs !== null) {
          // "active" time only accumulates across gaps shorter than the idle cutoff
          const gap = ts - prevTs
          if (gap > 0 && gap < IDLE_GAP_MS) activeMs += gap
        }
        prevTs = ts
        lastTs = ts
      }
    }
    if (e.type === 'user') {
      // Link Agent tool_result -> agentId for type + prompt attribution.
      const tur = e.toolUseResult
      if (tur && tur.agentId) {
        const c0 = Array.isArray(e.message?.content)
          ? e.message.content[0]
          : null
        const tuid = c0 && c0.tool_use_id
        if (tuid) {
          const st = toolUseIdToType.get(tuid)
          if (st) agentIdToType.set(tur.agentId, st)
          const pk = toolUseIdToPrompt.get(tuid)
          if (pk) {
            agentIdToPrompt.set(tur.agentId, pk)
            const r = prompts.get(pk)
            if (r) r.subagentCalls++
          }
        }
      }
      // handleUser may reset/set the current skill and prompt via the setters.
      handleUser(
        e,
        info,
        { project, overall, subagent },
        v => {
          currentSkill = v
        },
        pk => {
          currentPrompt = pk
        },
      )
      continue
    }
    if (e.type === 'assistant') {
      const msg = e.message || {}
      const usage = msg.usage
      // detect Skill / Agent tool calls in content
      if (Array.isArray(msg.content)) {
        for (const c of msg.content) {
          if (c && c.type === 'tool_use') {
            if (c.name === 'Skill' && c.input && c.input.skill) {
              const sk = String(c.input.skill)
              bumpSkill(overall, sk)
              bumpSkill(project, sk)
              if (subagent) bumpSkill(subagent, sk)
              currentSkill = sk
            }
            if (c.name === 'Agent' || c.name === 'Task') {
              if (c.input && c.input.subagent_type) {
                toolUseIdToType.set(c.id, String(c.input.subagent_type))
              }
              if (currentPrompt) toolUseIdToPrompt.set(c.id, currentPrompt)
            }
          }
        }
      }
      if (!usage) continue
      // Dedupe key: prefer requestId, then a plausible real message id,
      // else a file-local fallback so the entry is still counted once.
      const key =
        e.requestId ||
        (msg.id && msg.id.startsWith('msg_0') && msg.id.length > 10
          ? msg.id
          : null) ||
        `${p}:${e.uuid || ''}`
      const prev = fileApiCalls.get(key)
      // Keep the entry with the largest output_tokens (the final block of a
      // multi-block response carries the real count).
      if (
        !prev ||
        (usage.output_tokens || 0) >= (prev.usage.output_tokens || 0)
      ) {
        fileApiCalls.set(key, {
          usage,
          ts: e.timestamp,
          skill: currentSkill,
          prompt: currentPrompt,
        })
      }
      continue
    }
  }
  // commit timestamps
  if (firstTs !== null && lastTs !== null) {
    const wall = lastTs - firstTs
    for (const s of [overall, project, subagent].filter(Boolean)) {
      s.wallClockMs += wall
      s.activeMs += activeMs
      if (!s.firstTs || firstTs < s.firstTs) s.firstTs = firstTs
      if (!s.lastTs || lastTs > s.lastTs) s.lastTs = lastTs
    }
  }
  // commit API calls
  for (const [key, { usage, ts, skill, prompt }] of fileApiCalls) {
    // seenRequestIds is global: a call replayed in a resumed session's file
    // is only committed once across the whole scan.
    if (key && seenRequestIds.has(key)) continue
    seenRequestIds.add(key)
    const targets = [overall, project]
    if (subagent) targets.push(subagent)
    if (skill && skillStats) {
      if (!skillStats.has(skill)) skillStats.set(skill, newStats())
      targets.push(skillStats.get(skill))
    }
    for (const s of targets) addUsage(s, usage)
    if (prompt) {
      const r = prompts.get(prompt)
      if (r) {
        r.apiCalls++
        r.inputUncached += usage.input_tokens || 0
        r.inputCacheCreate += usage.cache_creation_input_tokens || 0
        r.inputCacheRead += usage.cache_read_input_tokens || 0
        r.outputTokens += usage.output_tokens || 0
      }
    }
    // subagent token accounting on parent buckets
    if (info.kind === 'subagent') {
      const tot =
        (usage.input_tokens || 0) +
        (usage.cache_creation_input_tokens || 0) +
        (usage.cache_read_input_tokens || 0) +
        (usage.output_tokens || 0)
      overall.subagentTokens += tot
      project.subagentTokens += tot
      if (subagent) subagent.subagentTokens += tot
    }
    // cache break detection
    const uncached =
      (usage.input_tokens || 0) + (usage.cache_creation_input_tokens || 0)
    if (uncached > CACHE_BREAK_THRESHOLD) {
      const total = uncached + (usage.cache_read_input_tokens || 0)
      const cb = {
        ts,
        session: info.sessionId,
        project: info.project,
        uncached,
        total,
        kind: info.kind,
        agentType: info.agentType,
        prompt,
      }
      overall.cacheBreaks.push(cb)
      project.cacheBreaks.push(cb)
      if (subagent) subagent.cacheBreaks.push(cb)
    }
  }
  // only count this file toward session/subagent tallies if it had in-range entries
  if (firstTs !== null || fileApiCalls.size > 0) {
    for (const s of [overall, project, subagent].filter(Boolean)) {
      s.sessions.add(info.sessionId)
    }
    if (info.kind === 'subagent') {
      overall.subagentCalls++
      project.subagentCalls++
      if (subagent) subagent.subagentCalls++
    }
  }
}
/**
 * Process a `type:"user"` transcript entry: count slash commands, skip
 * non-human noise (meta, compact summaries, tool_results, auto-continuation
 * tags, interrupt markers), and — for main non-sidechain transcripts — count
 * a human message and start a new prompt record.
 *
 * @param e transcript entry.
 * @param info classifyFile() result for the containing file.
 * @param buckets stats containers; `subagent` may be null.
 * @param setSkill callback: sets/resets the caller's current-skill state.
 * @param setPrompt callback: sets the caller's current-prompt key.
 */
function handleUser(
  e,
  info,
  { project, overall, subagent },
  setSkill,
  setPrompt,
) {
  if (e.isMeta || e.isCompactSummary) return
  const content = e.message && e.message.content
  let isToolResult = false
  let text = null
  // Content can be a plain string or an array of blocks; only the first
  // block is inspected to classify the entry.
  if (typeof content === 'string') {
    text = content
  } else if (Array.isArray(content)) {
    const first = content[0]
    if (first && first.type === 'tool_result') isToolResult = true
    else if (first && first.type === 'text') text = first.text || ''
  }
  if (isToolResult) return
  let slashCmd = null
  if (text) {
    // Auto-continuations (task notifications, scheduled wakeups) are not new
    // human prompts; keep attributing to the previously active prompt.
    if (
      text.startsWith('<task-notification') ||
      text.startsWith('<scheduled-wakeup') ||
      text.startsWith('<background-task')
    ) {
      return
    }
    // Slash commands are serialized as <command-name>/foo</command-name>.
    const m = /<command-(?:name|message)>\/?([^<]+)<\/command-/.exec(text)
    if (m) {
      slashCmd = m[1].trim()
      bumpSkill(overall, slashCmd)
      bumpSkill(project, slashCmd)
      if (subagent) bumpSkill(subagent, slashCmd)
      setSkill(slashCmd)
    } else {
      setSkill(null) // plain human message resets skill attribution
    }
    // Interrupt markers are checked AFTER skill handling, so they reset the
    // current skill but never start a prompt.
    if (text.startsWith('[Request interrupted')) return
  }
  // Only count as human message / start a prompt in main (non-sidechain) transcripts
  if (info.kind === 'main' && !e.isSidechain) {
    overall.humanMessages++
    project.humanMessages++
    // Prompt key: entry uuid when present, else session+timestamp fallback.
    const pk = e.uuid || `${info.sessionId}:${e.timestamp}`
    promptRecord(pk, {
      text: promptPreview(text, slashCmd),
      ts: e.timestamp,
      project: info.project,
      sessionId: info.sessionId,
    })
    setPrompt(pk)
    let turns = sessionTurns.get(info.sessionId)
    if (!turns) sessionTurns.set(info.sessionId, (turns = []))
    turns.push(pk)
  }
}
// One-line preview of a human prompt for report output. Slash commands
// render as "/name"; otherwise markup tags are stripped, whitespace is
// collapsed, and the result is capped at 240 characters (ellipsized).
function promptPreview(text, slashCmd) {
  if (slashCmd) return `/${slashCmd}`
  if (!text) return '(non-text)'
  const collapsed = text
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
  if (collapsed.length <= 240) return collapsed
  return collapsed.slice(0, 237) + '…'
}
// ±2 user messages around a given prompt, with the api-call count that
// followed each one. Used for drill-down in the HTML report. Returns null
// when the prompt or its session turn list cannot be found.
function buildContext(pk) {
  const rec = prompts.get(pk)
  if (!rec) return null
  const turns = sessionTurns.get(rec.sessionId)
  if (!turns) return null
  const idx = turns.indexOf(pk)
  if (idx === -1) return null
  const start = Math.max(0, idx - 2)
  const end = Math.min(turns.length, idx + 3)
  const out = []
  for (let j = start; j < end; j++) {
    const t = prompts.get(turns[j]) || {}
    out.push({
      text: t.text || '',
      ts: t.ts || null,
      calls: t.apiCalls || 0,
      here: j === idx, // marks the prompt the drill-down is anchored on
    })
  }
  return out
}
// Increment the named skill's invocation counter on a stats bucket,
// creating the entry on first use.
function bumpSkill(s, name) {
  const prev = s.skillInvocations[name]
  s.skillInvocations[name] = prev ? prev + 1 : 1
}
// Memoized path -> creation time (ms). Avoids re-stat()ing files that the
// sort comparator touches O(n log n) times.
const _btCache = new Map()
// File creation time in ms since epoch; 0 when stat fails (e.g. the file
// disappeared between walk and sort). Results are cached in _btCache.
function birthtime(p) {
  if (_btCache.has(p)) return _btCache.get(p)
  let t
  try {
    t = fs.statSync(p).birthtimeMs
  } catch {
    t = 0
  }
  _btCache.set(p, t)
  return t
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
// Orchestrates the whole scan: walks ROOT, classifies and orders the
// transcript files, processes each into per-project / per-subagent /
// per-skill stat buckets, then prints either JSON or the text report.
async function main() {
  const overall = newStats()
  const perProject = new Map() // project -> stats
  const perSubagent = new Map() // agentType -> stats
  const perSkill = new Map() // skill -> stats (token-attributed)
  // Classify, then sort main files before subagent files. Fork-style subagents
  // replay parent entries with identical uuids; processing parents first ensures
  // the global uuid-dedupe attributes those entries to the parent, not the fork.
  // Among subagents, sort by birthtime so a parent subagent is processed before
  // any nested children it spawned (needed for prompt-attribution propagation).
  const files = [...walk(ROOT)]
    .map(p => ({ p, info: classifyFile(p) }))
    .sort((a, b) => {
      const ka = a.info.kind === 'main' ? 0 : 1
      const kb = b.info.kind === 'main' ? 0 : 1
      if (ka !== kb) return ka - kb
      // Both subagents: oldest file first.
      if (ka === 1) return birthtime(a.p) - birthtime(b.p)
      return 0
    })
  let n = 0
  for (const { p, info } of files) {
    if (!perProject.has(info.project)) perProject.set(info.project, newStats())
    const project = perProject.get(info.project)
    let subagent = null
    if (info.kind === 'subagent') {
      // Resolve agent type: meta.json/filename hint > parent-transcript map > 'fork'
      const at =
        info.agentTypeHint ||
        (info.agentId && agentIdToType.get(info.agentId)) ||
        'fork'
      info.agentType = at
      if (!perSubagent.has(at)) perSubagent.set(at, newStats())
      subagent = perSubagent.get(at)
    }
    await processFile(p, info, {
      overall,
      project,
      subagent,
      skillStats: perSkill,
    })
    n++
    // Progress indicator goes to stderr so stdout stays clean for piping.
    if (!AS_JSON && n % 200 === 0) {
      process.stderr.write(`\r scanned ${n}/${files.length} files…`)
    }
  }
  if (!AS_JSON)
    process.stderr.write(`\r scanned ${n}/${files.length} files.\n`)
  // Drop empty buckets (created for files that had no in-range entries under --since)
  for (const m of [perProject, perSubagent, perSkill]) {
    for (const [k, v] of m) {
      if (v.apiCalls === 0 && v.sessions.size === 0) m.delete(k)
    }
  }
  if (AS_JSON) {
    printJson({ overall, perProject, perSubagent, perSkill })
  } else {
    printText({ overall, perProject, perSubagent, perSkill })
  }
}
// ---------------------------------------------------------------------------
// Output
// ---------------------------------------------------------------------------
// Human-readable count formatting: 1234 -> "1.2k", 2.5e6 -> "2.50M",
// 3.2e9 -> "3.20B"; values under 1000 pass through unchanged.
function fmt(n) {
  const scales = [
    [1e9, 2, 'B'],
    [1e6, 2, 'M'],
    [1e3, 1, 'k'],
  ]
  for (const [div, digits, suffix] of scales) {
    if (n >= div) return (n / div).toFixed(digits) + suffix
  }
  return String(n)
}
// Percentage string "12.3%"; em dash when the denominator is not positive.
function pct(a, b) {
  if (!(b > 0)) return '—'
  return `${((100 * a) / b).toFixed(1)}%`
}
// Milliseconds -> decimal hours with one fractional digit, as a string.
function hrs(ms) {
  const HOUR_MS = 3600000
  return (ms / HOUR_MS).toFixed(1)
}
// Flatten a mutable stats bucket into the plain-JSON shape shared by the
// text report and the --json output.
function summarize(s) {
  const inputTotal = s.inputUncached + s.inputCacheCreate + s.inputCacheRead
  // Decimal hours with one fractional digit, as a number.
  const toHours = ms => +(ms / 3600000).toFixed(1)
  const cachedPct =
    inputTotal > 0 ? +((100 * s.inputCacheRead) / inputTotal).toFixed(1) : 0
  const avgSubTokens =
    s.subagentCalls > 0 ? Math.round(s.subagentTokens / s.subagentCalls) : 0
  return {
    sessions: s.sessions.size,
    api_calls: s.apiCalls,
    input_tokens: {
      uncached: s.inputUncached,
      cache_create: s.inputCacheCreate,
      cache_read: s.inputCacheRead,
      total: inputTotal,
      pct_cached: cachedPct,
    },
    output_tokens: s.outputTokens,
    human_messages: s.humanMessages,
    hours: { wall_clock: toHours(s.wallClockMs), active: toHours(s.activeMs) },
    cache_breaks_over_100k: s.cacheBreaks.length,
    subagent: {
      calls: s.subagentCalls,
      total_tokens: s.subagentTokens,
      avg_tokens_per_call: avgSubTokens,
    },
    skill_invocations: s.skillInvocations,
    // Time span is omitted entirely for buckets that saw no entries.
    span: s.firstTs
      ? {
          from: new Date(s.firstTs).toISOString(),
          to: new Date(s.lastTs).toISOString(),
        }
      : null,
  }
}
// Emit the full machine-readable report as pretty-printed JSON on stdout.
function printJson({ overall, perProject, perSubagent, perSkill }) {
  // Map<name, stats> -> plain object of summarized buckets.
  const mapToObj = m =>
    Object.fromEntries([...m].map(([k, v]) => [k, summarize(v)]))
  // NOTE: sort() mutates overall.cacheBreaks in place; the text output
  // sorts the same way, so ordering stays consistent between paths.
  const breaks = overall.cacheBreaks
    .sort((a, b) => b.uncached - a.uncached)
    .slice(0, 100)
    .map(({ prompt, ...rest }) => ({
      ...rest,
      // Drop the internal prompt key; expose the ±2-message context instead.
      context: prompt ? buildContext(prompt) : null,
    }))
  const report = {
    root: ROOT,
    generated_at: new Date().toISOString(),
    overall: summarize(overall),
    cache_breaks: breaks,
    by_project: mapToObj(perProject),
    by_subagent_type: mapToObj(perSubagent),
    by_skill: mapToObj(perSkill),
    top_prompts: topPrompts(100),
  }
  process.stdout.write(JSON.stringify(report, null, 2) + '\n')
}
// Grand token total for one prompt record: all input classes plus output.
function promptTotal(r) {
  const { inputUncached, inputCacheCreate, inputCacheRead, outputTokens } = r
  return inputUncached + inputCacheCreate + inputCacheRead + outputTokens
}
// Top-n prompts by total attributed tokens, shaped for JSON/HTML output.
// Prompts that never produced an assistant API call are skipped.
function topPrompts(n) {
  const ranked = [...prompts.entries()]
    .filter(([, rec]) => rec.apiCalls > 0)
    .sort((a, b) => promptTotal(b[1]) - promptTotal(a[1]))
  return ranked.slice(0, n).map(([pk, rec]) => ({
    ts: rec.ts,
    project: rec.project,
    session: rec.sessionId,
    text: rec.text,
    api_calls: rec.apiCalls,
    subagent_calls: rec.subagentCalls,
    total_tokens: promptTotal(rec),
    input: {
      uncached: rec.inputUncached,
      cache_create: rec.inputCacheCreate,
      cache_read: rec.inputCacheRead,
    },
    output: rec.outputTokens,
    context: buildContext(pk),
  }))
}
// Render the human-readable terminal report to stdout: overall block,
// cache breaks, most expensive prompts, then per-project / per-subagent /
// per-skill sections. TOP_N and SINCE are module-level config globals.
function printText({ overall, perProject, perSubagent, perSkill }) {
  const line = (...a) => console.log(...a)
  const hr = () => line('─'.repeat(78))
  line()
  line(`Claude Code session analysis — ${ROOT}`)
  if (SINCE) line(`(since ${SINCE.toISOString()})`)
  hr()
  printBlock('OVERALL', overall)
  hr()
  line(
    `CACHE BREAKS (>${fmt(CACHE_BREAK_THRESHOLD)} uncached input on a single call)`,
  )
  // NOTE: sort() mutates overall.cacheBreaks in place; printJson sorts the
  // same way, so ordering is consistent between the two output paths.
  const breaks = overall.cacheBreaks
    .sort((a, b) => b.uncached - a.uncached)
    .slice(0, TOP_N)
  if (breaks.length === 0) line(' none')
  for (const b of breaks) {
    line(
      ` ${fmt(b.uncached).padStart(8)} uncached / ${fmt(b.total).padStart(8)} total ` +
        `${(b.ts || '').slice(0, 19)} ${b.project}` +
        (b.kind === 'subagent' ? ` [${b.agentType}]` : ''),
    )
  }
  if (overall.cacheBreaks.length > TOP_N)
    line(`${overall.cacheBreaks.length - TOP_N} more`)
  hr()
  line(
    'MOST EXPENSIVE PROMPTS (total tokens incl. subagents spawned during the turn)',
  )
  const top = topPrompts(TOP_N)
  if (top.length === 0) line(' none')
  for (const r of top) {
    const inTot = r.input.uncached + r.input.cache_create + r.input.cache_read
    line(
      ` ${fmt(r.total_tokens).padStart(8)} ` +
        `(in ${fmt(inTot)} ${pct(r.input.cache_read, inTot)} cached, out ${fmt(r.output)}) ` +
        `${r.api_calls} calls` +
        (r.subagent_calls ? `, ${r.subagent_calls} subagents` : '') +
        ` ${(r.ts || '').slice(0, 16)} ${r.project}`,
    )
    line(` "${r.text}"`)
  }
  line(
    ' (note: internal background forks like task_summary/compact are not attributed to a prompt)',
  )
  hr()
  line('BY PROJECT (top by total input tokens)')
  const projects = [...perProject.entries()].sort(
    (a, b) => totalIn(b[1]) - totalIn(a[1]),
  )
  for (const [name, s] of projects.slice(0, TOP_N)) {
    printBlock(name, s, ' ')
    line()
  }
  if (projects.length > TOP_N)
    line(`${projects.length - TOP_N} more projects`)
  hr()
  // Subagent section is not truncated — the type list is naturally small.
  line('BY SUBAGENT TYPE')
  const agents = [...perSubagent.entries()].sort(
    (a, b) => totalIn(b[1]) - totalIn(a[1]),
  )
  for (const [name, s] of agents) {
    printBlock(name, s, ' ')
    line()
  }
  hr()
  line(
    'BY SKILL / SLASH COMMAND (tokens attributed = from invocation until next human msg)',
  )
  const skills = [...perSkill.entries()].sort(
    (a, b) => totalIn(b[1]) - totalIn(a[1]),
  )
  for (const [name, s] of skills.slice(0, TOP_N)) {
    printBlock(name, s, ' ')
    line()
  }
  if (skills.length > TOP_N) line(`${skills.length - TOP_N} more`)
  line()
}
// Total input tokens for a bucket: uncached + cache-create + cache-read.
function totalIn(s) {
  return [s.inputUncached, s.inputCacheCreate, s.inputCacheRead].reduce(
    (sum, v) => sum + v,
    0,
  )
}
// Print one indented stat block; reused for OVERALL, per-project,
// per-subagent and per-skill sections of the text report.
function printBlock(title, s, indent = '') {
  const inTotal = totalIn(s)
  console.log(`${indent}${title}`)
  console.log(
    `${indent} sessions: ${s.sessions.size} api calls: ${s.apiCalls} human msgs: ${s.humanMessages}`,
  )
  console.log(
    `${indent} input: ${fmt(inTotal)} total ` +
      `(uncached ${fmt(s.inputUncached)}, cache-create ${fmt(s.inputCacheCreate)}, cache-read ${fmt(s.inputCacheRead)} = ${pct(s.inputCacheRead, inTotal)} cached)`,
  )
  console.log(`${indent} output: ${fmt(s.outputTokens)}`)
  console.log(
    `${indent} hours: ${hrs(s.wallClockMs)} wall-clock, ${hrs(s.activeMs)} active (gaps >5m excluded)`,
  )
  console.log(
    `${indent} cache breaks >${fmt(CACHE_BREAK_THRESHOLD)}: ${s.cacheBreaks.length}`,
  )
  console.log(
    `${indent} subagents: ${s.subagentCalls} calls, ${fmt(s.subagentTokens)} tokens, avg ${fmt(
      s.subagentCalls ? Math.round(s.subagentTokens / s.subagentCalls) : 0,
    )}/call`,
  )
  // Top five skills/slash-commands by invocation count, e.g. "commit×3".
  const topSkills = Object.entries(s.skillInvocations)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5)
  if (topSkills.length)
    console.log(
      `${indent} skills: ${topSkills.map(([k, v]) => `${k}×${v}`).join(', ')}`,
    )
}
// Entry point: surface any unhandled failure and exit non-zero so shell
// callers can detect a broken run.
main().catch(err => {
  console.error(err)
  process.exit(1)
})

View File

@@ -0,0 +1,464 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>claude usage</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;700&display=swap" rel="stylesheet">
<style>
:root {
/* anthropic / claude-code palette */
--ivory: #FAF9F5;
--term-bg: #1a1918;
--term-fg: #d1cfc5;
--titlebar: #252321;
--outline: rgba(255,255,255,0.08);
--hover: rgba(255,255,255,0.035);
--clay: #D97757;
--dim: rgb(136,136,136);
--subtle: rgb(80,80,80);
--green: rgb(78,186,101);
--red: rgb(255,107,128);
--blue: rgb(177,185,249);
--yellow: rgb(255,193,7);
--mono: 'JetBrains Mono', 'SF Mono', ui-monospace, Menlo, Monaco, monospace;
}
* { box-sizing: border-box; }
html { background: var(--ivory); }
body {
margin: 0; padding: 48px 24px 80px;
font: 13px/1.55 var(--mono);
font-variant-numeric: tabular-nums;
color: var(--term-fg);
}
/* ——— terminal window chrome ——— */
.term {
max-width: 1180px; margin: 0 auto;
background: var(--term-bg);
border-radius: 8px;
outline: 1px solid var(--outline);
box-shadow: 0 20px 60px rgba(20,20,19,0.22);
}
.titlebar {
background: var(--titlebar);
border-radius: 8px 8px 0 0;
border-bottom: 1px solid var(--outline);
padding: 11px 14px;
display: flex; align-items: center; gap: 7px;
}
.titlebar .dot { width: 11px; height: 11px; border-radius: 50%; background: #3a3836; }
.titlebar .path { margin-left: 14px; color: var(--dim); font-size: 11px; }
.term-body { padding: 22px 30px 30px; }
/* ——— command + hero ——— */
.cmd { color: var(--dim); margin-bottom: 6px; }
.cmd .prompt { color: var(--clay); }
.cmd .flag { color: var(--blue); }
#meta-line { color: var(--subtle); font-size: 11px; }
#hero { margin: 14px 0 6px; }
#hero-total { font-size: 56px; font-weight: 700; line-height: 1; }
#hero-total .unit { color: var(--clay); }
#hero-total .label { font-size: 18px; font-weight: 400; color: var(--dim); margin-left: 8px; }
#hero-split { color: var(--dim); margin-top: 8px; }
#hero-split b { color: var(--term-fg); font-weight: 500; }
#hero-split .ok { color: var(--green); }
#hero-split .warn { color: var(--yellow); }
/* ——— sections ——— */
section { margin-top: 26px; }
.hr { color: var(--subtle); overflow: hidden; white-space: nowrap;
user-select: none; margin-bottom: 8px; }
.hr::after { content: '────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────'; }
h2 { all: unset; display: block; color: var(--clay); font-weight: 500; }
h2::before { content: '▸ '; }
h2 .hint { color: var(--subtle); font-size: 11px; font-weight: 400; margin-left: 10px; }
.section-body { margin-top: 10px; }
/* ——— overall stat grid ——— */
#overall-grid { display: grid; gap: 4px 28px;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); }
.stat { padding: 4px 0; }
.stat .label { font-size: 11px; color: var(--dim); }
.stat .val { font-size: 20px; font-weight: 500; }
.stat .detail { font-size: 11px; color: var(--subtle); }
/* ——— takeaways ——— */
.take { display: grid; grid-template-columns: 9ch 1fr; gap: 18px;
padding: 6px 0; align-items: baseline; }
.take .fig { text-align: right; font-weight: 700; font-size: 15px; }
.take .txt { color: var(--dim); }
.take .txt b { color: var(--term-fg); font-weight: 500; }
.take.bad .fig { color: var(--red); }
.take.good .fig { color: var(--green); }
.take.info .fig { color: var(--blue); }
/* ——— callouts (recommendations) ——— */
.callout { padding: 6px 0 6px 14px; border-left: 2px solid var(--subtle);
color: var(--dim); margin: 6px 0; }
.callout b, .callout code { color: var(--term-fg); }
/* ——— block-char bars ——— */
.bar { display: grid; grid-template-columns: 26ch 1fr 8ch; gap: 14px;
padding: 2px 0; align-items: center; }
.bar .name { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.bar .blocks { color: var(--clay); white-space: pre; overflow: hidden; }
.bar .blocks .empty { color: var(--subtle); }
.bar .pct { text-align: right; color: var(--dim); }
.bar:hover .name { color: var(--clay); }
/* ——— drill-down lists (top prompts, cache breaks) ——— */
.drill details { border-top: 1px solid var(--outline); }
.drill details:last-of-type { border-bottom: 1px solid var(--outline); }
.drill summary { list-style: none; cursor: pointer;
display: grid; grid-template-columns: 8ch 1fr; gap: 16px;
padding: 9px 4px; align-items: baseline; }
.drill summary::-webkit-details-marker { display: none; }
.drill summary:hover { background: var(--hover); }
.drill .amt { font-weight: 700; text-align: right; color: var(--clay); }
.drill .desc { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.drill .meta { grid-column: 2; font-size: 11px; color: var(--subtle); }
.drill details[open] summary .desc { white-space: normal; }
.drill .body { padding: 4px 4px 14px calc(8ch + 20px); font-size: 12px; color: var(--dim); }
/* ——— transcript context (±2 user msgs) — light inset for legibility ——— */
.ctx { margin: 8px 0 12px; padding: 10px 14px;
background: #F0EEE6; color: #1a1918;
border-radius: 6px; font-size: 12px; }
.ctx-msg { padding: 4px 0; white-space: pre-wrap; }
.ctx-msg .who { color: #87867F; font-size: 11px; }
.ctx-msg .ts { color: #87867F; font-size: 10px; margin-left: 8px; }
.ctx-msg.here { margin: 2px -14px; padding: 6px 11px 6px 14px;
border-left: 3px solid var(--clay);
background: rgba(217,119,87,0.10); }
.ctx-msg.here .who { color: var(--clay); font-weight: 500; }
.ctx-gap { color: #87867F; font-size: 11px; padding: 3px 0 3px 7ch; }
.ctx-gap::before { content: '⟨ '; }
.ctx-gap::after { content: ' ⟩'; }
.ctx-break { margin: 2px -14px; padding: 8px 11px 8px 14px;
border-left: 3px solid #BD5E6D;
background: rgba(189,94,109,0.12); color: #A63244; }
.ctx-break b { color: #1a1918; margin-right: 6px; }
.more-btn { display: block; width: 100%; margin-top: 10px; padding: 9px;
background: none; border: 1px dashed var(--subtle); cursor: pointer;
font: 500 11px/1 var(--mono); letter-spacing: 0.06em;
color: var(--dim); }
.more-btn:hover { border-color: var(--dim); color: var(--term-fg); }
/* ——— tables ——— */
.scroll { max-height: 440px; overflow: auto;
border-top: 1px solid var(--outline); border-bottom: 1px solid var(--outline); }
table { width: 100%; border-collapse: collapse; font-size: 12px; }
th, td { text-align: left; padding: 6px 10px; }
td { border-top: 1px solid rgba(255,255,255,0.04); }
th { position: sticky; top: 0; background: var(--term-bg); z-index: 1;
font-weight: 500; font-size: 11px; color: var(--subtle);
cursor: pointer; user-select: none;
border-bottom: 1px solid var(--outline); }
th:hover { color: var(--dim); }
th.sorted { color: var(--clay); }
th.sorted::after { content: ' ↓'; }
th.sorted.asc::after { content: ' ↑'; }
td.num, th.num { text-align: right; }
tbody tr:hover td { background: var(--hover); }
footer { margin-top: 28px; color: var(--subtle); font-size: 11px;
display: flex; justify-content: space-between; gap: 16px; flex-wrap: wrap; }
code { color: var(--blue); }
a { color: var(--clay); }
::selection { background: var(--clay); color: var(--term-bg); }
@media (max-width: 760px) {
body { padding: 20px 12px 48px; }
.term-body { padding: 16px 16px 24px; }
#hero-total { font-size: 40px; }
.bar { grid-template-columns: 14ch 1fr 7ch; gap: 8px; }
.drill summary { grid-template-columns: 6ch 1fr; }
.drill .body { padding-left: 12px; }
.take { grid-template-columns: 7ch 1fr; gap: 12px; }
}
</style>
</head>
<body>
<div class="term">
<div class="titlebar">
<span class="dot"></span><span class="dot"></span><span class="dot"></span>
<span class="path" id="title-path">~/.claude — session-report</span>
</div>
<div class="term-body">
<div class="cmd"><span class="prompt">&gt;</span> claude usage <span id="cmd-flags"></span></div>
<div id="meta-line">loading…</div>
<div id="hero">
<div id="hero-total"></div>
<div id="hero-split"></div>
</div>
<!-- ====================================================================
FINDINGS — agent fills this. 3–5 one-line takeaways. Use .bad for
waste/anomalies, .good for healthy signals, .info for neutral.
==================================================================== -->
<section>
<div class="hr"></div>
<h2>findings</h2>
<div class="section-body" id="takeaways">
<!-- AGENT: anomalies -->
<div class="take"><div class="fig"></div><div class="txt">No findings generated yet.</div></div>
<!-- /AGENT -->
</div>
</section>
<!-- ====================================================================
Everything below renders automatically from #report-data.
==================================================================== -->
<section>
<div class="hr"></div>
<h2>summary</h2>
<div class="section-body" id="overall-grid"></div>
</section>
<section>
<div class="hr"></div>
<h2>tokens by project<span class="hint">share of total</span></h2>
<div class="section-body" id="project-bars"></div>
</section>
<section>
<div class="hr"></div>
<h2>most expensive prompts<span class="hint">click to expand context</span></h2>
<div class="section-body drill" id="top-prompts"></div>
</section>
<section>
<div class="hr"></div>
<h2>cache breaks<span class="hint">&gt;100k uncached · click for context</span></h2>
<div class="section-body drill" id="cache-breaks"></div>
</section>
<section>
<div class="hr"></div>
<h2>projects</h2>
<div class="section-body scroll"><table id="tbl-projects"></table></div>
</section>
<section>
<div class="hr"></div>
<h2>subagent types</h2>
<div class="section-body scroll"><table id="tbl-subagents"></table></div>
</section>
<section>
<div class="hr"></div>
<h2>skills &amp; slash commands</h2>
<div class="section-body scroll"><table id="tbl-skills"></table></div>
</section>
<section>
<div class="hr"></div>
<h2>recommendations</h2>
<div class="section-body">
<!-- AGENT: optimizations -->
<div class="callout">No suggestions generated yet.</div>
<!-- /AGENT -->
</div>
</section>
<footer>
<span id="foot-gen"></span>
<span id="foot-stats"></span>
</footer>
</div>
</div>
<!-- ========================================================================
DATA — agent replaces the {} below with the full --json output.
======================================================================== -->
<script id="report-data" type="application/json">{}</script>
<script>
(function() {
// Embedded report payload: the agent replaces the {} in #report-data with
// the analyzer's --json output. Falls back to an empty object.
const DATA = JSON.parse(document.getElementById('report-data').textContent || '{}');
const $ = id => document.getElementById(id);
// Human-readable number formatting (mirrors fmt() in the analyzer script).
const fmt = n => n>=1e9 ? (n/1e9).toFixed(2)+'B' : n>=1e6 ? (n/1e6).toFixed(2)+'M'
: n>=1e3 ? (n/1e3).toFixed(1)+'k' : String(n);
const pct = (a,b) => b>0 ? ((100*a/b).toFixed(1)+'%') : '—';
// Minimal HTML escaping for text interpolated into innerHTML.
const esc = s => String(s).replace(/[&<>"]/g, c =>
({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;'}[c]));
// Shorten mangled project dir names like "-Users-jane-code-foo" to "foo".
const short = p => String(p||'').replace(/^-Users-[^-]+-/,'').replace(/^code-/,'');
const MON = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'];
// Compact timestamp like "Apr 9 · 14:05"; empty string if unparseable.
const niceDate = iso => { const d = new Date(iso); return isNaN(d) ? '' :
`${MON[d.getMonth()]} ${d.getDate()} · ${String(d.getHours()).padStart(2,'0')}:${String(d.getMinutes()).padStart(2,'0')}`; };
// Bail out (leaving the placeholder message) when no data is embedded.
if (!DATA.overall) { $('meta-line').textContent = 'no data — embed JSON in #report-data.'; return; }
// header + hero
const o = DATA.overall, it = o.input_tokens;
// GRAND is the denominator for every "% of total" figure in the report.
const GRAND = it.total + o.output_tokens;
const share = n => GRAND>0 ? (100*n/GRAND).toFixed(1)+'%' : '—';
const span = o.span;
$('cmd-flags').innerHTML = DATA.since
? `<span class="flag">--since</span> ${esc(DATA.since)}` : '';
// NOTE(review): the from/to dates below are concatenated with no
// separator — looks like a dropped delimiter; confirm intended rendering.
$('meta-line').textContent =
(span ? `${span.from.slice(0,10)}${span.to.slice(0,10)}` : '') +
` · ${DATA.root || ''}`;
$('foot-gen').textContent = `generated ${DATA.generated_at?.slice(0,16).replace('T',' ') || ''}`;
$('foot-stats').textContent =
`${o.sessions} sessions · ${o.api_calls} api calls · ${o.human_messages} prompts`;
// Split "12.34M" into numeric part + unit so the unit can be tinted.
const num = fmt(GRAND), m = num.match(/^([\d.]+)([A-Za-z]*)$/);
$('hero-total').innerHTML =
`${m?m[1]:num}<span class="unit">${m?m[2]:''}</span><span class="label">tokens</span>`;
// Cache-read share >= 85% renders green ("ok"), otherwise yellow ("warn").
const cacheCls = it.pct_cached>=85 ? 'ok' : 'warn';
$('hero-split').innerHTML =
`<b>${fmt(it.total)}</b> input <span class="${cacheCls}">(${it.pct_cached}% cache-read)</span> · `+
`<b>${fmt(o.output_tokens)}</b> output · all figures below are % of this total`;
// overall stat grid
// [label, value, optional detail line] triples rendered as .stat cells;
// large numeric values (>=10k) are abbreviated with fmt().
$('overall-grid').innerHTML = [
['sessions', o.sessions],
['api calls', o.api_calls],
['human msgs', o.human_messages],
['active hours', o.hours.active, `${o.hours.wall_clock} wall-clock`],
['cache breaks', o.cache_breaks_over_100k, '>100k uncached'],
['subagent calls', o.subagent.calls, `avg ${fmt(o.subagent.avg_tokens_per_call)}`],
].map(([l,v,d]) =>
`<div class="stat"><div class="label">${l}</div>`+
`<div class="val">${typeof v==='number'&&v>=1e4?fmt(v):v}</div>`+
(d?`<div class="detail">${d}</div>`:'')+`</div>`).join('');
// block-char project bars
(function() {
// W = bar width in block characters; bars scale to the largest project.
// Top 15 projects by combined input+output tokens.
const W = 48;
const rows = Object.entries(DATA.by_project||{})
.map(([k,v]) => [k, v.input_tokens.total + v.output_tokens])
.sort((a,b)=>b[1]-a[1]).slice(0,15);
const max = rows[0]?.[1] || 1;
$('project-bars').innerHTML = rows.map(([k,v]) => {
// Every project gets at least one filled block so it stays visible.
// NOTE(review): the title tooltip concatenates name and token count with
// no separator — possibly a dropped delimiter; confirm intended format.
const n = Math.max(1, Math.round(W*v/max));
return `<div class="bar"><span class="name" title="${esc(k)}${fmt(v)}">${esc(short(k))}</span>`+
`<span class="blocks">${'█'.repeat(n)}<span class="empty">${'░'.repeat(W-n)}</span></span>`+
`<span class="pct">${share(v)}</span></div>`;
}).join('');
})();
// ±2 user-message transcript context
// Renders the context array (from the analyzer's buildContext) as an HTML
// fragment. `mark` is optional HTML injected right after the highlighted
// message — e.g. the cache-break callout.
function renderContext(ctx, mark) {
if (!ctx || !ctx.length) return '<div class="ctx-gap">no transcript context available</div>';
const parts = ['<div class="ctx">'];
ctx.forEach((m, i) => {
const who = m.here ? '&gt; user' : ' user';
parts.push(
`<div class="ctx-msg${m.here?' here':''}">`+
`<span class="who">${who}</span> ${esc(m.text||'(non-text)')}`+
`<span class="ts">${niceDate(m.ts)}</span></div>`
);
if (m.here && mark) parts.push(mark);
// Show the api-call gap after every message except the trailing one,
// but always after the highlighted message.
if (i < ctx.length-1 || m.here)
parts.push(`<div class="ctx-gap">${m.calls} api call${m.calls===1?'':'s'}</div>`);
});
parts.push('</div>');
return parts.join('');
}
// top prompts — share of grand total
(function() {
// Render up to 100 prompts; the first SHOW are visible, the rest sit in a
// hidden container behind a "show N more" toggle button.
const ps = (DATA.top_prompts||[]).slice(0,100);
const SHOW = 5;
// One expandable <details> row per prompt: summary = % of grand total,
// preview text and metadata; body = transcript context + token breakdown.
const row = p => {
const inTot = p.input.uncached+p.input.cache_create+p.input.cache_read;
return `<details><summary>`+
`<span class="amt">${share(p.total_tokens)}</span>`+
`<span class="desc">${esc(p.text)}</span>`+
`<span class="meta">${niceDate(p.ts)} · ${esc(short(p.project))} · ${p.api_calls} calls`+
(p.subagent_calls?` · ${p.subagent_calls} subagents`:'')+
` · ${pct(p.input.cache_read,inTot)} cached</span>`+
`</summary><div class="body">`+
renderContext(p.context)+
`<div>session <code>${esc(p.session)}</code></div>`+
`<div>in: uncached ${fmt(p.input.uncached)} · cache-create ${fmt(p.input.cache_create)} · `+
`cache-read ${fmt(p.input.cache_read)} · out ${fmt(p.output)}</div>`+
`</div></details>`;
};
const head = ps.slice(0,SHOW).map(row).join('');
const rest = ps.slice(SHOW).map(row).join('');
$('top-prompts').innerHTML = ps.length
? head + (rest
? `<div id="tp-rest" hidden>${rest}</div>`+
`<button id="tp-more" class="more-btn">show ${ps.length-SHOW} more</button>`
: '')
: '<div class="callout">No prompts in range.</div>';
// Toggle visibility of the overflow rows; button label flips accordingly.
const btn = $('tp-more');
if (btn) btn.onclick = () => {
const r = $('tp-rest'); r.hidden = !r.hidden;
btn.textContent = r.hidden ? `show ${ps.length-SHOW} more` : 'show less';
};
})();
// cache breaks
(function() {
// One expandable row per cache break (already sorted and capped at 100 by
// the analyzer); body shows transcript context with the break marked inline.
const bs = (DATA.cache_breaks||[]).slice(0,100);
$('cache-breaks').innerHTML = bs.map(b =>
`<details><summary>`+
`<span class="amt">${fmt(b.uncached)}</span>`+
`<span class="desc">${esc(short(b.project))} · `+
`${b.kind==='subagent'?esc(b.agentType||'subagent'):'main'}</span>`+
`<span class="meta">${niceDate(b.ts)} · ${pct(b.uncached,b.total)} of ${fmt(b.total)} uncached</span>`+
`</summary><div class="body">`+
renderContext(b.context,
`<div class="ctx-break"><b>${fmt(b.uncached)}</b> uncached `+
`(${pct(b.uncached,b.total)} of ${fmt(b.total)}) — cache break here</div>`)+
`<div>session <code>${esc(b.session)}</code></div>`+
`</div></details>`
).join('') || '<div class="callout">No cache breaks over threshold.</div>';
})();
// sortable table
// Generic client-side sortable table. `cols` entries: { h: header label,
// num: right-align numeric column, sort: initially-sorted column,
// fmt: optional cell formatter }. Clicking a header sorts descending by
// that column; clicking it again flips direction.
function table(el, cols, rows) {
let sortIdx = cols.findIndex(c=>c.sort), asc = false;
function render() {
// slice() before sort so the caller's rows array is left untouched.
const sorted = rows.slice().sort((a,b)=>{
const va=a[sortIdx], vb=b[sortIdx];
return (asc?1:-1)*(typeof va==='number' ? va-vb : String(va).localeCompare(String(vb)));
});
el.innerHTML = `<thead><tr>${cols.map((c,i)=>
`<th class="${c.num?'num':''} ${i===sortIdx?'sorted'+(asc?' asc':''):''}" data-i="${i}">${c.h}</th>`
).join('')}</tr></thead><tbody>${sorted.map(r=>
`<tr>${r.map((v,i)=>`<td class="${cols[i].num?'num':''}">${
cols[i].fmt?cols[i].fmt(v):esc(v)}</td>`).join('')}</tr>`
).join('')}</tbody>`;
// Headers are recreated on every render, so handlers are re-attached here.
el.querySelectorAll('th').forEach(th=>th.onclick=()=>{
const i=+th.dataset.i; if(i===sortIdx)asc=!asc; else{sortIdx=i;asc=false;} render();
});
}
render();
}
// Flatten summarize()-shaped buckets into row arrays; the value order
// here must match the statCols spec below, column for column.
function statRows(obj) {
return Object.entries(obj||{}).map(([k,v])=>[
short(k), GRAND>0 ? 100*(v.input_tokens.total+v.output_tokens)/GRAND : 0,
v.sessions, v.api_calls, v.human_messages,
v.input_tokens.total, v.input_tokens.pct_cached, v.output_tokens,
v.hours.active, v.cache_breaks_over_100k,
v.subagent.calls, v.subagent.avg_tokens_per_call,
]);
}
// Shared column spec for the three stat tables; initial sort is "% total".
const statCols = [
{h:'name'},{h:'% total',num:1,sort:1,fmt:v=>v.toFixed(1)+'%'},
{h:'sess',num:1},{h:'calls',num:1},{h:'msgs',num:1},
{h:'input',num:1,fmt:fmt},{h:'%cached',num:1,fmt:v=>v+'%'},
{h:'output',num:1,fmt:fmt},{h:'active h',num:1},{h:'breaks',num:1},
{h:'subagents',num:1},{h:'avg sub tok',num:1,fmt:fmt},
];
table($('tbl-projects'), statCols, statRows(DATA.by_project));
table($('tbl-subagents'), statCols, statRows(DATA.by_subagent_type));
table($('tbl-skills'), statCols, statRows(DATA.by_skill));
})();
</script>
</body>
</html>