{
  "schema_version": "2026-05-27.agent-task-harness.v1",
  "name": "TokRepo Mini Agent Task Harness Eval",
  "canonical_url": "https://tokrepo.com/evals/agent-task-harness.json",
  "updated_at": "2026-05-27",
  "generated_by": "node scripts/run_agent_task_harness.mjs --json",
  "purpose": "Public, deterministic mini harness evidence for TokRepo task-economy work: each case has Task, Environment, Tools, Trace, and Grader output so completion is not judged from the final answer alone.",
  "source_lesson": "Borrowed from the mini Agent Harness pattern: record what task ran, what environment existed, which tools were called, what each tool returned, and why the grader passed or failed.",
  "modules": [
    "task",
    "environment",
    "tools",
    "trace",
    "grader"
  ],
  "metrics": {
    "cases_total": 3,
    "cases_passed": 3,
    "pass_rate_pct": 100,
    "trace_steps_total": 8,
    "average_trace_steps": 2.7,
    "harness_module_coverage_pct": 100,
    "tool_interfaces_covered": 8,
    "unsafe_publication_attempts": 0
  },
  "tool_interfaces": [
    "list_changed_files",
    "list_files",
    "read_file",
    "tokrepo_handoff_plan",
    "tokrepo_harvest",
    "tokrepo_install_plan",
    "tokrepo_resolve_capability",
    "tokrepo_verify"
  ],
  "cases": [
    {
      "case_id": "capability_gap_plugin_support",
      "task": "Decide whether the repository README proves plugin-system support.",
      "modules": {
        "task": true,
        "environment": true,
        "tools": [
          "list_files",
          "read_file"
        ],
        "trace": true,
        "grader": true
      },
      "environment_type": "repository_fixture",
      "answer": "The README does not mention a plugin system, so plugin-system support cannot be confirmed.",
      "trace": [
        {
          "step": 1,
          "tool": "list_files",
          "arguments": {
            "path": "."
          },
          "result": [
            "README.md",
            "config.md"
          ]
        },
        {
          "step": 2,
          "tool": "read_file",
          "arguments": {
            "path": "README.md"
          },
          "result": "This project supports local startup, basic login, and configuration management."
        }
      ],
      "grade": {
        "success": true,
        "checks": [
          {
            "id": "must_read:README.md",
            "pass": true
          },
          {
            "id": "answer_should_include:cannot be confirmed",
            "pass": true
          },
          {
            "id": "answer_should_not_include:supports a plugin system",
            "pass": true
          },
          {
            "id": "max_trace_steps",
            "pass": true
          }
        ],
        "reason": "Task, environment, tools, trace, and grader all matched the case oracle."
      }
    },
    {
      "case_id": "reuse_before_rebuild_browser_harness",
      "task": "Before rebuilding a browser automation harness, resolve reusable TokRepo capability and inspect a safe install plan.",
      "modules": {
        "task": true,
        "environment": true,
        "tools": [
          "tokrepo_resolve_capability",
          "tokrepo_verify",
          "tokrepo_install_plan"
        ],
        "trace": true,
        "grader": true
      },
      "environment_type": "tokrepo_catalog_fixture",
      "answer": "Reuse the staged Chrome MCP operations skill after inspecting verification and the dry-run install plan.",
      "trace": [
        {
          "step": 1,
          "tool": "tokrepo_resolve_capability",
          "arguments": {
            "task": "browser automation harness with traceable Chrome MCP operations",
            "target": "codex"
          },
          "result": {
            "selected_candidate": {
              "uuid": "5846fbcf-238f-4730-9553-e36d111a6619",
              "title": "Chrome MCP proxy v3 operations skill",
              "risk_policy": "stage_only",
              "trust_score": 0.68
            },
            "controlled_empty_state": false,
            "next_mcp_calls": [
              "tokrepo_verify",
              "tokrepo_install_plan"
            ]
          }
        },
        {
          "step": 2,
          "tool": "tokrepo_verify",
          "arguments": {
            "uuid": "5846fbcf-238f-4730-9553-e36d111a6619",
            "target": "codex"
          },
          "result": {
            "status": "pass",
            "uuid": "5846fbcf-238f-4730-9553-e36d111a6619",
            "trust_score": 0.68,
            "policy_decision": "stage_only",
            "evidence_bundle": {
              "content_hash": "sha256:fixture-browser-harness",
              "sbom_lite": true
            }
          }
        },
        {
          "step": 3,
          "tool": "tokrepo_install_plan",
          "arguments": {
            "uuid": "5846fbcf-238f-4730-9553-e36d111a6619",
            "target": "codex",
            "dry_run": true
          },
          "result": {
            "status": "pass",
            "uuid": "5846fbcf-238f-4730-9553-e36d111a6619",
            "dry_run": true,
            "policy": "stage_only",
            "command": "tokrepo install 5846fbcf-238f-4730-9553-e36d111a6619 --target codex --dry-run --json",
            "rollback_plan": true
          }
        }
      ],
      "grade": {
        "success": true,
        "checks": [
          {
            "id": "required_tools_in_order",
            "pass": true
          },
          {
            "id": "selected_candidate",
            "pass": true
          },
          {
            "id": "policy:stage_only",
            "pass": true
          },
          {
            "id": "dry_run",
            "pass": true
          },
          {
            "id": "max_trace_steps",
            "pass": true
          }
        ],
        "reason": "Task, environment, tools, trace, and grader all matched the case oracle."
      }
    },
    {
      "case_id": "post_task_private_harvest",
      "task": "After improving a reusable task harness script, package the change as a private draft and produce a handoff plan without publishing.",
      "modules": {
        "task": true,
        "environment": true,
        "tools": [
          "list_changed_files",
          "tokrepo_harvest",
          "tokrepo_handoff_plan"
        ],
        "trace": true,
        "grader": true
      },
      "environment_type": "workspace_diff_fixture",
      "answer": "Create a private reusable draft, keep publishing disabled, and hand off the quality-gated package plan.",
      "trace": [
        {
          "step": 1,
          "tool": "list_changed_files",
          "arguments": {
            "since": "HEAD"
          },
          "result": [
            "scripts/run_agent_task_harness.mjs",
            "skills/codex-skill/tokrepo-product-test/SKILL.md"
          ]
        },
        {
          "step": 2,
          "tool": "tokrepo_harvest",
          "arguments": {
            "changed": true,
            "visibility": "private",
            "publish": false
          },
          "result": {
            "status": "pass",
            "visibility": "private",
            "publish": false,
            "private_package_drafts": [
              {
                "title": "Agent task harness evidence gate",
                "files": [
                  "scripts/run_agent_task_harness.mjs",
                  "skills/codex-skill/tokrepo-product-test/SKILL.md"
                ]
              }
            ],
            "quality_gate": {
              "status": "pass",
              "sensitive_findings": 0
            }
          }
        },
        {
          "step": 3,
          "tool": "tokrepo_handoff_plan",
          "arguments": {
            "include_quality_gate": true
          },
          "result": {
            "status": "pass",
            "quality_gate": "pass",
            "package_manifest": true,
            "ready_to_push": false,
            "requires_human_confirmation": true
          }
        }
      ],
      "grade": {
        "success": true,
        "checks": [
          {
            "id": "required_tools_in_order",
            "pass": true
          },
          {
            "id": "private_visibility",
            "pass": true
          },
          {
            "id": "no_publish",
            "pass": true
          },
          {
            "id": "quality_gate:pass",
            "pass": true
          },
          {
            "id": "sensitive_findings:0",
            "pass": true
          },
          {
            "id": "max_trace_steps",
            "pass": true
          }
        ],
        "reason": "Task, environment, tools, trace, and grader all matched the case oracle."
      }
    }
  ]
}
