# Maestro / BarnOS — 90-Day Build Plan
# Consumable by Claude Code. Tasks are ordered, dependency-aware, and
# scoped tight enough to be one PR each. Every task lists an owner, its
# dependencies, deliverables, and acceptance criteria; the cloud surface it
# touches is visible in the deliverable paths.

# Project identity and the monorepo layout that deliverable paths in the
# phase task lists are written against.
project:
  name: maestro
  codename: barnos
  product_owner: anh_digital_platform
  first_customers: [cargill_research, dairy_consultants]
  repo_layout:
    monorepo_root: /
    # Top-level directories of the monorepo; deliverable paths start with
    # one of these prefixes.
    packages:
      - core/                  # cloud-agnostic Maestro core
      - substrate-azure/       # Azure Substrate impl
      - substrate-aws/         # AWS Substrate impl
      - context-field/         # ranking service
      - policy-kernel/         # decision service (ONNX + rules)
      - api/                   # public API (FastAPI)
      - notebook/              # researcher UI (Astro + React islands)
      - site/                  # Maestro public site (this Astro app)
      - infra/azure/           # bicep / terraform
      - infra/aws/             # terraform / cdk
      - eval/                  # offline policy training, replays
      - dashboards/            # dashboard definitions (deliverables of P1-010, P3-005)
      - docs/                  # architecture, ADRs

# Engineering conventions shared by every package in the monorepo.
conventions:
  # Services are written in Python, UI code in TypeScript.
  language: python_3_12_for_services
  ui_language: typescript
  formatters:
    - ruff
    - black
    - prettier
  testing:
    - pytest
    - vitest
    - playwright
  iac: terraform_primary_with_bicep_for_azure_specific
  observability: opentelemetry_everywhere
  branching: trunk_based_with_short_lived_branches
  # Target size of a single PR, in lines.
  pr_size_target_lines: 400

# ---------------------------------------------------------------------
# PHASE 1 — SPINE (Days 1–30)
# Goal: a continuation can be created, sleep, wake, run a no-op step,
# and complete, on both Azure and AWS, with the same core code.
# ---------------------------------------------------------------------

# Phase 1 task list. Each task carries: id, title, owner (package name),
# depends_on (task ids), deliverables (repo paths), acceptance (checks).
phase_1_spine:
  duration_days: 30
  # Conditions that must all hold before the phase is considered done.
  exit_criteria:
    - "A continuation can be POSTed to the API, runs a no-op step, sleeps for 30s, wakes, completes."
    - "Same test suite passes against both Azure and AWS substrates."
    - "Event log is queryable; warm store ingests CDC on both clouds."
    - "OpenTelemetry traces show the full lifecycle end-to-end."

  tasks:

    # Root task of the plan: no dependencies.
    - id: P1-001
      title: Define the Substrate interface
      owner: core
      depends_on: []
      deliverables:
        - core/maestro/substrate/interface.py    # Protocol classes
        - core/maestro/substrate/types.py        # shared types
        - docs/adr/0001-substrate-interface.md
      acceptance:
        - "Protocol covers: ContinuationStore, EventLog, Scheduler, StreamBus, ModelDispatch, Identity, Observability."
        - "Zero cloud SDK imports in core/."
        - "ADR documents the trade-off and the lock-down rule (narrow interface)."

    - id: P1-002
      title: Continuation data model and event types
      owner: core
      depends_on: [P1-001]
      deliverables:
        - core/maestro/models/continuation.py
        - core/maestro/models/events.py
        - core/maestro/models/goal_frame.py
        - core/maestro/models/budget.py
        - core/maestro/models/wake.py
      acceptance:
        - "Pydantic v2 models with full JSON schemas exported."
        - "Event types: spawn, wake, model_call, tool_call, publish, fork, merge, sleep, kill, budget_charge, human_signal, error."
        - "Round-trip serialization tested for every model."

    # P1-003 and P1-004 share the same dependency (P1-002), one per cloud.
    - id: P1-003
      title: Azure substrate — Cosmos DB continuation store
      owner: substrate-azure
      depends_on: [P1-002]
      deliverables:
        - substrate-azure/maestro_azure/store.py
        - substrate-azure/maestro_azure/lease.py
        - infra/azure/modules/cosmos.bicep
      acceptance:
        - "CRUD against Cosmos DB SQL API container."
        - "Lease with fencing token (generation), tested under concurrent writer chaos."
        - "Change feed wired to a stub consumer."

    # NOTE(review): acceptance reuses the P1-003 test suite, but depends_on
    # lists only P1-002 — confirm the suite is shared and does not require
    # P1-003 to land first.
    - id: P1-004
      title: AWS substrate — DynamoDB continuation store
      owner: substrate-aws
      depends_on: [P1-002]
      deliverables:
        - substrate-aws/maestro_aws/store.py
        - substrate-aws/maestro_aws/lease.py
        - infra/aws/modules/dynamodb.tf
      acceptance:
        - "Same test suite as P1-003 passes."
        - "Conditional writes used for fencing."
        - "DynamoDB Streams enabled and consumed by stub."

    # Joins the two substrate branches: needs both continuation stores.
    - id: P1-005
      title: Event log — append-only, partitioned by continuation
      owner: core
      depends_on: [P1-003, P1-004]
      deliverables:
        - core/maestro/eventlog/api.py
        # The "..." path segments below are placeholders as written in the plan.
        - substrate-azure/.../eventlog.py        # Cosmos container or Event Hubs
        - substrate-aws/.../eventlog.py          # DynamoDB or Kinesis
      acceptance:
        - "Append is idempotent on (continuation_id, generation, sequence)."
        - "Range queries by continuation_id work."
        - "p99 append latency < 50ms in load test (1k events/s)."

    - id: P1-006
      title: Scheduler v0 — timer wakes only
      owner: core
      depends_on: [P1-005]
      deliverables:
        - substrate-azure/.../scheduler_durable.py    # Durable Functions
        # NOTE(review): Step Functions Express workflows have a 5-minute
        # maximum duration; confirm this fits wakes scheduled further out, or
        # use Standard workflows / EventBridge Scheduler instead.
        - substrate-aws/.../scheduler_stepfn.py        # Step Functions Express
        - core/maestro/scheduler/api.py
      acceptance:
        - "register_wake(continuation_id, at_timestamp) reliably wakes within ±2s."
        - "Cancellation works."
        - "Survives worker restart."

    - id: P1-007
      title: Worker pod — execute one tick of a no-op continuation
      owner: core
      depends_on: [P1-006]
      deliverables:
        - core/maestro/worker/loop.py
        - core/maestro/worker/tick.py
        - infra/azure/modules/container_apps.bicep
        - infra/aws/modules/eks_worker.tf
      acceptance:
        - "Worker acquires lease, runs no-op tick, releases lease, persists event."
        - "Health check endpoint."
        - "Graceful shutdown drains in-flight ticks."

    - id: P1-008
      title: Public API v0 — submit continuation, get status
      owner: api
      depends_on: [P1-007]
      deliverables:
        - api/maestro_api/app.py                 # FastAPI
        - api/maestro_api/routes/continuations.py
        - api/openapi.json
      acceptance:
        - "POST /continuations creates one; GET /continuations/{id} returns status + recent events."
        - "Auth via Entra ID (Azure) and Cognito (AWS) behind the same interface."
        - "OpenTelemetry middleware attaches trace ids to every event."

    # Depends only on the event log (P1-005), not on the scheduler/worker/API
    # chain (P1-006..P1-008).
    - id: P1-009
      title: Warm store — CDC from hot store into Redshift / Synapse
      # Two owning packages written as one "+"-joined string (other tasks
      # name a single package).
      owner: substrate-aws + substrate-azure
      depends_on: [P1-005]
      deliverables:
        - infra/aws/modules/redshift.tf
        - infra/azure/modules/synapse.bicep
        - eval/cdc/aws_firehose_handler.py
        - eval/cdc/azure_eventhub_handler.py
        - eval/sql/fact_continuation_step.sql
        - eval/sql/fact_continuation_lifecycle.sql
      acceptance:
        - "Every event appears in warm store within 60s."
        - "Facts and dims defined, populated by dbt-style SQL."
        - "End-of-phase replay test reconstructs continuation state from warm store."

    - id: P1-010
      title: Observability — traces, metrics, audit
      owner: core
      depends_on: [P1-008]
      deliverables:
        - core/maestro/obs/setup.py
        - infra/azure/modules/app_insights.bicep
        - infra/aws/modules/grafana_amp.tf
        - dashboards/continuation_lifecycle.json
      acceptance:
        - "Every continuation produces a single trace spanning all ticks."
        - "Dashboard shows: active count, sleeping count, p50/p99 tick latency, cost per tick."

# ---------------------------------------------------------------------
# PHASE 2 — FIELD AND POLICY (Days 31–60)
# Goal: a continuation reads from a real context field, the policy
# kernel routes model calls, real money is spent against real budgets.
# ---------------------------------------------------------------------

# Phase 2 task list. Each task carries: id, title, owner (package name),
# depends_on (task ids), deliverables (repo paths), acceptance (checks).
phase_2_field_and_policy:
  duration_days: 30
  depends_on: [phase_1_spine]
  # Conditions that must all hold before the phase is considered done.
  exit_criteria:
    - "A continuation with a real goal frame retrieves a ranked field, calls Opus/Sonnet/Haiku via routing, respects budget, produces a structured artifact."
    - "Field reranker p99 latency < 800ms; cost < $0.005 per field recompute."
    - "Policy kernel runs in < 10ms p99 with rules-only mode."

  tasks:

    # P2-001..P2-004 form the context-field chain (index, recall, rerank,
    # materialize); each depends on the one before it.
    - id: P2-001
      title: Semantic channel — index dairy nutrition corpus
      owner: context-field
      depends_on: [P1-009]
      deliverables:
        - context-field/ingest/dairy_corpus.py
        - infra/azure/modules/ai_search.bicep
        - infra/aws/modules/opensearch.tf
      acceptance:
        - "10k+ chunks indexed from a curated dairy nutrition corpus (PubMed open access + Cargill-cleared internal docs)."
        - "Hybrid (vector + keyword) retrieval working on both clouds."
        - "Top-K query p99 < 50ms."

    - id: P2-002
      title: Stage-1 recall — fan-out to all channels
      owner: context-field
      depends_on: [P2-001]
      deliverables:
        - context-field/recall/fanout.py
        # Brace shorthand: one file per listed channel name.
        - context-field/recall/channels/{semantic,structured,episodic,graph,stream,human}.py
      acceptance:
        - "Recall service returns 200-500 candidates from up to 6 channels in < 200ms p99."
        - "Each candidate has provenance, embedding, structured features."

    - id: P2-003
      title: Stage-2 reranker — cross-encoder with Haiku-class
      owner: context-field
      depends_on: [P2-002]
      deliverables:
        - context-field/rerank/cross_encoder.py
        - context-field/rerank/score_function.py
        - context-field/rerank/cache.py
      acceptance:
        - "Score function implements the formula in §2.3 with configurable weights."
        - "Cache hit rate > 60% on the pilot workload."
        - "Per-recompute cost < $0.005 at target latency."

    - id: P2-004
      title: Field materialization service
      owner: context-field
      depends_on: [P2-003]
      deliverables:
        - context-field/service/api.py
        - context-field/service/manifest.py
      acceptance:
        - "GET /fields/{continuation_id} returns a manifest with items + evicted log."
        - "TTL respected; stale fields trigger recompute."
        - "Manifest token budget enforced."

    # Start of the policy/dispatch/budget chain; independent of P2-001..P2-004
    # (its only dependency is the phase-1 worker, P1-007).
    - id: P2-005
      title: Policy kernel v0 — rules-only cascade
      owner: policy-kernel
      depends_on: [P1-007]
      deliverables:
        - policy-kernel/kernel/decision.py
        # Brace shorthand: one file per listed layer name.
        - policy-kernel/kernel/layers/{a_gates,b_routing,c_branching,d_tools,e_escalation}.py
        - policy-kernel/service/api.py
      acceptance:
        - "POST /decide returns Decision in < 10ms p99."
        - "All five layers implemented per §3.3."
        - "Rationale field always populated, human-readable."

    - id: P2-006
      title: Model dispatch — tiered, metered, multi-provider
      owner: core
      depends_on: [P2-005]
      deliverables:
        - core/maestro/dispatch/router.py
        # Brace shorthand: one file per listed provider name.
        - core/maestro/dispatch/providers/{bedrock,foundry_inference,anthropic_direct,openai_direct}.py
        - core/maestro/dispatch/metering.py
      acceptance:
        - "Tier names (haiku/sonnet/opus) map to provider-specific model ids per tenant."
        - "Every call emits a budget_charge event with tokens, dollars, latency, and a runtime tag."
        - "Failover: if Bedrock Claude is unhealthy, fall back to Foundry Models Claude, then Anthropic direct."

    - id: P2-006a
      title: Foundry Models inference provider
      owner: core
      # Depends on P2-006 as well: both tasks deliver foundry_inference.py, and
      # the acceptance below refers to the Bedrock provider's event shape,
      # which P2-006 defines. Ordering them avoids two parallel PRs on one file.
      depends_on: [P2-005, P2-006]
      deliverables:
        - core/maestro/dispatch/providers/foundry_inference.py
        # Bicep modules live under infra/azure/modules/ like every other
        # Azure IaC deliverable in this plan.
        - infra/azure/modules/foundry_resource.bicep
        - eval/scenarios/foundry_inference_conformance.py
      acceptance:
        # NOTE(review): DefaultAzureCredential comes from the Azure SDK, while
        # P1-001 and do_not_do require zero cloud SDK imports in core/ —
        # confirm where the credential code is meant to live.
        - "Hits /openai/v1/chat/completions on a Foundry resource via Entra ID (DefaultAzureCredential / Managed Identity)."
        - "Tier names (haiku/sonnet/opus) route to per-tenant deployment names (e.g. claude-sonnet-4-6, claude-opus-4-6, claude-haiku-4-5, gpt-5, o4)."
        - "Emits budget_charge events with tokens, dollars, latency in the same shape as the Bedrock provider."
        - "Supports per-call cost levers: PTU vs MaaS deployment routing, prompt-cache hint pass-through."
        - "Content safety verdicts normalized into the model_call event's safety_outcome enum."

    - id: P2-006b
      title: Tool-schema normalization across providers
      owner: core
      depends_on: [P2-006, P2-006a]
      deliverables:
        - core/maestro/dispatch/tools/abi.py             # internal tool-call ABI
        - core/maestro/dispatch/tools/adapters.py        # to_provider_tool_schema / from_provider_tool_call
        - eval/scenarios/tool_schema_conformance.py
      acceptance:
        - "Single internal ToolCall / ToolResult shape used by all providers."
        - "Bedrock Converse (toolConfig.tools[].toolSpec) and Foundry Responses tool shapes both round-trip without loss."
        - "Conformance test: same goal frame + same tool definitions produce equivalent toolUse envelopes across both providers."

    - id: P2-007
      title: Budget enforcement — soft cap downshift, hard cap terminate
      owner: core
      depends_on: [P2-006]
      deliverables:
        - core/maestro/budget/tracker.py
        - core/maestro/budget/enforcement.py
      acceptance:
        - "Soft cap triggers route downshift via policy kernel."
        - "Hard cap forces publish-and-terminate."
        - "Per-tenant daily caps enforced at API ingress."

    # Joins the field chain (P2-004) with the budget chain (P2-007).
    - id: P2-008
      title: End-to-end pilot continuation — dairy evidence brief
      owner: core
      depends_on: [P2-004, P2-007]
      deliverables:
        - core/maestro/recipes/evidence_brief.py
        - eval/scenarios/evidence_brief_test.py
      acceptance:
        - "Goal frame from §1.3 of the architecture doc executes end-to-end."
        - "Output: a structured evidence brief with citations from the field."
        - "Total spend < $2 per brief on the pilot corpus."

    - id: P2-009
      title: Notebook UI v0 — submit goal frame, watch continuations
      owner: notebook
      depends_on: [P2-008]
      deliverables:
        - notebook/src/pages/index.astro
        - notebook/src/components/ContinuationList.tsx
        - notebook/src/components/FieldInspector.tsx
        - notebook/src/components/GoalFrameEditor.tsx
      acceptance:
        - "Researcher can submit a goal frame, see continuations live, inspect a field manifest."
        - "WebSocket / SSE for live updates."
        - "Works against both clouds via API."

    - id: P2-010
      title: Tenant onboarding for Cargill pilot
      owner: api
      depends_on: [P2-009]
      deliverables:
        - api/maestro_api/tenants/cargill.py
        - infra/aws/modules/cargill_tenant.tf
        - docs/runbooks/onboard_tenant.md
      acceptance:
        - "Cargill tenant has its own corpus, budgets, identity, and data isolation."
        - "Two named researchers can log in."
        - "Compliance check: PII redaction in event log payloads where flagged."

# ---------------------------------------------------------------------
# PHASE 3 — CONTINUITY AND LEARNING (Days 61–90)
# Goal: wake conditions beyond timers, forking and merging, the
# policy kernel learns from real usage, first customer pilot live.
# ---------------------------------------------------------------------

# Phase 3 task list. Each task carries: id, title, owner (package name),
# depends_on (task ids), deliverables (repo paths), acceptance (checks).
phase_3_continuity_and_learning:
  duration_days: 30
  depends_on: [phase_2_field_and_policy]
  # Conditions that must all hold before the phase is considered done.
  exit_criteria:
    - "Continuations sleep on stream / sibling-publish / human-signal wakes and resume correctly."
    - "Forking and merging work with lineage integrity."
    - "Policy kernel v1 (GBT) deployed in shadow mode with side-by-side decision comparison."
    - "Two Cargill consultants use Maestro on real briefs for two weeks; we have an NPS, a cost-per-brief, and a defect log."

  tasks:

    # P3-001..P3-003 form the continuity chain (stream wakes, other wakes,
    # fork/merge); each depends on the one before it.
    - id: P3-001
      title: Wake conditions — stream subscriptions
      owner: core
      depends_on: [P2-008]
      deliverables:
        - core/maestro/scheduler/streams.py
        - context-field/recall/channels/stream.py    # tie-in
        # The "..." path segments below are placeholders as written in the plan.
        - substrate-azure/.../stream_bus.py          # Event Grid + Service Bus
        - substrate-aws/.../stream_bus.py            # MSK + EventBridge
      acceptance:
        - "A continuation subscribed to arxiv.q-bio with a predicate wakes when a matching event arrives."
        - "Replay-safe: same event does not re-wake."

    - id: P3-002
      title: Wake conditions — sibling-publish and human-signal
      owner: core
      depends_on: [P3-001]
      deliverables:
        - core/maestro/scheduler/sibling.py
        - core/maestro/scheduler/human.py
        - notebook/src/components/SignalButton.tsx
      acceptance:
        - "Researcher click on Approve in the notebook wakes the waiting continuation within 2s."
        - "Sibling publishes to a tag trigger wakes for subscribers."

    - id: P3-003
      title: Forking and merging
      owner: core
      depends_on: [P3-002]
      deliverables:
        - core/maestro/lineage/fork.py
        - core/maestro/lineage/merge.py
      acceptance:
        - "Fork copies state, assigns new id, links parent edge, scoped budget."
        - "Merge takes N continuations + merge function, produces a single child."
        - "Lineage queries work: ancestors, descendants, siblings."

    # Policy-learning chain (P3-004 -> P3-005); its only task dependency is
    # the phase-1 warm store (P1-009).
    - id: P3-004
      title: Policy kernel v1 — GBT training pipeline
      owner: policy-kernel
      depends_on: [P1-009]
      deliverables:
        - eval/policy/feature_extraction.sql
        - eval/policy/train_route.py
        - eval/policy/train_branching.py
        - eval/policy/export_onnx.py
      acceptance:
        - "Training set built from warm store with ≥ 10k labeled decisions."
        - "Models exported to ONNX, latency tested in kernel pod."
        - "Holdout AUC reported per decision dimension."

    - id: P3-005
      title: Policy kernel v1 — shadow deployment
      owner: policy-kernel
      depends_on: [P3-004]
      deliverables:
        - policy-kernel/service/shadow.py
        - dashboards/policy_shadow_compare.json
      acceptance:
        - "Every live decision is shadow-evaluated by v1; both decisions logged."
        - "Dashboard shows agreement rate, divergence buckets, downstream outcome delta."
        - "Kill switch verified in chaos test."

    - id: P3-006
      title: Tool stream adapters — PubMed, J. Dairy Sci., Cargill internal
      owner: core
      depends_on: [P3-001]
      deliverables:
        - core/maestro/tools/streams/pubmed.py
        - core/maestro/tools/streams/jds.py
        - core/maestro/tools/streams/cargill_internal.py
      acceptance:
        - "Each adapter emits structured events to the stream bus with provenance."
        - "Backfill mode supported for warm-start."

    - id: P3-007
      title: Researcher notebook v1 — full workflow
      owner: notebook
      depends_on: [P3-003]
      deliverables:
        - notebook/src/pages/notebook/[id].astro
        - notebook/src/components/Lineage.tsx
        - notebook/src/components/Findings.tsx
        - notebook/src/components/AuditTrail.tsx
      acceptance:
        - "Researcher sees: lineage tree, live continuations, findings stream, audit trail."
        - "Approve/redirect/kill actions on any continuation."
        - "Budget visualization per continuation and root."

    # Joins the notebook chain (P3-007) with the shadow deployment (P3-005).
    - id: P3-008
      title: Cargill pilot — two consultants, two weeks
      # Three owning packages written as one "+"-joined string.
      owner: api + notebook + core
      depends_on: [P3-007, P3-005]
      deliverables:
        - docs/pilot/cargill_pilot_runbook.md
        - eval/pilot/metrics_collection.py
        - docs/pilot/pilot_report_template.md
      acceptance:
        - "Two named Cargill consultants complete ≥ 5 real evidence briefs each."
        - "Metrics collected: time-to-brief, cost-per-brief, NPS, defects."
        - "Defect log triaged into roadmap for days 91+."

    # NOTE(review): empty depends_on although the task sits in phase 3 —
    # presumably meant to run in parallel with earlier phases; confirm.
    - id: P3-009
      title: Maestro public site (Astro) — continuous publication of architecture
      owner: site
      depends_on: []
      deliverables:
        - site/                                  # full Astro app, see scaffold
        - site/src/content/architecture/
        - .github/workflows/site_deploy.yml
      acceptance:
        - "Architecture doc renders as content; updates redeploy automatically."
        - "Pages: Overview, Continuations, Context Field, Policy Kernel, Build Plan, Pilot Status."
        - "RSS + sitemap. Deployed to Azure Static Web Apps and CloudFront fronting S3."

    - id: P3-010
      title: ADR backlog + handoff doc for days 91+
      owner: core
      depends_on: [P3-008]
      deliverables:
        - docs/adr/                              # ≥ 8 ADRs by end of phase 3
        - docs/handoff/days_91_plus.md
      acceptance:
        - "Every load-bearing decision in phases 1-3 has an ADR."
        - "Handoff doc lists top-10 risks, next-3-months priorities, open questions."

    # Delivers the cloud-agnostic AgentRuntime interface as well as the Azure
    # implementation; P3-012 builds on that interface.
    - id: P3-011
      title: Foundry Agent Service substrate adapter
      owner: substrate-azure
      depends_on: [P2-006a, P3-003]
      deliverables:
        - core/maestro/substrate/agent_runtime.py            # interface only
        - substrate-azure/maestro_azure/agent_runtime.py     # FoundryAgentRuntime impl
        - infra/azure/modules/foundry_project.bicep
        - eval/scenarios/foundry_delegation.py
        - docs/adr/0007-agent-runtime-delegation.md
      acceptance:
        - "AgentRuntime interface: spawn_session, stream_events, signal, terminate, collect_terminal_state."
        - "FoundryAgentRuntime spawns a Hosted-agent session via azure-ai-projects; persistent $HOME and scale-to-zero resume work end-to-end."
        - "All session events stream back into the continuation event log with provenance runtime=foundry."
        - "OTEL parent-trace stitching: the Foundry session's spans appear under the continuation's root trace."
        - "Private endpoint friendly; runs in BYO VNet with the project Managed Identity."
        - "Delegation eligibility rule from architecture §4.4 enforced by the policy kernel before spawn."

    - id: P3-012
      title: AgentCore substrate adapter
      owner: substrate-aws
      # Depends on P3-011 because that task delivers the AgentRuntime
      # interface implemented here, and the last acceptance check reuses the
      # P3-011 eval scenario.
      depends_on: [P2-006, P3-003, P3-011]
      deliverables:
        - substrate-aws/maestro_aws/agent_runtime.py         # AgentCoreRuntime impl
        - infra/aws/modules/agentcore.tf
        - eval/scenarios/agentcore_delegation.py
      acceptance:
        - "AgentCoreRuntime composes AgentCore Runtime (session), Memory (short-term events), Gateway (MCP tool egress)."
        - "8-hour session ceiling and 15-min idle timeout respected; delegation rejected for sub-tasks whose expected wall-clock exceeds ceiling minus safety margin."
        - "All session events stream back into the continuation event log with provenance runtime=agentcore."
        - "OTEL parent-trace stitching to the continuation's root trace via CloudWatch GenAI Observability."
        - "Inbound Auth via IAM SigV4; per-user identity passed through via X-Amzn-Bedrock-AgentCore-Runtime-User-Id."
        - "Same eval scenario as P3-011 runs and matches outcome shape (modulo provider-specific fields)."

# ---------------------------------------------------------------------
# CROSS-CUTTING CONCERNS
# These run across all three phases and must not be skipped.
# ---------------------------------------------------------------------

# Rules grouped by concern; each group is a list of plain-text requirements.
cross_cutting:

  # Identity, tenant isolation, and compliance guardrails.
  security:
    - 'All inter-service auth via short-lived tokens (Managed Identity / IRSA).'
    - 'Tenant isolation enforced at the store layer, not just the API.'
    - 'PII redaction policy applied to event log payloads on write.'
    - 'Compliance freeze switch tested before any Cargill data lands.'

  # What CI and the nightly runs must exercise.
  testing:
    - 'Substrate test suite runs against both clouds in CI.'
    - 'Chaos tests for lease expiry, worker death mid-tick, stream replay.'
    - 'End-to-end test (P2-008 scenario) runs nightly on both clouds.'

  # Spend limits and the metrics watched for cost.
  cost_discipline:
    - 'Per-tenant daily $ cap enforced at ingress; alerts at 50/80/100%.'
    - 'Field reranker cost is the #1 watched metric.'
    - 'Policy kernel must run on CPU; no GPU dependency for control plane.'

  # When a tick may be handed to a vendor agent runtime, and how that is
  # recorded.
  vendor_runtime_delegation:
    # Folded scalar: the lines below join with single spaces into one string.
    - >-
      Continuations live in the Maestro worker pool by default. The policy
      kernel may delegate a tick to AgentCore or Foundry Agent Service only
      when (a) the kernel's route is a tier the vendor can serve, (b) expected
      wall-clock < vendor.session_ceiling − safety_margin, (c) the sub-task
      does not fork/merge/sleep on a multi-day signal, (d) the tenant's policy
      class allows external-runtime execution.
    - 'Every model_call event records runtime: maestro | agentcore | foundry.'
    - 'Vendor session events stream back into the continuation event log so the audit trail in Redshift/Synapse remains the single source of truth.'

  # READMEs, ADRs, and the architecture doc as the spec.
  documentation:
    - 'Every package has a README with the one-paragraph why and how.'
    - 'ADRs for every load-bearing decision.'
    - 'Architecture doc is the spec; if code disagrees, one of them is wrong.'

# Instructions for the agent that consumes this plan.
claude_code_usage:
  # Literal block scalar: line breaks in the text below are preserved.
  recommended_invocation: |
    Use this file as the source of truth for task selection.
    Pick a task by `id`, verify `depends_on` are met (look at git log for the
    deliverables of those tasks), then implement the deliverables and verify
    against `acceptance`. One task per branch. PR title is the task id + title.

  # How to choose the next task.
  task_selection_priority:
    - 'Always pick the lowest-numbered task whose dependencies are met.'
    - 'If a P1 task is open, P2 and P3 work is blocked unless explicitly parallel.'
    - 'Cross-cutting concerns are not optional — every PR must address them where relevant.'

  # Hard prohibitions.
  do_not_do:
    - 'Do not put LLM calls on the policy kernel critical path.'
    - 'Do not import cloud SDKs in core/.'
    - 'Do not skip the event log — every state change is an event.'
    - 'Do not invent new substrate methods without an ADR.'
