Skip to main content

End-to-End Operator Examples

This guide provides comprehensive, production-ready Terraform examples for operators managing Kubiya Control Plane infrastructure. Each example demonstrates real-world use cases with best practices.

Example 1: Complete Platform Setup

This example creates full platform infrastructure with multiple environments, teams, agents, skills, policies, and automated jobs.

Use Case

Set up complete platform engineering infrastructure for a medium-sized organization with separate production, staging, and development environments.
# Version constraints: Terraform CLI 1.0 or newer, Kubiya Control Plane provider 1.x.
terraform {
  required_providers {
    controlplane = {
      version = "~> 1.0"
      source  = "kubiya/control-plane"
    }
  }

  required_version = ">= 1.0"
}

# The provider block carries no arguments; credentials and endpoint come from
# the environment variables listed below.
provider "controlplane" {
  # Configuration via environment variables:
  # KUBIYA_CONTROL_PLANE_API_KEY (required)
  # KUBIYA_CONTROL_PLANE_BASE_URL (optional - for self-hosted)
}

# ============================================================================
# ENVIRONMENTS
# ============================================================================

# Production environment: us-east-1, up to 20 workers with auto-scaling,
# 90-day retention, and info-level logging for executions.
resource "controlplane_environment" "production" {
  tags         = ["production", "managed-by-terraform"]
  name         = "production"
  description  = "Production environment for AI agents"
  display_name = "Production Environment"

  # Environment variables handed to executions in this environment.
  execution_environment = jsonencode({
    env_vars = {
      APP_ENV   = "production"
      LOG_LEVEL = "info"
    }
  })

  # Sizing and retention settings.
  configuration = jsonencode({
    auto_scaling   = true
    max_workers    = 20
    region         = "us-east-1"
    retention_days = 90
  })
}

# Staging environment: us-west-2, up to 10 workers with auto-scaling, 30-day retention.
resource "controlplane_environment" "staging" {
  tags         = ["staging", "managed-by-terraform"]
  name         = "staging"
  description  = "Staging environment for testing"
  display_name = "Staging Environment"

  configuration = jsonencode({
    auto_scaling   = true
    max_workers    = 10
    region         = "us-west-2"
    retention_days = 30
  })
}

# Development environment: us-west-2, fixed pool of at most 5 workers
# (auto-scaling off), 7-day retention.
resource "controlplane_environment" "development" {
  tags         = ["development", "managed-by-terraform"]
  name         = "development"
  description  = "Development environment"
  display_name = "Development Environment"

  configuration = jsonencode({
    auto_scaling   = false
    max_workers    = 5
    region         = "us-west-2"
    retention_days = 7
  })
}

# ============================================================================
# PROJECTS
# ============================================================================

# Private project (key PLAT) owned by the platform team.
resource "controlplane_project" "platform" {
  key         = "PLAT"
  name        = "platform-engineering"
  visibility  = "private"
  description = "Platform engineering and infrastructure"
  goals       = "Manage and automate platform infrastructure"

  # Ownership and cost-attribution metadata.
  metadata = jsonencode({
    cost_center = "engineering"
    owner       = "platform-team"
  })
}

# Private project (key SEC) owned by the security team.
resource "controlplane_project" "security" {
  key         = "SEC"
  name        = "security-compliance"
  visibility  = "private"
  description = "Security and compliance automation"
  goals       = "Ensure security and compliance across infrastructure"

  # Ownership and cost-attribution metadata.
  metadata = jsonencode({
    cost_center = "security"
    owner       = "security-team"
  })
}

# ============================================================================
# SKILLS
# ============================================================================

# Shell skill limited to an explicit list of infrastructure CLIs.
resource "controlplane_skill" "shell_operations" {
  type        = "shell"
  name        = "shell-operations"
  enabled     = true
  description = "Shell command execution for infrastructure operations"

  configuration = jsonencode({
    working_dir      = "/app"
    timeout          = 600
    allowed_commands = ["kubectl", "helm", "aws", "gcloud", "terraform", "ansible"]
  })
}

# File-system skill scoped to three directories under /app.
resource "controlplane_skill" "filesystem_access" {
  type        = "file_system"
  name        = "filesystem-access"
  enabled     = true
  description = "File system operations"

  configuration = jsonencode({
    operations    = ["read", "write", "list", "delete"]
    max_file_size = 52428800 # 50MB
    allowed_paths = ["/app/configs", "/app/data", "/app/logs"]
  })
}

# Docker skill restricted to three registries and at most 20 containers.
resource "controlplane_skill" "docker_operations" {
  type        = "docker"
  name        = "docker-operations"
  enabled     = true
  description = "Docker container management"

  configuration = jsonencode({
    network_mode       = "bridge"
    max_containers     = 20
    allowed_registries = ["docker.io", "gcr.io", "ghcr.io"]
  })
}

# ============================================================================
# POLICIES
# ============================================================================

# Production guard-rails expressed as Rego deny rules (package kubiya.security).
# Each rule contributes its message when every condition in its body holds for
# the request document in `input`.
resource "controlplane_policy" "production_security" {
  name        = "production-security"
  description = "Security policy for production environment"
  enabled     = true

  policy_content = <<-EOT
    package kubiya.security

    # Deny destructive operations without approval.
    # object.get defaults a missing `approvals` field to an empty list. Without
    # the default, count() on an absent field is undefined, the rule body fails,
    # and a delete carrying no approvals at all would not be denied.
    deny[msg] {
      input.operation == "delete"
      input.environment == "production"
      count(object.get(input, "approvals", [])) < 2
      msg := "Delete operations in production require at least 2 approvals"
    }

    # Require MFA for sensitive operations
    deny[msg] {
      input.operation == "deploy"
      input.environment == "production"
      not input.mfa_verified
      msg := "Production deployments require MFA verification"
    }

    # Restrict resource modifications
    deny[msg] {
      input.operation == "modify"
      input.environment == "production"
      input.resource_type == "database"
      not input.maintenance_window
      msg := "Database modifications in production must occur during maintenance windows"
    }
  EOT

  tags = ["security", "production", "compliance"]
}

# Cost guard-rails expressed as Rego deny rules (package kubiya.cost).
# Each rule contributes its message when every condition in its body holds for
# the request document in `input`.
resource "controlplane_policy" "cost_control" {
  name        = "cost-control"
  description = "Cost control and resource limits policy"
  enabled     = true

  policy_content = <<-EOT
    package kubiya.cost

    # Block the oversized instance type.
    # The rule matches exactly one instance type, so the message names that type
    # rather than claiming a general size ceiling the rule does not enforce.
    deny[msg] {
      input.action == "create_instance"
      input.instance_type == "x2.32xlarge"
      msg := "Instance type x2.32xlarge is not allowed"
    }

    # Require cost tags
    deny[msg] {
      input.action == "create_resource"
      not input.tags.cost_center
      msg := "All resources must have a cost_center tag"
    }

    # Budget limits
    deny[msg] {
      input.action == "create_resource"
      input.estimated_monthly_cost > 10000
      not input.budget_approved
      msg := "Resources exceeding $10,000/month require budget approval"
    }
  EOT

  tags = ["cost", "governance", "finops"]
}

# ============================================================================
# TEAMS
# ============================================================================

# DevOps team on the Claude Code runtime, capped at 15 agents.
resource "controlplane_team" "devops" {
  description = "DevOps and platform engineering team"
  name        = "devops-team"

  capabilities = ["deployment", "monitoring", "incident_response"]

  # Accepted runtime values: "default" (Agno) or "claude_code" (Claude Code SDK).
  runtime = "claude_code"

  configuration = jsonencode({
    alert_on_error    = true
    enable_monitoring = true
    max_agents        = 15
    slack_channel     = "#devops-agents"
  })
}

# SRE team on the Claude Code runtime, capped at 10 agents.
resource "controlplane_team" "sre" {
  description = "Site reliability engineering team"
  name        = "sre-team"

  capabilities = ["monitoring", "incident_response", "performance_optimization"]

  runtime = "claude_code"

  configuration = jsonencode({
    enable_monitoring = true
    max_agents        = 10
    slack_channel     = "#sre-agents"
  })
}

# Security team on the "default" runtime, capped at 5 agents, with audit logging on.
resource "controlplane_team" "security" {
  name        = "security-team"
  description = "Security and compliance team"

  runtime = "default"

  # Attribute alignment follows canonical `terraform fmt` output so the
  # `terraform fmt -check` CI step passes on this file.
  configuration = jsonencode({
    max_agents    = 5
    slack_channel = "#security-agents"
    audit_logging = true
  })

  capabilities = ["security_scanning", "compliance_checking", "threat_detection"]
}

# ============================================================================
# AGENTS
# ============================================================================

# Deployment agent attached to the DevOps team; low temperature (0.3) and
# approval_needed = true in its configuration.
resource "controlplane_agent" "deployment_agent" {
  description = "AI agent for production deployments"
  name        = "production-deployer"

  team_id  = controlplane_team.devops.id
  runtime  = "claude_code"
  model_id = "kubiya/claude-sonnet-4"

  capabilities = ["kubernetes_deploy", "helm_deploy", "rollback"]

  # Model sampling settings.
  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.3
  })

  # Agent behaviour settings.
  configuration = jsonencode({
    approval_needed = true
    capabilities    = ["kubernetes", "helm", "terraform"]
    max_retries     = 3
    permissions     = ["read", "execute", "deploy"]
    timeout         = 900
  })
}

# Monitoring agent attached to the SRE team; alerts go to Slack and PagerDuty.
resource "controlplane_agent" "monitoring_agent" {
  description = "AI agent for monitoring and alerting"
  name        = "monitoring-assistant"

  team_id  = controlplane_team.sre.id
  runtime  = "claude_code"
  model_id = "kubiya/claude-sonnet-4"

  capabilities = ["metrics_collection", "alerting", "log_analysis"]

  # Model sampling settings.
  llm_config = jsonencode({
    max_tokens  = 3000
    temperature = 0.5
  })

  # Agent behaviour settings.
  configuration = jsonencode({
    alert_channels = ["slack", "pagerduty"]
    capabilities   = ["metrics", "logging", "tracing"]
    check_interval = 60
  })
}

# Incident-response agent attached to the SRE team; auto_remediation is off.
resource "controlplane_agent" "incident_responder" {
  description = "AI agent for incident response"
  name        = "incident-responder"

  team_id  = controlplane_team.sre.id
  runtime  = "claude_code"
  model_id = "kubiya/claude-sonnet-4"

  capabilities = ["incident_management", "root_cause_analysis", "remediation"]

  # Model sampling settings.
  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.4
  })

  # Agent behaviour settings.
  configuration = jsonencode({
    auto_remediation   = false
    capabilities       = ["diagnostics", "remediation", "communication"]
    escalation_timeout = 600
  })
}

# Security-scanning agent attached to the security team, on the "default" runtime.
resource "controlplane_agent" "security_scanner" {
  description = "AI agent for security scanning"
  name        = "security-scanner"

  team_id  = controlplane_team.security.id
  runtime  = "default"
  model_id = "kubiya/claude-sonnet-4"

  capabilities = ["security_scanning", "compliance_reporting"]

  # Model sampling settings.
  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.2
  })

  # Agent behaviour settings.
  configuration = jsonencode({
    capabilities    = ["vulnerability_scanning", "compliance_checking"]
    scan_frequency  = "daily"
    severity_levels = ["critical", "high", "medium"]
  })
}

# ============================================================================
# JOBS
# ============================================================================

# Cron job: runs the monitoring agent every day at 09:00 UTC.
resource "controlplane_job" "daily_health_check" {
  enabled       = true
  name          = "daily-health-check"
  description   = "Daily health check at 9am UTC"
  trigger_type  = "cron"
  cron_timezone = "UTC"
  cron_schedule = "0 9 * * *" # 9 AM UTC daily

  executor_type = "auto"

  # Target agent and the prompts it receives.
  entity_type     = "agent"
  entity_id       = controlplane_agent.monitoring_agent.id
  planning_mode   = "predefined_agent"
  system_prompt   = "Check health of all production services, databases, and infrastructure. Report any issues immediately."
  prompt_template = "Run comprehensive health check for all production services"

  execution_env_vars = {
    ALERT_ON_FAILURE = "true"
    CHECK_TYPE       = "comprehensive"
    INCLUDE_METRICS  = "true"
  }
}

# Webhook-triggered job: hands deployment requests to the deployment agent and
# runs them in the production environment.
resource "controlplane_job" "deployment_webhook" {
  enabled      = true
  name         = "deployment-webhook"
  description  = "Handle deployment webhook events"
  trigger_type = "webhook"

  environment_name = controlplane_environment.production.name
  executor_type    = "environment"

  # Target agent and the prompts it receives; the {{...}} placeholders are
  # part of the template string.
  entity_type     = "agent"
  entity_id       = controlplane_agent.deployment_agent.id
  planning_mode   = "predefined_agent"
  system_prompt   = "Process deployment request. Verify prerequisites, execute deployment, and confirm success."
  prompt_template = "Deploy {{service_name}} version {{version}} to {{environment}}"

  config = jsonencode({
    timeout = 1800 # 30 minutes
    retry_policy = {
      backoff      = "exponential"
      max_attempts = 3
    }
  })
}

# Cron job: runs the security scanner agent every day at 02:00 UTC.
resource "controlplane_job" "security_scan" {
  name          = "security-scan"
  description   = "Daily security vulnerability scan"
  enabled       = true
  trigger_type  = "cron"
  cron_schedule = "0 2 * * *" # 2 AM UTC daily
  cron_timezone = "UTC"

  # Target agent and the prompts it receives.
  planning_mode   = "predefined_agent"
  entity_type     = "agent"
  entity_id       = controlplane_agent.security_scanner.id
  prompt_template = "Run security vulnerability scan for all production infrastructure"
  system_prompt   = "Perform comprehensive security scan. Report vulnerabilities by severity."

  executor_type = "auto"

  # Attribute alignment follows canonical `terraform fmt` output so the
  # `terraform fmt -check` CI step passes on this file.
  execution_env_vars = {
    SCAN_TYPE       = "full"
    REPORT_FORMAT   = "json"
    SEVERITY_FILTER = "high,critical"
  }

  # Secrets are referenced by name only; no secret value appears in this file.
  execution_secrets = ["security_scanner_token"]
}

# Manually triggered job: hands an incident to the incident-responder agent.
resource "controlplane_job" "incident_response_manual" {
  enabled      = true
  name         = "incident-response"
  description  = "Manual incident response job"
  trigger_type = "manual"

  executor_type = "auto"

  # Secrets are referenced by name only.
  execution_secrets = ["pagerduty_token", "slack_webhook"]

  # Target agent and the prompts it receives; the {{...}} placeholders are
  # part of the template string.
  entity_type     = "agent"
  entity_id       = controlplane_agent.incident_responder.id
  planning_mode   = "predefined_agent"
  system_prompt   = "Coordinate incident response. Diagnose issue, implement remediation, and communicate status."
  prompt_template = "Handle incident: {{incident_id}} - {{description}}"
}

# ============================================================================
# WORKER QUEUES
# ============================================================================

# High-priority production queue: up to 20 workers, heartbeat_interval of 60.
resource "controlplane_worker_queue" "production_primary" {
  environment_name   = controlplane_environment.production.name
  name               = "production-primary"
  display_name       = "Production Primary Queue"
  description        = "Primary worker queue for production workloads"
  tags               = ["production", "primary", "high-priority"]
  max_workers        = 20
  heartbeat_interval = 60

  settings = {
    priority = "high"
    tier     = "production"
    region   = "us-east-1"
  }
}

# Normal-priority production queue for batch work: up to 10 workers,
# heartbeat_interval of 120.
resource "controlplane_worker_queue" "production_batch" {
  environment_name   = controlplane_environment.production.name
  name               = "production-batch"
  display_name       = "Production Batch Queue"
  description        = "Worker queue for batch jobs and scheduled tasks"
  tags               = ["production", "batch", "normal-priority"]
  max_workers        = 10
  heartbeat_interval = 120

  settings = {
    priority = "normal"
    tier     = "production"
    region   = "us-east-1"
  }
}

# ============================================================================
# OUTPUTS
# ============================================================================

# IDs of the three environments, keyed by environment name.
output "environment_ids" {
  description = "Environment IDs"
  value = {
    development = controlplane_environment.development.id
    staging     = controlplane_environment.staging.id
    production  = controlplane_environment.production.id
  }
}

# IDs of the three teams, keyed by short team name.
output "team_ids" {
  description = "Team IDs"
  value = {
    security = controlplane_team.security.id
    sre      = controlplane_team.sre.id
    devops   = controlplane_team.devops.id
  }
}

# IDs of the four agents, keyed by role.
output "agent_ids" {
  description = "Agent IDs"
  value = {
    security_scanner   = controlplane_agent.security_scanner.id
    incident_responder = controlplane_agent.incident_responder.id
    monitor            = controlplane_agent.monitoring_agent.id
    deployer           = controlplane_agent.deployment_agent.id
  }
}

# Marked sensitive so the URL is redacted in plan/apply output.
output "deployment_webhook_url" {
  sensitive   = true
  description = "Webhook URL for deployments"
  value       = controlplane_job.deployment_webhook.webhook_url
}

# task_name attribute of each production worker queue.
output "worker_queue_names" {
  description = "Worker queue task names"
  value = {
    batch   = controlplane_worker_queue.production_batch.task_name
    primary = controlplane_worker_queue.production_primary.task_name
  }
}

Deployment Instructions

  1. Set up authentication:
    # For hosted control plane
    export KUBIYA_CONTROL_PLANE_API_KEY="kcp_your_api_key"
    
    # For self-hosted
    export KUBIYA_CONTROL_PLANE_API_KEY="kcp_your_api_key"
    export KUBIYA_CONTROL_PLANE_BASE_URL="https://control-plane.company.com"
    
  2. Initialize and apply:
    terraform init
    terraform plan
    terraform apply
    
  3. Verify resources:
    terraform output
    

Example 2: GitOps Workflow with Multiple Environments

This example demonstrates using Terraform workspaces to manage multiple environments with GitOps practices.

Directory Structure

terraform/
  ├── main.tf
  ├── variables.tf
  ├── outputs.tf
  ├── backend.tf
  └── environments/
      ├── production.tfvars
      ├── staging.tfvars
      └── development.tfvars

variables.tf

# Required: no default, so it must come from a tfvars file or -var.
variable "environment" {
  type        = string
  description = "Environment name (production, staging, development)"
}

variable "region" {
  type        = string
  default     = "us-east-1"
  description = "Primary region"
}

# Required: no default.
variable "max_workers" {
  type        = number
  description = "Maximum number of workers per environment"
}

# Required: no default.
variable "retention_days" {
  type        = number
  description = "Log retention in days"
}

variable "team_runtime" {
  type        = string
  default     = "claude_code"
  description = "Runtime type for teams (default or claude_code)"
}

variable "agent_model" {
  type        = string
  default     = "kubiya/claude-sonnet-4"
  description = "LLM model for agents"
}

variable "enable_auto_scaling" {
  type        = bool
  default     = true
  description = "Enable auto-scaling for workers"
}

environments/production.tfvars

# Variable values for the production workspace.
environment         = "production"
region              = "us-east-1"
agent_model         = "kubiya/claude-sonnet-4"
team_runtime        = "claude_code"
enable_auto_scaling = true
max_workers         = 20
retention_days      = 90

environments/staging.tfvars

# Variable values for the staging workspace.
environment         = "staging"
region              = "us-west-2"
agent_model         = "kubiya/claude-sonnet-4"
team_runtime        = "claude_code"
enable_auto_scaling = true
max_workers         = 10
retention_days      = 30

Deployment Workflow

# `terraform workspace select` exits non-zero when the workspace does not exist
# yet (e.g. on the first run), so fall back to `workspace new`, which creates
# the workspace and switches to it.

# Production
terraform workspace select production || terraform workspace new production
terraform apply -var-file=environments/production.tfvars

# Staging
terraform workspace select staging || terraform workspace new staging
terraform apply -var-file=environments/staging.tfvars

# Development
# NOTE: environments/development.tfvars is listed in the directory structure but
# not shown in this guide; create it with the same variables as the files above.
terraform workspace select development || terraform workspace new development
terraform apply -var-file=environments/development.tfvars

Example 3: CI/CD Pipeline Integration

GitHub Actions Workflow

# Validates and plans Terraform on every push/PR touching terraform/**;
# applies the saved plan only for pushes to main.
name: Terraform Apply

on:
  push:
    branches: [main]
    paths:
      - 'terraform/**'
  pull_request:
    branches: [main]
    paths:
      - 'terraform/**'

jobs:
  terraform:
    runs-on: ubuntu-latest
    environment: production

    defaults:
      run:
        working-directory: terraform

    steps:
      # v4/v3 of these actions replace the releases built on the deprecated
      # Node 16 runner runtime.
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          # Quoted so the version is always read as a string.
          terraform_version: "1.5.0"

      - name: Terraform Format Check
        run: terraform fmt -check

      # -input=false makes Terraform fail fast instead of waiting on an
      # interactive prompt that can never be answered in CI.
      - name: Terraform Init
        run: terraform init -input=false

      - name: Terraform Validate
        run: terraform validate

      - name: Terraform Plan
        env:
          KUBIYA_CONTROL_PLANE_API_KEY: ${{ secrets.KUBIYA_API_KEY }}
          KUBIYA_CONTROL_PLANE_BASE_URL: ${{ secrets.KUBIYA_BASE_URL }}
        run: terraform plan -input=false -out=tfplan

      # Applies exactly the plan file produced by the previous step.
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        env:
          KUBIYA_CONTROL_PLANE_API_KEY: ${{ secrets.KUBIYA_API_KEY }}
          KUBIYA_CONTROL_PLANE_BASE_URL: ${{ secrets.KUBIYA_BASE_URL }}
        run: terraform apply -input=false -auto-approve tfplan

Example 4: Self-Hosted Control Plane with Custom Network

For self-hosted deployments in private networks:
# The provider block carries no arguments; for a self-hosted control plane both
# environment variables below are set, the second pointing at the internal URL.
provider "controlplane" {
  # Environment variables:
  # KUBIYA_CONTROL_PLANE_API_KEY="kcp_your_api_key"
  # KUBIYA_CONTROL_PLANE_BASE_URL="https://kubiya.internal.company.net"
}

# Self-hosted environment in a private network. The vpc/subnet/security-group
# IDs are placeholders to replace with real values.
resource "controlplane_environment" "on_premise" {
  name         = "on-premise"
  display_name = "On-Premise Environment"
  description  = "Self-hosted environment in private network"

  # Attribute alignment in both object literals follows canonical
  # `terraform fmt` output so a `terraform fmt -check` CI step passes.
  configuration = jsonencode({
    network_mode    = "private"
    vpc_id          = "vpc-xxxxx"
    subnet_ids      = ["subnet-xxxxx", "subnet-yyyyy"]
    security_groups = ["sg-xxxxx"]
  })

  # Proxy settings passed to executions as environment variables.
  execution_environment = jsonencode({
    env_vars = {
      PROXY_URL   = "http://proxy.internal:8080"
      NO_PROXY    = "localhost,127.0.0.1,.internal"
      HTTPS_PROXY = "http://proxy.internal:8080"
    }
  })
}

Example 5: Using Data Sources

Look up and reference existing resources:
# Read-only lookups of resources that already exist, addressed by ID.
# The IDs below are placeholders.

# Existing environment
data "controlplane_environment" "existing_prod" {
  id = "env-xxxxx"
}

# Existing team
data "controlplane_team" "existing_devops" {
  id = "team-xxxxx"
}

# New agent wired to resources that are looked up through data sources.
resource "controlplane_agent" "new_agent" {
  runtime     = "claude_code"
  model_id    = "kubiya/claude-sonnet-4"
  description = "New agent using existing team"
  name        = "new-deployment-agent"

  # Team ID comes from the data source.
  team_id = data.controlplane_team.existing_devops.id

  # The looked-up environment's name is embedded in the agent configuration.
  configuration = jsonencode({
    environment = data.controlplane_environment.existing_prod.name
  })

  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.7
  })
}

# Expose selected attributes of the looked-up environment.
output "existing_env_info" {
  value = {
    tags        = data.controlplane_environment.existing_prod.tags
    description = data.controlplane_environment.existing_prod.description
    name        = data.controlplane_environment.existing_prod.name
  }
}

Best Practices Summary

1. State Management

# backend.tf
# Remote state in S3 so state is shared rather than kept on one machine.
terraform {
  backend "s3" {
    bucket         = "company-terraform-state"
    key            = "kubiya/control-plane/terraform.tfstate"
    region         = "us-east-1"
    # Server-side encryption of the state object.
    encrypt        = true
    # DynamoDB table used for state locking.
    # NOTE(review): newer Terraform releases offer S3-native locking as an
    # alternative — confirm against the Terraform version in use.
    dynamodb_table = "terraform-locks"
  }
}

2. Secrets Management

Never store secrets in Terraform configurations:
# Use secrets manager
# Read the API key from AWS Secrets Manager at run time and export it for the
# provider; --query SecretString with --output text yields only the secret value.
export KUBIYA_CONTROL_PLANE_API_KEY=$(aws secretsmanager get-secret-value \
  --secret-id kubiya/api-key \
  --query SecretString \
  --output text)

3. Resource Naming

Use consistent, descriptive naming:
# Shared name prefix ("<environment>-<region>") and tag list built from input variables.
# NOTE(review): var.team_name (and var.agent_purpose used below) are not among
# the variables shown in this guide's variables.tf — declare them before use.
locals {
  env_prefix = "${var.environment}-${var.region}"
  common_tags = [
    "environment:${var.environment}",
    "managed-by:terraform",
    "team:${var.team_name}"
  ]
}

# Naming illustration only.
# NOTE(review): the agents in Example 1 also set model_id, runtime and team_id,
# and none of them sets `tags` — presumably this snippet is abbreviated; confirm
# the required arguments and `tags` support against the provider schema.
resource "controlplane_agent" "example" {
  name = "${local.env_prefix}-agent-${var.agent_purpose}"
  tags = local.common_tags
}

4. Modular Design

Create reusable modules:
modules/
  ├── agent-team/
  │   ├── main.tf
  │   ├── variables.tf
  │   └── outputs.tf
  ├── environment/
  │   ├── main.tf
  │   ├── variables.tf
  │   └── outputs.tf
  └── job/
      ├── main.tf
      ├── variables.tf
      └── outputs.tf

Additional Resources

Support

For questions or issues: