End-to-End Operator Examples
This guide provides comprehensive, production-ready Terraform examples for operators managing Kubiya Control Plane infrastructure. Each example demonstrates real-world use cases with best practices.

Example 1: Complete Platform Setup

This example creates a full platform infrastructure with multiple environments, teams, agents, skills, policies, and automated jobs.

Use Case

Set up a complete platform engineering infrastructure for a medium-sized organization with separate production, staging, and development environments.
# Pin the Terraform CLI version and the Kubiya control-plane provider.
terraform {
  required_version = ">= 1.0"

  required_providers {
    controlplane = {
      version = "~> 1.0"
      source  = "kubiya/control-plane"
    }
  }
}
# The provider block carries no arguments; per the notes below, its settings
# are expected to come from environment variables.
provider "controlplane" {
# Configuration via environment variables:
#   KUBIYA_CONTROL_PLANE_API_KEY  (required)
#   KUBIYA_CONTROL_PLANE_BASE_URL (optional - for self-hosted)
}
# ============================================================================
# ENVIRONMENTS
# ============================================================================
# Production environment: us-east-1, up to 20 workers, auto-scaling enabled.
resource "controlplane_environment" "production" {
  name         = "production"
  display_name = "Production Environment"
  description  = "Production environment for AI agents"
  tags         = ["production", "managed-by-terraform"]

  # Settings are handed to the provider as a JSON-encoded string.
  configuration = jsonencode({
    auto_scaling   = true
    max_workers    = 20
    region         = "us-east-1"
    retention_days = 90
  })

  # Environment variables declared for this environment's executions.
  execution_environment = jsonencode({
    env_vars = {
      APP_ENV   = "production"
      LOG_LEVEL = "info"
    }
  })
}
# Staging environment: us-west-2, up to 10 workers, auto-scaling enabled.
resource "controlplane_environment" "staging" {
  name         = "staging"
  display_name = "Staging Environment"
  description  = "Staging environment for testing"
  tags         = ["staging", "managed-by-terraform"]

  # Settings are handed to the provider as a JSON-encoded string.
  configuration = jsonencode({
    auto_scaling   = true
    max_workers    = 10
    region         = "us-west-2"
    retention_days = 30
  })
}
# Development environment: us-west-2, up to 5 workers, auto-scaling disabled.
resource "controlplane_environment" "development" {
  name         = "development"
  display_name = "Development Environment"
  description  = "Development environment"
  tags         = ["development", "managed-by-terraform"]

  # Settings are handed to the provider as a JSON-encoded string.
  configuration = jsonencode({
    auto_scaling   = false
    max_workers    = 5
    region         = "us-west-2"
    retention_days = 7
  })
}
# ============================================================================
# PROJECTS
# ============================================================================
# Private project for platform engineering work (key "PLAT").
resource "controlplane_project" "platform" {
  key         = "PLAT"
  name        = "platform-engineering"
  description = "Platform engineering and infrastructure"
  goals       = "Manage and automate platform infrastructure"
  visibility  = "private"

  # Ownership details, stored as a JSON-encoded string.
  metadata = jsonencode({
    cost_center = "engineering"
    owner       = "platform-team"
  })
}
# Private project for security and compliance automation (key "SEC").
resource "controlplane_project" "security" {
  key         = "SEC"
  name        = "security-compliance"
  description = "Security and compliance automation"
  goals       = "Ensure security and compliance across infrastructure"
  visibility  = "private"

  # Ownership details, stored as a JSON-encoded string.
  metadata = jsonencode({
    cost_center = "security"
    owner       = "security-team"
  })
}
# ============================================================================
# SKILLS
# ============================================================================
# Shell skill with an explicit list of allowed commands.
resource "controlplane_skill" "shell_operations" {
  name        = "shell-operations"
  type        = "shell"
  enabled     = true
  description = "Shell command execution for infrastructure operations"

  configuration = jsonencode({
    working_dir      = "/app"
    timeout          = 600
    allowed_commands = ["kubectl", "helm", "aws", "gcloud", "terraform", "ansible"]
  })
}
# File-system skill limited to three /app sub-directories.
resource "controlplane_skill" "filesystem_access" {
  name        = "filesystem-access"
  type        = "file_system"
  enabled     = true
  description = "File system operations"

  configuration = jsonencode({
    operations    = ["read", "write", "list", "delete"]
    max_file_size = 52428800 # 50MB
    allowed_paths = ["/app/configs", "/app/data", "/app/logs"]
  })
}
# Docker skill with an explicit list of allowed registries.
resource "controlplane_skill" "docker_operations" {
  name        = "docker-operations"
  type        = "docker"
  enabled     = true
  description = "Docker container management"

  configuration = jsonencode({
    network_mode       = "bridge"
    max_containers     = 20
    allowed_registries = ["docker.io", "gcr.io", "ghcr.io"]
  })
}
# ============================================================================
# POLICIES
# ============================================================================
# Production guard-rail policy, written in Rego (package kubiya.security).
# Each `deny` rule adds a message when its conditions all hold.
resource "controlplane_policy" "production_security" {
  name        = "production-security"
  description = "Security policy for production environment"
  enabled     = true
  tags        = ["security", "production", "compliance"]

  policy_content = <<-EOT
    package kubiya.security

    # Deny destructive operations without approval.
    # object.get supplies an empty list when `approvals` is absent; a bare
    # count(input.approvals) would be undefined in that case and the rule
    # would silently not fire, letting unapproved deletes through.
    deny[msg] {
      input.operation == "delete"
      input.environment == "production"
      count(object.get(input, "approvals", [])) < 2
      msg := "Delete operations in production require at least 2 approvals"
    }

    # Require MFA for sensitive operations
    deny[msg] {
      input.operation == "deploy"
      input.environment == "production"
      not input.mfa_verified
      msg := "Production deployments require MFA verification"
    }

    # Restrict resource modifications
    deny[msg] {
      input.operation == "modify"
      input.environment == "production"
      input.resource_type == "database"
      not input.maintenance_window
      msg := "Database modifications in production must occur during maintenance windows"
    }
  EOT
}
# Cost-governance policy, written in Rego (package kubiya.cost).
# Each `deny` rule adds a message when its conditions all hold.
resource "controlplane_policy" "cost_control" {
  name        = "cost-control"
  description = "Cost control and resource limits policy"
  enabled     = true
  tags        = ["cost", "governance", "finops"]

  policy_content = <<-EOT
    package kubiya.cost

    # Limit instance sizes.
    # The rule blocks exactly the types listed here; extend the set to block
    # more. The message names the rejected type so it always matches what the
    # rule actually enforces.
    oversized_instance_types := {"x2.32xlarge"}

    deny[msg] {
      input.action == "create_instance"
      oversized_instance_types[input.instance_type]
      msg := sprintf("Instance type %s is too large and is not allowed", [input.instance_type])
    }

    # Require cost tags
    deny[msg] {
      input.action == "create_resource"
      not input.tags.cost_center
      msg := "All resources must have a cost_center tag"
    }

    # Budget limits
    deny[msg] {
      input.action == "create_resource"
      input.estimated_monthly_cost > 10000
      not input.budget_approved
      msg := "Resources exceeding $10,000/month require budget approval"
    }
  EOT
}
# ============================================================================
# TEAMS
# ============================================================================
# DevOps team, running on the Claude Code runtime.
resource "controlplane_team" "devops" {
  name        = "devops-team"
  description = "DevOps and platform engineering team"

  # Runtime: "default" (Agno) or "claude_code" (Claude Code SDK)
  runtime      = "claude_code"
  capabilities = ["deployment", "monitoring", "incident_response"]

  configuration = jsonencode({
    alert_on_error    = true
    enable_monitoring = true
    max_agents        = 15
    slack_channel     = "#devops-agents"
  })
}
# SRE team, running on the Claude Code runtime.
resource "controlplane_team" "sre" {
  name         = "sre-team"
  description  = "Site reliability engineering team"
  runtime      = "claude_code"
  capabilities = ["monitoring", "incident_response", "performance_optimization"]

  configuration = jsonencode({
    enable_monitoring = true
    max_agents        = 10
    slack_channel     = "#sre-agents"
  })
}
# Security team, running on the "default" runtime.
resource "controlplane_team" "security" {
  name         = "security-team"
  description  = "Security and compliance team"
  runtime      = "default"
  capabilities = ["security_scanning", "compliance_checking", "threat_detection"]

  configuration = jsonencode({
    audit_logging = true
    max_agents    = 5
    slack_channel = "#security-agents"
  })
}
# ============================================================================
# AGENTS
# ============================================================================
# Deployment agent, attached to the DevOps team.
resource "controlplane_agent" "deployment_agent" {
  name         = "production-deployer"
  description  = "AI agent for production deployments"
  team_id      = controlplane_team.devops.id
  runtime      = "claude_code"
  model_id     = "kubiya/claude-sonnet-4"
  capabilities = ["kubernetes_deploy", "helm_deploy", "rollback"]

  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.3
  })

  configuration = jsonencode({
    approval_needed = true
    capabilities    = ["kubernetes", "helm", "terraform"]
    max_retries     = 3
    permissions     = ["read", "execute", "deploy"]
    timeout         = 900
  })
}
# Monitoring agent, attached to the SRE team.
resource "controlplane_agent" "monitoring_agent" {
  name         = "monitoring-assistant"
  description  = "AI agent for monitoring and alerting"
  team_id      = controlplane_team.sre.id
  runtime      = "claude_code"
  model_id     = "kubiya/claude-sonnet-4"
  capabilities = ["metrics_collection", "alerting", "log_analysis"]

  llm_config = jsonencode({
    max_tokens  = 3000
    temperature = 0.5
  })

  configuration = jsonencode({
    alert_channels = ["slack", "pagerduty"]
    capabilities   = ["metrics", "logging", "tracing"]
    check_interval = 60
  })
}
# Incident-response agent, attached to the SRE team; auto-remediation is off.
resource "controlplane_agent" "incident_responder" {
  name         = "incident-responder"
  description  = "AI agent for incident response"
  team_id      = controlplane_team.sre.id
  runtime      = "claude_code"
  model_id     = "kubiya/claude-sonnet-4"
  capabilities = ["incident_management", "root_cause_analysis", "remediation"]

  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.4
  })

  configuration = jsonencode({
    auto_remediation   = false
    capabilities       = ["diagnostics", "remediation", "communication"]
    escalation_timeout = 600
  })
}
# Security-scanning agent, attached to the security team ("default" runtime).
resource "controlplane_agent" "security_scanner" {
  name         = "security-scanner"
  description  = "AI agent for security scanning"
  team_id      = controlplane_team.security.id
  runtime      = "default"
  model_id     = "kubiya/claude-sonnet-4"
  capabilities = ["security_scanning", "compliance_reporting"]

  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.2
  })

  configuration = jsonencode({
    capabilities    = ["vulnerability_scanning", "compliance_checking"]
    scan_frequency  = "daily"
    severity_levels = ["critical", "high", "medium"]
  })
}
# ============================================================================
# JOBS
# ============================================================================
# Cron-triggered job that hands a health-check prompt to the monitoring agent.
resource "controlplane_job" "daily_health_check" {
  name        = "daily-health-check"
  description = "Daily health check at 9am UTC"
  enabled     = true

  # Schedule
  trigger_type  = "cron"
  cron_schedule = "0 9 * * *" # 9 AM UTC daily
  cron_timezone = "UTC"

  # Target agent
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.monitoring_agent.id

  # Prompts
  prompt_template = "Run comprehensive health check for all production services"
  system_prompt   = "Check health of all production services, databases, and infrastructure. Report any issues immediately."

  # Execution
  executor_type = "auto"
  execution_env_vars = {
    ALERT_ON_FAILURE = "true"
    CHECK_TYPE       = "comprehensive"
    INCLUDE_METRICS  = "true"
  }
}
# Webhook-triggered job that hands deployment requests to the deployment agent
# and targets the production environment.
resource "controlplane_job" "deployment_webhook" {
  name         = "deployment-webhook"
  description  = "Handle deployment webhook events"
  enabled      = true
  trigger_type = "webhook"

  # Target agent
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.deployment_agent.id

  # Prompts; the {{...}} placeholders are reproduced verbatim.
  prompt_template = "Deploy {{service_name}} version {{version}} to {{environment}}"
  system_prompt   = "Process deployment request. Verify prerequisites, execute deployment, and confirm success."

  # Execution
  executor_type    = "environment"
  environment_name = controlplane_environment.production.name

  config = jsonencode({
    retry_policy = {
      backoff      = "exponential"
      max_attempts = 3
    }
    timeout = 1800 # 30 minutes
  })
}
# Cron-triggered job that hands a vulnerability-scan prompt to the security scanner.
resource "controlplane_job" "security_scan" {
  name        = "security-scan"
  description = "Daily security vulnerability scan"
  enabled     = true

  # Schedule
  trigger_type  = "cron"
  cron_schedule = "0 2 * * *" # 2 AM UTC daily
  cron_timezone = "UTC"

  # Target agent
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.security_scanner.id

  # Prompts
  prompt_template = "Run security vulnerability scan for all production infrastructure"
  system_prompt   = "Perform comprehensive security scan. Report vulnerabilities by severity."

  # Execution
  executor_type     = "auto"
  execution_secrets = ["security_scanner_token"]
  execution_env_vars = {
    REPORT_FORMAT   = "json"
    SCAN_TYPE       = "full"
    SEVERITY_FILTER = "high,critical"
  }
}
# Manually triggered job that hands an incident to the incident-responder agent.
resource "controlplane_job" "incident_response_manual" {
  name         = "incident-response"
  description  = "Manual incident response job"
  enabled      = true
  trigger_type = "manual"

  # Target agent
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.incident_responder.id

  # Prompts; the {{...}} placeholders are reproduced verbatim.
  prompt_template = "Handle incident: {{incident_id}} - {{description}}"
  system_prompt   = "Coordinate incident response. Diagnose issue, implement remediation, and communicate status."

  # Execution
  executor_type     = "auto"
  execution_secrets = ["pagerduty_token", "slack_webhook"]
}
# ============================================================================
# WORKER QUEUES
# ============================================================================
# High-priority worker queue in the production environment.
resource "controlplane_worker_queue" "production_primary" {
  name             = "production-primary"
  display_name     = "Production Primary Queue"
  description      = "Primary worker queue for production workloads"
  environment_name = controlplane_environment.production.name
  tags             = ["production", "primary", "high-priority"]

  max_workers        = 20
  heartbeat_interval = 60

  settings = {
    priority = "high"
    region   = "us-east-1"
    tier     = "production"
  }
}
# Normal-priority worker queue in the production environment.
resource "controlplane_worker_queue" "production_batch" {
  name             = "production-batch"
  display_name     = "Production Batch Queue"
  description      = "Worker queue for batch jobs and scheduled tasks"
  environment_name = controlplane_environment.production.name
  tags             = ["production", "batch", "normal-priority"]

  max_workers        = 10
  heartbeat_interval = 120

  settings = {
    priority = "normal"
    region   = "us-east-1"
    tier     = "production"
  }
}
# ============================================================================
# OUTPUTS
# ============================================================================
# Map of environment label -> environment resource ID.
output "environment_ids" {
  description = "Environment IDs"

  value = {
    development = controlplane_environment.development.id
    production  = controlplane_environment.production.id
    staging     = controlplane_environment.staging.id
  }
}
# Map of team label -> team resource ID.
output "team_ids" {
  description = "Team IDs"

  value = {
    devops   = controlplane_team.devops.id
    security = controlplane_team.security.id
    sre      = controlplane_team.sre.id
  }
}
# Map of agent label -> agent resource ID.
output "agent_ids" {
  description = "Agent IDs"

  value = {
    deployer           = controlplane_agent.deployment_agent.id
    incident_responder = controlplane_agent.incident_responder.id
    monitor            = controlplane_agent.monitoring_agent.id
    security_scanner   = controlplane_agent.security_scanner.id
  }
}
# Webhook URL of the deployment job; marked sensitive so Terraform redacts it
# in plan/apply output.
output "deployment_webhook_url" {
  description = "Webhook URL for deployments"
  sensitive   = true
  value       = controlplane_job.deployment_webhook.webhook_url
}
# Map of queue label -> the queue's task_name attribute.
output "worker_queue_names" {
  description = "Worker queue task names"

  value = {
    batch   = controlplane_worker_queue.production_batch.task_name
    primary = controlplane_worker_queue.production_primary.task_name
  }
}
Deployment Instructions

1. Set up authentication:

   # For hosted control plane
   export KUBIYA_CONTROL_PLANE_API_KEY="kcp_your_api_key"

   # For self-hosted
   export KUBIYA_CONTROL_PLANE_API_KEY="kcp_your_api_key"
   export KUBIYA_CONTROL_PLANE_BASE_URL="https://control-plane.company.com"

2. Initialize and apply:

   terraform init
   terraform plan
   terraform apply

3. Verify resources:

   terraform output
Example 2: GitOps Workflow with Multiple Environments

This example demonstrates using Terraform workspaces to manage multiple environments with GitOps practices.

Directory Structure

terraform/
├── main.tf
├── variables.tf
├── outputs.tf
├── backend.tf
└── environments/
    ├── production.tfvars
    ├── staging.tfvars
    └── development.tfvars
variables.tf
Copy
Ask AI
# Target environment name; required (no default).
variable "environment" {
  description = "Environment name (production, staging, development)"
  type        = string

  # Enforce the three names listed in the description so a typo fails at
  # plan time instead of creating a stray environment.
  validation {
    condition     = contains(["production", "staging", "development"], var.environment)
    error_message = "The environment must be one of: production, staging, development."
  }
}
# Primary region; falls back to us-east-1 when not set.
variable "region" {
  type        = string
  default     = "us-east-1"
  description = "Primary region"
}
# Worker ceiling per environment; required (no default).
variable "max_workers" {
  type        = number
  description = "Maximum number of workers per environment"
}
# Log retention period in days; required (no default).
variable "retention_days" {
  type        = number
  description = "Log retention in days"
}
# Runtime used for teams; defaults to claude_code.
variable "team_runtime" {
  description = "Runtime type for teams (default or claude_code)"
  type        = string
  default     = "claude_code"

  # Enforce the two runtimes named in the description so an unsupported value
  # fails at plan time.
  validation {
    condition     = contains(["default", "claude_code"], var.team_runtime)
    error_message = "The team_runtime must be either default or claude_code."
  }
}
# Model identifier used for agents; defaults to kubiya/claude-sonnet-4.
variable "agent_model" {
  type        = string
  default     = "kubiya/claude-sonnet-4"
  description = "LLM model for agents"
}
# Worker auto-scaling toggle; on by default.
variable "enable_auto_scaling" {
  type        = bool
  default     = true
  description = "Enable auto-scaling for workers"
}
environments/production.tfvars
Copy
Ask AI
# Variable values for the production environment.
environment = "production"
region      = "us-east-1"

# Capacity and retention
max_workers         = 20
enable_auto_scaling = true
retention_days      = 90

# Runtime and model
team_runtime = "claude_code"
agent_model  = "kubiya/claude-sonnet-4"
environments/staging.tfvars
Copy
Ask AI
# Variable values for the staging environment.
environment = "staging"
region      = "us-west-2"

# Capacity and retention
max_workers         = 10
enable_auto_scaling = true
retention_days      = 30

# Runtime and model
team_runtime = "claude_code"
agent_model  = "kubiya/claude-sonnet-4"
Deployment Workflow
Copy
Ask AI
# Select each workspace, creating it on first use: a bare `workspace select`
# exits non-zero when the workspace does not exist yet. Then apply with that
# environment's variable file.

# Production
terraform workspace select production || terraform workspace new production
terraform apply -var-file=environments/production.tfvars

# Staging
terraform workspace select staging || terraform workspace new staging
terraform apply -var-file=environments/staging.tfvars

# Development
terraform workspace select development || terraform workspace new development
terraform apply -var-file=environments/development.tfvars
Example 3: CI/CD Pipeline Integration
GitHub Actions Workflow
Copy
Ask AI
# Plans on every push/PR that touches terraform/**; applies only on pushes to main.
name: Terraform Apply

on:
  push:
    branches: [main]
    paths:
      - 'terraform/**'
  pull_request:
    branches: [main]
    paths:
      - 'terraform/**'

# Least-privilege token: the job only reads the repository.
permissions:
  contents: read

# Queue runs for the same ref instead of running them in parallel, so two
# applies from main do not operate on the state at the same time.
concurrency:
  group: terraform-${{ github.ref }}
  cancel-in-progress: false

jobs:
  terraform:
    runs-on: ubuntu-latest
    environment: production
    defaults:
      run:
        working-directory: terraform
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.5.0

      # -recursive also checks nested directories (e.g. modules/).
      - name: Terraform Format Check
        run: terraform fmt -check -recursive

      # -input=false makes Terraform fail fast instead of waiting on a prompt
      # that nobody can answer in CI.
      - name: Terraform Init
        run: terraform init -input=false

      - name: Terraform Validate
        run: terraform validate

      - name: Terraform Plan
        env:
          KUBIYA_CONTROL_PLANE_API_KEY: ${{ secrets.KUBIYA_API_KEY }}
          KUBIYA_CONTROL_PLANE_BASE_URL: ${{ secrets.KUBIYA_BASE_URL }}
        run: terraform plan -input=false -out=tfplan

      # Applies exactly the plan file produced by the previous step.
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        env:
          KUBIYA_CONTROL_PLANE_API_KEY: ${{ secrets.KUBIYA_API_KEY }}
          KUBIYA_CONTROL_PLANE_BASE_URL: ${{ secrets.KUBIYA_BASE_URL }}
        run: terraform apply -input=false -auto-approve tfplan
Example 4: Self-Hosted Control Plane with Custom Network
For self-hosted deployments in private networks:
# The provider block carries no arguments; the example values below are meant
# to be exported as environment variables before running Terraform.
provider "controlplane" {
# Environment variables:
#   KUBIYA_CONTROL_PLANE_API_KEY="kcp_your_api_key"
#   KUBIYA_CONTROL_PLANE_BASE_URL="https://kubiya.internal.company.net"
}
# Self-hosted environment in a private network; the vpc/subnet/sg IDs are
# placeholders to replace with real values.
resource "controlplane_environment" "on_premise" {
  name         = "on-premise"
  display_name = "On-Premise Environment"
  description  = "Self-hosted environment in private network"

  configuration = jsonencode({
    network_mode    = "private"
    security_groups = ["sg-xxxxx"]
    subnet_ids      = ["subnet-xxxxx", "subnet-yyyyy"]
    vpc_id          = "vpc-xxxxx"
  })

  # Proxy-related environment variables declared for executions.
  execution_environment = jsonencode({
    env_vars = {
      HTTPS_PROXY = "http://proxy.internal:8080"
      NO_PROXY    = "localhost,127.0.0.1,.internal"
      PROXY_URL   = "http://proxy.internal:8080"
    }
  })
}
Example 5: Using Data Sources
Look up and reference existing resources:
# Read an already-existing environment by ID ("env-xxxxx" is a placeholder).
data "controlplane_environment" "existing_prod" {
  id = "env-xxxxx"
}
# Read an already-existing team by ID ("team-xxxxx" is a placeholder).
data "controlplane_team" "existing_devops" {
  id = "team-xxxxx"
}
# New agent wired to resources that are looked up rather than created here.
resource "controlplane_agent" "new_agent" {
  name        = "new-deployment-agent"
  description = "New agent using existing team"
  runtime     = "claude_code"
  model_id    = "kubiya/claude-sonnet-4"

  # Team comes from the data source lookup.
  team_id = data.controlplane_team.existing_devops.id

  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.7
  })

  # Environment name comes from the data source lookup.
  configuration = jsonencode({
    environment = data.controlplane_environment.existing_prod.name
  })
}
# Expose selected attributes of the looked-up environment.
output "existing_env_info" {
  value = {
    description = data.controlplane_environment.existing_prod.description
    name        = data.controlplane_environment.existing_prod.name
    tags        = data.controlplane_environment.existing_prod.tags
  }
}
Best Practices Summary
1. State Management
Copy
Ask AI
# backend.tf
# Remote state in S3, encrypted, with a DynamoDB table configured for locking.
terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "company-terraform-state"
    key            = "kubiya/control-plane/terraform.tfstate"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}
2. Secrets Management
Never store secrets in Terraform configurations:
# Use secrets manager
# Assign first, then export: `export VAR=$(cmd)` reports the exit status of
# `export` rather than of the command, so a failed lookup would go unnoticed.
# Quoting keeps the value intact in shells that word-split export arguments.
KUBIYA_CONTROL_PLANE_API_KEY="$(aws secretsmanager get-secret-value \
  --secret-id kubiya/api-key \
  --query SecretString \
  --output text)"
export KUBIYA_CONTROL_PLANE_API_KEY
3. Resource Naming
Use consistent, descriptive naming:
# Shared naming prefix and tag list derived from input variables.
# NOTE(review): var.team_name is not among the variables declared in the
# variables.tf shown earlier in this guide - declare it before using this.
locals {
  common_tags = [
    "environment:${var.environment}",
    "managed-by:terraform",
    "team:${var.team_name}"
  ]

  env_prefix = "${var.environment}-${var.region}"
}
# Naming illustration only: the name is built from the shared prefix and the
# agent's purpose.
# NOTE(review): var.agent_purpose is not among the variables declared in the
# variables.tf shown earlier in this guide - declare it before using this.
resource "controlplane_agent" "example" {
  tags = local.common_tags
  name = "${local.env_prefix}-agent-${var.agent_purpose}"
}
4. Modular Design
Create reusable modules:

modules/
├── agent-team/
│   ├── main.tf
│   ├── variables.tf
│   └── outputs.tf
├── environment/
│   ├── main.tf
│   ├── variables.tf
│   └── outputs.tf
└── job/
    ├── main.tf
    ├── variables.tf
    └── outputs.tf
Additional Resources
- Provider Configuration - Detailed configuration options
- Resources Reference - Complete resource documentation
- GitHub Examples - More examples
- Kubiya API Reference - Control Plane API documentation