# Version constraints for the Terraform CLI and the control-plane provider.
terraform {
  required_version = ">= 1.0"

  required_providers {
    # Local provider name "controlplane" backs every controlplane_* resource.
    controlplane = {
      version = "~> 1.0"
      source  = "kubiya/control-plane"
    }
  }
}
provider "controlplane" {
  # No arguments are set in this block; configuration is supplied through
  # environment variables:
  #   KUBIYA_CONTROL_PLANE_API_KEY  - required
  #   KUBIYA_CONTROL_PLANE_BASE_URL - optional, for self-hosted
}
# ============================================================================
# ENVIRONMENTS
# ============================================================================
# Production environment. It is the only environment in this file that also
# sets execution_environment, and the worker queues below attach to it.
resource "controlplane_environment" "production" {
  name         = "production"
  display_name = "Production Environment"
  description  = "Production environment for AI agents"
  tags         = ["production", "managed-by-terraform"]

  # Passed to the provider as a JSON-encoded string.
  configuration = jsonencode({
    auto_scaling   = true
    max_workers    = 20
    region         = "us-east-1"
    retention_days = 90
  })

  # Environment variables, also JSON-encoded.
  execution_environment = jsonencode({
    env_vars = {
      APP_ENV   = "production"
      LOG_LEVEL = "info"
    }
  })
}
# Staging environment: us-west-2, 10 workers, 30 retention days.
resource "controlplane_environment" "staging" {
  name         = "staging"
  display_name = "Staging Environment"
  description  = "Staging environment for testing"
  tags         = ["staging", "managed-by-terraform"]

  # Passed to the provider as a JSON-encoded string.
  configuration = jsonencode({
    auto_scaling   = true
    max_workers    = 10
    region         = "us-west-2"
    retention_days = 30
  })
}
# Development environment: the smallest of the three (5 workers, 7 retention
# days) and the only one with auto_scaling turned off.
resource "controlplane_environment" "development" {
  name         = "development"
  display_name = "Development Environment"
  description  = "Development environment"
  tags         = ["development", "managed-by-terraform"]

  # Passed to the provider as a JSON-encoded string.
  configuration = jsonencode({
    auto_scaling   = false
    max_workers    = 5
    region         = "us-west-2"
    retention_days = 7
  })
}
# ============================================================================
# PROJECTS
# ============================================================================
# Private project for platform engineering, keyed "PLAT".
resource "controlplane_project" "platform" {
  key         = "PLAT"
  name        = "platform-engineering"
  description = "Platform engineering and infrastructure"
  goals       = "Manage and automate platform infrastructure"
  visibility  = "private"

  # Ownership metadata, JSON-encoded.
  metadata = jsonencode({
    cost_center = "engineering"
    owner       = "platform-team"
  })
}
# Private project for security and compliance automation, keyed "SEC".
resource "controlplane_project" "security" {
  key         = "SEC"
  name        = "security-compliance"
  description = "Security and compliance automation"
  goals       = "Ensure security and compliance across infrastructure"
  visibility  = "private"

  # Ownership metadata, JSON-encoded.
  metadata = jsonencode({
    cost_center = "security"
    owner       = "security-team"
  })
}
# ============================================================================
# SKILLS
# ============================================================================
# Shell skill with an explicit command list and /app as working directory.
resource "controlplane_skill" "shell_operations" {
  name        = "shell-operations"
  type        = "shell"
  enabled     = true
  description = "Shell command execution for infrastructure operations"

  configuration = jsonencode({
    working_dir = "/app"
    timeout     = 600 # unit not shown here; presumably seconds - confirm against the provider docs
    allowed_commands = [
      "kubectl",
      "helm",
      "aws",
      "gcloud",
      "terraform",
      "ansible",
    ]
  })
}
# File-system skill limited to three directories under /app.
resource "controlplane_skill" "filesystem_access" {
  name        = "filesystem-access"
  type        = "file_system"
  enabled     = true
  description = "File system operations"

  configuration = jsonencode({
    allowed_paths = [
      "/app/configs",
      "/app/data",
      "/app/logs",
    ]
    operations    = ["read", "write", "list", "delete"]
    max_file_size = 52428800 # 50 MB = 50 * 1024 * 1024
  })
}
# Docker skill with three listed registries and a cap of 20 containers.
resource "controlplane_skill" "docker_operations" {
  name        = "docker-operations"
  type        = "docker"
  enabled     = true
  description = "Docker container management"

  configuration = jsonencode({
    allowed_registries = [
      "docker.io",
      "gcr.io",
      "ghcr.io",
    ]
    max_containers = 20
    network_mode   = "bridge"
  })
}
# ============================================================================
# POLICIES
# ============================================================================
# Production guardrails written as Rego deny rules in package kubiya.security:
#   * delete   - denied unless at least 2 approvals are present
#   * deploy   - denied unless input.mfa_verified is set
#   * modify   - on a database, denied unless input.maintenance_window is set
#
# The approvals check is fail-closed. The earlier form
# `count(input.approvals) < 2` evaluates to undefined (not true) when
# `approvals` is absent or null, so the deny rule never fired and such a
# delete request was let through. The check now goes through the helper rule
# `has_required_approvals` and is negated with `not`, so a missing or
# malformed approvals field is denied.
resource "controlplane_policy" "production_security" {
  name        = "production-security"
  description = "Security policy for production environment"
  enabled     = true
  tags        = ["security", "production", "compliance"]

  # Every heredoc line carries the same indent so that "<<-" strips it fully.
  policy_content = <<-EOT
  package kubiya.security
  # True only when the request carries at least 2 approvals
  has_required_approvals {
  count(input.approvals) >= 2
  }
  # Deny destructive operations without approval
  deny[msg] {
  input.operation = "delete"
  input.environment = "production"
  not has_required_approvals
  msg := "Delete operations in production require at least 2 approvals"
  }
  # Require MFA for sensitive operations
  deny[msg] {
  input.operation = "deploy"
  input.environment = "production"
  not input.mfa_verified
  msg := "Production deployments require MFA verification"
  }
  # Restrict resource modifications
  deny[msg] {
  input.operation = "modify"
  input.environment = "production"
  input.resource_type = "database"
  not input.maintenance_window
  msg := "Database modifications in production must occur during maintenance windows"
  }
  EOT
}
# Cost guardrails written as Rego deny rules in package kubiya.cost:
# an instance-type rule, a mandatory cost_center tag, and budget approval for
# resources with estimated_monthly_cost above 10000.
#
# NOTE(review): the instance rule matches only the literal type
# "x2.32xlarge", while its message says the maximum allowed is m5.2xlarge.
# Other oversized types are not denied by this rule - confirm the intended cap.
resource "controlplane_policy" "cost_control" {
  name        = "cost-control"
  description = "Cost control and resource limits policy"
  enabled     = true
  tags        = ["cost", "governance", "finops"]

  # Every heredoc line carries the same indent so that "<<-" strips it fully.
  policy_content = <<-EOT
  package kubiya.cost
  # Limit instance sizes
  deny[msg] {
  input.action = "create_instance"
  input.instance_type = "x2.32xlarge"
  msg := "Instance type too large, maximum allowed is m5.2xlarge"
  }
  # Require cost tags
  deny[msg] {
  input.action = "create_resource"
  not input.tags.cost_center
  msg := "All resources must have a cost_center tag"
  }
  # Budget limits
  deny[msg] {
  input.action = "create_resource"
  input.estimated_monthly_cost > 10000
  not input.budget_approved
  msg := "Resources exceeding $10,000/month require budget approval"
  }
  EOT
}
# ============================================================================
# TEAMS
# ============================================================================
# DevOps team; the deployment agent below is attached to it.
resource "controlplane_team" "devops" {
  name        = "devops-team"
  description = "DevOps and platform engineering team"

  # Runtime is either "default" (Agno) or "claude_code" (Claude Code SDK).
  runtime = "claude_code"

  capabilities = [
    "deployment",
    "monitoring",
    "incident_response",
  ]

  configuration = jsonencode({
    alert_on_error    = true
    enable_monitoring = true
    max_agents        = 15
    slack_channel     = "#devops-agents"
  })
}
# SRE team; the monitoring and incident-response agents are attached to it.
resource "controlplane_team" "sre" {
  name        = "sre-team"
  description = "Site reliability engineering team"
  runtime     = "claude_code"

  capabilities = [
    "monitoring",
    "incident_response",
    "performance_optimization",
  ]

  configuration = jsonencode({
    enable_monitoring = true
    max_agents        = 10
    slack_channel     = "#sre-agents"
  })
}
# Security team; the only team in this file on the "default" runtime.
resource "controlplane_team" "security" {
  name        = "security-team"
  description = "Security and compliance team"
  runtime     = "default"

  capabilities = [
    "security_scanning",
    "compliance_checking",
    "threat_detection",
  ]

  configuration = jsonencode({
    audit_logging = true
    max_agents    = 5
    slack_channel = "#security-agents"
  })
}
# ============================================================================
# AGENTS
# ============================================================================
# Deployment agent on the devops team; used by the deployment webhook job.
resource "controlplane_agent" "deployment_agent" {
  name        = "production-deployer"
  description = "AI agent for production deployments"
  team_id     = controlplane_team.devops.id
  runtime     = "claude_code"
  model_id    = "kubiya/claude-sonnet-4"

  capabilities = [
    "kubernetes_deploy",
    "helm_deploy",
    "rollback",
  ]

  # Model sampling settings, JSON-encoded.
  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.3
  })

  # Agent settings, JSON-encoded.
  configuration = jsonencode({
    approval_needed = true
    capabilities    = ["kubernetes", "helm", "terraform"]
    max_retries     = 3
    permissions     = ["read", "execute", "deploy"]
    timeout         = 900
  })
}
# Monitoring agent on the SRE team; used by the daily health-check job.
resource "controlplane_agent" "monitoring_agent" {
  name        = "monitoring-assistant"
  description = "AI agent for monitoring and alerting"
  team_id     = controlplane_team.sre.id
  runtime     = "claude_code"
  model_id    = "kubiya/claude-sonnet-4"

  capabilities = [
    "metrics_collection",
    "alerting",
    "log_analysis",
  ]

  # Model sampling settings, JSON-encoded.
  llm_config = jsonencode({
    max_tokens  = 3000
    temperature = 0.5
  })

  # Agent settings, JSON-encoded.
  configuration = jsonencode({
    alert_channels = ["slack", "pagerduty"]
    capabilities   = ["metrics", "logging", "tracing"]
    check_interval = 60
  })
}
# Incident-response agent on the SRE team; auto_remediation is switched off.
resource "controlplane_agent" "incident_responder" {
  name        = "incident-responder"
  description = "AI agent for incident response"
  team_id     = controlplane_team.sre.id
  runtime     = "claude_code"
  model_id    = "kubiya/claude-sonnet-4"

  capabilities = [
    "incident_management",
    "root_cause_analysis",
    "remediation",
  ]

  # Model sampling settings, JSON-encoded.
  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.4
  })

  # Agent settings, JSON-encoded.
  configuration = jsonencode({
    auto_remediation   = false
    capabilities       = ["diagnostics", "remediation", "communication"]
    escalation_timeout = 600
  })
}
# Security-scanning agent on the security team, using the "default" runtime.
resource "controlplane_agent" "security_scanner" {
  name        = "security-scanner"
  description = "AI agent for security scanning"
  team_id     = controlplane_team.security.id
  runtime     = "default"
  model_id    = "kubiya/claude-sonnet-4"

  capabilities = [
    "security_scanning",
    "compliance_reporting",
  ]

  # Model sampling settings, JSON-encoded.
  llm_config = jsonencode({
    max_tokens  = 4096
    temperature = 0.2
  })

  # Agent settings, JSON-encoded.
  configuration = jsonencode({
    capabilities    = ["vulnerability_scanning", "compliance_checking"]
    scan_frequency  = "daily"
    severity_levels = ["critical", "high", "medium"]
  })
}
# ============================================================================
# JOBS
# ============================================================================
# Cron-triggered job that hands a health-check prompt to the monitoring agent.
resource "controlplane_job" "daily_health_check" {
  name        = "daily-health-check"
  description = "Daily health check at 9am UTC"
  enabled     = true

  # Schedule: minute 0 of hour 9, every day, in UTC.
  trigger_type  = "cron"
  cron_schedule = "0 9 * * *"
  cron_timezone = "UTC"

  # Target: a fixed agent rather than one chosen at run time.
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.monitoring_agent.id

  prompt_template = "Run comprehensive health check for all production services"
  system_prompt   = "Check health of all production services, databases, and infrastructure. Report any issues immediately."

  executor_type = "auto"
  execution_env_vars = {
    ALERT_ON_FAILURE = "true"
    CHECK_TYPE       = "comprehensive"
    INCLUDE_METRICS  = "true"
  }
}
# Webhook-triggered job that hands deployment requests to the deployment
# agent and runs in the production environment. Its webhook_url attribute is
# exported by the deployment_webhook_url output.
resource "controlplane_job" "deployment_webhook" {
  name         = "deployment-webhook"
  description  = "Handle deployment webhook events"
  enabled      = true
  trigger_type = "webhook"

  # Target: a fixed agent rather than one chosen at run time.
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.deployment_agent.id

  # The {{...}} placeholders are plain text to Terraform.
  prompt_template = "Deploy {{service_name}} version {{version}} to {{environment}}"
  system_prompt   = "Process deployment request. Verify prerequisites, execute deployment, and confirm success."

  executor_type    = "environment"
  environment_name = controlplane_environment.production.name

  config = jsonencode({
    retry_policy = {
      backoff      = "exponential"
      max_attempts = 3
    }
    timeout = 1800 # 30 minutes
  })
}
# Cron-triggered job that hands a vulnerability-scan prompt to the security
# scanner agent.
resource "controlplane_job" "security_scan" {
  name        = "security-scan"
  description = "Daily security vulnerability scan"
  enabled     = true

  # Schedule: minute 0 of hour 2, every day, in UTC.
  trigger_type  = "cron"
  cron_schedule = "0 2 * * *"
  cron_timezone = "UTC"

  # Target: a fixed agent rather than one chosen at run time.
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.security_scanner.id

  prompt_template = "Run security vulnerability scan for all production infrastructure"
  system_prompt   = "Perform comprehensive security scan. Report vulnerabilities by severity."

  executor_type     = "auto"
  execution_secrets = ["security_scanner_token"]
  execution_env_vars = {
    REPORT_FORMAT   = "json"
    SCAN_TYPE       = "full"
    SEVERITY_FILTER = "high,critical"
  }
}
# Manually triggered job that hands an incident to the incident-responder
# agent.
resource "controlplane_job" "incident_response_manual" {
  name         = "incident-response"
  description  = "Manual incident response job"
  enabled      = true
  trigger_type = "manual"

  # Target: a fixed agent rather than one chosen at run time.
  planning_mode = "predefined_agent"
  entity_type   = "agent"
  entity_id     = controlplane_agent.incident_responder.id

  # The {{...}} placeholders are plain text to Terraform.
  prompt_template = "Handle incident: {{incident_id}} - {{description}}"
  system_prompt   = "Coordinate incident response. Diagnose issue, implement remediation, and communicate status."

  executor_type = "auto"
  execution_secrets = [
    "pagerduty_token",
    "slack_webhook",
  ]
}
# ============================================================================
# WORKER QUEUES
# ============================================================================
# Primary worker queue, attached to the production environment.
resource "controlplane_worker_queue" "production_primary" {
  environment_id = controlplane_environment.production.id

  name         = "production-primary"
  display_name = "Production Primary Queue"
  description  = "Primary worker queue for production workloads"
  tags         = ["production", "primary", "high-priority"]

  status             = "active"
  max_workers        = 20
  heartbeat_interval = 60

  settings = {
    priority = "high"
    region   = "us-east-1"
    tier     = "production"
  }
}
# Batch worker queue, attached to the production environment. Compared with
# the primary queue it has half the workers and double the heartbeat interval.
resource "controlplane_worker_queue" "production_batch" {
  environment_id = controlplane_environment.production.id

  name         = "production-batch"
  display_name = "Production Batch Queue"
  description  = "Worker queue for batch jobs and scheduled tasks"
  tags         = ["production", "batch", "normal-priority"]

  status             = "active"
  max_workers        = 10
  heartbeat_interval = 120

  settings = {
    priority = "normal"
    region   = "us-east-1"
    tier     = "production"
  }
}
# ============================================================================
# OUTPUTS
# ============================================================================
# Map from environment label to the id of the matching environment resource.
output "environment_ids" {
  description = "Environment IDs"

  value = {
    development = controlplane_environment.development.id
    production  = controlplane_environment.production.id
    staging     = controlplane_environment.staging.id
  }
}
# Map from team label to the id of the matching team resource.
output "team_ids" {
  description = "Team IDs"

  value = {
    devops   = controlplane_team.devops.id
    security = controlplane_team.security.id
    sre      = controlplane_team.sre.id
  }
}
# Map from agent label to the id of the matching agent resource. The keys
# "deployer" and "monitor" are shorter than the resource names they refer to.
output "agent_ids" {
  description = "Agent IDs"

  value = {
    deployer           = controlplane_agent.deployment_agent.id
    incident_responder = controlplane_agent.incident_responder.id
    monitor            = controlplane_agent.monitoring_agent.id
    security_scanner   = controlplane_agent.security_scanner.id
  }
}
# webhook_url attribute of the deployment webhook job. Marked sensitive so
# Terraform redacts it in plan/apply output.
output "deployment_webhook_url" {
  description = "Webhook URL for deployments"
  sensitive   = true
  value       = controlplane_job.deployment_webhook.webhook_url
}
# task_queue_name attribute of each production worker queue.
output "worker_queue_names" {
  description = "Worker queue task names"

  value = {
    batch   = controlplane_worker_queue.production_batch.task_queue_name
    primary = controlplane_worker_queue.production_primary.task_queue_name
  }
}