Datasets provide logical organization for cognitive memories and knowledge. Create datasets with different scopes and permissions to organize information by team, project, or use case.

Overview

Datasets are containers for storing and organizing cognitive memories:
  • Create Datasets: Set up new logical groupings for memories
  • List Datasets: View all accessible datasets
  • Get Details: Access dataset configuration and metadata
  • Delete Datasets: Remove datasets and their contents
  • Check Status: Monitor dataset processing status
Datasets support different scopes (organization, role, user) to control access and visibility across your team.

Quick Start

from kubiya import ControlPlaneClient

# Initialize the client
client = ControlPlaneClient(api_key="your-api-key")

# Create a dataset
dataset = client.datasets.create_dataset(
    name="production-knowledge",
    description="Production environment operational knowledge",
    scope="org"
)

print(f"Created dataset: {dataset['id']}")

# List all datasets
datasets = client.datasets.list_datasets()
for ds in datasets:
    print(f"Dataset: {ds['name']} ({ds['scope']})")

# Get dataset details
details = client.datasets.get_dataset(dataset_id=dataset['id'])
print(f"Dataset created by: {details['created_by']}")

Core Concepts

Dataset Scopes

Control who can access the dataset:
  • org: Available to all users in the organization
  • role: Limited to specific roles (requires allowed_roles)
  • user: Personal dataset, only accessible to creator

Dataset Lifecycle

  1. Create: Initialize empty dataset with name and scope
  2. Populate: Store memories into the dataset
  3. Query: Search and recall memories from the dataset
  4. Monitor: Check processing status for large datasets
  5. Delete: Remove dataset and all its memories
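A minimal end-to-end sketch of this lifecycle follows. The client.memories.store_memory and client.memories.recall calls are hypothetical placeholders for whatever memory-storage API populates and queries the dataset; they are shown commented out and may not match the actual SDK:

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# 1. Create: initialize an empty dataset
dataset = client.datasets.create_dataset(name="lifecycle-demo", scope="user")

# 2. Populate / 3. Query: hypothetical memory-storage calls, for illustration only
# client.memories.store_memory(dataset_id=dataset['id'], content="...")
# results = client.memories.recall(dataset_id=dataset['id'], query="...")

# 4. Monitor: confirm the dataset has finished indexing
status = client.datasets.get_dataset_status(dataset_id=dataset['id'])
print(f"Status: {status['status']}")

# 5. Delete: remove the dataset and everything in it
client.datasets.delete_dataset(dataset_id=dataset['id'])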

Basic Usage

Create Dataset

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# Organization-wide dataset
org_dataset = client.datasets.create_dataset(
    name="incident-history",
    description="Historical incident records and resolutions",
    scope="org"
)

print(f"Created: {org_dataset['name']}")
print(f"ID: {org_dataset['id']}")
print(f"Scope: {org_dataset['scope']}")
Example response:
{
  "id": "dataset-abc123def456",
  "name": "incident-history",
  "description": "Historical incident records and resolutions",
  "scope": "org",
  "organization_id": "org-xyz789",
  "created_by": "[email protected]",
  "created_at": "2024-12-10T14:30:00Z"
}

Create Role-Scoped Dataset

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# Team-specific dataset
team_dataset = client.datasets.create_dataset(
    name="devops-runbooks",
    description="DevOps team operational runbooks",
    scope="role",
    allowed_roles=["devops-team", "sre-team"]
)

print(f"Created team dataset: {team_dataset['id']}")
print(f"Accessible by roles: {team_dataset.get('allowed_roles', [])}")

Create User-Scoped Dataset

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# Personal dataset
personal_dataset = client.datasets.create_dataset(
    name="my-notes",
    description="Personal operational notes",
    scope="user"
)

print(f"Created personal dataset: {personal_dataset['id']}")

List Datasets

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# List all accessible datasets
datasets = client.datasets.list_datasets()

print(f"Total datasets: {len(datasets)}\n")

for dataset in datasets:
    print(f"Name: {dataset['name']}")
    print(f"Scope: {dataset['scope']}")
    print(f"Created: {dataset['created_at']}")
    print(f"Description: {dataset.get('description', 'N/A')}")
    print("---")

Get Dataset Details

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# Get specific dataset
dataset = client.datasets.get_dataset(dataset_id="dataset-abc123def456")

print(f"Dataset: {dataset['name']}")
print(f"ID: {dataset['id']}")
print(f"Scope: {dataset['scope']}")
print(f"Organization: {dataset['organization_id']}")
print(f"Created by: {dataset['created_by']}")
print(f"Created at: {dataset['created_at']}")

if dataset.get('allowed_roles'):
    print(f"Allowed roles: {', '.join(dataset['allowed_roles'])}")

Check Dataset Status

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# Get processing status
status = client.datasets.get_dataset_status(dataset_id="dataset-abc123def456")

print(f"Status: {status['status']}")
print(f"Progress: {status.get('progress', 0)}%")
if status.get('message'):
    print(f"Message: {status['message']}")
Example response:
{
  "id": "dataset-abc123def456",
  "status": "ready",
  "progress": 100,
  "message": "Dataset fully indexed and ready for queries"
}

Delete Dataset

from kubiya import ControlPlaneClient

client = ControlPlaneClient(api_key="your-api-key")

# Delete dataset and all its memories
success = client.datasets.delete_dataset(dataset_id="dataset-abc123def456")

if success:
    print("✅ Dataset deleted successfully")
else:
    print("❌ Failed to delete dataset")
Warning: Deleting a dataset permanently removes all memories stored in it. This operation cannot be undone.
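Because deletion is irreversible, a guarded wrapper that makes the caller type the dataset's name back can prevent accidents. A minimal sketch; the confirmation flow is a suggestion, not an SDK feature:

from kubiya import ControlPlaneClient

def delete_dataset_with_confirmation(client: ControlPlaneClient, dataset_id: str) -> bool:
    """Show the dataset's name and require it to be typed back before deleting."""
    dataset = client.datasets.get_dataset(dataset_id=dataset_id)
    answer = input(f"Delete '{dataset['name']}' and ALL its memories? Type the name to confirm: ")
    if answer != dataset['name']:
        print("Aborted.")
        return False
    return client.datasets.delete_dataset(dataset_id=dataset_id)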

Practical Examples

1. Organize by Environment

Create datasets for different environments:
from kubiya import ControlPlaneClient

def setup_environment_datasets(client: ControlPlaneClient):
    """Create datasets for each environment."""

    environments = ["production", "staging", "development"]
    created = []

    for env in environments:
        dataset = client.datasets.create_dataset(
            name=f"{env}-knowledge",
            description=f"Operational knowledge for {env} environment",
            scope="org"
        )

        print(f"✅ Created {env} dataset: {dataset['id']}")
        created.append(dataset)

    return created

# Usage
client = ControlPlaneClient(api_key="your-api-key")
env_datasets = setup_environment_datasets(client)

2. Team-Based Organization

Create datasets for different teams:
from kubiya import ControlPlaneClient

def setup_team_datasets(client: ControlPlaneClient):
    """Create role-scoped datasets for teams."""

    teams = [
        {
            "name": "backend-runbooks",
            "description": "Backend team operational runbooks",
            "roles": ["backend-team", "sre-team"]
        },
        {
            "name": "frontend-guidelines",
            "description": "Frontend development guidelines",
            "roles": ["frontend-team"]
        },
        {
            "name": "security-policies",
            "description": "Security policies and procedures",
            "roles": ["security-team", "compliance-team"]
        }
    ]

    created = []

    for team in teams:
        dataset = client.datasets.create_dataset(
            name=team["name"],
            description=team["description"],
            scope="role",
            allowed_roles=team["roles"]
        )

        print(f"✅ Created {team['name']} for roles: {', '.join(team['roles'])}")
        created.append(dataset)

    return created

# Usage
client = ControlPlaneClient(api_key="your-api-key")
team_datasets = setup_team_datasets(client)

3. Dataset Inventory

Generate a comprehensive dataset inventory:
from kubiya import ControlPlaneClient
from collections import defaultdict

def generate_dataset_inventory(client: ControlPlaneClient):
    """Generate comprehensive dataset inventory report."""

    datasets = client.datasets.list_datasets()

    # Group by scope
    by_scope = defaultdict(list)
    for dataset in datasets:
        by_scope[dataset['scope']].append(dataset)

    print("=== Dataset Inventory ===\n")
    print(f"Total Datasets: {len(datasets)}\n")

    for scope, ds_list in by_scope.items():
        print(f"{scope.upper()} SCOPE ({len(ds_list)} datasets):")
        for ds in ds_list:
            print(f"  - {ds['name']}")
            print(f"    ID: {ds['id']}")
            print(f"    Created: {ds['created_at']}")
            if ds.get('allowed_roles'):
                print(f"    Roles: {', '.join(ds['allowed_roles'])}")
        print()

    return {
        "total": len(datasets),
        "by_scope": {scope: len(ds_list) for scope, ds_list in by_scope.items()},
        "datasets": datasets
    }

# Usage
client = ControlPlaneClient(api_key="your-api-key")
inventory = generate_dataset_inventory(client)

4. Dataset Cleanup

Clean up unused or old datasets:
from kubiya import ControlPlaneClient
from datetime import datetime, timezone

def cleanup_old_datasets(
    client: ControlPlaneClient,
    days_threshold: int = 90,
    dry_run: bool = True
):
    """Delete datasets older than threshold with no recent activity."""

    datasets = client.datasets.list_datasets()
    now = datetime.utcnow()

    to_delete = []

    for dataset in datasets:
        created_at = datetime.fromisoformat(dataset['created_at'].replace('Z', '+00:00'))
        age_days = (now - created_at.replace(tzinfo=None)).days

        # Simple heuristic: delete if old and name suggests temporary
        if age_days > days_threshold and any(word in dataset['name'].lower() for word in ['temp', 'test', 'tmp']):
            to_delete.append(dataset)

    print(f"=== Dataset Cleanup ===")
    print(f"Threshold: {days_threshold} days")
    print(f"Found {len(to_delete)} datasets to delete")

    if not dry_run:
        for dataset in to_delete:
            success = client.datasets.delete_dataset(dataset_id=dataset['id'])
            if success:
                print(f"✅ Deleted: {dataset['name']} ({dataset['id']})")
            else:
                print(f"❌ Failed to delete: {dataset['name']}")
    else:
        print("\nDRY RUN - Would delete:")
        for dataset in to_delete:
            print(f"  - {dataset['name']} (age: {(now - datetime.fromisoformat(dataset['created_at'].replace('Z', '+00:00')).replace(tzinfo=None)).days} days)")

    return to_delete

# Usage
client = ControlPlaneClient(api_key="your-api-key")

# Dry run first
cleanup_old_datasets(client, days_threshold=90, dry_run=True)

# Actual cleanup
# cleanup_old_datasets(client, days_threshold=90, dry_run=False)

5. Dataset Status Monitor

Monitor dataset processing status:
from kubiya import ControlPlaneClient
import time

def monitor_dataset_status(
    client: ControlPlaneClient,
    dataset_id: str,
    timeout_seconds: int = 300
):
    """Monitor dataset processing until ready or timeout."""

    start_time = time.time()

    print(f"Monitoring dataset {dataset_id}...")

    while time.time() - start_time < timeout_seconds:
        status = client.datasets.get_dataset_status(dataset_id=dataset_id)

        print(f"Status: {status['status']} - Progress: {status.get('progress', 0)}%")

        if status['status'] == 'ready':
            print("✅ Dataset is ready!")
            return status

        if status['status'] == 'error':
            print(f"❌ Dataset processing failed: {status.get('message', 'Unknown error')}")
            return status

        time.sleep(5)  # Check every 5 seconds

    print(f"⏱️  Timeout after {timeout_seconds} seconds")
    return None

# Usage
client = ControlPlaneClient(api_key="your-api-key")

# Create dataset
dataset = client.datasets.create_dataset(
    name="large-dataset",
    scope="org"
)

# Monitor until ready
final_status = monitor_dataset_status(client, dataset['id'], timeout_seconds=300)

Error Handling

from kubiya import ControlPlaneClient
from kubiya.resources.exceptions import GraphError

client = ControlPlaneClient(api_key="your-api-key")

# Handle creation errors
try:
    dataset = client.datasets.create_dataset(
        name="my-dataset",
        scope="org"
    )
except GraphError as e:
    if "already exists" in str(e).lower():
        print("Dataset with this name already exists")
    else:
        print(f"Failed to create dataset: {e}")

# Handle not found errors
try:
    dataset = client.datasets.get_dataset(dataset_id="non-existent-id")
except GraphError as e:
    if "not found" in str(e).lower():
        print("Dataset not found")
    else:
        print(f"Error accessing dataset: {e}")

# Handle permission errors
try:
    client.datasets.delete_dataset(dataset_id="restricted-dataset")
except GraphError as e:
    if "permission" in str(e).lower() or "forbidden" in str(e).lower():
        print("Insufficient permissions to delete this dataset")
    else:
        print(f"Delete failed: {e}")

Best Practices

1. Use Descriptive Names

# ❌ BAD - Generic names
client.datasets.create_dataset(name="data", scope="org")
client.datasets.create_dataset(name="dataset1", scope="org")

# ✅ GOOD - Descriptive names
client.datasets.create_dataset(name="incident-response-runbooks", scope="org")
client.datasets.create_dataset(name="customer-success-playbooks", scope="role")

2. Choose Appropriate Scope

# Organization-wide knowledge
client.datasets.create_dataset(
    name="company-policies",
    scope="org"  # Everyone can access
)

# Team-specific knowledge
client.datasets.create_dataset(
    name="devops-procedures",
    scope="role",
    allowed_roles=["devops", "sre"]  # Only specific teams
)

# Personal notes
client.datasets.create_dataset(
    name="personal-notes",
    scope="user"  # Only you can access
)

3. Add Meaningful Descriptions

dataset = client.datasets.create_dataset(
    name="production-incidents",
    description="Historical production incidents with root causes, resolutions, and preventive measures. Updated after each incident closure.",
    scope="org"
)

4. Monitor Large Datasets

# For datasets that will contain large amounts of data
dataset = client.datasets.create_dataset(name="large-logs", scope="org")

# Store data...

# Check status before querying
status = client.datasets.get_dataset_status(dataset_id=dataset['id'])
if status['status'] != 'ready':
    print(f"Dataset still processing: {status['progress']}%")

API Reference

Methods

Method               | Description           | Parameters                               | Returns
create_dataset()     | Create new dataset    | name, description, scope, allowed_roles | Dict with dataset details
list_datasets()      | List all datasets     | None                                     | List[Dict]
get_dataset()        | Get dataset details   | dataset_id: str                          | Dict
delete_dataset()     | Delete dataset        | dataset_id: str                          | bool
get_dataset_status() | Get processing status | dataset_id: str                          | Dict

Dataset Object Structure

{
    "id": str,
    "name": str,
    "description": str,
    "scope": str,  # "org", "role", or "user"
    "organization_id": str,
    "created_by": str,
    "created_at": str,
    "allowed_roles": List[str]  # Only for scope="role"
}

Dataset Status Structure

{
    "id": str,
    "status": str,  # "ready", "processing", "error"
    "progress": int,  # 0-100
    "message": str  # Status message
}
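For static type checking, these structures can be mirrored as TypedDicts. An illustrative sketch, not types shipped with the SDK:

from typing import List
from typing_extensions import TypedDict, NotRequired  # available in typing on Python 3.11+

class Dataset(TypedDict):
    id: str
    name: str
    description: str
    scope: str  # "org", "role", or "user"
    organization_id: str
    created_by: str
    created_at: str
    allowed_roles: NotRequired[List[str]]  # only present for scope="role"

class DatasetStatus(TypedDict):
    id: str
    status: str    # "ready", "processing", "error"
    progress: int  # 0-100
    message: str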

Next Steps