Skip to content

ClickHouse System Monitoring Guide

A comprehensive guide for monitoring ClickHouse deployments.

System Health Monitoring

Memory Usage

-- Current memory usage by query
SELECT
    query_id,
    user,
    formatReadableSize(memory_usage) as mem_used,
    formatReadableSize(peak_memory_usage) as peak_mem,
    query
FROM system.processes
ORDER BY memory_usage DESC;

-- Historical memory usage patterns
SELECT
    query_id,
    formatReadableSize(memory_usage) as mem_used,
    formatReadableSize(peak_memory_usage) as peak_mem,
    query_duration_ms,
    query
FROM system.query_log
WHERE type = 'QueryFinish'
  AND event_time >= now() - INTERVAL 1 DAY
ORDER BY memory_usage DESC
LIMIT 10;

CPU Usage

-- Current CPU usage by query
SELECT
    query_id,
    user,
    elapsed,
    read_rows,
    formatReadableSize(read_bytes) as read,
    total_rows_approx,
    query
FROM system.processes
ORDER BY elapsed DESC;

-- CPU usage patterns
SELECT
    query,
    count() as execution_count,
    avg(query_duration_ms) as avg_duration_ms,
    max(query_duration_ms) as max_duration_ms,
    sum(read_rows) as total_rows_read,
    sum(result_rows) as total_rows_returned
FROM system.query_log
WHERE type = 'QueryFinish'
  AND event_time >= now() - INTERVAL 1 DAY
GROUP BY query
ORDER BY avg_duration_ms DESC
LIMIT 10;

Disk Usage

-- Database sizes
SELECT
    name as database,
    formatReadableSize(sum(bytes_on_disk)) as disk_usage,
    sum(rows) as total_rows,
    count() as total_tables
FROM system.tables
GROUP BY database
ORDER BY sum(bytes_on_disk) DESC;

-- Table sizes
SELECT
    database,
    table,
    formatReadableSize(bytes_on_disk) as disk_usage,
    rows as total_rows,
    formatReadableSize(bytes_on_disk / rows) as avg_row_size
FROM system.tables
WHERE database NOT IN ('system')
ORDER BY bytes_on_disk DESC
LIMIT 10;

-- Part sizes and distribution
SELECT
    database,
    table,
    partition,
    formatReadableSize(sum(bytes_on_disk)) as part_size,
    count() as part_count,
    sum(rows) as rows
FROM system.parts
WHERE active
GROUP BY database, table, partition
ORDER BY sum(bytes_on_disk) DESC
LIMIT 10;

Query Performance Monitoring

Slow Queries

-- Recent slow queries
SELECT
    query_id,
    user,
    query_duration_ms,
    read_rows,
    formatReadableSize(read_bytes) as data_read,
    result_rows,
    formatReadableSize(memory_usage) as memory,
    query
FROM system.query_log
WHERE type = 'QueryFinish'
  AND query_duration_ms > 1000  -- Queries taking more than 1 second
  AND event_time >= now() - INTERVAL 1 DAY
  AND LOWER(query) NOT LIKE '%system%'  -- Filter out system queries
ORDER BY query_duration_ms DESC
LIMIT 10;

-- Query patterns by duration
SELECT
    user,
    count() as query_count,
    avg(query_duration_ms) as avg_duration_ms,
    max(query_duration_ms) as max_duration_ms
FROM system.query_log
WHERE type = 'QueryFinish'
  AND event_time >= now() - INTERVAL 1 DAY
GROUP BY user
ORDER BY avg_duration_ms DESC;

Query Errors

-- Recent query errors
SELECT
    user,
    query_id,
    event_time,
    query_duration_ms,
    exception,
    query
FROM system.query_log
WHERE type = 'ExceptionWhileProcessing'
  AND event_time >= now() - INTERVAL 1 DAY
ORDER BY event_time DESC;

-- Error patterns
SELECT
    exception,
    count() as error_count,
    any(query) as sample_query
FROM system.query_log
WHERE type = 'ExceptionWhileProcessing'
  AND event_time >= now() - INTERVAL 1 DAY
GROUP BY exception
ORDER BY error_count DESC;

System Configuration

Settings Check

-- Current system settings
SELECT
    name,
    value,
    changed,
    description
FROM system.settings
WHERE changed = 1
ORDER BY name;

-- User settings
SELECT
    *
FROM system.users
FORMAT Vertical;

Replication Status

-- Replication delays
SELECT
    database,
    table,
    is_leader,
    total_replicas,
    active_replicas,
    formatReadableSize(queue_size) as queue_size,
    absolute_delay
FROM system.replicas
ORDER BY absolute_delay DESC;

-- Replication queue
SELECT
    database,
    table,
    count() as queue_entries,
    max(create_time) as latest_entry
FROM system.replication_queue
GROUP BY database, table
ORDER BY queue_entries DESC;