General performance

Track MOVE PARTITION execution time

Useful when diagnosing Clickhouse Keeper issues or observing DDL query degradation.

SELECT
  toStartOfHour(event_time) AS hour,
  round(avg(query_duration_ms)/1000,2) AS avg_execution_time_s,
  round(quantile(.5)(query_duration_ms/1000),2) as p50,
  round(quantile(.99)(query_duration_ms/1000),2) as p99,
  round(quantile(.999)(query_duration_ms/1000),2) as p999,
  round(min(query_duration_ms/1000),2) as min,
  round(max(query_duration_ms/1000),2) as max,
  count() AS query_count
FROM clusterAllReplicas('default', merge(system, 'query_log*'))
WHERE
  (query LIKE '%MOVE PARTITION%')
  AND type = 'QueryFinish'
  AND event_date >= '2025-08-01'
GROUP BY hour
ORDER BY hour ASC;

Most used functions

SELECT
    f AS function,
    count() AS hits
FROM system.query_log
ARRAY JOIN used_functions AS f
WHERE type = 'QueryFinish'
  AND event_time >= now() - INTERVAL 7 DAY
  AND notEmpty(used_functions)
GROUP BY f
ORDER BY hits DESC
LIMIT 50;

Most used columns

SELECT
    col AS column,
    count() AS hits
FROM system.query_log
ARRAY JOIN columns AS col
WHERE type = 'QueryFinish'
  AND query_kind = 'Select'
  AND event_time >= now() - INTERVAL 7 DAY
  AND notEmpty(columns)
GROUP BY col
ORDER BY hits DESC
LIMIT 50;

Columns most commonly used in WHERE clauses

Useful when building or rebuilding schemas for performance. Update the table filter as needed.

WITH
    any(query) AS q,
    any(tables) AS _tables,
    arrayJoin(extractAll(query, '\\b(?:PRE)?WHERE\\s+(.*?)\\s+(?:GROUP BY|ORDER BY|UNION|SETTINGS|FORMAT$)')) AS w,
    any(columns) AS cols,
    arrayFilter(x -> (position(w, extract(x, '\\.(`[^`]+`|[^\\.]+)')) > 0), columns) AS c,
    arrayJoin(c) AS c2
SELECT
    c2,
    count()
FROM system.query_log
WHERE (event_time >= (now() - toIntervalDay(1)))
  AND arrayExists(x -> (x LIKE '%my_table%'), tables)
  AND (query ILIKE 'SELECT%')
GROUP BY c2
ORDER BY count() ASC

A/B test two queries

Compare performance between two query executions by their query IDs.

WITH
    query_id='8c050082-428e-4523-847a-caf29511d6ba' AS first,
    query_id='618e0c55-e21d-4630-97e7-5f82e2475c32' AS second,
    arrayConcat(mapKeys(ProfileEvents), ['query_duration_ms', 'read_rows', 'read_bytes', 'written_rows', 'written_bytes', 'result_rows', 'result_bytes', 'memory_usage', 'normalized_query_hash', 'peak_threads_usage', 'query_cache_usage']) AS metrics,
    arrayConcat(mapValues(ProfileEvents), [query_duration_ms, read_rows, read_bytes, written_rows, written_bytes, result_rows, result_bytes, memory_usage, normalized_query_hash, peak_threads_usage, toUInt64(query_cache_usage)]) AS metrics_values
SELECT
    metrics[i] AS metric,
    anyIf(metrics_values[i], first) AS v1,
    anyIf(metrics_values[i], second) AS v2,
    formatReadableQuantity(v1 - v2)
FROM clusterAllReplicas(default, system.query_log)
ARRAY JOIN arrayEnumerate(metrics) AS i
WHERE (first OR second) AND (type = 2)
GROUP BY metric
HAVING v1 != v2
ORDER BY
    (v2 - v1) / (v1 + v2) DESC,
    v2 DESC,
    metric ASC

Queries consuming most CPU/RAM

The big one. Shows detailed resource consumption by query pattern.

SELECT
    hostName() as host,
    normalized_query_hash,
    min(event_time),
    max(event_time),
    replace(substr(argMax(query, utime), 1, 80), '\n', ' ') AS query,
    argMax(query_id, utime) AS sample_query_id,
    count(),
    sum(query_duration_ms) / 1000 AS QueriesDuration,
    sum(ProfileEvents['RealTimeMicroseconds']) / 1000000 AS RealTime,
    sum(ProfileEvents['UserTimeMicroseconds'] as utime) / 1000000 AS UserTime,
    sum(ProfileEvents['SystemTimeMicroseconds']) / 1000000 AS SystemTime,
    sum(ProfileEvents['DiskReadElapsedMicroseconds']) / 1000000 AS DiskReadTime,
    sum(ProfileEvents['DiskWriteElapsedMicroseconds']) / 1000000 AS DiskWriteTime,
    sum(ProfileEvents['NetworkSendElapsedMicroseconds']) / 1000000 AS NetworkSendTime,
    sum(ProfileEvents['NetworkReceiveElapsedMicroseconds']) / 1000000 AS NetworkReceiveTime,
    sum(ProfileEvents['ZooKeeperWaitMicroseconds']) / 1000000 AS ZooKeeperWaitTime,
    sum(ProfileEvents['OSIOWaitMicroseconds']) / 1000000 AS OSIOWaitTime,
    sum(ProfileEvents['OSCPUWaitMicroseconds']) / 1000000 AS OSCPUWaitTime,
    sum(ProfileEvents['OSCPUVirtualTimeMicroseconds']) / 1000000 AS OSCPUVirtualTime,
    formatReadableSize(sum(ProfileEvents['NetworkReceiveBytes'])) AS NetworkReceiveBytes,
    formatReadableSize(sum(ProfileEvents['NetworkSendBytes'])) AS NetworkSendBytes,
    sum(ProfileEvents['SelectedParts']) as SelectedParts,
    sum(ProfileEvents['SelectedRanges']) as SelectedRanges,
    sum(ProfileEvents['SelectedMarks']) as SelectedMarks,
    sum(ProfileEvents['SelectedRows']) as SelectedRows,
    sum(ProfileEvents['SelectedBytes']) as SelectedBytes,
    sum(ProfileEvents['FileOpen']) as FileOpen,
    sum(ProfileEvents['ZooKeeperTransactions']) as ZooKeeperTransactions,
    formatReadableSize(sum(ProfileEvents['OSReadBytes'])) as OSReadBytesExcludePageCache,
    formatReadableSize(sum(ProfileEvents['OSWriteBytes'])) as OSWriteBytesExcludePageCache,
    formatReadableSize(sum(ProfileEvents['OSReadChars'])) as OSReadBytesIncludePageCache,
    formatReadableSize(sum(ProfileEvents['OSWriteChars'])) as OSWriteCharsIncludePageCache,
    formatReadableSize(quantile(0.97)(memory_usage)) as MemoryUsageQ97,
    sum(read_rows) AS ReadRows,
    formatReadableSize(sum(read_bytes)) AS ReadBytes,
    sum(written_rows) AS WrittenRows,
    formatReadableSize(sum(written_bytes)) AS WrittenBytes,
    sum(result_rows) AS ResultRows,
    formatReadableSize(sum(result_bytes)) AS ResultBytes
FROM clusterAllReplicas('default', system.query_log)
WHERE event_date >= today() AND type in (2,4)
GROUP BY
    GROUPING SETS (
        (normalized_query_hash, host),
        (host),
        ())
ORDER BY OSCPUVirtualTime DESC
LIMIT 30
FORMAT Vertical;

Storage diagnostics

Table size and row counts

SELECT
    name,
    engine,
    formatReadableQuantity(total_rows) AS rows,
    formatReadableSize(total_bytes) AS size,
    active_parts
FROM clusterAllReplicas(default, system.tables)
WHERE database = 'my_database' AND engine != 'MaterializedView'

Check partition locations

See which disk each partition lives on.

SELECT table, partition, disk_name, count() AS num_parts
FROM system.parts
WHERE database = 'my_database' AND table = 'my_table' AND active
GROUP BY ALL
ORDER BY partition;

Get size of parts by disk

SELECT database, table, disk_name, sum(bytes_on_disk) AS bytes
FROM system.parts
WHERE active
GROUP BY database, table, disk_name
ORDER BY database, table;

Most parts per partition

Clickhouse won’t merge parts with combined size greater than 150 GB - check max_bytes_to_merge_at_max_space_in_pool setting.

SELECT
    database,
    table,
    partition,
    sum(rows) AS rows,
    count() AS part_count
FROM system.parts
WHERE (active = 1) AND (table LIKE '%') AND (database LIKE '%')
GROUP BY database, table, partition
ORDER BY part_count DESC
LIMIT 20

S3/R2 diagnostics

Locate parts on remote storage

SELECT disk_name, local_path, remote_path, size
FROM system.remote_data_paths
ORDER BY local_path
LIMIT 100

S3/R2 operation statistics

Useful for estimating external storage costs.

SELECT
 sumIf(value, event = 'S3PutObject') AS S3PutObject,
 sumIf(value, event = 'S3GetObject') AS S3GetObject,
 sumIf(value, event = 'WriteBufferFromS3Bytes') AS WriteBufferFromS3Bytes,
 sumIf(value, event = 'ReadBufferFromS3Bytes') AS ReadBufferFromS3Bytes
FROM system.events;

Local cache

Check if cache is populated

DESCRIBE FILESYSTEM CACHE 'r2_cache' FORMAT Vertical

Cache statistics

SELECT * FROM system.metrics
WHERE metric ILIKE '%filesystemcache%'
ORDER BY metric;

SELECT
    cache_name,
    formatReadableSize(sum(size)) AS size
FROM system.filesystem_cache
GROUP BY ALL

Replication

Remove stuck tasks for empty partitions

Solves a common problem where tasks remain stuck in the replication queue for partitions that were already removed or never contained data.

Warning: Review the generated statements before executing.

SELECT 'ALTER TABLE ' || database || '.' || table || ' DROP PARTITION ID \''|| partition_id || '\';' FROM
(SELECT DISTINCT database, table, extract(new_part_name, '^[^_]+') AS partition_id
 FROM clusterAllReplicas('{cluster}', system.replication_queue)) AS rq
LEFT JOIN
(SELECT database, table, partition_id, sum(rows) AS rows_count, count() AS part_count
 FROM clusterAllReplicas('{cluster}', system.parts)
 WHERE active
 GROUP BY database, table, partition_id) AS p
USING (database, table, partition_id)
WHERE p.rows_count = 0 AND p.part_count = 0
FORMAT TSVRaw;

Check for missing blocks (consistency check)

SELECT
    database,
    table,
    partition_id,
    ranges.1 AS previous_part,
    ranges.2 AS next_part,
    ranges.3 AS previous_block_number,
    ranges.4 AS next_block_number,
    range(toUInt64(previous_block_number + 1), toUInt64(next_block_number)) AS missing_block_numbers
FROM
(
    WITH
        arrayPopFront(groupArray(min_block_number) AS min) AS min_adj,
        arrayPopBack(groupArray(max_block_number) AS max) AS max_adj,
        arrayFilter((x, y, z) -> (y != (z + 1)), arrayZip(arrayPopBack(groupArray(name) AS name_arr), arrayPopFront(name_arr), max_adj, min_adj), min_adj, max_adj) AS missing_ranges
    SELECT
        database,
        table,
        partition_id,
        missing_ranges
    FROM
    (
        SELECT *
        FROM system.parts
        WHERE active AND (table = 'my_table') AND (partition_id = '202108') AND active
        ORDER BY min_block_number ASC
    )
    GROUP BY database, table, partition_id
)
ARRAY JOIN missing_ranges AS ranges

Troubleshooting

Check cluster connectivity

SELECT count()
FROM clusterAllReplicas('{cluster}', cluster('{cluster}', system.one))

Node errors

SELECT
    hostName(),
    *
FROM clusterAllReplicas('{cluster}', system.clusters)
WHERE errors_count > 0

Compare query logs between two time intervals

WITH
    toStartOfInterval(event_time, INTERVAL 5 MINUTE) = '2025-10-30 13:00:00' AS before,
    toStartOfInterval(event_time, INTERVAL 5 MINUTE) = '2025-10-30 15:00:00' AS after
SELECT
    normalized_query_hash,
    anyIf(query, before) AS QueryBefore,
    anyIf(query, after) AS QueryAfter,
    countIf(before) AS CountBefore,
    countIf(after) AS CountAfter,
    sumIf(query_duration_ms, before) / 1000 AS QueriesDurationBefore,
    sumIf(query_duration_ms, after) / 1000 AS QueriesDurationAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')], before) / 1000000 AS OSCPUVirtualTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')], after) / 1000000 AS OSCPUVirtualTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedRanges')], before) AS SelectedRangesBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedRanges')], after) AS SelectedRangesAfter,
    sumIf(read_rows, before) AS ReadRowsBefore,
    sumIf(read_rows, after) AS ReadRowsAfter,
    formatReadableSize(sumIf(read_bytes, before)) AS ReadBytesBefore,
    formatReadableSize(sumIf(read_bytes, after)) AS ReadBytesAfter
FROM system.query_log
WHERE (before OR after) AND type IN (2, 4)
GROUP BY normalized_query_hash
    WITH TOTALS
ORDER BY SelectedRangesAfter - SelectedRangesBefore DESC
LIMIT 10
FORMAT Vertical

Debug Clickhouse freezing/hanging

Get stack traces for every running thread. Useful when Clickhouse hangs for no apparent reason.

SELECT
 arrayStringConcat(arrayMap(x -> concat('0x', lower(hex(x)), '\t', demangle(addressToSymbol(x))), trace), '\n') AS trace_functions,
 count()
FROM system.stack_trace
GROUP BY trace_functions
ORDER BY count() DESC
SETTINGS allow_introspection_functions=1
FORMAT Vertical;