公司的 mssql 太舊沒有辦法使用 mssql exporter (github來源) 進行監控 mssql,然而官網有提供支援 各種的 DB (官網連結) ,這邊我找到 docker 版本的(docker連結),透過此版本進行安裝監控 mssql,以及學習如何進行 grafana dashboard 建置。
本篇文章有引用先前架設的 Prometheus 與 Grafana 的 docker file 設定 (link)
撰寫 Docker Compose 文件
- 抽離出來後再 prometheus 再透過 實體IP 進行存 TSDB
version: '3.8'
services:
sql-exporter:
image: burningalchemist/sql_exporter:latest
container_name: sql-exporter
volumes:
- ./sql_exporter.yml:/etc/sql_exporter/sql_exporter.yml # 直接挂载到根目录
command: ["--config.file=/etc/sql_exporter/sql_exporter.yml"]
ports:
- '9399:9399'
- 新增一個 sql_exporter.yml
- 這邊有新增連線的字串與帳密要注意
# sql_exporter.yml - 绝对最简单的版本
target:
data_source_name: "sqlserver://rororo:123456@192.168.0.10:1433?database=JerryTest&encrypt=false&TrustServerCertificate=true&connection+timeout=30"
collectors: ["mssql_custom"]
collectors:
- collector_name: mssql_custom
metrics:
- metric_name: mssql_test
type: gauge
help: "Test metric"
values: [value]
query: "SELECT 1 as value"

- 下 docker compose 語法進行 create container
docker compose up -d
- 確認 docker desktop 確認有安裝成功

- 輸入 localhost:9399/metrics 可以看到實際抓到的 metrics

修改官方的 sqlserver metrics
- 官網有寫好一些可以監控的指標

- 我把 yml 修改成官方的
# sql_exporter.yml
target:
data_source_name: "sqlserver://rororo:123456@192.168.0.10:1433?database=JerryTest&encrypt=false&TrustServerCertificate=true&connection+timeout=30"
collectors: ["mssql_standard"]
collectors:
- collector_name: mssql_standard
metrics:
- metric_name: mssql_local_time_seconds
type: gauge
help: 'Local time in seconds since epoch (Unix time).'
values: [unix_time]
query: |
SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time
- metric_name: mssql_connections
type: gauge
help: 'Number of active connections.'
key_labels:
- db
values: [count]
query: |
SELECT DB_NAME(sp.dbid) AS db, COUNT(sp.spid) AS count
FROM sys.sysprocesses sp
WHERE sp.dbid > 0 -- 添加过滤条件,排除系统进程
GROUP BY DB_NAME(sp.dbid)
- metric_name: mssql_deadlocks_total
type: counter
help: 'Number of lock requests that resulted in a deadlock.'
values: [cntr_value]
query: |
SELECT cntr_value
FROM sys.dm_os_performance_counters WITH (NOLOCK)
WHERE counter_name = 'Number of Deadlocks/sec' AND instance_name = '_Total'
- metric_name: mssql_user_errors_total
type: counter
help: 'Number of user errors.'
values: [cntr_value]
query: |
SELECT cntr_value
FROM sys.dm_os_performance_counters WITH (NOLOCK)
WHERE counter_name = 'Errors/sec' AND instance_name = 'User Errors'
- metric_name: mssql_kill_connection_errors_total
type: counter
help: 'Number of severe errors that caused SQL Server to kill the connection.'
values: [cntr_value]
query: |
SELECT cntr_value
FROM sys.dm_os_performance_counters WITH (NOLOCK)
WHERE counter_name = 'Errors/sec' AND instance_name = 'Kill Connection Errors'
- metric_name: mssql_page_life_expectancy_seconds
type: gauge
help: 'The minimum number of seconds a page will stay in the buffer pool on this node without references.'
values: [cntr_value]
query: |
SELECT top(1) cntr_value
FROM sys.dm_os_performance_counters WITH (NOLOCK)
WHERE counter_name = 'Page life expectancy'
- metric_name: mssql_batch_requests_total
type: counter
help: 'Number of command batches received.'
values: [cntr_value]
query: |
SELECT cntr_value
FROM sys.dm_os_performance_counters WITH (NOLOCK)
WHERE counter_name = 'Batch Requests/sec'
- metric_name: mssql_log_growths_total
type: counter
help: 'Number of times the transaction log has been expanded, per database.'
key_labels:
- db
values: [cntr_value]
query: |
SELECT rtrim(instance_name) AS db, cntr_value
FROM sys.dm_os_performance_counters WITH (NOLOCK)
WHERE counter_name = 'Log Growths' AND instance_name <> '_Total'
- metric_name: mssql_buffer_cache_hit_ratio
type: gauge
help: 'Ratio of requests that hit the buffer cache'
values: [BufferCacheHitRatio]
query: |
SELECT (a.cntr_value * 1.0 / b.cntr_value) * 100.0 as BufferCacheHitRatio
FROM sys.dm_os_performance_counters a
JOIN (SELECT cntr_value, OBJECT_NAME
FROM sys.dm_os_performance_counters
WHERE counter_name = 'Buffer cache hit ratio base'
AND OBJECT_NAME = 'SQLServer:Buffer Manager') b
ON a.OBJECT_NAME = b.OBJECT_NAME
WHERE a.counter_name = 'Buffer cache hit ratio'
AND a.OBJECT_NAME = 'SQLServer:Buffer Manager'
- metric_name: mssql_checkpoint_pages_sec
type: gauge
help: 'Checkpoint Pages Per Second'
values: [cntr_value]
query: |
SELECT cntr_value
FROM sys.dm_os_performance_counters
WHERE [counter_name] = 'Checkpoint pages/sec'
- metric_name: mssql_io_stall_seconds_total
type: counter
help: 'Stall time in seconds per database and I/O operation.'
key_labels:
- db
value_label: operation
values:
- read
- write
query_ref: mssql_io_stall
- metric_name: mssql_resident_memory_bytes
type: gauge
help: 'SQL Server resident memory size (AKA working set).'
values: [resident_memory_bytes]
query_ref: mssql_process_memory
- metric_name: mssql_virtual_memory_bytes
type: gauge
help: 'SQL Server committed virtual memory size.'
values: [virtual_memory_bytes]
query_ref: mssql_process_memory
- metric_name: mssql_available_commit_memory_bytes
type: gauge
help: 'SQL Server available to be committed memory size.'
values: [available_commit_limit_bytes]
query_ref: mssql_process_memory
- metric_name: mssql_memory_utilization_percentage
type: gauge
help: 'The percentage of committed memory that is in the working set.'
values: [memory_utilization_percentage]
query_ref: mssql_process_memory
- metric_name: mssql_page_fault_count_total
type: counter
help: 'The number of page faults that were incurred by the SQL Server process.'
values: [page_fault_count]
query_ref: mssql_process_memory
- metric_name: mssql_server_total_memory_bytes
type: gauge
help: 'SQL Server committed memory in the memory manager.'
values: [committed_memory_bytes]
query_ref: mssql_os_sys_info
- metric_name: mssql_server_target_memory_bytes
type: gauge
help: 'SQL Server target committed memory set for the memory manager.'
values: [committed_memory_target_bytes]
query_ref: mssql_os_sys_info
- metric_name: mssql_os_memory
type: gauge
help: 'OS physical memory, used and available.'
value_label: 'state'
values: [used, available]
query: |
SELECT
(total_physical_memory_kb - available_physical_memory_kb) * 1024 AS used,
available_physical_memory_kb * 1024 AS available
FROM sys.dm_os_sys_memory
- metric_name: mssql_os_page_file
type: gauge
help: 'OS page file, used and available.'
value_label: 'state'
values: [used, available]
query: |
SELECT
(total_page_file_kb - available_page_file_kb) * 1024 AS used,
available_page_file_kb * 1024 AS available
FROM sys.dm_os_sys_memory
queries:
- query_name: mssql_io_stall
query: |
SELECT
cast(DB_Name(a.database_id) as varchar) AS [db],
sum(io_stall_read_ms) / 1000.0 AS [read],
sum(io_stall_write_ms) / 1000.0 AS [write]
FROM
sys.dm_io_virtual_file_stats(null, null) a
INNER JOIN sys.master_files b ON a.database_id = b.database_id AND a.file_id = b.file_id
GROUP BY a.database_id
- query_name: mssql_process_memory
query: |
SELECT
physical_memory_in_use_kb * 1024 AS resident_memory_bytes,
virtual_address_space_committed_kb * 1024 AS virtual_memory_bytes,
available_commit_limit_kb * 1024 AS available_commit_limit_bytes,
memory_utilization_percentage,
page_fault_count
FROM sys.dm_os_process_memory
- query_name: mssql_os_sys_info
query: |
SELECT
committed_kb * 1024 AS committed_memory_bytes,
committed_target_kb * 1024 AS committed_memory_target_bytes
FROM sys.dm_os_sys_info
- 重啟吃 yml 參數
docker restart f3dd3af741f3896d73d2035fffd44eac9058a01f406fbc597507330e0369322d
- 查詢就可以看到官網預設的 metrics
