Alerts


/etc/prometheus/backup_rules.yml > backup
Backup age (0 active)
# Fires once time() - last_backup has stayed above 48 hours
# (60 * 60 * 48 seconds) for 2m.
# NOTE(review): presumably last_backup is a Unix timestamp in seconds - confirm.
alert: Backup age
expr: time() - last_backup > 60 * 60 * 48
for: 2m
labels:
  severity: important
annotations:
  summary: '{{ $labels.instance }} last backup age is >48 hours'
ZFS snapshot age (0 active)
# Fires once time() - zfs_last_snapshot has stayed above 24 hours
# (60 * 60 * 24 seconds) for 2m.
# NOTE(review): presumably zfs_last_snapshot is a Unix timestamp - confirm.
alert: ZFS snapshot age
expr: time() - zfs_last_snapshot > 60 * 60 * 24
for: 2m
labels:
  severity: important
annotations:
  summary: '{{ $labels.instance }} last ZFS snapshot age is >24 hours'
/etc/prometheus/build_rules.yml > builds.sr.ht
High number of builds timing out (0 active)
# Fires when the 1-day increase of finished builds with status="timeout"
# exceeds 10. No "for" clause is set on this rule.
alert: High number of builds timing out
expr: increase(buildsrht_builds_finished_total{status="timeout"}[1d]) > 10
labels:
  severity: important
annotations:
  summary: High number of builds are timing out
High rate of build job submission (0 active)
# Fires when the 5-minute increase of started builds exceeds 25.
# No "for" clause is set on this rule.
alert: High rate of build job submission
expr: increase(buildsrht_builds_started_total[5m]) > 25
labels:
  severity: important
annotations:
  summary: Unusual rate of build job submissions on {{$labels.instance}}
/etc/prometheus/meta_rules.yml > meta.sr.ht
High rate of login failures (0 active)
# Fires when more than 10 failed logins are counted within 10 minutes.
# Uses increase() rather than delta(): delta() is meant for gauges, while
# increase() accounts for counter resets (e.g. a service restart).
# NOTE(review): the _total suffix suggests meta_logins_failed_total is a
# counter - confirm.
alert: High rate of login failures
expr: increase(meta_logins_failed_total[10m]) > 10
labels:
  security: "true"
  severity: important
annotations:
  summary: Unusual number of failed logins
High rate of password resets (0 active)
# Fires when more than 5 password resets are counted within 10 minutes.
# Uses increase() rather than delta(): delta() is meant for gauges, while
# increase() accounts for counter resets (e.g. a service restart).
# NOTE(review): the _total suffix suggests meta_pw_resets_total is a
# counter - confirm.
alert: High rate of password resets
expr: increase(meta_pw_resets_total[10m]) > 5
labels:
  security: "true"
  severity: urgent
annotations:
  summary: Unusual number of password resets
High rate of user registrations (0 active)
# Fires when more than 5 registrations are counted within 10 minutes.
# Uses increase() rather than delta(): delta() is meant for gauges, while
# increase() accounts for counter resets (e.g. a service restart).
# NOTE(review): the _total suffix suggests meta_registrations_total is a
# counter - confirm.
alert: High rate of user registrations
expr: increase(meta_registrations_total[10m]) > 5
labels:
  severity: interesting
annotations:
  summary: High rate of user registrations
Sustained attack on user logins may be underway (0 active)
# Fires when more than 10 failed logins per 10-minute window persist for 30m.
# Uses increase() rather than delta(): delta() is meant for gauges, while
# increase() accounts for counter resets (e.g. a service restart).
# NOTE(review): the _total suffix suggests meta_logins_failed_total is a
# counter - confirm.
alert: Sustained attack on user logins may be underway
expr: increase(meta_logins_failed_total[10m]) > 10
for: 30m
labels:
  security: "true"
  severity: urgent
annotations:
  summary: Sustained attack on user logins may be underway
/etc/prometheus/node_rules.yml > node
High CPU usage (0 active)
# Fires when the per-instance average of the 2m rate of
# node_cpu_seconds_total{mode="user"} stays above 0.75 for 5m.
alert: High CPU usage
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="user"}[2m])) > 0.75
for: 5m
labels:
  severity: interesting
annotations:
  summary: Instance {{ $labels.instance }} is under high CPU usage
High disk I/O (0 active)
# Fires when the 5m read or write byte rate of a device matching
# "sd.*|vd.*", divided by 1024 ^ 2 (i.e. in MiB/s), stays above 5 for 5m.
alert: High disk I/O
expr: >-
  (rate(node_disk_read_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024 ^ 2 > 5)
  or
  (rate(node_disk_written_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024 ^ 2 > 5)
for: 5m
labels:
  severity: interesting
annotations:
  # The text states the same 5 MiB/s threshold the expression checks.
  summary: Instance {{ $labels.instance }} >5 MiB/s disk I/O
High disk usage (0 active)
# Fires when (size - avail) / size exceeds 0.9 on a filesystem whose
# mountpoint matches "/|/var". No "for" clause is set on this rule.
alert: High disk usage
expr: >-
  (node_filesystem_size_bytes{mountpoint=~"/|/var"}
  - node_filesystem_avail_bytes{mountpoint=~"/|/var"})
  / node_filesystem_size_bytes{mountpoint=~"/|/var"}
  > 0.9
labels:
  severity: important
annotations:
  summary: Instance {{ $labels.instance }} has high disk usage on {{ $labels.mountpoint }}
High network activity (0 active)
High tmpfs usage (0 active)
# Fires when (size - avail) / size for the filesystem mounted at /tmp stays
# above 0.8 for 5m.
alert: High tmpfs usage
expr: >-
  (node_filesystem_size_bytes{mountpoint=~"/tmp"}
  - node_filesystem_avail_bytes{mountpoint=~"/tmp"})
  / node_filesystem_size_bytes{mountpoint=~"/tmp"}
  > 0.8
for: 5m
labels:
  severity: urgent
annotations:
  # "high" added: the alert is about high usage, not usage as such.
  summary: Instance {{ $labels.instance }} has high tmpfs usage
Instance down (0 active)
# Fires when the "up" series of a target has been 0 for 2m.
alert: Instance down
expr: up == 0
for: 2m
labels:
  severity: urgent
annotations:
  summary: Instance {{ $labels.instance }} is down
Instance rebooted (0 active)
# Fires when an instance booted less than 60 seconds ago.
# node_boot_time_seconds is the boot time as a Unix timestamp (per
# node_exporter), so it has to be subtracted from time() to get uptime;
# comparing the raw timestamp with 60 can never be true.
# NOTE(review): a 60s window can fall between two scrapes/evaluations -
# consider widening it if reboots are being missed.
alert: Instance rebooted
expr: time() - node_boot_time_seconds < 60
labels:
  severity: interesting
annotations:
  summary: Instance {{ $labels.instance }} was rebooted
Prolonged high CPU usage (0 active)
# Fires when the per-instance average of the 2m rate of
# node_cpu_seconds_total{mode="user"} stays above 0.75 for 1h.
alert: Prolonged high CPU usage
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="user"}[2m])) > 0.75
for: 1h
labels:
  severity: urgent
annotations:
  # Says "prolonged" so this 1h alert is distinguishable from the 20m
  # "Sustained high CPU usage" alert, as the disk and network pairs are.
  summary: Instance {{ $labels.instance }} is under prolonged high CPU usage
Prolonged high disk I/O (0 active)
# Fires when the disk_gt_5mibsec series has been present for 1h.
# NOTE(review): disk_gt_5mibsec is defined outside this view; its name
# indicates a 5 MiB/s threshold - confirm against its definition.
alert: Prolonged high disk I/O
expr: disk_gt_5mibsec
for: 1h
labels:
  severity: urgent
annotations:
  # Threshold text follows the 5 MiB/s named by the rule's series.
  summary: Instance {{ $labels.instance }} prolonged >5 MiB/s disk I/O
Prolonged high network activity (0 active)
# Fires when the net_gt_10mibsec series has been present for 1h.
# NOTE(review): net_gt_10mibsec is defined outside this view - presumably a
# recording rule for >10 MiB/s network throughput; confirm.
alert: Prolonged high network activity
expr: net_gt_10mibsec
for: 1h
labels:
  severity: urgent
annotations:
  summary: Instance {{ $labels.instance }} prolonged >10 MiB/s network use
Read-only filesystem (0 active)
# Fires when node_filesystem_readonly is non-zero for a filesystem whose
# mountpoint matches "/|/var". No "for" clause is set on this rule.
alert: Read-only filesystem
expr: node_filesystem_readonly{mountpoint=~"/|/var"} != 0
labels:
  severity: urgent
annotations:
  summary: Instance {{ $labels.instance }} read-only filesystem on {{ $labels.mountpoint }}
Sustained high CPU usage (0 active)
# Fires when the per-instance average of the 2m rate of
# node_cpu_seconds_total{mode="user"} stays above 0.75 for 20m.
alert: Sustained high CPU usage
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="user"}[2m])) > 0.75
for: 20m
labels:
  severity: important
annotations:
  summary: Instance {{ $labels.instance }} is under sustained high CPU usage
Sustained high disk I/O (0 active)
# Fires when the disk_gt_5mibsec series has been present for 20m.
# NOTE(review): disk_gt_5mibsec is defined outside this view; its name
# indicates a 5 MiB/s threshold - confirm against its definition.
alert: Sustained high disk I/O
expr: disk_gt_5mibsec
for: 20m
labels:
  severity: important
annotations:
  # Threshold text follows the 5 MiB/s named by the rule's series.
  summary: Instance {{ $labels.instance }} sustained >5 MiB/s disk I/O
Sustained high network activity (0 active)
# Fires when the net_gt_10mibsec series has been present for 20m.
# NOTE(review): net_gt_10mibsec is defined outside this view - presumably a
# recording rule for >10 MiB/s network throughput; confirm.
alert: Sustained high network activity
expr: net_gt_10mibsec
for: 20m
labels:
  severity: important
annotations:
  summary: Instance {{ $labels.instance }} sustained >10 MiB/s network use
/etc/prometheus/service_rules.yml > service
High rate of 500 errors (0 active)
# Fires when the 10m per-second rate of http_requests_total{status="500"}
# stays above 5 / 60 (i.e. five per minute) for 5m.
alert: High rate of 500 errors
expr: rate(http_requests_total{status="500"}[10m]) > 5 / 60
for: 5m
labels:
  severity: urgent
annotations:
  summary: '{{ $labels.instance }} has a high rate of 500 errors'
/etc/prometheus/ssl_rules.yml > ssl
SSL expiration (0 active)
# Fires when (certificate_expiration - time()), converted from seconds to
# days by / 60 / 60 / 24, has been below 7 for 2m.
# NOTE(review): presumably certificate_expiration is a Unix timestamp - confirm.
alert: SSL expiration
expr: (certificate_expiration - time()) / 60 / 60 / 24 < 7
for: 2m
labels:
  severity: important
annotations:
  summary: '{{ $labels.instance }} SSL certificate expires in < 1 week'
/etc/prometheus/test_rules.yml > test
Weekly test alarm (0 active)
# Test alarm: fires while day_of_week() == 3, hour() == 16 and minute() < 30
# all hold. Prometheus evaluates these time functions in UTC.
alert: Weekly test alarm
expr: (day_of_week() == 3 and hour() == 16 and minute() < 30) > 0
labels:
  severity: interesting
annotations:
  summary: Weekly test alarm