Alerts


/etc/prometheus/backup_rules.yml > backup
Backup age (0 active)
# Fires when the difference between the current time and `last_backup`
# stays above 60 * 60 * 48 seconds (48 hours) for 2 minutes.
# NOTE(review): assumes last_backup is a Unix timestamp in seconds - confirm.
alert: Backup age
expr: time() - last_backup > 60 * 60 * 48
for: 2m
labels:
  severity: important
annotations:
  summary: "{{ $labels.instance }} last backup age is >48 hours"
/etc/prometheus/build_rules.yml > builds.sr.ht
High number of builds timing out (0 active)
# Fires when the count of finished builds with status="timeout" has grown
# by more than 10 over the last day.
alert: High number of builds timing out
expr: >-
  increase(buildsrht_builds_finished_total{status="timeout"}[1d]) > 10
labels:
  severity: important
annotations:
  summary: High number of builds are timing out
High rate of build job submission (0 active)
# Fires when more than 25 builds were started within a 5-minute window.
alert: High rate of build job submission
expr: increase(buildsrht_builds_started_total[5m]) > 25
labels:
  severity: important
annotations:
  summary: 'Unusual rate of build job submissions on {{$labels.instance}}'
/etc/prometheus/meta_rules.yml > meta.sr.ht
High rate of login failures (1 active)
# Fires when more than 5 failed logins were recorded within 10 minutes.
# Uses increase() rather than delta(): the `_total` suffix indicates a
# counter, and increase() accounts for counter resets (e.g. a service
# restart), whereas delta() is intended for gauges.
# NOTE(review): assumes meta_logins_failed_total is a counter - confirm.
alert: High rate of login failures
expr: increase(meta_logins_failed_total[10m]) > 5
labels:
  security: "true"
  severity: important
annotations:
  summary: Unusual number of failed logins
Labels State Active Since Value
alertname="High rate of login failures" instance="meta.sr.ht:80" job="service" security="true" severity="important" firing 2020-03-29 20:20:03.711453475 +0000 UTC 8.205128205128204
High rate of password resets (0 active)
# Fires when more than 5 password resets were recorded within 10 minutes.
# Uses increase() rather than delta(): the `_total` suffix indicates a
# counter, and increase() accounts for counter resets, whereas delta() is
# intended for gauges.
# NOTE(review): assumes meta_pw_resets_total is a counter - confirm.
alert: High rate of password resets
expr: increase(meta_pw_resets_total[10m]) > 5
labels:
  security: "true"
  severity: urgent
annotations:
  # Summary previously read "failed logins", copied from the login alert.
  summary: Unusual number of password resets
High rate of user registrations (0 active)
# Fires when more than 5 user registrations were recorded within 10 minutes.
# Uses increase() rather than delta(): the `_total` suffix indicates a
# counter, and increase() accounts for counter resets, whereas delta() is
# intended for gauges.
# NOTE(review): assumes meta_registrations_total is a counter - confirm.
alert: High rate of user registrations
expr: increase(meta_registrations_total[10m]) > 5
labels:
  severity: interesting
annotations:
  summary: High rate of user registrations
/etc/prometheus/node_rules.yml > node
High CPU usage (0 active)
# Fires when the 2-minute per-second rate of user-mode CPU seconds stays
# above 0.75 for 5 minutes.
alert: High CPU usage
expr: >-
  rate(node_cpu_seconds_total{mode="user"}[2m]) > 0.75
for: 5m
labels:
  severity: interesting
annotations:
  summary: 'Instance {{ $labels.instance }} is under high CPU usage'
High disk I/O (0 active)
# Fires when either the read or the write byte rate of an sd*/vd* device,
# divided by 1024 ^ 2 (bytes -> MiB), stays above 5 for 5 minutes.
alert: High disk I/O
expr: >-
  (rate(node_disk_read_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024 ^ 2 > 5)
  or
  (rate(node_disk_write_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024 ^ 2 > 5)
for: 5m
labels:
  severity: interesting
annotations:
  # Summary previously said ">2 MiB/s", which contradicted the threshold of 5.
  summary: 'Instance {{ $labels.instance }} >5 MiB/s disk I/O'
High disk usage (0 active)
# Fires when the used fraction (size - avail) / size of the filesystem
# mounted at / or /var exceeds 0.9.
alert: High disk usage
expr: >-
  (node_filesystem_size_bytes{mountpoint=~"/|/var"} - node_filesystem_avail_bytes{mountpoint=~"/|/var"})
  / node_filesystem_size_bytes{mountpoint=~"/|/var"} > 0.9
labels:
  severity: important
annotations:
  summary: 'Instance {{ $labels.instance }} has high disk usage on {{ $labels.mountpoint }}'
High network activity (0 active)
High tmpfs usage (0 active)
# Fires when the used fraction (size - avail) / size of the filesystem
# mounted at /tmp stays above 0.8 for 5 minutes.
alert: High tmpfs usage
expr: >-
  (node_filesystem_size_bytes{mountpoint=~"/tmp"} - node_filesystem_avail_bytes{mountpoint=~"/tmp"})
  / node_filesystem_size_bytes{mountpoint=~"/tmp"} > 0.8
for: 5m
labels:
  severity: urgent
annotations:
  # Summary previously read "has tmpfs usage", dropping the word "high".
  summary: 'Instance {{ $labels.instance }} has high tmpfs usage'
Instance down (0 active)
# Fires when the `up` series of a target has been 0 for 2 minutes.
alert: Instance down
expr: 'up == 0'
for: 2m
labels:
  severity: urgent
annotations:
  summary: 'Instance {{ $labels.instance }} is down'
Instance rebooted (0 active)
# Fires when an instance booted less than 60 seconds ago.
# node_boot_time_seconds is the boot time as a Unix timestamp, so it must
# be subtracted from time() to obtain the uptime; comparing the raw
# timestamp against 60 (as before) can never be true.
alert: Instance rebooted
expr: time() - node_boot_time_seconds < 60
labels:
  severity: interesting
annotations:
  summary: 'Instance {{ $labels.instance }} was rebooted'
Prolonged high CPU usage (0 active)
# Fires when the `cpu_gt_75pct` series has been present for 1 hour.
# NOTE(review): cpu_gt_75pct is presumably a recording rule defined
# elsewhere - confirm.
alert: Prolonged high CPU usage
expr: cpu_gt_75pct
for: 1h
labels:
  severity: urgent
annotations:
  # Says "prolonged" (not "sustained") so this 1h alert can be told apart
  # from the 20m "Sustained high CPU usage" alert in notifications.
  summary: 'Instance {{ $labels.instance }} is under prolonged high CPU usage'
Prolonged high disk I/O (0 active)
# Fires when the `disk_gt_5mibsec` series has been present for 1 hour.
# NOTE(review): disk_gt_5mibsec is presumably a recording rule defined
# elsewhere with a 5 MiB/s threshold, as its name suggests - confirm.
alert: Prolonged high disk I/O
expr: disk_gt_5mibsec
for: 1h
labels:
  severity: urgent
annotations:
  # Summary previously said ">2 MiB/s", which disagreed with the
  # 5 MiB/s threshold named by the rule.
  summary: 'Instance {{ $labels.instance }} prolonged >5 MiB/s disk I/O'
Prolonged high network activity (0 active)
# Fires when the `net_gt_10mibsec` series has been present for 1 hour.
# NOTE(review): net_gt_10mibsec is presumably a recording rule defined
# elsewhere - confirm.
alert: Prolonged high network activity
expr: net_gt_10mibsec
for: 1h
labels:
  severity: urgent
annotations:
  summary: 'Instance {{ $labels.instance }} prolonged >10 MiB/s network use'
Read-only filesystem (0 active)
# Fires when node_filesystem_readonly is non-zero for the filesystem
# mounted at / or /var.
alert: Read-only filesystem
expr: >-
  node_filesystem_readonly{mountpoint=~"/|/var"} != 0
labels:
  severity: urgent
annotations:
  summary: 'Instance {{ $labels.instance }} read-only filesystem on {{ $labels.mountpoint }}'
Sustained high CPU usage (0 active)
# Fires when the `cpu_gt_75pct` series has been present for 20 minutes.
# NOTE(review): cpu_gt_75pct is presumably a recording rule defined
# elsewhere - confirm.
alert: Sustained high CPU usage
expr: cpu_gt_75pct
for: 20m
labels:
  severity: important
annotations:
  summary: 'Instance {{ $labels.instance }} is under sustained high CPU usage'
Sustained high disk I/O (0 active)
# Fires when the `disk_gt_5mibsec` series has been present for 20 minutes.
# NOTE(review): disk_gt_5mibsec is presumably a recording rule defined
# elsewhere with a 5 MiB/s threshold, as its name suggests - confirm.
alert: Sustained high disk I/O
expr: disk_gt_5mibsec
for: 20m
labels:
  severity: important
annotations:
  # Summary previously said ">2 MiB/s", which disagreed with the
  # 5 MiB/s threshold named by the rule.
  summary: 'Instance {{ $labels.instance }} sustained >5 MiB/s disk I/O'
Sustained high network activity (0 active)
# Fires when the `net_gt_10mibsec` series has been present for 20 minutes.
# NOTE(review): net_gt_10mibsec is presumably a recording rule defined
# elsewhere - confirm.
alert: Sustained high network activity
expr: net_gt_10mibsec
for: 20m
labels:
  severity: important
annotations:
  summary: 'Instance {{ $labels.instance }} sustained >10 MiB/s network use'
/etc/prometheus/ssl_rules.yml > ssl
SSL expiration (0 active)
# Fires when the time remaining until `certificate_expiration`, converted
# from seconds to days (/ 60 / 60 / 24), stays below 7 for 2 minutes.
# NOTE(review): assumes certificate_expiration is a Unix timestamp in
# seconds - confirm.
alert: SSL expiration
expr: (certificate_expiration - time()) / 60 / 60 / 24 < 7
for: 2m
labels:
  severity: important
annotations:
  summary: "{{ $labels.instance }} SSL certificate expires in < 1 week"