Skip to content
Snippets Groups Projects
Commit 44524b8e authored by Daniel Bimschas's avatar Daniel Bimschas
Browse files

HOLI-10798: alerting policy for unacknowledged messages

parent edc2a645
No related branches found
No related tags found
No related merge requests found
......@@ -21,6 +21,14 @@ data "terraform_remote_state" "holi_infra_state" {
}
}
# Remote state read from the "infra-monitoring" prefix of the shared GCS
# state bucket. Its outputs are consumed elsewhere in this configuration
# (e.g. the Rocket.Chat notification channel IDs used by alert policies).
data "terraform_remote_state" "holi_infra_monitoring_state" {
  backend = "gcs"

  config = {
    bucket = "holi-shared-terraform-state"
    prefix = "infra-monitoring"
  }
}
data "terraform_remote_state" "okuna_common_state" {
backend = "gcs"
config = {
......
......@@ -18,3 +18,23 @@ resource "google_pubsub_subscription" "holi-search-integration-okuna-sub" {
}
}
}
resource "google_monitoring_alert_policy" "pubsub_unacked_messages" {
project = data.terraform_remote_state.holi_infra_state.outputs.shared_project_id
display_name = "PubSub Unacked Messages Alert"
combiner = "OR"
conditions {
display_name = "Unacked messages condition"
condition_prometheus_query_language {
query = "rate(pubsub_googleapis_com:subscription_oldest_unacked_message_age{subscription_id='${google_pubsub_subscription.holi-search-integration-okuna-sub.name}'}[5m]) > 0"
  • Maintainer

    Isn't a threshold of > 0 un-acknowledged messages a bit too restrictive? This would trigger alerts even for minor spikes. Maybe we can set this to > 5 or > 10 to avoid flooding the channels.

  • Author Owner

    You're kinda right, I think. The alarm must trigger when there is at least 1 message not processed. A single message not being processed leads to head-of-line blocking, as we have activated ordering guarantees in combination with a push subscriber. Therefore, holi-search-integration will only ever receive 1 message at a time, repeatedly, until it has succeeded in processing it.

    I have increased the value to >= 1.

  • Please register or sign in to reply
duration = "300s"
}
}
notification_channels = [local.environment_name == "production" ? data.terraform_remote_state.holi_infra_monitoring_state.outputs.monitoring_notification_channel_rocket_chat_matching_data_production_id : data.terraform_remote_state.holi_infra_monitoring_state.outputs.monitoring_notification_channel_rocket_chat_matching_data_staging_id]
documentation {
content = "Alert triggered when Pub/Sub subscription(s) (subscription_id='${google_pubsub_subscription.holi-search-integration-okuna-sub.name}') have unacknowledged messages for more than 5 minutes"
  • Maintainer

    The documentation is not actionable. Maybe we can suggest steps to resolve the issue (e.g., check the Cloud Function logs).

  • Author Owner

    Changed to

    "Alert triggered when Pub/Sub subscription(s) (subscription_id='${google_pubsub_subscription.holi-search-integration-okuna-sub.name}') have unacknowledged messages for more than 5 minutes. Please check the cloud function deployment and logs to see if the subscriber works correctly."

  • Please register or sign in to reply
mime_type = "text/markdown"
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment