From 1a505a54db35227b98fffaade09fd2913af95c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Mon, 13 Jan 2025 14:36:29 +0100 Subject: [PATCH 1/3] [CI] Always send a heartbeat metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This script was set up to only upload metrics to Grafana when a new workflow was available. If either the Grafana or github token becomes stale, no metrics would get recorded either. We have alerting in place to detect a lack of update, but because we only uploaded metrics on new workflows, we could have normal cases where no data would get uploaded for a few hours (example, late night weekend). For those reasons, the delay before alerting for no-data had to be set quite high. By adding a fixed heartbeat in the uploaded metrics, we know we MUST receive at least 1 metric every 5 minutes, and can have more reactive monitoring. Signed-off-by: Nathan Gauër --- .ci/metrics/metrics.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index 55025e50d1081..50360ddefd24c 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -147,6 +147,15 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key): f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr ) +def make_heartbeat_metric(): + return JobMetrics( + "metrics_container_heartbeat", + 1, # queue time seconds + 2, # run time seconds + 3, # job result + time.time_ns(), # created at ns + 0, # workflow run ID + ) def main(): # Authenticate with Github @@ -166,11 +175,14 @@ def main(): while True: current_metrics = get_metrics(github_repo, workflows_to_track) if len(current_metrics) == 0: - print("No metrics found to upload.", file=sys.stderr) - continue + print("No metrics found to upload.", file=sys.stdout) + + # Always send a heartbeat metric so we can monitor if this container + # is still able to log to 
Grafana. + current_metrics.append(make_heartbeat_metric()) upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key) - print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr) + print(f"Uploaded {len(current_metrics)} metrics", file=sys.stdout) for workflow_metric in reversed(current_metrics): workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id From 35cf962f585c1bdc9b40c3f04514f722750869ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Mon, 13 Jan 2025 14:58:16 +0100 Subject: [PATCH 2/3] clang-format --- .ci/metrics/metrics.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index 50360ddefd24c..0e289651785f1 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -147,15 +147,17 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key): f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr ) + def make_heartbeat_metric(): - return JobMetrics( - "metrics_container_heartbeat", - 1, # queue time seconds - 2, # run time seconds - 3, # job result - time.time_ns(), # created at ns - 0, # workflow run ID - ) + return JobMetrics( + "metrics_container_heartbeat", + 1, # queue time seconds + 2, # run time seconds + 3, # job result + time.time_ns(), # created at ns + 0, # workflow run ID + ) + def main(): # Authenticate with Github From f569206439dad29bbb0e8e53dbe1deeb471e03cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Mon, 13 Jan 2025 15:06:17 +0100 Subject: [PATCH 3/3] clang-format --- .ci/metrics/metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index 0e289651785f1..6eefcdcec93bf 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -151,11 +151,11 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key): def make_heartbeat_metric(): return 
JobMetrics( "metrics_container_heartbeat", - 1, # queue time seconds - 2, # run time seconds - 3, # job result - time.time_ns(), # created at ns - 0, # workflow run ID + 1, # queue time seconds + 2, # run time seconds + 3, # job result + time.time_ns(), # created at ns + 0, # workflow run ID )