Comprehensive strategies for handling errors and ensuring reliable monitoring
import os
import json
import time
import requests
from datetime import datetime
class SEERClient:
    """Client for the SEER monitoring API with an on-disk fallback.

    Payloads that cannot be delivered are written as JSON files into
    ``offline_dir`` so that ``replay_failed_payloads`` can resend them later.
    The send methods never raise: they return ``True`` on success and
    ``False`` on any failure.
    """

    # Shared with replay_failed_payloads so the URL and timeout live in one place.
    BASE_URL = "https://api.seer.ansrstudio.com"
    TIMEOUT = 10  # seconds; keeps a dead network from hanging the caller

    def __init__(self, api_key, offline_dir="failed_payloads"):
        """Store credentials and make sure the offline directory exists.

        Args:
            api_key: Value sent verbatim in the ``Authorization`` header.
            offline_dir: Directory that receives payloads which failed to send.

        Raises:
            OSError: If ``offline_dir`` cannot be created.
        """
        self.api_key = api_key
        self.offline_dir = offline_dir
        self.base_url = self.BASE_URL
        # exist_ok avoids the check-then-create race when several processes
        # start at the same time.
        os.makedirs(self.offline_dir, exist_ok=True)

    def _save_offline(self, endpoint, payload):
        """Save a failed payload to disk for later retry.

        The file is named ``<endpoint>_<epoch-milliseconds>.json``.

        Args:
            endpoint: Endpoint name without a leading slash (e.g. "monitoring").
            payload: JSON-serializable object to persist.

        Returns:
            True if the payload was written, False if it could not be saved.
            This method never raises for serialization or filesystem errors,
            because it runs inside the send methods' failure path and must not
            crash the caller.
        """
        timestamp = int(time.time() * 1000)
        filename = f"{endpoint}_{timestamp}.json"
        filepath = os.path.join(self.offline_dir, filename)
        try:
            # Serialize before opening the file so an unserializable payload
            # never leaves a truncated file behind for replay to choke on.
            serialized = json.dumps(payload)
            # The directory may have been removed since __init__ ran.
            os.makedirs(self.offline_dir, exist_ok=True)
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(serialized)
        except (OSError, TypeError, ValueError) as e:
            print(f"Could not save failed payload to {filepath}: {e}")
            return False
        print(f"Saved failed payload to {filepath}")
        return True

    def _post(self, endpoint, payload, description):
        """POST ``payload`` to ``endpoint``; fall back to disk on any failure.

        Args:
            endpoint: Endpoint name without a leading slash.
            payload: JSON-serializable request body.
            description: Human-readable name used in the failure message.

        Returns:
            True if the server answered with a non-error status, else False.
        """
        try:
            response = requests.post(
                f"{self.base_url}/{endpoint}",
                headers={
                    "Authorization": self.api_key,
                    "Content-Type": "application/json",
                },
                json=payload,
                timeout=self.TIMEOUT,
            )
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: a monitoring failure of any kind must not
            # propagate into the application being monitored.
            print(f"Failed to send {description}: {e}")
            self._save_offline(endpoint, payload)
            return False
        return True

    def send_monitoring_data(self, payload):
        """Send monitoring data with offline fallback.

        Args:
            payload: JSON-serializable monitoring payload.

        Returns:
            True on success, False if the request failed (the payload is then
            saved to ``offline_dir`` when possible).
        """
        return self._post("monitoring", payload, "monitoring data")

    def send_heartbeat(self, pipeline_id):
        """Send a heartbeat for ``pipeline_id`` with offline fallback.

        Args:
            pipeline_id: Identifier placed under the "pipeline_id" key.

        Returns:
            True on success, False if the request failed (the payload is then
            saved to ``offline_dir`` when possible).
        """
        return self._post("heartbeat", {"pipeline_id": pipeline_id}, "heartbeat")


def replay_failed_payloads(api_key, offline_dir=None):
    """Replay all failed payloads from offline storage.

    Each file whose name contains "monitoring" or "heartbeat" is loaded and
    POSTed to the matching endpoint. Files are deleted only after a successful
    send; anything that fails stays in place for the next run. Entries whose
    name matches neither endpoint are ignored.

    Args:
        api_key: Value sent verbatim in the ``Authorization`` header.
        offline_dir: Directory to scan. Defaults to a ``failed_payloads``
            directory next to this module. Pass the same directory the
            ``SEERClient`` was constructed with if it differs.

    Returns:
        None. A summary line is printed when the directory exists.
    """
    if offline_dir is None:
        offline_dir = os.path.join(os.path.dirname(__file__), "failed_payloads")
    if not os.path.isdir(offline_dir):
        return

    # Identical for every request, so build it once.
    headers = {
        "Authorization": api_key,
        "Content-Type": "application/json",
    }
    replayed = 0
    failed = 0

    # Sorted for a deterministic order; within one endpoint the timestamped
    # names replay oldest first.
    for filename in sorted(os.listdir(offline_dir)):
        # Determine endpoint from filename before touching the file, so
        # unrelated entries are skipped rather than counted as failures.
        if "monitoring" in filename:
            endpoint = "monitoring"
        elif "heartbeat" in filename:
            endpoint = "heartbeat"
        else:
            continue

        filepath = os.path.join(offline_dir, filename)
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                payload = json.load(f)
            # Attempt to send
            response = requests.post(
                f"{SEERClient.BASE_URL}/{endpoint}",
                headers=headers,
                json=payload,
                timeout=SEERClient.TIMEOUT,
            )
            response.raise_for_status()
            # Success - remove the file
            os.remove(filepath)
            replayed += 1
            print(f"✓ Replayed {filename}")
        except Exception as e:
            failed += 1
            print(f"✗ Failed to replay {filename}: {e}")
            # Leave the file for next retry

    print(f"Replay complete: {replayed} succeeded, {failed} failed")
# Usage: Run this periodically or at script startup
replay_failed_payloads("your_api_key_here")

Never let monitoring failures crash your application
Set reasonable timeouts (5-10 seconds) to prevent hanging
Wait progressively longer between retry attempts
Save to disk for later replay when connectivity returns
Keep logs of failures to identify patterns
Simulate network failures to verify your error handling works