0
0
Fork 0
mirror of https://github.com/alerta/alerta-contrib.git synced 2025-03-15 04:54:46 +00:00
alerta_alerta-contrib/integrations/urlmon/urlmon.py

466 lines
17 KiB
Python
Raw Permalink Normal View History

import datetime
2014-06-15 00:54:46 +00:00
import json
import logging
import platform
import queue
2014-06-15 00:54:46 +00:00
import re
import socket
import ssl
2023-03-20 22:39:34 +00:00
import sys
import threading
2023-03-20 22:39:34 +00:00
import time
from http.server import BaseHTTPRequestHandler as BHRH
from urllib.error import URLError # pylint: disable=no-name-in-module
from urllib.parse import urlparse # pylint: disable=no-name-in-module
2023-03-20 22:39:34 +00:00
from urllib.request import ( # pylint: disable=no-name-in-module
HTTPBasicAuthHandler, ProxyHandler, Request, build_opener, install_opener,
urlopen)
2023-03-20 22:39:34 +00:00
import settings
2017-11-16 00:00:37 +00:00
from alertaclient.api import Client
2014-06-15 00:54:46 +00:00
2023-03-20 22:39:34 +00:00
# Reason phrase for each HTTP status code, seeded from the short message
# (first tuple element) of the standard library's response table.
HTTP_RESPONSES = {
    code: details[0] for code, details in BHRH.responses.items()
}

# Add missing responses (any phrase already present for one of these codes
# is overwritten by the value given here).
HTTP_RESPONSES.update({
    102: 'Processing',
    207: 'Multi-Status',
    422: 'Unprocessable Entity',
    423: 'Locked',
    424: 'Failed Dependency',
    506: 'Variant Also Negotiates',
    507: 'Insufficient Storage',
    510: 'Not Extended',
})
2014-06-15 00:54:46 +00:00
# Event names passed as the ``correlate`` list on every alert this monitor
# sends, so that the different outcomes for one resource are related.
_HTTP_ALERTS = [
    # connection and status-class outcomes
    'HttpConnectionError',
    'HttpServerError',
    'HttpClientError',
    'HttpRedirection',
    # content, timing and success outcomes
    'HttpContentError',
    'HttpResponseSlow',
    'HttpResponseOK',
    # outcomes when a check supplies its own status regex
    'HttpResponseRegexError',
    'HttpResponseRegexOK',
]
2014-10-12 15:08:57 +00:00
__version__ = '3.3.0'

# --- scheduling -----------------------------------------------------------
LOOP_EVERY = 60  # seconds
# TARGET_FILE = 'urlmon.targets'  # FIXME -- or settings.py ???
SERVER_THREADS = 20

# --- response-time thresholds and request timeout -------------------------
SLOW_WARNING_THRESHOLD = 5000  # ms
SLOW_CRITICAL_THRESHOLD = 10000  # ms
MAX_TIMEOUT = 15000  # ms

# --- certificate expiry thresholds ----------------------------------------
SSL_DAYS = 30  # days
SSL_DAYS_PANIC = 7  # days

# Root logging is configured at import time; everything in this module logs
# through the 'alerta.urlmon' logger.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s: %(levelname)s - %(message)s',
)
LOG = logging.getLogger('alerta.urlmon')
2014-06-15 00:54:46 +00:00
class WorkerThread(threading.Thread):
    """Worker thread that runs URL checks taken from a shared queue.

    Every queue item is a ``(check, queue_time)`` tuple: ``check`` is a dict
    describing the URL to poll and ``queue_time`` is the ``time.time()`` value
    at which it was queued.  A ``None`` item tells the worker to stop.  The
    outcome of each check is sent as an alert through an Alerta client.
    """

    # (lowest status, highest status, event, severity) used to classify a
    # response by status class when the check has no ``status_regex``.
    _STATUS_CLASSES = (
        (100, 199, 'HttpInformational', 'normal'),
        (200, 299, 'HttpResponseOK', 'normal'),
        (300, 399, 'HttpRedirection', 'minor'),
        (400, 499, 'HttpClientError', 'minor'),
        (500, 599, 'HttpServerError', 'major'),
    )

    # Format of the ``notAfter`` field returned by ``SSLSocket.getpeercert()``.
    _SSL_DATE_FORMAT = r'%b %d %H:%M:%S %Y %Z'

    def __init__(self, queue, api):
        """Create the worker.

        :param queue: queue of ``(check, queue_time)`` tuples (``None`` = stop).
        :param api: Alerta client used unless a check names its own endpoint.
        """
        threading.Thread.__init__(self)
        LOG.debug('Initialising %s...', self.name)
        self.queue = queue  # internal queue
        self.api = api  # send alerts api

    def run(self):
        """Process queue items until the ``None`` shutdown sentinel arrives."""
        while True:
            LOG.debug('Waiting on input queue...')
            try:
                check, queue_time = self.queue.get()
            except TypeError:
                # The None sentinel cannot be unpacked into two names.
                LOG.info('%s is shutting down.', self.name)
                break

            try:
                self._process(check, queue_time)
            except Exception:
                # A single bad check must not kill the worker.  The check
                # itself is not logged because it may carry credentials.
                LOG.exception('%s: unhandled error while processing a check.', self.name)
            finally:
                # Always balance the get() above, even when the check failed.
                self.queue.task_done()

        # Account for the sentinel that ended the loop.
        self.queue.task_done()

    def _process(self, check, queue_time):
        """Run one check, send its alert and, if requested, a certificate alert.

        NOTE: a check's ``rule`` is evaluated with ``eval`` inside this method
        and can therefore see its local variables (``body``, ``status``,
        ``rtt``, ...).  Keep these local names stable.
        """
        if time.time() - queue_time > LOOP_EVERY:
            LOG.warning('URL request for %s to %s expired after %d seconds.', check['resource'], check['url'],
                        int(time.time() - queue_time))
            return

        resource = check['resource']
        LOG.info('%s polling %s...', self.name, resource)
        status, reason, body, rtt = self.urlmon(check)
        if isinstance(body, bytes):
            # The response is read as bytes; searches and rules work on text.
            # The charset is not known here, so undecodable bytes are replaced.
            body = body.decode('utf-8', errors='replace')

        status_regex = check.get('status_regex', None)
        search_string = check.get('search', None)
        rule = check.get('rule', None)
        warn_thold = check.get('warning', SLOW_WARNING_THRESHOLD)
        crit_thold = check.get('critical', SLOW_CRITICAL_THRESHOLD)
        checker_api = check.get('api_endpoint', None)
        checker_apikey = check.get('api_key', None)
        check_ssl = check.get('check_ssl')

        # A check may route its alerts to its own Alerta endpoint.
        if checker_api and checker_apikey:
            local_api = Client(endpoint=checker_api, key=checker_apikey)
        else:
            local_api = self.api

        description = HTTP_RESPONSES.get(status, 'undefined')
        event, severity, value, text = self._classify(status, reason, description, status_regex, rtt)

        if event in ('HttpResponseOK', 'HttpResponseRegexOK'):
            # Successful responses are further judged on speed and content.
            if rtt > crit_thold:
                event = 'HttpResponseSlow'
                severity = 'critical'
                value = '%dms' % rtt
                text = 'Website available but exceeding critical RT thresholds of %dms' % crit_thold
            elif rtt > warn_thold:
                event = 'HttpResponseSlow'
                severity = 'warning'
                value = '%dms' % rtt
                text = 'Website available but exceeding warning RT thresholds of %dms' % warn_thold

            if search_string and body:
                LOG.debug('Searching for %s', search_string)
                if not self._body_matches(search_string, body):
                    event = 'HttpContentError'
                    severity = 'minor'
                    value = 'Search failed'
                    text = 'Website available but pattern "%s" not found' % search_string
            elif rule and body:
                LOG.debug('Evaluating rule %s', rule)
                headers = check.get('headers', {})
                if 'Content-type' in headers and headers['Content-type'] == 'application/json':
                    try:
                        body = json.loads(body)
                    except ValueError as e:
                        LOG.error('Could not evaluate rule %s: %s', rule, e)
                try:
                    # NOTE: assumes request body in variable called 'body'.
                    # SECURITY: eval() executes arbitrary Python; rules must
                    # only ever come from trusted configuration.
                    rule_passed = eval(rule)
                except Exception as e:
                    LOG.error('Could not evaluate rule %s: %s', rule, e)
                else:
                    if not rule_passed:
                        event = 'HttpContentError'
                        severity = 'minor'
                        value = 'Rule failed'
                        text = 'Website available but rule evaluation failed (%s)' % rule

        LOG.debug('URL: %s, Status: %s (%s), Round-Trip Time: %dms -> %s',
                  check['url'], description, status, rtt, event)

        correlate = _HTTP_ALERTS
        group = 'Web'
        environment = check['environment']
        service = check['service']
        tags = check.get('tags', list())
        threshold_info = '%s : RT > %d RT > %d x %s' % (
            check['url'], warn_thold, crit_thold, check.get('count', 1))

        try:
            local_api.send_alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                attributes={
                    'thresholdInfo': threshold_info
                }
            )
        except Exception as e:
            LOG.warning('Failed to send alert: %s', e)

        if check_ssl:
            try:
                days_left = self._cert_time_left(check.get('url'))
            except (OSError, ValueError, KeyError) as e:
                # Covers connection/TLS failures (ssl.SSLError is an OSError),
                # unusable URLs and a certificate without 'notAfter'.
                LOG.warning('Failed to check ssl certificate for %s: %s', resource, e)
            else:
                if days_left < datetime.timedelta(days=0):
                    text = 'HTTPS cert for %s expired' % check['resource']
                    severity = 'critical'
                elif days_left <= datetime.timedelta(days=SSL_DAYS_PANIC):
                    text = 'HTTPS cert for {} will expire at {}'.format(
                        check['resource'], days_left)
                    severity = 'critical'
                elif days_left < datetime.timedelta(days=SSL_DAYS):
                    text = 'HTTPS cert for {} will expire at {}'.format(
                        check['resource'], days_left)
                    severity = 'major'
                else:
                    # Give the healthy case its own text instead of reusing
                    # the text of the HTTP alert sent above.
                    text = 'HTTPS cert for {} is valid for {} more day(s)'.format(
                        check['resource'], days_left.days)
                    severity = 'normal'
                try:
                    local_api.send_alert(
                        resource=resource,
                        event='HttpSSLChecker',
                        correlate=correlate,
                        group=group,
                        value='left %s day(s)' % days_left.days,
                        severity=severity,
                        environment=environment,
                        service=service,
                        text=text,
                        event_type='serviceAlert',
                        tags=tags,
                        attributes={
                            'thresholdInfo': threshold_info
                        }
                    )
                except Exception as e:
                    LOG.warning('Failed to send ssl alert: %s', e)

        LOG.info('%s check complete.', self.name)

    @classmethod
    def _classify(cls, status, reason, description, status_regex, rtt):
        """Map a poll result to an ``(event, severity, value, text)`` tuple."""
        if not status:
            return (
                'HttpConnectionError',
                'major',
                reason,
                'Error during connection or data transfer (timeout=%d).' % MAX_TIMEOUT,
            )

        if status_regex:
            value = '%s (%d)' % (description, status)
            if re.search(status_regex, str(status)):
                text = 'HTTP server responded with status code %d that matched "%s" in %dms' % (
                    status, status_regex, rtt)
                return 'HttpResponseRegexOK', 'normal', value, text
            text = 'HTTP server responded with status code %d that failed to match "%s"' % (
                status, status_regex)
            return 'HttpResponseRegexError', 'major', value, text

        for lowest, highest, event, severity in cls._STATUS_CLASSES:
            if lowest <= status <= highest:
                value = '%s (%d)' % (description, status)
                text = 'HTTP server responded with status code %d in %dms' % (
                    status, rtt)
                return event, severity, value, text

        return 'HttpUnknownError', 'warning', 'UNKNOWN', 'HTTP request resulted in an unhandled error.'

    @staticmethod
    def _body_matches(pattern, body):
        """Return True if any line of ``body`` matches the regex ``pattern``."""
        regex = re.compile(pattern)
        for line in body.split('\n'):
            if regex.search(line):
                LOG.debug('Regex: Found %s in %s', pattern, line)
                return True
        return False

    @classmethod
    def _cert_time_left(cls, url):
        """Return a ``timedelta`` until the certificate served for ``url`` expires.

        :raises ValueError: if the URL has no host name or an invalid port.
        :raises OSError: on connection or TLS failure.
        :raises KeyError: if the peer certificate has no ``notAfter`` field.
        """
        parsed = urlparse(url)
        # Use hostname, not netloc: netloc still carries any ':port' or
        # 'user:pass@' part and is not a valid connect/SNI target.
        host = parsed.hostname
        if not host:
            raise ValueError('no host name in URL %r' % (url,))
        port = parsed.port or 443

        context = ssl.create_default_context()
        # Both context managers guarantee the socket is closed on every path.
        with socket.socket(socket.AF_INET) as raw_sock, \
                context.wrap_socket(raw_sock, server_hostname=host) as conn:
            conn.settimeout(3.0)
            conn.connect((host, port))
            ssl_info = conn.getpeercert()

        expires = datetime.datetime.strptime(ssl_info['notAfter'], cls._SSL_DATE_FORMAT)
        return expires - datetime.datetime.utcnow()

    @staticmethod
    def urlmon(check):
        """Poll the URL described by ``check`` and return the result.

        Keys read from ``check``: ``url`` (required), ``post``, ``count``,
        ``headers``, ``username``, ``password``, ``realm``, ``uri``, ``proxy``.
        The request is retried, 10 seconds apart, until a status code is
        obtained or ``count`` attempts have been made.

        :returns: ``(status, reason, body, rtt)`` where ``status`` is the HTTP
            status code (``0`` or ``None`` if no HTTP response was obtained),
            ``reason`` describes a failure, ``body`` is the response body as
            bytes (``None`` if none was read) and ``rtt`` is the duration of
            the last attempt in milliseconds.
        """
        url = check['url']
        post = check.get('post', None)
        count = check.get('count', 1)
        # Work on a copy so the shared check configuration is not modified.
        headers = dict(check.get('headers') or {})
        username = check.get('username', None)
        password = check.get('password', None)
        realm = check.get('realm', None)
        uri = check.get('uri', None)
        proxy = check.get('proxy', False)

        status = 0
        reason = None
        body = None
        rtt = 0

        if 'User-agent' not in headers:
            headers['User-agent'] = 'alert-urlmon/%s' % (__version__)

        # Build a private opener.  Installing it globally (install_opener)
        # would let concurrent workers use each other's proxy/credentials.
        handlers = []
        if username and password:
            auth_handler = HTTPBasicAuthHandler()
            auth_handler.add_password(realm=realm,
                                      uri=uri,
                                      user=username,
                                      passwd=password)
            handlers.append(auth_handler)
        if proxy:
            handlers.append(ProxyHandler(proxy))
        opener = build_opener(*handlers)

        # MAX_TIMEOUT is in milliseconds; urllib expects seconds.
        timeout = MAX_TIMEOUT / 1000.0

        while True:
            count -= 1
            start = time.time()

            try:
                if post:
                    # Request data must be bytes, not str.
                    req = Request(url, json.dumps(post).encode('utf-8'), headers=headers)
                else:
                    req = Request(url, headers=headers)
                with opener.open(req, None, timeout) as response:
                    # Read first: a transfer error must not leave a status
                    # code behind with no body.
                    payload = response.read()
                    status = response.getcode()
                    body = payload
            except ValueError as e:
                LOG.error('Request failed: %s', e)
                reason = str(e)
            except URLError as e:
                # HTTPError is a URLError that has both 'code' and 'reason',
                # so 'code' must be tested first to recognise HTTP responses.
                if hasattr(e, 'code'):
                    reason = None
                    status = e.code  # pylint: disable=no-member
                elif hasattr(e, 'reason'):
                    reason = str(e.reason)
                    status = None
            except Exception as e:
                LOG.warning('Unexpected error: %s', e)
                reason = str(e)

            rtt = int((time.time() - start) * 1000)  # round-trip time

            # Stop on any HTTP/S response or when attempts are used up
            # ('<= 0' also ends the loop for a configured count of 0).
            if status or count <= 0:
                break
            time.sleep(10)

        return status, reason, body, rtt
2014-06-15 00:54:46 +00:00
2023-03-20 22:39:34 +00:00
class UrlmonDaemon:
    """Scheduler that queues every configured check once per ``LOOP_EVERY``.

    ``run()`` starts ``SERVER_THREADS`` worker threads, then loops: queue all
    ``settings.checks``, send a heartbeat, sleep, and report the queue length
    as an alert.  ``KeyboardInterrupt``/``SystemExit`` ends the loop and shuts
    the workers down.
    """

    def __init__(self):
        self.shuttingdown = False

    def run(self):
        """Run the scheduling loop until interrupted, then stop the workers."""
        self.running = True

        self.queue = queue.Queue()
        self.api = Client(endpoint=settings.ENDPOINT, key=settings.API_KEY)

        # Used for both the heartbeat and the queue-length alert, so it is
        # bound once here rather than inside the heartbeat's try block.
        origin = '{}/{}'.format('urlmon', platform.uname()[1])

        # Start worker threads
        LOG.debug('Starting %s worker threads...', SERVER_THREADS)
        workers = []  # only the threads that actually started
        for i in range(SERVER_THREADS):
            w = WorkerThread(self.queue, self.api)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            workers.append(w)
            LOG.info('Started worker thread: %s', w.name)

        while not self.shuttingdown:
            try:
                for check in settings.checks:
                    self.queue.put((check, time.time()))

                LOG.debug('Send heartbeat...')
                try:
                    self.api.heartbeat(
                        origin, tags=[__version__], timeout=3600)
                except Exception as e:
                    LOG.warning('Failed to send heartbeat: %s', e)

                time.sleep(LOOP_EVERY)
                queue_length = self.queue.qsize()
                LOG.info('URL check queue length is %d', queue_length)

                # A long backlog means the workers cannot keep up.
                severity = 'warning' if queue_length > 100 else 'ok'
                try:
                    self.api.send_alert(
                        resource=origin,
                        event='big queue for http checks',
                        value=queue_length,
                        severity=severity,
                        text='URL check queue length is %d' % queue_length,
                        event_type='serviceAlert',
                    )
                except Exception as e:
                    LOG.warning('Failed to send alert: %s', e)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # One None sentinel per running worker, then wait for every worker
        # (not just the last one created) to finish.
        for _ in workers:
            self.queue.put(None)
        for w in workers:
            w.join()
2014-06-15 10:49:06 +00:00
def main():
    """Run the URL monitor daemon; exit with status 1 on error or Ctrl-C."""
    log = logging.getLogger('alerta.urlmon')
    try:
        UrlmonDaemon().run()
    except KeyboardInterrupt:
        log.warning('Exiting alerta urlmon.')
        sys.exit(1)
    except Exception as err:
        # Log the failure together with its traceback before exiting.
        log.error(err, exc_info=True)
        sys.exit(1)
2014-06-15 10:49:06 +00:00
2023-03-20 22:39:34 +00:00
2014-06-15 10:49:06 +00:00
# Start the monitor only when this file is executed as a script.
if __name__ == '__main__':
    main()