# alerta-contrib/integrations/urlmon/urlmon.py
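
"""
URL monitor for alerta: polls a list of configured web endpoints on a
fixed interval, classifies each HTTP response into an alerta event and
severity, and forwards the resulting alerts to an alerta API endpoint.
"""
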
import os
import sys
import time
import urllib2
import json
import threading
import Queue
import re
import logging

from alerta.api import ApiClient
from alerta.alert import Alert
from alerta.heartbeat import Heartbeat

from BaseHTTPServer import BaseHTTPRequestHandler as BHRH

HTTP_RESPONSES = dict([(k, v[0]) for k, v in BHRH.responses.items()])

# Add missing responses
HTTP_RESPONSES[102] = 'Processing'
HTTP_RESPONSES[207] = 'Multi-Status'
HTTP_RESPONSES[422] = 'Unprocessable Entity'
HTTP_RESPONSES[423] = 'Locked'
HTTP_RESPONSES[424] = 'Failed Dependency'
HTTP_RESPONSES[506] = 'Variant Also Negotiates'
HTTP_RESPONSES[507] = 'Insufficient Storage'
HTTP_RESPONSES[510] = 'Not Extended'

_HTTP_ALERTS = [
    'HttpConnectionError',
    'HttpServerError',
    'HttpClientError',
    'HttpRedirection',
    'HttpContentError',
    'HttpResponseSlow',
    'HttpResponseOK',
    'HttpResponseRegexError',
    'HttpResponseRegexOK'
]

__version__ = '3.3.0'

LOOP_EVERY = 60  # seconds
#TARGET_FILE = 'urlmon.targets'  # FIXME -- or settings.py ???
SERVER_THREADS = 20
SLOW_WARNING_THRESHOLD = 5000  # ms
SLOW_CRITICAL_THRESHOLD = 10000  # ms
MAX_TIMEOUT = 15000  # ms

import settings
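
# The settings module above is expected to provide ENDPOINT, API_KEY and a
# list of checks. A minimal sketch with illustrative values only (the key
# names are the ones this script actually reads; the values are assumptions):
#
#     ENDPOINT = 'http://localhost:8080'
#     API_KEY = 'demo-key'
#     checks = [
#         {
#             'resource': 'www.example.com',     # alert resource name
#             'url': 'http://www.example.com/',  # URL to poll
#             'environment': 'Production',
#             'service': ['Website'],
#             'search': 'Welcome',               # optional body regex
#             'warning': 2000,                   # optional slow thresholds (ms)
#             'critical': 5000,
#         },
#     ]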

LOG = logging.getLogger("alerta.urlmon")
logging.basicConfig(format="%(asctime)s - %(name)s: %(levelname)s - %(message)s", level=logging.DEBUG)

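
# Worker pool: UrlmonDaemon enqueues (check, timestamp) tuples once per
# loop and SERVER_THREADS WorkerThread instances consume them; a None
# sentinel on the queue tells a worker to exit.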
class WorkerThread(threading.Thread):

    def __init__(self, queue, api):

        threading.Thread.__init__(self)
        LOG.debug('Initialising %s...', self.getName())

        self.queue = queue  # internal queue
        self.api = api      # send alerts api

    def run(self):

        while True:
            LOG.debug('Waiting on input queue...')
            try:
                check, queue_time = self.queue.get()
            except TypeError:
                # unpacking the None shutdown sentinel raises TypeError
                LOG.info('%s is shutting down.', self.getName())
                break

            if time.time() - queue_time > LOOP_EVERY:
                LOG.warning('URL request for %s to %s expired after %d seconds.', check['resource'], check['url'],
                            int(time.time() - queue_time))
                self.queue.task_done()
                continue

            resource = check['resource']
            LOG.info('%s polling %s...', self.getName(), resource)
            status, reason, body, rtt = self.urlmon(check)

            status_regex = check.get('status_regex', None)
            search_string = check.get('search', None)
            rule = check.get('rule', None)
            warn_thold = check.get('warning', SLOW_WARNING_THRESHOLD)
            crit_thold = check.get('critical', SLOW_CRITICAL_THRESHOLD)

            try:
                description = HTTP_RESPONSES[status]
            except KeyError:
                description = 'undefined'

            if not status:
                event = 'HttpConnectionError'
                severity = 'major'
                value = reason
                text = 'Error during connection or data transfer (timeout=%d).' % MAX_TIMEOUT
            elif status_regex:
                if re.search(status_regex, str(status)):
                    event = 'HttpResponseRegexOK'
                    severity = 'normal'
                    value = '%s (%d)' % (description, status)
                    text = 'HTTP server responded with status code %d that matched "%s" in %dms' % (status, status_regex, rtt)
                else:
                    event = 'HttpResponseRegexError'
                    severity = 'major'
                    value = '%s (%d)' % (description, status)
                    text = 'HTTP server responded with status code %d that failed to match "%s"' % (status, status_regex)
            elif 100 <= status <= 199:
                event = 'HttpInformational'
                severity = 'normal'
                value = '%s (%d)' % (description, status)
                text = 'HTTP server responded with status code %d in %dms' % (status, rtt)
            elif 200 <= status <= 299:
                event = 'HttpResponseOK'
                severity = 'normal'
                value = '%s (%d)' % (description, status)
                text = 'HTTP server responded with status code %d in %dms' % (status, rtt)
            elif 300 <= status <= 399:
                event = 'HttpRedirection'
                severity = 'minor'
                value = '%s (%d)' % (description, status)
                text = 'HTTP server responded with status code %d in %dms' % (status, rtt)
            elif 400 <= status <= 499:
                event = 'HttpClientError'
                severity = 'minor'
                value = '%s (%d)' % (description, status)
                text = 'HTTP server responded with status code %d in %dms' % (status, rtt)
            elif 500 <= status <= 599:
                event = 'HttpServerError'
                severity = 'major'
                value = '%s (%d)' % (description, status)
                text = 'HTTP server responded with status code %d in %dms' % (status, rtt)
            else:
                event = 'HttpUnknownError'
                severity = 'warning'
                value = 'UNKNOWN'
                text = 'HTTP request resulted in an unhandled error.'

            if event in ['HttpResponseOK', 'HttpResponseRegexOK']:
                if rtt > crit_thold:
                    event = 'HttpResponseSlow'
                    severity = 'critical'
                    value = '%dms' % rtt
                    text = 'Website available but exceeding critical RT thresholds of %dms' % crit_thold
                elif rtt > warn_thold:
                    event = 'HttpResponseSlow'
                    severity = 'warning'
                    value = '%dms' % rtt
                    text = 'Website available but exceeding warning RT thresholds of %dms' % warn_thold

            if search_string and body:
                LOG.debug('Searching for %s', search_string)
                found = False
                for line in body.split('\n'):
                    m = re.search(search_string, line)
                    if m:
                        found = True
                        LOG.debug("Regex: Found %s in %s", search_string, line)
                        break
                if not found:
                    event = 'HttpContentError'
                    severity = 'minor'
                    value = 'Search failed'
                    text = 'Website available but pattern "%s" not found' % search_string
            elif rule and body:
                LOG.debug('Evaluating rule %s', rule)
                headers = check.get('headers', {})
                if 'Content-type' in headers and headers['Content-type'] == 'application/json':
                    try:
                        body = json.loads(body)
                    except ValueError, e:
                        LOG.error('Could not evaluate rule %s: %s', rule, e)
                try:
                    eval(rule)  # NOTE: assumes the rule references the response body via a variable called 'body'
                except (SyntaxError, NameError, ZeroDivisionError), e:
                    LOG.error('Could not evaluate rule %s: %s', rule, e)
                except Exception, e:
                    LOG.error('Could not evaluate rule %s: %s', rule, e)
                else:
                    if not eval(rule):
                        event = 'HttpContentError'
                        severity = 'minor'
                        value = 'Rule failed'
                        text = 'Website available but rule evaluation failed (%s)' % rule
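
            # The rule evaluated above is arbitrary Python, so rules must
            # only ever come from the trusted local settings file.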

            LOG.debug("URL: %s, Status: %s (%s), Round-Trip Time: %dms -> %s",
                      check['url'], description, status, rtt, event)

            correlate = _HTTP_ALERTS
            group = 'Web'
            environment = check['environment']
            service = check['service']
            tags = check.get('tags', list())
            threshold_info = "%s : RT > %d RT > %d x %s" % (check['url'], warn_thold, crit_thold, check.get('count', 1))

            urlmonAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                attributes={
                    'thresholdInfo': threshold_info
                }
            )

            try:
                self.api.send(urlmonAlert)
            except Exception, e:
                LOG.warning('Failed to send alert: %s', e)

            self.queue.task_done()
            LOG.info('%s check complete.', self.getName())

        # acknowledge the None shutdown sentinel so a queue.join() cannot block
        self.queue.task_done()

    @staticmethod
    def urlmon(check):

        url = check['url']
        post = check.get('post', None)
        count = check.get('count', 1)
        headers = check.get('headers', {})
        username = check.get('username', None)
        password = check.get('password', None)
        realm = check.get('realm', None)
        uri = check.get('uri', None)
        proxy = check.get('proxy', False)

        status = 0
        reason = None
        body = None
        rtt = 0

        while True:

            count -= 1
            start = time.time()

            if username and password:
                auth_handler = urllib2.HTTPBasicAuthHandler()
                auth_handler.add_password(realm=realm,
                                          uri=uri,
                                          user=username,
                                          passwd=password)
                if proxy:
                    opener = urllib2.build_opener(auth_handler, urllib2.ProxyHandler(proxy))
                else:
                    opener = urllib2.build_opener(auth_handler)
            else:
                if proxy:
                    opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
                else:
                    opener = urllib2.build_opener()
            urllib2.install_opener(opener)

            if 'User-agent' not in headers:
                headers['User-agent'] = 'alert-urlmon/%s Python-urllib/%s' % (__version__, urllib2.__version__)

            try:
                if post:
                    req = urllib2.Request(url, json.dumps(post), headers=headers)
                else:
                    req = urllib2.Request(url, headers=headers)
                response = urllib2.urlopen(req, None, MAX_TIMEOUT / 1000.0)  # urlopen() timeout is in seconds, MAX_TIMEOUT in ms
            except ValueError, e:
                LOG.error('Request failed: %s', e)
            except urllib2.URLError, e:
                if hasattr(e, 'reason'):
                    reason = str(e.reason)
                    status = None
                elif hasattr(e, 'code'):
                    reason = None
                    status = e.code
            except Exception, e:
                LOG.warning('Unexpected error: %s', e)
            else:
                status = response.getcode()
                body = response.read()

            rtt = int((time.time() - start) * 1000)  # round-trip time in ms

            if status:  # return result if any HTTP/S response is received
                break
            if not count:
                break
            time.sleep(10)

        return status, reason, body, rtt


class UrlmonDaemon(object):

    def __init__(self):

        self.shuttingdown = False

    def run(self):

        self.running = True

        self.queue = Queue.Queue()
        self.api = ApiClient(endpoint=settings.ENDPOINT, key=settings.API_KEY)

        # Start worker threads
        LOG.debug('Starting %s worker threads...', SERVER_THREADS)
        for i in range(SERVER_THREADS):
            w = WorkerThread(self.queue, self.api)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for check in settings.checks:
                    self.queue.put((check, time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(tags=[__version__])
                try:
                    self.api.send(heartbeat)
                except Exception, e:
                    LOG.warning('Failed to send heartbeat: %s', e)

                time.sleep(LOOP_EVERY)
                LOG.info('URL check queue length is %d', self.queue.qsize())
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # one None sentinel per worker tells each thread to exit
        for i in range(SERVER_THREADS):
            self.queue.put(None)
        w.join()


def main():

    LOG = logging.getLogger("alerta.urlmon")

    try:
        UrlmonDaemon().run()
    except KeyboardInterrupt:
        LOG.warning("Exiting alerta urlmon.")
        sys.exit(1)
    except Exception as e:
        LOG.error(e, exc_info=1)
        sys.exit(1)


if __name__ == '__main__':
    main()