container_watchdog.py (forked from hkotka/docker-container-watchdog)
"""Container watchdog: polls running Docker containers, restarts unhealthy ones, and sends Slack/email notifications."""
# pylint: disable = broad-except
import sys
import logging
import time
import json
import os
import re
import smtplib
from email.message import EmailMessage
import requests
import docker
# Set logging options and variables
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
polling_interval_after_restart: int = int(os.getenv('POLLING_INTERVAL_AFTER_RESTART', '600'))
polling_interval: int = int(os.getenv('POLLING_INTERVAL', '20'))
docker_host: str = os.getenv('DOCKER_HOSTMACHINE', 'UNKNOWN')
slack_webhook_url: str = os.getenv('SLACK_WEBHOOK_URL', '')
email_sender: str = os.getenv('EMAIL_SENDER', '')
email_receiver: str = os.getenv('EMAIL_RECEIVER', '')
smtp_server: str = os.getenv('SMTP_SERVER', '')
restarted_containers: list = []
notification_content: dict = {}
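# Illustrative configuration (example values only, not from this repo): the watchdog would
# typically be started with environment settings such as
#   POLLING_INTERVAL=20 POLLING_INTERVAL_AFTER_RESTART=600 DOCKER_HOSTMACHINE=myhost
#   SLACK_WEBHOOK_URL=https://hooks.slack.com/services/XXX/YYY/ZZZ
#   EMAIL_SENDER=watchdog@example.com EMAIL_RECEIVER=ops@example.com SMTP_SERVER=smtp.example.com
# Leaving the Slack or email settings empty simply disables that notification channel.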
# Test and establish connection to docker socket
try:
    CLIENT = docker.from_env()
    CLIENT.version()
    logging.info("Connection to Docker socket OK")
except Exception as err:
    logging.fatal("%s", err)
    sys.exit(1)
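# Note: docker.from_env() honours the standard Docker environment settings (DOCKER_HOST and
# the TLS variables) and otherwise falls back to the local daemon socket, so when the watchdog
# runs as a container it is assumed that /var/run/docker.sock is mounted into it.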


def send_slack_message(content) -> None:
    """Post the notification payload to the configured Slack webhook, if one is set."""
    if slack_webhook_url != "":
        try:
            # A request timeout keeps a hung webhook call from blocking the polling loop.
            requests.post(slack_webhook_url, data=json.dumps(content),
                          headers={'Content-Type': 'application/json'}, timeout=10)
            logging.info("Message sent to Slack webhook: %s", content['text'])
        except requests.exceptions.RequestException as err:
            logging.error("%s", err)
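# The webhook call above sends a minimal Slack-compatible payload, roughly (illustrative):
#   {"text": "[Container watchdog]: Container restarted\n\tHost: [ *_myhost_* ] ..."}
# Slack's incoming webhooks render the *_..._* markers as bold/italic mrkdwn.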


def send_smtp_message(content) -> None:
    """Send the notification text as a plain-text email, if SMTP settings are configured."""
    if email_receiver != "" and smtp_server != "":
        email_content: str = re.sub('[^ :A-Za-z0-9]+', '', content)
        email_message = EmailMessage()
        email_message.set_content(email_content)
        email_message['Subject'] = 'Container Watchdog Alert notification'
        email_message['From'] = email_sender
        email_message['To'] = email_receiver
        try:
            # Connect inside the try block so an unreachable SMTP server cannot crash the polling loop.
            with smtplib.SMTP(smtp_server, 25, timeout=40) as mail:
                mail.send_message(email_message)
            logging.info("Email sent to %s with content: %s", email_receiver, email_content)
        except Exception as err:
            logging.error("%s", err)
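# Assumptions in the email path: an unauthenticated SMTP relay listening on port 25 and a
# plain-text body; the regex above strips everything except spaces, colons and alphanumerics
# (including the Slack markup and newlines) before sending.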


def get_container_health_status(container_object) -> str:
    """Return the container's health status, or 'nokey' if it has no healthcheck."""
    try:
        health_status = container_object.attrs['State']['Health']['Status']
    except KeyError:
        health_status = 'nokey'
    return health_status


def get_container_health_log(container_object) -> str:
    """Return the output of the latest healthcheck run, 'nokey' if there is no healthcheck, or 'empty' if nothing is logged yet."""
    try:
        health_log = container_object.attrs['State']['Health']['Log'][-1]['Output']
    except KeyError:
        health_log = 'nokey'
    except IndexError:
        health_log = 'empty'
    return health_log
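# Both helpers read the healthcheck section of the container's inspect data; the shape is
# roughly (illustrative):
#   attrs['State']['Health'] == {'Status': 'unhealthy', 'FailingStreak': 3,
#                                'Log': [{'ExitCode': 1, 'Output': '...', ...}]}
# Containers defined without a HEALTHCHECK have no 'Health' key at all, hence the 'nokey' fallback.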


def restart_container(container_object) -> None:
    """Restart the given container, record it as restarted, and build the Slack/email notification text."""
    try:
        container_object.restart()
        logging.info("Restarted container: %s", container_object.name)
        notification_content['text'] = ("[Container watchdog]: Container restarted\n"
                                        "\tHost: [ *_{0}_* ]\n"
                                        "\tContainer: [ *_{1}_* ]\n"
                                        "\tState: [ *_{2}_* ]\n"
                                        "\tHealthstatus: [ *_{3}_* ]\n"
                                        "\tOutput: [ _{4}_ ]".format(docker_host, container_object.name, container_status, container_health_status, container_health_log))
        if container_object.short_id not in restarted_containers:
            restarted_containers.append(container_object.short_id)
    except Exception as err:
        logging.fatal("%s", err)
        notification_content['text'] = ("[Container watchdog]: Docker daemon failed to restart container *{0}* on hostmachine *{1}*"
                                        " with error message: _{2}_".format(container_object.name, docker_host, err))


def container_recovered(container_object) -> None:
    """Log and build the notification for a previously restarted container that is healthy again, then clear it from the restarted list."""
    logging.info("Container %s has recovered and is now healthy!", container_object.name)
    notification_content['text'] = ("[Container watchdog]: Container has recovered\n"
                                    "\tHost: [ *_{0}_* ]\n"
                                    "\tContainer: [ *_{1}_* ]\n"
                                    "\tState: [ *_{2}_* ]\n"
                                    "\tHealthstatus: [ *_{3}_* ]".format(docker_host, container_object.name, container_status, container_health_status))
    restarted_containers.remove(container_object.short_id)
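# restart_container() and container_recovered() format their messages from the module-level
# container_status / container_health_status / container_health_log values set in the polling
# loop below; the email sender later strips the Slack markup from the same text.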


# Run the loop indefinitely, polling every $POLLING_INTERVAL seconds normally, or every
# $POLLING_INTERVAL_AFTER_RESTART seconds after the watchdog has restarted a container.
while True:
    restart_status: bool = False
    container_list: list = CLIENT.containers.list()
    for container in container_list:
        container_status = container.status
        container_health_status: str = get_container_health_status(container)
        # Check if the container was restarted previously and is now healthy:
        # send Slack/email notifications and remove it from the list of restarted containers.
        if container.short_id in restarted_containers and container_health_status == 'healthy':
            container_recovered(container)
            send_slack_message(notification_content)
            send_smtp_message(notification_content['text'])
        # If the container reports an unhealthy status, restart it and send Slack/email notifications.
        elif container_health_status == 'unhealthy':
            container_health_log: str = get_container_health_log(container)
            logging.error("Found container in unhealthy state! Container: '%s' has health status: '%s' and container status: '%s' with output log: %s",
                          container.name, container_health_status, container_status, container_health_log)
            restart_container(container)
            send_slack_message(notification_content)
            send_smtp_message(notification_content['text'])
            restart_status = True
        logging.debug('%s - %s - %s', container.name, container_health_status, container_status)
    # Wait before polling again, longer if a container was restarted in this round.
    if restart_status:
        logging.info("Waiting %s seconds until next polling, because container was restarted", polling_interval_after_restart)
        time.sleep(polling_interval_after_restart)
    else:
        logging.info("All containers are in healthy state!")
        time.sleep(polling_interval)
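# Deployment sketch (assumption, not part of this file): the watchdog is usually run as a
# container itself with the host's Docker socket mounted, e.g. something like
#   docker run -d -v /var/run/docker.sock:/var/run/docker.sock \
#       -e DOCKER_HOSTMACHINE=$(hostname) -e SLACK_WEBHOOK_URL=... <watchdog-image>
# where <watchdog-image> stands for whatever image this script is packaged into.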