From 9bbd0364d044ba90ff2fbb4d8c20c5410c7da086 Mon Sep 17 00:00:00 2001 From: Marc Cataford Date: Thu, 7 Mar 2024 22:21:57 -0500 Subject: [PATCH] feat(healthchecks): ping services periodically to check for availability + post results to Discord --- .gitignore | 8 ++ services/healthcheck/.dockerignore | 1 + services/healthcheck/.python-version | 1 + services/healthcheck/Dockerfile | 15 ++ services/healthcheck/README.md | 21 +++ services/healthcheck/build.sh | 5 + services/healthcheck/constants.sh | 5 + services/healthcheck/healthcheck/__init__.py | 0 services/healthcheck/healthcheck/conftest.py | 42 ++++++ services/healthcheck/healthcheck/main.py | 55 +++++++ services/healthcheck/healthcheck/main_test.py | 52 +++++++ services/healthcheck/healthcheck/tasks.py | 41 ++++++ .../healthcheck/healthcheck/tasks_test.py | 113 +++++++++++++++ services/healthcheck/healthcheck/use_cases.py | 97 +++++++++++++ .../healthcheck/healthcheck/use_cases_test.py | 14 ++ services/healthcheck/pyproject.toml | 34 +++++ services/healthcheck/requirements.txt | 58 ++++++++ services/healthcheck/requirements_dev.txt | 134 ++++++++++++++++++ services/healthcheck/script/bootstrap.sh | 9 ++ services/healthcheck/script/lock-deps.sh | 6 + services/healthcheck/start.sh | 10 ++ services/healthcheck/stop.sh | 5 + 22 files changed, 726 insertions(+) create mode 100644 services/healthcheck/.dockerignore create mode 100644 services/healthcheck/.python-version create mode 100644 services/healthcheck/Dockerfile create mode 100644 services/healthcheck/README.md create mode 100755 services/healthcheck/build.sh create mode 100644 services/healthcheck/constants.sh create mode 100644 services/healthcheck/healthcheck/__init__.py create mode 100644 services/healthcheck/healthcheck/conftest.py create mode 100644 services/healthcheck/healthcheck/main.py create mode 100644 services/healthcheck/healthcheck/main_test.py create mode 100644 services/healthcheck/healthcheck/tasks.py create mode 100644 
services/healthcheck/healthcheck/tasks_test.py create mode 100644 services/healthcheck/healthcheck/use_cases.py create mode 100644 services/healthcheck/healthcheck/use_cases_test.py create mode 100644 services/healthcheck/pyproject.toml create mode 100644 services/healthcheck/requirements.txt create mode 100644 services/healthcheck/requirements_dev.txt create mode 100755 services/healthcheck/script/bootstrap.sh create mode 100755 services/healthcheck/script/lock-deps.sh create mode 100755 services/healthcheck/start.sh create mode 100644 services/healthcheck/stop.sh diff --git a/.gitignore b/.gitignore index 882118e..fed8877 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,11 @@ env.yml # Taskfile binary via bootstrap.sh bin + +**/config.json + +# Python artifacts +**pycache** +**/*.pyc +**/.venv/** +**.egg-info** diff --git a/services/healthcheck/.dockerignore b/services/healthcheck/.dockerignore new file mode 100644 index 0000000..fc3df9e --- /dev/null +++ b/services/healthcheck/.dockerignore @@ -0,0 +1 @@ +**/*_test.py diff --git a/services/healthcheck/.python-version b/services/healthcheck/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/services/healthcheck/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/services/healthcheck/Dockerfile b/services/healthcheck/Dockerfile new file mode 100644 index 0000000..294dd68 --- /dev/null +++ b/services/healthcheck/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.12 AS base + +WORKDIR /app + +COPY requirements.txt . 
+ +RUN pip install -r requirements.txt + +FROM base AS app + +ENV HEALTHCHECK_CONFIG_PATH "/app/config.json" + +COPY healthcheck ./healthcheck + +CMD python -m uvicorn --host "0.0.0.0" healthcheck.main:app diff --git a/services/healthcheck/README.md b/services/healthcheck/README.md new file mode 100644 index 0000000..2ec9be8 --- /dev/null +++ b/services/healthcheck/README.md @@ -0,0 +1,21 @@ +# Healthcheck reporter + +Periodically checks if resources are reachable and reports via a configurable webhook. + +## Configuration + +A `config.json` file should be provided and follow the schema outlined in `use_cases.Configuration`: + +```json +{ + "endpoints": { + "service-a": "https://service-a.com", + "service-b": "http://service-b:8080", + ... + }, + "webhook_url": "https://my-webhook.com/", + "check_interval": 3600 +} +``` + +Every `check_interval` seconds, the application will attempt to reach each of the services and post a message summarizing the results to `webhook_url`. diff --git a/services/healthcheck/build.sh b/services/healthcheck/build.sh new file mode 100755 index 0000000..e5c5272 --- /dev/null +++ b/services/healthcheck/build.sh @@ -0,0 +1,5 @@ +#!/usr/bin/bash + +source ./constants.sh + +podman build . 
"""
Shared fixtures.
"""

import json

import pytest

from healthcheck.use_cases import load_configuration


@pytest.fixture
def anyio_backend():
    """Sets the default anyio backend."""
    return "asyncio"


@pytest.fixture(name="mock_configuration")
def f_mock_configuration():
    """Sample configuration, shaped like the service's JSON config file."""
    return {
        "endpoints": {"test-service": "http://test.local"},
        "webhook_url": "http://webhook.local",
        "check_interval": 0.1,
    }


@pytest.fixture(name="set_up_mock_configuration")
def f_set_up_mock_configuration(monkeypatch, tmp_path, mock_configuration):
    """
    Initializes a file with the sample configuration data and sets the
    HEALTHCHECK_CONFIG_PATH variable to point to it.

    The fixture yields a callable so that each test decides when the
    configuration gets installed. The callable returns the path of the
    file it wrote.
    """

    def _fixture():
        config_path = tmp_path / "config.json"

        config_path.write_text(json.dumps(mock_configuration), encoding="utf8")

        monkeypatch.setenv("HEALTHCHECK_CONFIG_PATH", str(config_path))

        # load_configuration is memoised with functools.cache: without
        # dropping the cached value, every test after the first one would
        # keep reading the configuration loaded by an earlier test and
        # ignore the file written above.
        load_configuration.cache_clear()

        return config_path

    return _fixture
"""
Healthcheck reporting service.

This periodically checks if certain URLs respond and
reports on those responses.
"""

import asyncio
import contextlib
import logging

import fastapi

from healthcheck.tasks import report_on_statuses
from healthcheck.use_cases import check_all_statuses, load_configuration

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

# Strong references to the running tasks. The event loop itself only keeps
# weak references, so a task nobody else references may be collected early.
background_tasks = set()


@contextlib.asynccontextmanager
async def lifespan(_):
    """
    Starts and stops asynchronous tasks on application lifecycle.

    On startup the reporting loop is scheduled as a background task. On
    shutdown (including shutdown caused by an error) every background task
    is cancelled and awaited so none of them is left pending.
    """
    status_checks = asyncio.create_task(report_on_statuses())
    background_tasks.add(status_checks)
    logger.info("Started reporting loop.")

    try:
        yield
    finally:
        for task in background_tasks:
            task.cancel()

        # cancel() only *requests* cancellation; awaiting the tasks lets them
        # unwind before the loop closes. return_exceptions=True keeps the
        # resulting CancelledError (or an earlier task failure) from
        # propagating out of the shutdown sequence.
        await asyncio.gather(*background_tasks, return_exceptions=True)
        background_tasks.clear()


app = fastapi.FastAPI(lifespan=lifespan)


@app.get("/")
def alive():
    """Is the application alive? Responds with the JSON value 200."""
    return 200


@app.get("/config")
def configuration():
    """Returns the loaded configuration as a JSON document."""
    return fastapi.responses.JSONResponse(load_configuration().model_dump())


@app.get("/status")
def check_status():
    """
    Checks that the configured endpoints respond.

    Returns a mapping of service name to a boolean health flag.
    """
    config = load_configuration()

    return check_all_statuses(config.endpoints)
"""
Defines asynchronous tasks loaded on startup.
"""

import asyncio
import logging
import typing

from healthcheck.use_cases import check_all_statuses, load_configuration, post_message

logger = logging.getLogger(__name__)


async def report_on_statuses(*, max_iterations: typing.Optional[int] = None):
    """
    Reports on all registered services.

    Each round pings every configured endpoint, logs a one-line-per-service
    summary, posts that summary to the configured webhook and then waits
    `check_interval` seconds.

    max_iterations caps the number of rounds; None (the default) keeps the
    loop running until the task is cancelled.

    Raises whatever load_configuration raises if the configuration cannot
    be loaded. A failure to post to the webhook is logged and does not stop
    the loop.
    """

    config = load_configuration()

    iterations = 0

    while max_iterations is None or iterations < max_iterations:
        # The health checks use blocking HTTP calls; running them in a worker
        # thread keeps the event loop (and the HTTP API) responsive.
        statuses = await asyncio.to_thread(check_all_statuses, config.endpoints)

        message = "\n".join(
            (
                f"✅ {service} is healthy."
                if status
                else f"🔥 {service} is not responding normally."
            )
            for service, status in statuses.items()
        )

        logger.info(message)

        try:
            await asyncio.to_thread(post_message, config.webhook_url, message)
        except Exception:  # pylint: disable=broad-except
            # One failed delivery must not end monitoring for good.
            logger.exception("Failed to post the status report to the webhook.")

        iterations += 1

        # time.sleep() here would freeze the whole event loop for the
        # duration of the interval and make the task impossible to cancel.
        await asyncio.sleep(config.check_interval)
await report_on_statuses(max_iterations=1) + + requests_captured = httpx_mock.get_requests() + + post_requests = tuple( + request for request in requests_captured if request.method == "POST" + ) + + # Only one webhook request is made. + assert len(post_requests) == 1 + + webhook_post = post_requests[0] + + # The request goes to the webhook URL. + assert str(webhook_post.url) == mock_configuration["webhook_url"] + + +@pytest.mark.parametrize( + "status_code, expected", + [[200, "is healthy"], [400, "is not responding normally"]], + ids=["healthy", "unhealthy"], +) +async def test_report_on_statuses_posts_message_describing_service_status( + status_code, expected, httpx_mock, mock_configuration, set_up_mock_configuration +): + """Each message describes whether the service is healthy or not.""" + mock_url = mock_configuration["endpoints"]["test-service"] + + set_up_mock_configuration() + + httpx_mock.add_response(url=mock_url, status_code=status_code) + httpx_mock.add_response(url=mock_configuration["webhook_url"]) + + await report_on_statuses(max_iterations=1) + + requests_captured = httpx_mock.get_requests() + + post_requests = tuple( + request for request in requests_captured if request.method == "POST" + ) + + webhook_post = post_requests[0] + + posted_message = json.loads(webhook_post.content) + + assert expected in posted_message["content"] + + +@pytest.mark.parametrize("iterations", [1, 10]) +async def test_report_on_statuses_runs_at_most_n_times_if_max_iterations_specified( + iterations, httpx_mock, mock_configuration, set_up_mock_configuration +): + """Iterations can be capped to a certain count.""" + mock_url = mock_configuration["endpoints"]["test-service"] + + set_up_mock_configuration() + + httpx_mock.add_response(url=mock_url) + httpx_mock.add_response(url=mock_configuration["webhook_url"]) + + await report_on_statuses(max_iterations=iterations) + + requests_captured = httpx_mock.get_requests() + + # Each iteration pings once, posts once. 
"""
Business logic for endpoints and tasks.
"""

import functools
import json
import logging
import os
import pathlib

import httpx
import pydantic

logger = logging.getLogger(__name__)


class Configuration(pydantic.BaseModel):
    """Service configuration, mirroring the layout of the JSON config file."""

    # Service name -> URL to ping.
    endpoints: dict[str, str]
    # Webhook URL the status summaries are posted to.
    webhook_url: str
    # Pause between two reporting rounds, in seconds.
    check_interval: float


@functools.cache
def load_configuration() -> Configuration:
    """
    Loads configuration from disk.

    The result is memoised: the file is read on the first call only, and
    `load_configuration.cache_clear()` forces a re-read.

    If the HEALTHCHECK_CONFIG_PATH env variable is not set, raises.
    If the configuration file does not exist, raises.
    If the configuration file is not valid json, raises.
    If the configuration data doesn't satisfy the Configuration type, raises.
    """
    raw_config_path = os.getenv("HEALTHCHECK_CONFIG_PATH")

    if not raw_config_path:
        raise RuntimeError(
            "No configuration path provided. HEALTHCHECK_CONFIG_PATH must be set."
        )

    config_path = pathlib.Path(raw_config_path)

    if not config_path.exists():
        raise RuntimeError(f"Configuration file does not exist at {config_path}")

    config_raw = config_path.read_text(encoding="utf8")

    try:
        config = json.loads(config_raw)
    except json.JSONDecodeError as e:
        raise RuntimeError(
            f"Failed to parse configuration file at {config_path}: {e}"
        ) from e

    # model_validate reports a non-object document (e.g. a JSON list) as a
    # validation error instead of failing with a TypeError on `**config`.
    return Configuration.model_validate(config)


def check_all_statuses(endpoints: dict[str, str]) -> dict[str, bool]:
    """
    Pings all the specified endpoints and produces a mapping describing
    whether the target responded with a "OK-ish" status (i.e. 2XX).

    Exceptions raised while requesting are logged and reported as failures
    to check.
    """

    status_summary = {}

    for service_name, service_url in endpoints.items():
        try:
            response = httpx.get(service_url)
            response.raise_for_status()
        except Exception:  # pylint: disable=broad-except
            logger.exception(
                "Failed to check health of %s: (%s)", service_name, service_url
            )
            status_summary[service_name] = False
        else:
            status_summary[service_name] = True

    return status_summary


def post_message(webhook_url: str, message: str):
    """
    Posts a message to a Discord webhook URL.

    See https://discord.com/developers/docs/resources/webhook#execute-webhook for
    payload schema.

    Raises if the request fails or the webhook answers with an error status.
    """

    payload = {"content": message}

    response = httpx.post(webhook_url, json=payload)

    response.raise_for_status()
+""" + +from healthcheck.use_cases import load_configuration + + +def test_load_configuration(set_up_mock_configuration, mock_configuration): + """Checks that configuration can be loaded.""" + set_up_mock_configuration() + + configuration = load_configuration() + + assert configuration.model_dump() == mock_configuration diff --git a/services/healthcheck/pyproject.toml b/services/healthcheck/pyproject.toml new file mode 100644 index 0000000..5a8db28 --- /dev/null +++ b/services/healthcheck/pyproject.toml @@ -0,0 +1,34 @@ +[project] +name = "healthcheck" +version = "0.0.0" +requires-python = ">= 3.12" +dependencies = [ + "fastapi", + "httpx", + "pydantic", + "uvicorn[standard]", +] + +[project.optional-dependencies] +dev = [ + "anyio", + "black", + "pylint", + "httpx", + "pytest", + "pytest-httpx", + "isort", +] + +[tool.setuptools] +packages = ["healthcheck"] + +[tool.pytest.ini_options] +pythonpath=[ + ".", + "./healthcheck", +] +python_files=[ + "*_test.py" +] + diff --git a/services/healthcheck/requirements.txt b/services/healthcheck/requirements.txt new file mode 100644 index 0000000..f6ab74d --- /dev/null +++ b/services/healthcheck/requirements.txt @@ -0,0 +1,58 @@ +annotated-types==0.6.0 + # via pydantic +anyio==4.3.0 + # via + # httpx + # starlette + # watchfiles +certifi==2024.2.2 + # via + # httpcore + # httpx +click==8.1.7 + # via uvicorn +fastapi==0.110.0 + # via healthcheck (pyproject.toml) +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.4 + # via httpx +httptools==0.6.1 + # via uvicorn +httpx==0.27.0 + # via healthcheck (pyproject.toml) +idna==3.6 + # via + # anyio + # httpx +pydantic==2.6.3 + # via + # fastapi + # healthcheck (pyproject.toml) +pydantic-core==2.16.3 + # via pydantic +python-dotenv==1.0.1 + # via uvicorn +pyyaml==6.0.1 + # via uvicorn +sniffio==1.3.1 + # via + # anyio + # httpx +starlette==0.36.3 + # via fastapi +typing-extensions==4.10.0 + # via + # fastapi + # pydantic + # pydantic-core +uvicorn[standard]==0.27.1 + # via 
healthcheck (pyproject.toml) +uvloop==0.19.0 + # via uvicorn +watchfiles==0.21.0 + # via uvicorn +websockets==12.0 + # via uvicorn diff --git a/services/healthcheck/requirements_dev.txt b/services/healthcheck/requirements_dev.txt new file mode 100644 index 0000000..fe4a2be --- /dev/null +++ b/services/healthcheck/requirements_dev.txt @@ -0,0 +1,134 @@ +annotated-types==0.6.0 + # via + # -c requirements.txt + # pydantic +anyio==4.3.0 + # via + # -c requirements.txt + # healthcheck (pyproject.toml) + # httpx + # starlette + # watchfiles +astroid==3.1.0 + # via pylint +black==24.2.0 + # via healthcheck (pyproject.toml) +certifi==2024.2.2 + # via + # -c requirements.txt + # httpcore + # httpx +click==8.1.7 + # via + # -c requirements.txt + # black + # uvicorn +dill==0.3.8 + # via pylint +fastapi==0.110.0 + # via + # -c requirements.txt + # healthcheck (pyproject.toml) +h11==0.14.0 + # via + # -c requirements.txt + # httpcore + # uvicorn +httpcore==1.0.4 + # via + # -c requirements.txt + # httpx +httptools==0.6.1 + # via + # -c requirements.txt + # uvicorn +httpx==0.27.0 + # via + # -c requirements.txt + # healthcheck (pyproject.toml) + # pytest-httpx +idna==3.6 + # via + # -c requirements.txt + # anyio + # httpx +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via + # healthcheck (pyproject.toml) + # pylint +mccabe==0.7.0 + # via pylint +mypy-extensions==1.0.0 + # via black +packaging==23.2 + # via + # black + # pytest +pathspec==0.12.1 + # via black +platformdirs==4.2.0 + # via + # black + # pylint +pluggy==1.4.0 + # via pytest +pydantic==2.6.3 + # via + # -c requirements.txt + # fastapi + # healthcheck (pyproject.toml) +pydantic-core==2.16.3 + # via + # -c requirements.txt + # pydantic +pylint==3.1.0 + # via healthcheck (pyproject.toml) +pytest==8.0.2 + # via + # healthcheck (pyproject.toml) + # pytest-httpx +pytest-httpx==0.30.0 + # via healthcheck (pyproject.toml) +python-dotenv==1.0.1 + # via + # -c requirements.txt + # uvicorn +pyyaml==6.0.1 + # via + # -c 
requirements.txt + # uvicorn +sniffio==1.3.1 + # via + # -c requirements.txt + # anyio + # httpx +starlette==0.36.3 + # via + # -c requirements.txt + # fastapi +tomlkit==0.12.4 + # via pylint +typing-extensions==4.10.0 + # via + # -c requirements.txt + # fastapi + # pydantic + # pydantic-core +uvicorn[standard]==0.27.1 + # via + # -c requirements.txt + # healthcheck (pyproject.toml) +uvloop==0.19.0 + # via + # -c requirements.txt + # uvicorn +watchfiles==0.21.0 + # via + # -c requirements.txt + # uvicorn +websockets==12.0 + # via + # -c requirements.txt + # uvicorn diff --git a/services/healthcheck/script/bootstrap.sh b/services/healthcheck/script/bootstrap.sh new file mode 100755 index 0000000..8066e30 --- /dev/null +++ b/services/healthcheck/script/bootstrap.sh @@ -0,0 +1,9 @@ +#!/usr/bin/bash + +python -m venv .venv + +. .venv/bin/activate + +pip install -U pip~=24.0 pip-tools~=7.3.0 + +pip-sync requirements.txt requirements_dev.txt diff --git a/services/healthcheck/script/lock-deps.sh b/services/healthcheck/script/lock-deps.sh new file mode 100755 index 0000000..3c22c01 --- /dev/null +++ b/services/healthcheck/script/lock-deps.sh @@ -0,0 +1,6 @@ +#!/usr/bin/bash + +PYTHON=.venv/bin/python + +$PYTHON -m piptools compile -o requirements.txt pyproject.toml --no-header \ + && $PYTHON -m piptools compile -o requirements_dev.txt --no-header --extra dev --constraint requirements.txt pyproject.toml diff --git a/services/healthcheck/start.sh b/services/healthcheck/start.sh new file mode 100755 index 0000000..4b8a6be --- /dev/null +++ b/services/healthcheck/start.sh @@ -0,0 +1,10 @@ +#!/usr/bin/bash + +source ./constants.sh + +podman run \ + --detach \ + --pod services \ + -v ./config.json:/app/config.json \ + --name "$APP_NAME" \ + "$APP_IMAGE_NAME":"$IMAGE_VERSION" diff --git a/services/healthcheck/stop.sh b/services/healthcheck/stop.sh new file mode 100644 index 0000000..818f1cd --- /dev/null +++ b/services/healthcheck/stop.sh @@ -0,0 +1,5 @@ +#!/usr/bin/bash + 
+source ./constants.sh + +podman rm -f $APP_NAME