
Без мониторинга вы узнаёте о проблемах от пользователей. С правильным мониторингом — раньше них. В этом руководстве настроим enterprise-grade систему наблюдения за Django-приложением: от базового error tracking до distributed tracing и predictive alerting.
Типичный день без мониторинга:
Тот же сценарий с мониторингом:
ROI очевиден: Мониторинг окупается с первого серьезного инцидента.
┌─────────────────────────────────────────────────────────┐
│ Django Application │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌─────────┐ │
│ │ Views │ │ Models │ │ Celery │ │ APIs │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬────┘ │
│ │ │ │ │ │
│ └─────────────┴─────────────┴─────────────┘ │
│ │ │
│ ┌───────────────┼───────────────┐ │
│ ▼ ▼ ▼ │
│ ┌────────┐ ┌──────────┐ ┌──────────┐ │
│ │ Sentry │ │Prometheus│ │ Logging │ │
│ │ SDK │ │ Metrics │ │ Handler │ │
│ └────┬───┘ └────┬─────┘ └────┬─────┘ │
└─────────┼─────────────┼───────────────┼────────────────┘
│ │ │
▼ ▼ ▼
┌──────────┐ ┌──────────┐ ┌──────────────┐
│ Sentry │ │Prometheus│ │ Loki / ELK │
│ Cloud │ │ Server │ │ (Logs) │
└────┬─────┘ └────┬─────┘ └──────┬───────┘
│ │ │
│ ┌─────▼─────┐ │
│ │ Grafana │◄─────────┘
│ └─────┬─────┘
│ │
│ ┌─────▼────────┐
└──────►│ Alertmanager │
└──────┬───────┘
│
┌─────────┼─────────┐
▼ ▼ ▼
Slack PagerDuty Telegram
pip install "sentry-sdk[django,celery,redis,sqlalchemy]"
# settings/production.py
import logging
import os

import sentry_sdk
from sentry_sdk.integrations.celery import CeleryIntegration
from sentry_sdk.integrations.django import DjangoIntegration
from sentry_sdk.integrations.logging import LoggingIntegration
from sentry_sdk.integrations.redis import RedisIntegration
from sentry_sdk.integrations.sqlalchemy import SqlalchemyIntegration
# Sentry Logging Integration.
# This object is registered in the `integrations` list of sentry_sdk.init().
sentry_logging = LoggingIntegration(
    level=logging.INFO,  # Capture info and above as breadcrumbs
    event_level=logging.ERROR  # Send errors as events
)
def traces_sampler(sampling_context):
    """Return the trace sample rate for one request (dynamic sampling).

    The request path is read from the ASGI scope when the app runs under
    ASGI and from the WSGI environ (``PATH_INFO``) when it runs under WSGI,
    so the rules below apply to both deployment styles.

    Args:
        sampling_context: dict passed by the Sentry SDK; may contain
            ``asgi_scope`` or ``wsgi_environ``.

    Returns:
        float in [0.0, 1.0]: 0.0 for health checks, 0.5 for ``/api/``,
        0.1 for everything else (including ``/admin/``).
    """
    asgi_scope = sampling_context.get("asgi_scope") or {}
    wsgi_environ = sampling_context.get("wsgi_environ") or {}
    path = asgi_scope.get("path") or wsgi_environ.get("PATH_INFO") or ""

    # Health check endpoints are never traced
    if path in ("/health/", "/readiness/"):
        return 0.0

    # API endpoints are traced more often
    if path.startswith("/api/"):
        return 0.5  # 50%

    # Admin panel and everything else share the same low rate
    return 0.1  # 10%
def before_send(event, hint):
    """Filter and enrich an error event before it is sent to Sentry.

    Args:
        event: the event payload (dict) built by the SDK; mutated in place.
        hint: dict with extra information; ``hint['exc_info']`` holds the
            ``(type, value, traceback)`` triple when an exception was captured.

    Returns:
        The (possibly modified) event, or ``None`` to drop it.
    """
    # 1. Drop known noise
    if 'exc_info' in hint:
        # Imported here so the filter is self-contained.
        # DisallowedHost is a subclass of SuspiciousOperation, so a single
        # isinstance check covers both (frequent probing/attack traffic).
        from django.core.exceptions import SuspiciousOperation

        _exc_type, exc_value, _tb = hint['exc_info']
        if isinstance(exc_value, SuspiciousOperation):
            return None

    request_data = event.get('request') or {}

    # 2. Drop events triggered by known bots / CLI clients
    headers = request_data.get('headers') or {}
    user_agent = (headers.get('User-Agent') or '').lower()
    bot_patterns = ('bot', 'crawler', 'spider', 'scraper', 'curl', 'wget')
    if any(pattern in user_agent for pattern in bot_patterns):
        return None

    # 3. Drop errors for well-known probe paths
    url = request_data.get('url') or ''
    if url.endswith(('favicon.ico', 'robots.txt', '.env')):
        return None

    # 4. Mask sensitive data
    sensitive_keys = ('password', 'token', 'secret', 'credit_card', 'ssn')
    payload = request_data.get('data')
    if isinstance(payload, dict):
        for key in sensitive_keys:
            if key in payload:
                payload[key] = '***REDACTED***'
    elif isinstance(payload, str):
        # A raw (unparsed) body cannot be masked field by field; redact it
        # completely if it mentions any sensitive key.
        if any(key in payload for key in sensitive_keys):
            request_data['data'] = '***REDACTED***'

    # 5. Attach deployment information
    tags = event.setdefault('tags', {})
    tags['deployment_id'] = os.getenv('DEPLOYMENT_ID', 'unknown')
    tags['k8s_pod'] = os.getenv('HOSTNAME', 'unknown')
    return event
def _timestamp_seconds(value):
    """Convert a transaction timestamp to epoch seconds, or None if unusable.

    Accepts numbers, ``datetime`` objects and ISO-8601 strings, because the
    representation of event timestamps differs between SDK versions.
    """
    from datetime import datetime, timezone

    if not value:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        try:
            value = datetime.fromisoformat(value.replace('Z', '+00:00'))
        except ValueError:
            return None
    if isinstance(value, datetime):
        if value.tzinfo is None:
            # Naive timestamps are treated as UTC
            value = value.replace(tzinfo=timezone.utc)
        return value.timestamp()
    return None


def before_send_transaction(event, hint):
    """Filter performance transactions before they are sent to Sentry.

    Args:
        event: transaction event dict with ``start_timestamp``/``timestamp``.
        hint: unused; part of the Sentry hook signature.

    Returns:
        ``None`` for transactions shorter than 100 ms, otherwise the event
        (also when the duration cannot be determined).
    """
    started = _timestamp_seconds(event.get('start_timestamp'))
    finished = _timestamp_seconds(event.get('timestamp'))
    if started is None or finished is None:
        return event

    # Do not send very fast transactions (< 100ms)
    if finished - started < 0.1:
        return None
    return event
# Main Sentry initialization.
# NOTE(review): SqlalchemyIntegration is imported in this file but is not
# listed in `integrations` below — confirm whether that is intentional.
sentry_sdk.init(
    dsn=os.getenv('SENTRY_DSN'),
    # Integrations
    integrations=[
        DjangoIntegration(
            transaction_style='url',  # Group by URL pattern, not by concrete IDs
            middleware_spans=True,  # Trace every middleware
            signals_spans=True,  # Trace Django signals
        ),
        CeleryIntegration(
            monitor_beat_tasks=True,  # Monitor Celery Beat
        ),
        RedisIntegration(),
        sentry_logging,
    ],
    # Environment
    environment=os.getenv('SENTRY_ENVIRONMENT', 'production'),
    release=os.getenv('GIT_COMMIT_SHA', 'unknown'),  # Provided by CI/CD
    # Performance Monitoring (APM)
    traces_sampler=traces_sampler,
    # Profiling (beta, requires additional quota)
    profiles_sample_rate=0.1,
    # Error sampling
    sample_rate=1.0,  # Send 100% of errors
    # Hooks
    before_send=before_send,
    before_send_transaction=before_send_transaction,
    # Privacy
    send_default_pii=False,  # Do not send PII automatically
    # Performance
    max_breadcrumbs=50,  # Maximum breadcrumbs per event
    attach_stacktrace=True,  # Always attach a stacktrace
    # Request body
    max_request_body_size='medium',  # always, never, small, medium
    # Debug (staging only); enabled when SENTRY_DEBUG equals "true"
    debug=os.getenv('SENTRY_DEBUG', 'false').lower() == 'true',
)
# middleware.py
from sentry_sdk import set_user, set_tag, set_context
import time
class SentryContextMiddleware:
    """Django middleware that enriches Sentry events with request context.

    It reads ``request.user``, so it must be placed after the middleware
    that sets that attribute.
    """

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        """Attach user, tags and request context, then process the request."""
        # User
        if request.user.is_authenticated:
            set_user({
                "id": request.user.id,
                "email": request.user.email,
                "username": request.user.username,
            })
        else:
            set_user({"id": None, "ip_address": self.get_client_ip(request)})

        # Tags used for filtering
        set_tag("request_method", request.method)
        set_tag("request_path", request.path)

        # Business context
        if hasattr(request.user, 'subscription_tier'):
            set_tag("subscription_tier", request.user.subscription_tier)

        # Geographic context (CF-IPCountry header)
        set_tag("country", request.headers.get("CF-IPCountry", "unknown"))
        set_tag("user_agent_browser", self.parse_browser(request))

        # Additional context
        set_context("request_info", {
            "url": request.build_absolute_uri(),
            "method": request.method,
            "path": request.path,
            "query_string": request.META.get("QUERY_STRING", ""),
            "referer": request.META.get("HTTP_REFERER", ""),
            "content_type": request.content_type,
        })

        # HTTP headers without credentials
        excluded = ('authorization', 'cookie', 'x-api-key')
        safe_headers = {
            name: value
            for name, value in request.headers.items()
            if name.lower() not in excluded
        }
        set_context("headers", safe_headers)

        response = self.get_response(request)

        # Response context
        set_tag("response_status", response.status_code)
        return response

    def get_client_ip(self, request):
        """Return the client IP: first X-Forwarded-For entry, else REMOTE_ADDR.

        NOTE(review): X-Forwarded-For is client-controlled unless a trusted
        proxy overwrites it — confirm the deployment guarantees that.
        """
        x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
        if x_forwarded_for:
            return x_forwarded_for.split(',')[0].strip()
        return request.META.get('REMOTE_ADDR')

    def parse_browser(self, request):
        """Return a coarse browser name parsed from the User-Agent header.

        Order matters: Edge user agents also contain "Chrome" and "Safari",
        and Chrome user agents also contain "Safari", so the most specific
        token is checked first.
        """
        ua = request.META.get('HTTP_USER_AGENT', '').lower()
        if 'edg' in ua:  # matches "Edg/", "Edge/", "EdgA/", "EdgiOS/"
            return 'Edge'
        if 'chrome' in ua:
            return 'Chrome'
        if 'firefox' in ua:
            return 'Firefox'
        if 'safari' in ua:
            return 'Safari'
        return 'Other'
# settings.py
MIDDLEWARE = [
    # ...
    # SentryContextMiddleware reads request.user, so it has to come after the
    # middleware that sets it (presumably Django's AuthenticationMiddleware —
    # confirm against the full MIDDLEWARE list).
    'myapp.middleware.SentryContextMiddleware',
    # ...
]
# views.py
from sentry_sdk import add_breadcrumb
import logging
logger = logging.getLogger(__name__)
def checkout_flow(request):
    """Run the checkout flow, leaving a Sentry breadcrumb at every step.

    Steps: validate the cart, optionally apply a promo code, process the
    payment, create the order. Cart validation and payment errors are
    recorded as breadcrumbs and re-raised unchanged.

    Returns:
        A redirect to ``order_success`` for the created order.
    """
    user = request.user

    # Step 1: checkout started
    session_cart = request.session.get('cart', [])
    start_details = {
        'cart_items': len(session_cart),
        'user_id': user.id if user.is_authenticated else None,
    }
    add_breadcrumb(
        category='checkout',
        message='Checkout flow started',
        level='info',
        data=start_details,
    )

    # Step 2: cart validation
    try:
        cart = validate_cart(request)
        add_breadcrumb(
            category='checkout',
            message='Cart validated',
            level='info',
            data={'total': float(cart.total), 'items': cart.count},
        )
    except CartValidationError as validation_error:
        add_breadcrumb(
            category='checkout',
            message='Cart validation failed',
            level='error',
            data={'error': str(validation_error)},
        )
        # The re-raised error reaches Sentry with the breadcrumbs above
        raise

    # Step 3: promo code (optional)
    promo_code = request.POST.get('promo_code')
    if promo_code:
        add_breadcrumb(
            category='checkout',
            message=f'Applying promo code: {promo_code}',
            level='info',
        )
        discount = apply_promo_code(cart, promo_code)
        add_breadcrumb(
            category='checkout',
            message='Promo code applied',
            level='info',
            data={'discount': float(discount), 'code': promo_code},
        )

    # Step 4: payment
    payment_method = request.POST.get('payment_method')
    add_breadcrumb(
        category='payment',
        message=f'Payment attempt with {payment_method}',
        level='info',
        data={'amount': float(cart.total), 'method': payment_method},
    )
    try:
        transaction = process_payment(cart, payment_method)
        payment_details = {
            'transaction_id': transaction.id,
            'amount': float(transaction.amount),
            'method': payment_method,
        }
        add_breadcrumb(
            category='payment',
            message='Payment successful',
            level='info',
            data=payment_details,
        )
    except PaymentError as payment_error:
        # On failure Sentry receives the whole user path as breadcrumbs
        failure_details = {
            'error': str(payment_error),
            'error_code': getattr(payment_error, 'code', None),
            'amount': float(cart.total),
        }
        add_breadcrumb(
            category='payment',
            message='Payment failed',
            level='error',
            data=failure_details,
        )
        raise

    # Step 5: order creation
    order = create_order(cart, transaction)
    order_total = float(order.total)
    add_breadcrumb(
        category='checkout',
        message='Order created',
        level='info',
        data={'order_id': order.id, 'amount': order_total},
    )
    logger.info(
        "Checkout completed successfully",
        extra={
            'order_id': order.id,
            'user_id': user.id,
            'amount': float(order.total),
        },
    )
    return redirect('order_success', order_id=order.id)
# Автоматическое трассирование Django views
# Включено через DjangoIntegration
# Кастомное трассирование для сложных операций
import sentry_sdk
def data_import_task():
    """Import data from the external API with per-step Sentry spans.

    Steps (one span each): fetch, parse/validate, bulk insert, notify.

    Raises:
        requests.RequestException: on network failure, timeout or a
            non-2xx response from the external API.
    """
    api_url = "https://api.example.com/data"

    # Create the transaction
    with sentry_sdk.start_transaction(
        op="task",
        name="data_import_task",
        description="Import data from external API"
    ) as transaction:
        # Span 1: fetch data
        with sentry_sdk.start_span(
            op="http",
            description="Fetch data from external API"
        ) as span:
            span.set_tag("api_endpoint", api_url)
            # Without a timeout a stalled connection blocks the worker forever
            response = requests.get(api_url, timeout=30)
            span.set_data("response_size", len(response.content))
            span.set_data("status_code", response.status_code)
            # Fail here instead of trying to parse an error page as JSON
            response.raise_for_status()
            data = response.json()

        # Span 2: parse and validate
        with sentry_sdk.start_span(
            op="process",
            description="Parse and validate data"
        ) as span:
            validated_items = []
            errors = []
            for item in data:
                try:
                    validated_items.append(validate_item(item))
                except ValidationError as e:
                    errors.append(str(e))
            span.set_data("total_items", len(data))
            span.set_data("valid_items", len(validated_items))
            span.set_data("invalid_items", len(errors))

        # Span 3: bulk insert into the database
        with sentry_sdk.start_span(
            op="db",
            description="Bulk insert to database"
        ) as span:
            Model.objects.bulk_create(
                [Model(**item) for item in validated_items],
                batch_size=1000
            )
            span.set_data("inserted_count", len(validated_items))

        # Span 4: send notifications
        with sentry_sdk.start_span(
            op="notification",
            description="Send completion notifications"
        ) as span:
            send_notification_email(
                subject="Data import completed",
                message=f"Imported {len(validated_items)} items"
            )

        # Transaction-level metadata
        transaction.set_tag("import_type", "full")
        transaction.set_measurement("items_imported", len(validated_items))
        transaction.set_measurement("import_errors", len(errors))
# Результат в Sentry:
# - Общее время выполнения транзакции
# - Breakdown по каждому span
# - Flamegraph визуализация
# - Медленные части выделены
# utils/sentry_helpers.py
from functools import wraps
import sentry_sdk
def _result_count(result):
    """Return the number of items in *result*, or None if it has no size.

    Objects with a zero-argument ``count()`` are asked directly. Built-in
    sequences, whose ``count(value)`` needs an argument, fall back to ``len()``.
    """
    counter = getattr(result, 'count', None)
    if callable(counter):
        try:
            return counter()
        except TypeError:
            pass  # e.g. list.count / str.count require a value
    try:
        return len(result)
    except TypeError:
        return None


def trace_queryset(description=None):
    """Decorator that wraps a database operation in a Sentry ``db.query`` span.

    Args:
        description: span description; defaults to ``"Query: <func name>"``.

    The wrapped function's return value is passed through unchanged. When the
    result has a size it is recorded on the span as ``result_count``.

    NOTE(review): if the result is a lazy queryset, calling ``count()`` runs
    an additional query inside the span — confirm this is acceptable.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            desc = description or f"Query: {func.__name__}"
            with sentry_sdk.start_span(
                op="db.query",
                description=desc
            ) as span:
                result = func(*args, **kwargs)
                # Attach the result size when it can be determined
                count = _result_count(result)
                if count is not None:
                    span.set_data("result_count", count)
                return result
        return wrapper
    return decorator
# Использование
@trace_queryset("Fetch active users with subscriptions")
def get_active_subscribers():
    """Return active users whose subscription status is 'active'.

    The queryset joins ``subscription`` and prefetches ``orders``.
    """
    subscribers = User.objects.filter(
        is_active=True,
        subscription__status='active',
    )
    subscribers = subscribers.select_related('subscription')
    return subscribers.prefetch_related('orders')
# В Sentry увидите точное время этого запроса
pip install django-prometheus
# settings.py
import os

INSTALLED_APPS = [
    'django_prometheus',  # Listed first
    # ... other apps
]

# The Before/After pair must wrap every other middleware.
MIDDLEWARE = [
    'django_prometheus.middleware.PrometheusBeforeMiddleware',
    # ... other middleware in the usual order
    'django_prometheus.middleware.PrometheusAfterMiddleware',
]

# Database with the Prometheus wrapper.
# Connection settings come from the environment; the defaults are
# development placeholders and must not be used in production.
DATABASES = {
    'default': {
        'ENGINE': 'django_prometheus.db.backends.postgresql',
        'NAME': os.getenv('POSTGRES_DB', 'mydb'),
        'USER': os.getenv('POSTGRES_USER', 'myuser'),
        'PASSWORD': os.getenv('POSTGRES_PASSWORD', 'mypassword'),
        'HOST': os.getenv('POSTGRES_HOST', 'localhost'),
        'PORT': os.getenv('POSTGRES_PORT', '5432'),
    }
}

# Cache with the Prometheus wrapper
CACHES = {
    'default': {
        'BACKEND': 'django_prometheus.cache.backends.redis.RedisCache',
        'LOCATION': os.getenv('REDIS_URL', 'redis://127.0.0.1:6379/1'),
    }
}
# urls.py
from django.urls import include, path

urlpatterns = [
    path('', include('django_prometheus.urls')),  # /metrics endpoint
    # ... other URLs
]
# metrics.py
from prometheus_client import Counter, Gauge, Histogram, Summary, Info
from django.conf import settings
# ============================================================================
# COUNTERS (Counter) — only ever increase
# ============================================================================
# User registrations
user_registrations_total = Counter(
    'app_user_registrations_total',
    'Total number of user registrations',
    ['tier', 'source']  # labels for filtering
)
# Usage
# user_registrations_total.labels(tier='free', source='web').inc()
# user_registrations_total.labels(tier='pro', source='mobile').inc()
# Orders
orders_created_total = Counter(
    'app_orders_created_total',
    'Total number of orders created',
    ['status', 'payment_method']
)
# Application errors (in addition to Sentry)
app_errors_total = Counter(
    'app_errors_total',
    'Total application errors',
    ['error_type', 'endpoint']
)
# Emails sent
emails_sent_total = Counter(
    'app_emails_sent_total',
    'Total emails sent',
    ['email_type', 'status']  # status: success/failed
)
# Calls to external services
external_api_calls_total = Counter(
    'app_external_api_calls_total',
    'Total external API calls',
    ['service', 'endpoint', 'status_code']
)
# ============================================================================
# GAUGE — can go up and down
# ============================================================================
# Active users
active_users_gauge = Gauge(
    'app_active_users',
    'Number of currently active users'
)
# Celery queue size
celery_queue_length = Gauge(
    'app_celery_queue_length',
    'Length of Celery queues',
    ['queue_name']
)
# Application memory usage
app_memory_usage_bytes = Gauge(
    'app_memory_usage_bytes',
    'Memory usage in bytes'
)
# Database connections
database_connections = Gauge(
    'app_database_connections',
    'Current database connections',
    ['state']  # active, idle
)
# Products in stock
products_in_stock = Gauge(
    'app_products_in_stock',
    'Number of products in stock',
    ['category']
)
# ============================================================================
# HISTOGRAM — distribution of values
# ============================================================================
# Order value
order_value_histogram = Histogram(
    'app_order_value_rubles',
    'Distribution of order values in rubles',
    buckets=[100, 500, 1000, 2000, 5000, 10000, 50000, 100000]
)
# Celery task execution time
celery_task_duration_seconds = Histogram(
    'app_celery_task_duration_seconds',
    'Celery task execution time',
    ['task_name'],
    buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300]
)
# Uploaded file size
file_upload_size_bytes = Histogram(
    'app_file_upload_size_bytes',
    'Size of uploaded files',
    ['file_type'],
    buckets=[1024, 10240, 102400, 1024000, 10240000, 104857600]  # 1KB - 100MB
)
# External API response time
external_api_response_time_seconds = Histogram(
    'app_external_api_response_time_seconds',
    'External API response time',
    ['service'],
    buckets=[0.1, 0.3, 0.5, 1, 2, 5, 10]
)
# ============================================================================
# SUMMARY — running count and sum of observations, no predefined buckets
# NOTE(review): the Python prometheus_client Summary is understood to expose
# only _count and _sum (no quantiles) — use a Histogram where percentiles
# are needed; confirm against the client documentation.
# ============================================================================
# API response size
api_response_size_bytes = Summary(
    'app_api_response_size_bytes',
    'Size of API responses',
    ['endpoint']
)
# ============================================================================
# INFO — статическая информация о приложении
# ============================================================================
import django

app_info = Info(
    'app_version',
    'Application version information'
)
# Publish version details. VERSION, GIT_COMMIT_SHA, ENVIRONMENT and
# DJANGO_VERSION are project-specific settings, not Django built-ins, so a
# missing one falls back to a default instead of raising AttributeError when
# this module is imported. Values are coerced to str for use as label values.
app_info.info({
    'version': str(getattr(settings, 'VERSION', 'unknown')),
    'git_commit': str(getattr(settings, 'GIT_COMMIT_SHA', 'unknown')),
    'environment': str(getattr(settings, 'ENVIRONMENT', 'unknown')),
    'django_version': str(
        getattr(settings, 'DJANGO_VERSION', None) or django.get_version()
    ),
})
# views.py
from .metrics import (
user_registrations_total,
orders_created_total,
order_value_histogram,
app_errors_total
)
# Label values accepted for the `source` label; anything else becomes "other".
_REGISTRATION_SOURCES = frozenset({'web', 'mobile', 'api'})


def register_user(request):
    """Register a user and record registration/error metrics.

    Reads ``username``, ``email`` and ``password`` (required) and ``source``
    (optional, default ``web``) from ``request.POST``.

    Returns:
        JsonResponse with ``status`` and the new ``user_id``.

    Raises:
        Any exception from user creation; it is counted in
        ``app_errors_total`` and re-raised.
    """
    try:
        # Create the user
        user = User.objects.create_user(
            username=request.POST['username'],
            email=request.POST['email'],
            password=request.POST['password']
        )
        # `source` is client-controlled. Using it verbatim as a label value
        # would let callers create an unbounded number of time series.
        source = request.POST.get('source', 'web')  # web, mobile, api
        if source not in _REGISTRATION_SOURCES:
            source = 'other'
        user_registrations_total.labels(
            tier='free',
            source=source
        ).inc()
        return JsonResponse({'status': 'success', 'user_id': user.id})
    except Exception as e:
        # Error tracking
        app_errors_total.labels(
            error_type=type(e).__name__,
            endpoint='register_user'
        ).inc()
        raise
def create_order(request):
    """Create an order for the current cart and record order metrics.

    Increments ``orders_created_total`` with status ``created`` or ``failed``
    and observes the order value on success. Exceptions are re-raised.

    Returns:
        JsonResponse with the new ``order_id``.
    """
    cart = get_cart(request)
    payment_method = request.POST.get('payment_method')

    def count_order(status):
        # One counter increment per outcome, labelled by payment method
        orders_created_total.labels(
            status=status,
            payment_method=payment_method,
        ).inc()

    try:
        order = Order.objects.create(
            user=request.user,
            total=cart.total,
            payment_method=payment_method,
        )
        count_order('created')
        order_value_histogram.observe(float(order.total))
        return JsonResponse({'order_id': order.id})
    except Exception:
        count_order('failed')
        raise
# celery.py
from .metrics import celery_task_duration_seconds, celery_queue_length
from celery.signals import task_prerun, task_postrun, task_failure
import time
# Start times of running tasks, keyed by task id (monotonic clock readings)
task_start_times = {}


@task_prerun.connect
def task_prerun_handler(sender=None, task_id=None, task=None, **kwargs):
    """Record the start time of a task."""
    # A monotonic clock is used because wall-clock time can jump and would
    # then produce wrong or negative durations.
    task_start_times[task_id] = time.monotonic()


@task_postrun.connect
def task_postrun_handler(sender=None, task_id=None, task=None, **kwargs):
    """Observe the task's execution time in ``celery_task_duration_seconds``."""
    # pop() removes the entry in the same step, so the dict cannot keep it
    # even if recording the observation fails.
    started_at = task_start_times.pop(task_id, None)
    if started_at is None:
        return
    celery_task_duration_seconds.labels(
        task_name=task.name
    ).observe(time.monotonic() - started_at)
@task_failure.connect
def task_failure_handler(sender=None, task_id=None, exception=None, **kwargs):
    """Count a failed task in ``app_errors_total``.

    The exception class name is used as ``error_type`` and the task name
    (``sender.name``) as ``endpoint``.
    """
    from .metrics import app_errors_total

    error_type = type(exception).__name__
    failure_counter = app_errors_total.labels(
        error_type=error_type,
        endpoint=sender.name,
    )
    failure_counter.inc()
# Periodic task for queue monitoring
@shared_task
def monitor_celery_queues():
    """Update ``celery_queue_length`` from the workers' task lists.

    Counts tasks that workers report as active plus reserved (prefetched and
    waiting). Counts are summed across workers that share the same name
    prefix, so one worker no longer overwrites another's value.

    NOTE(review): this reflects tasks already held by workers, not the
    backlog still waiting in the broker — confirm whether a broker-level
    queue length is needed for the alert thresholds.
    """
    from celery import current_app

    inspect = current_app.control.inspect()
    totals = {}
    for snapshot in (inspect.active(), inspect.reserved()):
        # An empty/None snapshot (e.g. no worker replied) contributes nothing
        for worker, tasks in (snapshot or {}).items():
            queue_name = worker.split('@')[0]
            totals[queue_name] = totals.get(queue_name, 0) + len(tasks)

    for queue_name, length in totals.items():
        celery_queue_length.labels(
            queue_name=queue_name
        ).set(length)
# Sales funnel
# Counter comes from prometheus_client directly; importing it from .metrics
# only works as a side effect of that module's own imports.
from prometheus_client import Counter

sales_funnel_step = Counter(
    'app_sales_funnel_step_total',
    'Sales funnel progression',
    ['step']
)


# Step 1: product page view
def product_detail_view(request, product_id):
    sales_funnel_step.labels(step='product_view').inc()
    # ... rest of the view logic


# Step 2: add to cart
def add_to_cart(request, product_id):
    sales_funnel_step.labels(step='add_to_cart').inc()
    # ... rest of the view logic


# Step 3: checkout started
def checkout_view(request):
    sales_funnel_step.labels(step='checkout_start').inc()
    # ... rest of the view logic


# Step 4: purchase completed
def order_complete(request):
    sales_funnel_step.labels(step='purchase').inc()
    # ... rest of the view logic


# In Grafana the conversion rate can be computed as:
# (purchase / product_view) * 100
# prometheus.yml
global:
  scrape_interval: 15s      # How often metrics are scraped
  evaluation_interval: 15s  # How often alert rules are evaluated

# Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Alert rules
rule_files:
  - "alerts.yml"

# Scrape targets
scrape_configs:
  # Django application
  - job_name: 'django-app'
    static_configs:
      - targets: ['django-app:8000']
    metrics_path: '/metrics'
    scrape_interval: 15s
    scrape_timeout: 10s

  # PostgreSQL (requires postgres_exporter)
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']

  # Redis (requires redis_exporter)
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

  # Node metrics (CPU, Memory, Disk)
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  # Nginx (requires nginx-prometheus-exporter)
  - job_name: 'nginx'
    static_configs:
      - targets: ['nginx-exporter:9113']
// Ready-made PromQL queries for Grafana panels
// ============================================================================
// PANEL 1: Requests Per Second (RPS)
// ============================================================================
{
  "title": "Requests Per Second",
  "targets": [{
    "expr": "rate(django_http_requests_total_by_view_transport_method_total[1m])"
  }]
}
// ============================================================================
// PANEL 2: Response Time (95th percentile)
// django-prometheus exposes request latency as
// django_http_requests_latency_seconds_by_view_method; buckets are summed
// by `le` to get one overall p95.
// ============================================================================
{
  "title": "Response Time (p95)",
  "targets": [{
    "expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_seconds_by_view_method_bucket[5m])))"
  }]
}
// ============================================================================
// PANEL 3: Error Rate (5xx)
// Counters are exported with a `_total` suffix (as in panel 1).
// ============================================================================
{
  "title": "5xx Error Rate",
  "targets": [{
    "expr": "sum(rate(django_http_responses_total_by_status_total{status=~\"5..\"}[1m]))"
  }]
}
// ============================================================================
// PANEL 4: Database Query Duration
// ============================================================================
{
  "title": "Database Query Duration (p95)",
  "targets": [{
    "expr": "histogram_quantile(0.95, rate(django_db_query_duration_seconds_bucket[5m]))"
  }]
}
// ============================================================================
// PANEL 5: Active Database Connections
// Taken from postgres_exporter (the `postgres` scrape job).
// NOTE(review): confirm the metric name against the deployed exporter version.
// ============================================================================
{
  "title": "Active DB Connections",
  "targets": [{
    "expr": "sum(pg_stat_activity_count{state=\"active\"})"
  }]
}
// ============================================================================
// PANEL 6: Cache Hit Rate
// ============================================================================
{
  "title": "Cache Hit Rate %",
  "targets": [{
    "expr": "(rate(django_cache_get_hits_total[5m]) / (rate(django_cache_get_hits_total[5m]) + rate(django_cache_get_misses_total[5m]))) * 100"
  }]
}
// ============================================================================
// PANEL 7: Celery Queue Length
// ============================================================================
{
  "title": "Celery Queue Length",
  "targets": [{
    "expr": "app_celery_queue_length"
  }]
}
// ============================================================================
// PANEL 8: Memory Usage
// ============================================================================
{
  "title": "Memory Usage",
  "targets": [{
    "expr": "process_resident_memory_bytes / 1024 / 1024 / 1024" // GB
  }]
}
// ============================================================================
// PANEL 9: Sales Funnel Conversion
// ============================================================================
{
  "title": "Sales Funnel Conversion Rate",
  "targets": [
    {
      "expr": "app_sales_funnel_step_total{step=\"product_view\"}",
      "legendFormat": "Product Views"
    },
    {
      "expr": "app_sales_funnel_step_total{step=\"add_to_cart\"}",
      "legendFormat": "Add to Cart"
    },
    {
      "expr": "app_sales_funnel_step_total{step=\"checkout_start\"}",
      "legendFormat": "Checkout"
    },
    {
      "expr": "app_sales_funnel_step_total{step=\"purchase\"}",
      "legendFormat": "Purchase"
    }
  ]
}
// ============================================================================
// PANEL 10: Order Value Distribution
// ============================================================================
{
  "title": "Average Order Value",
  "targets": [{
    "expr": "rate(app_order_value_rubles_sum[5m]) / rate(app_order_value_rubles_count[5m])"
  }]
}
pip install opentelemetry-api opentelemetry-sdk
pip install opentelemetry-instrumentation-django
pip install opentelemetry-instrumentation-requests
pip install opentelemetry-exporter-jaeger
# settings/production.py
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.instrumentation.django import DjangoInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
# Tracer setup
from opentelemetry.sdk.resources import SERVICE_NAME, Resource

# Give the service an explicit name; without one the SDK's default
# placeholder name is used and traces are hard to find in the backend.
trace.set_tracer_provider(
    TracerProvider(resource=Resource.create({SERVICE_NAME: "django-app"}))
)
tracer = trace.get_tracer(__name__)

# Export to Jaeger (agent host/port)
jaeger_exporter = JaegerExporter(
    agent_host_name="jaeger",
    agent_port=6831,
)
# Spans are exported in batches
trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(jaeger_exporter)
)

# Auto-instrument Django
DjangoInstrumentor().instrument()
# Auto-instrument the requests library
RequestsInstrumentor().instrument()
# Использование в коде
def complex_operation():
    """Example of manual spans: a DB query, an external call and processing.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    with tracer.start_as_current_span("complex_operation"):
        with tracer.start_as_current_span("database_query"):
            # NOTE(review): Django querysets are lazy, so this span may not
            # include the actual SQL execution — confirm what should be timed.
            users = User.objects.all()
        with tracer.start_as_current_span("external_api_call"):
            # A timeout keeps a stalled remote service from blocking the
            # request (and its span) indefinitely
            response = requests.get("https://api.example.com/data", timeout=10)
        with tracer.start_as_current_span("data_processing"):
            process_data(response.json())
# settings/production.py
import logging_loki
# Logging configuration that sends records of the `django` logger
# (INFO and above) to Loki through logging_loki.LokiHandler.
LOGGING = {
    'version': 1,
    'handlers': {
        'loki': {
            'class': 'logging_loki.LokiHandler',
            # Loki push API endpoint
            'url': "http://loki:3100/loki/api/v1/push",
            # Static labels attached to every log record
            'tags': {
                "application": "django-app",
                "environment": "production",
            },
            # Handler option of logging_loki (not the dictConfig schema version above)
            'version': "1",
        },
    },
    'loggers': {
        'django': {
            'handlers': ['loki'],
            'level': 'INFO',
        },
    },
}
# requirements.txt
python-logstash-async
# settings/production.py
# Logging configuration that sends records of the `django` logger
# (INFO and above) to Logstash asynchronously.
LOGGING = {
    # Required by logging.config.dictConfig; omitting it raises ValueError
    'version': 1,
    'handlers': {
        'logstash': {
            'class': 'logstash_async.handler.AsynchronousLogstashHandler',
            'transport': 'logstash_async.transport.HttpTransport',
            'host': 'logstash',
            'port': 5959,
            # Local file passed to the handler as its database_path
            'database_path': '/tmp/logstash.db',
        },
    },
    'loggers': {
        'django': {
            'handlers': ['logstash'],
            'level': 'INFO',
        },
    },
}
Alert 1: Spike в ошибках
Number of events > 10 in 1 minute
level:error
Alert 2: Новый тип ошибки
A new issue is created
environment:production
Alert 3: Regression
An issue changes state from resolved to unresolved
Alert 4: High memory usage
process.memory.rss > 2GB
# alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'

# Routing tree
route:
  receiver: 'default'
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts → PagerDuty
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true  # keep matching so the Slack route below also fires
    # Critical alerts also go to Slack
    - match:
        severity: critical
      receiver: 'slack-critical'
    # Warning alerts go to Slack only
    - match:
        severity: warning
      receiver: 'slack-warnings'

# Receivers
# Message templates use double quotes so that \n is a real line break;
# inside single-quoted YAML strings it would stay a literal backslash-n.
receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: '{{ .GroupLabels.alertname }}'
        text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
  - name: 'slack-critical'
    slack_configs:
      - channel: '#production-critical'
        title: ':fire: CRITICAL: {{ .GroupLabels.alertname }}'
        text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
        send_resolved: true
  - name: 'slack-warnings'
    slack_configs:
      - channel: '#production-warnings'
        title: ':warning: Warning: {{ .GroupLabels.alertname }}'
        text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'

# Suppress a warning while a critical alert with the same name/instance fires
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
# alerts.yml
groups:
  - name: django_app
    interval: 30s
    rules:
      # ======================================================================
      # CRITICAL ALERTS
      # ======================================================================
      # Application is unreachable
      - alert: ApplicationDown
        expr: up{job="django-app"} == 0
        for: 1m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Django application is down"
          description: "Django app {{ $labels.instance }} has been down for more than 1 minute"

      # High error rate
      # Counters are exported with a `_total` suffix.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(django_http_responses_total_by_status_total{status=~"5.."}[5m]))
            /
            sum(rate(django_http_responses_total_by_status_total[5m]))
          ) > 0.05
        for: 2m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High 5xx error rate (> 5%)"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # Database is unreachable
      - alert: DatabaseDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          team: infrastructure
        annotations:
          summary: "PostgreSQL database is down"

      # Disk almost full
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space below 10%"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"

      # ======================================================================
      # WARNING ALERTS
      # ======================================================================
      # Slow requests
      # django-prometheus exposes request latency as
      # django_http_requests_latency_seconds_by_view_method.
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(django_http_requests_latency_seconds_by_view_method_bucket[5m]))
          ) > 2
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "95th percentile response time > 2s"
          description: "P95 latency is {{ $value }} seconds"

      # High memory usage
      - alert: HighMemoryUsage
        expr: |
          (process_resident_memory_bytes / 1024 / 1024 / 1024) > 1.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage > 1.5GB"
          description: "Current memory usage: {{ $value }}GB"

      # High CPU usage
      - alert: HighCPUUsage
        expr: |
          rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU usage > 80%"

      # Long Celery queue
      - alert: CeleryQueueTooLong
        expr: app_celery_queue_length > 1000
        for: 10m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "Celery queue length > 1000"
          description: "Queue {{ $labels.queue_name }} has {{ $value }} tasks"

      # Low cache hit rate
      - alert: LowCacheHitRate
        expr: |
          (
            rate(django_cache_get_hits_total[5m])
            /
            (rate(django_cache_get_hits_total[5m]) + rate(django_cache_get_misses_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Cache hit rate < 70%"
          description: "Current hit rate: {{ $value | humanizePercentage }}"

      # Conversion drop
      - alert: LowConversionRate
        expr: |
          (
            rate(app_sales_funnel_step_total{step="purchase"}[1h])
            /
            rate(app_sales_funnel_step_total{step="product_view"}[1h])
          ) < 0.02
        for: 30m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Conversion rate dropped below 2%"
          description: "Current conversion: {{ $value | humanizePercentage }}"
# docker-compose.monitoring.yml
version: '3.8'
services:
# ========================================================================
# Django application
# ========================================================================
django:
  build: .
  environment:
    # Substituted by Compose from the shell environment or .env file
    - SENTRY_DSN=${SENTRY_DSN}
    # NOTE(review): presumably the prometheus_client multiprocess
    # directory; the image must create this path - confirm
    - PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus_multiproc
  ports:
    - "8000:8000"
  volumes:
    # Project directory is bind-mounted into the container at /app
    - ./:/app
  depends_on:
    - db
    - redis
# ========================================================================
# PostgreSQL
# ========================================================================
db:
  image: postgres:15
  environment:
    # The postgres-exporter connection string uses these same credentials
    POSTGRES_DB: mydb
    POSTGRES_USER: myuser
    POSTGRES_PASSWORD: mypassword
  volumes:
    # Database files live in a named volume
    - postgres_data:/var/lib/postgresql/data
# ========================================================================
# Redis
# ========================================================================
# No ports are published; other services reach it as redis:6379.
redis:
  image: redis:7-alpine
# ========================================================================
# Prometheus
# ========================================================================
prometheus:
  image: prom/prometheus:latest
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    # Keep 30 days of time-series data
    - '--storage.tsdb.retention.time=30d'
  volumes:
    # Scrape configuration and alert rules come from the project directory
    - ./prometheus.yml:/etc/prometheus/prometheus.yml
    - ./alerts.yml:/etc/prometheus/alerts.yml
    # TSDB data is kept in a named volume
    - prometheus_data:/prometheus
  ports:
    - "9090:9090"
  depends_on:
    - django
# ========================================================================
# Grafana
# ========================================================================
grafana:
  image: grafana/grafana:latest
  environment:
    - GF_SECURITY_ADMIN_USER=admin
    # Read the admin password from the environment instead of hard-coding
    # it. The ":-admin" fallback keeps the previous behaviour for local
    # runs; set GRAFANA_ADMIN_PASSWORD for any shared or production host.
    - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
    - GF_USERS_ALLOW_SIGN_UP=false
  volumes:
    - grafana_data:/var/lib/grafana
    # Dashboards and datasources are provisioned from the project directory
    - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
    - ./grafana/datasources:/etc/grafana/provisioning/datasources
  ports:
    - "3000:3000"
  depends_on:
    - prometheus
# ========================================================================
# Alertmanager
# ========================================================================
alertmanager:
  image: prom/alertmanager:latest
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
  volumes:
    # Configuration comes from the project directory
    - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    # Alertmanager storage path is kept in a named volume
    - alertmanager_data:/alertmanager
  ports:
    - "9093:9093"
# ========================================================================
# Loki (log aggregation)
# ========================================================================
loki:
  image: grafana/loki:latest
  # Uses the local-config.yaml path inside the image; no config is mounted
  command: -config.file=/etc/loki/local-config.yaml
  ports:
    - "3100:3100"
  volumes:
    - loki_data:/loki
# ========================================================================
# Promtail (log shipper for Loki)
# ========================================================================
promtail:
  image: grafana/promtail:latest
  volumes:
    # Host log directory, mounted at the same path inside the container
    - /var/log:/var/log
    - ./promtail-config.yml:/etc/promtail/config.yml
  command: -config.file=/etc/promtail/config.yml
  depends_on:
    - loki
# ========================================================================
# Exporters
# ========================================================================
# PostgreSQL exporter: connects to the db service with the credentials
# defined there and publishes port 9187.
postgres-exporter:
  image: prometheuscommunity/postgres-exporter
  environment:
    DATA_SOURCE_NAME: "postgresql://myuser:mypassword@db:5432/mydb?sslmode=disable"
  ports:
    - "9187:9187"
  depends_on:
    - db
# Redis exporter: points at the redis service and publishes port 9121.
redis-exporter:
  image: oliver006/redis_exporter
  environment:
    REDIS_ADDR: redis:6379
  ports:
    - "9121:9121"
  depends_on:
    - redis
# Node exporter: host-level metrics (CPU, memory, disk, filesystems).
node-exporter:
  image: prom/node-exporter
  ports:
    - "9100:9100"
  volumes:
    # Host /proc, /sys and root filesystem, mounted read-only
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
    # Tell the exporter where the host root is mounted. Without this flag
    # the /rootfs mount is unused and filesystem metrics do not describe
    # the host's filesystems.
    - '--path.rootfs=/rootfs'
    # "$$" is the Compose escape for a literal "$" in the regex
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
# Named volumes referenced by the services above
volumes:
  postgres_data:
  prometheus_data:
  grafana_data:
  alertmanager_data:
  loki_data:
Проблема: отправка миллионов событий в Sentry приводит к огромным счетам.
Решение:
# Sample rate for production
traces_sample_rate=0.1 # Only 10% of transactions


# Ignore noise
def before_send(event, hint):
    """Decide whether an event is sent to Sentry.

    Args:
        event: The event payload about to be sent.
        hint: Extra context supplied with the event; unused here.

    Returns:
        The event, unchanged. This is the place to filter out bots,
        404s and similar noise before returning.
    """
    # Filter bots, 404s, etc. here
    return event
# ❌ Bad: too much detail
logger.debug(f"Query result: {huge_result}")
# ✅ Good: only the important information
logger.info("Order created", extra={'order_id': order.id, 'total': order.total})
# ❌ Bad: too many unique labels (cardinality explosion)
counter.labels(user_id=user.id).inc() # Millions of users!
# ✅ Good: a bounded set of labels
counter.labels(tier=user.tier).inc() # free/pro/enterprise
Проблема: когда алертов слишком много, команда перестаёт на них реагировать.
Решение:
До мониторинга:
После мониторинга:
Proactive Problem Detection
Data-Driven Decisions
Improved Team Velocity
Cost Optimization
Симптомы: Prometheus работает медленно и потребляет много памяти.
Причина: Слишком много unique label combinations
Решение:
# ❌ Bad
counter.labels(user_id=123, order_id=456, product_id=789).inc()
# ✅ Good
counter.labels(product_category='electronics').inc()
Симптомы: "Event dropped due to quota"
Решение:
# More aggressive sampling
traces_sample_rate=0.05 # 5% instead of 10%


# Stricter before_send
def before_send(event, hint):
    """Decide whether an event is sent to Sentry.

    Args:
        event: The event payload about to be sent.
        hint: Extra context supplied with the event; unused here.

    Returns:
        The event to send. Return None only for events that should be
        dropped: a body of just ``pass`` returns None for every event,
        which discards all of them.
    """
    # Ignore more noise here (return None for events to drop)
    return event
Причина: Celery worker запущен без Sentry SDK
Решение:
# celery.py
# sentry_sdk itself must be imported here; importing only the integration
# leaves the sentry_sdk.init() call below failing with NameError.
import sentry_sdk
from sentry_sdk.integrations.celery import CeleryIntegration

# Initialise the SDK in this module with the Celery integration enabled.
sentry_sdk.init(
    integrations=[CeleryIntegration()],
    # ...
)
Мониторинг — это не опция, а необходимость для любого production приложения.
Начните с:
Постепенно добавляйте:
Помните:
ROI: Первый предотвращенный инцидент окупит все затраты на мониторинг.
Берём проекты на поддержку с чётким SLA. Стабилизируем за 2 недели, даём план развития на 90 дней. 15+ лет опыта с Django.