Observability is the ability to understand your system's internal state from its external outputs. As systems become more distributed, observability becomes critical for debugging, performance optimization, and incident response. This guide covers the three pillars: logs, metrics, and traces.
Structured Logging
// Structured logging with context
import pino from 'pino';

// Metadata stamped onto every log line emitted by this service.
const serviceContext = {
  service: 'api-service',
  version: process.env.APP_VERSION,
  environment: process.env.NODE_ENV,
};

const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level as its text label (e.g. "info") rather than pino's numeric value.
    level: (label) => ({ level: label }),
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  base: serviceContext,
});
// Request context logger
// Builds a child logger that carries request-scoped fields (request id,
// trace id, user id, path, method) on every subsequent log line.
function createRequestLogger(req: Request) {
  const { pathname } = new URL(req.url);
  const requestId = req.headers.get('x-request-id') || crypto.randomUUID();

  return logger.child({
    requestId,
    traceId: req.headers.get('x-trace-id'),
    userId: req.headers.get('x-user-id'),
    path: pathname,
    method: req.method,
  });
}
// Usage in API route
// Creates an order and logs the outcome with structured fields; returns 500
// with a generic message on failure (details stay in the logs, not the response).
export async function POST(req: Request) {
  const log = createRequestLogger(req);
  log.info('Processing order request');
  try {
    const order = await createOrder(req);
    log.info({
      event: 'order_created',
      orderId: order.id,
      amount: order.total,
      itemCount: order.items.length,
    }, 'Order created successfully');
    return Response.json(order);
  } catch (error) {
    // `error` is `unknown` under strict mode — narrow before reading fields.
    const err = error instanceof Error ? error : new Error(String(error));
    log.error({
      event: 'order_creation_failed',
      error: err.message,
      stack: err.stack,
    }, 'Failed to create order');
    return Response.json({ error: 'Failed to create order' }, { status: 500 });
  }
}
// Log levels guide
// error: Application errors requiring attention
// warn: Unexpected behavior that doesn't break functionality
// info: Important business events (user signup, order placed)
// debug: Detailed information for debugging
Metrics Collection
// Prometheus metrics with prom-client
import { Counter, Histogram, Gauge, Registry, collectDefaultMetrics } from 'prom-client';
// Dedicated registry so these metrics can be exposed (see /metrics route)
// independently of prom-client's global default registry.
const register = new Registry();
// Node process metrics: event loop lag, heap usage, GC, open handles, etc.
collectDefaultMetrics({ register });
// Request metrics
const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
  registers: [register],
});
// Buckets are in seconds: 10ms up to 5s.
const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'path'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
  registers: [register],
});
// Business metrics
const ordersTotal = new Counter({
  name: 'orders_total',
  help: 'Total orders placed',
  labelNames: ['status'],
  registers: [register],
});
// Order-value buckets in dollars, $10 up to $5000.
const orderValue = new Histogram({
  name: 'order_value_dollars',
  help: 'Order value in dollars',
  buckets: [10, 50, 100, 500, 1000, 5000],
  registers: [register],
});
// Gauge: a current value that can move up or down.
const activeUsers = new Gauge({
  name: 'active_users',
  help: 'Number of active users',
  registers: [register],
});
// Middleware to track requests
export function metricsMiddleware(req: Request, res: Response, next: () => void) {
const start = Date.now();
const path = new URL(req.url).pathname;
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
httpRequestsTotal.inc({
method: req.method,
path: normalizePath(path),
status: res.status,
});
httpRequestDuration.observe(
{ method: req.method, path: normalizePath(path) },
duration
);
});
next();
}
// Expose metrics endpoint in the Prometheus text exposition format.
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  const body = await register.metrics();
  res.end(body);
});
Distributed Tracing
// OpenTelemetry setup for distributed tracing
import { trace, context, propagation, SpanStatusCode } from '@opentelemetry/api';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { NodeSDK } from '@opentelemetry/sdk-node';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
// Initialize SDK
// NOTE(review): the SDK must be started before instrumented modules are
// loaded for auto-instrumentation to patch them — confirm load order in the
// application entrypoint.
const sdk = new NodeSDK({
  // Resource attributes identify this process on every exported span.
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION,
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV,
  }),
  // Export spans over OTLP/HTTP to the collector configured via env.
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
  }),
  // Auto-instrument common libraries (http, express, database clients, ...).
  instrumentations: [getNodeAutoInstrumentations()],
});
sdk.start();
// Manual instrumentation
const tracer = trace.getTracer('api-service');
/**
 * Process an order (payment then inventory reservation) under a parent span.
 * Each step runs in its own child span; child spans are ended in `finally`
 * blocks so they are closed even when the awaited call throws — in the
 * original only the happy path called `.end()`, leaking spans on failure.
 */
async function processOrder(orderId: string) {
  return tracer.startActiveSpan('processOrder', async (span) => {
    try {
      span.setAttribute('order.id', orderId);
      // Child span for payment
      const paymentResult = await tracer.startActiveSpan(
        'processPayment',
        async (paymentSpan) => {
          try {
            const result = await paymentService.charge(orderId);
            paymentSpan.setAttribute('payment.status', result.status);
            return result;
          } finally {
            paymentSpan.end();
          }
        }
      );
      // Child span for inventory
      await tracer.startActiveSpan('updateInventory', async (inventorySpan) => {
        try {
          await inventoryService.reserve(orderId);
        } finally {
          inventorySpan.end();
        }
      });
      span.setAttribute('order.status', 'completed');
      span.setStatus({ code: SpanStatusCode.OK });
      return { success: true };
    } catch (error) {
      // `error` is `unknown` in strict mode — narrow before reading fields.
      const err = error instanceof Error ? error : new Error(String(error));
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: err.message,
      });
      span.recordException(err);
      throw error;
    } finally {
      span.end();
    }
  });
}
// Propagate context to downstream services
/**
 * POST `data` to a downstream endpoint, injecting the active trace context
 * (W3C `traceparent`/`tracestate`) into the outgoing headers so the callee
 * joins the same distributed trace. Requires `propagation` from
 * '@opentelemetry/api' (missing from the original import list).
 */
async function callDownstreamService(endpoint: string, data: unknown) {
  // Typed carrier object for propagation.inject to write header fields into.
  const headers: Record<string, string> = {};
  // Inject trace context into headers
  propagation.inject(context.active(), headers);
  return fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      ...headers, // Includes traceparent header
    },
    body: JSON.stringify(data),
  });
}
Alerting Strategy
# Prometheus alerting rules
# (Re-indented: the flattened snippet was not valid YAML — rule files require
# nesting under groups -> rules.)
groups:
  - name: api-alerts
    rules:
      # High error rate: more than 5% of requests returning 5xx over 5 minutes
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"
      # Slow responses: p95 latency over the histogram buckets
      - alert: SlowResponses
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 2 seconds"
          description: "P95 latency is {{ $value }}s"
      # Service down: scrape target unreachable for 1 minute
      - alert: ServiceDown
        expr: up{job="api-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
      # High memory usage: resident memory above 4 GiB for 10 minutes
      - alert: HighMemoryUsage
        expr: |
          process_resident_memory_bytes / 1024 / 1024 / 1024 > 4
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}GB"
Observability Stack
Recommended Observability Stack
Logging:
- Collection: Fluent Bit / Vector
- Storage: Elasticsearch / Loki
- UI: Kibana / Grafana
Metrics:
- Collection: Prometheus / Victoria Metrics
- Storage: Prometheus / Thanos
- UI: Grafana
Tracing:
- Collection: OpenTelemetry Collector
- Storage: Jaeger / Tempo
- UI: Jaeger UI / Grafana
All-in-one Options:
- Datadog, New Relic, Dynatrace (commercial)
- Grafana Cloud (managed open source)
Conclusion
Effective observability requires combining logs, metrics, and traces to get a complete picture of your system. Start with structured logging, add key metrics, and implement distributed tracing as your system grows. The investment in observability pays off significantly during incidents.
Need help implementing observability? Contact Jishu Labs for expert DevOps consulting and monitoring setup.
About David Kumar
David Kumar is the DevOps Lead at Jishu Labs with extensive experience in building observable systems at scale.