Cloud & DevOps · 16 min read · 875 words

Observability in 2026: Complete Guide to Logging, Monitoring, and Tracing

Master observability for production systems. Learn structured logging, metrics collection, distributed tracing, and alerting strategies for modern applications.

DK

David Kumar

Observability is the ability to understand your system's internal state from its external outputs. As systems become more distributed, observability becomes critical for debugging, performance optimization, and incident response. This guide covers the three pillars: logs, metrics, and traces.

Structured Logging

typescript
// Structured logging with context
import pino from 'pino';

// Application-wide logger. Every line carries the service metadata in `base`
// and an ISO-8601 timestamp; the minimum level comes from LOG_LEVEL and
// falls back to 'info' when that variable is unset or empty.
const logger = pino({
  // Fields attached to every log line emitted through this logger
  base: {
    service: 'api-service',
    version: process.env.APP_VERSION,
    environment: process.env.NODE_ENV,
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  formatters: {
    // Write the level as its label string under the `level` key
    level(label) {
      return { level: label };
    },
  },
  level: process.env.LOG_LEVEL || 'info',
});

// Request context logger
/**
 * Builds a child logger bound to one incoming request.
 *
 * The bindings are the request id (taken from `x-request-id`, or a freshly
 * generated UUID when that header is missing or empty), the `x-trace-id` and
 * `x-user-id` header values (null when absent), the URL pathname and the
 * HTTP method.
 *
 * @param req Incoming request; `req.url` must be parseable by `new URL`.
 * @returns A child of the module-level `logger` carrying those bindings.
 */
function createRequestLogger(req: Request) {
  const { headers, method, url } = req;

  const requestBindings = {
    requestId: headers.get('x-request-id') || crypto.randomUUID(),
    traceId: headers.get('x-trace-id'),
    userId: headers.get('x-user-id'),
    path: new URL(url).pathname,
    method,
  };

  return logger.child(requestBindings);
}

// Usage in API route
/**
 * Route handler that creates an order and logs the outcome with request
 * context.
 *
 * On success it logs an `order_created` event and responds with the order as
 * JSON. On failure it logs an `order_creation_failed` event and responds with
 * a generic 500 body, so internal error details are not sent to the client.
 *
 * @param req Incoming request, forwarded unchanged to `createOrder`
 *            (defined elsewhere).
 * @returns The JSON response for either outcome; this handler does not rethrow.
 */
export async function POST(req: Request): Promise<Response> {
  const log = createRequestLogger(req);

  log.info('Processing order request');

  try {
    const order = await createOrder(req);

    log.info({
      event: 'order_created',
      orderId: order.id,
      amount: order.total,
      itemCount: order.items.length,
    }, 'Order created successfully');

    return Response.json(order);
  } catch (error) {
    // The caught value is `unknown`: anything can be thrown, so narrow before
    // reading Error fields. Non-Error values are logged via their string form.
    const isError = error instanceof Error;

    log.error({
      event: 'order_creation_failed',
      error: isError ? error.message : String(error),
      stack: isError ? error.stack : undefined,
    }, 'Failed to create order');

    return Response.json({ error: 'Failed to create order' }, { status: 500 });
  }
}

// Log levels guide
// error: Application errors requiring attention
// warn: Unexpected behavior that doesn't break functionality
// info: Important business events (user signup, order placed)
// debug: Detailed information for debugging

Metrics Collection

typescript
// Prometheus metrics with prom-client
import { Counter, Histogram, Gauge, Registry, collectDefaultMetrics } from 'prom-client';

// Registry that owns every metric declared below; the default metrics
// collected by prom-client are registered on it as well.
const register = new Registry();
collectDefaultMetrics({ register });

// Request metrics

// Count of HTTP requests, labelled by method, path and response status.
const httpRequestsTotal = new Counter({
  registers: [register],
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
});

// Request latency in seconds, labelled by method and path.
// Bucket upper bounds run from 10 ms to 5 s.
const httpRequestDuration = new Histogram({
  registers: [register],
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'path'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
});

// Business metrics

// Count of orders placed, labelled by order status.
const ordersTotal = new Counter({
  registers: [register],
  name: 'orders_total',
  help: 'Total orders placed',
  labelNames: ['status'],
});

// Distribution of order values in dollars (bucket upper bounds 10 to 5000).
const orderValue = new Histogram({
  registers: [register],
  name: 'order_value_dollars',
  help: 'Order value in dollars',
  buckets: [10, 50, 100, 500, 1000, 5000],
});

// Current number of active users; a gauge, so it can go up or down.
const activeUsers = new Gauge({
  registers: [register],
  name: 'active_users',
  help: 'Number of active users',
});

// Middleware to track requests
/**
 * Middleware that records one `http_requests_total` increment and one
 * `http_request_duration_seconds` observation per completed response.
 *
 * NOTE(review): the body relies on `res.on('finish', ...)` and `next()`, so it
 * assumes Express/Node-style request and response objects. The unqualified
 * `Request`/`Response` annotations should resolve to the framework's types;
 * confirm the imports.
 *
 * @param req  Incoming request; only `url` and `method` are read.
 * @param res  Outgoing response; metrics are recorded when it emits 'finish'.
 * @param next Continuation invoked synchronously after the listener is attached.
 */
export function metricsMiddleware(req: Request, res: Response, next: () => void) {
  const start = Date.now();
  // Node/Express expose an origin-relative `req.url` (e.g. "/orders?x=1"),
  // which `new URL(url)` rejects. The base is only used for relative input and
  // is discarded because only the pathname is kept.
  const path = new URL(req.url, 'http://localhost').pathname;

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    // Normalise once so both metrics share the same `path` label value.
    // `normalizePath` is defined elsewhere.
    const route = normalizePath(path);

    httpRequestsTotal.inc({
      method: req.method,
      path: route,
      // `statusCode` is the numeric status on a Node response; under Express
      // `res.status` is the setter method, not the code.
      status: res.statusCode,
    });

    httpRequestDuration.observe({ method: req.method, path: route }, duration);
  });

  next();
}

// Expose metrics endpoint
// Scrape endpoint: serialises every metric in `register`.
// The handler is async, so a rejection from `register.metrics()` has to be
// caught here; otherwise it surfaces as an unhandled rejection and the
// scrape request is left hanging.
app.get('/metrics', async (_req, res) => {
  try {
    // Collect first so headers are only set once the body is known to exist.
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error) {
    res.status(500).end(error instanceof Error ? error.message : String(error));
  }
});

Distributed Tracing

typescript
// OpenTelemetry setup for distributed tracing
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
import { trace, context, propagation, SpanStatusCode } from '@opentelemetry/api';

// Initialize SDK
// Resource attributes identifying this service; version and environment are
// read from the process environment and may be undefined.
const serviceResource = new Resource({
  [SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
  [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION,
  [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV,
});

// OTLP/HTTP trace exporter pointed at OTEL_EXPORTER_OTLP_ENDPOINT.
const otlpTraceExporter = new OTLPTraceExporter({
  url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
});

// SDK wired with the resource, the exporter and Node auto-instrumentations,
// started immediately at module load.
const sdk = new NodeSDK({
  resource: serviceResource,
  traceExporter: otlpTraceExporter,
  instrumentations: [getNodeAutoInstrumentations()],
});

sdk.start();

// Manual instrumentation
const tracer = trace.getTracer('api-service');

/** Returns a printable message for any thrown value, Error or not. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Processes an order inside a `processOrder` span, with one child span for
 * the payment charge and one for the inventory reservation.
 *
 * Every span is ended in a `finally` block, so a failing step cannot leave a
 * span open. The failing span is marked ERROR, the parent span also records
 * the exception, and the original error is rethrown to the caller.
 *
 * @param orderId Identifier passed to `paymentService.charge` and
 *                `inventoryService.reserve` (both defined elsewhere).
 * @returns `{ success: true }` once both steps have completed.
 * @throws Whatever the payment or inventory step throws.
 */
async function processOrder(orderId: string) {
  return tracer.startActiveSpan('processOrder', async (span) => {
    try {
      span.setAttribute('order.id', orderId);

      // Child span for payment
      await tracer.startActiveSpan('processPayment', async (paymentSpan) => {
        try {
          const result = await paymentService.charge(orderId);
          paymentSpan.setAttribute('payment.status', result.status);
        } catch (error) {
          paymentSpan.setStatus({
            code: SpanStatusCode.ERROR,
            message: describeError(error),
          });
          throw error;
        } finally {
          // Must run on the failure path too, or the span is never ended.
          paymentSpan.end();
        }
      });

      // Child span for inventory
      await tracer.startActiveSpan('updateInventory', async (inventorySpan) => {
        try {
          await inventoryService.reserve(orderId);
        } catch (error) {
          inventorySpan.setStatus({
            code: SpanStatusCode.ERROR,
            message: describeError(error),
          });
          throw error;
        } finally {
          inventorySpan.end();
        }
      });

      span.setAttribute('order.status', 'completed');
      span.setStatus({ code: SpanStatusCode.OK });

      return { success: true };
    } catch (error) {
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: describeError(error),
      });
      // The caught value is `unknown`; pass an Error through unchanged and
      // fall back to its string form otherwise.
      span.recordException(error instanceof Error ? error : String(error));
      throw error;
    } finally {
      span.end();
    }
  });
}

// Propagate context to downstream services
/**
 * POSTs `data` as JSON to a downstream service, forwarding the active trace
 * context so the callee can continue the same trace.
 *
 * @param endpoint URL of the downstream service.
 * @param data     Payload; serialised with `JSON.stringify`.
 * @returns The `fetch` response; HTTP error statuses are not turned into
 *          exceptions here.
 */
async function callDownstreamService(endpoint: string, data: unknown): Promise<Response> {
  // Carrier that the propagator writes its header fields into.
  const headers: Record<string, string> = {};

  // Inject trace context into headers
  propagation.inject(context.active(), headers);

  return fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      ...headers, // Trace-context headers written by the propagator (e.g. traceparent)
    },
    body: JSON.stringify(data),
  });
}

Alerting Strategy

yaml
# Prometheus alerting rules
# One rule group, `api-alerts`, with four rules. Each rule fires only after its
# expression has held continuously for the duration given in `for`.
groups:
  - name: api-alerts
    rules:
      # High error rate
      # Ratio of the 5xx request rate to the total request rate, both taken
      # over a 5-minute window and summed across all series. Fires as critical
      # when the ratio stays above 0.05 (5%) for 5 minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) 
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # Slow responses
      # 95th-percentile request duration, estimated from the histogram buckets
      # aggregated by `le` over a 5-minute window. Fires as a warning when it
      # stays above 2 seconds for 5 minutes.
      - alert: SlowResponses
        expr: |
          histogram_quantile(0.95, 
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 2 seconds"
          description: "P95 latency is {{ $value }}s"

      # Service down
      # Fires as critical when `up` for job "api-service" has been 0 for
      # 1 minute.
      - alert: ServiceDown
        expr: up{job="api-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"

      # High memory usage
      # Resident memory in bytes divided by 1024^3 (i.e. GiB), evaluated per
      # series with no aggregation. Fires as a warning when it stays above 4
      # for 10 minutes.
      - alert: HighMemoryUsage
        expr: |
          process_resident_memory_bytes / 1024 / 1024 / 1024 > 4
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}GB"

Observability Stack

Recommended Observability Stack

Logging:

- Collection: Fluent Bit / Vector

- Storage: Elasticsearch / Loki

- UI: Kibana / Grafana

Metrics:

- Collection: Prometheus / VictoriaMetrics

- Storage: Prometheus / Thanos

- UI: Grafana

Tracing:

- Collection: OpenTelemetry Collector

- Storage: Jaeger / Tempo

- UI: Jaeger UI / Grafana

All-in-one Options:

- Datadog, New Relic, Dynatrace (commercial)

- Grafana Cloud (managed open source)

Conclusion

Effective observability requires combining logs, metrics, and traces to get a complete picture of your system. Start with structured logging, add key metrics, and implement distributed tracing as your system grows. The investment in observability pays off significantly during incidents.

Need help implementing observability? Contact Jishu Labs for expert DevOps consulting and monitoring setup.

DK

About David Kumar

David Kumar is the DevOps Lead at Jishu Labs with extensive experience in building observable systems at scale.

Related Articles

Ready to Build Your Next Project?

Let's discuss how our expert team can help bring your vision to life.

Top-Rated
Software Development
Company

Ready to Get Started?

Get consistent results. Collaborate in real-time.
Build Intelligent Apps. Work with Jishu Labs.

SCHEDULE MY CALL