Golang Microservices Architecture

After building and maintaining several high-traffic microservices systems in Go for companies processing billions of requests monthly, I've learned that the difference between a microservice that works and one that works in production comes down to handling the details that textbooks skip.

Why Go for Microservices?

Go isn't just fast—it's predictable. When you're running 50 microservices handling millions of requests, predictability matters more than raw speed. Here's what makes Go special:

  • Goroutines: Handle 100,000 concurrent connections on modest hardware
  • Static Compilation: Deploy a single binary, no runtime dependencies
  • Fast Startup: Services boot in milliseconds, perfect for auto-scaling
  • Built-in Concurrency: Channels and goroutines make parallel processing intuitive

Architecture Pattern: The Service Mesh

Service Structure

// cmd/user-service/main.go
package main

import (
    "context"
    "os"
    "os/signal"
    "syscall"
    "time"
    
    "github.com/yourorg/user-service/internal/config"
    "github.com/yourorg/user-service/internal/server"
    "github.com/yourorg/user-service/pkg/logger"
)

// main loads configuration, starts the server in the background, and then
// blocks until SIGINT/SIGTERM arrives or the server reports a start failure.
// It then asks the server to shut down, allowing up to 30 seconds.
func main() {
    cfg, err := config.Load()
    if err != nil {
        logger.Fatal("Failed to load config", "error", err)
    }

    srv := server.New(cfg)

    // Capacity 1: os/signal does not block when delivering, so an unbuffered
    // channel could miss a signal sent before main reaches the receive below.
    quit := make(chan os.Signal, 1)
    signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)

    // Run the server in the background. A start failure is funneled into the
    // same channel as a synthetic SIGTERM so it follows the normal shutdown path.
    go func() {
        startErr := srv.Start()
        if startErr == nil {
            return
        }
        logger.Error("Server failed", "error", startErr)
        quit <- syscall.SIGTERM
    }()

    <-quit
    logger.Info("Shutting down server...")

    // Bound the shutdown so a stuck connection cannot hold the process forever.
    shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    shutdownErr := srv.Shutdown(shutdownCtx)
    if shutdownErr != nil {
        logger.Fatal("Server forced to shutdown", "error", shutdownErr)
    }

    logger.Info("Server exited")
}

Robust Error Handling

Custom Error Types with Context

// pkg/errors/errors.go
package errors

import (
    "fmt"
    "runtime"
)

// ServiceError is an application-level error that carries a machine-readable
// code, a human-readable message, the HTTP status to respond with, the
// underlying cause (if any), and the source location where it was created.
type ServiceError struct {
    Code       string // machine-readable identifier, e.g. "USER_NOT_FOUND"
    Message    string // human-readable description
    StatusCode int    // HTTP status code to send to the client
    Err        error  // underlying cause; may be nil
    File       string // source file of the caller of New
    Line       int    // source line of the caller of New
}

// Error implements the error interface. The underlying cause is included
// when present, followed by the file and line recorded at creation.
func (e *ServiceError) Error() string {
    if e.Err != nil {
        return fmt.Sprintf("%s: %v [%s:%d]", e.Message, e.Err, e.File, e.Line)
    }
    return fmt.Sprintf("%s [%s:%d]", e.Message, e.File, e.Line)
}

// Unwrap returns the underlying cause so that the standard library's
// errors.Is and errors.As can inspect the chain through a ServiceError.
// It returns nil when no cause was supplied.
func (e *ServiceError) Unwrap() error {
    return e.Err
}

// New builds a ServiceError and records the file and line of its caller.
// err may be nil when there is no underlying cause.
func New(code, message string, statusCode int, err error) *ServiceError {
    // Skip 1 frame so the location is New's caller rather than New itself.
    // If the lookup fails, file and line are left as "" and 0.
    _, file, line, _ := runtime.Caller(1)
    return &ServiceError{
        Code:       code,
        Message:    message,
        StatusCode: statusCode,
        Err:        err,
        File:       file,
        Line:       line,
    }
}

// Usage in handlers
// GetUser looks up the user identified by the "id" path parameter and writes
// it as JSON with status 200. Any lookup error is reported to the client as a
// 404 response carrying the USER_NOT_FOUND code and message.
func (h *Handler) GetUser(c *gin.Context) {
    id := c.Param("id")
    reqCtx := c.Request.Context()

    user, err := h.userRepo.FindByID(reqCtx, id)
    if err == nil {
        c.JSON(200, user)
        return
    }

    // Wrap the repository error; only the code and message are exposed to the client.
    notFound := errors.New("USER_NOT_FOUND", "User not found", 404, err)
    body := gin.H{
        "error":   notFound.Code,
        "message": notFound.Message,
    }
    c.JSON(notFound.StatusCode, body)
}

Inter-Service Communication

gRPC for High-Performance Communication

// proto/user/user.proto
syntax = "proto3";

package user;

// UserService exposes user lookup and token validation RPCs.
service UserService {
    // GetUser takes a GetUserRequest and returns a UserResponse.
    rpc GetUser(GetUserRequest) returns (UserResponse);
    // NOTE(review): ValidateTokenRequest and ValidateTokenResponse are not
    // defined in this excerpt; they must be declared for the file to compile.
    rpc ValidateToken(ValidateTokenRequest) returns (ValidateTokenResponse);
}

// GetUserRequest identifies the user to fetch.
message GetUserRequest {
    string user_id = 1;
}

// UserResponse carries the fields returned for a user.
message UserResponse {
    string id = 1;
    string email = 2;
    string name = 3;
    // NOTE(review): presumably a Unix timestamp; confirm the unit with the server.
    int64 created_at = 4;
}

// internal/grpc/client.go
package grpc

import (
    "context"
    "time"
    
    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"
    pb "github.com/yourorg/user-service/proto/user"
)

// UserClient wraps the generated UserService gRPC client together with the
// connection it runs on, so the connection can be closed via Close.
type UserClient struct {
    client pb.UserServiceClient
    conn   *grpc.ClientConn
}

// NewUserClient dials addr and blocks until the connection is established or
// five seconds have passed, in which case the dial error is returned.
//
// NOTE: the connection uses insecure (plaintext) transport credentials.
func NewUserClient(addr string) (*UserClient, error) {
    // grpc.WithTimeout is deprecated; the documented replacement is a
    // context deadline passed to DialContext (effective together with WithBlock).
    dialCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()

    conn, err := grpc.DialContext(dialCtx, addr,
        grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithBlock(),
    )
    if err != nil {
        return nil, err
    }

    return &UserClient{
        client: pb.NewUserServiceClient(conn),
        conn:   conn,
    }, nil
}

// GetUser calls the GetUser RPC for userID. The call is bounded by a
// three-second timeout layered on top of the caller's ctx.
func (c *UserClient) GetUser(ctx context.Context, userID string) (*pb.UserResponse, error) {
    ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
    defer cancel()

    return c.client.GetUser(ctx, &pb.GetUserRequest{
        UserId: userID,
    })
}

// Close closes the underlying gRPC connection.
func (c *UserClient) Close() error {
    return c.conn.Close()
}

Observability: Logging, Metrics, and Tracing

Structured Logging with Zerolog

// pkg/logger/logger.go
package logger

import (
    "os"
    "github.com/rs/zerolog"
)

// Log is the package-wide logger. It holds the zero zerolog.Logger until Init
// has been called.
var Log zerolog.Logger

// Init configures Log to write to stdout with Unix-format timestamps and a
// "service" field set to serviceName on every entry.
func Init(serviceName string) {
    zerolog.TimeFieldFormat = zerolog.TimeFormatUnix

    base := zerolog.New(os.Stdout)
    Log = base.With().Timestamp().Str("service", serviceName).Logger()
}

// Usage in handlers
// ProcessOrder runs the order service for orderID, logging the start and the
// outcome with the order ID and the trace ID taken from ctx attached to every
// entry. The error from the order service is returned unchanged.
func (h *Handler) ProcessOrder(ctx context.Context, orderID string) error {
    fields := logger.Log.With().
        Str("order_id", orderID).
        Str("trace_id", getTraceID(ctx))
    log := fields.Logger()

    log.Info().Msg("Processing order started")

    if err := h.orderService.Process(ctx, orderID); err != nil {
        log.Error().Err(err).Msg("Failed to process order")
        return err
    }

    log.Info().Msg("Order processed successfully")
    return nil
}

Prometheus Metrics

// pkg/metrics/metrics.go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// RequestDuration is a histogram of HTTP request durations in seconds,
// labelled by method, endpoint, and status, using the default buckets.
var RequestDuration = promauto.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "http_request_duration_seconds",
        Help:    "Duration of HTTP requests",
        Buckets: prometheus.DefBuckets,
    },
    []string{"method", "endpoint", "status"},
)

// ActiveConnections is a gauge for the number of active connections.
var ActiveConnections = promauto.NewGauge(
    prometheus.GaugeOpts{
        Name: "active_connections",
        Help: "Number of active connections",
    },
)

// CacheHitRate counts cache requests, labelled by status ("hit" or "miss").
// It is a raw counter; a rate has to be derived at query time.
var CacheHitRate = promauto.NewCounterVec(
    prometheus.CounterOpts{
        Name: "cache_requests_total",
        Help: "Total cache requests",
    },
    []string{"status"},
)

// Middleware for automatic metrics collection
// MetricsMiddleware wraps next and records the duration of every request in
// RequestDuration, labelled with the request method, URL path, and the
// response status code captured by the responseWriter wrapper.
//
// NOTE(review): the raw URL path is used as a label value. If paths contain
// IDs, this produces one time series per distinct path; consider whether a
// route template is available instead.
func MetricsMiddleware(next http.Handler) http.Handler {
    instrumented := func(w http.ResponseWriter, r *http.Request) {
        began := time.Now()

        // Start from 200 so handlers that never set a status are still labelled.
        recorder := &responseWriter{ResponseWriter: w, statusCode: 200}
        next.ServeHTTP(recorder, r)

        elapsed := time.Since(began).Seconds()
        status := strconv.Itoa(recorder.statusCode)
        RequestDuration.WithLabelValues(r.Method, r.URL.Path, status).Observe(elapsed)
    }
    return http.HandlerFunc(instrumented)
}

Database Connection Pooling

// internal/database/pool.go
package database

import (
    "database/sql"
    "time"
    _ "github.com/lib/pq"
)

// NewPool opens a PostgreSQL connection pool for dsn, applies the pool limits
// below, and verifies connectivity with a ping. On any failure it returns a
// nil pool and the error; a pool that failed the ping is closed first so its
// resources are not leaked.
func NewPool(dsn string) (*sql.DB, error) {
    db, err := sql.Open("postgres", dsn)
    if err != nil {
        return nil, err
    }

    // Critical for production performance
    db.SetMaxOpenConns(25)                 // Max connections
    db.SetMaxIdleConns(25)                 // Keep connections alive
    db.SetConnMaxLifetime(5 * time.Minute) // Recycle old connections
    // NOTE(review): this idle limit is longer than the 5-minute lifetime
    // above, so the lifetime closes connections first; confirm the intent.
    db.SetConnMaxIdleTime(10 * time.Minute)

    // sql.Open does not connect by itself; Ping forces a real connection.
    if err := db.Ping(); err != nil {
        // Release the pool before returning. The ping error is the one worth
        // reporting, so a secondary Close error is intentionally dropped.
        _ = db.Close()
        return nil, err
    }

    return db, nil
}

Circuit Breaker Pattern

// pkg/circuitbreaker/breaker.go
package circuitbreaker

import (
    "sync"
    "time"
)

// State is the operating mode of a CircuitBreaker.
type State int

const (
    // StateClosed lets every call through.
    StateClosed State = iota
    // StateHalfOpen lets calls through as probes after the reset timeout.
    StateHalfOpen
    // StateOpen rejects calls with ErrCircuitOpen until the reset timeout passes.
    StateOpen
)

// CircuitBreaker stops calling a failing dependency once maxFailures
// consecutive failures have been seen, and probes it again after resetTimeout.
type CircuitBreaker struct {
    maxFailures  int
    resetTimeout time.Duration

    mu           sync.RWMutex // guards the fields below
    state        State
    failures     int
    lastFailTime time.Time
}

// New returns a closed CircuitBreaker that opens after maxFailures
// consecutive failures and allows a probe once resetTimeout has elapsed
// since the last failure.
func New(maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
    return &CircuitBreaker{
        maxFailures:  maxFailures,
        resetTimeout: resetTimeout,
        state:        StateClosed,
    }
}

// Call runs fn unless the circuit is open, in which case it returns
// ErrCircuitOpen without calling fn. Otherwise it returns fn's error
// unchanged and updates the breaker state from the outcome.
//
// The lock is not held while fn runs, so concurrent calls are allowed.
func (cb *CircuitBreaker) Call(fn func() error) error {
    cb.mu.Lock()

    // Check if we should transition from Open to HalfOpen
    if cb.state == StateOpen {
        if time.Since(cb.lastFailTime) > cb.resetTimeout {
            cb.state = StateHalfOpen
        } else {
            cb.mu.Unlock()
            return ErrCircuitOpen
        }
    }

    cb.mu.Unlock()

    // Execute the function
    err := fn()

    cb.mu.Lock()
    defer cb.mu.Unlock()

    if err != nil {
        cb.failures++
        cb.lastFailTime = time.Now()

        // A failed half-open probe reopens the circuit immediately, even if
        // the counter was reset by a success that finished in the meantime.
        // Relying on the counter alone could leave the breaker half-open.
        if cb.state == StateHalfOpen || cb.failures >= cb.maxFailures {
            cb.state = StateOpen
        }
        return err
    }

    // Success - reset if we were in HalfOpen
    if cb.state == StateHalfOpen {
        cb.state = StateClosed
    }
    cb.failures = 0

    return nil
}

// Usage example
// orderServiceBreaker guards calls to the order service: it is built with a
// failure threshold of 5 and a reset timeout of 60 seconds.
var orderServiceBreaker = circuitbreaker.New(5, 60*time.Second)

// PlaceOrder creates order through the order service, routed through
// orderServiceBreaker. It returns whatever the breaker's Call returns.
func (c *Client) PlaceOrder(ctx context.Context, order *Order) error {
    create := func() error {
        return c.orderService.Create(ctx, order)
    }
    return orderServiceBreaker.Call(create)
}

Deployment and CI/CD

Multi-Stage Docker Build

# Dockerfile
# Build stage: compile a static binary with the Go toolchain.
FROM golang:1.21-alpine AS builder

WORKDIR /build

# Copy dependency files first so the module download layer is cached
# until go.mod or go.sum change.
COPY go.mod go.sum ./
RUN go mod download

# Copy source code
COPY . .

# Build a static, stripped binary. CGO is disabled so the result does not
# depend on the C library of the build image. The old "-a -installsuffix cgo"
# flags are omitted: they force a full rebuild and are unnecessary with the
# build cache in modern Go.
RUN CGO_ENABLED=0 GOOS=linux go build \
    -ldflags '-extldflags "-static" -s -w' \
    -o user-service ./cmd/user-service

# Final minimal image. The tag is pinned so rebuilds are reproducible;
# bump it deliberately when updating the base image.
FROM alpine:3.19

# CA certificates for outbound TLS, tzdata for time zone lookups.
RUN apk --no-cache add ca-certificates tzdata

WORKDIR /app

# Copy only the binary
COPY --from=builder /build/user-service .

# Run as non-root user
RUN adduser -D -u 1000 appuser
USER appuser

EXPOSE 8080

CMD ["./user-service"]

Kubernetes Deployment

# k8s/deployment.yaml
# Deployment for the user service: three replicas of a single container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: user-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: user-service
  template:
    metadata:
      labels:
        app: user-service
    spec:
      containers:
      - name: user-service
        # NOTE(review): ":latest" is a mutable tag; consider deploying an
        # immutable version tag or digest so rollouts are reproducible.
        image: your-registry/user-service:latest
        ports:
        - containerPort: 8080
        env:
        # The database URL is read from a Secret rather than written inline.
        - name: DATABASE_URL
          valueFrom:
            secretKeyRef:
              name: user-service-secrets
              key: database-url
        resources:
          # Requests are what the scheduler reserves; limits are the hard cap.
          requests:
            memory: "64Mi"
            cpu: "100m"
          limits:
            memory: "256Mi"
            cpu: "500m"
        # Liveness: a failing /health check causes the container to be restarted.
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 30
        # Readiness: a failing /ready check removes the pod from service endpoints.
        readinessProbe:
          httpGet:
            path: /ready
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 10

Testing Strategies

Table-Driven Tests

// internal/service/user_test.go
// TestUserService_CreateUser is a table-driven test for CreateUser. Each case
// gets a fresh service from setupTestService and checks either the returned
// user's email and name or the code of the returned ServiceError.
func TestUserService_CreateUser(t *testing.T) {
    tests := []struct {
        name    string
        input   *CreateUserInput
        want    *User
        wantErr bool
        errCode string
    }{
        {
            name: "valid user creation",
            input: &CreateUserInput{
                Email: "test@example.com",
                Name:  "Test User",
            },
            want: &User{
                Email: "test@example.com",
                Name:  "Test User",
            },
            wantErr: false,
        },
        {
            name: "duplicate email",
            input: &CreateUserInput{
                Email: "existing@example.com",
                Name:  "Test User",
            },
            wantErr: true,
            errCode: "USER_EXISTS",
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            svc := setupTestService(t)

            got, err := svc.CreateUser(context.Background(), tt.input)

            if tt.wantErr {
                require.Error(t, err)

                // ErrorAs fails the test cleanly when err is not (or does not
                // wrap) a *ServiceError; a bare type assertion would panic.
                var svcErr *ServiceError
                require.ErrorAs(t, err, &svcErr)
                assert.Equal(t, tt.errCode, svcErr.Code)
                return
            }

            require.NoError(t, err)
            assert.Equal(t, tt.want.Email, got.Email)
            assert.Equal(t, tt.want.Name, got.Name)
        })
    }
}

Performance Patterns

Connection Pooling and Reuse

// internal/client/http_client.go
package client

import (
    "net"
    "net/http"
    "time"
)

// NewHTTPClient returns an HTTP client intended to be shared and reused. It
// has a 10-second overall request timeout and a transport that keeps up to
// 100 idle connections (also 100 per host) for 90 seconds, with 5-second
// dial and TLS handshake timeouts and 30-second TCP keep-alives.
func NewHTTPClient() *http.Client {
    dialer := &net.Dialer{
        Timeout:   5 * time.Second,
        KeepAlive: 30 * time.Second,
    }

    transport := &http.Transport{
        DialContext:           dialer.DialContext,
        MaxIdleConns:          100,
        MaxIdleConnsPerHost:   100,
        IdleConnTimeout:       90 * time.Second,
        TLSHandshakeTimeout:   5 * time.Second,
        ExpectContinueTimeout: 1 * time.Second,
    }

    return &http.Client{
        Timeout:   10 * time.Second,
        Transport: transport,
    }
}

Worker Pool for Concurrent Processing

// pkg/worker/pool.go
package worker

import (
    "context"
    "sync"
)

// Job is a unit of work run by a Pool worker. It receives the context that
// was passed to Pool.Start.
type Job func(context.Context) error

// Pool runs submitted jobs on a fixed number of worker goroutines.
// Start must be called before Submit, and Shutdown must be called once.
type Pool struct {
    workers int
    jobs    chan Job
    wg      sync.WaitGroup

    // done is the Done channel of the context given to Start. It is nil
    // before Start, and a nil channel never becomes ready in a select.
    done <-chan struct{}
}

// NewPool returns a pool with the given number of workers and a job queue
// that buffers up to twice that many jobs.
func NewPool(workers int) *Pool {
    return &Pool{
        workers: workers,
        jobs:    make(chan Job, workers*2),
    }
}

// Start launches the worker goroutines. They run until the job queue is
// closed by Shutdown or ctx is cancelled, whichever happens first.
func (p *Pool) Start(ctx context.Context) {
    p.done = ctx.Done()
    for i := 0; i < p.workers; i++ {
        p.wg.Add(1)
        go p.worker(ctx)
    }
}

// worker pulls jobs off the queue and runs them, logging any job error.
// It exits when the queue is closed or ctx is cancelled.
func (p *Pool) worker(ctx context.Context) {
    defer p.wg.Done()

    for {
        select {
        case job, ok := <-p.jobs:
            if !ok {
                return
            }
            if err := job(ctx); err != nil {
                logger.Error("Job failed", "error", err)
            }
        case <-ctx.Done():
            return
        }
    }
}

// Submit queues job, blocking while the queue is full. If the context given
// to Start is cancelled, Submit returns without queuing the job: the workers
// have stopped, so waiting for queue space would block forever.
// Submit must not be called after Shutdown.
func (p *Pool) Submit(job Job) {
    select {
    case p.jobs <- job:
    case <-p.done:
    }
}

// Shutdown closes the job queue and waits for all workers to exit. Workers
// finish the jobs still queued unless their context is cancelled first.
func (p *Pool) Shutdown() {
    close(p.jobs)
    p.wg.Wait()
}

// Usage: process 10,000 items with 50 workers.
// Create the pool and start its workers before submitting any jobs.
pool := worker.NewPool(50)
pool.Start(ctx)

for _, item := range items {
    // Shadow the loop variable so each closure gets its own copy; before
    // Go 1.22 all iterations shared a single variable.
    item := item // capture loop variable
    pool.Submit(func(ctx context.Context) error {
        return processItem(ctx, item)
    })
}

// Close the queue and wait for the workers to exit.
pool.Shutdown()

Real-World Case Study: Payment Processing System

We built a payment processing microservices system handling 50,000 transactions per minute. Here's the architecture:

Services Breakdown

  • API Gateway: Rate limiting, authentication, routing
  • Payment Service: Transaction processing
  • Notification Service: Email/SMS alerts
  • Analytics Service: Real-time reporting
  • Reconciliation Service: Daily settlement processing

Key Metrics Achieved

  • 99.99% uptime over 12 months
  • Sub-100ms P95 latency for payment processing
  • Zero data loss with event sourcing pattern
  • Auto-scaling from 3 to 50 instances during traffic spikes

What Made the Difference

// PaymentEvent is one entry in the event-sourced audit trail of a payment.
type PaymentEvent struct {
    ID          string          // unique identifier of this event
    PaymentID   string          // payment the event belongs to
    EventType   string          // e.g. "payment.processing.started"
    Payload     json.RawMessage // event data kept as raw JSON
    CreatedAt   time.Time       // when the event was created
    ProcessedAt *time.Time      // NOTE(review): presumably nil until processed; confirm
}

// ProcessPayment records a "payment.processing.started" event for payment in
// the event store and then publishes it on the "payment.events" topic.
// All state changes go through events: if the event cannot be stored, the
// error is returned and nothing is published.
func (s *PaymentService) ProcessPayment(ctx context.Context, payment *Payment) error {
    started := &PaymentEvent{
        ID:        uuid.New().String(),
        PaymentID: payment.ID,
        EventType: "payment.processing.started",
        Payload:   marshalPayment(payment),
        CreatedAt: time.Now(),
    }

    // Persist first so the event is in the append-only store before any
    // consumer can see it.
    err := s.eventStore.Append(ctx, started)
    if err != nil {
        return err
    }

    // Hand the event to the bus for asynchronous processing.
    s.eventBus.Publish("payment.events", started)
    return nil
}

Common Pitfalls and How to Avoid Them

1. Not Handling Context Cancellation

// ❌ Bad: Ignores context cancellation
// ctx is accepted but never consulted, so every item is processed even after
// the caller has cancelled. The result of process is also discarded.
func processLongRunningTask(ctx context.Context, items []Item) error {
    for _, item := range items {
        process(item) // Will continue even if client disconnects
    }
    return nil
}

// ✅ Good: Respects context
// Before each item the context is checked; once it is cancelled the loop
// stops and the context's error is returned. The first error from process
// also stops the loop and is returned.
func processLongRunningTask(ctx context.Context, items []Item) error {
    for _, current := range items {
        if cancelled := ctx.Err(); cancelled != nil {
            return cancelled
        }
        if err := process(current); err != nil {
            return err
        }
    }
    return nil
}

2. Goroutine Leaks

// ❌ Bad: Goroutines never cleaned up
// Starts 100 goroutines that each loop forever. Nothing can signal them to
// stop and nothing waits for them, so they outlive the call.
func leakyFunction() {
    for i := 0; i < 100; i++ {
        go func() {
            // This goroutine runs forever!
            for {
                doSomething()
                time.Sleep(1 * time.Second)
            }
        }()
    }
}

// ✅ Good: Controlled goroutine lifecycle
// Starts 100 goroutines that each call doSomething once per second until ctx
// is cancelled, and blocks until every one of them has returned.
func properFunction(ctx context.Context) {
    const goroutines = 100

    var wg sync.WaitGroup
    wg.Add(goroutines)

    loop := func() {
        defer wg.Done()

        // The ticker is stopped on exit so its resources are released.
        tick := time.NewTicker(time.Second)
        defer tick.Stop()

        for {
            select {
            case <-ctx.Done():
                return
            case <-tick.C:
                doSomething()
            }
        }
    }

    for n := 0; n < goroutines; n++ {
        go loop()
    }

    wg.Wait()
}

Conclusion

Building production-ready microservices in Go requires more than just understanding the language—it requires battle-tested patterns for error handling, observability, deployment, and scaling. The examples in this guide come from real production systems handling millions of requests.

Key Takeaways:

  • Use gRPC for inter-service communication when performance matters
  • Implement proper graceful shutdown for zero-downtime deployments
  • Add comprehensive observability from day one
  • Use worker pools and connection pooling to optimize resource usage
  • Test for failure scenarios, not just happy paths

Need help architecting your Go microservices? We've built and scaled systems from scratch to billions of requests. Let's discuss your project.