Pipes & Filters / Pipeline Architecture
Chain independent processing stages with data flowing through pipes
TL;DR
Pipes & Filters architecture chains independent processing stages (filters) connected by data streams (pipes). Each filter takes input, transforms it, and passes output to the next filter. Inspired by Unix philosophy (cat | grep | sort). Excellent for ETL, data processing, and streaming workloads. Simplicity is its strength; ordering is its constraint.
Learning Objectives
- Understand filters (processing stages) and pipes (data streams)
- Design reusable, composable filters
- Implement linear data pipelines
- Handle data format compatibility between filters
- Know when to use the pattern and when pipelines become unwieldy
Motivating Scenario
Your analytics system ingests user event logs, processes them through multiple stages: parse JSON → filter invalid events → enrich with user data → aggregate by hour → send to warehouse. Each stage is independent: parsing doesn't need to know about enrichment, aggregation doesn't care how filtering works. Pipes & Filters matches this naturally: each stage is a filter; data flows through pipes.
Core Concepts
Pipes & Filters follows the Unix philosophy: small, focused programs chained together.
Filter: A processing stage that reads input, transforms it, writes output. Single responsibility.
Pipe: A connection between filters. Carries data in a specific format (JSON, CSV, objects).
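In Python, a filter maps naturally to a generator function and a pipe to the lazy stream of values flowing between generators. A minimal sketch (the two toy filters below are illustrative placeholders, not part of the analytics example):

# Minimal sketch: each filter is a generator; the pipe is the lazy iteration between them
def strip_blanks(lines):
    """Filter: drop empty lines."""
    for line in lines:
        if line.strip():
            yield line.strip()

def to_upper(lines):
    """Filter: uppercase each line."""
    for line in lines:
        yield line.upper()

# Connecting the filters is the pipe; nothing runs until the output is consumed
for line in to_upper(strip_blanks(["hello\n", "\n", "world\n"])):
    print(line)  # HELLO, then WORLD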
Key Characteristics
Linear Flow: Data flows in one direction through the pipeline.
Independent Filters: Each filter knows nothing about others; can test, deploy, scale independently.
Composability: Add, remove, or reorder filters without modifying existing ones (see the sketch after this list).
Streaming: Can process unbounded data streams, not just batch.
Simplicity: Easy to understand, debug, and reason about.
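Composability is easiest to see when the pipeline is just a list of filters: adding, removing, or reordering a stage means editing the list, not the filters. A minimal sketch, assuming a small compose helper and two illustrative filters (none of these come from a library):

from functools import reduce

def compose(source, filters):
    """Connect filters in order: the output of each becomes the input of the next."""
    return reduce(lambda stream, f: f(stream), filters, source)

def only_errors(lines):
    for line in lines:
        if "ERROR" in line:
            yield line

def add_length(lines):
    for line in lines:
        yield f"{len(line)} {line}"

# Reorder, add, or drop stages by editing this list; the filters themselves stay untouched
pipeline = [only_errors, add_length]
for out in compose(["ERROR disk full", "INFO all good"], pipeline):
    print(out)  # "15 ERROR disk full"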
Practical Example
- Python (Generators)
- Java (Streams)
- Unix Command Line
- Using a Stream Processing Framework
# Pipeline using Python generators (pipes are just data flow)
import json

def parse_events(raw_data):
    """Filter: Parse JSON events."""
    for line in raw_data:
        try:
            yield json.loads(line)
        except json.JSONDecodeError:
            continue  # Skip invalid JSON

def validate_events(events):
    """Filter: Validate required fields."""
    for event in events:
        if 'user_id' in event and 'action' in event and 'timestamp' in event:
            yield event

def enrich_events(events, user_db):
    """Filter: Enrich with user data."""
    for event in events:
        user = user_db.get(event['user_id'], {})
        event['user_segment'] = user.get('segment', 'unknown')
        event['user_tier'] = user.get('tier', 'free')
        yield event

def aggregate_events(events):
    """Filter: Aggregate by hour (consumes the stream, then emits one bucket per hour)."""
    hourly_buckets = {}
    for event in events:
        hour_key = event['timestamp'][:13]  # YYYY-MM-DD HH
        bucket = hourly_buckets.setdefault(
            hour_key, {'hour': hour_key, 'count': 0, 'actions': {}})
        bucket['count'] += 1
        action = event['action']
        bucket['actions'][action] = bucket['actions'].get(action, 0) + 1
    yield from hourly_buckets.values()

def format_output(events):
    """Filter: Format as JSON for warehouse."""
    for event in events:
        yield json.dumps(event)

# Compose the pipeline
def run_pipeline(raw_data, user_db):
    parsed = parse_events(raw_data)
    validated = validate_events(parsed)
    enriched = enrich_events(validated, user_db)
    aggregated = aggregate_events(enriched)
    formatted = format_output(aggregated)
    return formatted

# Usage (load_user_database and warehouse are application-specific)
with open('events.jsonl') as f:
    user_db = load_user_database()
    results = run_pipeline(f, user_db)
    for line in results:
        warehouse.insert(json.loads(line))
// Java 8+ Stream API naturally supports pipes & filters
// Event, User, UserDatabase, AggregatedMetric, and warehouse are application-defined
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class EventPipeline {
    private static final ObjectMapper objectMapper = new ObjectMapper();

    // Filter: Parse events (unparseable lines become null and are filtered out below)
    static Event parseEvent(String line) {
        try {
            return objectMapper.readValue(line, Event.class);
        } catch (IOException e) {
            return null;
        }
    }

    // Filter: Validate
    static boolean isValidEvent(Event event) {
        return event != null &&
               event.getUserId() != null &&
               event.getAction() != null &&
               event.getTimestamp() != null;
    }

    // Filter: Enrich
    static Event enrichEvent(Event event, UserDatabase userDb) {
        User user = userDb.get(event.getUserId());
        if (user != null) {
            event.setUserSegment(user.getSegment());
            event.setUserTier(user.getTier());
        }
        return event;
    }

    // Filter: Aggregate (count events per hour)
    static Map<Instant, Long> aggregate(List<Event> events) {
        return events.stream()
            .collect(Collectors.groupingBy(
                e -> e.getTimestamp().truncatedTo(ChronoUnit.HOURS),
                Collectors.counting()
            ));
    }

    public static void main(String[] args) throws IOException {
        UserDatabase userDb = new UserDatabase();
        try (BufferedReader reader = new BufferedReader(
                new FileReader("events.jsonl"))) {
            List<Event> events = reader.lines()
                .map(EventPipeline::parseEvent)
                .filter(EventPipeline::isValidEvent)
                .map(e -> enrichEvent(e, userDb))
                .collect(Collectors.toList());

            List<AggregatedMetric> results = aggregate(events)
                .entrySet()
                .stream()
                .map(e -> new AggregatedMetric(e.getKey(), e.getValue()))
                .collect(Collectors.toList());

            results.forEach(warehouse::insert);
        }
    }
}
# Unix pipes & filters - the original inspiration
# Example: Process log files and generate report
# Filter 1: cat (read raw logs)
# Filter 2: grep (keep only HTTP 500 errors)
# Filter 3: awk (extract the user ID field)
# Filter 4: sort (order user IDs so duplicates are adjacent)
# Filter 5: uniq -c (count occurrences per user)
# Filter 6: sort -rn (rank users by count, descending)
cat /var/log/app.log \
| grep "HTTP 500" \
| awk '{print $5}' \
| sort \
| uniq -c \
| sort -rn
# Each filter is independent and reusable
# Output of cat = input of grep
# Output of grep = input of awk
# etc.
# Can swap filters, add/remove stages, reorder as needed
# Using Apache Beam (or a similar framework)
# for more complex or distributed pipelines
import json

import apache_beam as beam
from apache_beam.pipeline import Pipeline

# Define filters as transforms
class ParseEventsFn(beam.DoFn):
    def process(self, line):
        try:
            yield json.loads(line)
        except json.JSONDecodeError:
            pass  # Skip invalid JSON

class ValidateEventsFn(beam.DoFn):
    def process(self, event):
        if all(k in event for k in ['user_id', 'action', 'timestamp']):
            yield event

class EnrichEventsFn(beam.DoFn):
    def process(self, event):
        user_db = get_user_db()  # Shared resource (application-specific helper)
        user = user_db.get(event['user_id'], {})
        event['user_segment'] = user.get('segment', 'unknown')
        yield event

# Compose the pipeline
with Pipeline() as pipeline:
    events = (
        pipeline
        | 'Read' >> beam.io.ReadFromText('events.jsonl')
        | 'Parse' >> beam.ParDo(ParseEventsFn())
        | 'Validate' >> beam.ParDo(ValidateEventsFn())
        | 'Enrich' >> beam.ParDo(EnrichEventsFn())
        | 'KeyByHour' >> beam.Map(lambda e: e['timestamp'][:13])  # hour key for counting
        | 'Aggregate' >> beam.combiners.Count.PerElement()
        | 'Format' >> beam.Map(json.dumps)
        | 'Write' >> beam.io.WriteToText('output.txt')
    )
When to Use / When Not to Use
Use it when:
- Processing linear data streams (logs, events, files)
- Transformation stages are independent and reusable
- Building ETL (Extract, Transform, Load) pipelines
- Each stage has a single, well-defined responsibility
- You want simplicity and ease of understanding
- The workflow can be expressed as a sequence of transforms
Avoid it when:
- You need branching or conditional logic (if-then-else)
- Stages have complex interdependencies
- You require bidirectional or non-linear data flow
- Stages need multiple inputs or outputs
- The pipeline structure must change dynamically at runtime
Patterns and Pitfalls
Design Review Checklist
- Is each filter focused on a single transformation?
- Can filters be tested in isolation with mock input/output?
- Is the data format consistent between filters?
- Are filters stateless (or is state explicitly documented)?
- Can you reorder filters without breaking the pipeline?
- Is error handling clear (skip, log, fail fast)? (see the wrapper sketch after this checklist)
- Are there performance bottlenecks (profile filters)?
- Is the pipeline ordering intuitive (matches logical flow)?
- Can new filters be added without modifying existing ones?
- Is the pipeline testable end-to-end with realistic data?
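One way to keep that error-handling decision explicit is to wrap filters with a policy, so "skip and log" versus "fail fast" is chosen in one place rather than scattered across filters. A minimal sketch, assuming the generator-based filters shown earlier (the wrapper itself is illustrative, not from a library):

import logging

def with_error_policy(filter_fn, policy="skip"):
    """Wrap a generator filter so per-item failures are skipped and logged, or re-raised."""
    def wrapped(items):
        for item in items:
            try:
                yield from filter_fn([item])  # run the filter on one item at a time
            except Exception:
                if policy == "fail_fast":
                    raise
                logging.exception("Dropping item that failed in %s", filter_fn.__name__)
    return wrapped

# Usage sketch: safe_enrich behaves like enrich_events but skips items that raise
# safe_enrich = with_error_policy(lambda evts: enrich_events(evts, user_db))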
Self-Check
- What's the main benefit of pipes & filters architecture? Simplicity, reusability, and composability. Each filter is independent, so it is easy to test, modify, and reuse in other pipelines.
- When would branching break the pipes & filters model? As soon as data must take different paths (valid vs invalid events, say), you are no longer in a purely linear flow. Consider an event-driven architecture for conditional routing.
- How do you handle a filter that's a bottleneck? Profile it first. If it's I/O bound (e.g., a network call), cache results, pool connections, or overlap the calls concurrently (see the sketch below). If it's CPU bound, parallelize the work across processes or optimize the algorithm.
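For example, when enrichment is I/O bound, the blocking lookups can be overlapped with a thread pool while the filter keeps its generator contract. A minimal sketch (fetch_user is a hypothetical blocking lookup function; the batch size and worker count are illustrative):

from concurrent.futures import ThreadPoolExecutor
from itertools import islice

def enrich_events_parallel(events, fetch_user, workers=8, batch_size=100):
    """Filter: same contract as enrich_events, but lookups within a batch run concurrently."""
    events = iter(events)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while batch := list(islice(events, batch_size)):
            # pool.map preserves input order, so downstream filters see events in order
            users = pool.map(fetch_user, [e['user_id'] for e in batch])
            for event, user in zip(batch, users):
                event['user_segment'] = (user or {}).get('segment', 'unknown')
                yield event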
One Takeaway: Pipes & Filters is the Unix philosophy applied to system architecture. Use it for linear, data-intensive workflows. When logic becomes conditional or branching, consider event-driven architecture.
Next Steps
- Stream Processing Frameworks: Kafka Streams, Apache Flink, Beam
- Event-Driven Architecture: When you need non-linear, conditional workflows
- Parallelization Strategies: Process multiple pipelines concurrently
- Monitoring Data Pipelines: Lag, throughput, error rates
- Backpressure Handling: Managing fast producers and slow consumers
References
- Richards, M., & Ford, N. (2020). Fundamentals of Software Architecture. O'Reilly.
- McIlroy, M. D., et al. (1978). Unix Time-Sharing System: The UNIX System. Bell Labs.
- Kreps, J. (2014). The Log: What every software engineer should know about real-time data. LinkedIn Engineering.