Search Engines & Inverted Indexes

from elasticsearch import Elasticsearch
import json

# Client for the local node. The URL spells out the scheme because
# elasticsearch-py 8.x rejects scheme-less hosts such as 'localhost:9200',
# while 7.x accepts both forms.
es = Elasticsearch(['http://localhost:9200'])

# Index mapping for 'products', assembled field by field.
# 'title' is standard-analyzed text that also carries a 'keyword' sub-field.
_title_field = {
    'type': 'text',
    'analyzer': 'standard',
    'fields': {'keyword': {'type': 'keyword'}},  # For exact match
}
_tags_field = {'type': 'text', 'analyzer': 'standard'}

_product_properties = {
    'title': _title_field,
    'description': {'type': 'text'},
    'price': {'type': 'float'},
    'rating': {'type': 'float'},
    'brand': {'type': 'keyword'},
    'category': {'type': 'keyword'},
    'tags': _tags_field,
    'created_at': {'type': 'date'},
}

mapping = {'mappings': {'properties': _product_properties}}

# Create the 'products' index from the mapping defined above.
# NOTE(review): ignore=400 presumably makes the client tolerate the HTTP 400
# returned when the index already exists, so re-running is harmless — confirm
# against the installed client version.
es.indices.create(index='products', body=mapping, ignore=400)

# Sample document to index; its keys line up with the fields declared in
# the 'products' mapping.
product = dict(
    title='Python Programming Guide',
    description='Comprehensive guide to Python programming',
    price=29.99,
    rating=4.5,
    brand='TechBooks',
    category='Programming',
    tags=['python', 'programming', 'guide'],
    created_at='2025-02-14',
)

# Store the sample document in the 'products' index under the explicit id 1.
es.index(index='products', id=1, body=product)

# Bulk indexing
from elasticsearch.helpers import bulk

def _as_bulk_action(doc_id, source):
    """Wrap *source* as a bulk action addressed to the 'products' index."""
    return {'_index': 'products', '_id': doc_id, '_source': source}


# One action per item yielded by get_all_products(); the item's position in
# that sequence (starting at 0) is used as its document id.
documents = [
    _as_bulk_action(position, item)
    for position, item in enumerate(get_all_products())
]
bulk(es, documents)

# Full-text search: a single multi_match clause over three fields,
# requesting 20 hits starting at offset 0.
_search_fields = ['title^3', 'description', 'tags']  # title weighted 3x
_multi_match = {
    'query': 'python programming',
    'fields': _search_fields,
    'type': 'best_fields',
}

query = {'query': {'multi_match': _multi_match}, 'size': 20, 'from': 0}

# Run the full-text query and print one "<score>: <title>" line per hit.
results = es.search(index='products', body=query)
matches = results['hits']['hits']
for match in matches:
    score, title = match['_score'], match['_source']['title']
    print(f"{score}: {title}")

# Faceted search (aggregations)
# Combines a text match on title/description with three aggregations:
# top 10 brands, top 10 categories, and three price bands that share the
# boundary values 20 and 50.
faceted_query = {
    'query': {
        'bool': {
            'must': [
                {'multi_match': {
                    'query': 'python',
                    'fields': ['title', 'description']
                }}
            ]
        }
    },
    'aggs': {
        'brands': {
            'terms': {'field': 'brand', 'size': 10}
        },
        'categories': {
            'terms': {'field': 'category', 'size': 10}
        },
        'price_ranges': {
            'range': {
                'field': 'price',
                'ranges': [
                    {'to': 20},
                    {'from': 20, 'to': 50},
                    {'from': 50}
                ]
            }
        }
    }
}

# Execute the faceted query, then list every bucket of the 'brands'
# aggregation as "<key>: <doc_count>".
results = es.search(index='products', body=faceted_query)

brand_buckets = results['aggregations']['brands']['buckets']
for facet in brand_buckets:
    print(f"{facet['key']}: {facet['doc_count']}")

# Advanced query with filtering
# Text match on title/description, narrowed to price <= 50,
# brand == 'TechBooks' and rating >= 4.0.
advanced_query = {
    'query': {
        'bool': {
            'must': [
                {'multi_match': {
                    'query': 'python',
                    'fields': ['title', 'description']
                }}
            ],
            'filter': [
                {'range': {'price': {'lte': 50}}},
                # 'brand' is mapped as a plain keyword field with no
                # sub-fields, so the term filter must target 'brand' itself;
                # the mapping defines no 'brand.keyword'.
                {'term': {'brand': 'TechBooks'}},
                {'range': {'rating': {'gte': 4.0}}}
            ]
        }
    }
}

# Run the filtered query against 'products'; 'results' is rebound to this response.
results = es.search(index='products', body=advanced_query)

const { Client } = require('@elastic/elasticsearch');

// Module-level client used by the helper functions below; targets http://localhost:9200.
const client = new Client({ node: 'http://localhost:9200' });

/**
 * Create the `products` index with its field mappings.
 *
 * NOTE(review): the `ignore: [400]` option presumably keeps an
 * "index already exists" response from rejecting the promise — confirm
 * against the installed client version.
 *
 * @returns {Promise<void>}
 */
async function setupIndex() {
  const properties = {
    title: { type: 'text', fields: { keyword: { type: 'keyword' } } },
    description: { type: 'text' },
    price: { type: 'float' },
    brand: { type: 'keyword' },
    category: { type: 'keyword' },
    rating: { type: 'float' },
    tags: { type: 'text' }
  };
  const request = { index: 'products', body: { mappings: { properties } } };
  const requestOptions = { ignore: [400] };

  await client.indices.create(request, requestOptions);
}

/**
 * Index one product document into `products`. No id is supplied in the request.
 *
 * @param {Object} product - Document body to store.
 * @returns {Promise<void>}
 */
async function indexProduct(product) {
  const request = { index: 'products', body: product };
  await client.index(request);
}

/**
 * Full-text search over title (boosted 3x), description and tags.
 *
 * @param {string} searchTerm - Text to search for.
 * @returns {Promise<Object[]>} Up to 20 hits, each flattened to
 *   `{ id, score, ...source fields }`.
 */
async function searchProducts(searchTerm) {
  const multiMatch = {
    query: searchTerm,
    fields: ['title^3', 'description', 'tags'],
    type: 'best_fields'
  };
  const response = await client.search({
    index: 'products',
    body: { query: { multi_match: multiMatch }, size: 20 }
  });

  // Source fields are spread last, so they still win over `id`/`score`
  // if a document happens to define keys with those names.
  const flatten = ({ _id, _score, _source }) => ({
    id: _id,
    score: _score,
    ..._source
  });
  return response.body.hits.hits.map(flatten);
}

/**
 * Text search on title/description with exact-match filters and facet counts.
 *
 * @param {string} searchTerm - Text matched against title and description.
 * @param {Object} [filters={}] - Map of field name to the exact value required,
 *   e.g. `{ brand: 'TechBooks' }`. Each entry becomes one `term` filter.
 * @returns {Promise<{products: Object[], facets: Object}>} Raw hits (up to 20)
 *   and the aggregations section of the response.
 */
async function facetedSearch(searchTerm, filters = {}) {
  // Text fields whose mapping declares a `.keyword` sub-field; exact-match
  // filters on them must target that sub-field. Fields mapped directly as
  // `keyword` (brand, category) have no such sub-field, so appending
  // `.keyword` to them would address a non-existent field and match nothing.
  const keywordSubfields = new Set(['title']);

  const filterClauses = Object.entries(filters).map(([field, value]) => {
    const target = keywordSubfields.has(field) ? `${field}.keyword` : field;
    return { term: { [target]: value } };
  });

  const query = {
    bool: {
      must: [{
        multi_match: {
          query: searchTerm,
          fields: ['title', 'description']
        }
      }],
      filter: filterClauses
    }
  };

  const results = await client.search({
    index: 'products',
    body: {
      query,
      aggs: {
        brands: { terms: { field: 'brand', size: 10 } },
        categories: { terms: { field: 'category', size: 10 } },
        price_ranges: {
          range: {
            field: 'price',
            ranges: [
              { to: 20 },
              { from: 20, to: 50 },
              { from: 50 }
            ]
          }
        }
      },
      size: 20
    }
  });

  return {
    products: results.body.hits.hits,
    facets: results.body.aggregations
  };
}

// Usage: create the index, store one product, then run both kinds of search.
(async () => {
  await setupIndex();

  await indexProduct({
    title: 'Python Programming Guide',
    description: 'Learn Python programming step by step',
    price: 29.99,
    brand: 'TechBooks',
    category: 'Programming',
    rating: 4.5,
    tags: ['python', 'programming']
  });

  // A freshly indexed document only becomes searchable after the index is
  // refreshed; force a refresh so the searches below can see it.
  await client.indices.refresh({ index: 'products' });

  const results = await searchProducts('python');
  console.log(`Found ${results.length} products`);

  const faceted = await facetedSearch('python', { brand: 'TechBooks' });
  console.log(`Results with facets:`, faceted);
})().catch((err) => {
  // Report failures (e.g. the node is unreachable) instead of leaving an
  // unhandled promise rejection.
  console.error('Example run failed:', err);
  process.exitCode = 1;
});

from opensearchpy import OpenSearch

# OpenSearch client pointed at host 'localhost', port 9200.
client = OpenSearch(hosts=[{'host': 'localhost', 'port': 9200}])

# Simple search: match 'python' against the title field, asking for 10 hits.
_title_match = {'match': {'title': 'python'}}

search_body = {'size': 10, 'query': _title_match}

# Run the simple search, then print score, title and price for every hit,
# followed by a '---' separator line.
response = client.search(body=search_body, index='products')

for entry in response['hits']['hits']:
    print(f"Score: {entry['_score']}")
    source = entry['_source']
    print(f"Title: {source['title']}")
    print(f"Price: ${source['price']}")
    print('---')

# Boolean query (must/should/must_not)
# Requires 'python' in the title and 'programming' in the description,
# lists a 'trending' tag as an optional clause, excludes titles matching
# 'deprecated', and filters to price <= 50 and rating >= 4.
complex_query = {
    'query': {
        'bool': {
            'must': [
                {'match': {'title': 'python'}},
                {'match': {'description': 'programming'}}
            ],
            'should': [
                {'match': {'tags': 'trending'}}
            ],
            'must_not': [
                {'match': {'title': 'deprecated'}}
            ],
            'filter': [
                {'range': {'price': {'lte': 50}}},
                {'range': {'rating': {'gte': 4}}}
            ]
        }
    }
}

# Execute the boolean query against 'products'; 'response' is rebound to the new result.
response = client.search(body=complex_query, index='products')

Search Engines & Inverted Indexes

TL;DR

Learning Objectives

Motivating Scenario

Core Concepts

Practical Example

When to Use Search Engines / When Not to Use

Patterns and Pitfalls

Design Review Checklist

Self-Check

Next Steps

Real-World Scaling Strategies

Sharding Strategy for Large Indexes

Relevance Tuning Examples

References

Search Engines & Inverted Indexes

TL;DR

Learning Objectives

Motivating Scenario

Core Concepts

Practical Example

When to Use Search Engines / When Not to Use

Patterns and Pitfalls

Design Review Checklist

Self-Check

Next Steps

Real-World Scaling Strategies

Sharding Strategy for Large Indexes

Relevance Tuning Examples

References

TL;DR

Learning Objectives

Motivating Scenario

Core Concepts

Practical Example

When to Use Search Engines / When Not to Use

Patterns and Pitfalls

Design Review Checklist

Self-Check

Next Steps

Real-World Scaling Strategies

Sharding Strategy for Large Indexes

Relevance Tuning Examples

References