Skip to main content

Pattern Matching

Master various string pattern matching algorithms for efficient text processing and analysis.

Naive String Matching

Basic Naive Algorithm

// Naive string matching algorithm
vector<int> naiveSearch(string text, string pattern) {
vector<int> result;
int n = text.length();
int m = pattern.length();

for (int i = 0; i <= n - m; i++) {
int j;
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j]) {
break;
}
}
if (j == m) {
result.push_back(i);
}
}

return result;
}

Time Complexity

  • Time: O((n - m + 1) × m) in worst case
  • Space: O(1)
  • Best case: O(n) when pattern not found
  • Worst case: O(n × m) when all characters match except last

KMP Algorithm

KMP with Failure Function

// Build failure function for KMP
vector<int> buildFailureFunction(string pattern) {
int m = pattern.length();
vector<int> failure(m, 0);

int j = 0;
for (int i = 1; i < m; i++) {
while (j > 0 && pattern[i] != pattern[j]) {
j = failure[j - 1];
}
if (pattern[i] == pattern[j]) {
j++;
}
failure[i] = j;
}

return failure;
}

// KMP string matching
vector<int> kmpSearch(string text, string pattern) {
vector<int> result;
int n = text.length();
int m = pattern.length();

vector<int> failure = buildFailureFunction(pattern);

int j = 0;
for (int i = 0; i < n; i++) {
while (j > 0 && text[i] != pattern[j]) {
j = failure[j - 1];
}
if (text[i] == pattern[j]) {
j++;
}
if (j == m) {
result.push_back(i - m + 1);
j = failure[j - 1];
}
}

return result;
}

KMP Advantages

  • Time complexity: O(n + m)
  • Space complexity: O(m)
  • No backtracking: Efficient for large texts
  • Pattern preprocessing: Failure function computed once

Rabin-Karp Algorithm

Rolling Hash Implementation

// Rabin-Karp with rolling hash
vector<int> rabinKarpSearch(string text, string pattern) {
vector<int> result;
int n = text.length();
int m = pattern.length();

if (m > n) return result;

const int base = 256;
const int mod = 1000000007;

// Calculate hash of pattern
long long patternHash = 0;
long long textHash = 0;
long long h = 1;

for (int i = 0; i < m - 1; i++) {
h = (h * base) % mod;
}

for (int i = 0; i < m; i++) {
patternHash = (base * patternHash + pattern[i]) % mod;
textHash = (base * textHash + text[i]) % mod;
}

// Slide the pattern over text
for (int i = 0; i <= n - m; i++) {
if (patternHash == textHash) {
// Verify match character by character
bool match = true;
for (int j = 0; j < m; j++) {
if (text[i + j] != pattern[j]) {
match = false;
break;
}
}
if (match) {
result.push_back(i);
}
}

// Calculate hash for next window
if (i < n - m) {
textHash = (base * (textHash - text[i] * h) + text[i + m]) % mod;
if (textHash < 0) {
textHash += mod;
}
}
}

return result;
}

Rolling Hash Benefits

  • Average case: O(n + m)
  • Worst case: O(n × m) due to hash collisions
  • Space: O(1)
  • Multiple patterns: Can search for multiple patterns simultaneously

Z-Algorithm

Z-Array Construction

// Build Z-array for string
vector<int> buildZArray(string s) {
int n = s.length();
vector<int> z(n, 0);

int l = 0, r = 0;
for (int i = 1; i < n; i++) {
if (i <= r) {
z[i] = min(r - i + 1, z[i - l]);
}

while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
z[i]++;
}

if (i + z[i] - 1 > r) {
l = i;
r = i + z[i] - 1;
}
}

return z;
}

// Z-algorithm for pattern matching
vector<int> zAlgorithmSearch(string text, string pattern) {
string combined = pattern + "$" + text;
vector<int> z = buildZArray(combined);
vector<int> result;

int m = pattern.length();
for (int i = m + 1; i < z.size(); i++) {
if (z[i] == m) {
result.push_back(i - m - 1);
}
}

return result;
}

Z-Algorithm Properties

  • Time complexity: O(n + m)
  • Space complexity: O(n + m)
  • No preprocessing: Pattern not preprocessed separately
  • Linear time: Guaranteed O(n + m) performance

Suffix Arrays

Suffix Array Construction

// Build suffix array using sorting
vector<int> buildSuffixArray(string s) {
int n = s.length();
vector<int> suffixArray(n);
vector<int> rank(n);

// Initialize with single characters
for (int i = 0; i < n; i++) {
suffixArray[i] = i;
rank[i] = s[i];
}

// Sort suffixes by first 2^k characters
for (int k = 1; k < n; k *= 2) {
// Sort by rank
sort(suffixArray.begin(), suffixArray.end(), [&](int a, int b) {
if (rank[a] != rank[b]) {
return rank[a] < rank[b];
}
int rankA = (a + k < n) ? rank[a + k] : -1;
int rankB = (b + k < n) ? rank[b + k] : -1;
return rankA < rankB;
});

// Update ranks
vector<int> newRank(n);
newRank[suffixArray[0]] = 0;
for (int i = 1; i < n; i++) {
if (rank[suffixArray[i]] == rank[suffixArray[i - 1]] &&
rank[suffixArray[i] + k] == rank[suffixArray[i - 1] + k]) {
newRank[suffixArray[i]] = newRank[suffixArray[i - 1]];
} else {
newRank[suffixArray[i]] = i;
}
}
rank = newRank;
}

return suffixArray;
}

Longest Common Prefix (LCP)

// Build LCP array
vector<int> buildLCPArray(string s, vector<int>& suffixArray) {
int n = s.length();
vector<int> lcp(n, 0);
vector<int> rank(n);

// Build rank array
for (int i = 0; i < n; i++) {
rank[suffixArray[i]] = i;
}

int k = 0;
for (int i = 0; i < n; i++) {
if (rank[i] == n - 1) {
k = 0;
continue;
}

int j = suffixArray[rank[i] + 1];
while (i + k < n && j + k < n && s[i + k] == s[j + k]) {
k++;
}

lcp[rank[i]] = k;
if (k > 0) k--;
}

return lcp;
}

Pattern Search in Suffix Array

// Search pattern in suffix array
vector<int> searchInSuffixArray(string text, string pattern, vector<int>& suffixArray) {
vector<int> result;
int n = text.length();
int m = pattern.length();

// Binary search for pattern
int left = 0, right = n - 1;
while (left <= right) {
int mid = (left + right) / 2;
int cmp = text.compare(suffixArray[mid], m, pattern);

if (cmp == 0) {
// Found match, find all occurrences
int start = mid;
while (start > 0 && text.compare(suffixArray[start - 1], m, pattern) == 0) {
start--;
}

int end = mid;
while (end < n - 1 && text.compare(suffixArray[end + 1], m, pattern) == 0) {
end++;
}

for (int i = start; i <= end; i++) {
result.push_back(suffixArray[i]);
}
break;
} else if (cmp < 0) {
left = mid + 1;
} else {
right = mid - 1;
}
}

return result;
}

Performance Comparison

AlgorithmTime ComplexitySpace ComplexityBest Use Case
NaiveO((n-m+1)×m)O(1)Simple cases, small patterns
KMPO(n + m)O(m)General purpose, no backtracking
Rabin-KarpO(n + m) avgO(1)Multiple patterns, rolling hash
Z-AlgorithmO(n + m)O(n + m)Pattern matching, string analysis
Suffix ArrayO(n log n)O(n)Multiple queries, string analysis

Common Patterns

  1. Single pattern search: KMP or Z-algorithm
  2. Multiple pattern search: Rabin-Karp or Aho-Corasick
  3. String analysis: Suffix arrays and LCP
  4. Rolling hash: Rabin-Karp for multiple patterns
  5. No backtracking: KMP for large texts

Applications

  • Text editors: Find and replace operations
  • Search engines: Pattern matching in documents
  • Bioinformatics: DNA sequence analysis
  • Data compression: String compression algorithms
  • Network security: Intrusion detection systems