Pattern Matching
Master various string pattern matching algorithms for efficient text processing and analysis.
Naive String Matching
Basic Naive Algorithm
// Naive string matching algorithm
vector<int> naiveSearch(string text, string pattern) {
vector<int> result;
int n = text.length();
int m = pattern.length();
for (int i = 0; i <= n - m; i++) {
int j;
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j]) {
break;
}
}
if (j == m) {
result.push_back(i);
}
}
return result;
}
Time Complexity
- Time: O((n - m + 1) × m) in worst case
- Space: O(1)
- Best case: O(n) when pattern not found
- Worst case: O(n × m) when all characters match except last
KMP Algorithm
KMP with Failure Function
// Build failure function for KMP
vector<int> buildFailureFunction(string pattern) {
int m = pattern.length();
vector<int> failure(m, 0);
int j = 0;
for (int i = 1; i < m; i++) {
while (j > 0 && pattern[i] != pattern[j]) {
j = failure[j - 1];
}
if (pattern[i] == pattern[j]) {
j++;
}
failure[i] = j;
}
return failure;
}
// KMP string matching
vector<int> kmpSearch(string text, string pattern) {
vector<int> result;
int n = text.length();
int m = pattern.length();
vector<int> failure = buildFailureFunction(pattern);
int j = 0;
for (int i = 0; i < n; i++) {
while (j > 0 && text[i] != pattern[j]) {
j = failure[j - 1];
}
if (text[i] == pattern[j]) {
j++;
}
if (j == m) {
result.push_back(i - m + 1);
j = failure[j - 1];
}
}
return result;
}
KMP Advantages
- Time complexity: O(n + m)
- Space complexity: O(m)
- No backtracking: Efficient for large texts
- Pattern preprocessing: Failure function computed once
Rabin-Karp Algorithm
Rolling Hash Implementation
// Rabin-Karp with rolling hash
vector<int> rabinKarpSearch(string text, string pattern) {
vector<int> result;
int n = text.length();
int m = pattern.length();
if (m > n) return result;
const int base = 256;
const int mod = 1000000007;
// Calculate hash of pattern
long long patternHash = 0;
long long textHash = 0;
long long h = 1;
for (int i = 0; i < m - 1; i++) {
h = (h * base) % mod;
}
for (int i = 0; i < m; i++) {
patternHash = (base * patternHash + pattern[i]) % mod;
textHash = (base * textHash + text[i]) % mod;
}
// Slide the pattern over text
for (int i = 0; i <= n - m; i++) {
if (patternHash == textHash) {
// Verify match character by character
bool match = true;
for (int j = 0; j < m; j++) {
if (text[i + j] != pattern[j]) {
match = false;
break;
}
}
if (match) {
result.push_back(i);
}
}
// Calculate hash for next window
if (i < n - m) {
textHash = (base * (textHash - text[i] * h) + text[i + m]) % mod;
if (textHash < 0) {
textHash += mod;
}
}
}
return result;
}
Rolling Hash Benefits
- Average case: O(n + m)
- Worst case: O(n × m) due to hash collisions
- Space: O(1)
- Multiple patterns: Can search for multiple patterns simultaneously
Z-Algorithm
Z-Array Construction
// Build Z-array for string
vector<int> buildZArray(string s) {
int n = s.length();
vector<int> z(n, 0);
int l = 0, r = 0;
for (int i = 1; i < n; i++) {
if (i <= r) {
z[i] = min(r - i + 1, z[i - l]);
}
while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
z[i]++;
}
if (i + z[i] - 1 > r) {
l = i;
r = i + z[i] - 1;
}
}
return z;
}
// Z-algorithm for pattern matching
vector<int> zAlgorithmSearch(string text, string pattern) {
string combined = pattern + "$" + text;
vector<int> z = buildZArray(combined);
vector<int> result;
int m = pattern.length();
for (int i = m + 1; i < z.size(); i++) {
if (z[i] == m) {
result.push_back(i - m - 1);
}
}
return result;
}
Z-Algorithm Properties
- Time complexity: O(n + m)
- Space complexity: O(n + m)
- No preprocessing: Pattern not preprocessed separately
- Linear time: Guaranteed O(n + m) performance
Suffix Arrays
Suffix Array Construction
// Build suffix array using sorting
vector<int> buildSuffixArray(string s) {
int n = s.length();
vector<int> suffixArray(n);
vector<int> rank(n);
// Initialize with single characters
for (int i = 0; i < n; i++) {
suffixArray[i] = i;
rank[i] = s[i];
}
// Sort suffixes by first 2^k characters
for (int k = 1; k < n; k *= 2) {
// Sort by rank
sort(suffixArray.begin(), suffixArray.end(), [&](int a, int b) {
if (rank[a] != rank[b]) {
return rank[a] < rank[b];
}
int rankA = (a + k < n) ? rank[a + k] : -1;
int rankB = (b + k < n) ? rank[b + k] : -1;
return rankA < rankB;
});
// Update ranks
vector<int> newRank(n);
newRank[suffixArray[0]] = 0;
for (int i = 1; i < n; i++) {
if (rank[suffixArray[i]] == rank[suffixArray[i - 1]] &&
rank[suffixArray[i] + k] == rank[suffixArray[i - 1] + k]) {
newRank[suffixArray[i]] = newRank[suffixArray[i - 1]];
} else {
newRank[suffixArray[i]] = i;
}
}
rank = newRank;
}
return suffixArray;
}
Longest Common Prefix (LCP)
// Build LCP array
vector<int> buildLCPArray(string s, vector<int>& suffixArray) {
int n = s.length();
vector<int> lcp(n, 0);
vector<int> rank(n);
// Build rank array
for (int i = 0; i < n; i++) {
rank[suffixArray[i]] = i;
}
int k = 0;
for (int i = 0; i < n; i++) {
if (rank[i] == n - 1) {
k = 0;
continue;
}
int j = suffixArray[rank[i] + 1];
while (i + k < n && j + k < n && s[i + k] == s[j + k]) {
k++;
}
lcp[rank[i]] = k;
if (k > 0) k--;
}
return lcp;
}
Pattern Search in Suffix Array
// Search pattern in suffix array
vector<int> searchInSuffixArray(string text, string pattern, vector<int>& suffixArray) {
vector<int> result;
int n = text.length();
int m = pattern.length();
// Binary search for pattern
int left = 0, right = n - 1;
while (left <= right) {
int mid = (left + right) / 2;
int cmp = text.compare(suffixArray[mid], m, pattern);
if (cmp == 0) {
// Found match, find all occurrences
int start = mid;
while (start > 0 && text.compare(suffixArray[start - 1], m, pattern) == 0) {
start--;
}
int end = mid;
while (end < n - 1 && text.compare(suffixArray[end + 1], m, pattern) == 0) {
end++;
}
for (int i = start; i <= end; i++) {
result.push_back(suffixArray[i]);
}
break;
} else if (cmp < 0) {
left = mid + 1;
} else {
right = mid - 1;
}
}
return result;
}
Performance Comparison
Algorithm | Time Complexity | Space Complexity | Best Use Case |
---|---|---|---|
Naive | O((n-m+1)×m) | O(1) | Simple cases, small patterns |
KMP | O(n + m) | O(m) | General purpose, no backtracking |
Rabin-Karp | O(n + m) avg | O(1) | Multiple patterns, rolling hash |
Z-Algorithm | O(n + m) | O(n + m) | Pattern matching, string analysis |
Suffix Array | O(n log n) | O(n) | Multiple queries, string analysis |
Common Patterns
- Single pattern search: KMP or Z-algorithm
- Multiple pattern search: Rabin-Karp or Aho-Corasick
- String analysis: Suffix arrays and LCP
- Rolling hash: Rabin-Karp for multiple patterns
- No backtracking: KMP for large texts
Applications
- Text editors: Find and replace operations
- Search engines: Pattern matching in documents
- Bioinformatics: DNA sequence analysis
- Data compression: String compression algorithms
- Network security: Intrusion detection systems