sochdb-python-sdk/examples/12_performance_optimization.py at main · sochdb/sochdb-python-sdk · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python3
"""
SochDB Performance Optimization Test

This script tests different ef_construction values to find the optimal
balance between insert speed and recall quality.

The profiling shows neighbor selection consumes 80% of insert time,
which is directly proportional to ef_construction.
"""

import os
import sys
import time
import json
import numpy as np
from pathlib import Path

# Add the SDK to path
# The entry is "<directory above this file's directory>/src". It is appended,
# not prepended, so a `sochdb` package that is already importable from an
# earlier sys.path entry takes precedence over this local copy.
sys.path.append(str(Path(__file__).parent.parent / "src"))

try:
    from sochdb.vector import VectorIndex
except ImportError:
    # Nothing below can run without VectorIndex: explain and exit with status 1.
    print("Error: Could not import VectorIndex")
    print("Make sure SOCHDB_LIB_PATH is set to the compiled library")
    sys.exit(1)


def test_performance_vs_quality(
    vectors,
    ef_values=None,
    max_connections=16,
    num_test_vectors=1000
):
    """Benchmark insert speed, search latency and self-recall per ef_construction.

    For every value in ``ef_values`` a fresh ``VectorIndex`` is built from the
    first ``num_test_vectors`` rows of ``vectors``. The batch insert is timed,
    one random unit-length query is searched ten times to get an average
    latency, and up to 100 of the inserted vectors are searched for to check
    that each one comes back as its own nearest neighbour ("self-recall").

    NOTE: despite the ``test_`` prefix this is a benchmark driver, not a unit
    test; a pytest run that collects it would try to resolve ``vectors`` as a
    fixture.

    Args:
        vectors: 2-D collection of vectors (rows are vectors). Anything that is
            not already a ``numpy.ndarray`` is converted to a float32 array.
        ef_values: Iterable of ``ef_construction`` settings to try. Defaults to
            ``[50, 100, 150, 200]``.
        max_connections: Passed through to ``VectorIndex`` unchanged.
        num_test_vectors: Upper bound on how many rows are inserted. If fewer
            rows are available, the number actually available is used.

    Returns:
        A list with one dict per ef value, holding the keys
        ``'ef_construction'``, ``'insert_time'`` (seconds), ``'throughput'``
        (vectors per second), ``'search_time_ms'``, ``'self_recall_pct'`` and
        ``'vectors_inserted'``.
    """
    # A list default would be shared between calls; resolve it per call instead.
    if ef_values is None:
        ef_values = [50, 100, 150, 200]

    # Convert before the first `.shape` access so list input is really accepted.
    if not isinstance(vectors, np.ndarray):
        vectors = np.array(vectors, dtype=np.float32)

    dimension = vectors.shape[1]

    # The test data does not depend on ef, so prepare it once. The count is
    # taken from the slice itself so ids, throughput and the recall loop all
    # agree even when fewer rows exist than were requested.
    test_vectors = vectors[:num_test_vectors]
    num_test_vectors = len(test_vectors)
    ids = list(range(num_test_vectors))
    recall_sample_size = min(100, num_test_vectors)

    results = []

    print("=" * 80)
    print("SochDB Performance vs Quality Optimization")
    print("=" * 80)
    print(f"Testing ef_construction values: {ef_values}")
    print(f"Vectors: {num_test_vectors}, Dimension: {dimension}")
    print()

    # Generate test query (random direction, scaled to unit length)
    query_vector = np.random.randn(dimension).astype(np.float32)
    query_vector = query_vector / np.linalg.norm(query_vector)

    # Test each ef_construction value
    for ef in ef_values:
        print(f"Testing ef_construction = {ef}")
        print("-" * 50)

        # Fresh index per setting so the measurements do not influence each other
        index = VectorIndex(
            dimension=dimension,
            ef_construction=ef,
            max_connections=max_connections,
        )

        # Measure insert time (argument order: ids first, then vectors)
        start_time = time.perf_counter()
        result = index.insert_batch(ids, test_vectors)
        insert_time = time.perf_counter() - start_time

        throughput = num_test_vectors / insert_time

        print(f"  Insert time: {insert_time:.3f}s")
        print(f"  Throughput: {throughput:.0f} vec/sec")
        print(f"  Result: {result}")

        # Measure search performance, averaged over 10 identical searches
        search_times = []
        for _ in range(10):
            start = time.perf_counter()
            search_results = index.search(query_vector, k=10)
            search_times.append((time.perf_counter() - start) * 1000)  # ms

        avg_search_time = sum(search_times) / len(search_times)

        print(f"  Search time: {avg_search_time:.3f}ms")
        print(f"  Search results: {len(search_results) if search_results else 0}")

        # Self-recall: each sampled vector should come back as its own top hit.
        # Assumes search() returns entries whose first element is the id, with
        # the best match first - inferred from the check below; TODO confirm.
        hits = 0
        for i in range(recall_sample_size):
            search_results = index.search(test_vectors[i], k=10)
            if search_results and search_results[0][0] == i:
                hits += 1

        # Nothing sampled means nothing recalled; avoid dividing by zero.
        self_recall = hits / recall_sample_size * 100 if recall_sample_size else 0.0

        print(f"  Self-recall: {self_recall:.1f}%")

        results.append({
            'ef_construction': ef,
            'insert_time': insert_time,
            'throughput': throughput,
            'search_time_ms': avg_search_time,
            'self_recall_pct': self_recall,
            # Fall back to the requested count when insert_batch returns no int
            'vectors_inserted': result if isinstance(result, int) else num_test_vectors
        })

        print()

    return results


def print_optimization_report(results, current_ef=200, min_recall_pct=95.0):
    """Print a comparison table and recommendations for a benchmark sweep.

    Args:
        results: Sequence of dicts, one per tested setting. Each entry must
            provide the keys ``'ef_construction'``, ``'throughput'``,
            ``'search_time_ms'`` and ``'self_recall_pct'``. The first entry is
            the baseline for the "Speedup" column.
        current_ef: The ``ef_construction`` value treated as the configuration
            currently in use. If it is among ``results`` and is not already the
            fastest, the potential speedup of switching away is printed.
        min_recall_pct: Minimum self-recall (in percent) a setting needs to be
            eligible as the "Balanced" recommendation.

    Returns:
        None. All output goes to stdout.
    """

    print("=" * 80)
    print("OPTIMIZATION ANALYSIS")
    print("=" * 80)

    # max() below would raise on an empty sequence; report and stop instead.
    if not results:
        print("No results to analyze.")
        print()
        return

    # Performance vs Quality Table
    print(f"{'EF':<6} {'Throughput':<12} {'Search ms':<10} {'Self-Recall':<12} {'Speedup':<10}")
    print("-" * 60)

    baseline_throughput = results[0]['throughput']
    for position, r in enumerate(results):
        if position == 0:
            speedup = "1.0x"
        elif baseline_throughput:
            speedup = f"{r['throughput'] / baseline_throughput:.1f}x"
        else:
            # A zero baseline has no meaningful ratio.
            speedup = "n/a"

        print(f"{r['ef_construction']:<6} "
              f"{r['throughput']:.0f} vec/s{'':<4} "
              f"{r['search_time_ms']:.3f}{'':<6} "
              f"{r['self_recall_pct']:.1f}%{'':<8} "
              f"{speedup}")

    print()

    # Find optimal configurations (max() keeps the first entry on ties)
    best_throughput = max(results, key=lambda x: x['throughput'])
    best_recall = max(results, key=lambda x: x['self_recall_pct'])

    # Balanced option: the fastest setting that still meets the recall floor
    balanced = max(
        (r for r in results if r['self_recall_pct'] >= min_recall_pct),
        key=lambda x: x['throughput'],
        default=None,
    )

    print("RECOMMENDATIONS:")
    print("-" * 40)
    print(f"📈 Best Throughput: ef={best_throughput['ef_construction']} "
          f"({best_throughput['throughput']:.0f} vec/s, "
          f"{best_throughput['self_recall_pct']:.1f}% recall)")

    print(f"🎯 Best Recall: ef={best_recall['ef_construction']} "
          f"({best_recall['throughput']:.0f} vec/s, "
          f"{best_recall['self_recall_pct']:.1f}% recall)")

    if balanced is not None:
        print(f"⚖️  Balanced: ef={balanced['ef_construction']} "
              f"({balanced['throughput']:.0f} vec/s, "
              f"{balanced['self_recall_pct']:.1f}% recall)")

    # Potential speedup versus the configuration currently in use
    current_config = next(
        (r for r in results if r['ef_construction'] == current_ef), None
    )
    if (
        current_config is not None
        and current_config['throughput']  # skip the ratio for a zero reference
        and best_throughput['ef_construction'] != current_ef
    ):
        speedup = best_throughput['throughput'] / current_config['throughput']
        print(f"\n💡 Switching from ef={current_ef} to ef={best_throughput['ef_construction']} "
              f"would give {speedup:.1f}x speedup")

    print()


def save_results(results, filename="performance_optimization_results.json"):
    """Write ``results`` to ``filename`` as 2-space-indented JSON.

    Args:
        results: JSON-serializable object to store.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None. A confirmation line naming the file is printed to stdout.
    """
    with open(filename, 'w') as out_file:
        json.dump(obj=results, fp=out_file, indent=2)

    confirmation = f"Results saved to: {filename}"
    print(confirmation)


def main():
    """Run the ef_construction sweep on synthetic data and report the outcome.

    Generates 2000 random unit vectors of dimension 768 from a fixed seed,
    benchmarks ef_construction values 50, 100, 150 and 200 with
    ``num_test_vectors=1000``, prints the analysis, saves the results as JSON
    and finally prints a note on the earlier profiling findings.
    """
    print("Generating test vectors...")

    dimension = 768
    num_vectors = 2000  # Enough for testing different ef values

    # Fixed seed so repeated runs benchmark the same data
    np.random.seed(42)
    raw_vectors = np.random.randn(num_vectors, dimension).astype(np.float32)

    # Scale every row to unit length
    row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
    vectors = raw_vectors / row_norms

    print(f"Generated {num_vectors} vectors of dimension {dimension}")
    print()

    # Sweep from the fastest setting up to the current one (200)
    results = test_performance_vs_quality(
        vectors,
        [50, 100, 150, 200],
        num_test_vectors=1000,
    )

    # Report to stdout, then persist to disk
    print_optimization_report(results)
    save_results(results)

    # Closing note explaining why ef_construction drives insert cost
    banner = "=" * 80
    insight_text = """
Our previous profiling showed that neighbor selection consumes 80% of insert time.
This is directly proportional to ef_construction because:

1. Search returns ~ef candidates per layer
2. RNG heuristic checks each candidate against all selected neighbors
3. Each check requires a full 768-dimensional cosine distance calculation

Reducing ef_construction from 200 to 100 should approximately halve the
neighbor selection time, giving ~1.6x overall speedup while maintaining
good recall quality.

For production use, consider:
- ef_construction = 100-128 for balanced performance/quality
- ef_construction = 50-75 for maximum speed with acceptable recall
- Keep max_connections = 16 (already optimal)
"""
    print(banner)
    print("PROFILING INSIGHT")
    print(banner)
    print(insight_text)


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()