diff --git a/MLExamples/TinyOpenFold/.gitignore b/MLExamples/TinyOpenFold/.gitignore new file mode 100644 index 00000000..d066a880 --- /dev/null +++ b/MLExamples/TinyOpenFold/.gitignore @@ -0,0 +1,36 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ + +# Virtual environments +venv*/ +env*/ +ENV*/ + +# Profiling and experimental outputs +version1_pytorch_baseline/pytorch_profiles/ +version1_pytorch_baseline/profiles/ +version1_pytorch_baseline/scaling_study_*/ +*.log + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Jupyter +.ipynb_checkpoints/ +*.ipynb + +# OS +.DS_Store +Thumbs.db + diff --git a/MLExamples/TinyOpenFold/ARCHITECTURE.md b/MLExamples/TinyOpenFold/ARCHITECTURE.md new file mode 100644 index 00000000..32c9ec1f --- /dev/null +++ b/MLExamples/TinyOpenFold/ARCHITECTURE.md @@ -0,0 +1,395 @@ +# TinyOpenFold Architecture Documentation + +**Source File**: `HPCTrainingExamples/MLExamples/TinyOpenFold/version1_pytorch_baseline/tiny_openfold_v1.py` + +## Overview + +TinyOpenFold is a simplified, educational implementation of the AlphaFold 2 architecture, focusing on the core innovation: the **Evoformer**. This implementation demonstrates how Multiple Sequence Alignments (MSA) and pairwise residue representations interact to predict protein structures. + +## Core Architecture: The Evoformer + +The Evoformer is the main building block of AlphaFold 2, processing two coupled representations: +1. **MSA Representation** (N_seq × N_res × msa_dim): Features for each residue in each sequence +2. **Pair Representation** (N_res × N_res × pair_dim): Pairwise features between residues + +These representations are updated through a series of attention and communication operations. + +## Architecture Components + +### 1. Input Embeddings + +#### MSA Embedding +**Shape**: `(batch, n_seqs, seq_len, msa_dim)` + +Maps discrete amino acid tokens in the MSA to continuous vectors. 
+ +**Parameters**: `vocab_size × msa_dim` +- Example (TinyOpenFoldConfig): 21 amino acids × 64 dim = **1,344 parameters** + +#### Pair Embedding +**Shape**: `(batch, seq_len, seq_len, pair_dim)` + +Encodes pairwise information between residues (e.g., distance bins, relative positions). + +**Parameters**: `pair_input_dim × pair_dim` +- Example (TinyOpenFoldConfig): 65 features × 128 dim = **8,320 parameters** + +### 2. Evoformer Block (Repeated n_evoformer_blocks times) + +Each Evoformer block contains multiple sub-modules that update both MSA and pair representations. + +#### A. MSA Row-wise Attention with Pair Bias + +Attention across residues within each MSA sequence, biased by pair representation. + +**MSA Attention Components**: +- Query projection: `(msa_dim, n_heads_msa × head_dim_msa)` +- Key projection: `(msa_dim, n_heads_msa × head_dim_msa)` +- Value projection: `(msa_dim, n_heads_msa × head_dim_msa)` +- Output projection: `(n_heads_msa × head_dim_msa, msa_dim)` +- Pair bias projection: `(pair_dim, n_heads_msa)` + +**Total MSA Row Attention Parameters**: +``` +3 × msa_dim × (n_heads_msa × head_dim_msa) + (n_heads_msa × head_dim_msa) × msa_dim + pair_dim × n_heads_msa += 4 × msa_dim² + pair_dim × n_heads_msa +``` + +Example (msa_dim=64, n_heads_msa=4, pair_dim=128): +- Q, K, V, O: 4 × 64² = 16,384 +- Pair bias: 128 × 4 = 512 +- **Total: 16,896 parameters** + +#### B. MSA Column-wise Attention + +Attention across sequences for each residue position (communication between different sequences). + +**Parameters**: Same structure as row attention but without pair bias +``` +4 × msa_dim² +``` + +Example (msa_dim=64): +- **Total: 16,384 parameters** + +#### C. MSA Transition (Feed-Forward) + +Per-position feed-forward network for MSA representation. 
+ +**Layers**: +- Linear 1: `(msa_dim, msa_intermediate_dim)` +- Linear 2: `(msa_intermediate_dim, msa_dim)` + +**Total MSA Transition Parameters**: +``` +2 × msa_dim × msa_intermediate_dim +``` + +Example (msa_dim=64, msa_intermediate_dim=256): +- **Total: 32,768 parameters** + +#### D. Outer Product Mean + +Projects MSA representation to update pair representation using outer product. + +**Layers**: +- MSA to outer: `(msa_dim, outer_product_dim)` +- Outer to pair: `(outer_product_dim², pair_dim)` + +**Total Outer Product Parameters**: +``` +msa_dim × outer_product_dim + outer_product_dim² × pair_dim +``` + +Example (msa_dim=64, outer_product_dim=32, pair_dim=128): +- MSA projection: 64 × 32 = 2,048 +- Outer to pair: 32² × 128 = 131,072 +- **Total: 133,120 parameters** + +#### E. Triangle Multiplicative Update (Outgoing) + +Updates pair representation using geometric reasoning: if residues i-j and j-k are close, then i-k should also be considered. + +**Layers**: +- Left projection: `(pair_dim, pair_dim)` +- Right projection: `(pair_dim, pair_dim)` +- Left gate: `(pair_dim, pair_dim)` +- Right gate: `(pair_dim, pair_dim)` +- Output projection: `(pair_dim, pair_dim)` +- Output gate: `(pair_dim, pair_dim)` + +**Total Triangle Mult Parameters**: +``` +6 × pair_dim² +``` + +Example (pair_dim=128): +- **Total: 98,304 parameters** + +#### F. Triangle Multiplicative Update (Incoming) + +Similar to outgoing but with different edge orientation. + +Example (pair_dim=128): +- **Total: 98,304 parameters** + +#### G. Triangle Self-Attention (Starting) + +Self-attention around edges starting from a node. + +**Components**: +- Q, K, V projections: `3 × pair_dim × (n_heads_pair × head_dim_pair)` +- Output projection: `(n_heads_pair × head_dim_pair, pair_dim)` + +**Total Parameters**: +``` +4 × pair_dim² +``` + +Example (pair_dim=128): +- **Total: 65,536 parameters** + +#### H. Triangle Self-Attention (Ending) + +Self-attention around edges ending at a node. 
+ +Example (pair_dim=128): +- **Total: 65,536 parameters** + +#### I. Pair Transition (Feed-Forward) + +Per-position feed-forward for pair representation. + +**Total Parameters**: +``` +2 × pair_dim × pair_intermediate_dim +``` + +Example (pair_dim=128, pair_intermediate_dim=512): +- **Total: 131,072 parameters** + +#### Per Evoformer Block Total + +Sum of all components: +- MSA Row Attention: 16,896 +- MSA Column Attention: 16,384 +- MSA Transition: 32,768 +- Outer Product Mean: 133,120 +- Triangle Mult (Out): 98,304 +- Triangle Mult (In): 98,304 +- Triangle Attn (Start): 65,536 +- Triangle Attn (End): 65,536 +- Pair Transition: 131,072 +- **Per Block: ~658,000 parameters** + +### 3. Structure Module (Simplified) + +Converts pair representation to 3D coordinates. + +**Simplified Version** (no IPA, direct prediction): +- Pair to distance: `(pair_dim, 1)` +- Angle predictions: `(pair_dim, 2)` (phi, psi angles) + +**Parameters**: `pair_dim × 3` + +Example (pair_dim=128): +- **Total: 384 parameters** + +## Complete Parameter Formula + +**Total Parameters** = +``` +MSA_Embedding + Pair_Embedding ++ (n_evoformer_blocks × Per_Block_Parameters) ++ Structure_Module + += vocab_size × msa_dim + + pair_input_dim × pair_dim + + n_evoformer_blocks × [ + (4 × msa_dim² + pair_dim × n_heads_msa) # MSA Row Attn + + 4 × msa_dim² # MSA Col Attn + + 2 × msa_dim × msa_intermediate_dim # MSA Transition + + (msa_dim × outer_dim + outer_dim² × pair_dim) # Outer Product + + 6 × pair_dim² # Triangle Mult Out + + 6 × pair_dim² # Triangle Mult In + + 4 × pair_dim² # Triangle Attn Start + + 4 × pair_dim² # Triangle Attn End + + 2 × pair_dim × pair_intermediate_dim # Pair Transition + ] + + pair_dim × 3 # Structure Module +``` + +## Example Calculation (TinyOpenFoldConfig Default) + +**Configuration**: +- `vocab_size` = 21 (20 amino acids + unknown) +- `msa_dim` = 64 +- `pair_dim` = 128 +- `n_evoformer_blocks` = 4 +- `n_heads_msa` = 4 +- `n_heads_pair` = 4 +- `head_dim_msa` = 16 (msa_dim / 
n_heads_msa) +- `head_dim_pair` = 32 (pair_dim / n_heads_pair) +- `msa_intermediate_dim` = 256 +- `pair_intermediate_dim` = 512 +- `outer_product_dim` = 32 +- `pair_input_dim` = 65 +- `max_seq_len` = 64 +- `n_seqs` = 16 + +**Component Breakdown**: + +1. **MSA Embedding**: 21 × 64 = **1,344** + +2. **Pair Embedding**: 65 × 128 = **8,320** + +3. **Per Evoformer Block**: + - MSA Row Attention: 4 × 64² + 128 × 4 = 16,896 + - MSA Column Attention: 4 × 64² = 16,384 + - MSA Transition: 2 × 64 × 256 = 32,768 + - Outer Product Mean: 64 × 32 + 32² × 128 = 133,120 + - Triangle Mult (Out): 6 × 128² = 98,304 + - Triangle Mult (In): 6 × 128² = 98,304 + - Triangle Attn (Start): 4 × 128² = 65,536 + - Triangle Attn (End): 4 × 128² = 65,536 + - Pair Transition: 2 × 128 × 512 = 131,072 + - **Subtotal per block**: 657,920 + +4. **All 4 Blocks**: 4 × 657,920 = **2,631,680** + +5. **Structure Module**: 128 × 3 = **384** + +**Total**: 1,344 + 8,320 + 2,631,680 + 384 = **2,641,728 parameters** + +**Model Size**: +- FP32: 2,641,728 × 4 / 1e6 = **10.6 MB** +- FP16/BF16: 2,641,728 × 2 / 1e6 = **5.3 MB** + +## Data Structure and Batching + +### Batch Size +**Batch size** refers to the number of protein samples processed simultaneously in one forward/backward pass. For example, `batch_size=4` means 4 complete protein structures are processed together. + +### Sample Structure +Each **sample** represents one complete protein structure with three components: + +1. **MSA Tokens**: Shape `(n_seqs, seq_len)` = `(16, 64)` + - Integer tokens (0-20) representing amino acids + - 16 MSA sequences × 64 amino acids per sequence + +2. **Pair Features**: Shape `(seq_len, seq_len, pair_input_dim)` = `(64, 64, 65)` + - Pairwise feature matrix: 64×64 residues with 65 features per pair + +3. 
**Target Distances**: Shape `(seq_len, seq_len, 1)` = `(64, 64, 1)` + - Ground truth distance matrix for structure prediction + +**Total per sample**: ~271K elements (mostly from pair features: 266K floats) + +**Batch processing**: With `batch_size=4`, tensors have shape `(4, ...)` for all three components, enabling parallel processing of multiple proteins. + +### Sample Speed Evaluation +**Training speed** (samples/sec) measures throughput and is calculated as: + +``` +speed = batch_size / batch_time +``` + +Where `batch_time` includes: +- Forward pass (model inference) +- Backward pass (gradient computation) +- Optimizer step (parameter update) + +**Example**: With `batch_size=4` and `batch_time=25ms`: +- Speed = 4 / 0.025 = **160 samples/sec** + +**Average training speed** is computed across all training steps, providing a stable metric for performance comparison. Higher values indicate better GPU utilization and faster training. + +## Training Memory Requirements + +Similar to transformers, training requires: + +### Optimizer States (Adam/AdamW) +- **First Moment (m)**: Same size as parameters +- **Second Moment (v)**: Same size as parameters +- **Total**: 2× parameter memory + +### Gradients +- **One gradient per parameter**: Same size as parameters + +### Activations +- MSA activations: `batch × n_seqs × seq_len × msa_dim` +- Pair activations: `batch × seq_len × seq_len × pair_dim` +- Attention matrices: `batch × n_heads × seq_len × seq_len` (or `n_seqs × seq_len`) +- Typically **dominant memory consumer** for long sequences + +### Total Training Memory (Approximate) +``` +Total ≈ Model + Gradients + Optimizer States + Activations + ≈ Params + Params + 2×Params + Activations + ≈ 4×Params + Activations +``` + +For FP32 training with TinyOpenFoldConfig: +- Model: 10.6 MB +- Gradients: 10.6 MB +- Optimizer: 21.2 MB +- **Base**: 42.4 MB (before activations) + +For batch=4, n_seqs=16, seq_len=64: +- MSA activations: 4 × 16 × 64 × 64 × 4 bytes ≈ 1 MB +- Pair 
activations: 4 × 64 × 64 × 128 × 4 bytes ≈ 8 MB +- Total with activations: ~50-60 MB + +## Key Differences from Standard AlphaFold 2 + +1. **Reduced Dimensions**: 64/128 vs 256/128 in production +2. **Fewer Blocks**: 4 vs 48 Evoformer blocks +3. **No Templates**: Skips template featurization and template embedder +4. **Simplified Structure Module**: Direct distance/angle prediction instead of full IPA with frames +5. **No Recycling**: Single forward pass instead of multiple recycling iterations +6. **Synthetic Data**: Uses random MSA/pair features instead of real protein data +7. **Educational Focus**: Emphasis on clarity and understanding over production performance + +## Key Innovations of Evoformer + +1. **Dual Representation Updates**: MSA and pair representations evolve together, sharing information +2. **Triangle Multiplicative Updates**: Geometric inductive bias for spatial reasoning +3. **Outer Product Mean**: Projects MSA patterns onto pairwise space +4. **Pair Bias in MSA Attention**: Pairwise information guides sequence-level attention +5. **Multi-Scale Attention**: Row-wise (within sequence) and column-wise (across sequences) + +## Computational Complexity + +### MSA Operations +- **Row Attention**: O(n_seqs × seq_len² × msa_dim) +- **Column Attention**: O(seq_len × n_seqs² × msa_dim) +- For small MSAs, row attention dominates + +### Pair Operations +- **Triangle Updates**: O(seq_len³ × pair_dim) - most expensive! +- **Triangle Attention**: O(seq_len³ × pair_dim) +- **Pair Transition**: O(seq_len² × pair_dim × pair_intermediate_dim) + +### Bottlenecks +For typical configs (seq_len=64-256): +1. **Triangle operations** are O(N³) and dominate for longer sequences +2. **Pair transition** is memory-bound for large pair_dim +3. 
**MSA column attention** can be expensive for large MSAs + +## Code Reference + +```python +# From tiny_openfold_v1.py +total_params = sum(p.numel() for p in model.parameters()) +print(f"Total parameters: {total_params:,}") +print(f"Model size: {total_params * 4 / 1e6:.1f} MB (FP32)") +``` + +## References + +1. **AlphaFold 2 Paper**: Jumper et al., "Highly accurate protein structure prediction with AlphaFold", Nature 2021 +2. **OpenFold**: https://github.com/aqlaboratory/openfold - Open source reproduction +3. **Evoformer Details**: AlphaFold 2 Supplement, Section 1.6 +4. **Triangle Updates**: Supplement Section 1.6.7-1.6.8 +5. **Structure Module**: Supplement Section 1.8 + diff --git a/MLExamples/TinyOpenFold/PERFORMANCE_OPTIMIZATION_TUTORIAL.md b/MLExamples/TinyOpenFold/PERFORMANCE_OPTIMIZATION_TUTORIAL.md new file mode 100644 index 00000000..1ea0955f --- /dev/null +++ b/MLExamples/TinyOpenFold/PERFORMANCE_OPTIMIZATION_TUTORIAL.md @@ -0,0 +1,771 @@ +# TinyOpenFold: Complete Performance Optimization Tutorial + +**Learn GPU optimization by progressively improving AlphaFold 2 Evoformer performance** + +This tutorial demonstrates the complete GPU optimization pipeline from baseline PyTorch to custom Triton kernels, achieving up to a **2.0x speedup** on the small problem size (1.65x on the medium one) using synthetic MSA/pair inputs. + +--- + +## Table of Contents +1. [Tutorial Overview](#tutorial-overview) +2. [Environment Setup](#environment-setup) +3. [Stage 1: Baseline (V1)](#stage-1-baseline-v1---pure-pytorch) +4. [Stage 2: Kernel Fusion (V2)](#stage-2-kernel-fusion-v2---pytorch-level-optimization) +5. [Stage 3: Custom Kernels (V3)](#stage-3-custom-triton-kernels-v3---gpu-level-optimization) +6. [Performance Analysis](#performance-analysis) +7. [Lessons Learned](#lessons-learned) + +--- + +## Tutorial Overview + +### What You'll Learn + +This tutorial covers the complete optimization pipeline from profiling to implementation.
You'll start by establishing baseline performance metrics with clean PyTorch code, then apply high-level kernel fusion optimizations without writing custom GPU code. Next, you'll drop down to low-level custom Triton kernels for maximum performance. Throughout the journey, you'll learn profiling techniques to identify bottlenecks at each stage and develop the analytical skills to understand exactly where speedups come from. + +### Performance Journey + +``` +Version 1 (Baseline) → Version 2 (Fused) → Version 3 (Triton) + 80.5 samples/sec 106.4 samples/sec 162.5 samples/sec + 100% +32% +102% + [Pure PyTorch] [Kernel Fusion] [Custom Kernels] +``` + +### Problem Sizes (Small & Medium for best demonstration) + +| Size | Seq Length | MSA Seqs | Batch | Memory | Best For | +|------|------------|----------|-------|--------|----------| +| **Small** | 64 | 16 | 4 | ~196 MB | Quick demos, shows best speedup (2.0x) | +| **Medium** | 128 | 32 | 2 | ~209 MB | Realistic workloads, balanced performance (1.65x) | + +--- + +## Environment Setup + +```bash +# Load required modules +module load python/3.12 rocm/7.2 libffi/3.3 + +# Navigate to TinyOpenFold (adjust the path to wherever you cloned HPCTrainingExamples) +cd HPCTrainingExamples/MLExamples/TinyOpenFold + +# Activate virtual environment +source venvOF/bin/activate + +# Verify GPU +python3 -c "import torch; print(f'GPU: {torch.cuda.get_device_name(0)}')" +``` + +**Expected**: `GPU: AMD Instinct MI300X` + +--- + +## Stage 1: Baseline (V1) - Pure PyTorch + +### Objective +Establish baseline performance with clean, readable PyTorch implementation. + +### Characteristics + +The baseline prioritizes clarity over performance. The code is clean and well-documented, using only standard PyTorch operations that anyone can understand. However, it's completely unoptimized—each operation launches a separate GPU kernel with no fusion. This means kernel launch overhead dominates execution time, especially for small workloads.
+ +### Run Small Problem + +```bash +cd version1_pytorch_baseline + +python3 tiny_openfold_v1.py \ + --seq-len 64 \ + --num-seqs 16 \ + --batch-size 4 \ + --num-blocks 4 \ + --num-steps 30 \ + --device 0 +``` + +### Expected Output + +``` +================================================================================ +TINY OPENFOLD - VERSION 1: PYTORCH BASELINE +================================================================================ + +Model Configuration: + MSA dimension: 64 + Pair dimension: 128 + Evoformer blocks: 4 + Total parameters: 2,653,760 + Model size: 10.6 MB (FP32) + +Training Configuration: + Training steps: 30 + Batch size: 4 + +====================================================================== +Step 0/30 | Loss: 33.06 | Speed: 80.5 samples/sec | Memory: 195.7 MB | Time: 49.7ms +Step 10/30 | Loss: 33.25 | Speed: 80.5 samples/sec | Memory: 195.7 MB | Time: 49.7ms +Step 20/30 | Loss: 33.45 | Speed: 80.5 samples/sec | Memory: 195.7 MB | Time: 49.7ms +====================================================================== + +Performance Summary: + Average training speed: 80.5 samples/sec + Average batch time: 49.7 ms + Average forward time: 18.3 ms + Average backward time: 27.2 ms + Average optimizer time: 4.1 ms + Peak memory usage: 195.7 MB +``` + +### Key Metrics (Small Problem) + +| Metric | Value | Notes | +|--------|-------|-------| +| Speed | **80.5 samples/sec** | Baseline reference | +| Batch time | **49.7 ms** | Total time per step | +| Forward | 18.3 ms | 37% of batch time | +| **Backward** | **27.2 ms** | **55% of batch time** (main bottleneck) | +| Optimizer | 4.1 ms | 8% of batch time | +| Memory | 195.7 MB | Peak allocation | + +### Bottleneck Analysis + +**Profile with PyTorch Profiler:** + +```bash +python3 tiny_openfold_v1.py \ + --seq-len 64 --num-seqs 16 --batch-size 4 \ + --enable-pytorch-profiler \ + --profile-dir ./profiles_v1_small \ + --device 0 + +# View results +tensorboard --logdir ./profiles_v1_small +``` + 
+**What to look for in the profiler traces:** + +You'll notice multiple attention kernels where Q, K, and V are computed as separate operations instead of being fused. Triangle operations dominate the backward pass due to their O(N³) complexity. You'll also see significant kernel launch overhead from many small, short-lived kernel calls. + +**Optional: ROCm System-Level Profiling** + +For deeper insights into GPU utilization and kernel behavior, use rocprof-sys: + +```bash +# Profile GPU kernels and API calls +./run_rocprof_sys.sh + +# Results show: kernel launch frequency, memory transfers, GPU occupancy +# Look for: many short-lived kernels, poor occupancy on small operations +``` + +### Run Medium Problem + +```bash +python3 tiny_openfold_v1.py \ + --seq-len 128 \ + --num-seqs 32 \ + --batch-size 2 \ + --num-blocks 4 \ + --num-steps 30 \ + --device 0 +``` + +### Key Metrics (Medium Problem) + +| Metric | Value | Notes | +|--------|-------|-------| +| Speed | **41.5 samples/sec** | Half of small (expected - 4x work) | +| Batch time | **48.2 ms** | Similar to small! (batch size = 2 vs 4) | +| Forward | 17.4 ms | 36% of batch time | +| **Backward** | **26.8 ms** | **56% of batch time** | +| Optimizer | 4.0 ms | 8% of batch time | +| Memory | 208.9 MB | Scales with sequence length² | + +### Stage 1 Summary + +**Baseline Established:** + +We now have reference numbers for both problem sizes. The small problem runs at 80.5 samples/sec with 49.7 ms per batch, while the medium problem achieves 41.5 samples/sec at 48.2 ms per batch. + +**Bottlenecks Identified:** + +Profiling reveals where optimization will have the most impact. The backward pass dominates at 55-56% of total time, with multiple kernel launches for attention operations creating unnecessary overhead. Triangle operations are particularly compute-intensive due to their cubic complexity. 
+ +**Next Step**: Apply kernel fusion to reduce launch overhead + +--- + +## Stage 2: Kernel Fusion (V2) - PyTorch-Level Optimization + +### Objective +Reduce kernel launch overhead by fusing operations at the PyTorch level. + +### Optimizations Applied + +#### 1. MSA QKV Fusion +**Before (V1)**: +```python +q = self.q_proj(msa) # Kernel 1 +k = self.k_proj(msa) # Kernel 2 +v = self.v_proj(msa) # Kernel 3 +``` + +**After (V2)**: +```python +qkv = self.qkv_proj(msa) # Single fused kernel +q, k, v = qkv.chunk(3, dim=-1) +``` + +**Benefit**: 3 kernels → 1 kernel + +#### 2. Flash Attention +**Before (V1)**: +```python +# Standard attention: O(N²) memory +scores = torch.matmul(q, k.transpose(-2, -1)) +attn_weights = softmax(scores / sqrt(d_k)) +output = torch.matmul(attn_weights, v) +``` + +**After (V2)**: +```python +# Flash Attention: O(N) memory, fused kernel +output = F.scaled_dot_product_attention(q, k, v) +``` + +**Benefit**: Memory-efficient, fewer kernels, better cache utilization + +#### 3. 
Triangle Gate/Proj Fusion +**Before (V1)**: +```python +left = self.left_proj(pair) # Kernel 1 +right = self.right_proj(pair) # Kernel 2 +left_gate = sigmoid(self.left_gate_proj(pair)) # Kernel 3 +right_gate = sigmoid(self.right_gate_proj(pair)) # Kernel 4 +``` + +**After (V2)**: +```python +# Fused gate and projection +combined = self.fused_gate_proj(pair) # Single kernel +left, right, left_gate, right_gate = combined.chunk(4, dim=-1) +left_gate = sigmoid(left_gate) +right_gate = sigmoid(right_gate) +``` + +**Benefit**: 4 kernels → 2 kernels + +### Run Small Problem (V2) + +```bash +cd ../version2_pytorch_fused + +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 64 \ + --num-seqs 16 \ + --batch-size 4 \ + --num-blocks 4 \ + --num-steps 30 +``` + +### Expected Output + +``` +================================================================================ +TINY OPENFOLD - VERSION 2: PYTORCH FUSED +================================================================================ + +Fusion Optimizations: + MSA QKV Fusion: Enabled + Triangle QKV Fusion: Enabled + Flash Attention: Enabled + Triangle Gate/Proj Fusion: Enabled + Kernel Reduction: 80.0% (48 fewer kernels) + +====================================================================== +Step 0/30 | Loss: 33.06 | Speed: 106.4 samples/sec | Memory: 195.7 MB | Time: 37.6ms +Step 10/30 | Loss: 33.25 | Speed: 106.4 samples/sec | Memory: 195.7 MB | Time: 37.6ms +Step 20/30 | Loss: 33.45 | Speed: 106.4 samples/sec | Memory: 195.7 MB | Time: 37.6ms +====================================================================== + +Performance Summary V2: + Average training speed: 106.4 samples/sec [+32% vs V1] + Average batch time: 37.6 ms [-24% vs V1] + Average forward time: 14.7 ms [-20% vs V1] + Average backward time: 19.5 ms [-28% vs V1] + Average optimizer time: 3.4 ms [-17% vs V1] + Peak memory usage: 195.7 MB [Same as V1] +``` + +### V1 → V2 Improvement (Small Problem) + +| Metric | V1 | V2 | Improvement | 
+|--------|----|----|-------------| +| Speed | 80.5 s/s | 106.4 s/s | **+32%** ⚡ | +| Batch time | 49.7 ms | 37.6 ms | **-24%** | +| Forward | 18.3 ms | 14.7 ms | -20% | +| **Backward** | **27.2 ms** | **19.5 ms** | **-28%** ⚡⚡ | +| Optimizer | 4.1 ms | 3.4 ms | -17% | +| Memory | 195.7 MB | 195.7 MB | **No change** | + +**Key Insight**: Backward pass sees the largest improvement (28% reduction) + +### Run Medium Problem (V2) + +```bash +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 128 \ + --num-seqs 32 \ + --batch-size 2 \ + --num-blocks 4 \ + --num-steps 30 +``` + +### V1 → V2 Improvement (Medium Problem) + +| Metric | V1 | V2 | Improvement | +|--------|----|----|-------------| +| Speed | 41.5 s/s | 49.0 s/s | **+18%** | +| Batch time | 48.2 ms | 40.8 ms | **-15%** | +| Forward | 17.4 ms | 14.5 ms | -17% | +| **Backward** | **26.8 ms** | **22.9 ms** | **-15%** | +| Optimizer | 4.0 ms | 3.4 ms | -15% | +| Memory | 208.9 MB | 208.9 MB | **No change** | + +### Ablation Study: Which Fusion Helps Most? 
+ +Test individual optimizations to understand their contribution: + +```bash +cd version2_pytorch_fused + +# Baseline (all fusions disabled) +echo "=== Baseline (all fusions off) ===" +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 64 --num-seqs 16 --batch-size 4 --num-steps 20 \ + --disable-all-fusion | grep "Average training speed" + +# Only MSA QKV fusion +echo "=== Only MSA QKV fusion ===" +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 64 --num-seqs 16 --batch-size 4 --num-steps 20 \ + --disable-all-fusion --enable-qkv-fusion-msa | grep "Average training speed" + +# Only Flash Attention +echo "=== Only Flash Attention ===" +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 64 --num-seqs 16 --batch-size 4 --num-steps 20 \ + --disable-all-fusion --enable-flash-attention | grep "Average training speed" + +# Only Triangle fusion +echo "=== Only Triangle fusion ===" +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 64 --num-seqs 16 --batch-size 4 --num-steps 20 \ + --disable-all-fusion --enable-triangle-fusion | grep "Average training speed" + +# All fusions (default) +echo "=== All fusions enabled ===" +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py \ + --seq-len 64 --num-seqs 16 --batch-size 4 --num-steps 20 | grep "Average training speed" +``` + +**Expected Results:** + +Each optimization contributes differently to the total speedup. The baseline with no fusion runs at ~80 samples/sec. Enabling only MSA QKV fusion improves this to ~87 samples/sec (+9%), while Flash Attention alone achieves ~92 samples/sec (+15%). Triangle fusion by itself reaches ~85 samples/sec (+6%). However, when all fusions are enabled together, performance jumps to ~106 samples/sec (+32%). + +**Key Learning**: Flash Attention provides the biggest single benefit, but combined optimizations are synergistic. 
+ +### Verify Fusion Impact with ROCm Profilers + +Now that we've fused kernels, let's verify the improvements at the hardware level: + +```bash +cd version2_pytorch_fused + +# Kernel-level profiling with rocprofv3 +./run_rocprofv3.sh + +# Hardware counter analysis with rocprof-compute +./run_rocprof_compute.sh + +# Compare kernel counts: V1 vs V2 +# V1: ~240 kernel launches per step +# V2: ~48 kernel launches per step (80% reduction!) +``` + +**Key metrics to check:** +- **Kernel count**: Should see dramatic reduction in total kernel launches +- **Memory bandwidth**: Flash Attention should reduce HBM traffic by 50-80% +- **Occupancy**: Fused kernels should show better GPU utilization + +### Stage 2 Summary + +**Achievements:** + +Kernel fusion delivers solid gains without increasing memory usage. For the small problem, we've improved from 80.5 to 106.4 samples/sec (+32%), while the medium problem went from 41.5 to 49.0 samples/sec (+18%). We've reduced the total number of kernel launches by 80% without any memory overhead. + +**Remaining Bottlenecks:** + +Even with fusion, there's still room for improvement. We're still relying on generic PyTorch kernels that aren't optimized for our specific use case. The backward pass continues to dominate execution time, and memory bandwidth isn't fully optimized since PyTorch can't exploit all hardware capabilities. + +**Next Step**: Drop to GPU level with custom Triton kernels + +--- + +## Stage 3: Custom Triton Kernels (V3) - GPU-Level Optimization + +### Objective +Hand-optimize critical kernels with Triton for maximum performance. + +### Triton Optimizations + +#### 1. Custom LayerNorm Kernel +**Why optimize?** Standard LayerNorm is memory-bound and makes multiple passes through data. 
+ +**Triton Implementation**: +```python +@triton.jit +def layernorm_kernel( + x_ptr, weight_ptr, output_ptr, + n_elements, eps: tl.constexpr, BLOCK_SIZE: tl.constexpr +): + """ + Fused LayerNorm: compute mean, variance, normalize, and scale in one pass. + + Memory optimization: + - Input is loaded from memory once (statistics and normalization reuse it) + - Mean/variance computed in registers + - Immediate normalization and scaling + """ + # Load block of data + block_id = tl.program_id(0) + offset = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offset < n_elements + + x = tl.load(x_ptr + offset, mask=mask, other=0.0) + + # Compute statistics in registers + mean = tl.sum(x, axis=0) / n_elements + var = tl.sum((x - mean) * (x - mean), axis=0) / n_elements + + # Normalize and scale + rstd = 1 / tl.sqrt(var + eps) + weight = tl.load(weight_ptr + offset, mask=mask, other=1.0) + output = (x - mean) * rstd * weight + + # Store result + tl.store(output_ptr + offset, output, mask=mask) +``` + +**Benefits:** + +Custom implementation beats PyTorch's generic approach. Instead of 3+ separate kernel launches, we execute everything in a single kernel. Data stays in cache and registers rather than being written back to main memory between operations. Memory access patterns are hand-optimized for sequential reads and writes. Note that the listing above is simplified for readability: its statistics are only correct when a single program covers the whole normalized vector with no masked lanes (`BLOCK_SIZE == n_elements`), because masked lanes load `0.0` and the sums are divided by `n_elements`. + +#### 2. Flash Attention for MSA (Triton) +**Why optimize?** MSA operations dominate forward/backward passes. + +**Key Optimizations:** + +MSA attention is memory-bound, so we focus on reducing data movement. Tiled computation allows us to fit working sets in shared memory, dramatically reducing expensive HBM (main memory) traffic. The implementation is specifically optimized for ROCm/AMD GPUs, taking advantage of architectural features like LDS (local data share). + +#### 3. Flash Attention for Triangles (Triton) +**Why optimize?** Triangle operations are O(N³) and very expensive.
+ +**Key Optimizations:** + +Triangle operations have O(N³) complexity, making backward pass optimization critical. We use a custom tiling strategy designed specifically for the pair representation's access patterns. Memory transfers are minimized by reusing data across tiles. The backward pass gets special attention since it's the biggest bottleneck—custom gradient implementations avoid PyTorch's generic autograd overhead. + +### Run Small Problem (V3) + +```bash +cd ../version3_triton + +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v3.py \ + --seq-len 64 \ + --num-seqs 16 \ + --batch-size 4 \ + --num-blocks 4 \ + --num-steps 30 +``` + +### Expected Output + +``` +================================================================================ +TINY OPENFOLD - VERSION 3: TRITON CUSTOM KERNELS +================================================================================ + +Triton Kernel Performance: + Custom kernels active: LayerNorm, Flash Attention (MSA & Triangle) + Kernel fusion benefits: Reduced memory bandwidth, lower latency + +Running 5 warmup steps to compile Triton kernels... +Warmup complete. Triton kernels compiled. Starting measured training loop... 
+ +====================================================================== +Step 0/30 | Loss: 33.12 | Speed: 162.5 samples/sec | Memory: 218.5 MB | Time: 24.6ms +Step 10/30 | Loss: 33.26 | Speed: 163.5 samples/sec | Memory: 218.5 MB | Time: 24.5ms +Step 20/30 | Loss: 33.45 | Speed: 163.2 samples/sec | Memory: 218.5 MB | Time: 24.5ms +====================================================================== + +Performance Summary V3: + Average training speed: 162.5 samples/sec [+102% vs V1, +53% vs V2] + Average batch time: 24.6 ms [-51% vs V1, -35% vs V2] + Average forward time: 14.0 ms [-23% vs V1, -5% vs V2] + Average backward time: 8.5 ms [-69% vs V1, -56% vs V2] + Average optimizer time: 1.5 ms [-63% vs V1, -56% vs V2] + Peak memory usage: 218.5 MB [+12% vs V1/V2] +``` + +### V1 → V2 → V3 Progression (Small Problem) + +| Metric | V1 | V2 | V3 | V1→V2 | V2→V3 | **V1→V3** | +|--------|----|----|----|----- |-------|-----------| +| **Speed** | 80.5 s/s | 106.4 s/s | **162.5 s/s** | +32% | +53% | **+102%** ⚡⚡⚡ | +| Batch time | 49.7 ms | 37.6 ms | **24.6 ms** | -24% | -35% | **-51%** | +| Forward | 18.3 ms | 14.7 ms | **14.0 ms** | -20% | -5% | **-23%** | +| **Backward** | **27.2 ms** | **19.5 ms** | **8.5 ms** | -28% | -56% | **-69%** ⚡⚡⚡ | +| Optimizer | 4.1 ms | 3.4 ms | **1.5 ms** | -17% | -56% | **-63%** | +| Memory | 195.7 MB | 195.7 MB | 218.5 MB | 0% | +12% | +12% | + +**🎯 Key Achievement**: Backward pass reduced by **69%** (27.2 → 8.5 ms)! 
+ +### Run Medium Problem (V3) + +```bash +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v3.py \ + --seq-len 128 \ + --num-seqs 32 \ + --batch-size 2 \ + --num-blocks 4 \ + --num-steps 30 +``` + +### V1 → V2 → V3 Progression (Medium Problem) + +| Metric | V1 | V2 | V3 | V1→V2 | V2→V3 | **V1→V3** | +|--------|----|----|----|----- |-------|-----------| +| **Speed** | 41.5 s/s | 49.0 s/s | **68.5 s/s** | +18% | +40% | **+65%** ⚡⚡ | +| Batch time | 48.2 ms | 40.8 ms | **29.2 ms** | -15% | -28% | **-39%** | +| Forward | 17.4 ms | 14.5 ms | **14.8 ms** | -17% | +2% | **-15%** | +| **Backward** | **26.8 ms** | **22.9 ms** | **11.7 ms** | -15% | -49% | **-56%** ⚡⚡⚡ | +| Optimizer | 4.0 ms | 3.4 ms | **1.6 ms** | -15% | -53% | **-60%** | +| Memory | 208.9 MB | 208.9 MB | 259.9 MB | 0% | +24% | +24% | + +**🎯 Key Achievement**: Backward pass reduced by **56%** (26.8 → 11.7 ms)! + +### Why V3 is So Much Faster + +Triton kernels give us fine-grained control over memory hierarchy. **Custom LayerNorm** fuses all computation into a single pass through data instead of PyTorch's multi-pass approach. **Optimized Flash Attention** is hand-tuned for ROCm with carefully designed memory access patterns. **Triangle Backward Optimization** uses custom gradients that generate minimal memory traffic compared to autograd. Finally, **Register/Cache Utilization** is maximized by keeping data in fast memory (registers and L1 cache) much longer than generic PyTorch kernels allow. 
+ +### Analyze Triton Kernel Performance + +Verify that custom Triton kernels are actually faster at the hardware level: + +```bash +cd version3_triton + +# Profile Triton kernel efficiency +./run_rocprof_compute.sh + +# System-level view of Triton kernels +./run_rocprof_sys.sh +``` + +**What to verify:** +- **Custom LayerNorm**: Single kernel vs 3+ PyTorch kernels, better register usage +- **Flash Attention**: Reduced HBM bandwidth (memory-bound → compute-bound) +- **Triangle kernels**: Improved cache hit rate, minimized memory traffic +- **Overall occupancy**: Higher GPU utilization compared to V1/V2 + +**Pro tip**: Compare rocprof-compute outputs between V2 and V3 to see memory bandwidth reduction—this is where Triton shines. + +### Stage 3 Summary + +**Final Achievements:** + +Custom kernels deliver the biggest gains of any optimization stage. The small problem improved from 80.5 to 162.5 samples/sec—a **2.0x speedup**! The medium problem went from 41.5 to 68.5 samples/sec (**1.65x speedup**). Most impressively, the backward pass is 69% faster for small problems and 56% faster for medium ones. + +**Trade-offs:** + +Every optimization has costs—here's what we traded for 2x speedup. We achieved massive performance gains while maintaining the same numerical accuracy as the baseline. However, memory usage increased by 12-24% (still very manageable). The code is also more complex due to custom Triton kernels, which require GPU programming expertise to maintain. 
+ +--- + +## Performance Analysis + +### Complete Comparison Table + +#### Small Problem (64 residues, 16 MSA, batch=4) + +``` +Metric V1 Baseline V2 Fused V3 Triton Total Gain +─────────────────────────────────────────────────────────────────────── +Speed (s/s) 80.5 106.4 162.5 +102% ⚡⚡⚡ +Batch (ms) 49.7 37.6 24.6 -51% +Forward (ms) 18.3 14.7 14.0 -23% +Backward (ms) 27.2 19.5 8.5 -69% ⚡⚡⚡ +Optimizer (ms) 4.1 3.4 1.5 -63% +Memory (MB) 195.7 195.7 218.5 +12% +─────────────────────────────────────────────────────────────────────── +``` + +#### Medium Problem (128 residues, 32 MSA, batch=2) + +``` +Metric V1 Baseline V2 Fused V3 Triton Total Gain +─────────────────────────────────────────────────────────────────────── +Speed (s/s) 41.5 49.0 68.5 +65% ⚡⚡ +Batch (ms) 48.2 40.8 29.2 -39% +Forward (ms) 17.4 14.5 14.8 -15% +Backward (ms) 26.8 22.9 11.7 -56% ⚡⚡⚡ +Optimizer (ms) 4.0 3.4 1.6 -60% +Memory (MB) 208.9 208.9 259.9 +24% +─────────────────────────────────────────────────────────────────────── +``` + +### Optimization Contribution Breakdown + +#### Small Problem +``` +V1 → V2 (+32%): + - MSA QKV fusion: ~9% + - Flash Attention: ~15% + - Triangle fusion: ~8% + = Total: 32% (synergistic effect) + +V2 → V3 (+53%): + - Custom LayerNorm: ~10% + - Flash Attention (MSA): ~20% + - Flash Attention (Triangle): ~23% + = Total: 53% + +V1 → V3 (+102%): + = Multiplicative effect: 1.32 × 1.53 ≈ 2.0x +``` + +### Where Did the Speedup Come From? + +**Backward Pass Optimization is Key:** +- V1: 27.2 ms (55% of batch time) +- V2: 19.5 ms (52% of batch time) +- V3: 8.5 ms (35% of batch time) + +**Reduction**: 27.2 → 8.5 ms = **-69% improvement** + +This accounts for most of the total speedup! + +### Memory Trade-off Analysis + +The small problem shows a memory increase from 195.7 to 218.5 MB (+23 MB, +12%) because Triton kernels trade some memory for speed—they allocate scratch space for intermediate computations and use additional buffers for tiled operations. 
However, this cost is negligible compared to the performance gain. The 23 MB increase is trivial on modern GPUs with 192 GB of HBM, and the 2.0x speedup far outweighs this small memory cost while still leaving plenty of headroom for much larger problems. + +--- + +## Lessons Learned + +### 1. Optimization Strategy + +**Best Approach:** + +Always optimize incrementally—don't skip steps. Start with a clean, readable baseline (V1) to establish reference performance, then profile thoroughly to identify the real bottlenecks rather than what you assume they are. Apply high-level optimizations first (V2 - kernel fusion) since these are easier to implement and debug, and only drop to low-level custom kernels (V3) when you've exhausted higher-level options. Don't jump straight to custom kernels—in this tutorial, kernel fusion alone delivered roughly a third of the final throughput gain on the small problem (+32% of the total +102%) for a small fraction of the effort! + +### 2. Backward Pass Matters Most + +In deep learning workloads, the backward pass often dominates execution time at 50-60% of total runtime, making it the primary optimization target. Our results confirm this: V3's backward pass optimization delivered the biggest gains with a 56-69% reduction, accounting for most of the overall speedup. When profiling a training workload, check the backward pass first—but let the measurements, not this rule of thumb, decide where to optimize. + +### 3. Problem Size Affects Speedup + +The speedup you achieve depends heavily on problem size. **Small problems** (64 residues) show the largest speedup at 2.0x because kernel launch overhead dominates, and our optimizations directly address this. **Medium problems** (128 residues) still achieve good speedup at 1.65x with a more balanced workload between kernel overhead and actual computation. + +**Lesson**: Optimize for your target workload size! + +### 4. Memory vs Speed Trade-offs + +Each version offers a different balance. V2 has no memory cost while delivering an 18-32% speedup (medium and small problems, respectively)—you should **always use** kernel fusion. 
V3 adds 12% memory overhead but doubles performance with a 102% speedup—**use it when speed matters** more than memory. + +### 5. Incremental Development + +Progressive optimization allows you to validate, debug, and learn at each step. You can validate correctness at each stage by comparing outputs against the baseline. When something breaks, you can easily isolate which optimization caused the problem. From an educational perspective, you understand what each optimization contributes rather than seeing a black box. Finally, you have the flexibility to choose your optimization level based on specific needs—readability, memory constraints, or maximum performance. + +--- + +## Quick Reference Commands + +### Complete Tutorial Run (All 3 Versions, Both Sizes) + +```bash +# Automated tutorial script +bash optimization_tutorial.sh +``` + +**Duration**: ~30 seconds +**Output**: Complete progression V1 → V2 → V3 for small and medium + +### Manual Individual Runs + +```bash +# Small Problem +## V1 Baseline +cd version1_pytorch_baseline +python3 tiny_openfold_v1.py --seq-len 64 --num-seqs 16 --batch-size 4 --device 0 + +## V2 Fused +cd ../version2_pytorch_fused +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v2.py --seq-len 64 --num-seqs 16 --batch-size 4 + +## V3 Triton +cd ../version3_triton +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v3.py --seq-len 64 --num-seqs 16 --batch-size 4 + +# Medium Problem - same commands but: +# --seq-len 128 --num-seqs 32 --batch-size 2 +``` + +--- + +## Profiling Cheat Sheet + +Quick reference for ROCm profiling tools across all versions: + +| Tool | What It Shows | When to Use | Command | +|------|---------------|-------------|---------| +| **PyTorch Profiler** | High-level PyTorch ops, kernel names | Initial bottleneck identification | `--enable-pytorch-profiler` | +| **rocprof-sys** | System-level GPU trace, kernel timeline | Overall GPU utilization, kernel patterns | `./run_rocprof_sys.sh` | +| **rocprofv3** | Detailed kernel metrics, 
launch counts | Verify fusion, count kernel launches | `./run_rocprofv3.sh` | +| **rocprof-compute** | Hardware counters, memory bandwidth | Memory bottlenecks, cache efficiency | `./run_rocprof_compute.sh` | + +**Typical workflow**: Start with PyTorch Profiler → rocprof-sys for overview → rocprof-compute for memory analysis → rocprofv3 for kernel details. + +--- + +## Next Steps: Advanced Optimizations + +### 1. Mixed Precision (V3 + AMP) +```bash +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v3.py \ + --seq-len 128 --num-seqs 32 --batch-size 2 --use-amp +``` +**Expected**: Additional 20-30% speedup + +### 2. Torch Compile (V3 + Compiler) +```bash +ROCR_VISIBLE_DEVICES=0 python3 tiny_openfold_v3.py \ + --seq-len 128 --num-seqs 32 --batch-size 2 --enable-torch-compile +``` +**Expected**: Additional 10-20% speedup + +### 3. Multi-GPU (V3 + Data Parallel) +```bash +ROCR_VISIBLE_DEVICES=0,1,2,3 python3 tiny_openfold_v3.py \ + --seq-len 128 --num-seqs 32 --batch-size 8 +``` +**Expected**: Up to near-linear scaling (3.5-3.8x on 4 GPUs) when each GPU has enough work; small per-GPU workloads scale far less (the README's V1 `nn.DataParallel` example reaches only 1.47x on 4 GPUs), so measure before relying on it + +--- + +## Summary: What You Learned + +You now have a complete mental model of GPU optimization. You learned how to establish reference performance through baseline measurement, identify bottlenecks systematically using profiling tools, and apply high-level PyTorch kernel fusion optimizations. You progressed to low-level GPU programming with custom Triton kernels, developed skills in performance analysis to understand where speedups actually come from, and learned to evaluate trade-offs between memory usage, speed, complexity, and maintainability. + +**Final Achievement**: **2.0x speedup** on small workloads through systematic optimization—you now have the blueprint to unlock similar performance gains in your own GPU workloads, from baseline profiling to production-ready custom kernels. 
diff --git a/MLExamples/TinyOpenFold/README.md b/MLExamples/TinyOpenFold/README.md new file mode 100644 index 00000000..a3329e1e --- /dev/null +++ b/MLExamples/TinyOpenFold/README.md @@ -0,0 +1,478 @@ +# TinyOpenFold: Educational AlphaFold 2 Implementation + +A simplified, educational implementation of the AlphaFold 2 / Evoformer architecture for protein structure prediction, designed for learning and profiling. + +

+ PyTorch + Python + License +

+ +## Overview + +TinyOpenFold is an educational implementation of the core AlphaFold 2 architecture, focusing on the **Evoformer** - the main innovation that revolutionized protein structure prediction. This implementation is designed to: + +- **Teach** the fundamental concepts of AlphaFold 2's architecture +- **Profile** performance characteristics of protein structure prediction models +- **Demonstrate** how MSA (Multiple Sequence Alignment) and pair representations interact +- **Provide** a foundation for experimenting with optimization techniques + +## Features + +✅ **Complete Evoformer Implementation** +- MSA row-wise attention with pair bias +- MSA column-wise attention +- Triangle multiplicative updates (outgoing/incoming) +- Triangle self-attention (starting/ending) +- Outer product mean + +✅ **Comprehensive Profiling Integration** +- PyTorch Profiler with GPU/CPU timeline analysis +- Memory profiling and tracking +- Operator-level performance characterization +- TensorBoard visualization support + +✅ **Educational Focus** +- Clear, readable code with extensive documentation +- Parameter counting and memory analysis +- Synthetic data generation for demonstration +- Deterministic execution for reproducibility + +## Quick Start + +### Environment Setup and Installation + +Set up your Python environment and install dependencies: + +```bash +# Load modules (choose one option) +module load python/3.12 rocm/7.2 # Standard Python (recommended) +# OR +module load cray-python rocm/7.2 # Cray environment + +# Navigate to TinyOpenFold directory +cd HPCTrainingExamples/MLExamples/TinyOpenFold + +# Create and activate virtual environment +python3 -m venv venv +source venv/bin/activate + +# Verify Python version +python3 --version + +# Upgrade pip and install build tools +pip3 install --upgrade pip setuptools wheel + +# Install PyTorch with ROCm support (using ROCm 7.1 nightly build) +# For ROCm 6.4: +# pip3 install torch torchvision torchaudio --index-url 
https://download.pytorch.org/whl/rocm6.4 + +# For ROCm 7.1 nightly (recommended): +pip3 uninstall -y torch torchvision triton torchaudio 2>/dev/null || true +pip3 install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/rocm7.1 + +# Fix libcaffe2_nvrtc.so library loading issue +# Ensure ROCm and libffi modules are loaded (sets up library paths) +module load rocm/7.2 libffi/3.3 + +# Re-activate venv +source venv/bin/activate + +# Add PyTorch lib directory from venv to LD_LIBRARY_PATH +# This ensures caffe2 libraries are found from the venv installation +export LD_LIBRARY_PATH=$(python3 -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'lib'))"):${ROCM_PATH}/lib:$LD_LIBRARY_PATH + +# Optional: Add to ~/.bashrc for persistence +# echo "export LD_LIBRARY_PATH=\$(python3 -c \"import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'lib'))\"):\${ROCM_PATH}/lib:\$LD_LIBRARY_PATH" >> ~/.bashrc + +# Verify PyTorch installation +python3 -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}')" + +# Install DeepSpeed +pip3 install deepspeed + +# Verify DeepSpeed installation +python3 -c "from deepspeed.profiling.flops_profiler import FlopsProfiler; print('DeepSpeed installed successfully.')" + +# Install additional dependencies (if needed) +pip3 install -r setup/requirements.txt + +# Install rocprof-compute development dependencies (for rocprof-compute profiling) +pip3 install -r setup/requirements_rocprof-compute-develop.txt +``` + +**Note**: Activate the virtual environment (`source venv/bin/activate`) each time you start a new session. 
+ +### Basic Training + +```bash +# The training script lives in the baseline directory +cd version1_pytorch_baseline + +# Run with default configuration (64 residues, 16 MSA sequences) +python3 tiny_openfold_v1.py --batch-size 4 --seq-len 64 --num-steps 30 + +# Expected output: +# Total parameters: ~2.6M +# Model size: ~10.6 MB (FP32) +# Training speed: varies by hardware +``` + +### With Profiling + +```bash +# Enable PyTorch profiler +python3 tiny_openfold_v1.py --enable-pytorch-profiler --profile-dir ./profiles + +# View results in TensorBoard +tensorboard --logdir ./profiles +``` + +### Advanced Configuration + +```bash +# Larger model +python3 tiny_openfold_v1.py \ + --msa-dim 128 \ + --pair-dim 256 \ + --num-blocks 8 \ + --seq-len 128 \ + --batch-size 2 + +# With all profiling features enabled (includes memory profiling) +python3 tiny_openfold_v1.py \ + --enable-all-profiling \ + --profile-dir ./complete_analysis + +# Mixed precision training +python3 tiny_openfold_v1.py --use-amp --batch-size 8 +``` + +### Multi-GPU Training + +TinyOpenFold supports multi-GPU training using PyTorch's `nn.DataParallel`: + +```bash +# Single GPU (explicit) +python3 tiny_openfold_v1.py --device 0 --batch-size 8 + +# Multi-GPU via environment variables (automatic) +# ROCm (AMD GPUs) +ROCR_VISIBLE_DEVICES=0,1,2,3 python3 tiny_openfold_v1.py --batch-size 32 + +# CUDA (NVIDIA GPUs) +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 tiny_openfold_v1.py --batch-size 32 + +# Disable DataParallel even with multiple GPUs visible +python3 tiny_openfold_v1.py --no-data-parallel --device 0 +``` + +**Best Practice:** Scale batch size proportionally with GPU count (e.g., 8 samples per GPU). 
+ +### Scaling Studies + +Run multi-GPU scaling experiments to measure performance: + +```bash +cd version1_pytorch_baseline + +# Quick scaling test (1, 2, 4, 8 GPUs) +chmod +x quick_scaling_test.sh +./quick_scaling_test.sh + +# Comprehensive scaling study with custom options +chmod +x run.sh +./run.sh --gpus "1 2 4 8" --batch-per-gpu 8 --steps 100 + +# With mixed precision +./run.sh --amp --steps 50 + +# Multiple runs for statistics +./run.sh --runs 3 --output-dir scaling_analysis +``` + +**Example Output:** +``` +GPUs Throughput (s/s) Speedup Efficiency +---- ---------------- ------- ---------- +1 166.9 1.00x 100.0% +2 202.7 1.21x 60.5% +4 245.3 1.47x 36.8% +8 249.1 1.49x 18.6% +``` + +See [`version1_pytorch_baseline/README.md`](version1_pytorch_baseline/README.md) for detailed multi-GPU documentation. + +## Architecture Overview + +### The Evoformer + +The Evoformer is the heart of AlphaFold 2, processing two coupled representations: + +1. **MSA Representation** `(N_seqs × N_res × msa_dim)` + - Features for each residue in each sequence of the MSA + - Updated via row-wise and column-wise attention + +2. 
**Pair Representation** `(N_res × N_res × pair_dim)` + - Pairwise features between all residues + - Updated via triangle operations and attention + +### Key Components + +#### MSA Processing +- **Row-wise Attention**: Attention across residues within each MSA sequence, biased by pair representation +- **Column-wise Attention**: Communication between different sequences at each position +- **MSA Transition**: Point-wise feed-forward network + +#### Pair Processing +- **Outer Product Mean**: Projects MSA patterns onto pairwise space +- **Triangle Multiplicative Updates**: Geometric reasoning (if i-j and j-k are close, i-k should be considered) +- **Triangle Self-Attention**: Attention over edges in the residue graph +- **Pair Transition**: Point-wise feed-forward network + +#### Structure Module +- Simplified distance prediction from pair representation +- In full AlphaFold 2, this is the Invariant Point Attention (IPA) module + +### Parameter Count + +**Default Configuration (TinyOpenFoldConfig)**: +- MSA dim: 64, Pair dim: 128 +- Evoformer blocks: 4 +- Total parameters: **~2.64M** +- Model size: **~10.6 MB (FP32)**, **~5.3 MB (FP16)** + +See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed parameter calculations. + +## Directory Structure + +``` +TinyOpenFold/ +├── README.md # This file +├── ARCHITECTURE.md # Detailed architecture documentation +└── version1_pytorch_baseline/ + ├── tiny_openfold_v1.py # Main implementation + └── README.md # Version-specific guide +``` + +## Performance Characteristics + +### Computational Complexity + +The Evoformer has interesting scaling properties: + +- **MSA Row Attention**: O(N_seqs × N_res² × msa_dim) +- **MSA Column Attention**: O(N_res × N_seqs² × msa_dim) +- **Triangle Operations**: O(N_res³ × pair_dim) ⚠️ Most expensive! 
+- **Outer Product**: O(N_seqs × N_res² × outer_dim²) + +For typical configurations (N_res=64-256): +- Triangle operations dominate computational cost +- Memory usage grows quadratically with sequence length (pair representation) +- MSA depth affects column attention cost + +### Typical Performance + +*Hardware: AMD MI250X / NVIDIA A100* + +| Config | Seq Len | MSA Seqs | Params | Memory | Speed | +|--------|---------|----------|--------|--------|-------| +| Small | 64 | 16 | 2.6M | ~100 MB | ~8-10 samples/sec | +| Medium | 128 | 32 | 10.5M | ~400 MB | ~2-3 samples/sec | +| Large | 256 | 64 | 42M | ~1.6 GB | ~0.5-1 samples/sec | + +*Note: Performance varies significantly by hardware and configuration* + +## Educational Use Cases + +### 1. Understanding AlphaFold 2 + +Study how the key innovations work: +- Examine `EvoformerBlock` to see how MSA and pair representations interact +- Explore `TriangleMultiplication` to understand geometric reasoning +- Analyze `MSARowAttentionWithPairBias` to see how pair info guides MSA attention + +### 2. Profiling and Optimization + +Use this as a baseline for optimization experiments: +- Profile with PyTorch Profiler to identify bottlenecks +- Experiment with different attention implementations +- Test kernel fusion opportunities +- Compare with production implementations + +### 3. Research and Experimentation + +Modify the architecture to test ideas: +- Change attention patterns +- Experiment with different update mechanisms +- Test alternative structure modules +- Implement custom operators + +## Differences from Production AlphaFold 2 + +This is an **educational simplification**. 
Key differences: + +| Aspect | TinyOpenFold | AlphaFold 2 | +|--------|--------------|-------------| +| Evoformer blocks | 4 | 48 | +| Dimensions | 64/128 | 256/128 | +| Templates | ❌ None | ✅ Template featurization | +| Structure Module | Simple distance prediction | Full IPA with frames | +| Recycling | ❌ Single pass | ✅ Multiple iterations | +| Data | Synthetic | Real MSAs and structures | +| Purpose | Education/Profiling | Production prediction | + +## Command Line Options + +```bash +# Model Configuration +--msa-dim 64 # MSA representation dimension +--pair-dim 128 # Pair representation dimension +--num-blocks 4 # Number of Evoformer blocks +--num-seqs 16 # Number of MSA sequences +--seq-len 64 # Sequence length (number of residues) + +# Training Configuration +--num-steps 50 # Training iterations +--batch-size 4 # Batch size +--learning-rate 3e-4 # Learning rate +--use-amp # Enable mixed precision + +# Profiling Options +--enable-pytorch-profiler # Enable PyTorch profiler +--enable-memory-profiling # Track memory usage +--enable-all-profiling # Enable all profiling features +--profile-dir ./profiles # Output directory for profiles +--warmup-steps 3 # Profiler warmup steps +--profile-steps 5 # Steps to profile + +# Utilities +--validate-setup # Run validation checks +``` + +## Understanding the Output + +During training, you'll see: + +``` +Model Configuration: + MSA dimension: 64 + Pair dimension: 128 + Evoformer blocks: 4 + Total parameters: 2,641,728 + Model size: 10.6 MB (FP32) + +Training Configuration: + Training steps: 50 + Batch size: 4 + Device: CUDA + +Step 0/50 | Loss: 45.2341 | Speed: 8.5 samples/sec | Memory: 102.3 MB | Time: 470.2ms +Step 10/50 | Loss: 38.7123 | Speed: 9.1 samples/sec | Memory: 102.3 MB | Time: 439.5ms +``` + +**Key Metrics**: +- **Loss**: MSE on predicted distances (should decrease over time) +- **Speed**: Samples processed per second +- **Memory**: GPU memory allocated +- **Time**: Time per training step + +## Troubleshooting + 
+### Out of Memory + +If you encounter OOM errors: + +```bash +# Reduce batch size +python3 tiny_openfold_v1.py --batch-size 2 + +# Reduce sequence length +python3 tiny_openfold_v1.py --seq-len 32 + +# Reduce MSA sequences +python3 tiny_openfold_v1.py --num-seqs 8 + +# Use mixed precision +python3 tiny_openfold_v1.py --use-amp +``` + +### Slow Performance + +The triangle operations are O(N³) and can be slow: + +```bash +# Use smaller sequences +python3 tiny_openfold_v1.py --seq-len 32 + +# Reduce Evoformer blocks +python3 tiny_openfold_v1.py --num-blocks 2 + +# Profile to identify bottlenecks +python3 tiny_openfold_v1.py --enable-pytorch-profiler +``` + +## Further Reading + +### AlphaFold 2 Resources + +- **Paper**: [Jumper et al., "Highly accurate protein structure prediction with AlphaFold", Nature 2021](https://www.nature.com/articles/s41586-021-03819-2) +- **Supplement**: Detailed architectural descriptions +- **OpenFold**: https://github.com/aqlaboratory/openfold - Full production implementation +- **AlphaFold GitHub**: https://github.com/deepmind/alphafold - Original DeepMind code + +### Understanding the Evoformer + +- AlphaFold 2 Supplement, Section 1.6: Evoformer architecture +- Section 1.6.5-1.6.6: Triangular multiplicative updates and triangular self-attention +- Section 1.6.4: Outer product mean +- Section 1.8: Structure module and IPA + +### Related Topics + +- **Attention Mechanisms**: Understanding multi-head attention +- **Geometric Deep Learning**: Graph neural networks for 3D structures +- **Protein Structure Prediction**: MSAs, templates, and structural biology + +## Contributing + +This is an educational project. 
Improvements welcome: + +- Enhanced documentation +- Additional visualization tools +- Performance optimizations +- Extended architecture variants + +## Citation + +If you use TinyOpenFold in your work, please cite both this implementation and the original AlphaFold 2: + +```bibtex +@article{jumper2021alphafold, + title={Highly accurate protein structure prediction with AlphaFold}, + author={Jumper, John and Evans, Richard and Pritzel, Alexander and others}, + journal={Nature}, + volume={596}, + number={7873}, + pages={583--589}, + year={2021}, + publisher={Nature Publishing Group} +} +``` + +## License + +Apache 2.0 License - See LICENSE file for details + +## Acknowledgments + +- Based on AlphaFold 2 by DeepMind +- Inspired by OpenFold (https://github.com/aqlaboratory/openfold) +- Educational structure follows TinyLLaMA example + +--- + +**Ready to explore AlphaFold 2? Start with:** + +```bash +cd version1_pytorch_baseline +python3 tiny_openfold_v1.py --validate-setup +``` + diff --git a/MLExamples/TinyOpenFold/optimization_tutorial.sh b/MLExamples/TinyOpenFold/optimization_tutorial.sh new file mode 100755 index 00000000..93061c8f --- /dev/null +++ b/MLExamples/TinyOpenFold/optimization_tutorial.sh @@ -0,0 +1,306 @@ +#!/bin/bash +################################################################################ +# TinyOpenFold: Complete Optimization Tutorial +# Progressive performance improvement: V1 → V2 → V3 +# Demonstrates 2.0x speedup through systematic optimization +################################################################################ + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +MAGENTA='\033[0;35m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +# Test configuration +BASEDIR="/mnt/thera/data/incoming/asimishr/aiml_prof/HPCTrainingExamples/MLExamples/TinyOpenFold" +V1_DIR="$BASEDIR/version1_pytorch_baseline" +V2_DIR="$BASEDIR/version2_pytorch_fused" 
V3_DIR="$BASEDIR/version3_triton"
DEVICE=0
STEPS=30

# Setup environment
# `clear` exits non-zero when no terminal is attached (e.g. batch jobs), which
# would abort the whole tutorial under `set -e`; treat it as best-effort.
clear 2>/dev/null || true
echo -e "${BOLD}${CYAN}╔══════════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BOLD}${CYAN}║ ║${NC}"
echo -e "${BOLD}${CYAN}║ TinyOpenFold Performance Optimization Tutorial ║${NC}"
echo -e "${BOLD}${CYAN}║ ║${NC}"
echo -e "${BOLD}${CYAN}║ Progressive Optimization: V1 → V2 → V3 ║${NC}"
echo -e "${BOLD}${CYAN}║ Learn GPU optimization through practice! ║${NC}"
echo -e "${BOLD}${CYAN}║ ║${NC}"
echo -e "${BOLD}${CYAN}╚══════════════════════════════════════════════════════════════════╝${NC}"
echo ""

echo -e "${CYAN}[Step 1/7] Setting up environment...${NC}"
module load python/3.12 rocm/7.2 libffi/3.3
source "$BASEDIR/venvOF/bin/activate"
echo -e "${GREEN}✓ Environment ready${NC}"
echo ""

echo -e "${CYAN}[Step 2/7] Verifying GPU...${NC}"
# Query the device once: line 1 is the GPU name, line 2 the PyTorch version.
# The name is reused below so the results file records the GPU actually used.
gpu_info=$(python3 -c "import torch; print(torch.cuda.get_device_name(0)); print(torch.__version__)")
GPU_NAME=${gpu_info%%$'\n'*}
TORCH_VERSION=${gpu_info##*$'\n'}
echo " GPU: ${GPU_NAME}"
echo " PyTorch: ${TORCH_VERSION}"
echo -e "${GREEN}✓ GPU verified${NC}"
echo ""

# Results file
RESULTS_FILE="$BASEDIR/tutorial_results_$(date +%Y%m%d_%H%M%S).txt"
{
    echo "TinyOpenFold Optimization Tutorial Results"
    echo "Date: $(date)"
    echo "GPU: ${GPU_NAME}"
    echo "================================================"
    echo ""
} > "$RESULTS_FILE"

echo -e "${BOLD}${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BOLD}${BLUE} Part 1: Small Problem (64 residues, 16 MSA, batch=4)${NC}"
echo -e "${BOLD}${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""

# Print the last numeric value that follows a metric label in a log file.
#   $1  PCRE matching the label up to and including its colon
#   $2  log file to search
# Prints nothing when the label does not occur in the log.
_extract_metric() {
    grep -oP "${1}\s+\K[\d.]+" "$2" | tail -1
}

# Run one training benchmark and print its averaged metrics.
#
# Arguments:
#   $1 version     "V1", "V2" or "V3"; selects tiny_openfold_v<N>.py
#   $2 name        problem label echoed back in the result (e.g. "Small")
#   $3 seq_len     value passed to --seq-len
#   $4 num_seqs    value passed to --num-seqs
#   $5 batch_size  value passed to --batch-size
#   $6 workdir     directory containing that version's training script
# Output (stdout, one line):
#   version|name|speed|batch_time|forward|backward|optimizer|memory
# Returns the training script's exit code (with the tail of its log on
# stderr) when the run fails, so callers never parse metrics of a failed run.
run_test() {
    local version=$1
    local name=$2
    local seq_len=$3
    local num_seqs=$4
    local batch_size=$5
    local workdir=$6

    # Name the script explicitly rather than globbing tiny_openfold_v*.py,
    # which breaks as soon as a directory holds more than one match.
    local script="tiny_openfold_v${version#V}.py"

    # Private log per invocation: a fixed /tmp path is clobbered by
    # concurrent tutorial runs and by other users on a shared node.
    local log
    log=$(mktemp "${TMPDIR:-/tmp}/tinyopenfold_${version}_XXXXXX")

    local -a cmd=(
        python3 "$script"
        --seq-len "$seq_len"
        --num-seqs "$num_seqs"
        --batch-size "$batch_size"
        --num-blocks 4
        --num-steps "$STEPS"
    )

    # Run test
    # V1 selects the GPU with --device; V2/V3 rely on ROCR_VISIBLE_DEVICES.
    # stdout must stay clean because callers capture it as the result line.
    local status=0
    if [ "$version" == "V1" ]; then
        (cd "$workdir" && "${cmd[@]}" --device "$DEVICE") > "$log" 2>&1 || status=$?
    else
        (cd "$workdir" && ROCR_VISIBLE_DEVICES=$DEVICE "${cmd[@]}") > "$log" 2>&1 || status=$?
    fi

    if [ "$status" -ne 0 ]; then
        echo "ERROR: $version ($name) run failed with exit code $status; last log lines:" >&2
        tail -n 20 "$log" >&2
        rm -f "$log"
        return "$status"
    fi

    # Extract metrics
    local speed batch_time forward_time backward_time optimizer_time memory
    speed=$(_extract_metric 'Average training speed:' "$log")
    batch_time=$(_extract_metric 'Average batch time:' "$log")
    forward_time=$(_extract_metric 'Average forward time:' "$log")
    backward_time=$(_extract_metric 'Average backward time:' "$log")
    optimizer_time=$(_extract_metric 'Average optimizer time:' "$log")
    memory=$(_extract_metric 'Peak memory.*:' "$log")
    rm -f "$log"

    echo "$version|$name|$speed|$batch_time|$forward_time|$backward_time|$optimizer_time|$memory"
}

# Small problem results
declare -a SMALL_RESULTS=()

# V1 - Small
echo -e "${YELLOW}[Step 3/7] Stage 1: Baseline (V1) - Small problem${NC}"
echo -e " ${CYAN}Running pure PyTorch implementation...${NC}"
result_v1_small=$(run_test "V1" "Small" 64 16 4 "$V1_DIR")
SMALL_RESULTS+=("$result_v1_small")
IFS='|' read -r v ver speed batch fwd bwd opt mem <<< "$result_v1_small"
echo -e " ${GREEN}✓ Complete${NC} - Speed: ${BOLD}${speed} samples/sec${NC}, Batch: ${batch} ms"
echo ""

# V2 - Small
echo -e "${YELLOW}[Step 4/7] Stage 2: Kernel Fusion (V2) - Small problem${NC}"
echo -e " ${CYAN}Running with QKV fusion + Flash Attention...${NC}"
result_v2_small=$(run_test "V2" "Small" 64 16 4 "$V2_DIR")
SMALL_RESULTS+=("$result_v2_small")
IFS='|' read -r v ver speed batch fwd bwd opt mem <<< "$result_v2_small"

# Calculate improvement
IFS='|' read -r _ _ v1_speed v1_batch _ _ _ _ <<< "$result_v1_small"
speedup=$(awk "BEGIN {printf \"%.2f\", $speed / $v1_speed}")
improvement=$(awk "BEGIN {printf \"%.0f\", ($speed / $v1_speed - 1) * 100}")
echo -e " ${GREEN}✓ Complete${NC} - Speed: ${BOLD}${speed} samples/sec${NC}, Batch: ${batch} ms"
echo -e " ${MAGENTA}→ Speedup: ${BOLD}${speedup}x${NC} (${GREEN}+${improvement}%${NC})"
echo ""

# V3 - Small
echo -e "${YELLOW}[Step 5/7] Stage 3: Custom Triton Kernels (V3) - Small problem${NC}"
echo -e " ${CYAN}Running with custom LayerNorm + Flash Attention kernels...${NC}"
result_v3_small=$(run_test "V3" "Small" 64 16 4 "$V3_DIR")
SMALL_RESULTS+=("$result_v3_small")
IFS='|' read -r v ver speed batch fwd bwd opt mem <<< "$result_v3_small"

# Calculate improvement
# The third '|'-separated field of a result line is the training speed.
IFS='|' read -r _ _ v2_speed _ <<< "$result_v2_small"
speedup_v2=$(awk "BEGIN {printf \"%.2f\", $speed / $v2_speed}")
speedup_v1=$(awk "BEGIN {printf \"%.2f\", $speed / $v1_speed}")
improvement_v1=$(awk "BEGIN {printf \"%.0f\", ($speed / $v1_speed - 1) * 100}")
echo -e " ${GREEN}✓ Complete${NC} - Speed: ${BOLD}${speed} samples/sec${NC}, Batch: ${batch} ms"
echo -e " ${MAGENTA}→ Speedup vs V2: ${BOLD}${speedup_v2}x${NC}"
echo -e " ${MAGENTA}→ Speedup vs V1: ${BOLD}${speedup_v1}x${NC} (${GREEN}+${improvement_v1}%${NC}) ${BOLD}⚡⚡⚡${NC}"
echo ""

echo -e "${BOLD}${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BOLD}${BLUE} Part 2: Medium Problem (128 residues, 32 MSA, batch=2)${NC}"
echo -e "${BOLD}${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""

# Medium problem results
declare -a MEDIUM_RESULTS=()

# V1-V2-V3 Medium (compact output)
echo -e "${YELLOW}[Step 6/7] Running all versions on medium problem...${NC}"

echo -e " ${CYAN}V1 Baseline...${NC}"
result_v1_med=$(run_test "V1" "Medium" 128 32 2 "$V1_DIR")
MEDIUM_RESULTS+=("$result_v1_med")
IFS='|' read -r v ver speed batch fwd bwd opt mem <<< "$result_v1_med"
echo -e " ${GREEN}✓ V1${NC} - ${speed} samples/sec"

echo -e " ${CYAN}V2 Fused...${NC}"
+result_v2_med=$(run_test "V2" "Medium" 128 32 2 "$V2_DIR") +MEDIUM_RESULTS+=("$result_v2_med") +IFS='|' read -r v ver speed batch fwd bwd opt mem <<< "$result_v2_med" +echo -e " ${GREEN}✓ V2${NC} - ${speed} samples/sec" + +echo -e " ${CYAN}V3 Triton...${NC}" +result_v3_med=$(run_test "V3" "Medium" 128 32 2 "$V3_DIR") +MEDIUM_RESULTS+=("$result_v3_med") +IFS='|' read -r v ver speed batch fwd bwd opt mem <<< "$result_v3_med" +echo -e " ${GREEN}✓ V3${NC} - ${speed} samples/sec" +echo "" + +# Generate comprehensive summary +echo -e "${BOLD}${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +echo -e "${BOLD}${CYAN} [Step 7/7] Performance Summary & Analysis${NC}" +echo -e "${BOLD}${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +echo "" + +echo "PERFORMANCE SUMMARY" >> $RESULTS_FILE +echo "================================================" >> $RESULTS_FILE +echo "" >> $RESULTS_FILE + +echo -e "${BOLD}Small Problem (64 residues):${NC}" +echo "" >> $RESULTS_FILE +echo "Small Problem (64 residues):" >> $RESULTS_FILE +printf "${MAGENTA}%-8s %-12s %-12s %-12s %-12s %-10s${NC}\n" "Version" "Speed(s/s)" "Batch(ms)" "Forward(ms)" "Backward(ms)" "Speedup" +printf "%-8s %-12s %-12s %-12s %-12s %-10s\n" "Version" "Speed(s/s)" "Batch(ms)" "Forward(ms)" "Backward(ms)" "Speedup" >> $RESULTS_FILE +echo "────────────────────────────────────────────────────────────────────────" +echo "────────────────────────────────────────────────────────────────────────" >> $RESULTS_FILE + +# Extract V1 small baseline +IFS='|' read -r _ _ v1s_speed v1s_batch v1s_fwd v1s_bwd _ _ <<< "${SMALL_RESULTS[0]}" + +for i in "${!SMALL_RESULTS[@]}"; do + IFS='|' read -r ver name speed batch fwd bwd opt mem <<< "${SMALL_RESULTS[$i]}" + + if [ "$i" -eq 0 ]; then + speedup="1.0x" + else + speedup=$(awk "BEGIN {printf \"%.2fx\", $speed / $v1s_speed}") + fi + + printf "%-8s %-12s %-12s %-12s %-12s %-10s\n" "$ver" "$speed" "$batch" "$fwd" "$bwd" "$speedup" + printf 
"%-8s %-12s %-12s %-12s %-12s %-10s\n" "$ver" "$speed" "$batch" "$fwd" "$bwd" "$speedup" >> $RESULTS_FILE +done + +echo "" +echo "" >> $RESULTS_FILE + +echo -e "${BOLD}Medium Problem (128 residues):${NC}" +echo "Medium Problem (128 residues):" >> $RESULTS_FILE +printf "${MAGENTA}%-8s %-12s %-12s %-12s %-12s %-10s${NC}\n" "Version" "Speed(s/s)" "Batch(ms)" "Forward(ms)" "Backward(ms)" "Speedup" +printf "%-8s %-12s %-12s %-12s %-12s %-10s\n" "Version" "Speed(s/s)" "Batch(ms)" "Forward(ms)" "Backward(ms)" "Speedup" >> $RESULTS_FILE +echo "────────────────────────────────────────────────────────────────────────" +echo "────────────────────────────────────────────────────────────────────────" >> $RESULTS_FILE + +# Extract V1 medium baseline +IFS='|' read -r _ _ v1m_speed v1m_batch v1m_fwd v1m_bwd _ _ <<< "${MEDIUM_RESULTS[0]}" + +for i in "${!MEDIUM_RESULTS[@]}"; do + IFS='|' read -r ver name speed batch fwd bwd opt mem <<< "${MEDIUM_RESULTS[$i]}" + + if [ "$i" -eq 0 ]; then + speedup="1.0x" + else + speedup=$(awk "BEGIN {printf \"%.2fx\", $speed / $v1m_speed}") + fi + + printf "%-8s %-12s %-12s %-12s %-12s %-10s\n" "$ver" "$speed" "$batch" "$fwd" "$bwd" "$speedup" + printf "%-8s %-12s %-12s %-12s %-12s %-10s\n" "$ver" "$speed" "$batch" "$fwd" "$bwd" "$speedup" >> $RESULTS_FILE +done + +echo "" +echo "" >> $RESULTS_FILE + +# Key insights +echo -e "${BOLD}${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +echo -e "${BOLD}${GREEN} Key Insights${NC}" +echo -e "${BOLD}${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +echo "" + +echo "Key Insights:" >> $RESULTS_FILE +echo "" >> $RESULTS_FILE + +# Calculate final speedups +IFS='|' read -r _ _ v3s_speed v3s_batch v3s_fwd v3s_bwd _ _ <<< "${SMALL_RESULTS[2]}" +IFS='|' read -r _ _ v3m_speed v3m_batch v3m_fwd v3m_bwd _ _ <<< "${MEDIUM_RESULTS[2]}" + +small_speedup=$(awk "BEGIN {printf \"%.1fx\", $v3s_speed / $v1s_speed}") +medium_speedup=$(awk "BEGIN {printf \"%.2fx\", 
$v3m_speed / $v1m_speed}") +small_bwd_reduction=$(awk "BEGIN {printf \"%.0f\", (1 - $v3s_bwd / $v1s_bwd) * 100}") +medium_bwd_reduction=$(awk "BEGIN {printf \"%.0f\", (1 - $v3m_bwd / $v1m_bwd) * 100}") + +echo -e " ${BOLD}1. Progressive Optimization Works!${NC}" +echo -e " • Small problem: ${GREEN}${small_speedup} total speedup${NC} (V1 → V3)" +echo -e " • Medium problem: ${GREEN}${medium_speedup} total speedup${NC} (V1 → V3)" +echo "" +echo " 1. Progressive Optimization Works!" >> $RESULTS_FILE +echo " • Small problem: ${small_speedup} total speedup (V1 → V3)" >> $RESULTS_FILE +echo " • Medium problem: ${medium_speedup} total speedup (V1 → V3)" >> $RESULTS_FILE +echo "" >> $RESULTS_FILE + +echo -e " ${BOLD}2. Backward Pass is Key Bottleneck${NC}" +echo -e " • Small: ${v1s_bwd} ms → ${v3s_bwd} ms (${GREEN}-${small_bwd_reduction}%${NC})" +echo -e " • Medium: ${v1m_bwd} ms → ${v3m_bwd} ms (${GREEN}-${medium_bwd_reduction}%${NC})" +echo "" +echo " 2. Backward Pass is Key Bottleneck" >> $RESULTS_FILE +echo " • Small: ${v1s_bwd} ms → ${v3s_bwd} ms (-${small_bwd_reduction}%)" >> $RESULTS_FILE +echo " • Medium: ${v1m_bwd} ms → ${v3m_bwd} ms (-${medium_bwd_reduction}%)" >> $RESULTS_FILE +echo "" >> $RESULTS_FILE + +echo -e " ${BOLD}3. Optimization Stages${NC}" +echo -e " • V1 → V2: High-level kernel fusion (32% & 18% gain)" +echo -e " • V2 → V3: Custom Triton kernels (additional 53% & 40% gain)" +echo -e " • Each stage builds on previous improvements" +echo "" +echo " 3. Optimization Stages" >> $RESULTS_FILE +echo " • V1 → V2: High-level kernel fusion (32% & 18% gain)" >> $RESULTS_FILE +echo " • V2 → V3: Custom Triton kernels (additional 53% & 40% gain)" >> $RESULTS_FILE +echo "" >> $RESULTS_FILE + +echo "" +echo -e "${BOLD}${GREEN}╔══════════════════════════════════════════════════════════════════╗${NC}" +echo -e "${BOLD}${GREEN}║ ║${NC}" +echo -e "${BOLD}${GREEN}║ ✓ Tutorial Complete! 
║${NC}" +echo -e "${BOLD}${GREEN}║ ║${NC}" +echo -e "${BOLD}${GREEN}║ You've learned the complete GPU optimization pipeline: ║${NC}" +echo -e "${BOLD}${GREEN}║ 1. Baseline measurement & profiling ║${NC}" +echo -e "${BOLD}${GREEN}║ 2. High-level kernel fusion ║${NC}" +echo -e "${BOLD}${GREEN}║ 3. Custom GPU kernels with Triton ║${NC}" +echo -e "${BOLD}${GREEN}║ ║${NC}" +echo -e "${BOLD}${GREEN}║ Achievement: ${BOLD}${YELLOW}${small_speedup} speedup${GREEN} on small problems! 🚀 ║${NC}" +echo -e "${BOLD}${GREEN}║ ║${NC}" +echo -e "${BOLD}${GREEN}╚══════════════════════════════════════════════════════════════════╝${NC}" +echo "" +echo -e "📊 Full results saved to: ${CYAN}$RESULTS_FILE${NC}" +echo -e "📖 See ${CYAN}PERFORMANCE_OPTIMIZATION_TUTORIAL.md${NC} for detailed explanations" +echo "" + +echo "Tutorial completed at: $(date)" >> $RESULTS_FILE diff --git a/MLExamples/TinyOpenFold/setup/requirements.txt b/MLExamples/TinyOpenFold/setup/requirements.txt new file mode 100644 index 00000000..8849dd13 --- /dev/null +++ b/MLExamples/TinyOpenFold/setup/requirements.txt @@ -0,0 +1,78 @@ +annotated-types==0.7.0 +astunparse==1.6.2 +blinker==1.9.0 +certifi==2026.1.4 +charset-normalizer==3.4.4 +click==8.3.1 +colorlover==0.3.0 +contourpy==1.3.3 +cycler==0.12.1 +dash==3.4.0 +dash-bootstrap-components==2.0.4 +dash-svg==0.0.12 +deepspeed==0.18.4 +dnspython==2.8.0 +einops==0.8.1 +filelock==3.20.3 +Flask==3.1.2 +fonttools==4.61.1 +fsspec==2026.1.0 +greenlet==3.3.1 +hjson==3.1.0 +idna==3.11 +importlib_metadata==8.7.1 +itsdangerous==2.2.0 +Jinja2==3.1.6 +kaleido==0.2.1 +kiwisolver==1.4.9 +linkify-it-py==2.0.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.2 +matplotlib==3.10.8 +mdit-py-plugins==0.5.0 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.1.2 +narwhals==2.16.0 +nest-asyncio==1.6.0 +networkx==3.6.1 +ninja==1.13.0 +numpy==2.4.1 +packaging==26.0 +pandas==3.0.0 +pillow==12.1.0 +platformdirs==4.5.1 +plotext==5.3.2 +plotille==6.0.0 +plotly==6.5.2 +psutil==7.2.1 +py-cpuinfo==9.0.0 +pydantic==2.12.5 
+pydantic_core==2.41.5 +Pygments==2.19.2 +pymongo==4.16.0 +pyparsing==3.3.2 +python-dateutil==2.9.0.post0 +PyYAML==6.0.3 +requests==2.32.5 +retrying==1.4.2 +rich==14.3.2 +six==1.17.0 +SQLAlchemy==2.0.46 +sympy==1.14.0 +tabulate==0.9.0 +textual==7.5.0 +textual-fspicker==0.6.0 +textual-plotext==1.0.1 +torch==2.11.0.dev20260202+rocm7.1 +torchaudio==2.11.0.dev20260203+rocm7.1 +torchvision==0.25.0.dev20260203+rocm7.1 +tqdm==4.67.1 +triton==3.6.0+git9844da95 +triton-rocm==3.6.0+git9844da95 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +uc-micro-py==1.0.3 +urllib3==2.6.3 +Werkzeug==3.1.5 +zipp==3.23.0 diff --git a/MLExamples/TinyOpenFold/setup/requirements_rocprof-compute-develop.txt b/MLExamples/TinyOpenFold/setup/requirements_rocprof-compute-develop.txt new file mode 100644 index 00000000..e3c5e5bd --- /dev/null +++ b/MLExamples/TinyOpenFold/setup/requirements_rocprof-compute-develop.txt @@ -0,0 +1,13 @@ +astunparse==1.6.2 +dash-bootstrap-components==2.0.4 +dash-svg==0.0.12 +dash==3.4.0 +numpy==1.26.4 +pandas==2.2.3 +plotext==5.3.2 +plotille==5.0.0 +pyyaml==6.0.3 +sqlalchemy==2.0.46 +tabulate==0.9.0 +textual==7.3.0 +textual_plotext==1.0.1 diff --git a/MLExamples/TinyOpenFold/version1_pytorch_baseline/FLOPS_ANALYSIS.md b/MLExamples/TinyOpenFold/version1_pytorch_baseline/FLOPS_ANALYSIS.md new file mode 100644 index 00000000..b54a2d9d --- /dev/null +++ b/MLExamples/TinyOpenFold/version1_pytorch_baseline/FLOPS_ANALYSIS.md @@ -0,0 +1,125 @@ +# DeepSpeed FLOPS Analysis for TinyOpenFold + +Analyze computational efficiency and FLOPS breakdown of the Evoformer architecture using DeepSpeed profiling tools. 
+ +## Quick Start + +```bash +# Basic FLOPS analysis +./run_deepspeed_flops.sh + +# Comprehensive analysis with all features +./run_deepspeed_flops.sh --all + +# Custom configuration +./run_deepspeed_flops.sh --batch-size 8 --seq-len 128 --num-blocks 8 + +# Install DeepSpeed if needed +pip install deepspeed +``` + +## What You Get + +The FLOPS profiler provides: +- **Total FLOPS** per training step +- **FLOPS breakdown** by component (MSA attention, triangle multiplication, etc.) +- **Model FLOPS Utilization (MFU)** - GPU efficiency metric +- **Computational intensity** - memory vs compute bound classification +- **Roofline model data** - optimization recommendations + +**Example Output:** +``` +FLOPS Analysis Summary: + Total FLOPS per step: 2.45e+11 + Model FLOPS Utilization: 15.3% + +Evoformer FLOPS Breakdown: + msa_attention: 8.32e+10 (34.0%) + triangle_multiplication: 6.21e+10 (25.4%) + pair_transition: 4.15e+10 (17.0%) +``` + +## Key Metrics + +### Model FLOPS Utilization (MFU) + +``` +MFU = (Achieved FLOPS) / (Peak GPU FLOPS) × 100% +``` + +**Targets:** +- < 20%: Heavy overhead, needs kernel fusion +- 20-40%: Typical unoptimized baseline +- 40-60%: Good optimization +- 60-80%: Excellent (state-of-the-art) + +### Computational Intensity + +```bash +./run_deepspeed_flops.sh --intensity +``` + +**Classification:** +- < 10 FLOPS/byte: Memory-bound +- 10-50 FLOPS/byte: Balanced +- \> 50 FLOPS/byte: Compute-bound + +## Common Commands + +```bash +# Identify bottlenecks +./run_deepspeed_flops.sh --all --output-dir analysis +cat analysis/flops_profile.json | jq '.flops_analysis.evoformer_breakdown' + +# Multi-GPU analysis +./run_deepspeed_flops.sh --multi-gpu --output-dir multi_gpu_results + +# Specific GPUs +./run_deepspeed_flops.sh --devices "0,1,2,3" + +# Roofline analysis +./run_deepspeed_flops.sh --roofline --output-dir roofline_data +``` + +## Command-Line Options + +| Option | Description | Default | +|--------|-------------|---------| +| `--batch-size ` | 
Batch size | 4 | +| `--seq-len ` | Sequence length | 64 | +| `--num-blocks ` | Evoformer blocks | 4 | +| `--device ` | GPU device ID | default | +| `--multi-gpu` | Profile all GPUs | false | +| `--devices ` | Specific GPUs (e.g., "0,1,2") | none | +| `--all` | All analysis types | false | +| `--roofline` | Roofline analysis | false | +| `--intensity` | Computational intensity | false | + +## Output Files + +- `flops_profile.json` - Complete FLOPS analysis and efficiency metrics +- `computational_intensity.json` - Memory bandwidth analysis +- `roofline_data.json` - Roofline model data + +## Optimization Priorities + +Based on FLOPS breakdown: + +1. **Triangle Multiplication > 25%**: Implement fused kernels (30-40% improvement) +2. **MSA Attention > 30%**: Use Flash Attention (2-3x speedup) +3. **Low MFU (< 20%)**: Apply kernel fusion, reduce Python overhead +4. **Memory-bound (AI < 10)**: Use mixed precision, optimize memory access + +## GPU Specifications + +| GPU | Peak FP32 TFLOPS | Memory Bandwidth | Target MFU | +|-----|------------------|------------------|------------| +| AMD MI300X | 163.4 | 5300 GB/s | 40-60% | +| NVIDIA H100 | 67 | 3350 GB/s | 45-65% | +| NVIDIA A100 | 19.5 | 2039 GB/s | 35-55% | + +## References + +- [DeepSpeed FLOPS Profiler](https://www.deepspeed.ai/tutorials/flops-profiler/) +- [Roofline Model](https://en.wikipedia.org/wiki/Roofline_model) +- Main documentation: `README.md` diff --git a/MLExamples/TinyOpenFold/version1_pytorch_baseline/README.md b/MLExamples/TinyOpenFold/version1_pytorch_baseline/README.md new file mode 100644 index 00000000..1efedb97 --- /dev/null +++ b/MLExamples/TinyOpenFold/version1_pytorch_baseline/README.md @@ -0,0 +1,734 @@ +# TinyOpenFold V1: PyTorch Baseline + +Educational implementation of AlphaFold 2's Evoformer architecture with comprehensive profiling integration. 
+ +## Overview + +This version provides a clean, well-documented baseline implementation of the core AlphaFold 2 architecture, focusing on the **Evoformer** blocks that process MSA (Multiple Sequence Alignment) and pair representations. + +## Quick Start + +### Basic Training Run + +```bash +# Default configuration: 64 residues, 16 MSA sequences, 4 Evoformer blocks +python tiny_openfold_v1.py --batch-size 4 --num-steps 30 + +# Expected output: +# Model Configuration: +# MSA dimension: 64 +# Pair dimension: 128 +# Evoformer blocks: 4 +# Total parameters: 2,641,728 +# Model size: 10.6 MB (FP32) +# +# Training steps complete with loss decreasing +``` + +### Validation Check + +```bash +# Verify your environment is set up correctly +python tiny_openfold_v1.py --validate-setup + +# Should output: +# Validation successful! Environment ready. +``` + +## Architecture Components + +### 1. MSA Representation Processing + +**MSA Row-wise Attention with Pair Bias** +- Attends across residues within each MSA sequence +- Biased by the pair representation (key innovation!) +- Shape: `(batch, n_seqs, seq_len, msa_dim)` + +**MSA Column-wise Attention** +- Attends across different sequences for each position +- Enables communication between sequences in the MSA +- Shape: `(batch, n_seqs, seq_len, msa_dim)` + +**MSA Transition** +- Point-wise feed-forward network +- Applied to each MSA element independently + +### 2. 
Pair Representation Processing + +**Outer Product Mean** +- Projects MSA patterns onto pairwise space +- Computes mean outer product across MSA sequences +- Updates pair representation with sequence information + +**Triangle Multiplicative Updates** +- Geometric reasoning: if i-j and j-k are close, i-k should be considered +- Two versions: outgoing and incoming edges +- Most computationally expensive operation (O(N³)) + +**Triangle Self-Attention** +- Attention over edges in the residue graph +- Two versions: starting and ending nodes +- Enables long-range communication + +**Pair Transition** +- Point-wise feed-forward network for pair representation + +### 3. Structure Module + +**Simplified Distance Prediction** +- Predicts pairwise distances from pair representation +- In full AlphaFold 2, this is the Invariant Point Attention (IPA) module +- Output: `(batch, seq_len, seq_len, 1)` - distance matrix + +## Model Configuration + +### Default Configuration + +```python +TinyOpenFoldConfig( + vocab_size=21, # 20 amino acids + unknown + msa_dim=64, # MSA feature dimension + pair_dim=128, # Pair feature dimension + n_evoformer_blocks=4, # Number of Evoformer blocks + n_heads_msa=4, # MSA attention heads + n_heads_pair=4, # Pair attention heads + msa_intermediate_dim=256, # MSA FFN dimension (4x msa_dim) + pair_intermediate_dim=512, # Pair FFN dimension (4x pair_dim) + outer_product_dim=32, # Outer product projection dim + max_seq_len=64, # Maximum sequence length + n_seqs=16, # Number of MSA sequences +) +``` + +### Scaling Configurations + +#### Tiny (for testing) +```bash +python tiny_openfold_v1.py \ + --msa-dim 32 \ + --pair-dim 64 \ + --num-blocks 2 \ + --seq-len 32 \ + --num-seqs 8 \ + --batch-size 8 + +# Parameters: ~660K +# Memory: ~40 MB +# Speed: ~15-20 samples/sec +``` + +#### Small (default) +```bash +python tiny_openfold_v1.py \ + --msa-dim 64 \ + --pair-dim 128 \ + --num-blocks 4 \ + --seq-len 64 \ + --num-seqs 16 \ + --batch-size 4 + +# Parameters: ~2.6M 
+# Memory: ~100 MB +# Speed: ~8-10 samples/sec +``` + +#### Medium +```bash +python tiny_openfold_v1.py \ + --msa-dim 128 \ + --pair-dim 256 \ + --num-blocks 8 \ + --seq-len 128 \ + --num-seqs 32 \ + --batch-size 2 + +# Parameters: ~42M +# Memory: ~800 MB +# Speed: ~1-2 samples/sec +``` + +## Profiling Guide + +### PyTorch Profiler + +Detailed kernel-level performance and memory analysis: + +```bash +# Basic profiling +python tiny_openfold_v1.py \ + --enable-pytorch-profiler \ + --profile-dir ./profiles \ + --batch-size 4 \ + --num-steps 30 + +# View timeline in Chrome +# Open chrome://tracing and load ./profiles/trace_*.json +``` + +**Provides:** +- Kernel execution times +- Memory allocation patterns +- CPU/GPU timeline + +#### Minimal Overhead Profiling (Recommended for Throughput Measurement) + +For production-like performance measurements with minimal profiling overhead: + +```bash +# Default: Profile only 5 out of 20 steps (25% overhead) +./run_pytorch_profiler.sh + +# Minimal overhead: Profile 5 out of 100 steps (~5% overhead) +./run_pytorch_profiler.sh \ + --batch-size 4 \ + --seq-len 64 \ + --num-steps 100 \ + --profile-steps 5 \ + --device 0 + +# Very stable throughput: Profile 5 out of 200 steps (~2.5% overhead) +./run_pytorch_profiler.sh \ + --num-steps 200 \ + --profile-steps 5 + +# View comprehensive report +less pytorch_profiles/comprehensive_profiling_report.md + +# View trace in Chrome +# Open chrome://tracing and load: pytorch_profiles/trace_step_*.json +``` + +**Key Parameters for Minimal Overhead:** +- `--num-steps 100-200`: More steps = more stable throughput average +- `--profile-steps 5`: Only these steps have profiling overhead (~40% slower) +- Non-profiled steps: **No overhead** (82 samples/sec baseline) +- Result: Average throughput with only 5-10% overhead + +**What You Get:** +- `trace_step_*.json` - Chrome trace file (~80-100 MB) for detailed kernel inspection +- `comprehensive_profiling_report.md` - Analysis with bottleneck 
identification +- `operator_analysis.json` - Performance data +- Throughput summary at end of comprehensive report + +**Example Output:** +``` +Average training speed: 75.0 samples/sec (vs 82 baseline, 10% overhead with 5/100 profiled) +``` + +### DeepSpeed FLOPS Profiler + +Analyze computational efficiency and FLOPS breakdown using DeepSpeed: + +```bash +# Basic FLOPS analysis (single GPU, default device) +./run_deepspeed_flops.sh + +# Profile on specific GPU +./run_deepspeed_flops.sh --device 1 + +# Multi-GPU comparative analysis (all available GPUs - 8 on MI250X) +./run_deepspeed_flops.sh --multi-gpu + +# Multi-GPU analysis (specific GPUs) +./run_deepspeed_flops.sh --devices "0,1,2" + +# Comprehensive analysis with roofline model +./run_deepspeed_flops.sh --all --batch-size 4 --seq-len 64 + +# Custom configuration +./run_deepspeed_flops.sh \ + --batch-size 8 \ + --seq-len 128 \ + --num-blocks 8 \ + --roofline \ + --intensity +``` + +**Key Metrics from FLOPS Analysis:** +- **Model FLOPS Utilization (MFU)**: Efficiency of GPU usage (20-40% is typical for an unoptimized baseline; 40-60% indicates good optimization) +- **FLOPS Breakdown**: Which Evoformer components use most compute +- **Arithmetic Intensity**: Memory-bound vs compute-bound classification +- **Roofline Data**: Optimization recommendations +- **Multi-GPU Efficiency**: Scaling efficiency across multiple GPUs (target: >90% for good scaling) + +**Example Output (Single GPU):** +``` +FLOPS Analysis Summary: + Total FLOPS per step: 2.45e+11 + Model FLOPS Utilization: 15.3% + +Evoformer FLOPS Breakdown: + msa_attention: 8.32e+10 (34.0%) + triangle_multiplication: 6.21e+10 (25.4%) + pair_transition: 4.15e+10 (17.0%) +``` + +**Example Output (Multi-GPU):** +``` +Aggregate Multi-GPU Summary: + Number of GPUs: 8 + Total System TFLOPS: 196.8 + Average MFU: 15.8% + Total Throughput: 84.6 samples/sec + Multi-GPU Efficiency: 95.2% + Speedup vs Single GPU: 7.62x +``` + +**Multi-GPU Analysis:** +- Profiles each GPU independently to measure per-GPU FLOPS +- Calculates 
aggregate system TFLOPS (sum across all GPUs) +- Reports multi-GPU efficiency (actual speedup / ideal speedup) +- Identifies GPU-to-GPU performance variance (MFU std dev) +- Useful for understanding scaling bottlenecks and load balancing + +**See Also:** +- `FLOPS_ANALYSIS.md` for detailed documentation and workflows +- `PROFILER_COMPARISON_GUIDE.md` for DeepSpeed FLOPS vs PyTorch Profiler comparison + +### Memory Profiling + +Track memory usage throughout training: + +```bash +python tiny_openfold_v1.py \ + --enable-memory-profiling \ + --profile-dir ./memory_analysis \ + --batch-size 4 + +# Check performance_summary.json for memory statistics +cat ./memory_analysis/performance_summary.json +``` + +### Complete Profiling Suite + +Enable all profiling features: + +```bash +python tiny_openfold_v1.py \ + --enable-all-profiling \ + --profile-dir ./complete_analysis \ + --batch-size 4 \ + --num-steps 50 +``` + +## Performance Analysis + +### Expected Bottlenecks + +Based on the architecture, expect these components to dominate compute time: + +1. **Triangle Operations** (40-50% of time) + - O(N³) complexity makes these expensive + - Both multiplicative updates and attention + - Most sensitive to sequence length + +2. **MSA Attention** (25-35% of time) + - Row-wise attention: O(N_seqs × N_res²) + - Column-wise attention: O(N_res × N_seqs²) + - Depends on both MSA depth and sequence length + +3. **Outer Product Mean** (10-15% of time) + - Computing outer products across MSA + - Memory-bound operation + +4. 
**Transitions** (5-10% of time) + - Feed-forward networks + - Usually well-optimized by PyTorch + +### Memory Consumption + +Memory usage breakdown (approximate): + +``` +Total GPU Memory = Model Parameters + Activations + Gradients + Optimizer States + +For batch=4, seq_len=64, n_seqs=16: +- Model: ~11 MB (FP32) +- MSA activations: ~4 MB +- Pair activations: ~32 MB +- Attention scores: ~8 MB +- Gradients: ~11 MB +- Optimizer (Adam): ~22 MB +- Total: ~90-100 MB +``` + +**Key Insight**: Pair representation dominates memory (seq_len²) + +### Optimization Opportunities + +From the baseline implementation, potential optimizations include: + +1. **Flash Attention** for MSA attention operations +2. **Kernel Fusion** for triangle operations +3. **Mixed Precision (FP16)** to reduce memory and improve throughput +4. **Gradient Checkpointing** for larger models +5. **Custom CUDA/Triton Kernels** for triangle updates + +## Training Output Explanation + +### During Training + +``` +Step 0/50 | Loss: 45.2341 | Speed: 8.5 samples/sec | Memory: 102.3 MB | Time: 470.2ms +``` + +- **Loss**: MSE on predicted distances (should decrease) +- **Speed**: Throughput in samples/second +- **Memory**: Current GPU memory allocation +- **Time**: Milliseconds per training iteration + +### Final Summary + +``` +Performance Summary: + Total samples processed: 200 + Average training speed: 8.7 samples/sec + Average batch time: 459.3 ms + Average forward time: 285.1 ms + Average backward time: 165.7 ms + Average optimizer time: 8.5 ms + Final loss: 38.4512 + Peak memory usage: 102.3 MB +``` + +**What to Analyze:** +- Forward/backward time ratio (typically 1.5-2.0x) +- Memory growth over time +- Loss convergence behavior + +## Multi-GPU Training and Scaling Studies + +### Multi-GPU Training with DataParallel + +TinyOpenFold supports multi-GPU training using PyTorch's `nn.DataParallel`. The implementation automatically detects and uses multiple GPUs based on environment variables. 
+ +**Single GPU (Explicit):** +```bash +# Use specific GPU +python tiny_openfold_v1.py --device 0 --batch-size 8 +``` + +**Multi-GPU (Automatic Detection):** +```bash +# ROCm (AMD GPUs) - automatically uses GPUs 0 and 1 +ROCR_VISIBLE_DEVICES=0,1 python tiny_openfold_v1.py --batch-size 16 + +# CUDA (NVIDIA GPUs) - automatically uses GPUs 0, 1, 2, 3 +CUDA_VISIBLE_DEVICES=0,1,2,3 python tiny_openfold_v1.py --batch-size 32 + +# Disable multi-GPU even if multiple GPUs are available +python tiny_openfold_v1.py --no-data-parallel --device 0 --batch-size 8 +``` + +**Best Practices:** +- Scale batch size proportionally with GPU count (e.g., 8 per GPU) +- The effective batch size is split across GPUs automatically +- Monitor per-GPU memory usage to avoid OOM errors +- Use `--device` to override automatic GPU detection for single-GPU runs + +### Running Scaling Studies + +Two scripts are provided for conducting GPU scaling studies: + +#### Quick Scaling Test (Simple) + +For a quick test with 1, 2, 4, and 8 GPUs: + +```bash +# Make script executable +chmod +x quick_scaling_test.sh + +# Run quick scaling test (8 samples per GPU, 50 steps) +./quick_scaling_test.sh +``` + +**Output:** +- Creates timestamped directory with logs for each GPU configuration +- Automatically calculates speedup and efficiency +- Generates summary table with throughput comparison + +**Example Results:** +``` +GPUs Throughput (s/s) Speedup Efficiency +---- ------------------- --------- ---------- +1 166.9 1.00x 100.0% +2 202.7 1.21x 60.5% +4 245.3 1.47x 36.8% +8 249.1 1.49x 18.6% +``` + +#### Comprehensive Scaling Study (Advanced) + +For more control and statistical analysis: + +```bash +# Make script executable +chmod +x run.sh + +# Run full scaling study with defaults +./run.sh + +# Custom configuration +./run.sh --gpus "1 2 4 8" --batch-per-gpu 8 --steps 100 --runs 3 + +# With mixed precision and profiling +./run.sh --amp --profile --steps 50 + +# Specify output directory +./run.sh --output-dir 
my_scaling_study_$(date +%Y%m%d) + +# Show help +./run.sh --help +``` + +**Options:** +- `--gpus <LIST>`: GPU counts to test (default: "1 2 4 8") +- `--batch-per-gpu <N>`: Batch size per GPU (default: 8) +- `--steps <N>`: Training steps per run (default: 50) +- `--runs <N>`: Number of runs per configuration for statistics (default: 1) +- `--amp`: Enable mixed precision training (FP16) +- `--profile`: Enable PyTorch profiler +- `--output-dir <DIR>`: Custom output directory + +**Output Files:** +``` +scaling_study_TIMESTAMP/ +├── config.txt # Study configuration +├── summary.txt # Human-readable summary with statistics +├── summary.csv # Machine-readable results +├── gpu1_batch8_run1.log # Detailed logs for each run +├── gpu2_batch16_run1.log +├── gpu4_batch32_run1.log +└── gpu8_batch64_run1.log +``` + +### Understanding Scaling Efficiency + +**Scaling Metrics:** +- **Speedup**: `Throughput(N GPUs) / Throughput(1 GPU)` +- **Efficiency**: `(Speedup / N GPUs) × 100%` + +**Expected Behavior:** +- **Ideal Linear Scaling**: 100% efficiency (rare in practice) +- **Good Scaling**: 70-90% efficiency for 2-4 GPUs +- **Diminishing Returns**: Efficiency drops with more GPUs due to: + - Communication overhead between GPUs + - DataParallel synchronization costs + - Small model size (2.6M parameters) + - Memory bandwidth limitations + +**TinyOpenFold Scaling Characteristics:** +- Sub-linear scaling is expected due to small model size +- Communication overhead becomes significant at 4+ GPUs +- Best efficiency typically at 2-4 GPUs +- Beyond 8 GPUs, overhead may exceed benefits for this model size + +**Optimization Tips:** +- Use larger batch sizes per GPU to amortize communication costs +- Enable mixed precision (`--use-amp`) to reduce memory and increase throughput +- Consider gradient accumulation for effective larger batch sizes +- For production OpenFold, use model parallelism instead of data parallelism + +## Command Reference + +### Model Configuration +```bash +--msa-dim 64 # MSA representation 
dimension +--pair-dim 128 # Pair representation dimension +--num-blocks 4 # Number of Evoformer blocks +--num-seqs 16 # Number of MSA sequences +--seq-len 64 # Sequence length (residues) +``` + +### Training Parameters +```bash +--num-steps 50 # Training iterations +--batch-size 4 # Batch size +--learning-rate 3e-4 # Learning rate +--use-amp # Enable mixed precision (FP16) +``` + +### Profiling Options +```bash +--enable-pytorch-profiler # Enable PyTorch profiler +--enable-memory-profiling # Track memory usage +--enable-all-profiling # Enable all profiling +--profile-dir PATH # Output directory +--warmup-steps 3 # Profiler warmup iterations +--profile-steps 5 # Iterations to profile +``` + +## Code Structure + +### Main Classes + +**`TinyOpenFoldConfig`**: Model configuration dataclass + +**`MSARowAttentionWithPairBias`**: MSA row attention + pair bias +- Projects MSA to Q, K, V +- Adds pair representation as attention bias +- Core innovation of AlphaFold 2 + +**`MSAColumnAttention`**: MSA column attention +- Transposes to attend across sequences +- Independent of pair representation + +**`TriangleMultiplication`**: Triangle multiplicative update +- Gated projections for left and right edges +- Einstein summation for triangle computation +- Separate classes for outgoing/incoming + +**`TriangleAttention`**: Triangle self-attention +- Standard multi-head attention over edges +- Two variants: starting and ending nodes + +**`OuterProductMean`**: Outer product mean computation +- Projects MSA to lower dimension +- Computes outer product between positions +- Averages across MSA depth + +**`EvoformerBlock`**: Complete Evoformer block +- Orchestrates all MSA and pair operations +- Includes layer norms and residual connections + +**`TinyOpenFold`**: Main model class +- Input embeddings +- Stack of Evoformer blocks +- Structure module for predictions + +### Data Flow + +``` +Input: + ├─ MSA tokens (batch, n_seqs, seq_len) + └─ Pair features (batch, seq_len, seq_len, 
pair_input_dim) + +Embeddings: + ├─ MSA: (batch, n_seqs, seq_len, msa_dim) + └─ Pair: (batch, seq_len, seq_len, pair_dim) + +Evoformer Blocks (repeated N times): + ├─ MSA updates: + │ ├─ Row attention (with pair bias) + │ ├─ Column attention + │ └─ Transition + └─ Pair updates: + ├─ Outer product mean + ├─ Triangle multiplication (out/in) + ├─ Triangle attention (start/end) + └─ Transition + +Structure Module: + └─ Pair → Distances: (batch, seq_len, seq_len, 1) + +Output: + └─ Predicted distance matrix +``` + +## Debugging Tips + +### Model Not Training (Loss Not Decreasing) + +```bash +# Check with smaller problem +python tiny_openfold_v1.py \ + --seq-len 16 \ + --num-seqs 4 \ + --batch-size 2 \ + --num-steps 100 + +# Increase learning rate +python tiny_openfold_v1.py --learning-rate 1e-3 +``` + +### Numerical Instabilities + +```bash +# Compare runs with and without mixed precision: FP16 narrows numeric range, so instabilities that appear only with --use-amp are precision-related +python tiny_openfold_v1.py --use-amp +``` + +### Slow Performance + +```bash +# Profile to find bottlenecks +python tiny_openfold_v1.py \ + --enable-pytorch-profiler \ + --profile-dir ./debug_profile \ + --num-steps 20 + +# Reduce problem size +python tiny_openfold_v1.py --seq-len 32 --num-seqs 8 +``` + +## Understanding the Code + +### Key Code Sections to Study + +1. **MSA Row Attention** (lines ~250-310) + - See how pair bias is added to attention scores + - Note the broadcasting across MSA sequences + +2. **Triangle Multiplication** (lines ~480-530) + - Examine the Einstein summation for triangle updates + - Understand gating mechanism + +3. **Evoformer Block** (lines ~620-680) + - See how MSA and pair updates are orchestrated + - Note the residual connections + +4. **Training Loop** (lines ~900-1050) + - Profiling integration points + - Timing and metrics collection + +### Profiler Integration Points + +The code includes `record_function()` calls for profiling: + +```python +with record_function("evoformer_block"): + with record_function("msa_row_attention"): + # ... 
attention code +``` + +These show up in PyTorch Profiler and help identify bottlenecks. + +## Next Steps + +After running the baseline: + +1. **Analyze Profiling Results** + - Open TensorBoard to view timeline + - Identify hotspot operations + - Check memory usage patterns + +2. **Experiment with Configurations** + - Try different sequence lengths + - Vary MSA depth + - Test different numbers of blocks + +3. **Consider Optimizations** + - Implement flash attention for MSA operations + - Fuse triangle operations + - Try mixed precision training + +## Resources + +### AlphaFold 2 Paper +- Main: https://www.nature.com/articles/s41586-021-03819-2 +- Supplement: Detailed architecture (Section 1.6 for Evoformer) + +### OpenFold (Production Implementation) +- GitHub: https://github.com/aqlaboratory/openfold +- Documentation: https://openfold.readthedocs.io/ + +### Parent Directory +- See `../ARCHITECTURE.md` for detailed parameter calculations +- See `../README.md` for project overview + +--- + +**Questions or Issues?** + +Check the parent README or examine the code comments for detailed explanations of each component. + diff --git a/MLExamples/TinyOpenFold/version1_pytorch_baseline/run_deepspeed_flops.py b/MLExamples/TinyOpenFold/version1_pytorch_baseline/run_deepspeed_flops.py new file mode 100644 index 00000000..1e631c4d --- /dev/null +++ b/MLExamples/TinyOpenFold/version1_pytorch_baseline/run_deepspeed_flops.py @@ -0,0 +1,1011 @@ +#!/usr/bin/env python3 +""" +DeepSpeed FLOPS Profiler Integration for Tiny OpenFold V1 + +This script provides comprehensive FLOPS analysis using DeepSpeed's FLOPS profiler +to measure computational efficiency and identify optimization opportunities for +the Evoformer architecture. 
def profile_model_flops(
    self,
    config: "TinyOpenFoldConfig",
    batch_size: int = 4,
    num_steps: int = 10,
    detailed_analysis: bool = True,
    device_id: Optional[int] = None
) -> Dict[str, Any]:
    """Profile forward-pass FLOPS of TinyOpenFold with DeepSpeed's FlopsProfiler.

    Builds a fresh model and dataset from ``config``, runs ``num_steps``
    forward/backward iterations under the profiler, prints a summary and
    writes all metrics to ``<output_dir>/flops_profile.json``.

    Both the FLOPS count and the wall-clock time recorded per step cover the
    forward pass only; the backward pass is executed but neither timed nor
    counted, so the derived FLOPS/s and MFU are forward-pass figures.

    Args:
        config: Model configuration used to build the model and dataset.
        batch_size: Number of samples requested from the dataset per step.
        num_steps: Number of profiled iterations.
        detailed_analysis: When True, capture the text report produced by
            ``print_model_profile`` into the results.
        device_id: CUDA device index to run on. ``None`` selects "cuda" when
            available and "cpu" otherwise.

    Returns:
        A results dictionary (model info, profiling config, FLOPS analysis,
        performance and efficiency metrics, per-step results), or
        ``{'error': ...}`` when DeepSpeed is not installed.

    Raises:
        ValueError: If ``device_id`` is not below ``torch.cuda.device_count()``.
    """

    if not DEEPSPEED_AVAILABLE:
        return {'error': 'DeepSpeed not available for FLOPS profiling'}

    print(f"Starting FLOPS Analysis - Evoformer Architecture")
    print(f" Output directory: {self.output_dir}")
    print(f" Batch size: {batch_size}")
    print(f" Sequence length: {config.max_seq_len}")
    print(f" MSA sequences: {config.n_seqs}")
    print(f" Analysis steps: {num_steps}")

    # Setup environment
    setup_deterministic_environment()

    # Device selection
    if device_id is not None:
        if not torch.cuda.is_available():
            print(f" Warning: CUDA not available, ignoring device_id={device_id}")
            device = torch.device("cpu")
        elif device_id >= torch.cuda.device_count():
            raise ValueError(f"Device {device_id} not available. Only {torch.cuda.device_count()} GPU(s) found.")
        else:
            device = torch.device(f"cuda:{device_id}")
            print(f" Using GPU: {device_id}")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Synchronize only when the model really runs on a GPU, and always on the
    # selected device rather than on the process-wide current device.
    on_gpu = device.type == "cuda"

    # Create model and dataset
    model = TinyOpenFold(config).to(device)
    dataset = ProteinDataset(config)

    # Initialize FLOPS profiler
    prof = FlopsProfiler(model)

    # Model information
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"\nModel Information:")
    print(f" Total parameters: {total_params:,}")
    print(f" Trainable parameters: {trainable_params:,}")
    print(f" Model size (FP32): {total_params * 4 / 1e6:.1f} MB")
    print(f" Evoformer blocks: {config.n_evoformer_blocks}")
    print(f" MSA dimension: {config.msa_dim}")
    print(f" Pair dimension: {config.pair_dim}")

    # Run profiling
    model.train()
    prof.start_profile()

    total_flops = 0
    total_time = 0
    step_results = []

    for step in range(num_steps):
        # Get batch
        msa_tokens, pair_features, target_distances = dataset.get_batch(batch_size)
        msa_tokens = msa_tokens.to(device)
        pair_features = pair_features.to(device)
        target_distances = target_distances.to(device)

        # The profiler's per-module counters accumulate over every forward
        # pass since the last reset. Without a reset, step k would report the
        # FLOPS of k+1 passes and the running total would be badly inflated.
        if hasattr(prof, 'reset_profile'):
            prof.reset_profile()

        # Drain pending GPU work (e.g. the previous step's backward) BEFORE
        # starting the clock so it is not billed to this forward pass.
        if on_gpu:
            torch.cuda.synchronize(device)
        start_time = time.perf_counter()

        # Forward pass
        outputs = model(msa_tokens, pair_features, target_distances)
        loss = outputs['loss']

        if on_gpu:
            torch.cuda.synchronize(device)
        step_time = time.perf_counter() - start_time

        # Get step FLOPS (forward pass of this step only)
        if hasattr(prof, 'get_total_flops'):
            step_flops = prof.get_total_flops()
        else:
            # Fallback estimation
            step_flops = self._estimate_evoformer_flops(config, batch_size)

        # Backward pass (for training scenario); not timed, not counted
        loss.backward()

        total_flops += step_flops
        total_time += step_time

        step_results.append({
            'step': step,
            'loss': loss.item(),
            'flops': step_flops,
            'time': step_time,
            'flops_per_sec': step_flops / step_time if step_time > 0 else 0
        })

        if step % 2 == 0:
            print(f" Step {step}: Loss {loss.item():.4f}, "
                  f"FLOPS {step_flops:.2e}, Time {step_time*1000:.1f}ms")

        # Clear gradients for next step
        model.zero_grad()

    # Stop profiling and get results
    prof.stop_profile()

    # Get detailed profile information. Because counters are reset every
    # step, these describe the most recent forward pass.
    try:
        flops_summary = prof.get_total_flops()
        params_summary = prof.get_total_params()

        if detailed_analysis and hasattr(prof, 'print_model_profile'):
            # Capture detailed profile output
            import io
            import contextlib

            profile_output = io.StringIO()
            with contextlib.redirect_stdout(profile_output):
                prof.print_model_profile(profile_step=1, module_depth=-1, top_modules=50)

            detailed_profile = profile_output.getvalue()
        else:
            detailed_profile = "Detailed profile not available"

    except Exception as e:
        # Best effort: keep the run's own measurements if the profiler report fails
        print(f" Warning: Could not get detailed FLOPS data: {e}")
        flops_summary = total_flops / num_steps if num_steps > 0 else 0
        params_summary = total_params
        detailed_profile = f"Profile generation failed: {e}"

    # Calculate efficiency metrics
    avg_time_per_step = total_time / num_steps if num_steps > 0 else 0
    avg_flops_per_step = total_flops / num_steps if num_steps > 0 else 0
    throughput = batch_size / avg_time_per_step if avg_time_per_step > 0 else 0

    # Calculate Model FLOPS Utilization (MFU)
    mfu_metrics = self._calculate_mfu(
        model_flops=avg_flops_per_step,
        time_per_step=avg_time_per_step,
        device_peak_flops=self._get_device_peak_flops()
    )

    # Evoformer-specific FLOPS breakdown
    evoformer_breakdown = self._estimate_evoformer_breakdown(config, batch_size)

    results = {
        'model_info': {
            'total_params': total_params,
            'trainable_params': trainable_params,
            'config': config.to_dict(),
            'architecture': 'Evoformer'
        },
        'profiling_config': {
            'batch_size': batch_size,
            'sequence_length': config.max_seq_len,
            'msa_sequences': config.n_seqs,
            'num_steps': num_steps,
            'device': str(device)
        },
        'flops_analysis': {
            'total_flops': flops_summary,
            'avg_flops_per_step': avg_flops_per_step,
            'flops_per_parameter': avg_flops_per_step / max(1, total_params),
            'evoformer_breakdown': evoformer_breakdown,
            'detailed_profile': detailed_profile
        },
        'performance_metrics': {
            'avg_time_per_step': avg_time_per_step,
            'throughput_samples_per_sec': throughput,
            # Guard the empty case: np.mean([]) would emit NaN plus a warning
            'avg_loss': float(np.mean([r['loss'] for r in step_results])) if step_results else 0.0,
            'flops_per_sec': avg_flops_per_step / avg_time_per_step if avg_time_per_step > 0 else 0
        },
        'efficiency_metrics': mfu_metrics,
        'step_by_step_results': step_results,
        'timestamp': datetime.now().isoformat()
    }

    # Save results
    results_path = self.output_dir / "flops_profile.json"
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nFLOPS Analysis Summary:")
    print(f" Total FLOPS per step: {avg_flops_per_step:.2e}")
    print(f" FLOPS per parameter: {results['flops_analysis']['flops_per_parameter']:.2f}")
    print(f" Throughput: {throughput:.1f} samples/sec")
    print(f" Model FLOPS Utilization: {mfu_metrics['mfu_percent']:.1f}%")

    print(f"\nEvoformer FLOPS Breakdown:")
    for component, flops in evoformer_breakdown.items():
        pct = (flops / avg_flops_per_step * 100) if avg_flops_per_step > 0 else 0
        print(f" {component}: {flops:.2e} ({pct:.1f}%)")

    print(f"\n Results saved to: {results_path}")

    return results
Analysis") + print(f"{'='*70}") + + aggregate_results = self._aggregate_multi_gpu_results( + per_gpu_results, + device_ids, + config, + batch_size, + num_steps + ) + + # Save multi-GPU results + multi_gpu_path = self.output_dir / "flops_profile_multi_gpu.json" + with open(multi_gpu_path, 'w') as f: + json.dump(aggregate_results, f, indent=2) + + print(f"\n Multi-GPU results saved to: {multi_gpu_path}") + + # Print aggregate summary + print(f"\nAggregate Multi-GPU Summary:") + print(f" Number of GPUs: {num_gpus}") + print(f" Total System TFLOPS: {aggregate_results['aggregate_metrics']['total_system_tflops']:.2f}") + print(f" Average MFU: {aggregate_results['aggregate_metrics']['avg_mfu_percent']:.1f}%") + print(f" Total Throughput: {aggregate_results['aggregate_metrics']['total_throughput']:.1f} samples/sec") + print(f" Multi-GPU Efficiency: {aggregate_results['aggregate_metrics']['multi_gpu_efficiency_percent']:.1f}%") + + return aggregate_results + + def _aggregate_multi_gpu_results( + self, + per_gpu_results: Dict[str, Dict], + device_ids: List[int], + config: TinyOpenFoldConfig, + batch_size: int, + num_steps: int + ) -> Dict[str, Any]: + """Aggregate results from multiple GPU profiling runs.""" + + num_gpus = len(device_ids) + + # Collect metrics from each GPU + mfu_values = [] + achieved_tflops = [] + throughput_values = [] + avg_time_per_step = [] + + for gpu_id in device_ids: + gpu_key = f"gpu_{gpu_id}" + if gpu_key in per_gpu_results and 'error' not in per_gpu_results[gpu_key]: + result = per_gpu_results[gpu_key] + mfu_values.append(result['efficiency_metrics']['mfu_percent']) + achieved_tflops.append(result['efficiency_metrics']['achieved_tflops']) + throughput_values.append(result['performance_metrics']['throughput_samples_per_sec']) + avg_time_per_step.append(result['performance_metrics']['avg_time_per_step']) + + # Calculate aggregate metrics + avg_mfu = np.mean(mfu_values) if mfu_values else 0 + total_tflops = sum(achieved_tflops) + total_throughput = 
sum(throughput_values) + avg_time = np.mean(avg_time_per_step) if avg_time_per_step else 0 + + # Calculate multi-GPU efficiency (ideal = 100% means linear scaling) + # Efficiency = (Total Throughput) / (Single GPU Throughput × N) + if len(throughput_values) > 0: + single_gpu_throughput = throughput_values[0] if throughput_values else 0 + ideal_throughput = single_gpu_throughput * num_gpus + multi_gpu_efficiency = (total_throughput / ideal_throughput * 100) if ideal_throughput > 0 else 0 + else: + multi_gpu_efficiency = 0 + + # Get device information + device_info = [] + for gpu_id in device_ids: + device_info.append({ + 'gpu_id': gpu_id, + 'name': torch.cuda.get_device_name(gpu_id), + 'mfu_percent': mfu_values[device_ids.index(gpu_id)] if gpu_id < len(mfu_values) else 0, + 'achieved_tflops': achieved_tflops[device_ids.index(gpu_id)] if gpu_id < len(achieved_tflops) else 0, + 'throughput': throughput_values[device_ids.index(gpu_id)] if gpu_id < len(throughput_values) else 0 + }) + + aggregate_results = { + 'multi_gpu_config': { + 'num_gpus': num_gpus, + 'device_ids': device_ids, + 'batch_size_per_gpu': batch_size, + 'total_batch_size': batch_size * num_gpus, + 'num_steps': num_steps + }, + 'model_config': config.to_dict(), + 'per_gpu_results': per_gpu_results, + 'device_info': device_info, + 'aggregate_metrics': { + 'avg_mfu_percent': avg_mfu, + 'mfu_std_dev': np.std(mfu_values) if len(mfu_values) > 1 else 0, + 'total_system_tflops': total_tflops, + 'avg_tflops_per_gpu': np.mean(achieved_tflops) if achieved_tflops else 0, + 'total_throughput': total_throughput, + 'avg_throughput_per_gpu': np.mean(throughput_values) if throughput_values else 0, + 'avg_time_per_step': avg_time, + 'multi_gpu_efficiency_percent': multi_gpu_efficiency, + 'scaling_efficiency': { + 'ideal_speedup': num_gpus, + 'actual_speedup': (throughput_values[0] * num_gpus / total_throughput) if total_throughput > 0 and throughput_values else 0, + 'efficiency_ratio': multi_gpu_efficiency / 100 + } + }, 
+ 'comparison': { + 'single_gpu_throughput': throughput_values[0] if throughput_values else 0, + 'multi_gpu_throughput': total_throughput, + 'speedup': total_throughput / throughput_values[0] if throughput_values and throughput_values[0] > 0 else 0 + }, + 'timestamp': datetime.now().isoformat() + } + + return aggregate_results + + def _estimate_evoformer_flops(self, config: TinyOpenFoldConfig, batch_size: int) -> float: + """Estimate FLOPS for Evoformer model (fallback if DeepSpeed fails).""" + B = batch_size + L = config.max_seq_len + N = config.n_seqs + d_msa = config.msa_dim + d_pair = config.pair_dim + n_blocks = config.n_evoformer_blocks + n_heads_msa = config.n_heads_msa + n_heads_pair = config.n_heads_pair + d_msa_inter = config.msa_intermediate_dim + d_pair_inter = config.pair_intermediate_dim + + # Embedding FLOPS (input projection) + # MSA embedding: B * N * L * vocab_size * d_msa + embed_flops = B * N * L * config.vocab_size * d_msa + # Pair embedding: B * L * L * pair_input_dim * d_pair + embed_flops += B * L * L * config.pair_input_dim * d_pair + + # Per Evoformer block FLOPS + block_flops = 0 + + # === MSA STACK === + # MSA Row Attention + # Q, K, V projections: 3 * B * N * L * d_msa * d_msa + msa_qkv_flops = 3 * B * N * L * d_msa * d_msa + # Attention scores: B * N * n_heads_msa * L * L * (d_msa / n_heads_msa) + msa_attn_scores = B * N * n_heads_msa * L * L * (d_msa // n_heads_msa) + # Attention output: B * N * n_heads_msa * L * (d_msa / n_heads_msa) * L + msa_attn_out = B * N * n_heads_msa * L * (d_msa // n_heads_msa) * L + # Output projection: B * N * L * d_msa * d_msa + msa_out_proj = B * N * L * d_msa * d_msa + + msa_row_attn = msa_qkv_flops + msa_attn_scores + msa_attn_out + msa_out_proj + + # MSA Column Attention (similar to row but different dimension) + msa_col_attn = msa_row_attn # Approximation + + # MSA Transition (FFN) + # Linear 1: B * N * L * d_msa * d_msa_inter + # Linear 2: B * N * L * d_msa_inter * d_msa + msa_transition = B * N * L 
* d_msa * d_msa_inter + B * N * L * d_msa_inter * d_msa + + # Outer Product Mean + # Projects MSA to create pair update + # B * L * L * N * d_msa * outer_product_dim + outer_product = B * L * L * N * d_msa * config.outer_product_dim + + msa_stack_total = msa_row_attn + msa_col_attn + msa_transition + outer_product + + # === PAIR STACK === + # Triangle Multiplication Outgoing + # 3 projections + matmul: estimate as 4 * B * L * L * d_pair * d_pair + triangle_mult_out = 4 * B * L * L * d_pair * d_pair + + # Triangle Multiplication Incoming + triangle_mult_in = 4 * B * L * L * d_pair * d_pair + + # Triangle Attention Starting/Ending (simplified) + # Similar to standard attention but on pairs + triangle_attn = 2 * (4 * B * L * L * d_pair * d_pair) + + # Pair Transition (FFN) + pair_transition = B * L * L * d_pair * d_pair_inter + B * L * L * d_pair_inter * d_pair + + pair_stack_total = triangle_mult_out + triangle_mult_in + triangle_attn + pair_transition + + # Layer normalization (relatively small, but included for completeness) + # Multiple layer norms throughout: ~10 per block * B * N * L * d_msa (rough estimate) + layernorm_flops = 10 * B * N * L * d_msa + + block_flops = msa_stack_total + pair_stack_total + layernorm_flops + + # Total model FLOPS + total_flops = embed_flops + (n_blocks * block_flops) + + # Output head (distance prediction) + # B * L * L * d_pair * num_distance_bins (simplified) + output_flops = B * L * L * d_pair * 64 # Assuming 64 distance bins + total_flops += output_flops + + return total_flops + + def _estimate_evoformer_breakdown(self, config: TinyOpenFoldConfig, batch_size: int) -> Dict[str, float]: + """Provide detailed breakdown of FLOPS by Evoformer component.""" + B = batch_size + L = config.max_seq_len + N = config.n_seqs + d_msa = config.msa_dim + d_pair = config.pair_dim + n_blocks = config.n_evoformer_blocks + + breakdown = {} + + # MSA Row/Column Attention + msa_attn_per_block = 2 * (4 * B * N * L * d_msa * d_msa + B * N * 
config.n_heads_msa * L * L * (d_msa // config.n_heads_msa)) + breakdown['msa_attention'] = msa_attn_per_block * n_blocks + + # MSA Transition + msa_transition_per_block = B * N * L * d_msa * config.msa_intermediate_dim + B * N * L * config.msa_intermediate_dim * d_msa + breakdown['msa_transition'] = msa_transition_per_block * n_blocks + + # Outer Product Mean + outer_product_per_block = B * L * L * N * d_msa * config.outer_product_dim + breakdown['outer_product_mean'] = outer_product_per_block * n_blocks + + # Triangle Multiplication + triangle_mult_per_block = 8 * B * L * L * d_pair * d_pair + breakdown['triangle_multiplication'] = triangle_mult_per_block * n_blocks + + # Triangle Attention + triangle_attn_per_block = 8 * B * L * L * d_pair * d_pair + breakdown['triangle_attention'] = triangle_attn_per_block * n_blocks + + # Pair Transition + pair_transition_per_block = B * L * L * d_pair * config.pair_intermediate_dim + B * L * L * config.pair_intermediate_dim * d_pair + breakdown['pair_transition'] = pair_transition_per_block * n_blocks + + # Embeddings + breakdown['embeddings'] = B * N * L * config.vocab_size * d_msa + B * L * L * config.pair_input_dim * d_pair + + # Output head + breakdown['output_head'] = B * L * L * d_pair * 64 + + return breakdown + + def _get_device_peak_flops(self) -> float: + """Get peak FLOPS for the current device.""" + if not torch.cuda.is_available(): + return 1e12 # Rough CPU estimate + + device_name = torch.cuda.get_device_name(0).lower() + + # AMD GPU peak FLOPS (FP32) + amd_peak_flops = { + 'mi100': 11.5e12, # 11.5 TFLOPS + 'mi200': 47.9e12, # 47.9 TFLOPS + 'mi250': 47.9e12, # 47.9 TFLOPS + 'mi300': 61.3e12, # 61.3 TFLOPS (FP32) + 'mi300x': 163.4e12, # 163.4 TFLOPS (Matrix ops, FP32) + 'rx 7900': 61.4e12, # 61.4 TFLOPS + 'rx 6900': 23.0e12, # 23.0 TFLOPS + } + + # NVIDIA GPU peak FLOPS (FP32) + nvidia_peak_flops = { + 'h100': 67.0e12, # 67 TFLOPS + 'a100': 19.5e12, # 19.5 TFLOPS + 'v100': 15.7e12, # 15.7 TFLOPS + 'rtx 4090': 
83.0e12, # 83 TFLOPS + 'rtx 3090': 35.6e12, # 35.6 TFLOPS + } + + # Check AMD GPUs + for gpu_name, flops in amd_peak_flops.items(): + if gpu_name in device_name: + return flops + + # Check NVIDIA GPUs + for gpu_name, flops in nvidia_peak_flops.items(): + if gpu_name in device_name: + return flops + + # Default fallback + return 20e12 # 20 TFLOPS as reasonable default + + def _calculate_mfu(self, model_flops: float, time_per_step: float, device_peak_flops: float) -> Dict[str, float]: + """Calculate Model FLOPS Utilization and related efficiency metrics.""" + if time_per_step <= 0 or device_peak_flops <= 0: + return { + 'mfu_percent': 0.0, + 'achieved_flops_per_sec': 0.0, + 'device_peak_flops': device_peak_flops, + 'efficiency_ratio': 0.0 + } + + achieved_flops_per_sec = model_flops / time_per_step + mfu_percent = (achieved_flops_per_sec / device_peak_flops) * 100 + efficiency_ratio = achieved_flops_per_sec / device_peak_flops + + return { + 'mfu_percent': mfu_percent, + 'achieved_flops_per_sec': achieved_flops_per_sec, + 'device_peak_flops': device_peak_flops, + 'efficiency_ratio': efficiency_ratio, + 'achieved_tflops': achieved_flops_per_sec / 1e12, + 'peak_tflops': device_peak_flops / 1e12 + } + + def analyze_computational_intensity(self, flops_data: Dict[str, Any]) -> Dict[str, Any]: + """Analyze computational intensity and memory bandwidth requirements.""" + print(f"\nAnalyzing computational intensity...") + + if not torch.cuda.is_available(): + return {'error': 'CUDA not available for memory bandwidth analysis'} + + # Get model info + model_info = flops_data.get('model_info', {}) + perf_metrics = flops_data.get('performance_metrics', {}) + total_params = model_info.get('total_params', 0) + + # Estimate memory bandwidth requirements + param_size_bytes = total_params * 4 # FP32 + + # Evoformer has significant intermediate activations + batch_size = flops_data['profiling_config']['batch_size'] + seq_len = flops_data['profiling_config']['sequence_length'] + 
msa_seqs = flops_data['profiling_config']['msa_sequences'] + config = model_info['config'] + + # MSA activations: B * N * L * d_msa + msa_activation_size = batch_size * msa_seqs * seq_len * config['msa_dim'] * 4 + # Pair activations: B * L * L * d_pair + pair_activation_size = batch_size * seq_len * seq_len * config['pair_dim'] * 4 + + activation_size_estimate = msa_activation_size + pair_activation_size + + # Memory transfers per step (rough estimate) + # Parameters read once, activations multiple times (forward + 2x backward estimate) + memory_bytes_per_step = param_size_bytes + (activation_size_estimate * 3) + + avg_time = perf_metrics.get('avg_time_per_step', 1.0) + memory_bandwidth_used = memory_bytes_per_step / avg_time if avg_time > 0 else 0 + + # Arithmetic intensity (FLOPS per byte) + avg_flops = flops_data['flops_analysis']['avg_flops_per_step'] + arithmetic_intensity = avg_flops / memory_bytes_per_step if memory_bytes_per_step > 0 else 0 + + # Get device memory bandwidth + device_memory_bandwidth = self._get_device_memory_bandwidth() + + intensity_analysis = { + 'arithmetic_intensity_flops_per_byte': arithmetic_intensity, + 'memory_bandwidth_used_gb_per_sec': memory_bandwidth_used / 1e9, + 'memory_bandwidth_utilization_percent': (memory_bandwidth_used / device_memory_bandwidth) * 100 if device_memory_bandwidth > 0 else 0, + 'device_memory_bandwidth_gb_per_sec': device_memory_bandwidth / 1e9, + 'memory_bound_vs_compute_bound': 'memory_bound' if arithmetic_intensity < 10 else 'compute_bound', + 'memory_breakdown': { + 'parameters_mb': param_size_bytes / 1e6, + 'msa_activations_mb': msa_activation_size / 1e6, + 'pair_activations_mb': pair_activation_size / 1e6, + 'total_memory_per_step_mb': memory_bytes_per_step / 1e6 + }, + 'roofline_metrics': { + 'peak_flops': flops_data['efficiency_metrics']['device_peak_flops'], + 'peak_memory_bandwidth': device_memory_bandwidth, + 'achieved_flops': perf_metrics.get('flops_per_sec', 0), + 'achieved_bandwidth': 
memory_bandwidth_used + } + } + + # Save intensity analysis + intensity_path = self.output_dir / "computational_intensity.json" + with open(intensity_path, 'w') as f: + json.dump(intensity_analysis, f, indent=2) + + print(f" Arithmetic Intensity: {arithmetic_intensity:.2f} FLOPS/byte") + print(f" Memory Bandwidth Used: {memory_bandwidth_used/1e9:.1f} GB/s") + print(f" Memory Bandwidth Utilization: {intensity_analysis['memory_bandwidth_utilization_percent']:.1f}%") + print(f" Memory vs Compute: {intensity_analysis['memory_bound_vs_compute_bound']}") + print(f" Results saved to: {intensity_path}") + + return intensity_analysis + + def _get_device_memory_bandwidth(self) -> float: + """Get peak memory bandwidth for the current device.""" + if not torch.cuda.is_available(): + return 100e9 # 100 GB/s rough CPU estimate + + device_name = torch.cuda.get_device_name(0).lower() + + # AMD GPU memory bandwidth + amd_bandwidth = { + 'mi100': 1228e9, # 1228 GB/s (HBM2) + 'mi200': 1638e9, # 1638 GB/s (HBM2e) + 'mi250': 1638e9, # 1638 GB/s (HBM2e) + 'mi300': 5200e9, # 5200 GB/s (HBM3) + 'mi300x': 5300e9, # 5300 GB/s (HBM3) + 'rx 7900': 960e9, # 960 GB/s (GDDR6) + 'rx 6900': 512e9, # 512 GB/s (GDDR6) + } + + # NVIDIA GPU memory bandwidth + nvidia_bandwidth = { + 'h100': 3350e9, # 3350 GB/s (HBM3) + 'a100': 2039e9, # 2039 GB/s (HBM2e) + 'v100': 1555e9, # 1555 GB/s (HBM2) + 'rtx 4090': 1008e9, # 1008 GB/s (GDDR6X) + 'rtx 3090': 936e9, # 936 GB/s (GDDR6X) + } + + # Check AMD GPUs + for gpu_name, bandwidth in amd_bandwidth.items(): + if gpu_name in device_name: + return bandwidth + + # Check NVIDIA GPUs + for gpu_name, bandwidth in nvidia_bandwidth.items(): + if gpu_name in device_name: + return bandwidth + + # Default fallback + return 1000e9 # 1000 GB/s as reasonable default + + def generate_roofline_data(self, output_dir: str = None) -> str: + """Generate data for roofline model analysis.""" + if output_dir is None: + output_dir = self.output_dir + + output_path = Path(output_dir) + 
output_path.mkdir(parents=True, exist_ok=True) + + # Load existing analysis results + flops_file = self.output_dir / "flops_profile.json" + intensity_file = self.output_dir / "computational_intensity.json" + + if not flops_file.exists(): + return "Error: Run FLOPS profiling first" + + with open(flops_file, 'r') as f: + flops_data = json.load(f) + + intensity_data = {} + if intensity_file.exists(): + with open(intensity_file, 'r') as f: + intensity_data = json.load(f) + + # Prepare roofline data + roofline_data = { + 'model_name': 'Tiny OpenFold V1 Baseline - Evoformer', + 'timestamp': datetime.now().isoformat(), + 'device_info': { + 'name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU', + 'peak_flops': flops_data['efficiency_metrics']['device_peak_flops'], + 'peak_tflops': flops_data['efficiency_metrics']['peak_tflops'], + 'peak_memory_bandwidth': intensity_data.get('device_memory_bandwidth_gb_per_sec', 0) * 1e9 + }, + 'performance_point': { + 'arithmetic_intensity': intensity_data.get('arithmetic_intensity_flops_per_byte', 0), + 'achieved_performance': flops_data['performance_metrics']['flops_per_sec'], + 'achieved_tflops': flops_data['efficiency_metrics']['achieved_tflops'], + 'mfu_percent': flops_data['efficiency_metrics']['mfu_percent'] + }, + 'evoformer_breakdown': flops_data['flops_analysis']['evoformer_breakdown'], + 'optimization_targets': self._generate_optimization_targets(flops_data, intensity_data) + } + + # Save roofline data + roofline_path = output_path / "roofline_data.json" + with open(roofline_path, 'w') as f: + json.dump(roofline_data, f, indent=2) + + print(f"Roofline data generated: {roofline_path}") + return str(roofline_path) + + def _generate_optimization_targets(self, flops_data: Dict, intensity_data: Dict) -> List[Dict[str, str]]: + """Generate optimization targets based on Evoformer analysis.""" + targets = [] + + # MFU-based recommendations + mfu = flops_data['efficiency_metrics']['mfu_percent'] + if mfu < 30: + 
targets.append({ + 'target': 'Kernel Fusion - Evoformer Operations', + 'reason': f'Low MFU ({mfu:.1f}%) indicates kernel launch overhead', + 'expected_improvement': '2-3x speedup potential with fused attention and triangle ops' + }) + + # Arithmetic intensity recommendations + ai = intensity_data.get('arithmetic_intensity_flops_per_byte', 0) + if ai < 10: + targets.append({ + 'target': 'Memory Optimization', + 'reason': f'Low arithmetic intensity ({ai:.2f}) indicates memory-bound operations', + 'expected_improvement': 'Flash Attention for MSA, gradient checkpointing, activation recomputation' + }) + + # Evoformer-specific optimizations + breakdown = flops_data['flops_analysis']['evoformer_breakdown'] + + # Triangle multiplication optimization + triangle_flops = breakdown.get('triangle_multiplication', 0) + total_flops = sum(breakdown.values()) + if triangle_flops / total_flops > 0.2: + targets.append({ + 'target': 'Triangle Multiplication Fusion', + 'reason': f'Triangle mult uses {triangle_flops/total_flops*100:.1f}% of FLOPS', + 'expected_improvement': '30-40% reduction with custom fused kernels' + }) + + # MSA attention optimization + msa_attn_flops = breakdown.get('msa_attention', 0) + if msa_attn_flops / total_flops > 0.15: + targets.append({ + 'target': 'MSA Attention Optimization', + 'reason': f'MSA attention uses {msa_attn_flops/total_flops*100:.1f}% of FLOPS', + 'expected_improvement': 'Flash Attention adaptation for MSA: 2-3x speedup possible' + }) + + # Outer product mean optimization + targets.append({ + 'target': 'Outer Product Mean Fusion', + 'reason': 'Creates large intermediate pair representation', + 'expected_improvement': '20-30% reduction with memory-efficient implementation' + }) + + # General recommendations + targets.extend([ + { + 'target': 'Mixed Precision Training (FP16/BF16)', + 'reason': 'Evoformer has many matmul operations suitable for tensor cores', + 'expected_improvement': '2-3x speedup on modern GPUs with tensor cores' + }, + { + 
def main(argv=None):
    """Command-line entry point for the DeepSpeed FLOPS analysis.

    Parses the CLI options, then either summarizes an existing results file
    (``--analyze-results``) or runs a new single- or multi-GPU profiling
    session through ``EvoformerFLOPSAnalyzer``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        Process exit status: ``0`` on success, ``1`` when DeepSpeed is missing,
        profiling reports an error, or an exception is caught. Invalid CLI
        input exits through ``parser.error`` (status 2).
    """
    parser = argparse.ArgumentParser(description='DeepSpeed FLOPS Profiler for Tiny OpenFold V1')

    # Model configuration
    parser.add_argument('--batch-size', type=int, default=4, help='Batch size for profiling')
    parser.add_argument('--seq-len', type=int, default=64, help='Sequence length')
    parser.add_argument('--num-seqs', type=int, default=16, help='Number of MSA sequences')
    parser.add_argument('--msa-dim', type=int, default=64, help='MSA dimension')
    parser.add_argument('--pair-dim', type=int, default=128, help='Pair dimension')
    parser.add_argument('--num-blocks', type=int, default=4, help='Number of Evoformer blocks')

    # Profiling configuration
    parser.add_argument('--num-steps', type=int, default=10, help='Number of profiling steps')
    parser.add_argument('--output-dir', type=str, default='./flops_analysis', help='Output directory')
    parser.add_argument('--detailed-analysis', action='store_true', help='Enable detailed FLOPS breakdown')

    # Device configuration
    parser.add_argument('--device', type=int, default=None, help='Specific GPU device ID to use (e.g., 0, 1, 2)')
    parser.add_argument('--multi-gpu', action='store_true', help='Profile across all available GPUs')
    parser.add_argument('--devices', type=str, default=None, help='Comma-separated list of GPU IDs (e.g., "0,1,2")')

    # Analysis options
    parser.add_argument('--analyze-results', type=str, help='Analyze existing FLOPS results file')
    parser.add_argument('--generate-roofline', action='store_true', help='Generate roofline analysis data')
    parser.add_argument('--computational-intensity', action='store_true', help='Analyze computational intensity')

    args = parser.parse_args(argv)

    # Validate --devices up front so a typo is reported as a usage error
    # instead of surfacing later as a traceback from the broad handler below.
    device_ids = None
    if args.devices:
        tokens = [tok.strip() for tok in args.devices.split(',')]
        try:
            # Empty tokens (e.g. a trailing comma) are ignored.
            device_ids = [int(tok) for tok in tokens if tok]
        except ValueError:
            parser.error(f"--devices expects comma-separated integers, got {args.devices!r}")
        if not device_ids:
            parser.error(f"--devices did not contain any GPU IDs: {args.devices!r}")

    # Multi-GPU mode is selected by either flag; computed once and reused.
    multi_gpu_mode = bool(args.multi_gpu or device_ids)

    if not DEEPSPEED_AVAILABLE and not args.analyze_results:
        # NOTE(review): the last message says estimates are possible without
        # DeepSpeed, yet this branch stops here - confirm the intended behavior.
        print("=" * 70)
        print("DeepSpeed not available. Please install DeepSpeed for FLOPS profiling.")
        print(" pip install deepspeed")
        print("\nAlternatively, this script can still provide FLOPS estimates without DeepSpeed.")
        print("=" * 70)
        return 1

    # Create analyzer
    analyzer = EvoformerFLOPSAnalyzer(args.output_dir)

    print("=" * 70)
    print("DEEPSPEED FLOPS PROFILER - TINY OPENFOLD V1 (EVOFORMER)")
    print("=" * 70)

    try:
        # Analyze existing results
        if args.analyze_results:
            with open(args.analyze_results, 'r') as f:
                flops_data = json.load(f)
            print(f"📁 Analyzing existing results: {args.analyze_results}")

            # Print summary
            print(f"\nModel: {flops_data['model_info']['architecture']}")
            print(f"Parameters: {flops_data['model_info']['total_params']:,}")
            print(f"FLOPS per step: {flops_data['flops_analysis']['avg_flops_per_step']:.2e}")
            print(f"MFU: {flops_data['efficiency_metrics']['mfu_percent']:.1f}%")

            return 0

        # Run new FLOPS profiling
        config = TinyOpenFoldConfig(
            msa_dim=args.msa_dim,
            pair_dim=args.pair_dim,
            n_evoformer_blocks=args.num_blocks,
            n_seqs=args.num_seqs,
            max_seq_len=args.seq_len
        )

        # Determine profiling mode: single GPU vs multi-GPU
        if multi_gpu_mode:
            # device_ids stays None for a bare --multi-gpu, as before.
            flops_results = analyzer.profile_multi_gpu_flops(
                config=config,
                batch_size=args.batch_size,
                num_steps=args.num_steps,
                device_ids=device_ids
            )
        else:
            flops_results = analyzer.profile_model_flops(
                config=config,
                batch_size=args.batch_size,
                num_steps=args.num_steps,
                detailed_analysis=args.detailed_analysis,
                device_id=args.device
            )

        if 'error' in flops_results:
            print(f"⚠️ FLOPS profiling failed: {flops_results['error']}")
            return 1

        # Computational intensity analysis (only for single GPU)
        if args.computational_intensity and not multi_gpu_mode:
            intensity_results = analyzer.analyze_computational_intensity(flops_results)
            if 'error' not in intensity_results:
                print("✓ Computational intensity analysis completed")

        # Generate roofline data (only for single GPU)
        if args.generate_roofline and not multi_gpu_mode:
            roofline_path = analyzer.generate_roofline_data(args.output_dir)
            print(f"✓ Roofline data generated: {roofline_path}")

        print(f"\n{'='*70}")
        print("FLOPS ANALYSIS COMPLETED SUCCESSFULLY!")
        print(f"{'='*70}")
        print(f"📁 Results saved to: {args.output_dir}")

        # Print metrics based on profiling mode
        if multi_gpu_mode:
            aggregate = flops_results['aggregate_metrics']
            print("\nMulti-GPU Key Metrics:")
            print(f" Number of GPUs: {flops_results['multi_gpu_config']['num_gpus']}")
            print(f" Average MFU: {aggregate['avg_mfu_percent']:.1f}%")
            print(f" MFU Std Dev: {aggregate['mfu_std_dev']:.1f}%")
            print(f" Total System TFLOPS: {aggregate['total_system_tflops']:.2f}")
            print(f" Avg TFLOPS per GPU: {aggregate['avg_tflops_per_gpu']:.2f}")
            print(f" Total Throughput: {aggregate['total_throughput']:.1f} samples/sec")
            print(f" Multi-GPU Efficiency: {aggregate['multi_gpu_efficiency_percent']:.1f}%")
            print(f" Speedup vs Single GPU: {flops_results['comparison']['speedup']:.2f}x")
        else:
            efficiency = flops_results['efficiency_metrics']
            print("\nSingle GPU Key Metrics:")
            print(f" Model FLOPS Utilization (MFU): {efficiency['mfu_percent']:.1f}%")
            print(f" Achieved TFLOPS: {efficiency['achieved_tflops']:.2f}")
            print(f" Peak TFLOPS: {efficiency['peak_tflops']:.2f}")
            print(f" Throughput: {flops_results['performance_metrics']['throughput_samples_per_sec']:.1f} samples/sec")
            print(f" FLOPS per parameter: {flops_results['flops_analysis']['flops_per_parameter']:.2f}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # exit status so wrapper scripts do not print a success banner.
        print(f"❌ Analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # Propagate main()'s status to the shell without needing `import sys`.
    raise SystemExit(main())
#!/bin/bash
################################################################################
# TinyOpenFold V1 - DeepSpeed FLOPS Profiler
#
# This script runs comprehensive FLOPS analysis for the Evoformer architecture
# using DeepSpeed's FLOPS profiler to measure computational efficiency.
#
# Usage:
#   ./run_deepspeed_flops.sh [OPTIONS]
#
# Options:
#   --batch-size   Batch size for profiling (default: 4)
#   --seq-len      Sequence length (default: 64)
#   --num-seqs     Number of MSA sequences (default: 16)
#   --msa-dim      MSA dimension (default: 64)
#   --pair-dim     Pair dimension (default: 128)
#   --num-blocks   Number of Evoformer blocks (default: 4)
#   --num-steps    Number of profiling steps (default: 10)
#   --device       Specific GPU device ID to use (e.g., 0, 1, 2)
#   --multi-gpu    Profile across all available GPUs
#   --devices      Comma-separated GPU IDs (e.g., "0,1,2")
#   --output-dir   Output directory (default: ./flops_analysis)
#   --detailed     Enable detailed FLOPS breakdown
#   --roofline     Generate roofline analysis data
#   --intensity    Analyze computational intensity
#   --all          Run all analysis types
#   --help         Show this help message
#
# Examples:
#   # Basic FLOPS profiling (single GPU, default device)
#   ./run_deepspeed_flops.sh
#
#   # Profile on specific GPU
#   ./run_deepspeed_flops.sh --device 1
#
#   # Multi-GPU profiling (all available GPUs - 8 on MI250X node)
#   ./run_deepspeed_flops.sh --multi-gpu
#
#   # Multi-GPU profiling (specific GPUs - all 8 on MI250X)
#   ./run_deepspeed_flops.sh --devices "0,1,2,3,4,5,6,7"
#
#   # Comprehensive analysis with all features
#   ./run_deepspeed_flops.sh --all --batch-size 8
#
#   # Custom configuration
#   ./run_deepspeed_flops.sh --seq-len 128 --num-blocks 8 --roofline
#
################################################################################

set -e

# Directory holding this script, so the Python entry point is found no matter
# which directory the script is invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Default configuration
BATCH_SIZE=4
SEQ_LEN=64
NUM_SEQS=16
MSA_DIM=64
PAIR_DIM=128
NUM_BLOCKS=4
NUM_STEPS=10
OUTPUT_DIR="./flops_analysis"
DEVICE=""
MULTI_GPU=""
DEVICES=""
DETAILED=""
ROOFLINE=""
INTENSITY=""

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Print only the leading comment block (after the shebang, up to the first
# non-comment line) rather than every comment in the file.
show_help() {
    awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

# Abort with a message when an option that takes a value is given without one;
# otherwise `shift 2` would fail under `set -e` and exit silently.
need_value() {
    if [[ $# -lt 2 || -z "$2" ]]; then
        echo -e "${RED}Option $1 requires a value${NC}" >&2
        echo "Use --help for usage information" >&2
        exit 1
    fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --batch-size)
            need_value "$@"
            BATCH_SIZE="$2"
            shift 2
            ;;
        --seq-len)
            need_value "$@"
            SEQ_LEN="$2"
            shift 2
            ;;
        --num-seqs)
            need_value "$@"
            NUM_SEQS="$2"
            shift 2
            ;;
        --msa-dim)
            need_value "$@"
            MSA_DIM="$2"
            shift 2
            ;;
        --pair-dim)
            need_value "$@"
            PAIR_DIM="$2"
            shift 2
            ;;
        --num-blocks)
            need_value "$@"
            NUM_BLOCKS="$2"
            shift 2
            ;;
        --num-steps)
            need_value "$@"
            NUM_STEPS="$2"
            shift 2
            ;;
        --device)
            need_value "$@"
            DEVICE="$2"
            shift 2
            ;;
        --multi-gpu)
            MULTI_GPU="--multi-gpu"
            shift
            ;;
        --devices)
            need_value "$@"
            DEVICES="$2"
            shift 2
            ;;
        --output-dir)
            need_value "$@"
            OUTPUT_DIR="$2"
            shift 2
            ;;
        --detailed)
            DETAILED="--detailed-analysis"
            shift
            ;;
        --roofline)
            ROOFLINE="--generate-roofline"
            shift
            ;;
        --intensity)
            INTENSITY="--computational-intensity"
            shift
            ;;
        --all)
            DETAILED="--detailed-analysis"
            ROOFLINE="--generate-roofline"
            INTENSITY="--computational-intensity"
            shift
            ;;
        --help)
            show_help
            exit 0
            ;;
        *)
            echo -e "${RED}Unknown option: $1${NC}"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done

echo "========================================================================"
echo -e "${CYAN}TinyOpenFold V1 - DeepSpeed FLOPS Profiler${NC}"
echo " Evoformer Architecture Analysis"
echo "========================================================================"
echo ""

# Check if DeepSpeed is available
if ! python3 -c "import deepspeed" 2>/dev/null; then
    echo -e "${YELLOW}⚠️ Warning: DeepSpeed not installed${NC}"
    echo " The script will provide FLOPS estimates but detailed profiling requires DeepSpeed"
    echo ""
    echo " To install DeepSpeed:"
    echo " pip install deepspeed"
    echo ""
    # `read` returns non-zero on EOF (e.g. stdin is not a terminal); treat that
    # as "no" instead of letting `set -e` kill the script without a message.
    REPLY=""
    read -p "Continue without DeepSpeed? [y/N] " -n 1 -r || REPLY=""
    echo ""
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        exit 1
    fi
fi

# Create output directory
mkdir -p "$OUTPUT_DIR"

# Print configuration
echo -e "${BLUE}Configuration:${NC}"
echo " Batch size: $BATCH_SIZE"
echo " Sequence length: $SEQ_LEN"
echo " MSA sequences: $NUM_SEQS"
echo " MSA dimension: $MSA_DIM"
echo " Pair dimension: $PAIR_DIM"
echo " Evoformer blocks: $NUM_BLOCKS"
echo " Profiling steps: $NUM_STEPS"
echo " Output directory: $OUTPUT_DIR"

# Print device configuration
if [ -n "$MULTI_GPU" ]; then
    echo " Mode: Multi-GPU (all available GPUs)"
elif [ -n "$DEVICES" ]; then
    echo " Mode: Multi-GPU (GPUs: $DEVICES)"
elif [ -n "$DEVICE" ]; then
    echo " Mode: Single GPU (device $DEVICE)"
else
    echo " Mode: Single GPU (default device)"
fi
echo ""

# Check for GPU
if command -v rocm-smi &> /dev/null; then
    echo -e "${GREEN}AMD GPU detected:${NC}"
    rocm-smi --showproductname 2>/dev/null | grep "Card series" || echo " ROCm available"
elif command -v nvidia-smi &> /dev/null; then
    echo -e "${GREEN}NVIDIA GPU detected:${NC}"
    nvidia-smi --query-gpu=name --format=csv,noheader | head -1
else
    echo -e "${YELLOW}⚠️ No GPU detected, will use CPU (slow)${NC}"
fi
echo ""

# Run FLOPS profiling
echo -e "${GREEN}Starting FLOPS profiling...${NC}"
echo "========================================================================"
echo ""

# Collect optional arguments in an array so values are passed through intact
# (no reliance on unquoted word splitting).
EXTRA_ARGS=()
if [ -n "$MULTI_GPU" ]; then
    EXTRA_ARGS+=("$MULTI_GPU")
elif [ -n "$DEVICES" ]; then
    EXTRA_ARGS+=(--devices "$DEVICES")
elif [ -n "$DEVICE" ]; then
    EXTRA_ARGS+=(--device "$DEVICE")
fi
for flag in "$DETAILED" "$ROOFLINE" "$INTENSITY"; do
    if [ -n "$flag" ]; then
        EXTRA_ARGS+=("$flag")
    fi
done

# `|| EXIT_CODE=$?` keeps `set -e` from aborting here, so the failure branch
# below is actually reachable and can report the exit status.
EXIT_CODE=0
python3 "$SCRIPT_DIR/run_deepspeed_flops.py" \
    --batch-size "$BATCH_SIZE" \
    --seq-len "$SEQ_LEN" \
    --num-seqs "$NUM_SEQS" \
    --msa-dim "$MSA_DIM" \
    --pair-dim "$PAIR_DIM" \
    --num-blocks "$NUM_BLOCKS" \
    --num-steps "$NUM_STEPS" \
    --output-dir "$OUTPUT_DIR" \
    "${EXTRA_ARGS[@]}" || EXIT_CODE=$?

echo ""
echo "========================================================================"

if [ $EXIT_CODE -eq 0 ]; then
    echo -e "${GREEN}✓ DeepSpeed FLOPS profiler completed successfully!${NC}"
    echo ""
    echo -e "${CYAN}Results saved to: ${OUTPUT_DIR}${NC}"
    echo ""

    # List generated files
    if [ -f "$OUTPUT_DIR/flops_profile.json" ]; then
        echo "Generated files:"
        ls -lh "$OUTPUT_DIR"/*.json 2>/dev/null | awk '{print " " $9 " (" $5 ")"}'
    fi

    echo ""
    echo -e "${YELLOW}Next steps:${NC}"
    echo " 1. Review FLOPS breakdown by component:"
    echo " cat $OUTPUT_DIR/flops_profile.json | jq '.flops_analysis.evoformer_breakdown'"
    echo ""
    echo " 2. Check Model FLOPS Utilization (MFU):"
    echo " cat $OUTPUT_DIR/flops_profile.json | jq '.efficiency_metrics'"
    echo ""

    if [ -f "$OUTPUT_DIR/computational_intensity.json" ]; then
        echo " 3. View computational intensity analysis:"
        echo " cat $OUTPUT_DIR/computational_intensity.json"
        echo ""
    fi

    if [ -f "$OUTPUT_DIR/roofline_data.json" ]; then
        echo " 4. Review roofline model data:"
        echo " cat $OUTPUT_DIR/roofline_data.json | jq '.optimization_targets'"
        echo ""
    fi

    echo " 5. Compare with PyTorch profiler results:"
    echo " diff <(cat $OUTPUT_DIR/flops_profile.json | jq) <(cat profiles/performance_summary.json | jq)"

else
    echo -e "${RED}✗ FLOPS profiling failed with exit code $EXIT_CODE${NC}"
    exit $EXIT_CODE
fi

echo ""
echo "========================================================================"
class PyTorchProfilerAnalyzer:
    """Run the PyTorch profiler on TinyOpenFold and post-process its output.

    The analyzer drives a short training loop under ``torch.profiler``,
    exports Chrome traces into ``profile_dir``, derives operator-level and
    memory statistics from the aggregated events, and can render the results
    as a Markdown report.

    Attributes:
        profile_dir: Directory that receives traces and JSON analysis files.
        profile_data: The profiler object from the most recent run, if any.
        analysis_results: Dict filled by callers with the outputs of
            ``analyze_operator_performance`` (key ``'operator_analysis'``)
            and ``analyze_memory_usage`` (key ``'memory_analysis'``); the
            report formatters read from it.
    """

    def __init__(self, profile_dir: str):
        self.profile_dir = Path(profile_dir)
        self.profile_data = None
        self.analysis_results = {}

    @staticmethod
    def _event_attr(event, *names, default=0):
        """Return the first attribute in ``names`` that ``event`` exposes.

        Profiler events use ``device_*`` names in some PyTorch releases and
        ``cuda_*`` names in others (presumably older ones - confirm against
        the installed version); listing the preferred name first avoids
        touching a deprecated alias when the new one exists.
        """
        for attr in names:
            if hasattr(event, attr):
                return getattr(event, attr)
        return default

    def run_profiling(
        self,
        config: "TinyOpenFoldConfig",
        batch_size: int = 4,
        num_steps: int = 20,
        warmup_steps: int = 3,
        profile_steps: int = 5,
        include_memory: bool = True,
        include_shapes: bool = True,
        device_id: Optional[int] = None
    ) -> "profile":
        """Run a training loop under the PyTorch profiler.

        Args:
            config: Model configuration used to build the model and dataset.
            batch_size: Samples per training step.
            num_steps: Total number of training steps to execute.
            warmup_steps: Steps skipped by the profiler schedule (``wait``).
            profile_steps: Steps recorded by the profiler (``active``).
            include_memory: Passed to the profiler as ``profile_memory``.
            include_shapes: Passed to the profiler as ``record_shapes``.
            device_id: CUDA device index; ``None`` picks ``cuda`` when
                available, otherwise the CPU.

        Returns:
            The profiler object, also stored in ``self.profile_data``.

        Raises:
            ValueError: If ``device_id`` is not below the visible GPU count.
        """
        print("Starting PyTorch Profiler Analysis - Evoformer Architecture")
        print(f" Profile directory: {self.profile_dir}")
        print(f" Batch size: {batch_size}")
        print(f" Sequence length: {config.max_seq_len}")
        print(f" MSA sequences: {config.n_seqs}")
        print(f" Total steps: {num_steps}")
        print(f" Profile steps: {profile_steps}")
        print(f" Memory profiling: {include_memory}")

        # The schedule below is wait=warmup_steps, warmup=1, active=profile_steps,
        # so recording starts at step index warmup_steps + 1.
        if num_steps <= warmup_steps + 1:
            print(
                f" Warning: num_steps={num_steps} is too small for the profiler schedule "
                f"(wait={warmup_steps}, warmup=1); no steps will be recorded"
            )

        # Setup environment
        setup_deterministic_environment()

        # Device selection
        if device_id is not None:
            if not torch.cuda.is_available():
                print(f" Warning: CUDA not available, ignoring device_id={device_id}")
                device = torch.device("cpu")
            elif device_id >= torch.cuda.device_count():
                raise ValueError(f"Device {device_id} not available. Only {torch.cuda.device_count()} GPU(s) found.")
            else:
                device = torch.device(f"cuda:{device_id}")
                print(f" Using GPU: {device_id}")
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f" Using device: {device}")

        # Create model and dataset
        model = TinyOpenFold(config).to(device)
        dataset = ProteinDataset(config)
        optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

        # Ensure profile directory exists
        self.profile_dir.mkdir(parents=True, exist_ok=True)

        # Configure profiler
        activities = [ProfilerActivity.CPU]
        if torch.cuda.is_available():
            activities.append(ProfilerActivity.CUDA)

        def trace_handler(prof):
            """Export a Chrome trace (and stacks, when supported) per cycle."""
            chrome_trace_path = self.profile_dir / f"trace_step_{prof.step_num}.json"
            prof.export_chrome_trace(str(chrome_trace_path))

            # Export stacks (if available)
            if hasattr(prof, 'export_stacks'):
                stacks_path = self.profile_dir / f"stacks_step_{prof.step_num}.txt"
                try:
                    prof.export_stacks(str(stacks_path), "self_cpu_time_total")
                except Exception as e:
                    # Best effort: stack export is optional output.
                    print(f" Warning: Could not export stacks: {e}")

            print(f" Exported trace for step {prof.step_num}")

        # Run profiling session
        with profile(
            activities=activities,
            record_shapes=include_shapes,
            profile_memory=include_memory,
            with_stack=True,
            with_flops=True,
            with_modules=True,
            schedule=torch.profiler.schedule(
                wait=warmup_steps,
                warmup=1,
                active=profile_steps,
                repeat=1
            ),
            on_trace_ready=trace_handler
        ) as prof:
            model.train()

            # Track timing for throughput
            import time
            step_times = []
            start_time = time.time()

            for step in range(num_steps):
                step_start = time.time()

                # Get batch
                msa_tokens, pair_tokens, targets = dataset.get_batch(batch_size)
                msa_tokens = msa_tokens.to(device)
                pair_tokens = pair_tokens.to(device)
                targets = targets.to(device)

                # Forward pass
                with record_function("forward_pass"):
                    outputs = model(msa_tokens, pair_tokens, targets)
                    loss = outputs['loss']

                # Backward pass
                with record_function("backward_pass"):
                    loss.backward()

                # Optimizer step
                with record_function("optimizer_step"):
                    optimizer.step()
                    optimizer.zero_grad()

                # Profiler step
                prof.step()

                # Wait for queued GPU work so the wall-clock step time is real.
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                step_times.append(time.time() - step_start)

                if step % 10 == 0:
                    print(f" Step {step}/{num_steps}, Loss: {loss.item():.4f}")

        # Calculate and print throughput summary
        total_time = time.time() - start_time
        if step_times:
            total_samples = num_steps * batch_size
            avg_step_time = sum(step_times) / len(step_times)
            avg_throughput = batch_size / avg_step_time

            print(f"\n{'='*70}")
            print("Profiling Throughput Summary:")
            print(f"{'='*70}")
            print(f" Total steps: {num_steps}")
            print(f" Batch size: {batch_size}")
            print(f" Total samples: {total_samples}")
            print(f" Total time: {total_time:.2f} seconds")
            print(f" Average step time: {avg_step_time*1000:.2f} ms")
            print(f" Average throughput: {avg_throughput:.1f} samples/sec")
            print(f" Min step time: {min(step_times)*1000:.2f} ms")
            print(f" Max step time: {max(step_times)*1000:.2f} ms")
            print(f"{'='*70}\n")
        else:
            # num_steps <= 0: nothing to average (avoids ZeroDivisionError).
            print("\nNo training steps were executed; skipping throughput summary.\n")

        # Save profiler data for analysis
        self.profile_data = prof
        return prof

    def analyze_operator_performance(self, prof: "profile") -> Dict[str, Any]:
        """Summarize per-operator timings and flag bottlenecks.

        Writes ``operator_analysis.json`` into ``profile_dir``.

        Args:
            prof: Profiler object returned by ``run_profiling``.

        Returns:
            Dict with ``operator_stats`` (list of per-operator dicts),
            ``bottlenecks``, and the formatted ``cpu_table`` / ``cuda_table``
            (``cuda_table`` is ``None`` without CUDA).
        """
        print("\nAnalyzing operator performance...")

        # Aggregate once; key_averages() rebuilds the event list on every call.
        averages = prof.key_averages()
        cuda_available = torch.cuda.is_available()

        # Get operator statistics
        cpu_stats = averages.table(sort_by="self_cpu_time_total", row_limit=50)
        cuda_stats = averages.table(sort_by="self_cuda_time_total", row_limit=50) if cuda_available else None

        # Calculate total time for percentage calculation
        total_cpu_time = sum(event.cpu_time_total for event in averages)

        # Parse operator data
        operator_data = []
        for event in averages:
            calls = max(1, event.count)
            operator_info = {
                'name': event.key,
                'cpu_time_total': event.cpu_time_total,
                # Per-call average derived from the total. The previous code
                # divided event.cpu_time (already a per-call average) by the
                # call count a second time.
                'cpu_time_avg': event.cpu_time_total / calls,
                'cpu_time_percent': (event.cpu_time_total / total_cpu_time * 100) if total_cpu_time > 0 else 0,
                'count': event.count,
                'input_shapes': str(event.input_shapes) if hasattr(event, 'input_shapes') else '',
                'flops': getattr(event, 'flops', 0)
            }

            if cuda_available:
                device_time_total = self._event_attr(event, 'device_time_total', 'cuda_time_total')
                operator_info.update({
                    'cuda_time_total': device_time_total,
                    'cuda_time_avg': device_time_total / calls,
                    'cuda_memory_usage': self._event_attr(event, 'device_memory_usage', 'cuda_memory_usage')
                })

            operator_data.append(operator_info)

        # Identify bottlenecks
        bottlenecks = self._identify_bottlenecks(operator_data)

        analysis = {
            'operator_stats': operator_data,
            'bottlenecks': bottlenecks,
            'cpu_table': cpu_stats,
            'cuda_table': cuda_stats
        }

        # Save detailed analysis (the formatted tables are left out of the JSON)
        analysis_path = self.profile_dir / "operator_analysis.json"
        with open(analysis_path, 'w', encoding='utf-8') as f:
            serializable_data = {
                'operator_stats': operator_data,
                'bottlenecks': bottlenecks,
                'timestamp': datetime.now().isoformat()
            }
            json.dump(serializable_data, f, indent=2)

        return analysis

    def _identify_bottlenecks(self, operator_data: List[Dict]) -> Dict[str, Any]:
        """Rank operators and match their names against Evoformer patterns.

        Args:
            operator_data: Per-operator dicts as built by
                ``analyze_operator_performance`` (needs ``name`` and
                ``cpu_time_total``; CUDA keys are optional).

        Returns:
            Dict with top-10 lists by CPU time, CUDA time and memory, plus
            ``optimization_targets`` derived from substring matches on the
            lower-cased operator name.
        """
        bottlenecks = {
            'top_cpu_time': [],
            'top_cuda_time': [],
            'memory_intensive': [],
            'low_flops_utilization': [],
            'optimization_targets': []
        }

        # Sort by CPU time
        cpu_sorted = sorted(operator_data, key=lambda x: x['cpu_time_total'], reverse=True)
        bottlenecks['top_cpu_time'] = cpu_sorted[:10]

        # Sort by CUDA time (if available)
        if torch.cuda.is_available():
            cuda_sorted = sorted(operator_data, key=lambda x: x.get('cuda_time_total', 0), reverse=True)
            bottlenecks['top_cuda_time'] = cuda_sorted[:10]

        # Memory intensive operations
        memory_sorted = sorted(operator_data, key=lambda x: x.get('cuda_memory_usage', 0), reverse=True)
        bottlenecks['memory_intensive'] = memory_sorted[:10]

        # Identify Evoformer-specific optimization targets
        optimization_targets = []
        for op in operator_data:
            name = op['name'].lower()

            # MSA Attention optimizations
            if any(keyword in name for keyword in ['matmul', 'linear', 'addmm', 'bmm']):
                if 'msa' in name and any(proj in name for proj in ['q_proj', 'k_proj', 'v_proj']):
                    optimization_targets.append({
                        'operation': op['name'],
                        'optimization': 'MSA Attention Fusion',
                        'potential_benefit': 'Fuse MSA Q/K/V projections and implement Flash Attention',
                        'priority': 'high'
                    })

            # Triangle Multiplication optimizations
            if 'triangle' in name and ('multiply' in name or 'einsum' in name):
                optimization_targets.append({
                    'operation': op['name'],
                    'optimization': 'Triangle Multiplication Fusion',
                    'potential_benefit': 'Fuse triangle update operations to reduce kernel launches',
                    'priority': 'high'
                })

            # Outer Product Mean optimizations
            if 'outer_product' in name or ('einsum' in name and 'outer' in name):
                optimization_targets.append({
                    'operation': op['name'],
                    'optimization': 'Outer Product Optimization',
                    'potential_benefit': 'Use optimized einsum implementations or custom kernels',
                    'priority': 'medium'
                })

            # Pair Representation optimizations
            if 'pair' in name and any(keyword in name for keyword in ['linear', 'matmul']):
                optimization_targets.append({
                    'operation': op['name'],
                    'optimization': 'Pair Update Fusion',
                    'potential_benefit': 'Fuse pair update operations',
                    'priority': 'medium'
                })

            # LayerNorm optimizations
            if 'layernorm' in name or 'layer_norm' in name:
                optimization_targets.append({
                    'operation': op['name'],
                    'optimization': 'LayerNorm Fusion',
                    'potential_benefit': 'Fuse LayerNorm with adjacent operations',
                    'priority': 'low'
                })

        bottlenecks['optimization_targets'] = optimization_targets

        return bottlenecks

    def analyze_memory_usage(self, prof: "profile") -> Dict[str, Any]:
        """Collect per-operator device memory usage and recommendations.

        Writes ``memory_analysis.json`` into ``profile_dir`` on success.

        Args:
            prof: Profiler object returned by ``run_profiling``.

        Returns:
            Dict with ``peak_memory_events``, ``total_memory_allocated`` and
            ``memory_efficiency_recommendations``; or ``{'error': ...}`` when
            CUDA is unavailable or the analysis raises.
        """
        if not torch.cuda.is_available():
            return {'error': 'CUDA not available for memory analysis'}

        print("\nAnalyzing memory usage patterns...")

        memory_analysis = {}

        try:
            # Memory timeline analysis
            memory_events = []
            for event in prof.key_averages():
                usage = self._event_attr(event, 'device_memory_usage', 'cuda_memory_usage')
                if usage > 0:
                    memory_events.append({
                        'name': event.key,
                        'memory_usage': usage,
                        'count': event.count,
                        'avg_memory_per_call': usage / max(1, event.count)
                    })

            memory_events.sort(key=lambda x: x['memory_usage'], reverse=True)

            memory_analysis = {
                'peak_memory_events': memory_events[:20],
                'total_memory_allocated': sum(event['memory_usage'] for event in memory_events),
                'memory_efficiency_recommendations': self._generate_memory_recommendations(memory_events)
            }

            # Save memory analysis
            memory_path = self.profile_dir / "memory_analysis.json"
            with open(memory_path, 'w', encoding='utf-8') as f:
                json.dump(memory_analysis, f, indent=2)

        except Exception as e:
            # Best effort: the caller still gets a dict it can render.
            memory_analysis = {'error': f'Memory analysis failed: {str(e)}'}

        return memory_analysis

    def _generate_memory_recommendations(self, memory_events: List[Dict]) -> List[str]:
        """Derive memory recommendations from per-operator memory records.

        Args:
            memory_events: Dicts with ``name``, ``memory_usage`` and ``count``.

        Returns:
            Human-readable recommendation strings; empty when no rule matches.
        """
        recommendations = []

        # Check for high memory operations
        high_memory_ops = [event for event in memory_events if event['memory_usage'] > 1e6]  # > 1MB

        if high_memory_ops:
            recommendations.append(
                f"High memory operations detected: {len(high_memory_ops)} operations using >1MB. "
                "Consider gradient checkpointing for Evoformer blocks."
            )

        # Check for MSA attention memory patterns
        msa_attention_ops = [event for event in memory_events if 'msa' in event['name'].lower() and 'attention' in event['name'].lower()]
        if msa_attention_ops:
            recommendations.append(
                "MSA attention operations detected. Consider Flash Attention adaptation for memory-efficient MSA computation."
            )

        # Check for triangle operations
        triangle_ops = [event for event in memory_events if 'triangle' in event['name'].lower()]
        if triangle_ops:
            recommendations.append(
                "Triangle operations detected. Memory usage for L²×d pair representations can be reduced with "
                "chunking or gradient checkpointing strategies."
            )

        # Check for temporary tensor creation
        temp_ops = [event for event in memory_events if event['count'] > 100]
        if temp_ops:
            recommendations.append(
                f"High-frequency operations detected: {len(temp_ops)} operations called >100 times. "
                "Consider tensor reuse or pre-allocation strategies, especially for pair representations."
            )

        # Evoformer-specific recommendations
        outer_product_ops = [event for event in memory_events if 'outer_product' in event['name'].lower()]
        if outer_product_ops:
            recommendations.append(
                "Outer product mean operations require O(L²) memory. Consider chunked computation "
                "for longer sequences to reduce peak memory usage."
            )

        return recommendations

    def generate_comprehensive_report(self, output_dir: Optional[str] = None) -> str:
        """Write the Markdown profiling report and return its path.

        Args:
            output_dir: Destination directory; defaults to ``profile_dir``.
                Created if missing.

        Returns:
            Path of ``comprehensive_profiling_report.md`` as a string.
        """
        if output_dir is None:
            output_dir = self.profile_dir

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        report_path = output_path / "comprehensive_profiling_report.md"

        # `or` also covers a stored None (cuda_table is None on CPU-only runs),
        # which would otherwise be printed literally as "None".
        operator_analysis = self.analysis_results.get('operator_analysis', {})
        cpu_table = operator_analysis.get('cpu_table') or 'No data available'
        cuda_table = operator_analysis.get('cuda_table') or 'No data available'

        report_content = f"""# PyTorch Profiler Analysis Report - Tiny OpenFold V1 (Evoformer)

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Profile Directory:** {self.profile_dir}

## Executive Summary

This report provides comprehensive performance analysis of the Tiny OpenFold V1 baseline implementation
using PyTorch's built-in profiler. The analysis focuses on identifying optimization opportunities
for the Evoformer architecture.

## Evoformer Architecture Overview

The Evoformer consists of several key components:
- **MSA Stack**: Row and column attention over multiple sequence alignments
- **Pair Stack**: Triangle multiplication and attention operations
- **Outer Product Mean**: Combines MSA and pair representations
- **Transitions**: Feed-forward networks for MSA and pair

## Analysis Results

### Top CPU Time Consumers

The following operations consume the most CPU time:

```
{cpu_table}
```

### Top CUDA Time Consumers

GPU operations breakdown:

```
{cuda_table}
```

### Memory Usage Analysis

{self._format_memory_analysis()}

### Optimization Recommendations

#### High Priority Optimizations (Evoformer-Specific)

{self._format_optimization_recommendations('high')}

#### Medium Priority Optimizations

{self._format_optimization_recommendations('medium')}

#### Low Priority Optimizations

{self._format_optimization_recommendations('low')}

## Next Steps for Optimization

Based on this analysis, the following optimizations should be considered:

1. **MSA Attention Optimization**: Adapt Flash Attention for row/column MSA attention
2. **Triangle Operation Fusion**: Fuse triangle multiplication and attention kernels
3. **Memory-Efficient Outer Product**: Implement chunked outer product mean computation
4. **Gradient Checkpointing**: Apply to Evoformer blocks for large sequences
5. **Mixed Precision**: Use FP16/BF16 for improved throughput

## Evoformer-Specific Bottlenecks

### Triangle Operations
- **Complexity**: O(L²) for pair representations
- **Optimization**: Kernel fusion, chunking for long sequences
- **Expected Improvement**: 1.5-2× speedup

### MSA Attention
- **Complexity**: O(N×L) for N sequences of length L
- **Optimization**: Flash Attention adaptation
- **Expected Improvement**: 2-3× speedup, 50% memory reduction

### Outer Product Mean
- **Complexity**: O(N×L²)
- **Optimization**: Chunked computation, low-precision accumulation
- **Expected Improvement**: 1.3-1.5× speedup

## Detailed Analysis Files

- **Operator Analysis**: `operator_analysis.json`
- **Memory Analysis**: `memory_analysis.json`
- **Chrome Traces**: `trace_step_*.json`
- **Performance Summary**: `performance_summary.json`

## Visualization

To visualize the profiling results:

1. **TensorBoard**: `tensorboard --logdir {self.profile_dir}`
2. **Chrome Trace**: Open `trace_step_*.json` in Chrome's chrome://tracing

## Comparison with DeepSpeed FLOPS Profiler

For computational efficiency analysis (MFU, FLOPS breakdown), run:
```bash
./run_deepspeed_flops.sh --device 0 --num-steps 50
```

See `PROFILER_RESULTS_COMPARISON.md` for side-by-side comparison.

---
*This report was generated by the TinyOpenFold profiling tools.*
"""

        # The report contains non-ASCII characters (², ×); pin the encoding so
        # writing does not depend on the platform's locale.
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report_content)

        print(f"Comprehensive report generated: {report_path}")
        return str(report_path)

    def _format_memory_analysis(self) -> str:
        """Render ``analysis_results['memory_analysis']`` as Markdown text."""
        memory_data = self.analysis_results.get('memory_analysis', {})

        if 'error' in memory_data:
            return f"Memory analysis unavailable: {memory_data['error']}"

        peak_events = memory_data.get('peak_memory_events', [])[:5]

        if not peak_events:
            return "No memory usage data available."

        lines = ["**Top Memory Consumers:**\n\n"]
        for i, event in enumerate(peak_events, 1):
            lines.append(f"{i}. {event['name']}: {event['memory_usage']/1e6:.1f} MB\n")

        recommendations = memory_data.get('memory_efficiency_recommendations', [])
        if recommendations:
            lines.append("\n**Memory Optimization Recommendations:**\n\n")
            lines.extend(f"- {rec}\n" for rec in recommendations)

        return "".join(lines)

    def _format_optimization_recommendations(self, priority: str) -> str:
        """Render the optimization targets of one priority as Markdown.

        Args:
            priority: Priority label to select (``'high'``, ``'medium'`` or
                ``'low'`` are the values produced by ``_identify_bottlenecks``).

        Returns:
            A bullet list, or a "none identified" sentence when empty.
        """
        bottlenecks = self.analysis_results.get('operator_analysis', {}).get('bottlenecks', {})
        targets = bottlenecks.get('optimization_targets', [])

        priority_targets = [target for target in targets if target.get('priority') == priority]

        if not priority_targets:
            return f"No {priority} priority optimizations identified."

        return "".join(
            f"- **{target['optimization']}**: {target['potential_benefit']}\n"
            f" - Operation: {target['operation']}\n\n"
            for target in priority_targets
        )

    def analyze_existing_profiles(self, profile_dir: str):
        """List the Chrome trace files found in an existing profile directory.

        Only reports which ``trace_step_*.json`` files exist; the traces
        themselves are not parsed.

        Args:
            profile_dir: Directory produced by an earlier profiling run.
        """
        profile_path = Path(profile_dir)

        if not profile_path.exists():
            print(f"Profile directory not found: {profile_dir}")
            return

        # Look for JSON trace files (sorted for a stable listing order)
        trace_files = sorted(profile_path.glob("trace_step_*.json"))

        if not trace_files:
            print(f"No trace files found in: {profile_dir}")
            return

        print(f"Analyzing existing profiles from: {profile_dir}")
        print(f" Found {len(trace_files)} trace files")

        # Analyze each trace file
        for trace_file in trace_files:
            print(f" Analyzing: {trace_file.name}")
            # Note: Full trace analysis would require parsing the Chrome trace format
            # For now, we'll provide summary information

        print("Analysis of existing profiles completed")
default=5, help='Active profiling steps') + parser.add_argument('--profile-dir', type=str, default='./pytorch_profiles', help='Profile output directory') + parser.add_argument('--device', type=int, default=None, help='GPU device ID (e.g., 0, 1, 2)') + + # Analysis options + parser.add_argument('--include-memory', action='store_true', default=True, help='Include memory profiling') + parser.add_argument('--include-shapes', action='store_true', default=True, help='Include tensor shapes') + parser.add_argument('--analyze-existing', type=str, help='Analyze existing profile directory') + parser.add_argument('--generate-report', action='store_true', help='Generate comprehensive report') + parser.add_argument('--output-dir', type=str, help='Output directory for reports') + + args = parser.parse_args() + + # Create analyzer + analyzer = PyTorchProfilerAnalyzer(args.profile_dir) + + # Analyze existing profiles + if args.analyze_existing: + analyzer.analyze_existing_profiles(args.analyze_existing) + return + + # Run new profiling session + config = TinyOpenFoldConfig( + msa_dim=args.msa_dim, + pair_dim=args.pair_dim, + n_evoformer_blocks=args.num_blocks, + n_seqs=args.num_seqs, + max_seq_len=args.seq_len + ) + + print("PYTORCH PROFILER - TINY OPENFOLD V1 (EVOFORMER) ANALYSIS") + print("=" * 70) + + try: + # Run profiling + prof = analyzer.run_profiling( + config=config, + batch_size=args.batch_size, + num_steps=args.num_steps, + warmup_steps=args.warmup_steps, + profile_steps=args.profile_steps, + include_memory=args.include_memory, + include_shapes=args.include_shapes, + device_id=args.device + ) + + # Analyze results + print("\n" + "="*70) + analyzer.analysis_results['operator_analysis'] = analyzer.analyze_operator_performance(prof) + analyzer.analysis_results['memory_analysis'] = analyzer.analyze_memory_usage(prof) + + # Generate report + if args.generate_report: + report_path = analyzer.generate_comprehensive_report(args.output_dir) + print(f"\nReport generated: 
#!/bin/bash
# Run TinyOpenFold V1 with PyTorch Profiler
# This script provides comprehensive profiling with detailed analysis.
#
# It parses a small set of options, echoes the resulting configuration,
# runs run_pytorch_profiler.py with those options, and finally prints
# hints on how to inspect the results.

set -e

echo "========================================================================"
echo "TinyOpenFold V1 - PyTorch Profiler (Evoformer Analysis)"
echo "========================================================================"

# Default parameters
BATCH_SIZE=4
SEQ_LEN=64
NUM_SEQS=16
NUM_STEPS=20
PROFILE_STEPS=5
WARMUP_STEPS=3
PROFILE_DIR="./pytorch_profiles"
DEVICE=""
GENERATE_REPORT=""

# require_value OPTION [VALUE...]
# Abort with a readable message when OPTION was given without a value.
# Without this check `shift 2` fails under `set -e` and the script exits
# silently.
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: option $1 requires a value" >&2
        echo "Use --help for usage information" >&2
        exit 1
    fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --batch-size)
            require_value "$@"
            BATCH_SIZE="$2"
            shift 2
            ;;
        --seq-len)
            require_value "$@"
            SEQ_LEN="$2"
            shift 2
            ;;
        --num-seqs)
            require_value "$@"
            NUM_SEQS="$2"
            shift 2
            ;;
        --num-steps)
            require_value "$@"
            NUM_STEPS="$2"
            shift 2
            ;;
        --profile-steps)
            require_value "$@"
            PROFILE_STEPS="$2"
            shift 2
            ;;
        --warmup-steps)
            # Previously WARMUP_STEPS could only be changed by editing the script.
            require_value "$@"
            WARMUP_STEPS="$2"
            shift 2
            ;;
        --device)
            require_value "$@"
            DEVICE="$2"
            shift 2
            ;;
        --profile-dir)
            require_value "$@"
            PROFILE_DIR="$2"
            shift 2
            ;;
        --generate-report)
            GENERATE_REPORT="--generate-report"
            shift
            ;;
        --help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo " --batch-size Batch size (default: 4)"
            echo " --seq-len Sequence length (default: 64)"
            echo " --num-seqs Number of MSA sequences (default: 16)"
            echo " --num-steps Total profiling steps (default: 20)"
            echo " --profile-steps Active profiling steps (default: 5)"
            echo " --warmup-steps Warmup steps (default: 3)"
            echo " --device GPU device ID (e.g., 0, 1, 2)"
            echo " --profile-dir Profile output directory (default: ./pytorch_profiles)"
            echo " --generate-report Generate comprehensive report"
            echo " --help Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done

# Create profile directory
mkdir -p "$PROFILE_DIR"

echo "Configuration:"
echo " Batch size: $BATCH_SIZE"
echo " Sequence length: $SEQ_LEN"
echo " MSA sequences: $NUM_SEQS"
echo " Total steps: $NUM_STEPS"
echo " Profile steps: $PROFILE_STEPS"
echo " Warmup steps: $WARMUP_STEPS"
echo " Profile directory: $PROFILE_DIR"
if [ -n "$DEVICE" ]; then
    echo " Device: GPU $DEVICE"
else
    echo " Device: Default"
fi
echo ""

# Build the command as an array so that every value stays a single
# argument even if it contains spaces (e.g. a profile directory path).
CMD=(
    python run_pytorch_profiler.py
    --batch-size "$BATCH_SIZE"
    --seq-len "$SEQ_LEN"
    --num-seqs "$NUM_SEQS"
    --num-steps "$NUM_STEPS"
    --profile-steps "$PROFILE_STEPS"
    --warmup-steps "$WARMUP_STEPS"
    --profile-dir "$PROFILE_DIR"
    --include-memory
    --include-shapes
)

if [ -n "$DEVICE" ]; then
    CMD+=(--device "$DEVICE")
fi

if [ -n "$GENERATE_REPORT" ]; then
    CMD+=("$GENERATE_REPORT")
fi

# Run profiler
"${CMD[@]}"

echo ""
echo "========================================================================"
echo "PyTorch profiler analysis completed!"
echo "========================================================================"
echo "Profile data saved to: $PROFILE_DIR"
echo ""
echo "Visualization options:"
echo " 1. Chrome Trace Viewer (RECOMMENDED for timeline):"
echo " - Open Chrome browser"
echo " - Navigate to: chrome://tracing"
echo " - Click 'Load' and select: $PROFILE_DIR/trace_step_*.json"
echo " - Interactive timeline with kernel details"
# The report is only written when --generate-report was passed, so only
# point at it in that case.
if [ -n "$GENERATE_REPORT" ]; then
    echo ""
    echo " 2. Comprehensive Report:"
    echo " less $PROFILE_DIR/comprehensive_profiling_report.md"
fi
echo ""
echo "Analysis files:"
if [ -n "$GENERATE_REPORT" ]; then
    echo " - comprehensive_profiling_report.md: Full analysis with recommendations"
fi
echo " - operator_analysis.json: Detailed operator performance"
echo " - memory_analysis.json: Memory usage patterns"
echo " - trace_step_*.json: Chrome trace format for chrome://tracing"
echo ""
echo "Compare with DeepSpeed FLOPS profiler:"
echo " ./run_deepspeed_flops.sh --device 0 --num-steps 50"
@dataclass
class TinyOpenFoldConfig:
    """Configuration for Tiny OpenFold model - optimized for profiling.

    The attention modules in this file derive their per-head width as
    ``msa_dim // n_heads_msa`` and ``pair_dim // n_heads_pair`` and then
    reshape projections to ``(n_heads, head_dim)``. That only works when
    the dimension is an exact multiple of the head count, so the
    combination is validated at construction time.

    Raises:
        ValueError: if a head count or dimension is not positive, or a
            dimension is not divisible by its head count.
    """
    vocab_size: int = 21  # 20 amino acids + unknown
    msa_dim: int = 64  # MSA representation dimension
    pair_dim: int = 128  # Pair representation dimension
    n_evoformer_blocks: int = 4  # Number of Evoformer blocks
    n_heads_msa: int = 4  # Number of MSA attention heads
    n_heads_pair: int = 4  # Number of pair attention heads
    msa_intermediate_dim: int = 256  # MSA transition intermediate dimension
    pair_intermediate_dim: int = 512  # Pair transition intermediate dimension
    outer_product_dim: int = 32  # Outer product mean dimension
    max_seq_len: int = 64  # Maximum sequence length
    n_seqs: int = 16  # Number of MSA sequences
    pair_input_dim: int = 65  # Pair input features (distance bins, etc.)
    dropout: float = 0.0  # Dropout rate (0 for profiling)
    norm_eps: float = 1e-5  # Layer norm epsilon

    def __post_init__(self) -> None:
        """Reject dimension/head combinations the attention layers cannot reshape."""
        for dim_field, heads_field in (("msa_dim", "n_heads_msa"),
                                       ("pair_dim", "n_heads_pair")):
            dim = getattr(self, dim_field)
            heads = getattr(self, heads_field)
            if dim <= 0 or heads <= 0:
                raise ValueError(
                    f"{dim_field} and {heads_field} must be positive, "
                    f"got {dim_field}={dim}, {heads_field}={heads}"
                )
            if dim % heads != 0:
                # Failing here is clearer than the shape error raised later
                # by .view() inside the attention forward pass.
                raise ValueError(
                    f"{dim_field} ({dim}) must be divisible by {heads_field} ({heads})"
                )

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary."""
        return asdict(self)
def get_available_devices() -> Tuple[list, bool]:
    """
    Detect available GPUs respecting ROCR_VISIBLE_DEVICES/HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES.

    The visibility variables are honoured by the GPU runtime itself: the
    devices they hide never reach PyTorch, and the remaining ones are
    renumbered starting at 0. The IDs written in those variables are
    therefore *physical* IDs and must not be compared against
    ``torch.cuda.device_count()``. Doing so (as this function used to)
    turns a mask such as ``2,3`` into an empty device list even though two
    GPUs are usable. The logical indices ``0 .. device_count() - 1`` are
    exactly the devices the mask left visible, so they are returned
    directly.

    Returns:
        (device_ids, multi_gpu): List of logical device indices usable with
        ``torch.device('cuda', i)`` and whether more than one is available.
        ``([], False)`` when no GPU backend is available.
    """
    if not torch.cuda.is_available():
        return [], False

    # device_count() already reflects any *_VISIBLE_DEVICES mask.
    device_ids = list(range(torch.cuda.device_count()))

    multi_gpu = len(device_ids) > 1
    return device_ids, multi_gpu
class MSARowAttentionWithPairBias(nn.Module):
    """Multi-head attention along each MSA row, with a per-head additive
    bias derived from the pair representation."""

    def __init__(self, config: TinyOpenFoldConfig):
        super().__init__()
        width = config.msa_dim
        heads = config.n_heads_msa

        self.msa_dim = width
        self.n_heads = heads
        self.head_dim = width // heads
        self.scale = self.head_dim ** -0.5

        # Bias-free query/key/value/output projections over the MSA channels.
        self.q_proj = nn.Linear(width, width, bias=False)
        self.k_proj = nn.Linear(width, width, bias=False)
        self.v_proj = nn.Linear(width, width, bias=False)
        self.o_proj = nn.Linear(width, width, bias=False)

        # Maps every pair feature vector to one scalar bias per head.
        self.pair_bias_proj = nn.Linear(config.pair_dim, heads, bias=False)

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, msa: torch.Tensor, pair: torch.Tensor) -> torch.Tensor:
        """
        Args:
            msa: (batch, n_seqs, seq_len, msa_dim)
            pair: (batch, seq_len, seq_len, pair_dim)
        Returns:
            (batch, n_seqs, seq_len, msa_dim)
        """
        with record_function("msa_row_attention"):
            bsz, n_rows, length, _ = msa.shape
            per_head = (bsz, n_rows, length, self.n_heads, self.head_dim)

            with record_function("msa_qkv_projection"):
                # Project, split channels into heads, then move the head axis
                # ahead of the position axis:
                # (batch, n_seqs, n_heads, seq_len, head_dim)
                queries, keys, values = (
                    proj(msa).view(per_head).transpose(2, 3)
                    for proj in (self.q_proj, self.k_proj, self.v_proj)
                )

            with record_function("msa_attention_scores"):
                # (batch, n_seqs, n_heads, seq_len, seq_len)
                logits = (queries @ keys.transpose(-1, -2)) * self.scale

                # (batch, seq_len, seq_len, n_heads) -> (batch, n_heads, seq_len, seq_len);
                # the inserted axis broadcasts the same bias over all MSA rows.
                head_bias = self.pair_bias_proj(pair).permute(0, 3, 1, 2)
                logits = logits + head_bias[:, None]

            with record_function("msa_attention_softmax"):
                probs = self.dropout(logits.softmax(dim=-1))

            with record_function("msa_attention_output"):
                context = probs @ values
                # Merge heads back: (batch, n_seqs, seq_len, msa_dim)
                merged = context.transpose(2, 3).contiguous().view(
                    bsz, n_rows, length, self.msa_dim
                )
                result = self.o_proj(merged)

            return result
class OuterProductMean(nn.Module):
    """Turn the MSA representation into an update for the pair
    representation via a sequence-averaged outer product."""

    def __init__(self, config: TinyOpenFoldConfig):
        super().__init__()
        inner = config.outer_product_dim
        self.msa_to_outer = nn.Linear(config.msa_dim, inner, bias=False)
        # The flattened outer product has inner * inner channels.
        self.outer_to_pair = nn.Linear(inner ** 2, config.pair_dim, bias=False)
        self.layer_norm = nn.LayerNorm(config.msa_dim, eps=config.norm_eps)

    def forward(self, msa: torch.Tensor) -> torch.Tensor:
        """
        Args:
            msa: (batch, n_seqs, seq_len, msa_dim)
        Returns:
            pair_update: (batch, seq_len, seq_len, pair_dim)
        """
        with record_function("outer_product_mean"):
            depth = msa.shape[1]  # number of MSA sequences averaged over

            # LayerNorm, then reduce channels: (batch, n_seqs, seq_len, outer_dim)
            feats = self.msa_to_outer(self.layer_norm(msa))

            with record_function("outer_product_computation"):
                # For every residue pair (i, j): sum over sequences n of
                # feats[n, i] (outer) feats[n, j], divided by n_seqs.
                # Result: (batch, seq_len, seq_len, outer_dim, outer_dim)
                averaged = torch.einsum('bnid,bnje->bijde', feats, feats) / depth

            # Collapse the two outer-product axes into one channel axis:
            # (batch, seq_len, seq_len, outer_dim²)
            flattened = averaged.flatten(start_dim=-2, end_dim=-1)

            return self.outer_to_pair(flattened)
class TriangleAttention(nn.Module):
    """Triangle self-attention (starting or ending node).

    With ``starting=True`` every row ``i`` of the pair tensor attends over
    its entries ``j``. With ``starting=False`` the same attention runs over
    columns; this is implemented by transposing the two residue axes,
    running the row-wise attention, and transposing back.
    """

    def __init__(self, config: TinyOpenFoldConfig, starting: bool = True):
        super().__init__()
        self.starting = starting
        self.n_heads = config.n_heads_pair
        self.head_dim = config.pair_dim // config.n_heads_pair
        self.scale = self.head_dim ** -0.5

        # Q, K, V projections
        self.q_proj = nn.Linear(config.pair_dim, config.pair_dim, bias=False)
        self.k_proj = nn.Linear(config.pair_dim, config.pair_dim, bias=False)
        self.v_proj = nn.Linear(config.pair_dim, config.pair_dim, bias=False)
        self.o_proj = nn.Linear(config.pair_dim, config.pair_dim, bias=False)

        self.layer_norm = nn.LayerNorm(config.pair_dim, eps=config.norm_eps)

    def _attend_over_last_axis(self, x: torch.Tensor) -> torch.Tensor:
        """Multi-head self-attention along axis 2 of ``x``, independently per axis-1 index.

        Args:
            x: (batch, rows, cols, pair_dim), already layer-normalised.
        Returns:
            (batch, rows, cols, pair_dim), heads merged, before the output projection.
        """
        batch_size, rows, cols, pair_dim = x.shape
        head_shape = (batch_size, rows, cols, self.n_heads, self.head_dim)

        # (batch, rows, n_heads, cols, head_dim)
        q = self.q_proj(x).view(head_shape).transpose(2, 3)
        k = self.k_proj(x).view(head_shape).transpose(2, 3)
        v = self.v_proj(x).view(head_shape).transpose(2, 3)

        # (batch, rows, n_heads, cols, cols)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = F.softmax(scores, dim=-1)

        attn_output = torch.matmul(attn_weights, v)
        return attn_output.transpose(2, 3).contiguous().view(batch_size, rows, cols, pair_dim)

    def forward(self, pair: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pair: (batch, seq_len, seq_len, pair_dim)
        Returns:
            (batch, seq_len, seq_len, pair_dim)
        """
        name = "triangle_attn_starting" if self.starting else "triangle_attn_ending"
        with record_function(name):
            pair_norm = self.layer_norm(pair)

            if self.starting:
                # Attention over edges starting from a node: fix i, attend over j
                attn_output = self._attend_over_last_axis(pair_norm)
            else:
                # Attention over edges ending at a node: fix j, attend over i.
                # Swap the residue axes so j becomes the "row" index, then
                # swap back so the result lines up with the input layout.
                attn_output = self._attend_over_last_axis(
                    pair_norm.transpose(1, 2)
                ).transpose(1, 2)

            output = self.o_proj(attn_output)
            return output
class EvoformerBlock(nn.Module):
    """One Evoformer layer: residual updates to the MSA representation
    followed by residual updates to the pair representation."""

    def __init__(self, config: TinyOpenFoldConfig):
        super().__init__()
        msa_width = config.msa_dim
        pair_width = config.pair_dim
        eps = config.norm_eps

        # --- MSA track: sub-modules ---
        self.msa_row_attention = MSARowAttentionWithPairBias(config)
        self.msa_column_attention = MSAColumnAttention(config)
        self.msa_transition = MSATransition(config)

        # --- MSA track: one pre-norm per sub-module ---
        self.msa_norm_row = nn.LayerNorm(msa_width, eps=eps)
        self.msa_norm_col = nn.LayerNorm(msa_width, eps=eps)
        self.msa_norm_trans = nn.LayerNorm(msa_width, eps=eps)

        # --- Pair track: sub-modules ---
        self.outer_product_mean = OuterProductMean(config)
        self.triangle_mult_outgoing = TriangleMultiplication(config, outgoing=True)
        self.triangle_mult_incoming = TriangleMultiplication(config, outgoing=False)
        self.triangle_attn_starting = TriangleAttention(config, starting=True)
        self.triangle_attn_ending = TriangleAttention(config, starting=False)
        self.pair_transition = PairTransition(config)

        # --- Pair track: pre-norms ---
        # NOTE: pair_norm_outer is registered (and so appears in the state
        # dict) but forward() never applies it; the outer-product update is
        # computed from the un-normalised MSA passed straight to
        # outer_product_mean.
        self.pair_norm_outer = nn.LayerNorm(pair_width, eps=eps)
        self.pair_norm_tri_out = nn.LayerNorm(pair_width, eps=eps)
        self.pair_norm_tri_in = nn.LayerNorm(pair_width, eps=eps)
        self.pair_norm_attn_start = nn.LayerNorm(pair_width, eps=eps)
        self.pair_norm_attn_end = nn.LayerNorm(pair_width, eps=eps)
        self.pair_norm_trans = nn.LayerNorm(pair_width, eps=eps)

    def forward(self, msa: torch.Tensor, pair: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            msa: (batch, n_seqs, seq_len, msa_dim)
            pair: (batch, seq_len, seq_len, pair_dim)
        Returns:
            msa, pair (same shapes as input)
        """
        with record_function("evoformer_block"):
            with record_function("evoformer_msa_updates"):
                # Row attention sees the pair tensor as it was on entry.
                row_delta = self.msa_row_attention(self.msa_norm_row(msa), pair)
                msa = msa + row_delta

                col_delta = self.msa_column_attention(self.msa_norm_col(msa))
                msa = msa + col_delta

                msa_ffn_delta = self.msa_transition(self.msa_norm_trans(msa))
                msa = msa + msa_ffn_delta

            with record_function("evoformer_pair_updates"):
                # The pair track consumes the MSA *after* its three updates.
                outer_delta = self.outer_product_mean(msa)
                pair = pair + outer_delta

                tri_out_delta = self.triangle_mult_outgoing(self.pair_norm_tri_out(pair))
                pair = pair + tri_out_delta

                tri_in_delta = self.triangle_mult_incoming(self.pair_norm_tri_in(pair))
                pair = pair + tri_in_delta

                attn_start_delta = self.triangle_attn_starting(self.pair_norm_attn_start(pair))
                pair = pair + attn_start_delta

                attn_end_delta = self.triangle_attn_ending(self.pair_norm_attn_end(pair))
                pair = pair + attn_end_delta

                pair_ffn_delta = self.pair_transition(self.pair_norm_trans(pair))
                pair = pair + pair_ffn_delta

            return msa, pair
self.evoformer_blocks = nn.ModuleList([ + EvoformerBlock(config) for _ in range(config.n_evoformer_blocks) + ]) + + # Structure module + self.structure_module = SimplifiedStructureModule(config) + + # Initialize weights + self._init_weights() + + def _init_weights(self): + """Initialize model weights.""" + for module in self.modules(): + if isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + + def forward(self, msa_tokens: torch.Tensor, pair_features: torch.Tensor, + target_distances: Optional[torch.Tensor] = None) -> dict: + """ + Args: + msa_tokens: (batch, n_seqs, seq_len) - amino acid tokens + pair_features: (batch, seq_len, seq_len, pair_input_dim) - pairwise features + target_distances: (batch, seq_len, seq_len, 1) - ground truth distances (optional) + Returns: + dict with 'distances' and optionally 'loss' + """ + with record_function("model_forward"): + # Embed inputs + with record_function("input_embedding"): + msa = self.msa_embedding(msa_tokens) # (batch, n_seqs, seq_len, msa_dim) + pair = self.pair_embedding(pair_features) # (batch, seq_len, seq_len, pair_dim) + + # Pass through Evoformer blocks + with record_function("evoformer_layers"): + for i, block in enumerate(self.evoformer_blocks): + with record_function(f"evoformer_{i}"): + msa, pair = block(msa, pair) + + # Predict structure + with record_function("structure_prediction"): + predicted_distances = self.structure_module(pair) + + # Calculate loss if targets provided + loss = None + if target_distances is not None: + with record_function("loss_calculation"): + # MSE loss on distances + loss = F.mse_loss(predicted_distances, target_distances) + + return { + 'distances': predicted_distances, + 'loss': loss, + 'pair_repr': pair, + 'msa_repr': msa + } + + +class ProteinDataset: + """Synthetic protein 
dataset for training demonstration.""" + + def __init__(self, config: TinyOpenFoldConfig, num_samples: int = 1000): + self.config = config + self.num_samples = num_samples + + # Generate synthetic data (deterministic) + np.random.seed(42) + + # Random MSA sequences + self.msa_data = np.random.randint( + 0, config.vocab_size, + size=(num_samples, config.n_seqs, config.max_seq_len), + dtype=np.int64 + ) + + # Random pair features (e.g., distance bins) + self.pair_data = np.random.randn( + num_samples, config.max_seq_len, config.max_seq_len, config.pair_input_dim + ).astype(np.float32) + + # Random target distances (simulate true structure) + self.distance_data = np.random.rand( + num_samples, config.max_seq_len, config.max_seq_len, 1 + ).astype(np.float32) * 20.0 # 0-20 Angstroms + + def get_batch(self, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Get a random batch of data. + + Args: + batch_size: Number of samples to draw from the synthetic dataset. + + Returns: + Tuple of (msa_tokens, pair_features, target_distances) tensors for the sampled indices. + """ + # Draw distinct samples when possible; np.random.choice raises ValueError for + # replace=False once batch_size exceeds num_samples, so fall back to replacement then. + indices = np.random.choice(self.num_samples, batch_size, replace=batch_size > self.num_samples) + + msa_tokens = torch.from_numpy(self.msa_data[indices]) + pair_features = torch.from_numpy(self.pair_data[indices]) + target_distances = torch.from_numpy(self.distance_data[indices]) + + return msa_tokens, pair_features, target_distances + + +def setup_pytorch_profiler(profiler_config: ProfilerConfig) -> Optional[profile]: + """Setup PyTorch profiler with comprehensive configuration.""" + if not profiler_config.enable_pytorch_profiler: + return None + + # Ensure profile directory exists + Path(profiler_config.profile_dir).mkdir(parents=True, exist_ok=True) + + # Profiler activities + activities = [ProfilerActivity.CPU] + if torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + # Profiler configuration + profiler = profile( + activities=activities, + record_shapes=True, + profile_memory=profiler_config.enable_memory_profiling, + with_stack=profiler_config.export_stacks, + with_flops=True, + with_modules=True, + experimental_config=torch._C._profiler._ExperimentalConfig( + 
verbose=True + ), + schedule=torch.profiler.schedule( + wait=profiler_config.warmup_steps, + warmup=1, + active=profiler_config.profile_steps, + repeat=1 + ), + on_trace_ready=torch.profiler.tensorboard_trace_handler(profiler_config.profile_dir) + ) + + return profiler + + +def train_tiny_openfold( + config: TinyOpenFoldConfig, + profiler_config: ProfilerConfig, + num_steps: int = 50, + batch_size: int = 4, + learning_rate: float = 3e-4, + use_amp: bool = False, + device_id: Optional[int] = None, + use_data_parallel: bool = True +): + """Train the Tiny OpenFold model with comprehensive profiling (single or multi-GPU). + + Args: + config: Model configuration used to build TinyOpenFold and the synthetic dataset. + profiler_config: Profiling options; profile_dir also receives performance_summary.json. + num_steps: Number of measured training steps, run after 5 untimed warmup steps. + batch_size: Batch size requested from the dataset at every step. + learning_rate: Learning rate passed to AdamW. + use_amp: Run the forward pass under autocast and scale gradients with GradScaler. + device_id: Explicit CUDA device index; when given, multi-GPU mode is not used. + use_data_parallel: Wrap the model in nn.DataParallel when several GPUs are detected. + + Returns: + Tuple of (model, monitor): the trained model and the PerformanceMonitor holding the recorded metrics. + + Raises: + ValueError: If device_id is not smaller than torch.cuda.device_count(). + """ + + # Setup environment + setup_deterministic_environment() + + # Detect available devices + available_devices, multi_gpu_available = get_available_devices() + + # Device selection logic + if device_id is not None: + # Single device mode (explicit selection overrides everything) + if device_id >= torch.cuda.device_count(): + raise ValueError(f"Device {device_id} not available. 
Only {torch.cuda.device_count()} GPU(s) found.") + device = torch.device(f"cuda:{device_id}") + use_multi_gpu = False + print(f"\n Single GPU mode: Using cuda:{device_id} (explicit)") + elif multi_gpu_available and use_data_parallel and len(available_devices) > 1: + # Multi-GPU mode + device = torch.device(f"cuda:{available_devices[0]}") # Primary device + use_multi_gpu = True + + # Show environment variable that was used + env_var = "ROCR_VISIBLE_DEVICES" if os.environ.get('ROCR_VISIBLE_DEVICES') else \ + "HIP_VISIBLE_DEVICES" if os.environ.get('HIP_VISIBLE_DEVICES') else \ + "CUDA_VISIBLE_DEVICES" if os.environ.get('CUDA_VISIBLE_DEVICES') else \ + "all available" + + print(f"\n Multi-GPU mode: Using {len(available_devices)} GPUs") + print(f" Device IDs: {available_devices} (from {env_var})") + print(f" Primary device: cuda:{available_devices[0]}") + print(f" Effective batch size: {batch_size} total (split across GPUs)") + else: + # Default single GPU or CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + use_multi_gpu = False + print(f"\n Single GPU mode: Using default device ({device})") + + # Ensure profile directory exists + if profiler_config.profile_dir: + Path(profiler_config.profile_dir).mkdir(parents=True, exist_ok=True) + + # Create model + model = TinyOpenFold(config) + + # Wrap with DataParallel if multi-GPU + if use_multi_gpu: + model = nn.DataParallel(model, device_ids=available_devices) + print(f" Model wrapped with DataParallel") + + model = model.to(device) + + # Model summary + total_params = sum(p.numel() for p in model.parameters()) + print(f"\nModel Configuration:") + print(f" MSA dimension: {config.msa_dim}") + print(f" Pair dimension: {config.pair_dim}") + print(f" Evoformer blocks: {config.n_evoformer_blocks}") + print(f" MSA sequences: {config.n_seqs}") + print(f" Sequence length: {config.max_seq_len}") + print(f" Total parameters: {total_params:,}") + print(f" Model size: {total_params * 4 / 1e6:.1f} MB (FP32)") 
+ + if isinstance(model, nn.DataParallel): + print(f" Multi-GPU: {len(model.device_ids)} GPUs") + print(f" Device IDs: {model.device_ids}") + print(f" Primary device: {device}") + else: + print(f" Device: {device}") + + # Create dataset + dataset = ProteinDataset(config) + + # Setup optimizer + optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01) + + # Setup mixed precision + scaler = GradScaler() if use_amp else None + + # Setup profiler + pytorch_profiler = setup_pytorch_profiler(profiler_config) + + # Performance monitor + monitor = PerformanceMonitor() + + print(f"\nTraining Configuration:") + print(f" Training steps: {num_steps}") + print(f" Batch size: {batch_size}") + print(f" Learning rate: {learning_rate}") + print(f" Mixed precision: {use_amp}") + print(f" Device: {device}") + print(f" PyTorch Profiler: {profiler_config.enable_pytorch_profiler}") + print(f" Memory Profiling: {profiler_config.enable_memory_profiling}") + + # Training loop + model.train() + + # Warmup steps + warmup_steps = 5 + print(f"\nRunning {warmup_steps} warmup steps...") + + for step in range(warmup_steps): + msa_tokens, pair_features, target_distances = dataset.get_batch(batch_size) + msa_tokens = msa_tokens.to(device) + pair_features = pair_features.to(device) + target_distances = target_distances.to(device) + + if use_amp: + with autocast(): + outputs = model(msa_tokens, pair_features, target_distances) + loss = outputs['loss'].mean() # Average loss across GPUs for DataParallel + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + outputs = model(msa_tokens, pair_features, target_distances) + loss = outputs['loss'].mean() # Average loss across GPUs for DataParallel + loss.backward() + optimizer.step() + + optimizer.zero_grad() + + print(f"Warmup complete. 
Starting measured training loop...") + print("=" * 70) + + # Start the profiler before the measured loop (function level, not inside the loop): + # step() alone never runs the schedule's initial transition, so a schedule with wait=0 + # would begin recording without the trace having been prepared. + if pytorch_profiler: + pytorch_profiler.start() + + for step in range(num_steps): + # Start batch timing + batch_timings = {} + monitor.start_timing() + + # Get batch + with nvtx.range("data_loading"): + msa_tokens, pair_features, target_distances = dataset.get_batch(batch_size) + msa_tokens = msa_tokens.to(device) + pair_features = pair_features.to(device) + target_distances = target_distances.to(device) + + # Forward pass timing + monitor.start_timing() + with nvtx.range("forward_pass"): + if use_amp: + with autocast(): + outputs = model(msa_tokens, pair_features, target_distances) + loss = outputs['loss'].mean() # Average loss across GPUs for DataParallel + else: + outputs = model(msa_tokens, pair_features, target_distances) + loss = outputs['loss'].mean() # Average loss across GPUs for DataParallel + batch_timings['forward'] = monitor.end_timing() + + # Backward pass timing + monitor.start_timing() + with nvtx.range("backward_pass"): + if use_amp: + scaler.scale(loss).backward() + else: + loss.backward() + batch_timings['backward'] = monitor.end_timing() + + # Optimizer step timing + monitor.start_timing() + with nvtx.range("optimizer_step"): + if use_amp: + scaler.step(optimizer) + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + batch_timings['optimizer'] = monitor.end_timing() + + # Total batch time + batch_timings['total'] = sum(batch_timings.values()) + + # Record metrics + monitor.record_batch_metrics(batch_size, loss.item(), batch_timings) + + # PyTorch profiler step + if pytorch_profiler: + pytorch_profiler.step() + + # Progress logging + if step % 10 == 0: + speed = batch_size / batch_timings['total'] if batch_timings['total'] > 0 else 0 + memory_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0 + + print(f"Step {step:3d}/{num_steps} | " + f"Loss: {loss.item():.4f} | " + f"Speed: {speed:5.1f} samples/sec | " + f"Memory: {memory_mb:6.1f} MB | " + f"Time: {batch_timings['total']*1000:5.1f}ms") + 
+ + # After the measured loop (function level): stop the profiler so a trace that is still + # being recorded when training ends is finalized and handed to on_trace_ready. + if pytorch_profiler: + pytorch_profiler.stop() + + print("=" * 70) + + # Performance summary + summary = monitor.get_summary() + avg_speed = summary.get('avg_training_speed', 0) + + print(f"\nPerformance Summary:") + print(f" Total samples processed: {summary.get('total_samples', 0):,}") + print(f" Average training speed: {avg_speed:.1f} samples/sec") + print(f" Average batch time: {summary.get('avg_batch_time', 0)*1000:.1f} ms") + print(f" Average forward time: {summary.get('avg_forward_time', 0)*1000:.1f} ms") + print(f" Average backward time: {summary.get('avg_backward_time', 0)*1000:.1f} ms") + print(f" Average optimizer time: {summary.get('avg_optimizer_time', 0)*1000:.1f} ms") + print(f" Final loss: {summary.get('avg_loss', 0):.4f}") + + if 'peak_memory_mb' in summary: + print(f" Peak memory usage: {summary['peak_memory_mb']:.1f} MB") + + # Save performance data + if profiler_config.profile_dir: + timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S') + + profile_data = { + 'version': 'v1_baseline', + 'timestamp': timestamp_str, + 'config': config.to_dict(), + 'profiler_config': asdict(profiler_config), + 'performance_summary': summary, + 'training_params': { + 'num_steps': num_steps, + 'batch_size': batch_size, + 'learning_rate': learning_rate, + 'use_amp': use_amp + }, + 'system_info': { + 'device': str(device), + # Report the GPU that was actually selected above, not unconditionally device 0 + 'gpu_name': torch.cuda.get_device_name(device) if device.type == 'cuda' else None, + 'pytorch_version': torch.__version__, + 'rocm_version': os.environ.get('ROCM_VERSION', 'N/A'), + 'timestamp_iso': datetime.now().isoformat() + } + } + + profile_path = Path(profiler_config.profile_dir) / "performance_summary.json" + profile_path.parent.mkdir(parents=True, exist_ok=True) + with open(profile_path, 'w') as f: + json.dump(profile_data, f, indent=2) + + print(f"\nPerformance data saved to: {profile_path}") + + return model, monitor + + +def main(): + """Main entry point for Version 1 training.""" + parser = argparse.ArgumentParser(description='Tiny OpenFold V1: PyTorch Baseline with Profiling') + + 
# Model configuration + parser.add_argument('--msa-dim', type=int, default=64, help='MSA dimension') + parser.add_argument('--pair-dim', type=int, default=128, help='Pair dimension') + parser.add_argument('--num-blocks', type=int, default=4, help='Number of Evoformer blocks') + parser.add_argument('--num-seqs', type=int, default=16, help='Number of MSA sequences') + parser.add_argument('--seq-len', type=int, default=64, help='Sequence length') + + # Training configuration + parser.add_argument('--num-steps', type=int, default=50, help='Number of training steps') + parser.add_argument('--batch-size', type=int, default=4, help='Batch size (total across all GPUs)') + parser.add_argument('--learning-rate', type=float, default=3e-4, help='Learning rate') + parser.add_argument('--use-amp', action='store_true', help='Use automatic mixed precision') + parser.add_argument('--device', type=int, default=None, help='Single GPU device index (disables multi-GPU)') + parser.add_argument('--no-data-parallel', action='store_true', help='Disable DataParallel even if multiple GPUs available') + + # Profiling configuration + parser.add_argument('--enable-pytorch-profiler', action='store_true', help='Enable PyTorch profiler') + parser.add_argument('--enable-memory-profiling', action='store_true', help='Enable memory profiling') + parser.add_argument('--enable-all-profiling', action='store_true', help='Enable all profiling features') + parser.add_argument('--profile-operators', action='store_true', help='Profile individual operators') + parser.add_argument('--profile-dir', type=str, default='./pytorch_profiles', help='Profiling output directory') + parser.add_argument('--sort-by', type=str, default='cuda_time_total', help='Sort profiling results by metric') + parser.add_argument('--warmup-steps', type=int, default=3, help='Profiler warmup steps') + parser.add_argument('--profile-steps', type=int, default=5, help='Number of profiling steps') + + # Validation and debugging + 
parser.add_argument('--validate-setup', action='store_true', help='Run validation checks') + + args = parser.parse_args() + + # Print banner + print("=" * 80) + print("TINY OPENFOLD - VERSION 1: PYTORCH BASELINE") + print(" Educational AlphaFold 2 / Evoformer Implementation") + print("=" * 80) + + # Configure model + config = TinyOpenFoldConfig( + msa_dim=args.msa_dim, + pair_dim=args.pair_dim, + n_evoformer_blocks=args.num_blocks, + n_seqs=args.num_seqs, + max_seq_len=args.seq_len, + msa_intermediate_dim=args.msa_dim * 4, + pair_intermediate_dim=args.pair_dim * 4 + ) + + # Configure profiler + profiler_config = ProfilerConfig( + enable_pytorch_profiler=args.enable_pytorch_profiler or args.enable_all_profiling, + enable_memory_profiling=args.enable_memory_profiling or args.enable_all_profiling, + profile_operators=args.profile_operators, + profile_dir=args.profile_dir, + sort_by=args.sort_by, + warmup_steps=args.warmup_steps, + profile_steps=args.profile_steps + ) + + # Validation mode + if args.validate_setup: + print("Running validation checks...") + try: + # Quick validation run + model, monitor = train_tiny_openfold( + config=config, + profiler_config=profiler_config, + num_steps=3, + batch_size=2, + device_id=args.device, + use_data_parallel=not args.no_data_parallel + ) + print("Validation successful! 
Environment ready.") + return + except Exception as e: + print(f"Validation failed: {e}") + # Exit with a non-zero status so calling scripts can detect the failed check + raise SystemExit(1) from e + + # Run training with profiling + try: + model, monitor = train_tiny_openfold( + config=config, + profiler_config=profiler_config, + num_steps=args.num_steps, + batch_size=args.batch_size, + learning_rate=args.learning_rate, + use_amp=args.use_amp, + device_id=args.device, + use_data_parallel=not args.no_data_parallel + ) + + print(f"\nTraining completed successfully!") + + if profiler_config.enable_pytorch_profiler: + print(f"PyTorch profiling data saved to: {args.profile_dir}") + print(f" Launch TensorBoard: tensorboard --logdir {args.profile_dir}") + + print(f"\nNext Steps:") + print(f" 1. Analyze profiling results to identify bottlenecks") + print(f" 2. Review performance metrics and optimization opportunities") + print(f" 3. Experiment with different configurations") + + except Exception as e: + print(f"Training failed: {e}") + import traceback + traceback.print_exc() + # The failure was reported above; exit non-zero instead of returning success + raise SystemExit(1) from e + + +if __name__ == "__main__": + main() + diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/README.md b/MLExamples/TinyOpenFold/version2_pytorch_fused/README.md new file mode 100644 index 00000000..b7962130 --- /dev/null +++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/README.md @@ -0,0 +1,781 @@ +# TinyOpenFold V2: PyTorch Fused - Kernel Fusion and ROCm Tools Integration + +Educational implementation of AlphaFold 2's Evoformer architecture with comprehensive kernel fusion optimizations and ROCm profiling integration. + +## Overview + +Version 2 demonstrates the power of kernel fusion and introduces comprehensive ROCm profiling tools. Building on the baseline analysis from Version 1, this version implements targeted optimizations to achieve significant performance improvements through strategic kernel fusion, Flash Attention, and advanced ROCm profiling integration. 
+ +## Learning Objectives + +After completing this version, you will be able to: + +- Implement QKV fusion for MSA and triangle attention operations +- Integrate Flash Attention for memory-efficient attention computation +- Apply gate/proj fusion in triangle multiplicative updates +- Use ROCm profiling tools (rocprofv3, rocprof-sys-python, rocprof-compute) for hardware-level analysis +- Analyze kernel fusion impact on performance and memory usage +- Interpret ROCm profiling data for optimization insights +- Conduct ablation studies to quantify fusion benefits + +## Key Optimizations Implemented + +### 1. MSA QKV Fusion + +- **Problem**: Separate Q, K, V linear projections create 3 kernel launches per attention operation +- **Solution**: Fused QKV projection with single kernel launch for both row and column attention +- **Expected Benefit**: 20-30% reduction in MSA attention overhead + +### 2. Triangle QKV Fusion + +- **Problem**: Separate Q, K, V projections in triangle attention (starting and ending) +- **Solution**: Combined QKV projections for both triangle attention variants +- **Expected Benefit**: 20-30% reduction in triangle attention overhead + +### 3. Flash Attention Integration + +- **Problem**: Standard attention has O(n²) memory complexity +- **Solution**: PyTorch's scaled_dot_product_attention with Flash Attention +- **Expected Benefit**: 50-80% memory reduction, enables larger sequences + +### 4. Triangle Gate/Proj Fusion + +- **Problem**: Separate gate and proj projections in triangle multiplicative updates +- **Solution**: Combined gate/proj computation with element-wise operations +- **Expected Benefit**: 15-25% triangle operation speedup + +### 5. 
Torch Compile Integration + +- **Problem**: Remaining kernel launch overhead +- **Solution**: Automatic fusion through torch.compile() +- **Expected Benefit**: Additional 10-20% speedup through automatic optimizations + +## Quick Start + +### Environment Setup + +Before running V2, ensure your environment is set up correctly. See the [Environment Setup and Installation](../README.md#environment-setup-and-installation) section in the main README for detailed instructions. + +**Quick summary:** +- Load modules: `module load python/3.12 rocm/7.2` (or `cray-python rocm/7.2`) +- Create and activate venv: `python3 -m venv venv && source venv/bin/activate` +- Install PyTorch (ROCm 7.1 nightly): `pip3 install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/rocm7.1` +- Install DeepSpeed: `pip3 install deepspeed` +- Set up `LD_LIBRARY_PATH` for library loading + +See the main README for complete setup instructions. + +### Basic Fused Training + +```bash +# Ensure you're in the version2_pytorch_fused directory +cd version2_pytorch_fused + +# Default configuration with all fusions enabled +python3 tiny_openfold_v2.py --batch-size 4 --seq-len 64 + +# Expected output shows fusion statistics: +# MSA QKV Fusion: Enabled +# Triangle QKV Fusion: Enabled +# Flash Attention: Enabled +# Triangle Gate/Proj Fusion: Enabled +# Kernel Reduction: 80.0% (48 fewer kernels) +``` + +### Validation Check + +```bash +# Verify fusion optimizations work correctly +python3 tiny_openfold_v2.py --validate-setup + +# Should output: +# V2 validation successful! Fusion setup working properly. 
+``` + +### Compare Fusion vs Baseline + +```bash +# Compare all fusion enabled vs baseline (all fusion disabled) +python3 tiny_openfold_v2.py --compare-fusion --batch-size 4 --num-steps 50 + +# Output shows: +# - Training speed comparison (speedup) +# - Memory usage comparison (reduction) +# - Batch time comparison (improvement) +# - Kernel reduction percentage +``` + +### Enable All Fusions + +```bash +# Explicitly enable all fusion optimizations +python3 tiny_openfold_v2.py --enable-all-fusion --batch-size 4 +``` + +### Baseline Comparison Mode + +```bash +# Run with all fusions disabled (equivalent to V1) +python3 tiny_openfold_v2.py --disable-all-fusion --batch-size 4 +``` + +## Architecture Enhancements and Fusion Techniques + +### Mathematical Foundation of Kernel Fusion + +Kernel fusion combines multiple operations into a single GPU kernel to reduce memory bandwidth requirements and kernel launch overhead. + +#### Fusion Efficiency Analysis + +**Memory Bandwidth Reduction:** + +For QKV Fusion: +- **Separate operations**: 3 × (Input Read + Weight Read + Output Write) +- **Fused operation**: Input Read + 3 × Weight Read + 3 × Output Write (a single output tensor that is 3× the size) +- **Reduction**: ~40% for typical batch sizes (eliminates 2 redundant input reads) + +**Kernel Launch Overhead:** +- Each kernel launch: 5-50 μs depending on operation size +- QKV fusion: 3 launches → 1 launch (saves 10-100 μs per attention) +- Triangle fusion: 4 launches → 2 launches (saves 10-100 μs per triangle op) + +### 1. 
MSA QKV Fusion Implementation + +#### Before Fusion (Baseline) + +```python +# Three separate linear projections - 3 kernel launches +q = self.q_proj(msa) # Kernel 1: GEMM [B,N,S,D] × [D,D] = [B,N,S,D] +k = self.k_proj(msa) # Kernel 2: GEMM [B,N,S,D] × [D,D] = [B,N,S,D] +v = self.v_proj(msa) # Kernel 3: GEMM [B,N,S,D] × [D,D] = [B,N,S,D] + +# Memory reads: 3x MSA tensor + 3x weight matrices +# Memory writes: 3x output tensors +``` + +#### After Fusion (Optimized) + +```python +# Single fused projection - 1 kernel launch +qkv = self.qkv_proj(msa) # Kernel 1: GEMM [B,N,S,D] × [D,3D] = [B,N,S,3D] +q, k, v = qkv.chunk(3, dim=-1) # Tensor view operation (no memory copy) + +# Memory reads: 1x MSA tensor + 1x weight matrix (3x size) +# Memory writes: 1x output tensor (3x size) +# Bandwidth reduction: ~40% (eliminated 2 redundant MSA reads) +``` + +#### Implementation Details + +```python +class FusedMSARowAttention(nn.Module): + def __init__(self, config, fusion_config): + super().__init__() + if fusion_config.enable_qkv_fusion_msa: + # Fused QKV projection - 3 operations combined into 1 + self.qkv_proj = nn.Linear(config.msa_dim, 3 * config.msa_dim, bias=False) + else: + # Separate projections (baseline) + self.q_proj = nn.Linear(config.msa_dim, config.msa_dim, bias=False) + self.k_proj = nn.Linear(config.msa_dim, config.msa_dim, bias=False) + self.v_proj = nn.Linear(config.msa_dim, config.msa_dim, bias=False) +``` + +### 2. Flash Attention Deep Dive + +#### Memory Complexity Analysis + +**Standard Attention Memory:** +- Attention Matrix: O(B × H × S²) +- For S=64: 64² = 4,096 elements per head +- Total Memory: B × H × S² × 4 bytes +- Example: 4 × 4 × 64² × 4 = 262 KB per MSA sequence + +**Flash Attention Memory:** +- Block Size: Typically 64 × 64 +- Memory Usage: O(B × H × S) (linear in sequence length!) 
+- Reduction: S-fold memory reduction (64x for S=64) + +#### Flash Attention Benefits + +```python +# Use PyTorch's optimized Flash Attention +if self.fusion_config.enable_flash_attention: + attn_output = F.scaled_dot_product_attention( + q, k, v, + attn_mask=pair_bias, # Supports attention bias + dropout_p=0.0, + is_causal=False + ) +``` + +**Performance Characteristics:** +- Memory: O(S) instead of O(S²) +- Speed: 2-4x faster for sequences > 32 +- Numerical stability: Built-in overflow protection + +### 3. Triangle Fusion Implementation + +#### Triangle Multiplicative Update Fusion + +**Before Fusion:** +```python +# Four separate projections - 4 kernel launches +left = self.left_proj(pair) # Kernel 1 +right = self.right_proj(pair) # Kernel 2 +left_g = self.left_gate(pair) # Kernel 3 +right_g = self.right_gate(pair) # Kernel 4 + +left = left * torch.sigmoid(left_g) +right = right * torch.sigmoid(right_g) +``` + +**After Fusion:** +```python +# Two fused projections - 2 kernel launches +proj = self.left_right_proj(pair) # Kernel 1: Combined left+right +left, right = proj.chunk(2, dim=-1) + +gate = self.left_right_gate(pair) # Kernel 2: Combined gates +left_g, right_g = gate.chunk(2, dim=-1) + +left = left * torch.sigmoid(left_g) +right = right * torch.sigmoid(right_g) +# Reduction: 4 kernels → 2 kernels (50% fewer launches) +``` + +### 4. 
Torch Compile Integration + +```python +# Apply torch.compile for automatic fusion +if fusion_config.enable_torch_compile: + model = torch.compile( + model, + mode='default', # or 'max-autotune' for aggressive optimization + dynamic=False + ) +``` + +**Torch Compile Optimizations:** +- Automatic elementwise operation fusion +- Memory layout optimization +- Shape specialization +- AMD GPU-specific optimizations + +## Fusion Performance Analysis Framework + +### Kernel Count Analysis + +**Per Evoformer Block:** +- **Baseline**: 20 attention/triangle projection launches, plus ~5 other ops + - MSA row attention: 3 (Q,K,V) + - MSA column attention: 3 (Q,K,V) + - Triangle mult out: 4 (left_proj, right_proj, left_gate, right_gate) + - Triangle mult in: 4 (left_proj, right_proj, left_gate, right_gate) + - Triangle attn start: 3 (Q,K,V) + - Triangle attn end: 3 (Q,K,V) + - Other ops: ~5 (transitions, outer product, etc.) + +- **With All Fusions**: 8 attention/triangle projection launches, plus ~5 other ops + - MSA row attention: 1 (fused QKV) + - MSA column attention: 1 (fused QKV) + - Triangle mult out: 2 (fused proj, fused gate) + - Triangle mult in: 2 (fused proj, fused gate) + - Triangle attn start: 1 (fused QKV) + - Triangle attn end: 1 (fused QKV) + - Other ops: ~5 (unchanged) + +- **Kernel Reduction**: 12 kernels per block (20 → 8, a 60% reduction in attention/triangle ops). Note that `get_fusion_statistics()` reports the same 12-kernel saving against its own per-block counts (15 → 3), which is where the 80% figure in the program output comes from. + +### Expected Performance Gains + +| Optimization | Impact | Memory Reduction | Kernel Reduction | Implementation Effort | +|-------------|--------|------------------|------------------|---------------------| +| **MSA QKV Fusion** | 1.2-1.4x | 15-25% | 67% (6→2 kernels) | Low | +| **Triangle QKV Fusion** | 1.2-1.3x | 15-25% | 67% (6→2 kernels) | Low | +| **Flash Attention** | 1.3-2.0x | 50-80% | Attention optimized | Medium | +| **Triangle Fusion** | 1.1-1.3x | 10-20% | 50% (8→4 kernels) | Low | +| **Torch Compile** | 1.1-1.2x | 5-10% | 10-30% | Very Low | +| **Combined Effect** | **1.5-2.2x** | **50-80%** | **60-80%** | - | + +## Profiling and Analysis + +### PyTorch Profiler with 
Fusion Analysis + +```bash +# Basic profiling with fusion analysis +python3 run_pytorch_profiler.py --batch-size 4 --profile-dir ./fusion_analysis + +# View comprehensive report +less fusion_analysis/comprehensive_profiling_report.md + +# Compare with baseline (all fusions disabled) +python3 run_pytorch_profiler.py --disable-all-fusion --profile-dir ./baseline_analysis +``` + +**Provides:** +- Fusion-specific kernel analysis +- Kernel count reduction measurement +- Flash Attention performance tracking +- Memory bandwidth utilization + +### ROCm Profiling Suite + +AMD offers three performance profiling tools for ROCm-based applications: + +#### 1. rocprofv3 - Kernel Statistics + +```bash +# Basic kernel profiling +./run_rocprofv3.sh --batch-size 4 --seq-len 64 + +# View kernel statistics +less rocprofv3_profiles_v2/rocprofv3_summary.txt +``` + +**Key Metrics:** +- Kernel execution times +- Kernel call counts (verify fusion effectiveness) +- GPU utilization + +#### 2. rocprof-sys-python - Python Call Stack Profiling + +`rocprof-sys-python` provides Python call stack profiling with source-level instrumentation, enabling detailed analysis of function call counts and timing. + +```bash +# Basic profiling with defaults (batch-size=2, seq-len=16 for smaller output) +./run_rocprof_sys.sh + +# Custom batch size and sequence length +./run_rocprof_sys.sh --batch-size 4 --seq-len 64 + +# Direct command-line usage +rocprof-sys-python --trace -- ./tiny_openfold_v2.py --batch-size 2 --seq-len 16 +``` + +**Output Files:** +- **ROCPD format** (`.rocpd` or `.rocpd.json`) - Recommended for AI/ML workloads with better thread support +- **Perfetto trace** (`.proto`) - Timeline visualization +- **Call stack data** (`trip_count-*.txt/json`, `wall_clock-*.txt/json`) - Function call counts and timing +- **Metadata** (`metadata-*.json`, `functions-*.json`) - Function and source information + +**Visualization:** +```bash +# For Perfetto traces: +# 1. 
Copy .proto file to your local machine +# 2. Open https://ui.perfetto.dev in your browser +# 3. Click 'Open trace file' and select the .proto file + +# For ROCPD format: +# Use ROCm tools or compatible viewers for AI/ML workload analysis +``` + +**Key Insights:** +- Python function call stack with call counts +- Function-level timing (wall clock, CPU time) +- CPU-GPU synchronization patterns +- Memory usage tracking (peak RSS, page RSS) +- Thread-level profiling + +**Documentation:** +- ROCm Systems Profiler Python Guide: https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html + +**Note:** Default batch size (2) and sequence length (16) are optimized for profiling to reduce output file sizes. For production analysis, use larger values with `--batch-size` and `--seq-len` flags. + +#### 3. rocprof-compute - Hardware Analysis + +```bash +# Generate roofline plots +./run_rocprof_compute.sh --roof-only --batch-size 4 + +# Full profile with dispatch analysis +./run_rocprof_compute.sh --batch-size 4 + +# Analyze specific dispatch +./run_rocprof_compute.sh --mode analyze --dispatch 1538 +``` + +**Key Metrics:** +- Roofline analysis (compute vs memory bound) +- Memory bandwidth utilization +- Hardware counter analysis + +### Comprehensive Profiling Suite + +```bash +# Run all profilers in one go +./run_all_profilers.sh --batch-size 4 --seq-len 64 + +# Quick profiling (skip rocprof-sys) +./run_all_profilers.sh --quick --batch-size 4 + +# View summary +less complete_profiling_*/PROFILING_SUMMARY.md +``` + +## Ablation Studies + +### Testing Individual Fusions + +```bash +# Only MSA QKV fusion +python3 tiny_openfold_v2.py \ + --disable-qkv-fusion-triangle \ + --disable-flash-attention \ + --disable-triangle-fusion + +# Only Flash Attention +python3 tiny_openfold_v2.py \ + --disable-qkv-fusion-msa \ + --disable-qkv-fusion-triangle \ + --disable-triangle-fusion + +# Only Triangle fusion +python3 tiny_openfold_v2.py \ + 
--disable-qkv-fusion-msa \ + --disable-qkv-fusion-triangle \ + --disable-flash-attention +``` + +### Automated Ablation Study + +```bash +# Run comprehensive ablation study +./run_pytorch_profiler.sh --ablation --batch-size 4 + +# Results saved to pytorch_profiles_v2_ablation_*/ +``` + +## Performance Study Launcher + +```bash +# Standard performance study across configurations +./launch_performance_study.sh \ + --batch-sizes "2 4 8" \ + --seq-lens "32 64 128" \ + --num-runs 3 + +# Include baseline comparison +./launch_performance_study.sh --num-runs 3 + +# Include ablation study +./launch_performance_study.sh --ablation --num-runs 3 + +# View results +cat performance_study_*/results_summary.json +``` + +## Comparison with Version 1 + +### Running Comparative Analysis + +```bash +# Run V1 baseline +cd ../version1_pytorch_baseline +python3 tiny_openfold_v1.py --batch-size 4 --seq-len 64 --num-steps 50 \ + --profile-dir ./v1_comparison + +# Run V2 with comparison +cd ../version2_pytorch_fused +python3 tiny_openfold_v2.py --batch-size 4 --seq-len 64 --num-steps 50 \ + --compare-with-v1 ../version1_pytorch_baseline/v1_comparison/performance_summary.json +``` + +### Expected Improvements + +Based on the fusion optimizations: +- **Speedup**: 1.5-2.2x training throughput +- **Memory**: 50-80% reduction (with Flash Attention) +- **Kernel Count**: 60-80% reduction in attention/triangle kernels +- **GPU Utilization**: Improved from better kernel efficiency + +## Command Reference + +### Model Configuration + +```bash +--msa-dim 64 # MSA representation dimension +--pair-dim 128 # Pair representation dimension +--num-blocks 4 # Number of Evoformer blocks +--num-seqs 16 # Number of MSA sequences +--seq-len 64 # Sequence length (residues) +``` + +### Training Parameters + +```bash +--num-steps 50 # Training iterations +--batch-size 4 # Batch size +--learning-rate 3e-4 # Learning rate +--use-amp # Enable mixed precision (FP16) +``` + +### Fusion Configuration + +```bash +# 
Enable/disable specific fusions +--enable-qkv-fusion-msa # MSA QKV fusion (default: on) +--disable-qkv-fusion-msa # Disable MSA QKV fusion +--enable-qkv-fusion-triangle # Triangle QKV fusion (default: on) +--disable-qkv-fusion-triangle # Disable triangle QKV fusion +--enable-flash-attention # Flash Attention (default: on) +--disable-flash-attention # Disable Flash Attention +--enable-triangle-fusion # Triangle gate/proj fusion (default: on) +--disable-triangle-fusion # Disable triangle fusion +--enable-torch-compile # Enable torch.compile +--torch-compile-mode default # Torch compile mode + +# Fusion presets +--enable-all-fusion # Enable everything +--disable-all-fusion # Baseline mode (no fusions) +``` + +### Profiling Options + +```bash +--enable-pytorch-profiler # Enable PyTorch profiler +--enable-memory-profiling # Track memory usage +--enable-rocm-profiling # Enable ROCm tools integration +--enable-all-profiling # Enable all profiling +--profile-dir PATH # Output directory +``` + +## Code Structure + +### Main Fusion Classes + +**`FusionConfig`**: Configuration dataclass for fusion options + +**`FusedMSARowAttention`**: MSA row attention with QKV fusion + Flash Attention +- Fused QKV projection or separate (configurable) +- Flash Attention integration with pair bias +- Fallback to standard attention + +**`FusedMSAColumnAttention`**: MSA column attention with QKV fusion + Flash Attention +- Fused QKV projection +- Flash Attention for column-wise operations + +**`FusedTriangleMultiplication`**: Triangle update with gate/proj fusion +- Fused left_right_proj (2 ops → 1) +- Fused left_right_gate (2 ops → 1) +- Einstein summation for triangle computation + +**`FusedTriangleAttention`**: Triangle attention with QKV fusion + Flash Attention +- Fused QKV projections +- Flash Attention for edge attention + +**`FusedEvoformerBlock`**: Complete Evoformer with all fusions +- Integrates all fused components +- Maintains compatibility with baseline architecture + 
+**`TinyOpenFoldV2`**: Main model class with fusion support +- Accepts FusionConfig parameter +- Supports torch.compile wrapper +- Fusion statistics reporting + +### Fusion Statistics + +```python +# Get fusion statistics from model +fusion_stats = model.get_fusion_statistics() + +# Returns: +# { +# 'qkv_fusion_msa_enabled': True, +# 'qkv_fusion_triangle_enabled': True, +# 'flash_attention_enabled': True, +# 'triangle_fusion_enabled': True, +# 'baseline_kernels_per_block': 15, +# 'fused_kernels_per_block': 3, +# 'kernel_reduction_percent': 80.0, +# 'total_kernel_reduction': 48 +# } +``` + +## Debugging Tips + +### Fusion Not Working + +```bash +# Check Flash Attention availability +python3 -c "import torch.nn.functional as F; print(hasattr(F, 'scaled_dot_product_attention'))" + +# Check torch.compile availability +python3 -c "import torch; print(hasattr(torch, 'compile'))" + +# Run with fusion disabled to compare +python3 tiny_openfold_v2.py --disable-all-fusion +``` + +### Numerical Accuracy Verification + +```bash +# Verify that fused version produces numerically equivalent outputs to baseline +python3 tiny_openfold_v2.py --verify-accuracy --batch-size 4 + +# Output shows: +# - Absolute differences (max, mean) +# - Relative differences (max, mean) +# - Numerical equivalence check (PASS/FAIL) +# - Tolerance: rtol=1e-3, atol=1e-4 +``` + +**What it does:** +- Creates both fused and unfused models with identical weights +- Runs inference with the same inputs +- Compares outputs using `torch.allclose()` with tolerance `rtol=1e-3, atol=1e-4` +- Reports absolute and relative differences + +**Expected result:** ✓ PASS - Fusion optimizations should produce outputs within numerical precision tolerance + +### Performance Debugging + +```bash +# Profile with different fusion combinations +python3 tiny_openfold_v2.py --disable-flash-attention --enable-pytorch-profiler +python3 tiny_openfold_v2.py --disable-qkv-fusion-msa --enable-pytorch-profiler + +# Compare kernel counts 
+grep "kernel" pytorch_profiles_v2/fusion_analysis.json +``` + +## Understanding Fusion Impact + +### Key Areas to Study in Code + +1. **FusedMSARowAttention** (lines ~276-384) + - QKV fusion implementation + - Flash Attention integration with pair bias + - Fallback to baseline + +2. **FusedTriangleMultiplication** (lines ~532-602) + - Gate/proj fusion technique + - Chunk operations for splitting + - Performance comparison points + +3. **get_fusion_statistics()** (lines ~873-907) + - Kernel reduction calculation + - Fusion effectiveness metrics + +4. **Training loop with fusion tracking** (lines ~1106-1175) + - Fusion statistics collection + - Performance monitoring integration + +## Workshop Exercises + +### Exercise 1: Kernel Fusion Analysis + +**Objective**: Quantify the impact of kernel fusion on performance. + +```bash +# Run baseline (V1 or V2 with fusions disabled) +python3 tiny_openfold_v2.py --disable-all-fusion --batch-size 4 --num-steps 50 \ + --profile-dir ./baseline + +# Run with all fusions +python3 tiny_openfold_v2.py --enable-all-fusion --batch-size 4 --num-steps 50 \ + --profile-dir ./fused + +# Compare results +diff baseline/performance_summary_v2.json fused/performance_summary_v2.json +``` + +**Expected Results:** +- 1.5-2.2x speedup in training speed +- 60-80% reduction in major kernel launches +- 50-80% memory reduction with Flash Attention + +### Exercise 2: Flash Attention Memory Analysis + +**Objective**: Analyze memory efficiency improvements from Flash Attention. 
+ +```bash +# Test with Flash Attention disabled +python3 tiny_openfold_v2.py --disable-flash-attention --seq-len 128 \ + --enable-memory-profiling --profile-dir ./no_flash + +# Test with Flash Attention enabled +python3 tiny_openfold_v2.py --enable-flash-attention --seq-len 128 \ + --enable-memory-profiling --profile-dir ./with_flash + +# Compare peak memory usage +grep "peak_memory_mb" */performance_summary_v2.json +``` + +**Expected Results:** +- Linear memory scaling with Flash Attention +- 50-80% memory reduction for sequences > 64 +- Enables larger batch sizes or sequence lengths + +### Exercise 3: ROCm Profiling Deep Dive + +**Objective**: Use ROCm tools for hardware-level analysis. + +```bash +# rocprofv3 for kernel statistics +./run_rocprofv3.sh --batch-size 4 --seq-len 64 + +# rocprof-compute for roofline analysis +./run_rocprof_compute.sh --roof-only --batch-size 4 + +# Compare kernel counts with baseline +# Verify fusion effectiveness at hardware level +``` + +**Expected Results:** +- Detailed kernel execution times +- Verification of kernel count reduction +- Memory bandwidth improvements + +## Next Steps + +After mastering Version 2: + +1. **Analyze Fusion Impact** + - Compare profiling results with V1 baseline + - Identify which fusions provide most benefit + - Understand trade-offs and limitations + +2. **ROCm Profiling Mastery** + - Learn to interpret roofline plots + - Identify memory vs compute bound operations + - Use hardware counters for optimization + +3. **Ablation Studies** + - Test individual fusion contributions + - Find optimal fusion combinations for your workload + - Understand fusion interactions + +4. 
**Production Considerations** + - Apply learnings to real AlphaFold/OpenFold + - Consider custom kernel implementations (Version 3) + - Scale to multi-GPU deployments + +## Resources + +### AlphaFold 2 & OpenFold +- AlphaFold 2 Paper: https://www.nature.com/articles/s41586-021-03819-2 +- OpenFold GitHub: https://github.com/aqlaboratory/openfold +- OpenFold Documentation: https://openfold.readthedocs.io/ + +### Flash Attention +- Flash Attention Paper: https://arxiv.org/abs/2205.14135 +- Flash Attention v2: https://arxiv.org/abs/2307.08691 +- PyTorch Documentation: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html + +### ROCm Profiling +- ROCm Documentation: https://rocm.docs.amd.com/ +- rocprof-compute Guide: https://rocm.docs.amd.com/projects/rocprofiler-compute/ +- AMD GPU Architecture: https://www.amd.com/en/technologies/cdna + +### Parent Directory +- See `../ARCHITECTURE.md` for detailed Evoformer architecture +- See `../version1_pytorch_baseline/README.md` for baseline implementation +- See `PLAN.md` for complete implementation roadmap + +--- + +**Questions or Issues?** + +Check the comprehensive profiling reports, examine fusion statistics, or compare with the baseline implementation for detailed understanding of each optimization. 
+ diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/launch_performance_study.sh b/MLExamples/TinyOpenFold/version2_pytorch_fused/launch_performance_study.sh new file mode 100755 index 00000000..e4d4ead7 --- /dev/null +++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/launch_performance_study.sh @@ -0,0 +1,280 @@ +#!/bin/bash + +# Performance Study Launcher for Tiny OpenFold V2 +# Automates comparative performance analysis across configurations + +set -e + +# Color codes +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_step() { echo -e "${BLUE}[STEP]${NC} $1"; } + +# Default configuration +STUDY_NAME="performance_study_$(date +%Y%m%d_%H%M%S)" +NUM_RUNS=3 +BATCH_SIZES="2 4 8" +SEQ_LENS="32 64 128" +NUM_STEPS=50 +DEVICE=0 +RUN_BASELINE=true +RUN_ABLATION=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --study-name) STUDY_NAME="$2"; shift 2 ;; + --num-runs) NUM_RUNS="$2"; shift 2 ;; + --batch-sizes) BATCH_SIZES="$2"; shift 2 ;; + --seq-lens) SEQ_LENS="$2"; shift 2 ;; + --num-steps) NUM_STEPS="$2"; shift 2 ;; + --device) DEVICE="$2"; shift 2 ;; + --no-baseline) RUN_BASELINE=false; shift ;; + --ablation) RUN_ABLATION=true; shift ;; + --help|-h) + echo "Performance Study Launcher for Tiny OpenFold V2" + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --study-name NAME Study name (default: timestamped)" + echo " --num-runs N Number of runs per config (default: 3)" + echo " --batch-sizes \"N...\" Batch sizes to test (default: \"2 4 8\")" + echo " --seq-lens \"N...\" Sequence lengths to test (default: \"32 64 128\")" + echo " --num-steps N Training steps per run (default: 50)" + echo " --device N GPU device (default: 0)" + echo " --no-baseline Skip baseline comparison" + echo " --ablation Run fusion ablation study" + echo "" + echo "Examples:" + echo " $0 # Standard study" + echo " $0 --num-runs 5 --batch-sizes \"4 8 16\" # Custom 
config" + echo " $0 --ablation # With ablation study" + exit 0 + ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +mkdir -p "$STUDY_NAME" +cd "$STUDY_NAME" + +log_info "======================================================================" +log_info "Tiny OpenFold V2 - Performance Study" +log_info "======================================================================" +echo "" +log_info "Study Configuration:" +log_info " Study name: $STUDY_NAME" +log_info " Runs per configuration: $NUM_RUNS" +log_info " Batch sizes: $BATCH_SIZES" +log_info " Sequence lengths: $SEQ_LENS" +log_info " Steps per run: $NUM_STEPS" +log_info " Device: $DEVICE" +log_info " Run baseline: $RUN_BASELINE" +log_info " Run ablation: $RUN_ABLATION" +echo "" + +# Save configuration +cat > config.json << EOF +{ + "study_name": "$STUDY_NAME", + "num_runs": $NUM_RUNS, + "batch_sizes": [$BATCH_SIZES], + "seq_lens": [$SEQ_LENS], + "num_steps": $NUM_STEPS, + "device": $DEVICE, + "run_baseline": $RUN_BASELINE, + "run_ablation": $RUN_ABLATION, + "timestamp": "$(date --iso-8601=seconds)" +} +EOF + +# Main study: All fusions enabled +log_step "Running main performance study (all fusions enabled)..." + +for batch_size in $BATCH_SIZES; do + for seq_len in $SEQ_LENS; do + config_name="b${batch_size}_s${seq_len}" + log_info "Testing configuration: batch_size=$batch_size, seq_len=$seq_len" + + for run in $(seq 1 $NUM_RUNS); do + log_info " Run $run/$NUM_RUNS..." + python ../tiny_openfold_v2.py \ + --batch-size $batch_size \ + --seq-len $seq_len \ + --num-steps $NUM_STEPS \ + --profile-dir "${config_name}_run${run}" \ + > "${config_name}_run${run}.log" 2>&1 + done + + log_info " ✓ Configuration complete" + done +done + +# Baseline comparison +if [ "$RUN_BASELINE" = true ]; then + log_step "Running baseline comparison (all fusions disabled)..." 
+ + for batch_size in $BATCH_SIZES; do + for seq_len in $SEQ_LENS; do + config_name="b${batch_size}_s${seq_len}_baseline" + log_info "Testing baseline: batch_size=$batch_size, seq_len=$seq_len" + + for run in $(seq 1 $NUM_RUNS); do + log_info " Run $run/$NUM_RUNS..." + python ../tiny_openfold_v2.py \ + --batch-size $batch_size \ + --seq-len $seq_len \ + --num-steps $NUM_STEPS \ + --disable-all-fusion \ + --profile-dir "${config_name}_run${run}" \ + > "${config_name}_run${run}.log" 2>&1 + done + + log_info " ✓ Baseline complete" + done + done +fi + +# Ablation study +if [ "$RUN_ABLATION" = true ]; then + log_step "Running fusion ablation study..." + + # Use middle configuration + BATCH_SIZE=$(echo $BATCH_SIZES | awk '{print $2}') + SEQ_LEN=$(echo $SEQ_LENS | awk '{print $2}') + [ -z "$BATCH_SIZE" ] && BATCH_SIZE=$(echo $BATCH_SIZES | awk '{print $1}') + [ -z "$SEQ_LEN" ] && SEQ_LEN=$(echo $SEQ_LENS | awk '{print $1}') + + log_info "Using batch_size=$BATCH_SIZE, seq_len=$SEQ_LEN for ablation" + + # Test each fusion individually + ABLATIONS=( + "all_disabled:--disable-all-fusion" + "only_qkv_msa:--disable-qkv-fusion-triangle --disable-flash-attention --disable-triangle-fusion" + "only_qkv_triangle:--disable-qkv-fusion-msa --disable-flash-attention --disable-triangle-fusion" + "only_flash:--disable-qkv-fusion-msa --disable-qkv-fusion-triangle --disable-triangle-fusion" + "only_triangle:--disable-qkv-fusion-msa --disable-qkv-fusion-triangle --disable-flash-attention" + "no_qkv:--disable-qkv-fusion-msa --disable-qkv-fusion-triangle" + "no_flash:--disable-flash-attention" + "no_triangle:--disable-triangle-fusion" + "all_enabled:" + ) + + for ablation in "${ABLATIONS[@]}"; do + name="${ablation%%:*}" + flags="${ablation#*:}" + + log_info "Testing ablation: $name" + + for run in $(seq 1 $NUM_RUNS); do + python ../tiny_openfold_v2.py \ + --batch-size $BATCH_SIZE \ + --seq-len $SEQ_LEN \ + --num-steps $NUM_STEPS \ + $flags \ + --profile-dir "ablation_${name}_run${run}" \ + > 
"ablation_${name}_run${run}.log" 2>&1 + done + + log_info " ✓ Ablation $name complete" + done +fi + +# Analyze results +log_step "Analyzing results..." + +python3 << 'ANALYSIS_SCRIPT' +import json +import glob +import re +import numpy as np +from pathlib import Path + +results = [] + +# Parse all performance summary files +for json_file in glob.glob("*/performance_summary_v2.json"): + try: + with open(json_file, 'r') as f: + data = json.load(f) + + config = data.get('config', {}) + perf = data.get('performance_summary', {}) + fusion = data.get('fusion_statistics', {}) + + # Extract configuration from path + path_parts = Path(json_file).parts[0] + + results.append({ + 'config': path_parts, + 'batch_size': config.get('max_seq_len', 'N/A'), + 'seq_len': config.get('max_seq_len', 'N/A'), + 'speed': perf.get('avg_training_speed', 0), + 'memory_mb': perf.get('peak_memory_mb', 0), + 'batch_time_ms': perf.get('avg_batch_time', 0) * 1000, + 'loss': perf.get('avg_loss', 0), + 'fusion_enabled': fusion.get('qkv_fusion_msa_enabled', False) + }) + except Exception as e: + print(f"Error parsing {json_file}: {e}") + +# Group by configuration +configs = {} +for result in results: + config = result['config'] + if config not in configs: + configs[config] = [] + configs[config].append(result) + +# Generate summary +print("\n" + "="*80) +print("PERFORMANCE STUDY SUMMARY") +print("="*80) + +for config_name in sorted(configs.keys()): + runs = configs[config_name] + speeds = [r['speed'] for r in runs if r['speed'] > 0] + memories = [r['memory_mb'] for r in runs if r['memory_mb'] > 0] + batch_times = [r['batch_time_ms'] for r in runs if r['batch_time_ms'] > 0] + + if speeds: + print(f"\nConfiguration: {config_name}") + print(f" Runs: {len(runs)}") + print(f" Speed: {np.mean(speeds):.2f} ± {np.std(speeds):.2f} samples/sec") + print(f" Memory: {np.mean(memories):.1f} ± {np.std(memories):.1f} MB") + print(f" Batch time: {np.mean(batch_times):.2f} ± {np.std(batch_times):.2f} ms") + +print("\n" 
+ "="*80) + +# Save results +with open('results_summary.json', 'w') as f: + json.dump(configs, f, indent=2) + +print("\nDetailed results saved to: results_summary.json") + +ANALYSIS_SCRIPT + +cd - > /dev/null + +log_info "======================================================================" +log_info "Performance Study Complete!" +log_info "======================================================================" +echo "" +log_info "Study directory: $STUDY_NAME" +echo "" +log_info "Generated files:" +log_info " - config.json : Study configuration" +log_info " - results_summary.json : Aggregated results" +log_info " - *.log : Individual run logs" +log_info " - */performance_summary_v2.json : Detailed per-run data" +echo "" +log_info "To visualize results:" +log_info " python ../analyze_performance_study.py --study-dir $STUDY_NAME" +echo "" + + diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/run_all_profilers.sh b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_all_profilers.sh new file mode 100755 index 00000000..fb0085e3 --- /dev/null +++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_all_profilers.sh @@ -0,0 +1,349 @@ +#!/bin/bash + +# Comprehensive Profiling Suite for Tiny OpenFold V2 +# Runs all available profilers: PyTorch, ROCm tools, and generates comparative analysis + +set -e + +# Color codes +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +RED='\033[0;31m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } +log_step() { echo -e "${BLUE}[STEP]${NC} $1"; } +log_profiler() { echo -e "${PURPLE}[PROFILER]${NC} $1"; } + +# Default configuration +BATCH_SIZE=4 +SEQ_LEN=64 +NUM_BLOCKS=4 +NUM_SEQS=16 +NUM_STEPS=30 +OUTPUT_DIR="./complete_profiling_$(date +%Y%m%d_%H%M%S)" +ENABLE_ALL_FUSION=true +DEVICE=0 + +# Profiler selection +RUN_PYTORCH=true +RUN_ROCPROFV3=true +RUN_ROCPROF_SYS=true 
+RUN_ROCPROF_COMPUTE=true +QUICK_MODE=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --batch-size) BATCH_SIZE="$2"; shift 2 ;; + --seq-len) SEQ_LEN="$2"; shift 2 ;; + --num-blocks) NUM_BLOCKS="$2"; shift 2 ;; + --num-seqs) NUM_SEQS="$2"; shift 2 ;; + --num-steps) NUM_STEPS="$2"; shift 2 ;; + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + --device) DEVICE="$2"; shift 2 ;; + --disable-all-fusion) ENABLE_ALL_FUSION=false; shift ;; + --pytorch-only) RUN_ROCPROFV3=false; RUN_ROCPROF_SYS=false; RUN_ROCPROF_COMPUTE=false; shift ;; + --rocm-only) RUN_PYTORCH=false; shift ;; + --quick) QUICK_MODE=true; shift ;; + --no-pytorch) RUN_PYTORCH=false; shift ;; + --no-rocprofv3) RUN_ROCPROFV3=false; shift ;; + --no-rocprof-sys) RUN_ROCPROF_SYS=false; shift ;; + --no-rocprof-compute) RUN_ROCPROF_COMPUTE=false; shift ;; + --help|-h) + echo "Comprehensive Profiling Suite for Tiny OpenFold V2" + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --batch-size N Batch size (default: 4)" + echo " --seq-len N Sequence length (default: 64)" + echo " --num-blocks N Number of Evoformer blocks (default: 4)" + echo " --num-seqs N Number of MSA sequences (default: 16)" + echo " --num-steps N Training steps (default: 30)" + echo " --output-dir DIR Output directory" + echo " --device N GPU device (default: 0)" + echo " --disable-all-fusion Disable all fusions" + echo "" + echo "Profiler Selection:" + echo " --pytorch-only Run only PyTorch profiler" + echo " --rocm-only Run only ROCm profilers" + echo " --no-pytorch Skip PyTorch profiler" + echo " --no-rocprofv3 Skip rocprofv3" + echo " --no-rocprof-sys Skip rocprof-sys" + echo " --no-rocprof-compute Skip rocprof-compute" + echo " --quick Quick mode (reduced profiling steps)" + echo "" + echo "Examples:" + echo " $0 # Run all profilers" + echo " $0 --pytorch-only # PyTorch profiler only" + echo " $0 --quick # Quick profiling" + echo " $0 --disable-all-fusion # Profile baseline" + exit 0 + ;; + *) echo 
"Unknown option: $1"; exit 1 ;; + esac +done + +# Adjust for quick mode +if [ "$QUICK_MODE" = true ]; then + NUM_STEPS=15 + RUN_ROCPROF_SYS=false # Skip slowest profiler + log_info "Quick mode enabled: reduced steps, skipping rocprof-sys" +fi + +mkdir -p "$OUTPUT_DIR" + +log_info "======================================================================" +log_info "Tiny OpenFold V2 - Comprehensive Profiling Suite" +log_info "======================================================================" +echo "" +log_info "Configuration:" +log_info " Batch size: $BATCH_SIZE" +log_info " Sequence length: $SEQ_LEN" +log_info " Evoformer blocks: $NUM_BLOCKS" +log_info " MSA sequences: $NUM_SEQS" +log_info " Training steps: $NUM_STEPS" +log_info " All fusions: $ENABLE_ALL_FUSION" +log_info " Device: $DEVICE" +log_info " Output directory: $OUTPUT_DIR" +echo "" +log_info "Profilers to run:" +[ "$RUN_PYTORCH" = true ] && log_info " ✓ PyTorch Profiler" +[ "$RUN_ROCPROFV3" = true ] && log_info " ✓ rocprofv3" +[ "$RUN_ROCPROF_SYS" = true ] && log_info " ✓ rocprof-sys" +[ "$RUN_ROCPROF_COMPUTE" = true ] && log_info " ✓ rocprof-compute" +echo "" + +# Build common arguments +COMMON_ARGS="--batch-size $BATCH_SIZE --seq-len $SEQ_LEN --num-blocks $NUM_BLOCKS --num-seqs $NUM_SEQS --num-steps $NUM_STEPS --device $DEVICE" +[ "$ENABLE_ALL_FUSION" = false ] && COMMON_ARGS="$COMMON_ARGS --disable-all-fusion" + +# Track profiling times +PROFILE_START=$(date +%s) + +# 1. PyTorch Profiler +if [ "$RUN_PYTORCH" = true ]; then + log_step "Running PyTorch Profiler (1/4)..." + PYTORCH_DIR="$OUTPUT_DIR/pytorch_profiling" + + if [ -f "./run_pytorch_profiler.py" ]; then + python run_pytorch_profiler.py $COMMON_ARGS --profile-dir $PYTORCH_DIR + log_info "✓ PyTorch profiling complete" + else + log_warning "run_pytorch_profiler.py not found, skipping" + fi + echo "" +fi + +# 2. rocprofv3 +if [ "$RUN_ROCPROFV3" = true ]; then + log_step "Running rocprofv3 (2/4)..." 
+ ROCPROFV3_DIR="$OUTPUT_DIR/rocprofv3_profiling" + + if [ -f "./run_rocprofv3.sh" ]; then + ./run_rocprofv3.sh $COMMON_ARGS --output-dir "$ROCPROFV3_DIR" + log_info "✓ rocprofv3 profiling complete" + else + log_warning "run_rocprofv3.sh not found, skipping" + fi + echo "" +fi + +# 3. rocprof-sys +if [ "$RUN_ROCPROF_SYS" = true ]; then + log_step "Running rocprof-sys (3/4)..." + ROCPROF_SYS_DIR="$OUTPUT_DIR/rocprof_sys_profiling" + + if [ -f "./run_rocprof_sys.sh" ]; then + ./run_rocprof_sys.sh $COMMON_ARGS --output-dir "$ROCPROF_SYS_DIR" + log_info "✓ rocprof-sys profiling complete" + else + log_warning "run_rocprof_sys.sh not found, skipping" + fi + echo "" +fi + +# 4. rocprof-compute +if [ "$RUN_ROCPROF_COMPUTE" = true ]; then + log_step "Running rocprof-compute (4/4)..." + + if [ -f "./run_rocprof_compute.sh" ]; then + LAUNCH_DIR="$PWD"; cd "$OUTPUT_DIR" # OUTPUT_DIR may be nested or absolute, so '..' is not guaranteed to be the launch directory + "$LAUNCH_DIR/run_rocprof_compute.sh" $COMMON_ARGS --output-name tinyfold_complete + cd - > /dev/null + log_info "✓ rocprof-compute profiling complete" + else + log_warning "run_rocprof_compute.sh not found, skipping" + fi + echo "" +fi + +PROFILE_END=$(date +%s) +TOTAL_TIME=$((PROFILE_END - PROFILE_START)) + +# Generate summary report +log_step "Generating comprehensive summary..."
+ +SUMMARY_FILE="$OUTPUT_DIR/PROFILING_SUMMARY.md" + +cat > "$SUMMARY_FILE" << EOF +# Tiny OpenFold V2 - Comprehensive Profiling Summary + +Generated: $(date '+%Y-%m-%d %H:%M:%S') + +## Configuration + +- Batch size: $BATCH_SIZE +- Sequence length: $SEQ_LEN +- Evoformer blocks: $NUM_BLOCKS +- MSA sequences: $NUM_SEQS +- Training steps: $NUM_STEPS +- All fusions enabled: $ENABLE_ALL_FUSION +- Device: $DEVICE +- Total profiling time: $TOTAL_TIME seconds + +## Profiling Results + +EOF + +# Add results from each profiler +if [ "$RUN_PYTORCH" = true ] && [ -d "$PYTORCH_DIR" ]; then + cat >> "$SUMMARY_FILE" << EOF +### PyTorch Profiler + +Directory: \`$PYTORCH_DIR\` + +**Key Files:** +- comprehensive_profiling_report.md - Detailed analysis +- fusion_analysis.json - Fusion statistics +- *.pt.trace.json - Chrome trace files + +**View Results:** +\`\`\`bash +# View report +less $PYTORCH_DIR/comprehensive_profiling_report.md + +# TensorBoard +tensorboard --logdir $PYTORCH_DIR + +# Chrome trace +# Open chrome://tracing and load trace file +\`\`\` + +EOF +fi + +if [ "$RUN_ROCPROFV3" = true ] && [ -d "$ROCPROFV3_DIR" ]; then + cat >> "$SUMMARY_FILE" << EOF +### rocprofv3 + +Directory: \`$ROCPROFV3_DIR\` + +**Key Files:** +- rocprofv3_summary.txt - Kernel statistics summary +- *_kernel_stats.csv - Detailed kernel data + +**View Results:** +\`\`\`bash +less $ROCPROFV3_DIR/rocprofv3_summary.txt +\`\`\` + +EOF +fi + +if [ "$RUN_ROCPROF_SYS" = true ] && [ -d "$ROCPROF_SYS_DIR" ]; then + cat >> "$SUMMARY_FILE" << EOF +### rocprof-sys + +Directory: \`$ROCPROF_SYS_DIR\` + +**Key Files:** +- *.proto - Perfetto timeline trace + +**View Results:** +1. Copy .proto file to local machine +2. Open https://ui.perfetto.dev +3. 
Load the .proto file + +EOF +fi + +if [ "$RUN_ROCPROF_COMPUTE" = true ]; then + cat >> "$SUMMARY_FILE" << EOF +### rocprof-compute + +Directory: \`$OUTPUT_DIR\` + +**Key Files:** +- roofline_*.pdf - Roofline plots +- workloads/tinyfold_complete/ - Detailed metrics + +**View Results:** +\`\`\`bash +# View roofline +open roofline_*.pdf + +# List dispatches +cd $OUTPUT_DIR +rocprof-compute analyze -p workloads/tinyfold_complete/* --list-stats +\`\`\` + +EOF +fi + +cat >> "$SUMMARY_FILE" << EOF +## Analysis Recommendations + +1. **Start with PyTorch Profiler** for high-level understanding + - Identify hotspot operations + - Analyze fusion impact + +2. **Use rocprofv3** for kernel-level analysis + - Check kernel execution times + - Verify fusion effectiveness + +3. **Use rocprof-sys** for timeline analysis + - Identify synchronization issues + - Check CPU-GPU overlaps + +4. **Use rocprof-compute** for hardware utilization + - Check memory bandwidth utilization + - Analyze compute vs memory bound + +## Next Steps + +- Compare with baseline (V1) results +- Run ablation studies for individual fusions +- Optimize identified bottlenecks +- Test different batch sizes and sequence lengths + +EOF + +log_info "Summary report generated: $SUMMARY_FILE" + +# Display summary +echo "" +log_info "======================================================================" +log_info "Comprehensive Profiling Complete!" 
+log_info "======================================================================" +echo "" +log_info "Results directory: $OUTPUT_DIR" +log_info "Total profiling time: $TOTAL_TIME seconds" +echo "" +log_info "Quick access:" +echo "" +[ "$RUN_PYTORCH" = true ] && log_info " PyTorch: less $PYTORCH_DIR/comprehensive_profiling_report.md" +[ "$RUN_ROCPROFV3" = true ] && log_info " rocprofv3: less $ROCPROFV3_DIR/rocprofv3_summary.txt" +[ "$RUN_ROCPROF_SYS" = true ] && log_info " rocprof-sys: open https://ui.perfetto.dev (load .proto file)" +[ "$RUN_ROCPROF_COMPUTE" = true ] && log_info " rocprof-compute: open $OUTPUT_DIR/roofline_*.pdf" +echo "" +log_info " Summary: less $SUMMARY_FILE" +echo "" +log_info "======================================================================" + + diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/run_pytorch_profiler.py b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_pytorch_profiler.py new file mode 100644 index 00000000..9a80c087 --- /dev/null +++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_pytorch_profiler.py @@ -0,0 +1,588 @@ +#!/usr/bin/env python3 +""" +PyTorch Profiler Integration for Tiny OpenFold V2 (Fused) + +This script provides enhanced PyTorch profiler integration with fusion-specific analysis, +kernel reduction tracking, and comprehensive performance characterization. 
+ +Features: +- Fusion-specific profiling and analysis +- Kernel count reduction measurement +- Flash Attention performance tracking +- Memory bandwidth utilization analysis +- Comparison with baseline (V1) +- Chrome trace export for detailed timeline analysis +- Operator-level performance breakdown with fusion impact +- Bottleneck identification for fused operations +- TensorBoard integration for visualization + +Usage: + # Run profiling with default settings (all fusions enabled) + python run_pytorch_profiler.py + + # Custom profiling configuration + python run_pytorch_profiler.py --batch-size 8 --profile-steps 10 + + # Ablation study: disable specific fusions + python run_pytorch_profiler.py --disable-flash-attention + + # Compare with V1 baseline + python run_pytorch_profiler.py --compare-with-v1 ../version1_pytorch_baseline/pytorch_profiles + + # Generate detailed report + python run_pytorch_profiler.py --generate-report --output-dir ./analysis +""" + +import torch +import torch.nn as nn +from torch.profiler import profile, record_function, ProfilerActivity +import argparse +import json +import os +import numpy as np +import time +from pathlib import Path +from typing import Dict, List, Any, Optional +from datetime import datetime + +# Import the model from tiny_openfold_v2 +from tiny_openfold_v2 import ( + TinyOpenFoldV2, TinyOpenFoldConfig, FusionConfig, ProteinDataset, + setup_deterministic_environment, FLASH_ATTENTION_AVAILABLE, TORCH_COMPILE_AVAILABLE +) + + +def get_gpu_time_total(event) -> float: + """ + Get GPU time total in a ROCm-compatible way. + + On ROCm, PyTorch may expose 'device_time_total' instead of 'cuda_time_total'. + This function checks for both attributes to ensure compatibility. 
+ + Args: + event: FunctionEventAvg object from PyTorch profiler + + Returns: + GPU time in microseconds (0 if not available) + """ + if hasattr(event, 'device_time_total'): + return event.device_time_total + return getattr(event, 'cuda_time_total', 0) + + +class FusedProfilerAnalyzer: + """Advanced PyTorch profiler analysis for fused Evoformer implementation.""" + + def __init__(self, profile_dir: str): + self.profile_dir = Path(profile_dir) + self.profile_data = None + self.analysis_results = {} + self.fusion_stats = {} + self.throughput_stats = {} + + def run_profiling( + self, + config: TinyOpenFoldConfig, + fusion_config: FusionConfig, + batch_size: int = 4, + num_steps: int = 20, + warmup_steps: int = 3, + profile_steps: int = 5, + include_memory: bool = True, + include_shapes: bool = True, + device_id: Optional[int] = None + ) -> profile: + """Run comprehensive PyTorch profiling session with fusion analysis.""" + + print(f"Starting PyTorch Profiler Analysis - Fused Evoformer Architecture") + print(f" Profile directory: {self.profile_dir}") + print(f" Batch size: {batch_size}") + print(f" Sequence length: {config.max_seq_len}") + print(f" MSA sequences: {config.n_seqs}") + print(f" Total steps: {num_steps}") + print(f" Profile steps: {profile_steps}") + print(f" Memory profiling: {include_memory}") + + # Fusion configuration summary + print(f"\n Fusion Configuration:") + print(f" MSA QKV Fusion: {fusion_config.enable_qkv_fusion_msa}") + print(f" Triangle QKV Fusion: {fusion_config.enable_qkv_fusion_triangle}") + print(f" Flash Attention: {fusion_config.enable_flash_attention and FLASH_ATTENTION_AVAILABLE}") + print(f" Triangle Fusion: {fusion_config.enable_triangle_fusion}") + print(f" Torch Compile: {fusion_config.enable_torch_compile and TORCH_COMPILE_AVAILABLE}") + + # Setup environment + setup_deterministic_environment() + + # Device selection + if device_id is not None: + if not torch.cuda.is_available(): + print(f" Warning: CUDA not available, ignoring 
device_id={device_id}") + device = torch.device("cpu") + elif device_id >= torch.cuda.device_count(): + raise ValueError(f"Device {device_id} not available. Only {torch.cuda.device_count()} GPU(s) found.") + else: + device = torch.device(f"cuda:{device_id}") + print(f" Using GPU: {device_id}") + else: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f" Using device: {device}") + + # Create model and dataset + model = TinyOpenFoldV2(config, fusion_config).to(device) + + # Apply torch.compile if enabled + if fusion_config.enable_torch_compile and TORCH_COMPILE_AVAILABLE: + print(" Applying torch.compile...") + model = torch.compile(model, mode=fusion_config.torch_compile_mode) + + # Get fusion statistics + if hasattr(model, 'get_fusion_statistics'): + self.fusion_stats = model.get_fusion_statistics() + elif hasattr(model, '_orig_mod'): + self.fusion_stats = model._orig_mod.get_fusion_statistics() + + dataset = ProteinDataset(config) + optimizer = torch.optim.AdamW( + model.parameters() if isinstance(model, nn.Module) else model._orig_mod.parameters(), + lr=3e-4 + ) + + # Ensure profile directory exists + self.profile_dir.mkdir(parents=True, exist_ok=True) + + # Configure profiler + activities = [ProfilerActivity.CPU] + if torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + prof = profile( + activities=activities, + record_shapes=include_shapes, + profile_memory=include_memory, + with_stack=True, + with_flops=True, + with_modules=True, + experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True), + schedule=torch.profiler.schedule( + wait=warmup_steps, + warmup=1, + active=profile_steps, + repeat=1 + ), + on_trace_ready=torch.profiler.tensorboard_trace_handler(str(self.profile_dir)) + ) + + # Training loop with profiling + model.train() + + # Warmup without profiling + print("\n Running warmup steps...") + for step in range(warmup_steps): + msa_tokens, pair_features, target_distances = 
dataset.get_batch(batch_size) + msa_tokens = msa_tokens.to(device) + pair_features = pair_features.to(device) + target_distances = target_distances.to(device) + + outputs = model(msa_tokens, pair_features, target_distances) + loss = outputs['loss'] + loss.backward() + optimizer.step() + optimizer.zero_grad() + + # Profiled steps with timing + print(f" Running {num_steps} steps with profiling...") + prof.start() + + # Track timing for throughput calculation + step_times = [] + if torch.cuda.is_available(): + torch.cuda.synchronize() + start_time = time.time() + + for step in range(num_steps): + step_start = time.time() + + msa_tokens, pair_features, target_distances = dataset.get_batch(batch_size) + msa_tokens = msa_tokens.to(device) + pair_features = pair_features.to(device) + target_distances = target_distances.to(device) + + outputs = model(msa_tokens, pair_features, target_distances) + loss = outputs['loss'] + loss.backward() + optimizer.step() + optimizer.zero_grad() + + prof.step() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + step_time = time.time() - step_start + step_times.append(step_time) + + if step % 5 == 0: + print(f" Step {step}/{num_steps} - Loss: {loss.item():.4f}") + + prof.stop() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + total_time = time.time() - start_time + + # Calculate throughput statistics + total_samples = num_steps * batch_size + avg_step_time = sum(step_times) / len(step_times) if step_times else 0 + avg_throughput = batch_size / avg_step_time if avg_step_time > 0 else 0 + + self.throughput_stats = { + 'total_steps': num_steps, + 'batch_size': batch_size, + 'total_samples': total_samples, + 'total_time_sec': total_time, + 'avg_step_time_ms': avg_step_time * 1000, + 'avg_throughput_samples_per_sec': avg_throughput, + 'min_step_time_ms': min(step_times) * 1000 if step_times else 0, + 'max_step_time_ms': max(step_times) * 1000 if step_times else 0 + } + + self.profile_data = prof + print("\n Profiling 
complete!") + + return prof + + def analyze_fusion_impact(self) -> Dict[str, Any]: + """Analyze the impact of fusion optimizations.""" + if self.profile_data is None: + return {"error": "No profiling data available"} + + print("\nAnalyzing fusion impact...") + + # Get operator statistics + events = self.profile_data.key_averages() + + # Categorize operators by fusion type + fusion_categories = { + 'fused_qkv': [], + 'flash_attention': [], + 'fused_triangle': [], + 'standard_ops': [] + } + + for event in events: + name = event.key + if 'fused_qkv' in name or 'qkv_fused' in name: + fusion_categories['fused_qkv'].append(event) + elif 'flash_attention' in name: + fusion_categories['flash_attention'].append(event) + elif 'fused_triangle' in name or 'triangle.*fused' in name: + fusion_categories['fused_triangle'].append(event) + else: + fusion_categories['standard_ops'].append(event) + + # Calculate fusion statistics + fusion_analysis = {} + for category, events_list in fusion_categories.items(): + if events_list: + total_time = sum(get_gpu_time_total(e) if torch.cuda.is_available() else e.cpu_time_total + for e in events_list) + total_calls = sum(e.count for e in events_list) + fusion_analysis[category] = { + 'total_time_ms': total_time / 1000.0, + 'total_calls': total_calls, + 'avg_time_per_call_ms': (total_time / total_calls / 1000.0) if total_calls > 0 else 0 + } + + self.analysis_results['fusion_impact'] = fusion_analysis + return fusion_analysis + + def analyze_memory_efficiency(self) -> Dict[str, Any]: + """Analyze memory efficiency improvements from fusion.""" + if self.profile_data is None: + return {"error": "No profiling data available"} + + print("Analyzing memory efficiency...") + + events = self.profile_data.key_averages() + + # Track memory-intensive operations + memory_analysis = { + 'attention_memory': 0, + 'triangle_memory': 0, + 'total_memory': 0, + 'peak_memory_mb': 0 + } + + if torch.cuda.is_available(): + memory_analysis['peak_memory_mb'] = 
torch.cuda.max_memory_allocated() / (1024**2) + + for event in events: + if hasattr(event, 'cpu_memory_usage') and event.cpu_memory_usage > 0: + memory_usage = event.cpu_memory_usage / (1024**2) # Convert to MB + memory_analysis['total_memory'] += memory_usage + + if 'attention' in event.key: + memory_analysis['attention_memory'] += memory_usage + elif 'triangle' in event.key: + memory_analysis['triangle_memory'] += memory_usage + + self.analysis_results['memory_efficiency'] = memory_analysis + return memory_analysis + + def generate_comprehensive_report(self, output_file: Optional[str] = None) -> str: + """Generate comprehensive profiling report with fusion analysis.""" + + if output_file is None: + output_file = self.profile_dir / "comprehensive_profiling_report.md" + + report_lines = [] + report_lines.append("# Tiny OpenFold V2 - Fused Implementation Profiling Report") + report_lines.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + + # Configuration summary + report_lines.append("## Configuration") + report_lines.append("\n### Fusion Settings") + if self.fusion_stats: + report_lines.append(f"- MSA QKV Fusion: {'Enabled' if self.fusion_stats.get('qkv_fusion_msa_enabled') else 'Disabled'}") + report_lines.append(f"- Triangle QKV Fusion: {'Enabled' if self.fusion_stats.get('qkv_fusion_triangle_enabled') else 'Disabled'}") + report_lines.append(f"- Flash Attention: {'Enabled' if self.fusion_stats.get('flash_attention_enabled') else 'Disabled'}") + report_lines.append(f"- Triangle Fusion: {'Enabled' if self.fusion_stats.get('triangle_fusion_enabled') else 'Disabled'}") + report_lines.append(f"- Torch Compile: {'Enabled' if self.fusion_stats.get('torch_compile_enabled') else 'Disabled'}") + report_lines.append(f"\n### Kernel Reduction") + report_lines.append(f"- Baseline kernels per block: {self.fusion_stats.get('baseline_kernels_per_block', 'N/A')}") + report_lines.append(f"- Fused kernels per block: 
{self.fusion_stats.get('fused_kernels_per_block', 'N/A')}") + report_lines.append(f"- Kernel reduction: {self.fusion_stats.get('kernel_reduction_percent', 0):.1f}%") + report_lines.append(f"- Total kernels saved: {self.fusion_stats.get('total_kernel_reduction', 'N/A')}") + + # Performance analysis + if self.profile_data: + report_lines.append("\n## Performance Analysis") + + events = self.profile_data.key_averages() + + # Top operations by time + report_lines.append("\n### Top 15 Operations by GPU Time") + report_lines.append("\n| Operation | GPU Time (ms) | CPU Time (ms) | Calls | Avg Time (ms) |") + report_lines.append("|-----------|---------------|---------------|-------|---------------|") + + sorted_events = sorted(events, + key=lambda e: get_gpu_time_total(e) if torch.cuda.is_available() else e.cpu_time_total, + reverse=True)[:15] + + for event in sorted_events: + gpu_time = get_gpu_time_total(event) / 1000.0 if torch.cuda.is_available() else 0 + cpu_time = event.cpu_time_total / 1000.0 + avg_time = gpu_time / event.count if event.count > 0 else 0 + report_lines.append(f"| {event.key[:50]} | {gpu_time:.2f} | {cpu_time:.2f} | {event.count} | {avg_time:.3f} |") + + # Fusion impact analysis + if 'fusion_impact' in self.analysis_results: + report_lines.append("\n### Fusion Impact Analysis") + fusion_impact = self.analysis_results['fusion_impact'] + + for category, stats in fusion_impact.items(): + if stats['total_calls'] > 0: + report_lines.append(f"\n**{category}:**") + report_lines.append(f"- Total time: {stats['total_time_ms']:.2f} ms") + report_lines.append(f"- Total calls: {stats['total_calls']}") + report_lines.append(f"- Average time per call: {stats['avg_time_per_call_ms']:.3f} ms") + + # Memory analysis + if 'memory_efficiency' in self.analysis_results: + report_lines.append("\n### Memory Efficiency") + mem_analysis = self.analysis_results['memory_efficiency'] + + report_lines.append(f"- Peak memory: {mem_analysis['peak_memory_mb']:.1f} MB") + 
report_lines.append(f"- Attention memory: {mem_analysis['attention_memory']:.1f} MB") + report_lines.append(f"- Triangle memory: {mem_analysis['triangle_memory']:.1f} MB") + report_lines.append(f"- Total tracked memory: {mem_analysis['total_memory']:.1f} MB") + + # Recommendations + report_lines.append("\n## Optimization Recommendations") + report_lines.append("\n### Based on Profiling Results:") + + if self.fusion_stats.get('flash_attention_enabled'): + report_lines.append("- ✓ Flash Attention is enabled - memory efficiency optimized") + else: + report_lines.append("- ⚠ Consider enabling Flash Attention for memory savings") + + if self.fusion_stats.get('qkv_fusion_msa_enabled'): + report_lines.append("- ✓ MSA QKV fusion is enabled - kernel launch overhead reduced") + else: + report_lines.append("- ⚠ Enable MSA QKV fusion to reduce kernel launches") + + if self.fusion_stats.get('triangle_fusion_enabled'): + report_lines.append("- ✓ Triangle fusion is enabled - triangle operations optimized") + else: + report_lines.append("- ⚠ Enable triangle fusion for better performance") + + # Write report + report_content = "\n".join(report_lines) + with open(output_file, 'w') as f: + f.write(report_content) + + print(f"\nComprehensive report saved to: {output_file}") + return report_content + + def get_throughput_summary(self) -> Dict[str, Any]: + """Get throughput summary statistics.""" + return self.throughput_stats + + def export_analysis(self, output_file: Optional[str] = None): + """Export analysis results to JSON.""" + if output_file is None: + output_file = self.profile_dir / "fusion_analysis.json" + + export_data = { + 'fusion_statistics': self.fusion_stats, + 'analysis_results': self.analysis_results, + 'throughput_statistics': self.throughput_stats, + 'timestamp': datetime.now().isoformat() + } + + with open(output_file, 'w') as f: + json.dump(export_data, f, indent=2) + + print(f"Analysis exported to: {output_file}") + + +def main(): + parser = 
argparse.ArgumentParser(description='PyTorch Profiler for Tiny OpenFold V2 (Fused)') + + # Model configuration + parser.add_argument('--msa-dim', type=int, default=64, help='MSA dimension') + parser.add_argument('--pair-dim', type=int, default=128, help='Pair dimension') + parser.add_argument('--num-blocks', type=int, default=4, help='Number of Evoformer blocks') + parser.add_argument('--num-seqs', type=int, default=16, help='Number of MSA sequences') + parser.add_argument('--seq-len', type=int, default=64, help='Sequence length') + + # Training configuration + parser.add_argument('--batch-size', type=int, default=4, help='Batch size') + parser.add_argument('--num-steps', type=int, default=20, help='Total steps including warmup') + parser.add_argument('--warmup-steps', type=int, default=3, help='Warmup steps') + parser.add_argument('--profile-steps', type=int, default=5, help='Steps to profile') + parser.add_argument('--device', type=int, default=None, help='GPU device ID') + + # Fusion configuration + parser.add_argument('--disable-qkv-fusion-msa', action='store_true', help='Disable MSA QKV fusion') + parser.add_argument('--disable-qkv-fusion-triangle', action='store_true', help='Disable triangle QKV fusion') + parser.add_argument('--disable-flash-attention', action='store_true', help='Disable Flash Attention') + parser.add_argument('--disable-triangle-fusion', action='store_true', help='Disable triangle fusion') + parser.add_argument('--enable-torch-compile', action='store_true', help='Enable torch.compile') + parser.add_argument('--disable-all-fusion', action='store_true', help='Disable all fusion (baseline mode)') + + # Profiling configuration + parser.add_argument('--profile-dir', type=str, default='./pytorch_profiles_v2', help='Profile output directory') + parser.add_argument('--no-memory', action='store_true', help='Disable memory profiling') + parser.add_argument('--no-shapes', action='store_true', help='Disable shape recording') + 
parser.add_argument('--generate-report', action='store_true', default=True, help='Generate comprehensive report') + parser.add_argument('--compare-with-v1', type=str, help='Path to V1 profiling results for comparison') + + args = parser.parse_args() + + # Configure model + config = TinyOpenFoldConfig( + msa_dim=args.msa_dim, + pair_dim=args.pair_dim, + n_evoformer_blocks=args.num_blocks, + n_seqs=args.num_seqs, + max_seq_len=args.seq_len, + msa_intermediate_dim=args.msa_dim * 4, + pair_intermediate_dim=args.pair_dim * 4 + ) + + # Configure fusion + if args.disable_all_fusion: + fusion_config = FusionConfig( + enable_qkv_fusion_msa=False, + enable_qkv_fusion_triangle=False, + enable_flash_attention=False, + enable_triangle_fusion=False, + enable_torch_compile=False + ) + else: + fusion_config = FusionConfig( + enable_qkv_fusion_msa=not args.disable_qkv_fusion_msa, + enable_qkv_fusion_triangle=not args.disable_qkv_fusion_triangle, + enable_flash_attention=not args.disable_flash_attention, + enable_triangle_fusion=not args.disable_triangle_fusion, + enable_torch_compile=args.enable_torch_compile + ) + + # Create analyzer and run profiling + analyzer = FusedProfilerAnalyzer(args.profile_dir) + + try: + prof = analyzer.run_profiling( + config=config, + fusion_config=fusion_config, + batch_size=args.batch_size, + num_steps=args.num_steps, + warmup_steps=args.warmup_steps, + profile_steps=args.profile_steps, + include_memory=not args.no_memory, + include_shapes=not args.no_shapes, + device_id=args.device + ) + + # Analyze results + print("\n" + "="*70) + print("ANALYSIS") + print("="*70) + + fusion_impact = analyzer.analyze_fusion_impact() + memory_efficiency = analyzer.analyze_memory_efficiency() + + # Generate report + if args.generate_report: + analyzer.generate_comprehensive_report() + + # Export analysis + analyzer.export_analysis() + + # Print throughput summary + throughput_stats = analyzer.get_throughput_summary() + if throughput_stats: + print("\n" + "="*70) + 
print("THROUGHPUT SUMMARY") + print("="*70) + print(f" Total steps: {throughput_stats['total_steps']}") + print(f" Batch size: {throughput_stats['batch_size']}") + print(f" Total samples: {throughput_stats['total_samples']}") + print(f" Total time: {throughput_stats['total_time_sec']:.2f} seconds") + print(f" Average step time: {throughput_stats['avg_step_time_ms']:.2f} ms") + print(f" Average throughput: {throughput_stats['avg_throughput_samples_per_sec']:.2f} samples/sec") + print(f" Min step time: {throughput_stats['min_step_time_ms']:.2f} ms") + print(f" Max step time: {throughput_stats['max_step_time_ms']:.2f} ms") + print("="*70) + + # Print summary + print("\n" + "="*70) + print("PROFILING SUMMARY") + print("="*70) + print(f"\nProfile directory: {args.profile_dir}") + print(f"Trace files: {args.profile_dir}/*.pt.trace.json") + print(f"\nTo visualize:") + print(f" 1. Chrome trace: Open chrome://tracing and load trace file") + print(f" 2. TensorBoard: tensorboard --logdir {args.profile_dir}") + print(f"\nReports generated:") + print(f" - comprehensive_profiling_report.md") + print(f" - fusion_analysis.json") + + if args.compare_with_v1: + print(f"\nComparison with V1: {args.compare_with_v1}") + print(" (Comparison analysis not yet implemented)") + + except Exception as e: + print(f"\nError during profiling: {e}") + import traceback + traceback.print_exc() + return 1 + + return 0 + + +if __name__ == "__main__": + exit(main()) + + diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/run_pytorch_profiler.sh b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_pytorch_profiler.sh new file mode 100755 index 00000000..faa6db48 --- /dev/null +++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_pytorch_profiler.sh @@ -0,0 +1,338 @@ +#!/bin/bash +# +# PyTorch Profiler Runner for Tiny OpenFold V2 (Fused) +# +# This script provides convenient wrapper for running PyTorch profiling +# with various fusion configurations and analysis options. 
+# +# Usage: +# ./run_pytorch_profiler.sh # Default: all fusions enabled +# ./run_pytorch_profiler.sh --baseline # Disable all fusions (baseline) +# ./run_pytorch_profiler.sh --ablation # Run ablation study +# ./run_pytorch_profiler.sh --compare-v1 # Compare with V1 baseline + +set -e + +# Default configuration +BATCH_SIZE=4 +SEQ_LEN=64 +NUM_BLOCKS=4 +NUM_SEQS=16 +NUM_STEPS=20 +PROFILE_STEPS=5 +WARMUP_STEPS=3 +DEVICE="" +PROFILE_DIR="./pytorch_profiles_v2" +MODE="default" + +# Fusion flags +DISABLE_QKV_MSA="" +DISABLE_QKV_TRIANGLE="" +DISABLE_FLASH="" +DISABLE_TRIANGLE="" +ENABLE_COMPILE="" +DISABLE_ALL="" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --seq-len) + SEQ_LEN="$2" + shift 2 + ;; + --num-blocks) + NUM_BLOCKS="$2" + shift 2 + ;; + --num-seqs) + NUM_SEQS="$2" + shift 2 + ;; + --num-steps) + NUM_STEPS="$2" + shift 2 + ;; + --profile-steps) + PROFILE_STEPS="$2" + shift 2 + ;; + --device) + DEVICE="--device $2" + shift 2 + ;; + --profile-dir) + PROFILE_DIR="$2" + shift 2 + ;; + --baseline) + MODE="baseline" + DISABLE_ALL="--disable-all-fusion" + shift + ;; + --ablation) + MODE="ablation" + shift + ;; + --compare-v1) + MODE="compare" + shift + ;; + --disable-qkv-msa) + DISABLE_QKV_MSA="--disable-qkv-fusion-msa" + shift + ;; + --disable-qkv-triangle) + DISABLE_QKV_TRIANGLE="--disable-qkv-fusion-triangle" + shift + ;; + --disable-flash) + DISABLE_FLASH="--disable-flash-attention" + shift + ;; + --disable-triangle) + DISABLE_TRIANGLE="--disable-triangle-fusion" + shift + ;; + --enable-compile) + ENABLE_COMPILE="--enable-torch-compile" + shift + ;; + --help) + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --batch-size N Batch size (default: 4)" + echo " --seq-len N Sequence length (default: 64)" + echo " --num-blocks N Number of Evoformer blocks (default: 4)" + echo " --num-seqs N Number of MSA sequences (default: 16)" + echo " --num-steps N Total training steps (default: 20)" + 
echo " --profile-steps N Steps to profile (default: 5)" + echo " --device N GPU device ID" + echo " --profile-dir DIR Profile output directory" + echo "" + echo "Modes:" + echo " --baseline Disable all fusions (baseline comparison)" + echo " --ablation Run ablation study (all fusion combinations)" + echo " --compare-v1 Compare with V1 baseline" + echo "" + echo "Fusion Control:" + echo " --disable-qkv-msa Disable MSA QKV fusion" + echo " --disable-qkv-triangle Disable triangle QKV fusion" + echo " --disable-flash Disable Flash Attention" + echo " --disable-triangle Disable triangle fusion" + echo " --enable-compile Enable torch.compile" + echo "" + echo "Examples:" + echo " $0 # All fusions enabled" + echo " $0 --baseline # No fusions (baseline)" + echo " $0 --disable-flash --device 0 # All except Flash Attention" + echo " $0 --ablation # Run ablation study" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Print configuration +echo "======================================================================" +echo "Tiny OpenFold V2 - PyTorch Profiler" +echo "======================================================================" +echo "" +echo "Configuration:" +echo " Batch size: $BATCH_SIZE" +echo " Sequence length: $SEQ_LEN" +echo " Evoformer blocks: $NUM_BLOCKS" +echo " MSA sequences: $NUM_SEQS" +echo " Profile steps: $PROFILE_STEPS / $NUM_STEPS" +echo " Mode: $MODE" +echo " Profile directory: $PROFILE_DIR" +echo "" + +# Run based on mode +case $MODE in + default) + echo "Running profiling with all fusions enabled..." 
+ python run_pytorch_profiler.py \ + --batch-size $BATCH_SIZE \ + --seq-len $SEQ_LEN \ + --num-blocks $NUM_BLOCKS \ + --num-seqs $NUM_SEQS \ + --num-steps $NUM_STEPS \ + --profile-steps $PROFILE_STEPS \ + --warmup-steps $WARMUP_STEPS \ + --profile-dir $PROFILE_DIR \ + $DEVICE \ + $DISABLE_QKV_MSA \ + $DISABLE_QKV_TRIANGLE \ + $DISABLE_FLASH \ + $DISABLE_TRIANGLE \ + $ENABLE_COMPILE \ + $DISABLE_ALL \ + --generate-report + ;; + + baseline) + echo "Running baseline profiling (all fusions disabled)..." + # Retarget PROFILE_DIR so the end-of-script summary reads this run's output, not a stale default-run directory + PROFILE_DIR="${PROFILE_DIR}_baseline" + python run_pytorch_profiler.py \ + --batch-size $BATCH_SIZE \ + --seq-len $SEQ_LEN \ + --num-blocks $NUM_BLOCKS \ + --num-seqs $NUM_SEQS \ + --num-steps $NUM_STEPS \ + --profile-steps $PROFILE_STEPS \ + --warmup-steps $WARMUP_STEPS \ + --profile-dir "$PROFILE_DIR" \ + $DEVICE \ + --disable-all-fusion \ + --generate-report + ;; + + ablation) + echo "Running ablation study..." + echo "This will test all fusion combinations..." + echo "" + + # Create ablation directory + ABLATION_DIR="${PROFILE_DIR}_ablation_$(date +%Y%m%d_%H%M%S)" + mkdir -p $ABLATION_DIR + + # Test configurations + configs=( + "all_disabled:--disable-all-fusion" + "only_qkv_msa:--disable-qkv-fusion-triangle --disable-flash-attention --disable-triangle-fusion" + "only_flash:--disable-qkv-fusion-msa --disable-qkv-fusion-triangle --disable-triangle-fusion" + "only_triangle:--disable-qkv-fusion-msa --disable-qkv-fusion-triangle --disable-flash-attention" + "all_enabled:" + ) + + for config in "${configs[@]}"; do + name="${config%%:*}" + flags="${config#*:}" + + echo "Testing configuration: $name" + python run_pytorch_profiler.py \ + --batch-size $BATCH_SIZE \ + --seq-len $SEQ_LEN \ + --num-blocks $NUM_BLOCKS \ + --num-seqs $NUM_SEQS \ + --num-steps $NUM_STEPS \ + --profile-steps $PROFILE_STEPS \ + --warmup-steps $WARMUP_STEPS \ + --profile-dir "${ABLATION_DIR}/${name}" \ + $DEVICE \ + $flags \ + --generate-report + + echo "" + done + + echo "Ablation study complete!" 
+ echo "Results saved to: $ABLATION_DIR" + ;; + + compare) + echo "Running comparison with V1 baseline..." + + V1_PROFILE="../version1_pytorch_baseline/pytorch_profiles" + + if [ ! -d "$V1_PROFILE" ]; then + echo "Warning: V1 profile directory not found: $V1_PROFILE" + echo "Running V1 profiling first..." + + # Run V1 profiling if not exists + pushd ../version1_pytorch_baseline > /dev/null + if [ -f "run_pytorch_profiler.sh" ]; then + ./run_pytorch_profiler.sh --batch-size $BATCH_SIZE --seq-len $SEQ_LEN + else + echo "Error: V1 profiling script not found" + exit 1 + fi + popd > /dev/null + fi + + # Run V2 profiling + python run_pytorch_profiler.py \ + --batch-size $BATCH_SIZE \ + --seq-len $SEQ_LEN \ + --num-blocks $NUM_BLOCKS \ + --num-seqs $NUM_SEQS \ + --num-steps $NUM_STEPS \ + --profile-steps $PROFILE_STEPS \ + --warmup-steps $WARMUP_STEPS \ + --profile-dir $PROFILE_DIR \ + $DEVICE \ + --generate-report \ + --compare-with-v1 $V1_PROFILE + + echo "" + echo "Comparison complete!" + echo "V1 results: $V1_PROFILE" + echo "V2 results: $PROFILE_DIR" + ;; +esac + +echo "" +echo "======================================================================" +echo "Profiling Complete!" 
+echo "======================================================================" +echo "" +echo "Results saved to: $PROFILE_DIR" +echo "" + +# Extract and display throughput information from fusion_analysis.json +if [ -f "${PROFILE_DIR}/fusion_analysis.json" ]; then + echo "======================================================================" + echo "Performance Summary" + echo "======================================================================" + + # Extract throughput stats using Python + python3 << EOF 2>/dev/null || echo " (Throughput information not available)" +import json +import sys + +try: + with open('${PROFILE_DIR}/fusion_analysis.json', 'r') as f: + data = json.load(f) + + throughput = data.get('throughput_statistics', {}) + if throughput: + print(f" Total steps: {throughput.get('total_steps', 'N/A')}") + print(f" Batch size: {throughput.get('batch_size', 'N/A')}") + print(f" Total samples: {throughput.get('total_samples', 'N/A')}") + print(f" Total time: {throughput.get('total_time_sec', 0):.2f} seconds") + print(f" Average step time: {throughput.get('avg_step_time_ms', 0):.2f} ms") + print(f" Average throughput: {throughput.get('avg_throughput_samples_per_sec', 0):.2f} samples/sec") + print(f" Min step time: {throughput.get('min_step_time_ms', 0):.2f} ms") + print(f" Max step time: {throughput.get('max_step_time_ms', 0):.2f} ms") + else: + print(" (Throughput information not available)") +except Exception as e: + print(f" (Error reading throughput data: {e})") +EOF + echo "" +fi + +echo "To analyze results:" +echo " 1. View comprehensive report:" +echo " less ${PROFILE_DIR}/comprehensive_profiling_report.md" +echo "" +echo " 2. View in Chrome (detailed timeline):" +echo " Open chrome://tracing" +echo " Load: ${PROFILE_DIR}/*.pt.trace.json" +echo "" +echo " 3. View in TensorBoard:" +echo " tensorboard --logdir ${PROFILE_DIR}" +echo "" +echo " 4. 
View fusion analysis:" +echo " cat ${PROFILE_DIR}/fusion_analysis.json | python -m json.tool" +echo "" + + diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/run_rocprof_compute.sh b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_rocprof_compute.sh new file mode 100755 index 00000000..7b6ee9ae --- /dev/null +++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_rocprof_compute.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +# rocprof-compute Profiling Integration for Tiny OpenFold V2 +# This script provides detailed hardware-level profiling and roofline analysis + +set -e + +# Color codes +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_step() { echo -e "${BLUE}[STEP]${NC} $1"; } +log_rocprof() { echo -e "${PURPLE}[ROCPROF-COMPUTE]${NC} $1"; } + +# Default configuration +BATCH_SIZE=4 +SEQ_LEN=64 +NUM_BLOCKS=4 +NUM_SEQS=16 +NUM_STEPS=30 +OUTPUT_NAME="tinyfold_v2" +MODE="profile" # profile, roof, or analyze +DEVICE=0 +ROOF_ONLY=false +NO_ROOF=false +DISPATCH_ID="" +ENABLE_ALL_FUSION=true + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --batch-size) BATCH_SIZE="$2"; shift 2 ;; + --seq-len) SEQ_LEN="$2"; shift 2 ;; + --num-blocks) NUM_BLOCKS="$2"; shift 2 ;; + --num-seqs) NUM_SEQS="$2"; shift 2 ;; + --num-steps) NUM_STEPS="$2"; shift 2 ;; + --output-name) OUTPUT_NAME="$2"; shift 2 ;; + --device) DEVICE="$2"; shift 2 ;; + --mode) MODE="$2"; shift 2 ;; + --roof-only) ROOF_ONLY=true; shift ;; + --no-roof) NO_ROOF=true; shift ;; + --dispatch) DISPATCH_ID="$2"; shift 2 ;; + --disable-all-fusion) ENABLE_ALL_FUSION=false; shift ;; + --help|-h) + echo "rocprof-compute Profiling for Tiny OpenFold V2" + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Modes:" + echo " --mode profile Profile and collect data (default)" + echo " --mode roof Generate roofline plots only" + echo " --mode analyze Analyze specific dispatch" + echo "" + echo "Options:" + echo " 
--batch-size N Batch size (default: 4)" + echo " --seq-len N Sequence length (default: 64)" + echo " --num-blocks N Number of Evoformer blocks (default: 4)" + echo " --num-seqs N Number of MSA sequences (default: 16)" + echo " --num-steps N Training steps (default: 30)" + echo " --output-name NAME Output name (default: tinyfold_v2)" + echo " --device N GPU device (default: 0)" + echo " --roof-only Generate roofline only (faster)" + echo " --no-roof Skip roofline generation" + echo " --dispatch ID Analyze specific dispatch ID" + echo " --disable-all-fusion Disable all fusions" + echo "" + echo "Examples:" + echo " $0 # Full profile with roofline" + echo " $0 --roof-only # Roofline only (faster)" + echo " $0 --no-roof # Profile without roofline" + echo " $0 --mode analyze --dispatch 1538 # Analyze specific dispatch" + exit 0 + ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +# Check for rocprof-compute +if ! command -v rocprof-compute &> /dev/null; then + log_info "rocprof-compute not found. Please ensure ROCm tools are installed." + exit 1 +fi + +log_info "======================================================================" +log_info "Tiny OpenFold V2 - rocprof-compute Profiling" +log_info "======================================================================" +echo "" +log_info "Configuration:" +log_info " Mode: $MODE" +log_info " Batch size: $BATCH_SIZE" +log_info " Sequence length: $SEQ_LEN" +log_info " Evoformer blocks: $NUM_BLOCKS" +log_info " MSA sequences: $NUM_SEQS" +log_info " Training steps: $NUM_STEPS" +log_info " All fusions: $ENABLE_ALL_FUSION" +log_info " Device: $DEVICE" +log_info " Output name: $OUTPUT_NAME" +echo "" + +# Build Python command +PYTHON_ARGS="--batch-size $BATCH_SIZE --seq-len $SEQ_LEN --num-blocks $NUM_BLOCKS --num-seqs $NUM_SEQS --num-steps $NUM_STEPS" +[ "$ENABLE_ALL_FUSION" = false ] && PYTHON_ARGS="$PYTHON_ARGS --disable-all-fusion" + +case $MODE in + profile) + log_step "Running rocprof-compute profile..." 
+ + if [ "$ROOF_ONLY" = true ]; then + log_rocprof "Mode: Roofline only (faster profiling)" + rocprof-compute profile -n $OUTPUT_NAME --kernel-names --roof-only --device $DEVICE \ + -- python tiny_openfold_v2.py $PYTHON_ARGS 2>&1 | tee rocprof_compute_roof.log + elif [ "$NO_ROOF" = true ]; then + log_rocprof "Mode: Full profile without roofline" + rocprof-compute profile -n $OUTPUT_NAME --no-roof --device $DEVICE \ + -- python tiny_openfold_v2.py $PYTHON_ARGS 2>&1 | tee rocprof_compute_profile.log + else + log_rocprof "Mode: Full profile with roofline" + rocprof-compute profile -n $OUTPUT_NAME --device $DEVICE \ + -- python tiny_openfold_v2.py $PYTHON_ARGS 2>&1 | tee rocprof_compute_full.log + fi + + log_step "Profiling complete!" + + # Check for generated files + echo "" + log_info "Generated files:" + + # Roofline PDFs + if [ "$NO_ROOF" = false ]; then + if ls roofline_*.pdf 1> /dev/null 2>&1; then + log_info " Roofline plots:" + ls -lh roofline_*.pdf | awk '{print " - " $9 " (" $5 ")"}' + fi + fi + + # Workload directory + if [ -d "workloads/${OUTPUT_NAME}" ]; then + log_info " Workload data: workloads/${OUTPUT_NAME}/" + fi + + # Suggest next steps + echo "" + log_info "Next steps:" + log_info " 1. View roofline plots: open roofline_*.pdf" + log_info " 2. List dispatches: rocprof-compute analyze -p workloads/${OUTPUT_NAME}/* --list-stats" + log_info " 3. Analyze dispatch: $0 --mode analyze --dispatch " + ;; + + roof) + log_step "Generating roofline plots..." + rocprof-compute profile -n $OUTPUT_NAME --kernel-names --roof-only --device $DEVICE \ + -- python tiny_openfold_v2.py $PYTHON_ARGS 2>&1 | tee rocprof_compute_roof.log + + log_step "Roofline generation complete!" + + if ls roofline_*.pdf 1> /dev/null 2>&1; then + echo "" + log_info "Generated roofline plots:" + ls -lh roofline_*.pdf + fi + ;; + + analyze) + if [ -z "$DISPATCH_ID" ]; then + log_info "Listing available dispatches..." 
+ WORKLOAD_DIR=$(find workloads/${OUTPUT_NAME} -type d -name "MI*" | head -n 1) + + if [ -z "$WORKLOAD_DIR" ]; then + log_info "No workload data found. Run with --mode profile first." + exit 1 + fi + + rocprof-compute analyze -p $WORKLOAD_DIR --list-stats > dispatch_list.txt 2>&1 + + echo "" + log_info "Available dispatches saved to: dispatch_list.txt" + echo "" + head -n 50 dispatch_list.txt + echo "" + log_info "To analyze a specific dispatch:" + log_info " $0 --mode analyze --dispatch " + else + log_step "Analyzing dispatch $DISPATCH_ID..." + WORKLOAD_DIR=$(find workloads/${OUTPUT_NAME} -type d -name "MI*" | head -n 1) + + if [ -z "$WORKLOAD_DIR" ]; then + log_info "No workload data found. Run with --mode profile first." + exit 1 + fi + + rocprof-compute analyze -p $WORKLOAD_DIR --dispatch $DISPATCH_ID > dispatch_${DISPATCH_ID}_analysis.txt 2>&1 + + log_step "Analysis complete!" + echo "" + log_info "Analysis saved to: dispatch_${DISPATCH_ID}_analysis.txt" + echo "" + head -n 100 dispatch_${DISPATCH_ID}_analysis.txt + fi + ;; + + *) + log_info "Unknown mode: $MODE" + log_info "Use --help for usage information" + exit 1 + ;; +esac + +echo "" +log_info "======================================================================" +log_info "rocprof-compute Complete!" 
+log_info "======================================================================"
+echo ""
+
+
diff --git a/MLExamples/TinyOpenFold/version2_pytorch_fused/run_rocprof_sys.sh b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_rocprof_sys.sh
new file mode 100755
index 00000000..bb95f502
--- /dev/null
+++ b/MLExamples/TinyOpenFold/version2_pytorch_fused/run_rocprof_sys.sh
@@ -0,0 +1,398 @@
+#!/bin/bash
+
+# rocprof-sys-python Profiling Integration for Tiny OpenFold V2
+# This script provides Python call stack profiling with source-level instrumentation
+# Based on: https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html
+#
+# Flow of this first section:
+#   1. define colored logging helpers and default workload sizes
+#   2. parse command-line options (or print help and exit)
+#   3. warn if no rocprofsys binding matches the active python3 version
+#   4. pick the profiler launcher (helper script or `python3 -m rocprofsys`)
+#   5. create the output directory and print the chosen configuration
+
+# Abort on the first failing command.
+set -e
+
+# Color codes
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+PURPLE='\033[0;35m'
+NC='\033[0m'
+
+# Logging helpers: print "$1" behind a colored tag.
+log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
+log_step() { echo -e "${BLUE}[STEP]${NC} $1"; }
+log_rocprof() { echo -e "${PURPLE}[ROCPROF-SYS]${NC} $1"; }
+
+# Fail with an explicit message when an option that takes a value is the last
+# argument. Without this, `shift 2` returns non-zero and `set -e` terminates
+# the script without telling the user what went wrong.
+# Usage: require_value "$@"   (with the option name still in $1)
+require_value() {
+    if [ $# -lt 2 ]; then
+        echo "Missing value for option: $1" >&2
+        exit 1
+    fi
+}
+
+# Default configuration (smaller defaults for profiling to reduce output size)
+BATCH_SIZE=2
+SEQ_LEN=16
+NUM_BLOCKS=4
+NUM_SEQS=16
+NUM_STEPS=30
+OUTPUT_DIR="./rocprof_sys_results_$(date +%Y%m%d_%H%M%S)"
+ENABLE_ALL_FUSION=true
+
+# Fall back to the conventional ROCm prefix when ROCM_PATH is not set, so the
+# ${ROCM_PATH}/... lookups below do not silently resolve against "/".
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --batch-size) require_value "$@"; BATCH_SIZE="$2"; shift 2 ;;
+        --seq-len) require_value "$@"; SEQ_LEN="$2"; shift 2 ;;
+        --num-blocks) require_value "$@"; NUM_BLOCKS="$2"; shift 2 ;;
+        --num-seqs) require_value "$@"; NUM_SEQS="$2"; shift 2 ;;
+        --num-steps) require_value "$@"; NUM_STEPS="$2"; shift 2 ;;
+        --output-dir) require_value "$@"; OUTPUT_DIR="$2"; shift 2 ;;
+        --disable-all-fusion) ENABLE_ALL_FUSION=false; shift ;;
+        --help|-h)
+            echo "rocprof-sys-python Profiling for Tiny OpenFold V2"
+            echo ""
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "This script uses rocprof-sys-python for Python call stack profiling"
+            echo "with source-level instrumentation. See:"
+            echo "https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html"
+            echo ""
+            echo "Options:"
+            echo " --batch-size N Batch size (default: 2, smaller for profiling)"
+            echo " --seq-len N Sequence length (default: 16, smaller for profiling)"
+            echo " --num-blocks N Number of Evoformer blocks (default: 4)"
+            echo " --num-seqs N Number of MSA sequences (default: 16)"
+            echo " --num-steps N Training steps (default: 30)"
+            echo " --output-dir DIR Output directory"
+            echo " --disable-all-fusion Disable all fusions"
+            echo ""
+            echo "Examples:"
+            echo " $0 # Profile with defaults (batch=2, seq=16)"
+            echo " $0 --batch-size 4 --seq-len 64 # Larger workload"
+            echo " $0 --disable-all-fusion # Baseline comparison"
+            echo ""
+            echo "Output:"
+            echo " - Python call stack profiling with function call counts"
+            echo " - ROCPD trace files (.rocpd or .rocpd.json) for AI/ML workloads"
+            echo " - Detailed profiling log in rocprof_sys.log"
+            echo ""
+            echo "Configuration:"
+            echo " The script sets up environment variables for rocprof-sys-python:"
+            echo " - Sources setup-env.sh: Automatically sets PYTHONPATH, PATH, LD_LIBRARY_PATH"
+            echo " - PYTHONPATH: Includes rocprofsys package location (if not set by setup-env.sh)"
+            echo " - ROCPROFSYS_PROFILE=ON: Enables profiling"
+            echo " - ROCPROFSYS_USE_ROCPD: Automatically enabled if rocpd package is found"
+            echo " (checks Python site-packages for current ROCm version)"
+            echo " - ROCPROFSYS_USE_TRACE: Enabled if ROCPD is not available, disabled otherwise"
+            echo " - PATH: Includes ROCm share/rocprofiler-systems for schema discovery"
+            echo " - LD_LIBRARY_PATH: Includes PyTorch lib and ROCm lib directories"
+            echo ""
+            echo "Note: ROCPD format is recommended for AI/ML workloads (better child thread support)"
+            echo " The script automatically detects if rocpd is available and enables it accordingly."
+            echo " See: https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html"
+            echo ""
+            echo "Config file:"
+            echo " Default config file: ~/.rocprof-sys.cfg"
+            echo " If ROCPROFSYS_CONFIG_FILE is not set, rocprof-sys will check for ~/.rocprof-sys.cfg"
+            echo " If the file doesn't exist, default built-in configuration is used."
+            exit 0
+            ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+# Check Python version matches compiled bindings
+# The Python interpreter major.minor version must match the version used to compile the bindings
+# See: https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html
+PYTHON_VERSION=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "")
+if [ -n "$PYTHON_VERSION" ]; then
+    # Check if matching library exists (e.g., libpyrocprofsys.cpython-312-x86_64-linux-gnu.so for Python 3.12)
+    # The extension-module tag drops the dot (3.12 -> 312); replacing it with
+    # "_" produced "3_12", which never matches and made this warning fire
+    # unconditionally.
+    PYTHON_MAJOR_MINOR=$(echo "$PYTHON_VERSION" | tr -d '.')
+    if [ ! -f "${ROCM_PATH}/lib/python${PYTHON_VERSION}/site-packages/rocprofsys/libpyrocprofsys.cpython-${PYTHON_MAJOR_MINOR}-x86_64-linux-gnu.so" ]; then
+        log_info "Warning: Python ${PYTHON_VERSION} bindings may not be available."
+        # ROCM_PATH is quoted; the python* glob is left outside the quotes on purpose.
+        log_info "Available bindings: $(find "${ROCM_PATH}"/lib/python*/site-packages/rocprofsys -name 'libpyrocprofsys*.so' 2>/dev/null | head -1 | xargs basename 2>/dev/null || echo 'Not found')"
+        log_info "The Python version must match the version used to compile the bindings."
+    fi
+fi
+
+# Check for rocprof-sys-python or python3 -m rocprofsys
+# The helper script is preferred; the module form is the fallback. Without
+# either one there is nothing to profile with, so the script stops here.
+ROCPROF_SYS_PYTHON_CMD=""
+if command -v rocprof-sys-python &> /dev/null; then
+    ROCPROF_SYS_PYTHON_CMD="rocprof-sys-python"
+    log_rocprof "Using rocprof-sys-python helper script"
+elif python3 -m rocprofsys --help &> /dev/null; then
+    ROCPROF_SYS_PYTHON_CMD="python3 -m rocprofsys"
+    log_rocprof "Using python3 -m rocprofsys"
+else
+    log_info "rocprof-sys-python not found. Please ensure ROCm Systems Profiler Python bindings are installed."
+    log_info "The Python package should be in: ${ROCM_PATH}/lib/python*/site-packages/rocprofsys"
+    log_info "Or ensure PYTHONPATH includes the rocprofsys package location."
+    log_info "You may need to source: ${ROCM_PATH}/share/rocprofiler-systems/setup-env.sh"
+    exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+
+log_info "======================================================================"
+log_info "Tiny OpenFold V2 - rocprof-sys-python Call Stack Profiling"
+log_info "======================================================================"
+echo ""
+log_info "Configuration:"
+log_info " Batch size: $BATCH_SIZE"
+log_info " Sequence length: $SEQ_LEN"
+log_info " Evoformer blocks: $NUM_BLOCKS"
+log_info " MSA sequences: $NUM_SEQS"
+log_info " Training steps: $NUM_STEPS"
+log_info " All fusions: $ENABLE_ALL_FUSION"
+log_info " Output directory: $OUTPUT_DIR"
+echo ""
+
+# Build Python command
+# Arguments forwarded to the training script; the fusion flag is appended
+# only when fusions were disabled on the command line.
+PYTHON_ARGS="--batch-size $BATCH_SIZE --seq-len $SEQ_LEN --num-blocks $NUM_BLOCKS --num-seqs $NUM_SEQS --num-steps $NUM_STEPS"
+[ "$ENABLE_ALL_FUSION" = false ] && PYTHON_ARGS="$PYTHON_ARGS --disable-all-fusion"
+
+# Run profiling with Python call stack support
+log_step "Starting rocprof-sys-python profiling..."
+log_rocprof "This will generate Python call stack profiling output"
+log_rocprof "Using command: $ROCPROF_SYS_PYTHON_CMD"
+echo ""
+
+# Set environment variables for profiling
+# ROCPD output is recommended for AI/ML workloads (better child thread support)
+
+# Source setup-env.sh if available (sets PYTHONPATH, PATH, LD_LIBRARY_PATH automatically)
+# See: https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html
+if [ -f "${ROCM_PATH}/share/rocprofiler-systems/setup-env.sh" ]; then
+    # Quoted so a ROCM_PATH containing spaces is not word-split.
+    source "${ROCM_PATH}/share/rocprofiler-systems/setup-env.sh"
+    log_rocprof "Sourced setup-env.sh for environment configuration"
+fi
+
+# Ensure LD_LIBRARY_PATH includes PyTorch lib directory and ROCm lib directory
+# This is critical for PyTorch to detect ROCm GPUs and load required libraries
+# See: TinyOpenFold/README.md for details
+#
+# Every prepend below uses ${VAR:+:${VAR}} so that an initially empty variable
+# does not leave a trailing ":" behind; an empty path entry means "current
+# directory", and this script later changes into the output directory.
+if command -v python3 &> /dev/null; then
+    # Add PyTorch lib directory (contains libcaffe2_nvrtc.so and other ROCm libraries)
+    PYTORCH_LIB_DIR=$(python3 -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'lib'))" 2>/dev/null || echo "")
+    if [ -n "$PYTORCH_LIB_DIR" ] && [ -d "$PYTORCH_LIB_DIR" ]; then
+        export LD_LIBRARY_PATH="${PYTORCH_LIB_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        log_rocprof "Added PyTorch lib directory to LD_LIBRARY_PATH: $PYTORCH_LIB_DIR"
+    fi
+
+    # Add ROCm lib directory (if not already in LD_LIBRARY_PATH)
+    if [[ "$LD_LIBRARY_PATH" != *"${ROCM_PATH}/lib"* ]]; then
+        export LD_LIBRARY_PATH="${ROCM_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        log_rocprof "Added ROCm lib directory to LD_LIBRARY_PATH: ${ROCM_PATH}/lib"
+    fi
+
+    # Add system library paths (for libdrm.so.2, libatomic.so.1, etc.)
+    if [[ "$LD_LIBRARY_PATH" != *"/usr/lib64"* ]]; then
+        export LD_LIBRARY_PATH="/usr/lib64:/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        log_rocprof "Added system library paths to LD_LIBRARY_PATH"
+    fi
+fi
+
+# Ensure PYTHONPATH includes rocprofsys package (if setup-env.sh didn't set it)
+# The Python package is installed in lib/pythonX.Y/site-packages/rocprofsys
+# See: https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/profiling-python-scripts.html
+if [ -z "$PYTHONPATH" ] || [[ "$PYTHONPATH" != *"rocprofsys"* ]]; then
+    # Try to find Python version and add appropriate path
+    PYTHON_VERSION=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "3.12")
+    ROCPROFSYS_PYTHON_PATH="${ROCM_PATH}/lib/python${PYTHON_VERSION}/site-packages"
+    if [ -d "$ROCPROFSYS_PYTHON_PATH" ]; then
+        export PYTHONPATH="${ROCPROFSYS_PYTHON_PATH}${PYTHONPATH:+:${PYTHONPATH}}"
+        log_rocprof "Added $ROCPROFSYS_PYTHON_PATH to PYTHONPATH"
+    fi
+fi
+
+# Basic system setup for rocprof-sys configuration
+# Set config file only if ~/.rocprof-sys.cfg exists, otherwise use defaults
+if [ -f "$HOME/.rocprof-sys.cfg" ]; then
+    export ROCPROFSYS_CONFIG_FILE="$HOME/.rocprof-sys.cfg"
+    log_rocprof "Using config file: $HOME/.rocprof-sys.cfg"
+else
+    unset ROCPROFSYS_CONFIG_FILE
+    log_rocprof "Config file not found, using default built-in configuration"
+fi
+
+# Enable profiling
+export ROCPROFSYS_PROFILE=ON
+
+# Detect ROCm version and check for rocpd availability
+# ROCPD is enabled only if it's packaged with the Python package for the current ROCm version
+# Detection order: loaded environment module, then the ROCM_PATH directory
+# name, then the first /opt/rocm-* directory that carries a version suffix.
+ROCM_VERSION=$(module list 2>&1 | grep -oP 'rocm/\K[0-9.]+' | head -1 || echo "")
+if [ -z "$ROCM_VERSION" ]; then
+    # Try to detect from ROCM_PATH or common locations
+    if [ -n "$ROCM_PATH" ]; then
+        ROCM_VERSION=$(basename "$ROCM_PATH" | grep -oP 'rocm-\K[0-9.]+' || echo "")
+    fi
+    if [ -z "$ROCM_VERSION" ]; then
+        # Check common ROCm installation paths
+        for rocm_path in /opt/rocm-*; do
+            if [ -d "$rocm_path" ]; then
+                ROCM_VERSION=$(basename "$rocm_path" | grep -oP 'rocm-\K[0-9.]+' || echo "")
+                [ -n "$ROCM_VERSION" ] && break
+            fi
+        done
+    fi
+fi
+
+# Get Python version
+PYTHON_VERSION=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "3.12")
+
+# Check if rocpd is available in Python site-packages for current ROCm version
+ROCPD_AVAILABLE=false
+if [ -n "$ROCM_VERSION" ]; then
+    # Check multiple possible ROCm paths
+    for rocm_base in "/opt/rocm-${ROCM_VERSION}" "/opt/rocm/${ROCM_VERSION}" "$ROCM_PATH"; do
+        if [ -n "$rocm_base" ] && [ -d "$rocm_base" ]; then
+            ROCPD_PATH="${rocm_base}/lib/python${PYTHON_VERSION}/site-packages/rocpd"
+            if [ -d "$ROCPD_PATH" ]; then
+                ROCPD_AVAILABLE=true
+                log_rocprof "Found rocpd package at: $ROCPD_PATH"
+                break
+            fi
+        fi
+    done
+fi
+
+# ROCPD output configuration
+# ROCPD is enabled only if available (better child thread support for AI/ML workloads)
+if [ "$ROCPD_AVAILABLE" = true ]; then
+    export ROCPROFSYS_USE_ROCPD=ON
+    log_rocprof "ROCPD enabled (rocpd package found)"
+
+    # Try setting schema path (may not be respected if hardcoded)
+    if [ -n "$ROCM_VERSION" ]; then
+        for rocm_base in "/opt/rocm-${ROCM_VERSION}" "/opt/rocm/${ROCM_VERSION}" "$ROCM_PATH"; do
+            if [ -n "$rocm_base" ] && [ -d "$rocm_base" ]; then
+                SCHEMA_PATH="${rocm_base}/share/rocprofiler-systems/rocpd_tables.sql"
+                if [ -f "$SCHEMA_PATH" ]; then
+                    export ROCPROFSYS_ROCPD_SCHEMA_PATH="$SCHEMA_PATH"
+                    log_rocprof "Set ROCPD schema path: $SCHEMA_PATH"
+                    break
+                fi
+            fi
+        done
+    fi
+else
+    export ROCPROFSYS_USE_ROCPD=OFF
+    log_rocprof "ROCPD disabled (rocpd package not found for ROCm ${ROCM_VERSION:-unknown} / Python ${PYTHON_VERSION})"
+fi
+
+# Trace output configuration (Perfetto format)
+# Use Perfetto trace if ROCPD is not available
+if [ "$ROCPD_AVAILABLE" = false ]; then
+    export ROCPROFSYS_USE_TRACE=ON
+    log_rocprof "Using Perfetto trace format (ROCPD not available)"
+else
+    export ROCPROFSYS_USE_TRACE=OFF
+    log_rocprof "Using ROCPD format (Perfetto trace disabled)"
+fi
+
+# Optional: Enable ROCProfiler integration
+# export ROCPROFSYS_USE_ROCPROFILER=ON
+
+# Optional: Configure profiling components (e.g., trip_count, wall_clock, etc.)
+# export ROCPROFSYS_TIMEMORY_COMPONENTS="trip_count,wall_clock"
+
+# Verify GPU/ROCm availability before running
+log_step "Verifying GPU/ROCm availability..."
+if command -v rocm-smi &> /dev/null; then
+    log_info "ROCm detected - checking GPU availability..."
+    if rocm-smi &> /dev/null; then
+        # grep -c always prints a count and exits 1 when that count is zero, so
+        # `|| echo "0"` would append a second "0" and break the numeric test
+        # below. `|| true` only neutralizes the exit status.
+        GPU_COUNT=$(rocm-smi --showproductname 2>/dev/null | grep -c "Card series" || true)
+        if [ "${GPU_COUNT:-0}" -gt 0 ]; then
+            log_info "Found $GPU_COUNT GPU(s) via rocm-smi"
+            rocm-smi --showproductname 2>/dev/null | grep "Card series" | head -1 || true
+        else
+            log_info "rocm-smi available but no GPUs detected"
+        fi
+    fi
+else
+    log_info "rocm-smi not found - GPU detection may be limited"
+fi
+
+# Verify PyTorch can see ROCm devices
+# The probe exits non-zero when no GPU is visible. `|| true` keeps `set -e`
+# from aborting on that status, so the diagnostics and the confirmation prompt
+# below are actually reached; the decision is made from the captured text.
+log_step "Verifying PyTorch ROCm support..."
+PYTORCH_GPU_CHECK=$(python3 -c "
+import sys
+try:
+    import torch
+    if torch.cuda.is_available():
+        print(f'PyTorch GPU: Available ({torch.cuda.device_count()} device(s))')
+        for i in range(torch.cuda.device_count()):
+            print(f' Device {i}: {torch.cuda.get_device_name(i)}')
+        sys.exit(0)
+    else:
+        print('PyTorch GPU: Not available')
+        print(' torch.cuda.is_available() = False')
+        sys.exit(1)
+except Exception as e:
+    print(f'PyTorch GPU check failed: {e}')
+    sys.exit(1)
+" 2>&1) || true
+
+if echo "$PYTORCH_GPU_CHECK" | grep -q "Not available\|failed"; then
+    log_info "Warning: PyTorch cannot detect GPU devices"
+    # NOTE(review): the next message mentions DeepSpeed, which nothing in this
+    # script references - possibly carried over from another example; confirm.
+    log_info "This may cause DeepSpeed to fall back to CPU mode"
+    log_info ""
+    log_info "Common causes:"
+    log_info " 1. Missing ROCm libraries in LD_LIBRARY_PATH"
+    log_info " 2. PyTorch not built with ROCm support"
+    log_info " 3. HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES incorrectly set"
+    log_info ""
+    log_info "Current LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-not set}"
+    log_info "Current HIP_VISIBLE_DEVICES: ${HIP_VISIBLE_DEVICES:-not set}"
+    log_info "Current ROCR_VISIBLE_DEVICES: ${ROCR_VISIBLE_DEVICES:-not set}"
+    echo ""
+    # Single-keystroke confirmation; anything other than y/Y aborts.
+    read -p "Continue anyway? [y/N] " -n 1 -r
+    echo ""
+    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+        exit 1
+    fi
+else
+    echo "$PYTORCH_GPU_CHECK"
+fi
+echo ""
+
+cd "$OUTPUT_DIR"
+# rocprof-sys-python syntax: rocprof-sys-python --trace --