From 29e9e278a331eeba8b44d84a718a4bc49a50dd5b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 20 Oct 2022 11:23:53 -0700 Subject: [PATCH 001/427] Change version to 0.2.0 Signed-off-by: Przemek Tredak --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 3f69f33a74..0ea3a944b3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.0dev +0.2.0 From 73166c4e3f6cf0e754045ba22ff461ef96453aeb Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 23 Nov 2022 09:45:36 -0800 Subject: [PATCH 002/427] Full activation recompute checkpointing bug fix (#31) fix checkpoint loading bug for FAR Signed-off-by: Kirthi Shankar Sivamani Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/fp8.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index e25a413d4f..8fafdafa3e 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -69,13 +69,13 @@ def get_global_fp8_recompute_buffer() -> Dict[str, List[torch.Tensor]]: return _fp8_tensors_recompute_buffer -def set_global_fp8_recompute_buffer(buffer: List[Deque[torch.Tensor]]) -> None: +def set_global_fp8_recompute_buffer(buffer: List[Deque[List[torch.Tensor]]]) -> None: """Sets global fp8 recompute buffer.""" global _fp8_tensors_recompute_buffer # Map all tensors back to GPU. 
for index, deck in enumerate(buffer): - buffer[index] = deque([tensor.cuda() for tensor in deck]) + buffer[index] = deque([[t.cuda() for t in tensors] for tensors in deck]) _fp8_tensors_recompute_buffer = buffer @@ -118,11 +118,11 @@ def copy_forward_fp8_meta_tensors_for_recompute(fp8_meta: Dict[str, Any]) -> Non global _fp8_tensors_recompute_buffer buffer_position_key = "global_fp8_buffer_pos_fwd_recompute" - to_copy = ( + to_copy = [ fp8_meta["scaling_fwd"].amax_history.clone(), fp8_meta["scaling_fwd"].scale.clone(), fp8_meta["scaling_fwd"].scale_inv.clone(), - ) + ] if buffer_position_key in fp8_meta: _fp8_tensors_recompute_buffer[fp8_meta[buffer_position_key]].append(to_copy) From 126232df4e87cea7a46278ebb23f47397315d0c0 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 31 Jan 2023 10:09:48 -0800 Subject: [PATCH 003/427] Address steady memory increase and bloated checkpoints (#63) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/fp8.py | 18 +----------------- transformer_engine/pytorch/module.py | 8 +++----- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index fd05358a93..e4cce98931 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -5,7 +5,7 @@ """FP8 utilies for TransformerEngine""" from contextlib import contextmanager from collections import deque -from typing import Callable, List, Optional, Dict, Any, Tuple, Union, Deque +from typing import Callable, List, Optional, Dict, Any, Tuple, Union import torch import transformer_engine_extensions as tex @@ -64,22 +64,6 @@ def set_global_fp8_buffer(buffer: Dict[str, List[torch.Tensor]]) -> None: _global_fp8_buffer = buffer -def get_global_fp8_recompute_buffer() -> Dict[str, List[torch.Tensor]]: - """Returns global fp8 recompute buffer.""" - return _fp8_tensors_recompute_buffer - - -def set_global_fp8_recompute_buffer(buffer: 
List[Deque[List[torch.Tensor]]]) -> None: - """Sets global fp8 recompute buffer.""" - global _fp8_tensors_recompute_buffer - - # Map all tensors back to GPU. - for index, deck in enumerate(buffer): - buffer[index] = deque([[t.cuda() for t in tensors] for tensors in deck]) - - _fp8_tensors_recompute_buffer = buffer - - def setup_amax_forward_global_reduce_func(f: Callable) -> None: """Sets up the function to call during autocast exit.""" global _amax_forward_global_reduce_func diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index 0a6cae3b4a..ada798c374 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -32,8 +32,6 @@ amax_and_scale_update, get_global_fp8_buffer, set_global_fp8_buffer, - get_global_fp8_recompute_buffer, - set_global_fp8_recompute_buffer, set_amax_buffer_key_deletion, delete_key_from_amax_buffer, copy_forward_fp8_meta_tensors_for_recompute, @@ -201,7 +199,6 @@ def get_extra_state(self) -> Union[List[Any], None]: state["scale_bwd"] = self.fp8_meta["scaling_bwd"].scale state["amax_history_bwd"] = self.fp8_meta["scaling_bwd"].amax_history state["global_fp8_buffer"] = get_global_fp8_buffer() - state["global_fp8_recompute_buffer"] = get_global_fp8_recompute_buffer() # Store other pickelable values. extra = {} @@ -254,11 +251,11 @@ def set_extra_state(self, state: Union[List[Any], None]) -> None: # Restore global FP8 buffer states. set_global_fp8_buffer(state["global_fp8_buffer"]) - set_global_fp8_recompute_buffer(state["global_fp8_recompute_buffer"]) - # Load extra items. self.fp8_meta.update(state["extra_fp8_variables"]) self.fp8_meta["recipe"].amax_history_len = state["amax_history_fwd"].shape[0] + if "global_fp8_buffer_pos_fwd_recompute" in self.fp8_meta: + del self.fp8_meta["global_fp8_buffer_pos_fwd_recompute"] # Initialize before loading. 
self.init_fp8_meta_tensors() @@ -433,6 +430,7 @@ def prepare_forward( # Activation recomputation is used and this is the first forward phase. if ( self.fp8 + and self.training and is_fp8_activation_recompute_enabled() and not in_fp8_activation_recompute_phase() ): From ce58fc2fe786776fef43fcf1a3bb1baaf09ee03a Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 28 Feb 2023 23:05:43 -0800 Subject: [PATCH 004/427] 3rd party acknowledgements (#82) add 3rd party acknowledgements Signed-off-by: Kirthi Shankar Sivamani --- Acknowledgements.txt | 140 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 Acknowledgements.txt diff --git a/Acknowledgements.txt b/Acknowledgements.txt new file mode 100644 index 0000000000..7eec81a9ce --- /dev/null +++ b/Acknowledgements.txt @@ -0,0 +1,140 @@ +This software includes third-party components under the following licenses: + +======================== +GoogleTest + +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +======================== +pybind11 + +Copyright (c) 2016 Wenzel Jakob , All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Please also refer to the file CONTRIBUTING.md, which clarifies licensing of +external contributions to this project including patches, pull requests, etc. + +======================== +PyTorch + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +======================== +FlashAttn + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +All contributions by Nvidia: +Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From 4c358916450c74d03a882e1eda572dd380cfd527 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 2 Mar 2023 10:56:33 -0800 Subject: [PATCH 005/427] Fix unfused QKV params case; stack vs interleave option (#83) * fix qkv weight unfused path Signed-off-by: Kirthi Shankar Sivamani * fix non FA non interleaved case Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/transformer.py | 79 +++++++++++++++++------ transformer_engine/pytorch/utils.py | 9 ++- 2 files changed, 63 insertions(+), 25 deletions(-) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index c0989f9c93..046dda20b2 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -24,7 +24,7 @@ from transformer_engine.pytorch.utils import ( divide, attention_mask_func, - split_tensor_along_last_dim, + split_tensor_along_dim, cast_if_needed, get_default_init_method, ) @@ -126,11 +126,11 @@ def forward( ) # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view( + query_layer = query_layer.reshape( output_size[2], output_size[0] * output_size[1], -1 ) # [sk, b, np, hn] -> 
[sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + key_layer = key_layer.reshape(output_size[3], output_size[0] * output_size[1], -1) # preallocting result tensor: [b * np, sq, sk] matmul_result = torch.empty( @@ -171,7 +171,7 @@ def forward( ) # change view [sk, b * np, hn] - value_layer = value_layer.view( + value_layer = value_layer.reshape( value_layer.size(0), output_size[0] * output_size[1], -1 ) @@ -504,6 +504,7 @@ def __init__( set_parallel_mode: bool = False, fuse_qkv_params: bool = False, zero_centered_gamma: bool = False, + qkv_weight_interleaved: bool = True, ) -> None: super().__init__() self.layer_number = (layer_number,) @@ -515,6 +516,10 @@ def __init__( self.params_dtype = params_dtype self.init_method = init_method + if not fuse_qkv_params: + qkv_weight_interleaved = False + self.qkv_weight_interleaved = qkv_weight_interleaved + assert ( attention_type in AttnTypes ), f"attention_type {attention_type} not supported" @@ -703,16 +708,28 @@ def forward( is_first_microbatch=is_first_microbatch, ) - # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) + if self.qkv_weight_interleaved: + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + # split along last dimension + split_dim = -1 + else: + # [sq, b, (np * 3 * hn)] --> [sq, b, 3 * np, hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + 3 * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + # split along second last dimension + split_dim = -2 + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - query_layer, key_layer, value_layer = split_tensor_along_last_dim( - mixed_x_layer, 3 + # 
mixed_x_layer --> 3 [sq, b, np, hn] + query_layer, key_layer, value_layer = split_tensor_along_dim( + mixed_x_layer, split_dim, 3 ) else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] @@ -721,15 +738,27 @@ def forward( is_first_microbatch=is_first_microbatch, ) - # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] - new_tensor_shape = mixed_kv_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 2 * self.hidden_size_per_attention_head, - ) + if self.qkv_weight_interleaved: + # [sq, b, (np * 2 * hn)] --> [sq, b, np, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) + # split along last dimension + split_dim = -1 + else: + # [sq, b, (np * 2 * hn)] --> [sq, b, 2 * np, hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + ( + 2 * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + # split along second last dimension + split_dim = -2 + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) - # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] - (key_layer, value_layer) = split_tensor_along_last_dim(mixed_kv_layer, 2) + # mixed_kv_layer --> 2 [sk, b, np, hn] + key_layer, value_layer = split_tensor_along_dim(mixed_kv_layer, split_dim, 2) # Attention head [sq, b, h] --> [sq, b, hp] if self.input_layernorm: @@ -863,7 +892,12 @@ class TransformerLayer(torch.nn.Module): .. math:: y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * (1 + \gamma) + \beta - + qkv_weight_interleaved : bool, default = `True` + if set to `False`, the QKV weight is interpreted as a concatenation of + query, key, and value weights along the `0th` dimension. The default + interpretation is that the individual `q`, `k`, and `v` weights for each + attention head are interleaved. This parameter is set to `False` when + using :attr:`fuse_qkv_params=False`. 
Parallelism parameters ---------------------- set_parallel_mode : bool, default = `False` @@ -938,6 +972,7 @@ def __init__( set_parallel_mode: bool = False, fuse_qkv_params: bool = False, zero_centered_gamma: bool = False, + qkv_weight_interleaved: bool = True, ) -> None: super().__init__() @@ -958,6 +993,9 @@ def __init__( not fuse_wgrad_accumulation ), "Gradient accumulation fusion requires single QKV parameter." + if not fuse_qkv_params: + qkv_weight_interleaved = False + self.kv_channels = ( kv_channels if kv_channels else (hidden_size // num_attention_heads) ) @@ -995,6 +1033,7 @@ def __init__( "set_parallel_mode": set_parallel_mode, "fuse_qkv_params": fuse_qkv_params, "zero_centered_gamma": zero_centered_gamma, + "qkv_weight_interleaved" : qkv_weight_interleaved, } self.self_attention = MultiHeadAttention( diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index a71891b8e9..9f1ddaa2b2 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -78,8 +78,8 @@ def divide(numerator: int, denominator: int) -> int: return numerator // denominator -def split_tensor_along_last_dim( - tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False +def split_tensor_along_dim( + tensor: torch.Tensor, dim: int, num_partitions: int, contiguous_split_chunks: bool = False ) -> Tuple[torch.Tensor, ...]: """Split a tensor along its last dimension. Arguments: @@ -89,10 +89,9 @@ def split_tensor_along_last_dim( in memory. """ # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = divide(tensor.size()[last_dim], num_partitions) + split_size = divide(tensor.size()[dim], num_partitions) # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + tensor_list = torch.split(tensor, split_size, dim=dim) # Note: torch.split does not create contiguous tensors by default. 
if contiguous_split_chunks: return tuple(chunk.contiguous() for chunk in tensor_list) From bb1203894d4cf5007e00a8004bb1b10740cfbee5 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 7 Mar 2023 09:26:18 -0800 Subject: [PATCH 006/427] Fix flash attention (#84) * ignore self attention mask for causal type Signed-off-by: Kirthi Shankar Sivamani * further relax checks to run FA, update docs Signed-off-by: Kirthi Shankar Sivamani * fix pytorch softmax path Signed-off-by: Kirthi Shankar Sivamani * fixes Signed-off-by: Kirthi Shankar Sivamani * minimum ampere requirement for fa Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- tests/test_onnx_export.py | 1 + transformer_engine/pytorch/softmax.py | 13 +++++++ transformer_engine/pytorch/transformer.py | 46 ++++++++++++++--------- transformer_engine/pytorch/utils.py | 7 ++++ 4 files changed, 49 insertions(+), 18 deletions(-) diff --git a/tests/test_onnx_export.py b/tests/test_onnx_export.py index f43899c33f..7d905612b4 100644 --- a/tests/test_onnx_export.py +++ b/tests/test_onnx_export.py @@ -793,6 +793,7 @@ def test_export_core_attention( if attn_mask_type is None: attn_mask_type = 'causal' + inp = (query_layer, key_layer, value_layer) model = te.transformer.DotProductAttention( num_attention_heads=num_attention_heads, kv_channels=kv_channels, diff --git a/transformer_engine/pytorch/softmax.py b/transformer_engine/pytorch/softmax.py index 8bdb3e1c82..775f3fedd9 100644 --- a/transformer_engine/pytorch/softmax.py +++ b/transformer_engine/pytorch/softmax.py @@ -16,6 +16,15 @@ THREADS_PER_BLOCK = 128 +_default_causal_mask = {} + +def _get_default_causal_mask(sq: int) -> torch.Tensor: + """Return the causal upper triangular mask for softmax input""" + if sq not in _default_causal_mask: + _default_causal_mask[sq] = torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool() + return _default_causal_mask[sq] + + class 
ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): """ Fused operation which performs following three operations in sequence @@ -274,6 +283,10 @@ def forward_torch_softmax( if self.scale is not None: inp = inp * self.scale + + if self.attn_mask_type == "causal": + mask = _get_default_causal_mask(inp.size()[2]) + mask_output = self.mask_func(inp, mask) if mask is not None else inp probs = torch.nn.Softmax(dim=-1)(mask_output) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 046dda20b2..a9a3b84aa0 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -27,6 +27,7 @@ split_tensor_along_dim, cast_if_needed, get_default_init_method, + get_device_compute_capability, ) from transformer_engine.pytorch.constants import ( AttnMaskTypes, @@ -220,9 +221,6 @@ def __init__( assert ( attn_mask_type == "causal" ), 'FlashAttention currently only supports causal attention mask.' - assert ( - attention_softmax_in_fp32 - ), 'FlashAttention currently only supports softmax compute in fp32.' self.attn_causal_mask = attn_mask_type == "causal" self.norm_factor = norm_factor @@ -230,6 +228,7 @@ def __init__( self.attention_dropout = attention_dropout self.layer_number = layer_number self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 def forward( self, @@ -287,6 +286,11 @@ class DotProductAttention(torch.nn.Module): representation subspaces as described in the paper: `Attention Is All You Need `_. + .. note:: + + Argument :attr:`attention_mask` will be ignored in the `forward` call when + :attr:`attn_mask_type` is set to `"causal"`. + .. warning:: For the default attention mechanism, this module executes a non-deterministic version of @@ -303,15 +307,6 @@ class DotProductAttention(torch.nn.Module): number of key-value channels. 
attention_dropout: float, default = 0.0 dropout probability for the dropout op during multi-head attention. - layer_number: int, default = `None` - layer number of the current `DotProductAttention` when multiple such modules - are concatenated, for instance in consecutive transformer blocks. - apply_query_key_layer_scaling: bool, default = `False` - apply query-key layer scaling during BMM1 - by a factor of `layer_number` - attention_softmax_in_fp32: bool, default = `True` - if set to `False`, softmax is executed in - the dtype of activation tensors. attn_mask_type: {'causal', 'padding'}, default = `causal` type of attention mask passed into softmax operation. @@ -371,9 +366,8 @@ def __init__( self.use_flash_attention = ( int(os.getenv("NVTE_FLASH_ATTN", "1")) - and attention_softmax_in_fp32 and attn_mask_type == "causal" - and not apply_query_key_layer_scaling + and get_device_compute_capability() >= 8.0 ) attn_kwargs = { @@ -422,6 +416,11 @@ def forward( """ Dot Product Attention Layer. + .. note:: + + Argument :attr:`attention_mask` will be ignored when :attr:`attn_mask_type` + is set to `"causal"`. + .. 
note:: Input tensors :attr:`query_layer`, :attr:`key_layer`, and :attr:`value_layer` @@ -448,8 +447,7 @@ def forward( """ use_flash_attention = self.use_flash_attention - if (attention_mask is not None - or query_layer.dtype not in [torch.bfloat16, torch.float16] + if (query_layer.dtype not in [torch.bfloat16, torch.float16] or key_layer.dtype not in [torch.bfloat16, torch.float16] or value_layer.dtype not in [torch.bfloat16, torch.float16] ): @@ -515,6 +513,7 @@ def __init__( self.return_layernorm_output = return_layernorm_output self.params_dtype = params_dtype self.init_method = init_method + self.attn_mask_type = attn_mask_type if not fuse_qkv_params: qkv_weight_interleaved = False @@ -658,7 +657,7 @@ def forward( """MultiHeadAttention FWD""" # hidden_states: [sq, b, h] - if attention_mask is not None: + if self.attn_mask_type != "causal" and attention_mask is not None: assert ( attention_mask.dtype == torch.bool ), "Attention mask must be a boolean tensor" @@ -836,6 +835,11 @@ class TransformerLayer(torch.nn.Module): TransformerLayer is made up of an attention block and a feedforward network (MLP). This standard layer is based on the paper "Attention Is All You Need". + .. note:: + + Argument :attr:`attention_mask` will be ignored in the `forward` call when + :attr:`self_attn_mask_type` is set to `"causal"`. + Parameters ---------- hidden_size : int @@ -983,6 +987,7 @@ def __init__( self.apply_residual_connection_post_layernorm = ( apply_residual_connection_post_layernorm ) + self.self_attn_mask_type = self_attn_mask_type assert ( self_attn_mask_type in AttnMaskTypes ), f"self_attn_mask_type {self_attn_mask_type} not supported" @@ -1129,6 +1134,11 @@ def forward( """ Transformer Layer: attention block and a feedforward network (MLP) + .. note:: + + Argument :attr:`attention_mask` will be ignored when :attr:`self_attn_mask_type` + is set to `"causal"`. 
+ Parameters ---------- hidden_states : torch.Tensor @@ -1163,7 +1173,7 @@ def forward( hidden_states = hidden_states.contiguous() - if attention_mask is not None: + if self.self_attn_mask_type != "causal" and attention_mask is not None: assert ( attention_mask.dtype == torch.bool ), "Attention mask must be a boolean tensor" diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 9f1ddaa2b2..798bcfb332 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -8,6 +8,13 @@ import torch +def get_device_compute_capability() -> float: + """Returns the cuda compute capability of current GPU""" + major = torch.cuda.get_device_properties(torch.cuda.current_device()).major + minor = torch.cuda.get_device_properties(torch.cuda.current_device()).minor + return major + minor / 10 + + def attention_mask_func( attention_scores: torch.Tensor, attention_mask: torch.Tensor ) -> torch.Tensor: From f18e6773d9ed1aca1f497f6a2d3a927a21a372ea Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 24 Feb 2023 17:54:09 -0800 Subject: [PATCH 007/427] fix bug in non-FP8 nvfuser path (#81) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index a5c247926a..22434ab887 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -2204,7 +2204,7 @@ def forward( gelu=not bias_gelu_nvfusion, ) - if bias_gelu_nvfusion and is_grad_enabled: + if bias_gelu_nvfusion: fc1_out, _, _ = fc1_outputs gelu_out = bias_gelu_fused(fc1_out, fc1_bias) else: From f4955d3a510cab9e40ac63ffa180d9e6702ad603 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Mon, 20 Mar 2023 17:17:38 -0700 Subject: [PATCH 008/427] Add SECURITY.md (#110) Signed-off-by: Przemek Tredak --- SECURITY.md | 24 ++++++++++++++++++++++++ 1 file changed, 
24 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..35edb61b01 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,24 @@ +## Security + +NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. + +If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub/GitLab.** + +## Reporting Potential Security Vulnerability in an NVIDIA Product + +To report a potential security vulnerability in any NVIDIA product: +- Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) +- E-Mail: psirt@nvidia.com + - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) + - Please include the following information: + - Product/Driver name and version/branch that contains the vulnerability + - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) + - Instructions to reproduce the vulnerability + - Proof-of-concept or exploit code + - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability + +While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 
+ +## NVIDIA Product Security + +For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security From e5ab21131c3d185823229b4f86cc3d54a3b39edf Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 22 Mar 2023 00:41:49 -0700 Subject: [PATCH 009/427] Catch FA internal error with compute capability 8.6 (#113) FA doesn't support compute 8.6 with head_dim>64 Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 1869228c2e..cbd0622947 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -353,10 +353,11 @@ def __init__( norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.device_compute_capability = get_device_compute_capability() self.use_flash_attention = ( int(os.getenv("NVTE_FLASH_ATTN", "1")) and attn_mask_type == "causal" - and get_device_compute_capability() >= 8.0 + and self.device_compute_capability >= 8.0 ) attn_kwargs = { @@ -437,6 +438,7 @@ def forward( if (query_layer.dtype not in [torch.bfloat16, torch.float16] or key_layer.dtype not in [torch.bfloat16, torch.float16] or value_layer.dtype not in [torch.bfloat16, torch.float16] + or (self.device_compute_capability == 8.6 and key_layer.shape[-1] > 64) ): use_flash_attention = False From 7e8c3e69da100e485895e44ec9c1699cb1add629 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 28 Mar 2023 09:42:26 -0700 Subject: [PATCH 010/427] Fix usage of return_bias argument (#114) * fix usage of return_bias argument Signed-off-by: Kirthi Shankar Sivamani * review comments Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/module.py | 28 +++++++++++------------ transformer_engine/pytorch/transformer.py | 4 ++-- 2 files 
changed, 16 insertions(+), 16 deletions(-) diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index 4b67f1b91a..4e012be58c 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -1123,6 +1123,7 @@ def __init__( self.fuse_wgrad_accumulation = fuse_wgrad_accumulation self.use_bias = bias self.return_bias = return_bias + self.apply_bias = bias and not return_bias self.return_layernorm_output = return_layernorm_output self.parameters_split = parameters_split self.zero_centered_gamma = zero_centered_gamma @@ -1187,7 +1188,7 @@ def __init__( stride=1, ) - if self.use_bias or self.return_bias: + if self.use_bias: self.register_buffer("bias_tensor", torch.empty( self.out_features, @@ -1229,7 +1230,7 @@ def __init__( stride=1, ) - if self.use_bias or self.return_bias: + if self.use_bias: self.register_parameter( bname, Parameter(self.bias_tensor[i * split_size : (i+1) * split_size]) ) @@ -1246,9 +1247,8 @@ def __init__( # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM - if self.parallel_mode == "row" and self.use_bias: + if self.parallel_mode == "row" and self.apply_bias: self.gemm_bias_unfused_add = True - self.use_bias = False else: self.gemm_bias_unfused_add = False @@ -1331,7 +1331,7 @@ def forward( self.weight1_fp8 if self.fp8 else None, self.weight1_t_fp8 if self.fp8 else None, bias_tensor, - self.use_bias, + self.apply_bias and not self.gemm_bias_unfused_add, self.eps, is_first_microbatch, self.fp8, @@ -1776,6 +1776,7 @@ def __init__( self.fuse_wgrad_accumulation = fuse_wgrad_accumulation self.use_bias = bias self.return_bias = return_bias + self.apply_bias = bias and not return_bias self.parameters_split = parameters_split if tp_group is None: @@ -1819,7 +1820,7 @@ def __init__( stride=1, ) - if self.use_bias or self.return_bias: + if self.use_bias: self.register_buffer("bias_tensor", torch.empty( self.out_features, @@ -1861,7 +1862,7 @@ def 
__init__( stride=1, ) - if self.use_bias or self.return_bias: + if self.use_bias: self.register_parameter( bname, Parameter(self.bias_tensor[i * split_size : (i+1) * split_size]) ) @@ -1878,9 +1879,8 @@ def __init__( # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM - if self.parallel_mode == "row" and self.use_bias: + if self.parallel_mode == "row" and self.apply_bias: self.gemm_bias_unfused_add = True - self.use_bias = False else: self.gemm_bias_unfused_add = False @@ -1946,7 +1946,7 @@ def forward( self.weight1_t_fp8 if self.fp8 else None, inp, bias_tensor, - self.use_bias, + self.apply_bias and not self.gemm_bias_unfused_add, is_first_microbatch, self.fp8, self.fp8_calibration, @@ -2667,6 +2667,7 @@ def __init__( self.fuse_wgrad_accumulation = fuse_wgrad_accumulation self.use_bias = bias self.return_bias = return_bias + self.apply_bias = bias and not return_bias self.return_layernorm_output = return_layernorm_output self.bias_gelu_nvfusion = bool(int(os.getenv("NVTE_BIAS_GELU_NVFUSION", "1"))) self.set_parallel_mode = set_parallel_mode @@ -2759,7 +2760,7 @@ def __init__( stride=1, ) - if self.use_bias or self.return_bias: + if self.use_bias: self.fc2_bias = Parameter( torch.empty( hidden_size, device=torch.cuda.current_device(), dtype=params_dtype @@ -2770,9 +2771,8 @@ def __init__( # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM - if self.set_parallel_mode and self.use_bias: + if self.set_parallel_mode and self.apply_bias: self.gemm_bias_unfused_add = True - self.use_bias = False else: self.gemm_bias_unfused_add = False @@ -2845,7 +2845,7 @@ def forward( self.weight2_fp8 if self.fp8 else None, self.weight2_t_fp8 if self.fp8 else None, self.fc2_bias, - self.use_bias, + self.apply_bias and not self.gemm_bias_unfused_add, self.eps, is_first_microbatch, self.fp8, diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 
cbd0622947..774c9fd11e 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -607,7 +607,7 @@ def __init__( hidden_size, hidden_size, init_method=output_layer_init_method, - bias=False, + bias=True, return_bias=True, parallel_mode="row" if set_parallel_mode else None, **common_gemm_kwargs, @@ -1059,7 +1059,7 @@ def __init__( get_rng_state_tracker=get_rng_state_tracker, init_method=init_method, output_layer_init_method=output_layer_init_method, - bias=False, + bias=True, return_bias=True, sequence_parallel=self.sequence_parallel, params_dtype=params_dtype, From 626da0deca4b77cfe1e0ad2de970d39938f43210 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Tue, 28 Mar 2023 09:44:15 -0700 Subject: [PATCH 011/427] Fix zombie process when querying TE install path (#121) * Remove zombie process from querying TE install path Co-authored-by: Naman Goyal Signed-off-by: Tim Moon * Fix FA version checking Signed-off-by: Kirthi Shankar Sivamani * fix unused import error Signed-off-by: Kirthi Shankar Sivamani * Fix lint warning Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Naman Goyal Co-authored-by: Kirthi Shankar Sivamani --- transformer_engine/common/__init__.py | 24 +++++++++++------------ transformer_engine/pytorch/transformer.py | 4 ++-- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 791ba793a8..7dfcdc96bb 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -3,25 +3,23 @@ # See LICENSE for license information. 
"""FW agnostic user-end APIs""" +import ctypes +import os +import platform +import subprocess def get_te_path(): - """Find TE path using pip""" + """Find Transformer Engine install path using pip""" - import os - - te_info = ( - os.popen("pip show transformer_engine").read().replace("\n", ":").split(":") - ) - return te_info[te_info.index("Location") + 1].strip() + command = ["pip", "show", "transformer_engine"] + result = subprocess.run(command, capture_output=True, check=True, text=True) + result = result.stdout.replace("\n", ":").split(":") + return result[result.index("Location")+1].strip() def _load_library(): - """Load TE .so""" - - import os - import ctypes - import platform + """Load shared library with Transformer Engine C extensions""" system = platform.system() if system == "Linux": @@ -31,7 +29,7 @@ def _load_library(): elif system == "Windows": extension = "dll" else: - raise "Unsupported operating system " + system + "." + raise RuntimeError(f"Unsupported operating system ({system})") lib_name = "libtransformer_engine." 
+ extension dll_path = get_te_path() dll_path = os.path.join(dll_path, lib_name) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 774c9fd11e..fa00fb86fc 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -4,9 +4,9 @@ """Transformer.""" import os -import re import math import warnings +from importlib.metadata import version from contextlib import nullcontext from typing import Any, Callable, Optional, Tuple, Union @@ -42,7 +42,7 @@ checkpoint, ) -_flash_attn_version = re.search("Version: (.*)", os.popen("pip show flash_attn").read()).group(1) +_flash_attn_version = version("flash-attn") warnings.filterwarnings("module", category=DeprecationWarning, module="transformer") From 084b1e54a5d5bc84e380cbebde18d53d0243fc5a Mon Sep 17 00:00:00 2001 From: Jeng Bai-Cheng Date: Wed, 29 Mar 2023 01:39:20 +0800 Subject: [PATCH 012/427] [JAX] Add TE examples (#108) * refactor JAX examples Signed-off-by: Ryan Jeng * fix doc-string Signed-off-by: Ryan Jeng * add dp example Signed-off-by: Ryan Jeng * refactor Signed-off-by: Ryan Jeng * fix params_axes_pspec Signed-off-by: Ryan Jeng * Add model parallel example and refactor Update readme Signed-off-by: Ryan Jeng * align code and readme Signed-off-by: Ryan Jeng * update verification Signed-off-by: Ryan Jeng * add mask Signed-off-by: Ryan Jeng * num_gpu is configurable Signed-off-by: Ryan Jeng * update readme Signed-off-by: Ryan Jeng * update readme Signed-off-by: Ryan Jeng * solvepylint issue Signed-off-by: Ryan Jeng * ignore markdown and txt file from license check Signed-off-by: Ryan Jeng * Update README.md Signed-off-by: Ryan Jeng * add flax into requirements.txt Signed-off-by: Ryan Jeng --------- Signed-off-by: Ryan Jeng --- examples/jax/README.md | 7 + examples/jax/encoder/README.md | 69 +++ examples/jax/encoder/requirements.txt | 4 + .../encoder/test_model_parallel_encoder.py | 441 ++++++++++++++++++ 
examples/jax/encoder/test_multigpu_encoder.py | 420 +++++++++++++++++ .../encoder/test_single_gpu_bf16_training.py | 75 --- .../jax/encoder/test_single_gpu_encoder.py | 344 ++++++++++++++ .../encoder/test_single_gpu_fp8_training.py | 99 ---- examples/jax/mnist/README.md | 34 ++ examples/jax/mnist/requirements.txt | 3 + examples/jax/mnist/test_single_gpu_mnist.py | 311 ++++++++++++ qa/L0_jax_unittest/test.sh | 3 + qa/L0_license/config.json | 4 +- qa/L0_license/copyright_checker.py | 1 + tests/jax/test_mnist.py | 227 --------- transformer_engine/jax/module.py | 8 +- transformer_engine/jax/transformer.py | 4 +- 17 files changed, 1646 insertions(+), 408 deletions(-) create mode 100644 examples/jax/README.md create mode 100644 examples/jax/encoder/README.md create mode 100644 examples/jax/encoder/requirements.txt create mode 100644 examples/jax/encoder/test_model_parallel_encoder.py create mode 100644 examples/jax/encoder/test_multigpu_encoder.py delete mode 100644 examples/jax/encoder/test_single_gpu_bf16_training.py create mode 100644 examples/jax/encoder/test_single_gpu_encoder.py delete mode 100644 examples/jax/encoder/test_single_gpu_fp8_training.py create mode 100644 examples/jax/mnist/README.md create mode 100644 examples/jax/mnist/requirements.txt create mode 100644 examples/jax/mnist/test_single_gpu_mnist.py delete mode 100644 tests/jax/test_mnist.py diff --git a/examples/jax/README.md b/examples/jax/README.md new file mode 100644 index 0000000000..d2c98f15c2 --- /dev/null +++ b/examples/jax/README.md @@ -0,0 +1,7 @@ +# Transformer Engine Examples # + +This folder contains simple examples introducing Transformer Engine and FP8 training usage. 
+ +**Examples Outline** +* MNIST training: Training on the MNIST dataset is a good starting point to learn how to use Transformer Engine and enable FP8 training +* Encoder training: The encoder examples introduce more about how to scale up training on multiple GPUs with Transformer Engine \ No newline at end of file diff --git a/examples/jax/encoder/README.md b/examples/jax/encoder/README.md new file mode 100644 index 0000000000..388f2f40c6 --- /dev/null +++ b/examples/jax/encoder/README.md @@ -0,0 +1,69 @@ +# Basic Transformer Encoder Example with Optional FP8 # + +This example uses a Transformer Encoder to demonstrate Transformer Engine usage, with a focus on scaling up training on multiple GPUs. We highly recommend studying the [MNIST example of the Transformer Engine](/examples/jax/mnist) before reading this example. The Transformer Engine is built on top of [Flax](https://github.com/google/flax). Thus, examples use `pjit` to set up multiple GPU training. For basic pjit usage, refer to [Scale up Flax Modules on multiple devices with pjit](https://flax.readthedocs.io/en/latest/guides/flax_on_pjit.html). + +## Single GPU ## + +1. Setup dataset: This is done by using the `tfds` library to download the GLUE/CoLA dataset and using `nltk` to tokenize the sentences. This example focuses on Transformer Engine usage. Thus, a simple algorithm is used to convert tokens to INT32 tensors as input to the embedding layer. The `get_datasets` and `data_preprocess` routines are used for this purpose. + +2. Define model: The `Net` class is a small Transformer Encoder model for sentence classification. The Transformer Engine provides `te.TransformerLayer` as the encoder block and `te.DenseGeneral`. For the structure of the encoder block, refer to [Scaling Up Models and Data with t5x and seqio](https://arxiv.org/abs/2203.17189) + +3. Build training loop: The `train_and_evaluate` routine is the main routine to initialize the model and start training and evaluating.
Use the `fp8_autocast` context manager to enable FP8 training and check `var_collect` if the variable collection contains `Float8`. + +4. Training process: In `train_step`, combine the FP8 metadata and latest model parameters into var_collect as a frozen dictionary and pass it to the gradient function. And then, call `te.update_fp8_metas` to update FP8 metadata. The number of training steps to update FP8 metadata can be customized. In this example, it is updated every step. + +5. Evaluating process: As in the training process, the FP8 metadata needs to be in var_collect and passed to the loss function if FP8 computing is enabled. + +### Run ### + +```bash +python test_single_gpu_encoder.py +python test_single_gpu_encoder.py --use-fp8 +``` + +## Multiple GPU with Data Parallelism ## + +1. Data parallelism (DP) divides a mini-batch across multiple devices, and each device has the complete model parameters. In this example, the first dimension of the input tensor is `batch_size`, which is 64 by default, and 8 GPUs are used to train the model, so each device takes 8 sentences at once. The "dividing" is called "sharding" in the JAX documentation. + +2. In order to let JAX know how to do sharding, the `device_mesh` needs to be defined and each axis needs to be named. A common way to annotate axis names is `data` which means the mesh dimension used for data-parallel sharding of the batch dimension of inputs and activations. And the first argument of `te.ShardingResource` is the name of the device axis which is used for data parallelism. + +3. On the model side, the logical axis of each weight tensor of the model can be named. The `te.TransformerLayer` has the default names, which are stored in `abs_var_collect`, a collection of variables returned by `jax.eval_shape(encoder.init, ...)`. The key index is `params_axes`. The `te.DenseGeneral` doesn't have the default named axis because it is generic.
Also, data-parallel sharding doesn't need to divide the weight tensors, so named axes are not required in this case. But because te.DenseGeneral is based on [XLA custom-call](https://www.tensorflow.org/xla/custom_call) and [xmap](https://jax.readthedocs.io/en/latest/notebooks/xmap_tutorial.html), the `sharding_type` must be set to map weights and xmap correctly. + +4. The next step is to create sharding rules, mapping each logical axis to a device axis. The `te.extend_logical_axis_rules` under fp8_autocast will return a list of pairs of the mapping, such as `(('batch', 'data'), ...)`. The first is the logical axis and the second is the device axis. + +5. Refer to the structure of `abs_var_collect['params']` and `abs_var_collect['params_axes']` to set up `PartitionSpec` for pjit. All logical axes should be replaced by device axes. If the value of PartitionSpec is None, that means no sharding, broadcasting the data to every device. Note that the `params_axes` attribute is provided by Transformer Engine. Flax's own modules, such as `nn.Embed`, don't have it. For nn.Embed, assigning an empty PartitionSpec is fine because each device has its own embedding layer in DP mode. The `get_params_pspec` routine is used for this purpose. Because each device has a complete model in DP mode, all values of PartitionSpec in params_pspec should be None. This will be different in the model parallelism example. + +6. Pass `params_pspec` and `encoder.init` to pjit to get a compiled function, `pjit_encoder_init`, and use it to initialize the model, so that JAX knows how to do the sharding. + +7. The `train_step` and `eval_step` also need to be compiled by pjit. Thus, a `PartitionSpec` has to be set up for every input and output argument that contains a tensor. For instance, the `input_pspec` is `PartitionSpec('data', None)` because the input shape is (batch size, sequence length). Then, the rest of the workflow is similar to the previous example.
+ +### Run ### + +```bash +python test_multigpu_encoder.py +python test_multigpu_encoder.py --use-fp8 +``` + +## Multiple GPU with Model Parallelism ## + +1. Model parallelism, also known as tensor parallelism (TP), divides a model across multiple devices, and each device has part of the model parameters. This example builds on the previous DP example, but divides the model across two devices. + +2. To set up the device mesh for TP, add a new named axis called `model`, which is used for sharding parameters of the model across devices. This example divides the model into two parts (`num_gpu_tp = 2`). One device only has half of the model. + +3. On the model side, the `te.TransformerLayer` doesn't need additional settings because it has the default axis name already. It will be divided by `DEVICE_TP_AXIS` during model initialization. The first `te.DenseGeneral` is divided by columns and the second one is divided by rows for TP. Because `te.DenseGeneral` doesn't have the default named axis, the names must be set manually by passing `kernel_axes` and `bias_axes` arguments. Then, the rest of the workflow is similar to the previous example. + +4. Tips for debugging TP: + * Use [inspect_array_sharding](https://jax.readthedocs.io/en/latest/_autosummary/jax.debug.inspect_array_sharding.html) or [visualize_array_sharding](https://jax.readthedocs.io/en/latest/_autosummary/jax.debug.visualize_array_sharding.html) to check the shape of activations and weights. + * Check the shape of the device buffer of a weight tensor. For instance, `var_collect['params']['DenseGeneral_0']['kernel'].device_buffers[device_id].shape`. The `device_id` is an integer. If a weight tensor's shape is (256, 256) and you intend to divide it across two devices along the second dimension, then the shape returned by device_buffers should be (256, 128). + * Dump XLA HLO by setting `XLA_FLAGS` and see whether it contains unexpected `all-gather` operations or not.
+ ```python + import os + os.environ['XLA_FLAGS'] = "--xla_dump_hlo_as_proto --xla_dump_hlo_as_text --xla_dump_hlo_as_html --xla_dump_to=" + ``` + +### Run ### + +```bash +python test_model_parallel_encoder.py +python test_model_parallel_encoder.py --use-fp8 +``` diff --git a/examples/jax/encoder/requirements.txt b/examples/jax/encoder/requirements.txt new file mode 100644 index 0000000000..bc1b755cb9 --- /dev/null +++ b/examples/jax/encoder/requirements.txt @@ -0,0 +1,4 @@ +flax +nltk +optax +tensorflow-datasets diff --git a/examples/jax/encoder/test_model_parallel_encoder.py b/examples/jax/encoder/test_model_parallel_encoder.py new file mode 100644 index 0000000000..10c880710e --- /dev/null +++ b/examples/jax/encoder/test_model_parallel_encoder.py @@ -0,0 +1,441 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +""" Encoder training on multi-GPU with tensor parallelism""" +import argparse +import unittest +from functools import partial + +import jax +import jax.numpy as jnp +import nltk +import numpy as np +import optax +import tensorflow_datasets as tfds +from cuda import cudart +from flax import linen as nn +from flax.core.frozen_dict import FrozenDict +from flax.training import train_state +from jax.experimental import mesh_utils +from jax.experimental.pjit import pjit + +import transformer_engine.jax as te + +DEVICE_DP_AXIS = 'data' +DEVICE_TP_AXIS = 'model' +NAMED_BROADCAST_AXIS = 'my_broadcast_axis' +NAMED_TP_AXIS = 'my_tp_axis' +PARAMS_KEY = 'params' +PARAMS_AXES_KEY = PARAMS_KEY + '_axes' +DROPOUT_KEY = 'dropout' +INPUT_KEY = 'input_rng' + + +def check_num_gpu(desired_num_gpu): + """Check if the number of GPUs are correct.""" + actual_num_gpu = len(jax.local_devices()) + assert actual_num_gpu == desired_num_gpu, f"Number of GPUs is mismatch.
" \ + f"{desired_num_gpu} GPUs are assigned, but the actual number of GPUs is {actual_num_gpu}" + + +def gpu_has_fp8(): + """Check if the GPU has FP8.""" + cudaSuccess = cudart.cudaError_t.cudaSuccess + ret, gpu_id = cudart.cudaGetDevice() + assert ret == cudaSuccess + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor + _, major = cudart.cudaDeviceGetAttribute(flag, gpu_id) + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor + _, minor = cudart.cudaDeviceGetAttribute(flag, gpu_id) + sm_arch = major * 10 + minor + return sm_arch >= 89 + + +class Net(nn.Module): + """NLP Encoder""" + num_embed: int + + @nn.compact + def __call__(self, x, mask, disable_dropout=False): + x = nn.Embed(num_embeddings=self.num_embed, features=256, dtype=jnp.bfloat16)(x) + + te_Encoder = partial(te.TransformerLayer, + hidden_size=256, + mlp_hidden_size=1024, + num_attention_heads=8, + hidden_dropout=0.1, + attention_dropout=0.1, + dropout_rng_name=DROPOUT_KEY, + layer_type=te.TransformerLayerType.ENCODER, + enable_relative_embedding=False, + dtype=jnp.bfloat16) + x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) + + x = x.reshape(x.shape[0], -1) + + x = te.DenseGeneral(features=256, + kernel_axes=(NAMED_BROADCAST_AXIS, NAMED_TP_AXIS), + bias_axes=(NAMED_TP_AXIS,), + sharding_type=te.ShardingType.DP_TP_COL, + dtype=jnp.bfloat16)(x) + + x = te.DenseGeneral(features=256, + kernel_axes=(NAMED_TP_AXIS, NAMED_BROADCAST_AXIS), + bias_axes=(NAMED_BROADCAST_AXIS,), + sharding_type=te.ShardingType.DP_TP_ROW, + dtype=jnp.bfloat16)(x) + + x = nn.Dense(features=2, dtype=jnp.bfloat16)(x) + return x + + +def train_step(state, inputs, masks, labels, var_collect, rngs, use_fp8): + """Computes gradients, loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + logits = state.apply_fn(var_collect, inputs, masks, disable_dropout, rngs=rngs) + one_hot = jax.nn.one_hot(labels, 2) + loss = 
jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + grad_fn = jax.value_and_grad(loss_fn, has_aux=True) + (loss, logits), grads = grad_fn(var_collect) + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + + var_collect, grads = grads.pop(PARAMS_KEY) + state = state.apply_gradients(grads=grads) + if use_fp8: + var_collect = te.update_fp8_metas(var_collect) + + return state, loss, accuracy, var_collect + + +def train_epoch(state, train_ds, batch_size, rngs, var_collect, use_fp8, train_fn): + """Train for a single epoch.""" + train_ds_size = len(train_ds['sentence']) + steps_per_epoch = train_ds_size // batch_size + perms = jax.random.permutation(rngs[INPUT_KEY], train_ds_size) + perms = perms[:steps_per_epoch * batch_size] # skip incomplete batch + perms = perms.reshape((steps_per_epoch, batch_size)) + epoch_loss = [] + epoch_accuracy = [] + + for perm in perms: + batch_inputs = train_ds['sentence'][perm, ...] + batch_masks = train_ds['mask'][perm, ...] + batch_labels = train_ds['label'][perm, ...] 
+ state, loss, accuracy, var_collect = train_fn(state, batch_inputs, batch_masks, + batch_labels, var_collect, rngs, use_fp8) + epoch_loss.append(loss) + epoch_accuracy.append(accuracy) + + avg_loss = np.mean(epoch_loss) + avg_accuracy = np.mean(epoch_accuracy) + return state, avg_loss, avg_accuracy, var_collect + + +def eval_step(state, inputs, masks, labels, var_collect): + """Computes loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + logits = state.apply_fn(var_collect, inputs, masks, disable_dropout) + one_hot = jax.nn.one_hot(labels, 2) + loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + loss, logits = loss_fn(var_collect, disable_dropout=True) + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + return loss, accuracy + + +def eval_model(state, test_ds, batch_size, var_collect, eval_fn): + """Evaluation loop.""" + test_ds_size = len(test_ds['sentence']) + num_steps = test_ds_size // batch_size + valid_size = num_steps * batch_size + all_loss = [] + all_accuracy = [] + + for batch_start in range(0, valid_size, batch_size): + batch_end = batch_start + batch_size + batch_inputs = test_ds['sentence'][batch_start:batch_end] + batch_masks = test_ds['mask'][batch_start:batch_end] + batch_labels = test_ds['label'][batch_start:batch_end] + loss, accuracy = eval_fn(state, batch_inputs, batch_masks, batch_labels, var_collect) + all_loss.append(loss) + all_accuracy.append(accuracy) + + avg_loss = np.mean(all_loss) + avg_accuracy = np.mean(all_accuracy) + return avg_loss, avg_accuracy + + +def data_preprocess(dataset, vocab, word_id, max_seq_len): + """Convert tokens to numbers.""" + nltk.download('punkt') + dataset_size = len(dataset['sentence']) + output = np.zeros((dataset_size, max_seq_len), dtype=np.int32) + mask_3d = np.empty((dataset_size, max_seq_len, max_seq_len), dtype=np.uint8) + + for j, 
sentence in enumerate(dataset['sentence']): + tokens = nltk.word_tokenize(sentence.decode("utf-8")) + tensor = output[j] + mask_1d = np.zeros((1, max_seq_len), dtype=np.uint8) + + for i, word in enumerate(tokens): + if i >= max_seq_len: + break + + if word not in vocab: + vocab[word] = word_id + tensor[i] = word_id + word_id = word_id + 1 + else: + tensor[i] = vocab[word] + + mask_1d[0, i] = 1 + + mask_2d = mask_3d[j] + np.dot(mask_1d.T, mask_1d, out=mask_2d) + np.subtract(1, mask_2d, out=mask_2d) + + dataset['sentence'] = output + dataset['label'] = dataset['label'].astype(np.float32) + dataset['mask'] = mask_3d.reshape((dataset_size, 1, max_seq_len, max_seq_len)) + return dataset, vocab, word_id + + +def get_datasets(max_seq_len): + """Load GLUE train and test datasets into memory.""" + vocab = {} + word_id = 0 + dataset = 'glue/cola' + train_ds = tfds.as_numpy(tfds.load(dataset, split='train', batch_size=-1)) + train_ds, vocab, word_id = data_preprocess(train_ds, vocab, word_id, max_seq_len) + test_ds = tfds.as_numpy(tfds.load(dataset, split='validation', batch_size=-1)) + test_ds, vocab, word_id = data_preprocess(test_ds, vocab, word_id, max_seq_len) + return train_ds, test_ds, word_id + + +def check_fp8(state, var_collect, inputs, masks, labels): + "Check if model includes FP8." 
+ rngs = {DROPOUT_KEY: jax.random.PRNGKey(0)} + assert "Float8" in str( + jax.make_jaxpr(train_step, static_argnums=6)(state, inputs, masks, labels, var_collect, + rngs, True)) + + +def get_params_pspec(sharding_rules, abs_var_collect): + """Refer params to create params partition spec""" + rules_dict = {} + for key, value in sharding_rules: + rules_dict[key] = value + + def to_device_axis(logical_axis): + partitions = [rules_dict[key] for key in logical_axis] + return jax.sharding.PartitionSpec(*partitions) + + params_axes = abs_var_collect.get(PARAMS_AXES_KEY, {}) + params_axes_pspec = jax.tree_map(to_device_axis, nn.partitioning.get_axis_names(params_axes)) + params_pspec = jax.tree_map(lambda x: jax.sharding.PartitionSpec(), abs_var_collect[PARAMS_KEY]) + params_pspec = FrozenDict({**params_pspec, **params_axes_pspec}) + return params_pspec + + +def get_state_pspec(state, params_pspec): + """Refer params_pspec to create state partition spec""" + + def replace_params(x): + return params_pspec if isinstance(x, FrozenDict) else None + + state_pspec = jax.tree_map(replace_params, state, is_leaf=lambda x: isinstance(x, FrozenDict)) + return state_pspec + + +def train_and_evaluate(args): + """Execute model training and evaluation loop.""" + print(args) + check_num_gpu(args.num_gpu) + + if args.use_fp8: + assert gpu_has_fp8(), "GPU needs to support FP8." 
+ + num_gpu_tp = 2 + if args.num_gpu % num_gpu_tp == 0: + num_gpu_dp = args.num_gpu // num_gpu_tp + else: + num_gpu_dp = 1 + num_gpu_tp = 1 + + assert args.batch_size % num_gpu_dp == 0, f"Batch size needs to be multiple of {num_gpu_dp}" + assert args.test_batch_size % num_gpu_dp == 0, \ + f"Test batch size needs to be multiple of {num_gpu_dp}" + + device_mesh = mesh_utils.create_device_mesh((num_gpu_dp, num_gpu_tp)) + with jax.sharding.Mesh(devices=device_mesh, axis_names=(DEVICE_DP_AXIS, DEVICE_TP_AXIS)): + + rng = jax.random.PRNGKey(args.seed) + rng, params_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + init_rngs = {PARAMS_KEY: params_rng, DROPOUT_KEY: dropout_rng} + + input_shape = [args.batch_size, args.max_seq_len] + mask_shape = [args.batch_size, 1, args.max_seq_len, args.max_seq_len] + label_shape = [args.batch_size] + + with te.fp8_autocast(args.use_fp8, + sharding_resource=te.ShardingResource(DEVICE_DP_AXIS, DEVICE_TP_AXIS)): + train_ds, test_ds, num_embed = get_datasets(args.max_seq_len) + encoder = Net(num_embed) + inputs = jnp.zeros(input_shape, dtype=jnp.int32) + masks = jnp.zeros(mask_shape, dtype=jnp.uint8) + abs_var_collect = jax.eval_shape(encoder.init, init_rngs, inputs, masks) + + customized_rules = ((NAMED_BROADCAST_AXIS, None), (NAMED_TP_AXIS, DEVICE_TP_AXIS)) + sharding_rules = te.extend_logical_axis_rules(tuple()) + customized_rules + params_pspec = get_params_pspec(sharding_rules, abs_var_collect) + inputs_pspec = jax.sharding.PartitionSpec(DEVICE_DP_AXIS, None) + masks_pspec = jax.sharding.PartitionSpec(DEVICE_DP_AXIS, None, None, None) + + in_shardings = (None, inputs_pspec, masks_pspec) + out_shardings = FrozenDict({key: params_pspec if key is PARAMS_KEY else None \ + for key in abs_var_collect}) + pjit_encoder_init = pjit(encoder.init, in_shardings, out_shardings) + var_collect = pjit_encoder_init(init_rngs, inputs, masks) + + optimizer = optax.adamw(args.lr) + var_collect, params = var_collect.pop(PARAMS_KEY) + 
state = train_state.TrainState.create(apply_fn=encoder.apply, + params=params, + tx=optimizer) + state_pspec = get_state_pspec(state, params_pspec) + labels_pspec = jax.sharding.PartitionSpec(DEVICE_DP_AXIS,) + + in_shardings = (state_pspec, inputs_pspec, masks_pspec, labels_pspec, None, None) + out_shardings = (state_pspec, None, None, None) + pjit_train_step = pjit(train_step, in_shardings, out_shardings, static_argnums=(6,)) + + in_shardings = (state_pspec, inputs_pspec, masks_pspec, labels_pspec, None) + out_shardings = (None, None) + pjit_eval_step = pjit(eval_step, in_shardings, out_shardings) + + if args.use_fp8: + labels = jnp.zeros(label_shape, dtype=jnp.bfloat16) + check_fp8(state, var_collect, inputs, masks, labels) + + if args.dry_run: + labels = jnp.zeros(label_shape, dtype=jnp.bfloat16) + rngs = {DROPOUT_KEY: dropout_rng} + pjit_train_step(state, inputs, masks, labels, var_collect, rngs, args.use_fp8) + print("PASSED") + return None + + for epoch in range(1, args.epochs + 1): + rng, input_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + rngs = {INPUT_KEY: input_rng, DROPOUT_KEY: dropout_rng} + + state, train_loss, train_accuracy, var_collect = train_epoch( + state, train_ds, args.batch_size, rngs, var_collect, args.use_fp8, + pjit_train_step) + + test_loss, test_accuracy = eval_model(state, test_ds, args.test_batch_size, + var_collect, pjit_eval_step) + + print(f"Epoch: {epoch:>2} " + f"Train Loss: {train_loss:.6f} " + f"Train Accuracy: {train_accuracy:.6f} " + f"Test Loss: {test_loss:.6f} " + f"Test Accuracy: {test_accuracy:.6f} ") + + return [train_loss, train_accuracy, test_loss, test_accuracy] + + +def encoder_parser(args): + """Training settings.""" + parser = argparse.ArgumentParser(description="JAX Encoder Example") + parser.add_argument( + "--num-gpu", + type=int, + default=8, + metavar="N", + help="number of GPUs (default: 8)", + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + 
help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for testing (default: 64)", + ) + parser.add_argument( + "--max-seq-len", + type=int, + default=32, + metavar="N", + help="maximum sequence length (default: 32)", + ) + parser.add_argument( + "--epochs", + type=int, + default=3, + metavar="N", + help="number of epochs to train (default: 3)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.0001, + metavar="LR", + help="learning rate (default: 0.0001)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--use-fp8", + action="store_true", + default=False, + help="Use FP8 for inference and training without recalibration") + + return parser.parse_args(args) + + +class TestEncoder(unittest.TestCase): + """Encoder unittests""" + + @classmethod + def setUpClass(cls): + """Run 3 epochs for testing""" + num_gpu = len(jax.local_devices()) + if num_gpu % 2 != 0: + num_gpu = 1 + cls.args = encoder_parser(["--epochs", "3", "--num-gpu", str(num_gpu)]) + + def test_te_bf16(self): + """Test Transformer Engine with BF16""" + actual = train_and_evaluate(self.args) + assert actual[0] < 0.45 and actual[1] > 0.79 + + @unittest.skipIf(not gpu_has_fp8(), reason='GPU capability is not enough to run FP8') + def test_te_fp8(self): + """Test Transformer Engine with FP8""" + self.args.use_fp8 = True + actual = train_and_evaluate(self.args) + assert actual[0] < 0.45 and actual[1] > 0.79 + + +if __name__ == "__main__": + train_and_evaluate(encoder_parser(None)) diff --git a/examples/jax/encoder/test_multigpu_encoder.py b/examples/jax/encoder/test_multigpu_encoder.py new file mode 100644 index 0000000000..9cb420b0c8 --- /dev/null +++ 
b/examples/jax/encoder/test_multigpu_encoder.py @@ -0,0 +1,420 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +""" Encoder training on multi-GPU with data parallelism""" +import argparse +import unittest +from functools import partial + +import jax +import jax.numpy as jnp +import nltk +import numpy as np +import optax +import tensorflow_datasets as tfds +from cuda import cudart +from flax import linen as nn +from flax.core.frozen_dict import FrozenDict +from flax.training import train_state +from jax.experimental import mesh_utils +from jax.experimental.pjit import pjit + +import transformer_engine.jax as te + +DEVICE_DP_AXIS = 'data' +PARAMS_KEY = 'params' +PARAMS_AXES_KEY = PARAMS_KEY + '_axes' +DROPOUT_KEY = 'dropout' +INPUT_KEY = 'input_rng' + + +def check_num_gpu(desired_num_gpu): + """Check if the number of GPUs are correct.""" + actual_num_gpu = len(jax.local_devices()) + assert actual_num_gpu == desired_num_gpu, f"Number of GPUs is mismatch. 
" \ + f"{desired_num_gpu} GPUs are assigned, but the actual number of GPUs is {actual_num_gpu}" + + +def gpu_has_fp8(): + """Check if the GPU has FP8.""" + cudaSuccess = cudart.cudaError_t.cudaSuccess + ret, gpu_id = cudart.cudaGetDevice() + assert ret == cudaSuccess + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor + _, major = cudart.cudaDeviceGetAttribute(flag, gpu_id) + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor + _, minor = cudart.cudaDeviceGetAttribute(flag, gpu_id) + sm_arch = major * 10 + minor + return sm_arch >= 89 + + +class Net(nn.Module): + """NLP Encoder""" + num_embed: int + + @nn.compact + def __call__(self, x, mask, disable_dropout=False): + x = nn.Embed(num_embeddings=self.num_embed, features=256, dtype=jnp.bfloat16)(x) + + te_Encoder = partial(te.TransformerLayer, + hidden_size=256, + mlp_hidden_size=1024, + num_attention_heads=8, + hidden_dropout=0.1, + attention_dropout=0.1, + dropout_rng_name=DROPOUT_KEY, + layer_type=te.TransformerLayerType.ENCODER, + enable_relative_embedding=False, + dtype=jnp.bfloat16) + x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) + + x = x.reshape(x.shape[0], -1) + + x = te.DenseGeneral(features=256, sharding_type=te.ShardingType.DP, dtype=jnp.bfloat16)(x) + + x = te.DenseGeneral(features=256, sharding_type=te.ShardingType.DP, dtype=jnp.bfloat16)(x) + + x = nn.Dense(features=2, dtype=jnp.bfloat16)(x) + return x + + +def train_step(state, inputs, masks, labels, var_collect, rngs, use_fp8): + """Computes gradients, loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + logits = state.apply_fn(var_collect, inputs, masks, disable_dropout, rngs=rngs) + one_hot = jax.nn.one_hot(labels, 2) + loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + grad_fn = jax.value_and_grad(loss_fn, has_aux=True) + (loss, 
logits), grads = grad_fn(var_collect) + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + + var_collect, grads = grads.pop(PARAMS_KEY) + state = state.apply_gradients(grads=grads) + if use_fp8: + var_collect = te.update_fp8_metas(var_collect) + + return state, loss, accuracy, var_collect + + +def train_epoch(state, train_ds, batch_size, rngs, var_collect, use_fp8, train_fn): + """Train for a single epoch.""" + train_ds_size = len(train_ds['sentence']) + steps_per_epoch = train_ds_size // batch_size + perms = jax.random.permutation(rngs[INPUT_KEY], train_ds_size) + perms = perms[:steps_per_epoch * batch_size] # skip incomplete batch + perms = perms.reshape((steps_per_epoch, batch_size)) + epoch_loss = [] + epoch_accuracy = [] + + for perm in perms: + batch_inputs = train_ds['sentence'][perm, ...] + batch_masks = train_ds['mask'][perm, ...] + batch_labels = train_ds['label'][perm, ...] + state, loss, accuracy, var_collect = train_fn(state, batch_inputs, batch_masks, + batch_labels, var_collect, rngs, use_fp8) + epoch_loss.append(loss) + epoch_accuracy.append(accuracy) + + avg_loss = np.mean(epoch_loss) + avg_accuracy = np.mean(epoch_accuracy) + return state, avg_loss, avg_accuracy, var_collect + + +def eval_step(state, inputs, masks, labels, var_collect): + """Computes loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + logits = state.apply_fn(var_collect, inputs, masks, disable_dropout) + one_hot = jax.nn.one_hot(labels, 2) + loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + loss, logits = loss_fn(var_collect, disable_dropout=True) + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + return loss, accuracy + + +def eval_model(state, test_ds, batch_size, var_collect, eval_fn): + """Evaluation loop.""" + test_ds_size = len(test_ds['sentence']) + num_steps = test_ds_size // batch_size + 
valid_size = num_steps * batch_size + all_loss = [] + all_accuracy = [] + + for batch_start in range(0, valid_size, batch_size): + batch_end = batch_start + batch_size + batch_inputs = test_ds['sentence'][batch_start:batch_end] + batch_masks = test_ds['mask'][batch_start:batch_end] + batch_labels = test_ds['label'][batch_start:batch_end] + loss, accuracy = eval_fn(state, batch_inputs, batch_masks, batch_labels, var_collect) + all_loss.append(loss) + all_accuracy.append(accuracy) + + avg_loss = np.mean(all_loss) + avg_accuracy = np.mean(all_accuracy) + return avg_loss, avg_accuracy + + +def data_preprocess(dataset, vocab, word_id, max_seq_len): + """Convert tokens to numbers.""" + nltk.download('punkt') + dataset_size = len(dataset['sentence']) + output = np.zeros((dataset_size, max_seq_len), dtype=np.int32) + mask_3d = np.empty((dataset_size, max_seq_len, max_seq_len), dtype=np.uint8) + + for j, sentence in enumerate(dataset['sentence']): + tokens = nltk.word_tokenize(sentence.decode("utf-8")) + tensor = output[j] + mask_1d = np.zeros((1, max_seq_len), dtype=np.uint8) + + for i, word in enumerate(tokens): + if i >= max_seq_len: + break + + if word not in vocab: + vocab[word] = word_id + tensor[i] = word_id + word_id = word_id + 1 + else: + tensor[i] = vocab[word] + + mask_1d[0, i] = 1 + + mask_2d = mask_3d[j] + np.dot(mask_1d.T, mask_1d, out=mask_2d) + np.subtract(1, mask_2d, out=mask_2d) + + dataset['sentence'] = output + dataset['label'] = dataset['label'].astype(np.float32) + dataset['mask'] = mask_3d.reshape((dataset_size, 1, max_seq_len, max_seq_len)) + return dataset, vocab, word_id + + +def get_datasets(max_seq_len): + """Load GLUE train and test datasets into memory.""" + vocab = {} + word_id = 0 + dataset = 'glue/cola' + train_ds = tfds.as_numpy(tfds.load(dataset, split='train', batch_size=-1)) + train_ds, vocab, word_id = data_preprocess(train_ds, vocab, word_id, max_seq_len) + test_ds = tfds.as_numpy(tfds.load(dataset, split='validation', batch_size=-1)) 
+ test_ds, vocab, word_id = data_preprocess(test_ds, vocab, word_id, max_seq_len) + return train_ds, test_ds, word_id + + +def check_fp8(state, var_collect, inputs, masks, labels): + "Check if model includes FP8." + rngs = {DROPOUT_KEY: jax.random.PRNGKey(0)} + assert "Float8" in str( + jax.make_jaxpr(train_step, static_argnums=6)(state, inputs, masks, labels, var_collect, + rngs, True)) + + +def get_params_pspec(sharding_rules, abs_var_collect): + """Refer params to create params partition spec""" + rules_dict = {} + for key, value in sharding_rules: + rules_dict[key] = value + + def to_device_axis(logical_axis): + partitions = [rules_dict[key] for key in logical_axis] + return jax.sharding.PartitionSpec(*partitions) + + params_axes = abs_var_collect.get(PARAMS_AXES_KEY, {}) + params_axes_pspec = jax.tree_map(to_device_axis, nn.partitioning.get_axis_names(params_axes)) + params_pspec = jax.tree_map(lambda x: jax.sharding.PartitionSpec(), abs_var_collect[PARAMS_KEY]) + params_pspec = FrozenDict({**params_pspec, **params_axes_pspec}) + return params_pspec + + +def get_state_pspec(state, params_pspec): + """Refer params_pspec to create state partition spec""" + + def replace_params(x): + return params_pspec if isinstance(x, FrozenDict) else None + + state_pspec = jax.tree_map(replace_params, state, is_leaf=lambda x: isinstance(x, FrozenDict)) + return state_pspec + + +def train_and_evaluate(args): + """Execute model training and evaluation loop.""" + print(args) + check_num_gpu(args.num_gpu) + assert args.batch_size % args.num_gpu == 0, f"Batch size needs to be multiple of {args.num_gpu}" + assert args.test_batch_size % args.num_gpu == 0, \ + f"Test batch size needs to be multiple of {args.num_gpu}" + + if args.use_fp8: + assert gpu_has_fp8(), "GPU needs to support FP8." 
+ + device_mesh = mesh_utils.create_device_mesh((args.num_gpu,)) + with jax.sharding.Mesh(devices=device_mesh, axis_names=(DEVICE_DP_AXIS,)): + + rng = jax.random.PRNGKey(args.seed) + rng, params_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + init_rngs = {PARAMS_KEY: params_rng, DROPOUT_KEY: dropout_rng} + + input_shape = [args.batch_size, args.max_seq_len] + mask_shape = [args.batch_size, 1, args.max_seq_len, args.max_seq_len] + label_shape = [args.batch_size] + + with te.fp8_autocast(args.use_fp8, sharding_resource=te.ShardingResource(DEVICE_DP_AXIS)): + train_ds, test_ds, num_embed = get_datasets(args.max_seq_len) + encoder = Net(num_embed) + inputs = jnp.zeros(input_shape, dtype=jnp.int32) + masks = jnp.zeros(mask_shape, dtype=jnp.uint8) + abs_var_collect = jax.eval_shape(encoder.init, init_rngs, inputs, masks) + + sharding_rules = te.extend_logical_axis_rules(tuple()) + params_pspec = get_params_pspec(sharding_rules, abs_var_collect) + inputs_pspec = jax.sharding.PartitionSpec(DEVICE_DP_AXIS, None) + masks_pspec = jax.sharding.PartitionSpec(DEVICE_DP_AXIS, None, None, None) + + in_shardings = (None, inputs_pspec, masks_pspec) + out_shardings = FrozenDict({key: params_pspec if key is PARAMS_KEY else None \ + for key in abs_var_collect}) + pjit_encoder_init = pjit(encoder.init, in_shardings, out_shardings) + var_collect = pjit_encoder_init(init_rngs, inputs, masks) + + optimizer = optax.adamw(args.lr) + var_collect, params = var_collect.pop(PARAMS_KEY) + state = train_state.TrainState.create(apply_fn=encoder.apply, + params=params, + tx=optimizer) + state_pspec = get_state_pspec(state, params_pspec) + labels_pspec = jax.sharding.PartitionSpec(DEVICE_DP_AXIS,) + + in_shardings = (state_pspec, inputs_pspec, masks_pspec, labels_pspec, None, None) + out_shardings = (state_pspec, None, None, None) + pjit_train_step = pjit(train_step, in_shardings, out_shardings, static_argnums=(6,)) + + in_shardings = (state_pspec, inputs_pspec, masks_pspec, 
labels_pspec, None) + out_shardings = (None, None) + pjit_eval_step = pjit(eval_step, in_shardings, out_shardings) + + if args.use_fp8: + labels = jnp.zeros(label_shape, dtype=jnp.bfloat16) + check_fp8(state, var_collect, inputs, masks, labels) + + if args.dry_run: + labels = jnp.zeros(label_shape, dtype=jnp.bfloat16) + rngs = {DROPOUT_KEY: dropout_rng} + pjit_train_step(state, inputs, masks, labels, var_collect, rngs, args.use_fp8) + print("PASSED") + return None + + for epoch in range(1, args.epochs + 1): + rng, input_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + rngs = {INPUT_KEY: input_rng, DROPOUT_KEY: dropout_rng} + + state, train_loss, train_accuracy, var_collect = train_epoch( + state, train_ds, args.batch_size, rngs, var_collect, args.use_fp8, + pjit_train_step) + + test_loss, test_accuracy = eval_model(state, test_ds, args.test_batch_size, + var_collect, pjit_eval_step) + + print(f"Epoch: {epoch:>2} " + f"Train Loss: {train_loss:.6f} " + f"Train Accuracy: {train_accuracy:.6f} " + f"Test Loss: {test_loss:.6f} " + f"Test Accuracy: {test_accuracy:.6f} ") + + return [train_loss, train_accuracy, test_loss, test_accuracy] + + +def encoder_parser(args): + """Training settings.""" + parser = argparse.ArgumentParser(description="JAX Encoder Example") + parser.add_argument( + "--num-gpu", + type=int, + default=8, + metavar="N", + help="number of GPUs (default: 8)", + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for testing (default: 64)", + ) + parser.add_argument( + "--max-seq-len", + type=int, + default=32, + metavar="N", + help="maximum sequence length (default: 32)", + ) + parser.add_argument( + "--epochs", + type=int, + default=3, + metavar="N", + help="number of epochs to train (default: 3)", + ) + parser.add_argument( + 
"--lr", + type=float, + default=0.0001, + metavar="LR", + help="learning rate (default: 0.0001)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--use-fp8", + action="store_true", + default=False, + help="Use FP8 for inference and training without recalibration") + + return parser.parse_args(args) + + +class TestEncoder(unittest.TestCase): + """Encoder unittests""" + + @classmethod + def setUpClass(cls): + """Run 3 epochs for testing""" + num_gpu = len(jax.local_devices()) + if num_gpu % 2 != 0: + num_gpu = 1 + cls.args = encoder_parser(["--epochs", "3", "--num-gpu", str(num_gpu)]) + + def test_te_bf16(self): + """Test Transformer Engine with BF16""" + actual = train_and_evaluate(self.args) + assert actual[0] < 0.45 and actual[1] > 0.79 + + @unittest.skipIf(not gpu_has_fp8(), reason='GPU capability is not enough to run FP8') + def test_te_fp8(self): + """Test Transformer Engine with FP8""" + self.args.use_fp8 = True + actual = train_and_evaluate(self.args) + assert actual[0] < 0.45 and actual[1] > 0.79 + + +if __name__ == "__main__": + train_and_evaluate(encoder_parser(None)) diff --git a/examples/jax/encoder/test_single_gpu_bf16_training.py b/examples/jax/encoder/test_single_gpu_bf16_training.py deleted file mode 100644 index 122f2aa599..0000000000 --- a/examples/jax/encoder/test_single_gpu_bf16_training.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# See LICENSE for license information. 
-""" Encoder with BF16 Training on single GPU""" -import jax -import jax.numpy as jnp -import optax -from flax.core.frozen_dict import FrozenDict -from flax.training import train_state - -import transformer_engine.jax as te - -PARAMS_KEY = 'params' - -BATCH = 32 -SEQLEN = 512 -HIDDEN = 1024 - - -def network(): - """NLP Encoder""" - encoder = te.TransformerLayer(hidden_size=HIDDEN, - mlp_hidden_size=4 * HIDDEN, - hidden_dropout=0.0, - attention_dropout=0.0, - layernorm_type='rmsnorm', - mlp_activations=('gelu', 'linear'), - layer_type=te.TransformerLayerType.ENCODER, - transpose_batch_sequence=True, - dtype=jnp.bfloat16) - return encoder - - -def synthesis_data(data_rng): - """Dataset generator""" - return jax.random.normal(data_rng, [SEQLEN, BATCH, HIDDEN], jnp.bfloat16) - - -def train_step(batch, state, others): - """Training function.""" - - def loss_fn(collections): - logits = state.apply_fn(collections, batch) - loss = jnp.mean(logits) - return loss - - grad_fn = jax.value_and_grad(loss_fn) - loss, grads = grad_fn(FrozenDict({PARAMS_KEY: state.params, **others})) - grads, params_grads = grads.pop(PARAMS_KEY) - state = state.apply_gradients(grads=params_grads) - return loss, state, others - - -def test_encoder(): - """Encoder example""" - rng = jax.random.PRNGKey(0) - rng, init_rng, data_rng = jax.random.split(rng, 3) - inputs = synthesis_data(data_rng) - - encoder = network() - variables = jax.jit(encoder.init)(init_rng, inputs) - variables, params = variables.pop(PARAMS_KEY) - optimizer = optax.sgd(0.001, 0.9) - state = train_state.TrainState.create(apply_fn=encoder.apply, params=params, tx=optimizer) - jitted_train_step = jax.jit(train_step) - - for i in range(5): - rng, data_rng = jax.random.split(rng) - inputs = synthesis_data(data_rng) - loss, state, variables = jitted_train_step(inputs, state, variables) - print(f"Step {i} - Loss: {loss}") - - -if __name__ == "__main__": - test_encoder() diff --git a/examples/jax/encoder/test_single_gpu_encoder.py 
b/examples/jax/encoder/test_single_gpu_encoder.py new file mode 100644 index 0000000000..bac1469b5b --- /dev/null +++ b/examples/jax/encoder/test_single_gpu_encoder.py @@ -0,0 +1,344 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +""" Encoder training on single GPU""" +import argparse +import os +import unittest +from functools import partial + +import jax +import jax.numpy as jnp +import nltk +import numpy as np +import optax +import tensorflow_datasets as tfds +from cuda import cudart +from flax import linen as nn +from flax.core.frozen_dict import FrozenDict +from flax.training import train_state + +import transformer_engine.jax as te + +PARAMS_KEY = 'params' +DROPOUT_KEY = 'dropout' +INPUT_KEY = 'input_rng' + + +def gpu_has_fp8(): + """Check if the GPU has FP8.""" + cudaSuccess = cudart.cudaError_t.cudaSuccess + ret, gpu_id = cudart.cudaGetDevice() + assert ret == cudaSuccess + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor + _, major = cudart.cudaDeviceGetAttribute(flag, gpu_id) + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor + _, minor = cudart.cudaDeviceGetAttribute(flag, gpu_id) + sm_arch = major * 10 + minor + return sm_arch >= 89 + + +class Net(nn.Module): + """NLP Encoder""" + num_embed: int + + @nn.compact + def __call__(self, x, mask, disable_dropout=False): + x = nn.Embed(num_embeddings=self.num_embed, features=256, dtype=jnp.bfloat16)(x) + + te_Encoder = partial(te.TransformerLayer, + hidden_size=256, + mlp_hidden_size=1024, + num_attention_heads=8, + hidden_dropout=0.1, + attention_dropout=0.1, + dropout_rng_name=DROPOUT_KEY, + layer_type=te.TransformerLayerType.ENCODER, + enable_relative_embedding=False, + dtype=jnp.bfloat16) + x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) + + x = x.reshape(x.shape[0], -1) + + x = te.DenseGeneral(features=256, dtype=jnp.bfloat16)(x) + + x = te.DenseGeneral(features=256, 
dtype=jnp.bfloat16)(x) + + x = nn.Dense(features=2, dtype=jnp.bfloat16)(x) + return x + + +@partial(jax.jit, static_argnums=6) +def train_step(state, inputs, masks, labels, var_collect, rngs, use_fp8): + """Computes gradients, loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + logits = state.apply_fn(var_collect, inputs, masks, disable_dropout, rngs=rngs) + one_hot = jax.nn.one_hot(labels, 2) + loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + grad_fn = jax.value_and_grad(loss_fn, has_aux=True) + (loss, logits), grads = grad_fn(var_collect) + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + + var_collect, grads = grads.pop(PARAMS_KEY) + state = state.apply_gradients(grads=grads) + if use_fp8: + var_collect = te.update_fp8_metas(var_collect) + + return state, loss, accuracy, var_collect + + +def train_epoch(state, train_ds, batch_size, rngs, var_collect, use_fp8): + """Train for a single epoch.""" + train_ds_size = len(train_ds['sentence']) + steps_per_epoch = train_ds_size // batch_size + perms = jax.random.permutation(rngs[INPUT_KEY], train_ds_size) + perms = perms[:steps_per_epoch * batch_size] # skip incomplete batch + perms = perms.reshape((steps_per_epoch, batch_size)) + epoch_loss = [] + epoch_accuracy = [] + + for perm in perms: + batch_inputs = train_ds['sentence'][perm, ...] + batch_masks = train_ds['mask'][perm, ...] + batch_labels = train_ds['label'][perm, ...] 
+ state, loss, accuracy, var_collect = train_step(state, batch_inputs, batch_masks, + batch_labels, var_collect, rngs, use_fp8) + epoch_loss.append(loss) + epoch_accuracy.append(accuracy) + + avg_loss = np.mean(epoch_loss) + avg_accuracy = np.mean(epoch_accuracy) + return state, avg_loss, avg_accuracy, var_collect + + +@jax.jit +def eval_step(state, inputs, masks, labels, var_collect): + """Computes loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + logits = state.apply_fn(var_collect, inputs, masks, disable_dropout) + one_hot = jax.nn.one_hot(labels, 2) + loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + loss, logits = loss_fn(var_collect, disable_dropout=True) + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + return loss, accuracy + + +def eval_model(state, test_ds, batch_size, var_collect): + """Evaluation loop.""" + test_ds_size = len(test_ds['sentence']) + num_steps = test_ds_size // batch_size + valid_size = num_steps * batch_size + all_loss = [] + all_accuracy = [] + + for batch_start in range(0, valid_size, batch_size): + batch_end = batch_start + batch_size + batch_inputs = test_ds['sentence'][batch_start:batch_end] + batch_masks = test_ds['mask'][batch_start:batch_end] + batch_labels = test_ds['label'][batch_start:batch_end] + loss, accuracy = eval_step(state, batch_inputs, batch_masks, batch_labels, var_collect) + all_loss.append(loss) + all_accuracy.append(accuracy) + + avg_loss = np.mean(all_loss) + avg_accuracy = np.mean(all_accuracy) + return avg_loss, avg_accuracy + + +def data_preprocess(dataset, vocab, word_id, max_seq_len): + """Convert tokens to numbers.""" + nltk.download('punkt') + dataset_size = len(dataset['sentence']) + output = np.zeros((dataset_size, max_seq_len), dtype=np.int32) + mask_3d = np.empty((dataset_size, max_seq_len, max_seq_len), dtype=np.uint8) + + for 
j, sentence in enumerate(dataset['sentence']): + tokens = nltk.word_tokenize(sentence.decode("utf-8")) + tensor = output[j] + mask_1d = np.zeros((1, max_seq_len), dtype=np.uint8) + + for i, word in enumerate(tokens): + if i >= max_seq_len: + break + + if word not in vocab: + vocab[word] = word_id + tensor[i] = word_id + word_id = word_id + 1 + else: + tensor[i] = vocab[word] + + mask_1d[0, i] = 1 + + mask_2d = mask_3d[j] + np.dot(mask_1d.T, mask_1d, out=mask_2d) + np.subtract(1, mask_2d, out=mask_2d) + + dataset['sentence'] = output + dataset['label'] = dataset['label'].astype(np.float32) + dataset['mask'] = mask_3d.reshape((dataset_size, 1, max_seq_len, max_seq_len)) + return dataset, vocab, word_id + + +def get_datasets(max_seq_len): + """Load GLUE train and test datasets into memory.""" + vocab = {} + word_id = 0 + dataset = 'glue/cola' + train_ds = tfds.as_numpy(tfds.load(dataset, split='train', batch_size=-1)) + train_ds, vocab, word_id = data_preprocess(train_ds, vocab, word_id, max_seq_len) + test_ds = tfds.as_numpy(tfds.load(dataset, split='validation', batch_size=-1)) + test_ds, vocab, word_id = data_preprocess(test_ds, vocab, word_id, max_seq_len) + return train_ds, test_ds, word_id + + +def check_fp8(state, var_collect, inputs, masks, labels): + "Check if model includes FP8." + rngs = {DROPOUT_KEY: jax.random.PRNGKey(0)} + assert "Float8" in str( + jax.make_jaxpr(train_step, static_argnums=6)(state, inputs, masks, labels, var_collect, + rngs, True)) + + +def train_and_evaluate(args): + """Execute model training and evaluation loop.""" + os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false" + print(args) + + if args.use_fp8: + assert gpu_has_fp8(), "GPU needs to support FP8." 
+ + rng = jax.random.PRNGKey(args.seed) + rng, params_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + init_rngs = {PARAMS_KEY: params_rng, DROPOUT_KEY: dropout_rng} + + input_shape = [args.batch_size, args.max_seq_len] + mask_shape = [args.batch_size, 1, args.max_seq_len, args.max_seq_len] + label_shape = [args.batch_size] + + with te.fp8_autocast(enabled=args.use_fp8): + train_ds, test_ds, num_embed = get_datasets(args.max_seq_len) + encoder = Net(num_embed) + inputs = jnp.zeros(input_shape, dtype=jnp.int32) + masks = jnp.zeros(mask_shape, dtype=jnp.uint8) + var_collect = encoder.init(init_rngs, inputs, masks) + tx = optax.adamw(args.lr) + state = train_state.TrainState.create(apply_fn=encoder.apply, + params=var_collect[PARAMS_KEY], + tx=tx) + + if args.use_fp8: + labels = jnp.zeros(label_shape, dtype=jnp.bfloat16) + check_fp8(state, var_collect, inputs, masks, labels) + + if args.dry_run: + labels = jnp.zeros(label_shape, dtype=jnp.bfloat16) + rngs = {DROPOUT_KEY: dropout_rng} + train_step(state, inputs, masks, labels, var_collect, rngs, args.use_fp8) + print("PASSED") + return None + + for epoch in range(1, args.epochs + 1): + rng, input_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + rngs = {INPUT_KEY: input_rng, DROPOUT_KEY: dropout_rng} + + state, train_loss, train_accuracy, var_collect = train_epoch( + state, train_ds, args.batch_size, rngs, var_collect, args.use_fp8) + + test_loss, test_accuracy = eval_model(state, test_ds, args.test_batch_size, var_collect) + + print(f"Epoch: {epoch:>2} " + f"Train Loss: {train_loss:.6f} " + f"Train Accuracy: {train_accuracy:.6f} " + f"Test Loss: {test_loss:.6f} " + f"Test Accuracy: {test_accuracy:.6f} ") + + return [train_loss, train_accuracy, test_loss, test_accuracy] + + +def encoder_parser(args): + """Training settings.""" + parser = argparse.ArgumentParser(description="JAX Encoder Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + 
metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for testing (default: 64)", + ) + parser.add_argument( + "--max-seq-len", + type=int, + default=32, + metavar="N", + help="maximum sequence length (default: 32)", + ) + parser.add_argument( + "--epochs", + type=int, + default=3, + metavar="N", + help="number of epochs to train (default: 3)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.0001, + metavar="LR", + help="learning rate (default: 0.0001)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--use-fp8", + action="store_true", + default=False, + help="Use FP8 for inference and training without recalibration") + + return parser.parse_args(args) + + +class TestEncoder(unittest.TestCase): + """Encoder unittests""" + + @classmethod + def setUpClass(cls): + """Run 4 epochs for testing""" + cls.args = encoder_parser(["--epochs", "3"]) + + def test_te_bf16(self): + """Test Transformer Engine with BF16""" + actual = train_and_evaluate(self.args) + assert actual[0] < 0.45 and actual[1] > 0.79 + + @unittest.skipIf(not gpu_has_fp8(), reason='GPU capability is not enough to run FP8') + def test_te_fp8(self): + """Test Transformer Engine with FP8""" + self.args.use_fp8 = True + actual = train_and_evaluate(self.args) + assert actual[0] < 0.45 and actual[1] > 0.79 + + +if __name__ == "__main__": + train_and_evaluate(encoder_parser(None)) diff --git a/examples/jax/encoder/test_single_gpu_fp8_training.py b/examples/jax/encoder/test_single_gpu_fp8_training.py deleted file mode 100644 index f03b43250a..0000000000 --- a/examples/jax/encoder/test_single_gpu_fp8_training.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. -# -# See LICENSE for license information. -""" Encoder with FP8 Training on single GPU""" -import jax -import jax.numpy as jnp -import optax -from cuda import cudart -from flax.core.frozen_dict import FrozenDict -from flax.training import train_state - -import transformer_engine.jax as te -from transformer_engine.jax.fp8 import FP8Helper -from transformer_engine.common.recipe import Format as FP8Format -from transformer_engine.common.recipe import DelayedScaling - -PARAMS_KEY = 'params' - -BATCH = 32 -SEQLEN = 512 -HIDDEN = 1024 - - -def gpu_has_fp8(): - """GPU arch has to support FP8""" - cudaSuccess = cudart.cudaError_t.cudaSuccess - ret, gpu_id = cudart.cudaGetDevice() - assert ret == cudaSuccess - flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor - _, major = cudart.cudaDeviceGetAttribute(flag, gpu_id) - flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor - _, minor = cudart.cudaDeviceGetAttribute(flag, gpu_id) - sm_arch = major * 10 + minor - return sm_arch >= 89 - - -def network(): - """NLP Encoder""" - encoder = te.TransformerLayer(hidden_size=HIDDEN, - mlp_hidden_size=4 * HIDDEN, - hidden_dropout=0.0, - attention_dropout=0.0, - layernorm_type='rmsnorm', - mlp_activations=('gelu', 'linear'), - layer_type=te.TransformerLayerType.ENCODER, - transpose_batch_sequence=True, - dtype=jnp.bfloat16) - return encoder - - -def synthesis_data(data_rng): - """Dataset generator""" - return jax.random.normal(data_rng, [SEQLEN, BATCH, HIDDEN], jnp.bfloat16) - - -def train_step(batch, state, others): - """Training function.""" - - def loss_fn(collections): - logits = state.apply_fn(collections, batch) - loss = jnp.mean(logits) - return loss - - grad_fn = jax.value_and_grad(loss_fn) - loss, grads = grad_fn(FrozenDict({PARAMS_KEY: state.params, **others})) - grads, params_grads = grads.pop(PARAMS_KEY) - state = state.apply_gradients(grads=params_grads) - others = FP8Helper.update_fp8_metas(grads) - return 
loss, state, others - - -def test_encoder(): - """Encoder example""" - if gpu_has_fp8() is False: - print("GPU doesn't support FP8") - return - - rng = jax.random.PRNGKey(0) - rng, init_rng, data_rng = jax.random.split(rng, 3) - inputs = synthesis_data(data_rng) - optimizer = optax.sgd(0.001, 0.9) - - with te.fp8_autocast(enabled=True, fp8_recipe=DelayedScaling(fp8_format=FP8Format.HYBRID)): - encoder = network() - variables = jax.jit(encoder.init)(init_rng, inputs) - variables, params = variables.pop(PARAMS_KEY) - state = train_state.TrainState.create(apply_fn=encoder.apply, params=params, tx=optimizer) - jitted_train_step = jax.jit(train_step) - assert "fp8" in str(jax.make_jaxpr(jitted_train_step)(inputs, state, variables)) - - for i in range(5): - rng, data_rng = jax.random.split(rng) - inputs = synthesis_data(data_rng) - loss, state, variables = jitted_train_step(inputs, state, variables) - print(f"Step {i} - Loss: {loss}") - - -if __name__ == "__main__": - test_encoder() diff --git a/examples/jax/mnist/README.md b/examples/jax/mnist/README.md new file mode 100644 index 0000000000..51e4f45f5f --- /dev/null +++ b/examples/jax/mnist/README.md @@ -0,0 +1,34 @@ +# Basic MNIST Example with Optional FP8 # + +This example uses MNIST training to demonstrate the Transformer Engine usage. The Transformer Engine is built on top of [Flax](https://github.com/google/flax), a neural network library and ecosystem for JAX. Thus, the Transformer Engine is free to interoperate with other Flax modules. The basic Flax usage can be referred to [Flax Basics](https://flax.readthedocs.io/en/latest/guides/flax_basics.html). + +1. Setup dataset: The first step is to prepare the dataset. This is done by using the `tfds` library to download the MNIST dataset and perform image preprocessing. The `get_datasets` routine is used for this purpose. + +2. Define model: The `Net` class is a small CNN model for image classification. 
It has an option to switch between using `nn.Dense` provided by Flax and `te.DenseGeneral` provided by the Transformer Engine. This allows for easy comparison between the two libraries. + +3. Build training loop: The `train_and_evaluate` function is the main routine that initializes the model and starts training and evaluation. For FP8 training, the key is the `te.fp8_autocast` context manager. If fp8_autocast is enabled, it will cast all `te.DenseGeneral` to FP8 precision. The `var_collect` is a collection of the information needed for model training, such as parameters and FP8 metadata, which is necessary for correct casting of BF16 tensors into FP8 tensors at runtime. If fp8_autocast is turned on and you print var_collect, you will see FP8 metadata inside, such as the `fp8_meta_collection` section. Training and evaluation with FP8 have to be done under fp8_autocast. If not, fp8_autocast will deconstruct the FP8 metadata, and the model will fall back to a higher floating-point precision, such as BF16 in this example. To check if FP8 is enabled, use the `check_fp8` routine. If model initialization with FP8 works fine, the string returned by jax.make_jaxpr should include the `Float8` keyword. + +4. Training process: In `apply_model`, the main difference between normal Flax usage and this example is that, with FP8 training, the FP8 metadata has to be passed into the gradient function `grad_fn`. Otherwise, the Transformer Engine doesn't know how to correctly cast the BF16 tensor into an FP8 tensor at runtime. The FP8 metadata doesn't belong in the model parameters (`state.params`), so we need to manually combine the metadata and the latest model parameters into var_collect as a frozen dictionary and pass it to the gradient function. After getting the loss and gradient, we also need to call `te.update_fp8_metas` to update the FP8 metadata in the `update_model` routine. The number of training steps between FP8 metadata updates can be customized. In this example, it is updated every step. + +5.
Evaluating process: The evaluation process is the same as the training process: make sure the FP8 metadata is inside var_collect and pass it into the loss function. + +6. Additional options: The `te.fp8_autocast` context manager has additional options: + * FP8 Recipe: controls FP8 training behavior. See the [FP8 tutorial](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html) for a detailed explanation of FP8 recipes and the supported options. **Note** that updating the FP8 metadata is currently the user's responsibility (i.e., by manually calling `te.update_fp8_metas`). The JAX version of Transformer Engine cannot update FP8 metadata on its own. + * Sharding Resource: tells Transformer Engine how to perform data parallelism and tensor parallelism. It is covered in more detail in the Encoder examples. + +## Run ## + +1. Use Flax to train MNIST with BF16 as usual +```bash +python test_single_gpu_mnist.py +``` + +2. Use `te.DenseGeneral` provided by Transformer Engine to train MNIST with BF16 +```bash +python test_single_gpu_mnist.py --use-te +``` + +3. Use `te.DenseGeneral` provided by Transformer Engine to train MNIST and enable FP8 training and evaluation. +```bash +python test_single_gpu_mnist.py --use-fp8 +``` diff --git a/examples/jax/mnist/requirements.txt b/examples/jax/mnist/requirements.txt new file mode 100644 index 0000000000..b5b1aca343 --- /dev/null +++ b/examples/jax/mnist/requirements.txt @@ -0,0 +1,3 @@ +flax +optax +tensorflow-datasets diff --git a/examples/jax/mnist/test_single_gpu_mnist.py b/examples/jax/mnist/test_single_gpu_mnist.py new file mode 100644 index 0000000000..0b16dd8b98 --- /dev/null +++ b/examples/jax/mnist/test_single_gpu_mnist.py @@ -0,0 +1,311 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+""" MNIST training on single GPU""" +import argparse +import os +import unittest +from functools import partial + +import jax +import jax.numpy as jnp +import numpy as np +import optax +import tensorflow_datasets as tfds +from cuda import cudart +from flax import linen as nn +from flax.core.frozen_dict import FrozenDict +from flax.training import train_state + +import transformer_engine.jax as te + +IMAGE_H = 28 +IMAGE_W = 28 +IMAGE_C = 1 +PARAMS_KEY = 'params' +DROPOUT_KEY = 'dropout' +INPUT_KEY = 'input_rng' + + +def gpu_has_fp8(): + """Check if the GPU has FP8.""" + cudaSuccess = cudart.cudaError_t.cudaSuccess + ret, gpu_id = cudart.cudaGetDevice() + assert ret == cudaSuccess + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor + _, major = cudart.cudaDeviceGetAttribute(flag, gpu_id) + flag = cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor + _, minor = cudart.cudaDeviceGetAttribute(flag, gpu_id) + sm_arch = major * 10 + minor + return sm_arch >= 89 + + +class Net(nn.Module): + """CNN model for MNIST.""" + use_te: bool = False + + @nn.compact + def __call__(self, x, disable_dropout=False): + if self.use_te: + nn_Dense = te.DenseGeneral + else: + nn_Dense = nn.Dense + + x = nn.Conv(features=32, kernel_size=(3, 3), strides=1, dtype=jnp.bfloat16)(x) + x = nn.relu(x) + x = nn.Conv(features=64, kernel_size=(3, 3), strides=1, dtype=jnp.bfloat16)(x) + x = nn.relu(x) + x = nn.max_pool(x, window_shape=(2, 2), strides=(2, 2)) + x = nn.Dropout(rate=0.25)(x, deterministic=disable_dropout) + x = x.reshape(x.shape[0], -1) + x = nn_Dense(features=128, dtype=jnp.bfloat16)(x) + x = nn.relu(x) + x = nn.Dropout(rate=0.5)(x, deterministic=disable_dropout) + x = nn_Dense(features=16, dtype=jnp.bfloat16)(x) + x = nn.Dense(features=10, dtype=jnp.bfloat16)(x) + return x + + +@jax.jit +def apply_model(state, images, labels, var_collect, rngs=None): + """Computes gradients, loss and accuracy for a single batch.""" + + def loss_fn(var_collect, disable_dropout=False): + 
logits = state.apply_fn(var_collect, images, disable_dropout, rngs=rngs) + one_hot = jax.nn.one_hot(labels, 10) + loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) + return loss, logits + + var_collect = FrozenDict({**var_collect, PARAMS_KEY: state.params}) + + if rngs is not None: + grad_fn = jax.value_and_grad(loss_fn, has_aux=True) + (loss, logits), grads = grad_fn(var_collect) + else: + loss, logits = loss_fn(var_collect, disable_dropout=True) + grads = None + + accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) + return grads, loss, accuracy + + +@partial(jax.jit, static_argnums=2) +def update_model(state, grads, use_fp8): + """Update model params and FP8 meta.""" + state = state.apply_gradients(grads=grads[PARAMS_KEY]) + if use_fp8: + grads = te.update_fp8_metas(grads) + return state, grads + + +def train_epoch(state, train_ds, batch_size, rngs, var_collect, use_fp8): + """Train for a single epoch.""" + train_ds_size = len(train_ds['image']) + steps_per_epoch = train_ds_size // batch_size + perms = jax.random.permutation(rngs[INPUT_KEY], train_ds_size) + perms = perms[:steps_per_epoch * batch_size] # skip incomplete batch + perms = perms.reshape((steps_per_epoch, batch_size)) + epoch_loss = [] + epoch_accuracy = [] + + for perm in perms: + batch_images = train_ds['image'][perm, ...] + batch_labels = train_ds['label'][perm, ...] 
+ grads, loss, accuracy = apply_model(state, batch_images, batch_labels, var_collect, rngs) + state, var_collect = update_model(state, grads, use_fp8) + epoch_loss.append(loss) + epoch_accuracy.append(accuracy) + + avg_loss = np.mean(epoch_loss) + avg_accuracy = np.mean(epoch_accuracy) + return state, avg_loss, avg_accuracy, var_collect + + +def eval_model(state, test_ds, batch_size, var_collect): + """Evaluation loop.""" + test_ds_size = len(test_ds['image']) + num_steps = test_ds_size // batch_size + valid_size = num_steps * batch_size + all_loss = [] + all_accuracy = [] + + for batch_start in range(0, valid_size, batch_size): + batch_end = batch_start + batch_size + batch_images = test_ds['image'][batch_start:batch_end] + batch_labels = test_ds['label'][batch_start:batch_end] + _, loss, accuracy = apply_model(state, batch_images, batch_labels, var_collect) + all_loss.append(loss) + all_accuracy.append(accuracy) + + avg_loss = np.mean(all_loss) + avg_accuracy = np.mean(all_accuracy) + return avg_loss, avg_accuracy + + +def get_datasets(): + """Load MNIST train and test datasets into memory.""" + ds_builder = tfds.builder('mnist') + ds_builder.download_and_prepare() + train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1)) + test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1)) + train_ds['image'] = jnp.float32(train_ds['image']) / 255. + test_ds['image'] = jnp.float32(test_ds['image']) / 255. + return train_ds, test_ds + + +def check_fp8(state, var_collect, input_shape, label_shape): + "Check if model includes FP8." + assert "Float8" in str( + jax.make_jaxpr(apply_model)(state, jnp.empty(input_shape, dtype=jnp.bfloat16), + jnp.empty(label_shape, dtype=jnp.bfloat16), var_collect)) + + +def train_and_evaluate(args): + """Execute model training and evaluation loop.""" + os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false" + print(args) + + if args.use_fp8: + assert gpu_has_fp8(), "GPU needs to support FP8." 
+ args.use_te = True + + train_ds, test_ds = get_datasets() + rng = jax.random.PRNGKey(args.seed) + rng, params_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + init_rngs = {PARAMS_KEY: params_rng, DROPOUT_KEY: dropout_rng} + + input_shape = [args.batch_size, IMAGE_H, IMAGE_W, IMAGE_C] + label_shape = [args.batch_size] + + with te.fp8_autocast(enabled=args.use_fp8): + cnn = Net(args.use_te) + var_collect = cnn.init(init_rngs, jnp.empty(input_shape, dtype=jnp.bfloat16)) + tx = optax.sgd(args.lr, args.momentum) + state = train_state.TrainState.create(apply_fn=cnn.apply, + params=var_collect[PARAMS_KEY], + tx=tx) + + if args.use_fp8: + check_fp8(state, var_collect, input_shape, label_shape) + + if args.dry_run: + apply_model(state, jnp.empty(input_shape, dtype=jnp.bfloat16), + jnp.empty(label_shape, dtype=jnp.bfloat16), var_collect, + {DROPOUT_KEY: dropout_rng}) + print("PASSED") + return None + + for epoch in range(1, args.epochs + 1): + rng, input_rng = jax.random.split(rng) + rng, dropout_rng = jax.random.split(rng) + rngs = {INPUT_KEY: input_rng, DROPOUT_KEY: dropout_rng} + + state, train_loss, train_accuracy, var_collect = train_epoch( + state, train_ds, args.batch_size, rngs, var_collect, args.use_fp8) + test_loss, test_accuracy = eval_model(state, test_ds, args.test_batch_size, var_collect) + + print(f"Epoch: {epoch:>2} " + f"Train Loss: {train_loss:.6f} " + f"Train Accuracy: {train_accuracy:.6f} " + f"Test Loss: {test_loss:.6f} " + f"Test Accuracy: {test_accuracy:.6f} ") + + return [train_loss, train_accuracy, test_loss, test_accuracy] + + +def mnist_parser(args): + """Training settings.""" + parser = argparse.ArgumentParser(description="JAX MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=800, + metavar="N", + help="input batch size for testing (default: 800)", + 
) + parser.add_argument( + "--epochs", + type=int, + default=10, + metavar="N", + help="number of epochs to train (default: 10)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.01, + metavar="LR", + help="learning rate (default: 0.01)", + ) + parser.add_argument( + "--momentum", + type=float, + default=0.9, + metavar="M", + help="Momentum (default: 0.9)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--use-fp8", + action="store_true", + default=False, + help="Use FP8 for inference and training without recalibration. " \ + "It also enables Transformer Engine implicitly.") + parser.add_argument("--use-te", + action="store_true", + default=False, + help="Use Transformer Engine") + + return parser.parse_args(args) + + +class TestMNIST(unittest.TestCase): + """MNIST unittests""" + + @classmethod + def setUpClass(cls): + """Run MNIST without Transformer Engine""" + cls.args = mnist_parser(["--epochs", "5"]) + + @staticmethod + def verify(actual): + """Check If loss and accuracy match target""" + desired_traing_loss = 0.055 + desired_traing_accuracy = 0.98 + desired_test_loss = 0.035 + desired_test_accuracy = 0.098 + assert actual[0] < desired_traing_loss + assert actual[1] > desired_traing_accuracy + assert actual[2] < desired_test_loss + assert actual[3] > desired_test_accuracy + + def test_te_bf16(self): + """Test Transformer Engine with BF16""" + self.args.use_te = True + self.args.use_fp8 = False + actual = train_and_evaluate(self.args) + self.verify(actual) + + @unittest.skipIf(not gpu_has_fp8(), reason='GPU capability is not enough to run FP8') + def test_te_fp8(self): + """Test Transformer Engine with FP8""" + self.args.use_fp8 = True + actual = train_and_evaluate(self.args) + self.verify(actual) + + +if __name__ == "__main__": + 
train_and_evaluate(mnist_parser(None)) diff --git a/qa/L0_jax_unittest/test.sh b/qa/L0_jax_unittest/test.sh index c040e973bf..247a388edb 100644 --- a/qa/L0_jax_unittest/test.sh +++ b/qa/L0_jax_unittest/test.sh @@ -6,4 +6,7 @@ set -xe : ${TE_PATH:=/opt/transformerengine} pytest -Wignore -v $TE_PATH/tests/jax + +pip install -r $TE_PATH/examples/jax/mnist/requirements.txt +pip install -r $TE_PATH/examples/jax/encoder/requirements.txt pytest -Wignore -v $TE_PATH/examples/jax diff --git a/qa/L0_license/config.json b/qa/L0_license/config.json index f9a93a70f5..ad47393434 100644 --- a/qa/L0_license/config.json +++ b/qa/L0_license/config.json @@ -17,7 +17,9 @@ "VERSION", "Doxyfile", "pylintrc", - ".json" + ".json", + ".md", + ".txt" ], "exclude_copyright": [], "copyright_only": false diff --git a/qa/L0_license/copyright_checker.py b/qa/L0_license/copyright_checker.py index c2f462e690..cd80b957da 100644 --- a/qa/L0_license/copyright_checker.py +++ b/qa/L0_license/copyright_checker.py @@ -69,6 +69,7 @@ def get_file_type(path): "txt": ["txt"], "cfg": ["cfg"], "sh": ["sh"], + "md": ["md"], } tmp = path.split(".") for filetype, ext_list in ext.items(): diff --git a/tests/jax/test_mnist.py b/tests/jax/test_mnist.py deleted file mode 100644 index ce5d9e4d8c..0000000000 --- a/tests/jax/test_mnist.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# See LICENSE for license information. 
- -import os -import tempfile -import unittest -from functools import partial - -import jax -import jax.numpy as jnp -import numpy as np -import optax -import tensorflow_datasets as tfds -from flax import linen as nn -from flax.training import train_state - -from transformer_engine.common.recipe import Format as FP8Format -from transformer_engine.jax import DenseGeneral -from transformer_engine.jax.fp8 import FP8Helper -from utils import is_fp8_supported - - -class MLPNN(nn.Module): - - use_fp8_dense: bool = True - - @nn.compact - def __call__(self, x): - x = x.reshape((x.shape[0], -1)) # flatten - x = nn.Dense(features=512)(x) - x = nn.relu(x) - - features = [256, 256, 128] - for feature in features: - x = DenseGeneral(features=feature, transpose_batch_sequence=False, - dtype=jnp.bfloat16, use_bias=True)(x) \ - if self.use_fp8_dense else nn.Dense(features=feature)(x) - x = nn.relu(x) - - x = nn.Dense(features=10, use_bias=True)(x) - return x - - -def cross_entropy_loss(*, logits, labels): - labels_onehot = jax.nn.one_hot(labels, num_classes=10) - return optax.softmax_cross_entropy(logits=logits, labels=labels_onehot).mean() - - -def compute_metrics(*, logits, labels): - loss = cross_entropy_loss(logits=logits, labels=labels) - accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) - metrics = { - 'loss': loss, - 'accuracy': accuracy, - } - return metrics - - -def get_datasets(): - """Load MNIST train and test datasets into memory.""" - ds_builder = tfds.builder('mnist', data_dir="/tmp/tensorflow-datasets/downloads") - ds_builder.download_and_prepare() - train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1)) - test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1)) - train_ds['image'] = jnp.float32(train_ds['image']) / 255. - test_ds['image'] = jnp.float32(test_ds['image']) / 255. 
- return train_ds, test_ds - - -def create_train_state(rng, learning_rate, momentum, use_fp8_dense): - """Creates initial `TrainState`.""" - cnn = MLPNN(use_fp8_dense=use_fp8_dense) - variables = cnn.init(rng, jnp.ones([32, 28, 28, 1])) - tx = optax.sgd(learning_rate, momentum) - return train_state.TrainState.create(apply_fn=cnn.apply, params=variables['params'], - tx=tx), variables - - -@partial(jax.jit, static_argnums=(3,)) -def train_step(state, others, batch, use_fp8_dense): - """Train for a single step.""" - - def loss_fn(collections): - logits = MLPNN(use_fp8_dense=use_fp8_dense).apply(collections, batch['image']) - loss = cross_entropy_loss(logits=logits, labels=batch['label']) - return loss, logits - - grad_fn = jax.value_and_grad(loss_fn, has_aux=True) - (_, logits), grads = grad_fn(others) - state = state.apply_gradients(grads=grads['params']) - metrics = compute_metrics(logits=logits, labels=batch['label']) - return state, metrics, grads - - -def train_epoch(state, variables, train_ds, batch_size, rng, use_fp8_dense): - """Train for a single epoch.""" - train_ds_size = len(train_ds['image']) - steps_per_epoch = train_ds_size // batch_size - perms = jax.random.permutation(rng, train_ds_size) - perms = perms[:steps_per_epoch * batch_size] # skip incomplete batch - perms = perms.reshape((steps_per_epoch, batch_size)) - batch_metrics = [] - for idx, perm in enumerate(perms): - idx = idx + 1 - batch = {k: v[perm, ...] 
for k, v in train_ds.items()} - state, metrics, grads = train_step(state, variables, batch, use_fp8_dense) - - updated_coll = {'params': state.params} - if use_fp8_dense: - updated_coll[FP8Helper.FP8_COLLECTION_NAME] \ - = grads[FP8Helper.FP8_COLLECTION_NAME] - variables = FP8Helper.update_collections(updated_coll, variables) - batch_metrics.append(metrics) - - if use_fp8_dense: - variables = FP8Helper.update_fp8_metas(variables) - - return state, variables - - -@partial(jax.jit, static_argnums=(2,)) -def eval_step(variables, batch, use_fp8_dense): - logits = MLPNN(use_fp8_dense=use_fp8_dense).apply(variables, batch['image']) - return compute_metrics(logits=logits, labels=batch['label']) - - -def eval_model(variables, test_ds, batch_size, use_fp8_dense): - test_ds_size = len(test_ds['image']) - steps_per_epoch = test_ds_size // batch_size - perms = np.arange(0, test_ds_size) - perms = perms[:steps_per_epoch * batch_size] # skip incomplete batch - perms = perms.reshape((steps_per_epoch, batch_size)) - total_summary = {'correct': 0, 'loss': 0, 'total': 0} - for _, perm in enumerate(perms): - batch = {k: v[perm, ...] 
for k, v in test_ds.items()} - metrics = eval_step(variables, batch, use_fp8_dense) - metrics = jax.device_get(metrics) - summary = jax.tree_map(lambda x: x.item(), metrics) - total_summary['correct'] += summary['accuracy'] * batch_size - total_summary['loss'] += summary['loss'] * batch_size - total_summary['total'] += batch_size - return total_summary['loss']/total_summary['total'], \ - total_summary['correct']/total_summary['total'] - - -class TestMnist(unittest.TestCase): - - def setUp(self) -> None: - os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false" - self.learning_rate = 0.1 - self.momentum = 0.9 - - self.num_epochs = 5 - self.batch_size = 512 - self.train_ds, self.test_ds = get_datasets() - - self.margin = 0.0 - self.num_fp8_layers = 3 - self.fp8_meta_update_interval = 1 - self.temp_file = tempfile.NamedTemporaryFile() # pylint: disable=consider-using-with - self.fp8_ckpt_path = self.temp_file.name - - self.seed = 0 - - acc_bfp16_ = self._mnist_baseline_runner() - acc_rtol = 0.005 - self.target_accuracy = acc_bfp16_ * (1. 
- acc_rtol) - - def tearDown(self): - self.temp_file.close() - - @unittest.skipIf(not is_fp8_supported(), reason='GPU capability is not enough to run FP8') - def test_mnist_e4m3(self): - self._mnist_test_runner(FP8Format.E4M3) - - @unittest.skipIf(not is_fp8_supported(), reason='GPU capability is not enough to run FP8') - def test_mnist_hybrid(self): - self._mnist_test_runner(FP8Format.HYBRID) - - # Skip for now due to lack bf16 in TE.Format - # def test_mnist_bfloa16(self): - # self._mnist_test_runner(FP8Format.BFLOAT16) - - def _mnist_baseline_runner(self): - rng = jax.random.PRNGKey(self.seed) - rng, init_rng = jax.random.split(rng) - - state, variables = create_train_state(init_rng, self.learning_rate, self.momentum, False) - del init_rng - - _, accuracy = self._train_model(state, variables, self.num_epochs, rng, False) - return accuracy - - def _mnist_test_runner(self, fp8_format): - FP8Helper.initialize(margin=self.margin, fp8_format=fp8_format) - - rng = jax.random.PRNGKey(self.seed) - rng, init_rng = jax.random.split(rng) - - state, variables = create_train_state(init_rng, self.learning_rate, self.momentum, True) - del init_rng - - _, test_accuracy = self._train_model(state, variables, self.num_epochs, rng, True) - - self.assertGreater( - test_accuracy, self.target_accuracy, - f"Convergence test failed on MNIST with FP8Fomat.{fp8_format.name}. 
" - f"Test Accuracy {test_accuracy:.4f} is lower than target {self.target_accuracy:.4f}") - - FP8Helper.finalize() - - def _train_model(self, state, variables, epochs, rng, use_fp8_dense): - max_test_acc = 0.0 - for _ in range(0, epochs): - rng, input_rng = jax.random.split(rng) - - state, variables = train_epoch(state, variables, self.train_ds, self.batch_size, - input_rng, use_fp8_dense) - - _, test_accuracy = eval_model(variables, self.test_ds, self.batch_size, use_fp8_dense) - max_test_acc = test_accuracy if test_accuracy > max_test_acc else max_test_acc - return state, max_test_acc - - -if __name__ == '__main__': - unittest.main() diff --git a/transformer_engine/jax/module.py b/transformer_engine/jax/module.py index 33029b049d..61dee42475 100644 --- a/transformer_engine/jax/module.py +++ b/transformer_engine/jax/module.py @@ -219,7 +219,7 @@ class LayerNorm(nn.Module): ----------------------- dtype : jax.numpy.dtype, default = jax.numpy.float32 the data type used to allocate the initial parameters. - transpose_batch_sequence : bool, default = True + transpose_batch_sequence : bool, default = False Indicate whether the input tensors were switched axis of batch and sequence length dimension. If set to True, the input tensors should be in (seqlen, batch, hidden), otherwise (batch, seqlen, hidden). @@ -233,7 +233,7 @@ class LayerNorm(nn.Module): bias_init: Initializer = nn.initializers.zeros bias_axes: Tuple[str, ...] = ('embed',) dtype: DType = jnp.float32 - transpose_batch_sequence: bool = True + transpose_batch_sequence: bool = False sharding_type: ShardingType = ShardingType.SINGLE @nn.compact @@ -358,12 +358,12 @@ class DenseGeneral(TransformerEngineBase): features: Union[Iterable[int], int] kernel_init: Initializer = None kernel_axes: Tuple[str, ...] = () - use_bias: bool = False + use_bias: bool = True bias_init: Initializer = nn.initializers.zeros bias_axes: Tuple[str, ...] 
= () axis: Union[Iterable[int], int] = -1 dtype: DType = jnp.float32 - transpose_batch_sequence: bool = True + transpose_batch_sequence: bool = False sharding_type: ShardingType = ShardingType.SINGLE def __post_init__(self): diff --git a/transformer_engine/jax/transformer.py b/transformer_engine/jax/transformer.py index 0a5dfce147..69b1325df0 100644 --- a/transformer_engine/jax/transformer.py +++ b/transformer_engine/jax/transformer.py @@ -720,7 +720,7 @@ class TransformerLayer(nn.Module): If set to True, `TransformerLayer` module exposes a single fused parameter for query-key-value for self-attention and key-value for cross-attention. - transpose_batch_sequence : bool, default = True + transpose_batch_sequence : bool, default = False Indicate whether the input tensors were switched axis of batch and sequence length dimension. if set to True, the input tensors should be in (seqlen, batch, hidden), otherwise (batch, seqlen, hidden). @@ -755,7 +755,7 @@ class TransformerLayer(nn.Module): dtype: DType = jnp.float32 drop_path: float = 0.0 fuse_qkv_params: bool = True - transpose_batch_sequence: bool = True + transpose_batch_sequence: bool = False scale_attn_logits: bool = False scaled_query_init: bool = True From bc0e44848fc83aa422f4777e377abc5cf8bc2474 Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Wed, 29 Mar 2023 08:43:32 +0800 Subject: [PATCH 013/427] Fix Bugs of TE/JAX (#119) * Support transpose_bs when decoded=True Signed-off-by: Ming-Xu Huang Signed-off-by: Ming Huang * Fix Bugs, 1. Fix missing dropout_dims in LayerNormMLP. 2. Fix broadcast issues in decoded. Signed-off-by: Ming-Xu Huang Signed-off-by: Ming Huang * Fix wrong masks in decoded. Signed-off-by: Ming-Xu Huang Signed-off-by: Ming Huang * Fixed wrong assert condition in TransformerLayer Signed-off-by: Ming Huang * Fix amax is not set as 0 in each step. Signed-off-by: Ming Huang * Enhance rules conflict checking and docs. Signed-off-by: Ming Huang * fix code formatting. 
Signed-off-by: Ming Huang --------- Signed-off-by: Ming-Xu Huang Signed-off-by: Ming Huang --- tests/jax/test_sharding.py | 10 ++++-- tests/jax/utils.py | 3 +- transformer_engine/jax/fp8.py | 2 +- transformer_engine/jax/module.py | 8 +++-- transformer_engine/jax/transformer.py | 49 +++++++++++++++------------ 5 files changed, 43 insertions(+), 29 deletions(-) diff --git a/tests/jax/test_sharding.py b/tests/jax/test_sharding.py index e572d2162a..458e10ffac 100644 --- a/tests/jax/test_sharding.py +++ b/tests/jax/test_sharding.py @@ -38,9 +38,13 @@ def _get_sharding_resource(mesh_names, sharding_type): ((4,), ("tp",), ShardingType.TP_ROW), ((2, 2), ("dp", "tp"), ShardingType.DP_TP_COL), ((2, 2), ("dp", "tp"), ShardingType.DP_TP_ROW)] -LOGICAL_RULES = [[(('a1', None), ('a2', 'ma2')), False], - [(('a1', None), ('a2', 'ma2'), ('a3', ('ma31', 'ma32'))), True], - [(('a1', None), ('a2', 'ma2'), ('batch', 'batch_1200234')), True]] +LOGICAL_RULES = [ + [(('a1', None), ('a2', 'ma2')), False], + [(('a1', None), ('a2', 'ma2'), ('a3', ('ma31', 'ma32'))), True], + [(('a1', None), ('a2', 'ma2'), ('a3', 'ma31'), ('a3', 'ma32')), False], + [(('a1', None), ('a2', 'ma2'), ('batch', 'batch_1200234')), True], + [(('a1', None), ('a2', 'ma2'), ('a2', 'ma1'), ('batch', 'model'), ('batch', 'data')), True], +] SRS = [ ShardingResource(), ShardingResource('data', None), diff --git a/tests/jax/utils.py b/tests/jax/utils.py index c8a1b0e402..bbd0b1392f 100644 --- a/tests/jax/utils.py +++ b/tests/jax/utils.py @@ -321,8 +321,9 @@ def __call__(self, inputs, deterministic: bool = False): # Take elementwise product of above intermediate activations. x = functools.reduce(operator.mul, activations) + dropout_broadcast_dims = (0,) if self.transpose_batch_sequence else (1,) # Apply dropout and final dense output projection. 
- x = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))( + x = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=dropout_broadcast_dims)( x, deterministic=deterministic) # Broadcast along length. if self.transpose_batch_sequence: x = nn_partitioning.with_sharding_constraint(x, ('length', 'batch', 'mlp')) diff --git a/transformer_engine/jax/fp8.py b/transformer_engine/jax/fp8.py index 106f8e310f..906f7d273b 100644 --- a/transformer_engine/jax/fp8.py +++ b/transformer_engine/jax/fp8.py @@ -190,7 +190,7 @@ def update_amax_history(amax_buffers: jnp.ndarray) -> jnp.ndarray: Update the amax history """ updated_amax_buffers = jnp.roll(amax_buffers, -1, 1) - updated_amax_buffers.at[:, 0].set(0) + updated_amax_buffers = updated_amax_buffers.at[:, 0].set(0) return updated_amax_buffers @staticmethod diff --git a/transformer_engine/jax/module.py b/transformer_engine/jax/module.py index 61dee42475..2cb0bfea0a 100644 --- a/transformer_engine/jax/module.py +++ b/transformer_engine/jax/module.py @@ -683,6 +683,8 @@ class LayerNormMLP(TransformerEngineBase): Each activation has its own transformation layer. intermediate_dropout_rate: float, default = 0.1 Dropout probability for the dropout op after the :attr:`activations`. + intermediate_hidden_dropout_dims: Sequence[int], default = () + Dimensions that will share the same dropout mask for hidden axis: Union[Iterable[int], int], default = -1 An integer tuple with axes to apply the transformation on. 
@@ -716,6 +718,7 @@ class LayerNormMLP(TransformerEngineBase): return_layernorm_output: bool = True activations: Sequence[Union[str, Callable]] = ('relu',) intermediate_dropout_rate: float = 0.1 + intermediate_hidden_dropout_dims: Sequence[int] = () axis: Union[Iterable[int], int] = -1 dtype: DType = jnp.float32 transpose_batch_sequence: bool = True @@ -912,8 +915,9 @@ def fp8_meta_generator(): z = functools.reduce(operator.mul, activations) z = jnp.reshape(z, (*z.shape[:-2], -1)) - z = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))( - z, deterministic=deterministic) # Broadcast along length. + z = nn.Dropout(rate=self.intermediate_dropout_rate, + broadcast_dims=self.intermediate_hidden_dropout_dims)( + z, deterministic=deterministic) # DenseGeneral 2 hidden_size = inputs.shape[-1] diff --git a/transformer_engine/jax/transformer.py b/transformer_engine/jax/transformer.py index 69b1325df0..51ead9ceba 100644 --- a/transformer_engine/jax/transformer.py +++ b/transformer_engine/jax/transformer.py @@ -53,6 +53,10 @@ def extend_logical_axis_rules(rules: LogicalRules) -> LogicalRules: .. warning:: Please make sure ShardingResource is set via fp8_autocast before calling this function. + .. note:: + This function is only needed when using TransformerLayer. For other modules, such as + DenseGeneral, please properly set axes of kernels and bias. + Parameters ---------- rules : Sequence[Tuple[str, Union[str, None]]] @@ -73,10 +77,12 @@ def extend_logical_axis_rules(rules: LogicalRules) -> LogicalRules: f"Thie axis_name should be str, but got {type(key)}." assert isinstance(val, str) or (val is None), \ f"Thie mesh_axis_name should be str or None, but got {type(val)}." 
- rules_map[key] = val + if key in rules_map: + rules_map[key].append(val) + else: + rules_map[key] = [val] gsr = global_shard_resource() - te_logical_axis_rules = (('batch', gsr.dp_resource), ('embed', None), ('mlp', gsr.tp_resource), ('heads', gsr.tp_resource), ('kv', None), ('qkv_dim', None), ('kv_dim', None), ('joined_kv', gsr.tp_resource), ('act', None), @@ -87,7 +93,7 @@ def extend_logical_axis_rules(rules: LogicalRules) -> LogicalRules: key = item[0] val = item[1] if key in rules_map: - assert rules_map[key] == val, \ + assert len(rules_map[key]) == 1 and rules_map[key][0] == val, \ f"The rule diverged between TE and given rule." \ f"Axis:{key} map to {rules_map[key]} in the given" \ f" rules, but {val} in TE's rules." @@ -447,21 +453,22 @@ def kv_init(key, shape, dtype): if decode: is_initialized = self.has_variable('cache', 'cached_key') - # TODO (Ming Huang): Check performance on GPU withou swap dimensions # pylint: disable=fixme - def swap_dims(x): - return x[:-3] + tuple(x[i] for i in [-2, -1, -3]) - - cached_key = self.variable('cache', 'cached_key', jnp.zeros, swap_dims(key.shape), - key.dtype) - cached_value = self.variable('cache', 'cached_value', jnp.zeros, swap_dims(value.shape), + cached_key = self.variable('cache', 'cached_key', jnp.zeros, key.shape, key.dtype) + cached_value = self.variable('cache', 'cached_value', jnp.zeros, value.shape, value.dtype) cache_index = self.variable('cache', 'cache_index', lambda: jnp.array(0, dtype=jnp.int32)) if is_initialized: - batch, num_heads, head_dim, length = cached_key.value.shape + if self.transpose_batch_sequence: + length, batch, num_heads, head_dim = cached_key.value.shape + expected_shape = (1, batch, num_heads, head_dim) + one_hot_indices_shape = (length, 1, 1, 1) + else: + batch, length, num_heads, head_dim = cached_key.value.shape + expected_shape = (batch, 1, num_heads, head_dim) + one_hot_indices_shape = (1, length, 1, 1) # Sanity shape check of cached key against input query. 
- expected_shape = (batch, 1, num_heads, head_dim) if expected_shape != query.shape: raise ValueError( 'Autoregressive cache shape error, ' @@ -469,19 +476,15 @@ def swap_dims(x): cur_index = cache_index.value one_hot_indices = jax_nn.one_hot(cur_index, length, dtype=key.dtype) - one_token_key = jnp.moveaxis(key, -3, -1) - one_token_value = jnp.moveaxis(value, -3, -1) - key = cached_key.value + one_token_key * one_hot_indices - value = cached_value.value + one_token_value * one_hot_indices + one_hot_indices = jnp.reshape(one_hot_indices, one_hot_indices_shape) + key = cached_key.value + key * one_hot_indices + value = cached_value.value + value * one_hot_indices cached_key.value = key cached_value.value = value cache_index.value = cache_index.value + 1 - key = jnp.moveaxis(key, -1, -3) - value = jnp.moveaxis(value, -1, -3) - mask = combine_masks( - mask, jnp.broadcast_to(jnp.arange(length) <= cur_index, (batch, 1, 1, length))) + mask, jnp.broadcast_to(jnp.arange(length) > cur_index, (batch, 1, 1, length))) if bias is not None: bias = dynamic_vector_slice_in_dim(jnp.squeeze(bias, axis=0), @@ -889,10 +892,11 @@ def hidden_dropout(x, deterministic): assert isinstance(self.hidden_dropout_dims, Sequence) x_shape_len = len(x.shape) for dims in self.hidden_dropout_dims: - assert -x_shape_len < dims < x_shape_len + assert -x_shape_len <= dims < x_shape_len return nn.Dropout(rate=self.hidden_dropout, - broadcast_dims=self.hidden_dropout_dims)(x, deterministic) + broadcast_dims=self.hidden_dropout_dims)(x, + deterministic=deterministic) x = hidden_dropout(x, deterministic) if self.drop_path > 0.0: @@ -944,6 +948,7 @@ def hidden_dropout(x, deterministic): intermediate_dim=self.mlp_hidden_size, activations=self.mlp_activations, intermediate_dropout_rate=self.hidden_dropout, + intermediate_hidden_dropout_dims=self.hidden_dropout_dims, dtype=self.dtype, scale_axes=('embed',), kernel_init=self.mlp_kernel_init, From 78c375d297970ab561f351faac2423fe9bdcb00a Mon Sep 17 00:00:00 2001 
From: Kirthi Shankar Sivamani Date: Wed, 29 Mar 2023 17:38:25 -0700 Subject: [PATCH 014/427] Change FP8 recipe defaults (#112) * Change FP8 recipe defaults Signed-off-by: Kirthi Shankar Sivamani * Increase default amax history length Signed-off-by: Kirthi Shankar Sivamani * Always check history size Signed-off-by: Kirthi Shankar Sivamani * no amax history for onnx export Signed-off-by: Kirthi Shankar Sivamani * revert onnx export test changes Signed-off-by: Kirthi Shankar Sivamani * Fix indices in onnx test Co-authored-by: Neta Zmora Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Neta Zmora --- tests/pytorch/test_onnx_export.py | 17 ++++++++++------- transformer_engine/common/recipe.py | 8 ++++---- transformer_engine/pytorch/module.py | 25 +++++++++++++++++++------ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index e72d1cae59..40486057f4 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -92,12 +92,15 @@ def to_numpy(tensor): return tensor.cpu().numpy() -def set_layer_scale(module: torch.nn.Module, scale: float): - module.fp8_init() +def set_layer_scale(module: torch.nn.Module, scale: float, num_gemms: int): + """Initialize the FP8 quantization scales in module""" + NB_SCALES_PER_GEMM = 3 # One scale per: input, weights, and output GEMM tensors. 
+ nb_total_scales = num_gemms * NB_SCALES_PER_GEMM + module.fp8_init(num_gemms) module.fp8_meta["scaling_fwd"].scale = torch.ones( - 2, dtype=torch.float32, device="cuda") / scale + nb_total_scales, dtype=torch.float32, device="cuda") / scale module.fp8_meta["scaling_fwd"].scale_inv = torch.ones( - 2, dtype=torch.float32, device="cuda") * scale + nb_total_scales, dtype=torch.float32, device="cuda") * scale def te_infer(model: torch.nn.Module, inps: Union[Tuple[torch.tensor], torch.tensor], is_fp8: bool): @@ -649,7 +652,7 @@ def forward(self, inp): precision ).to(device='cuda') if use_fp8: - set_layer_scale(model.linear, scale_factor) + set_layer_scale(model.linear, scale_factor, num_gemms=1) do_export(model, inp, fname, use_fp8) if precision in (torch.bfloat16, ): @@ -707,7 +710,7 @@ def test_export_layernorm_linear( zero_centered_gamma=zero_centered_gamma, ).to(device='cuda') if use_fp8: - set_layer_scale(model, scale_factor) + set_layer_scale(model, scale_factor, num_gemms=1) do_export(model, inp, fname, use_fp8) if not use_fp8: validate_result(fname, inp, model, atol=1e-3) @@ -763,7 +766,7 @@ def test_export_layernorm_mlp( zero_centered_gamma=zero_centered_gamma, ).to(device='cuda') if use_fp8: - set_layer_scale(model, scale_factor) + set_layer_scale(model, scale_factor, num_gemms=2) do_export(model, inp, fname, use_fp8) if not use_fp8: validate_result(fname, inp, model, atol=1e-3) diff --git a/transformer_engine/common/recipe.py b/transformer_engine/common/recipe.py index 583b47d80c..3bb5320475 100644 --- a/transformer_engine/common/recipe.py +++ b/transformer_engine/common/recipe.py @@ -66,10 +66,10 @@ class DelayedScaling: fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.HYBRID Controls the FP8 data format used during forward and backward pass. - amax_history_len : int, default = 1 + amax_history_len : int, default = 1024 The length of the amax history window used for scaling factor computation. 
- amax_compute_algo : {'max', 'most_recent', Callable}, default = 'most_recent' + amax_compute_algo : {'max', 'most_recent', Callable}, default = 'max' Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2 predefined choices: `max` chooses the largest `amax` in the history @@ -125,8 +125,8 @@ def scaling_factor_compute(amax: Tensor, margin: int = 0 interval: int = 1 fp8_format: Format = Format.HYBRID - amax_history_len: int = 1 - amax_compute_algo: Union[Literal["max", "most_recent"], Callable] = "most_recent" + amax_history_len: int = 1024 + amax_compute_algo: Union[Literal["max", "most_recent"], Callable] = "max" override_linear_precision: _OverrideLinearPrecision = _OverrideLinearPrecision() scaling_factor_compute_algo: Optional[Callable] = None reduce_amax: bool = True diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index 4e012be58c..516081c7b2 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -13,6 +13,7 @@ import numpy as np import torch +import torch.nn.functional as F from torch.nn.parameter import Parameter from torch.nn import init @@ -187,6 +188,23 @@ def __init__(self) -> None: def set_meta_tensor(self, fwd: bool) -> None: """Init scales and amaxes for fwd | bwd.""" fp8_meta_tensor_key = "scaling_fwd" if fwd else "scaling_bwd" + + if self.fp8_meta_tensors_initialized: + # Handle changed amax history size. + curr_len = self.fp8_meta[fp8_meta_tensor_key].amax_history.shape[0] + need_len = self.fp8_meta["recipe"].amax_history_len + if need_len < curr_len: + self.fp8_meta[fp8_meta_tensor_key].amax_history = ( + self.fp8_meta[fp8_meta_tensor_key] + .amax_history[: self.fp8_meta["recipe"].amax_history_len].clone() + ) + elif need_len > curr_len: + extra_rows = need_len - curr_len + self.fp8_meta[fp8_meta_tensor_key].amax_history = F.pad( + self.fp8_meta[fp8_meta_tensor_key].amax_history, pad=(0, 0, 0, extra_rows) + ) + return + # Max. 
number of fp8 tensors per GEMM = 3 (input, weight, output) for fwd and # 2 (grad_output and grad_input) for bwd num_fp8_tensors = ( @@ -222,12 +240,9 @@ def set_meta_tensor(self, fwd: bool) -> None: def init_fp8_meta_tensors(self) -> None: """Init scales and amaxes.""" - # Checkpoint loaded - if self.fp8_meta_tensors_initialized: - return - self.set_meta_tensor(True) self.set_meta_tensor(False) + self.fp8_meta_tensors_initialized = True def get_extra_state(self) -> torch.Tensor: """Save before checkpointing.""" @@ -280,7 +295,6 @@ def set_extra_state(self, state: torch.Tensor) -> None: self.fp8_meta["scaling_fwd"].amax_history.copy_(amax_history_fwd) self.fp8_meta["scaling_bwd"].scale.copy_(scale_bwd) self.fp8_meta["scaling_bwd"].amax_history.copy_(amax_history_bwd) - self.fp8_meta_tensors_initialized = True # Restore global FP8 buffer state. set_global_fp8_buffer(state[4]) @@ -310,7 +324,6 @@ def set_extra_state(self, state: torch.Tensor) -> None: self.fp8_meta["scaling_fwd"].amax_history.copy_(state["amax_history_fwd"]) self.fp8_meta["scaling_bwd"].scale.copy_(state["scale_bwd"]) self.fp8_meta["scaling_bwd"].amax_history.copy_(state["amax_history_bwd"]) - self.fp8_meta_tensors_initialized = True def set_activation_dtype(self, inp: torch.Tensor) -> None: """Get activation data type for AMP.""" From a7537155847907d2a27b330d94d21e526ddbf20e Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 4 Apr 2023 09:56:44 -0700 Subject: [PATCH 015/427] Add FP8 support for Ada (#129) * Add FP8 support for Ada Signed-off-by: Kirthi Shankar Sivamani * Fixes Signed-off-by: Kirthi Shankar Sivamani * better message Signed-off-by: Kirthi Shankar Sivamani * lint fixes Signed-off-by: Kirthi Shankar Sivamani * Address review comments Signed-off-by: Kirthi Shankar Sivamani * better message for no fp8 Signed-off-by: Kirthi Shankar Sivamani * same thing for onnx test Signed-off-by: Kirthi Shankar Sivamani * fix Signed-off-by: Kirthi Shankar Sivamani * Fix CI and review 
Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- tests/pytorch/test_onnx_export.py | 35 +++++++++---------- tests/pytorch/test_sanity.py | 24 ++++++------- transformer_engine/CMakeLists.txt | 2 +- transformer_engine/pytorch/csrc/common.h | 1 + transformer_engine/pytorch/csrc/extensions.cu | 8 +++++ transformer_engine/pytorch/fp8.py | 31 ++++++++++++++-- 6 files changed, 67 insertions(+), 34 deletions(-) diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 40486057f4..9f2308f5e4 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -31,6 +31,7 @@ import transformer_engine.pytorch.cpp_extensions as texcpp import transformer_engine.pytorch.softmax as softmax_defs from transformer_engine.pytorch.utils import get_default_init_method +from transformer_engine.pytorch.fp8 import is_fp8_available # Directory where generated ONNX test models are stored. @@ -46,10 +47,8 @@ OPSET = 15 assert OPSET >= TRILU_OPSET -skip_FP8 = pytest.mark.skipif( - torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9, - reason="Device compute capability 9.x required for FP8 execution.", -) +fp8_available, reason_for_no_fp8 = is_fp8_available() +skip_FP8 = pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) def create_fp8_recipe(): return recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) @@ -346,8 +345,8 @@ def test_export_gemm( scale_factors ): # Skip FP8 tests on non-hopper devices - if use_fp8 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) class TestFP8_GEMM(nn.Module): def __init__(self, precision, use_bias, gelu, scale_factors): @@ -467,8 +466,8 @@ def test_export_layernorm( zero_centered_gamma: bool ): # Skip FP8 tests on non-hopper devices - if use_fp8 
and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) # Set dimensions (these are arbitrary). inp_shape = [64, 32] @@ -608,8 +607,8 @@ def test_export_linear( precision: torch.dtype ): # Skip FP8 tests on non-hopper devices - if use_fp8 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) # Set dimensions (these are arbitrary). in_features = 64 @@ -686,8 +685,8 @@ def test_export_layernorm_linear( zero_centered_gamma: bool ): # Skip FP8 tests on non-hopper devices - if use_fp8 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) # Set dimensions (these are arbitrary). in_features = 64 @@ -741,8 +740,8 @@ def test_export_layernorm_mlp( zero_centered_gamma: bool ): # Skip FP8 tests on non-hopper devices - if use_fp8 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) # Set dimensions (these are arbitrary). 
in_features = 64 @@ -861,8 +860,8 @@ def test_export_multihead_attention( fuse_qkv_params: bool ): # Skip FP8 tests on non-hopper devices - if use_fp8 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) hidden_size = 256 sequence_length = 128 @@ -938,8 +937,8 @@ def test_export_transformer_layer( zero_centered_gamma: bool ): # Skip FP8 tests on non-hopper devices - if use_fp8 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9: - pytest.skip("Device compute capability 9.x required for FP8 execution.") + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) # Layer configuration hidden_size = 64 diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index 3ff0f66bc9..3af50f59c3 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -5,7 +5,7 @@ import torch import pytest -from transformer_engine.pytorch.fp8 import fp8_autocast +from transformer_engine.pytorch.fp8 import fp8_autocast, is_fp8_available from transformer_engine.pytorch.utils import ( init_method_normal, scaled_init_method_normal, @@ -19,7 +19,7 @@ from transformer_engine.common import recipe # Only run FP8 tests on H100. 
-fp8_available = torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 9 +fp8_available, reason_for_no_fp8 = is_fp8_available() def custom_amax_to_scale( @@ -263,7 +263,7 @@ def _test_sanity_common(block, bs, dtype, config, fp8_recipe, skip_wgrad): @pytest.mark.parametrize("zero_centered_gamma", all_boolean) def test_sanity_layernorm_linear(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -291,7 +291,7 @@ def test_sanity_layernorm_linear(dtype, bs, fp8_recipe, model, skip_wgrad, zero_ @pytest.mark.parametrize("skip_wgrad", all_boolean) def test_sanity_linear(dtype, bs, fp8_recipe, model, skip_wgrad): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -316,7 +316,7 @@ def test_sanity_linear(dtype, bs, fp8_recipe, model, skip_wgrad): @pytest.mark.parametrize("zero_centered_gamma", all_boolean) def test_sanity_layernorm_mlp(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -347,7 +347,7 @@ def test_sanity_layernorm_mlp(dtype, bs, fp8_recipe, model, skip_wgrad, zero_cen @pytest.mark.parametrize("zero_centered_gamma", all_boolean) def test_sanity_gpt(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -385,7 +385,7 @@ def test_sanity_gpt(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamm @pytest.mark.parametrize("zero_centered_gamma", all_boolean) def test_sanity_bert(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): if 
fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -423,7 +423,7 @@ def test_sanity_bert(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gam @pytest.mark.parametrize("zero_centered_gamma", all_boolean) def test_sanity_T5(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -461,7 +461,7 @@ def test_sanity_T5(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma @pytest.mark.parametrize("skip_wgrad", all_boolean) def test_sanity_amp_and_nvfuser(dtype, bs, fp8_recipe, model, skip_wgrad): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -495,7 +495,7 @@ def test_sanity_amp_and_nvfuser(dtype, bs, fp8_recipe, model, skip_wgrad): @pytest.mark.parametrize("skip_wgrad", all_boolean) def test_sanity_drop_path(dtype, bs, fp8_recipe, model, skip_wgrad): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -532,7 +532,7 @@ def test_sanity_drop_path(dtype, bs, fp8_recipe, model, skip_wgrad): @pytest.mark.parametrize("skip_wgrad", all_boolean) def test_sanity_fused_qkv_params(dtype, bs, fp8_recipe, model, skip_wgrad): if fp8_recipe is not None and not fp8_available: - pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] @@ -570,7 +570,7 @@ def test_sanity_fused_qkv_params(dtype, bs, fp8_recipe, model, skip_wgrad): @pytest.mark.parametrize("zero_centered_gamma", all_boolean) def test_sanity_gradient_accumulation_fusion(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): if fp8_recipe is not None and not fp8_available: - 
pytest.skip("FP8 device not available.") + pytest.skip(reason_for_no_fp8) config = model_configs[model] diff --git a/transformer_engine/CMakeLists.txt b/transformer_engine/CMakeLists.txt index d3ee61ac66..c6977e5ece 100644 --- a/transformer_engine/CMakeLists.txt +++ b/transformer_engine/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.18) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 70 80 90) + set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90) endif() diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index 67b47dcdcc..f6c9898601 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu index 47f1eb465e..ec99ad403f 100644 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ b/transformer_engine/pytorch/csrc/extensions.cu @@ -830,6 +830,11 @@ at::Tensor scaled_upper_triang_masked_softmax_backward(at::Tensor output_grads_, } +size_t get_cublasLt_version() { + return cublasLtGetVersion(); +} + + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Softmax functions m.def("scaled_softmax_forward", &scaled_softmax_forward, "Scaled Softmax FWD"); @@ -862,6 +867,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); m.def("fp8_gelu", &fp8_gelu, "GeLU with FP8 output"); + // Misc + m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); + // Data structures py::class_(m, "FP8TensorMeta") .def(py::init<>()) diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index 98d35df363..ed9e10ae0d 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -12,6 +12,7 @@ from transformer_engine.common.recipe import DelayedScaling, Format from 
.constants import dist_group_type +from .utils import get_device_compute_capability _FP8_ENABLED = False _FP8_CALIBRATION = False @@ -26,6 +27,29 @@ _amax_forward_global_reduce_func = None _buffer_delete_key_fwd = None _buffer_delete_key_bwd = None +_is_fp8_available = None +_reason_for_no_fp8 = "" + + +def _check_fp8_support() -> Tuple[bool, str]: + """Return if fp8 support is available""" + if get_device_compute_capability() >= 9.0: # hopper and above + return True, "" + if get_device_compute_capability() < 8.9: # pre-ada + return False, "Device compute capability 8.9 or higher required for FP8 execution." + if tex.get_cublasLt_version() < 120103: + return False, "CublasLt version 12.1.3.x or higher required for FP8 execution on Ada." + if float(torch.version.cuda) < 12.1: + return False, "Cuda version 12.1 or higher required for FP8 execution on Ada." + return True, "" + + +def is_fp8_available() -> Tuple[bool, str]: + """Return if fp8 support is available""" + global _is_fp8_available, _reason_for_no_fp8 + if _is_fp8_available is None: + _is_fp8_available, _reason_for_no_fp8 = _check_fp8_support() + return _is_fp8_available, _reason_for_no_fp8 def get_meta_tensor_key(forward: bool = True) -> str: @@ -253,9 +277,8 @@ def fp8_autocast( _FP8_AUTOCAST_DEPTH += 1 if enabled: - assert ( - torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 9 - ), "Device compute capability 9.x required for FP8 execution." + fp8_available, reason_for_no_fp8 = is_fp8_available() + assert fp8_available, reason_for_no_fp8 yield finally: _FP8_ENABLED,_FP8_CALIBRATION, _FP8_RECIPE, _FP8_DISTRIBUTED_GROUP = fp8_state @@ -290,10 +313,12 @@ def is_fp8_enabled() -> bool: """Is FP8 enabled""" return _FP8_ENABLED + def is_fp8_calibration() -> bool: """Is FP8 calibration""" return _FP8_CALIBRATION + def is_first_fp8_module(): """Returns `True` only the first time when called multiple times from within the same `fp8_autocast` context. 
From 770e968b073c4712f03bcc1a84eb564bf7067997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Bastien?= Date: Wed, 5 Apr 2023 11:30:14 -0400 Subject: [PATCH 016/427] Update installation instruction for JAX and add some dependencies. (#117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update installation instructio for JAX and add some depenencies. Signed-off-by: Frederic Bastien * Bring back support for none pip installed pybind11. Signed-off-by: Frederic Bastien * Apply suggestions from code review Co-authored-by: Kirthi Shankar Sivamani Signed-off-by: Frédéric Bastien * Changes following review. Signed-off-by: Frederic Bastien * Change order to make it more clear. Signed-off-by: Frederic Bastien * Add other reviers suggestion. Signed-off-by: Frederic Bastien * pybind11 is needed for all FW. Signed-off-by: Frederic Bastien * Add flax as a dep Signed-off-by: Frederic Bastien * Update README.rst Co-authored-by: Kirthi Shankar Sivamani Signed-off-by: Frédéric Bastien --------- Signed-off-by: Frederic Bastien Signed-off-by: Frédéric Bastien Co-authored-by: Kirthi Shankar Sivamani --- README.rst | 26 ++++++++++++++++++++++---- docs/installation.rst | 8 +++++--- setup.py | 27 ++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 8a042194a0..8bccb56912 100644 --- a/README.rst +++ b/README.rst @@ -131,13 +131,31 @@ Transformer Engine comes preinstalled in the pyTorch container on From source ^^^^^^^^^^^ -Clone the repository and inside it type: +For JAX, pybind11 must be installed: .. code-block:: bash - NVTE_FRAMEWORK=all pip install . # Building with all frameworks. - NVTE_FRAMEWORK=pytorch pip install . # Building with pyTorch only. - NVTE_FRAMEWORK=jax pip install . # Building with JAX only. + pip install pybind11 + +Then, you can install this optional dependency: + +.. 
code-block:: bash + + pip install ninja + +Install TE (optionally specifying the framework): + +.. code-block:: bash + + git clone https://github.com/NVIDIA/TransformerEngine.git + cd TransformerEngine + + # Execute one of the following command + NVTE_FRAMEWORK=all pip install . # Build TE for all supported frameworks. + NVTE_FRAMEWORK=pytorch pip install . # Build TE for PyTorch only. + NVTE_FRAMEWORK=jax pip install . # Build TE for JAX only. + +If the framework is not explicitly specified, TE will be built for PyTorch only. User Guide ---------- diff --git a/docs/installation.rst b/docs/installation.rst index 263d3ed760..0c12b6b79e 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -29,9 +29,11 @@ pip - from GitHub Additional Prerequisites ^^^^^^^^^^^^^^^^^^^^^^^^ -1. `CMake `__ version 3.18 or later -2. `pyTorch `__ with GPU support -3. `Ninja `__ +1. `CMake `__ version 3.18 or later. +2. [For pyTorch support] `pyTorch `__ with GPU support. +3. [For JAX support] `JAX `__ with GPU support, version >= 0.4.7. +4. `pybind11`: `pip install pybind11`. +5. [Optional] `Ninja `__: `pip install ninja`. Installation (stable release) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index 4b45cfd7de..46c5106794 100644 --- a/setup.py +++ b/setup.py @@ -161,11 +161,16 @@ def install_requires(): class JaxBuilder(FrameworkBuilderBase): def cmake_flags(self): - return ["-DENABLE_JAX=ON"] + p = [d for d in sys.path if 'dist-packages' in d][0] + return ["-DENABLE_JAX=ON", "-DCMAKE_PREFIX_PATH="+p] def run(self, extensions): print("Building jax extensions!") + def install_requires(): + # TODO: find a way to install pybind11 and ninja directly. + return ['cmake', 'flax'] + ext_modules = [] dlfw_builder_funcs = [] @@ -195,8 +200,13 @@ def run(self, extensions): if framework in ("all", "jax"): dlfw_builder_funcs.append(JaxBuilder) + # Trigger a better error when pybind11 isn't present. 
+ # Sadly, if pybind11 was installed with `apt -y install pybind11-dev` + # This doesn't install a python packages. So the line bellow is too strict. + # When it fail, we need to detect if cmake will find pybind11. + # import pybind11 -dlfw_install_requires = [] +dlfw_install_requires = ['pydantic'] for builder in dlfw_builder_funcs: dlfw_install_requires = dlfw_install_requires + builder.install_requires() @@ -257,10 +267,16 @@ def build_extensions(self) -> None: build_dir = os.path.abspath(build_dir) cmake_args = [ - "-GNinja", "-DCMAKE_BUILD_TYPE=" + config, "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(config.upper(), build_dir), ] + try: + import ninja + except ImportError: + pass + else: + cmake_args.append("-GNinja") + cmake_args = cmake_args + self.dlfw_flags cmake_build_args = ["--config", config] @@ -384,5 +400,10 @@ def get_outputs(self): ext_modules=ext_modules, cmdclass={"build_ext": TEBuildExtension}, install_requires=dlfw_install_requires, + extras_require={ + 'test': ['pytest', + 'tensorflow_datasets'], + 'test_pytest': ['onnxruntime',], + }, license_files=("LICENSE",), ) From ee87982096355b860beacd1eae7057715b51e989 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 18 Apr 2023 09:07:32 -0700 Subject: [PATCH 017/427] Amax reduction interval (#154) * amax reduction internval Signed-off-by: Sangkug Lym Skip TP-domain only AMAX reduction when TP-group is not initialized Signed-off-by: Sangkug Lym * Update transformer_engine/pytorch/fp8.py Co-authored-by: Kirthi Shankar Sivamani Signed-off-by: Sangkug Lym * check TP group initialized Signed-off-by: Sangkug Lym fix Signed-off-by: Sangkug Lym --------- Signed-off-by: Sangkug Lym Co-authored-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/fp8.py | 33 ++++++++++++++++++- transformer_engine/pytorch/module.py | 48 ++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index 
4304c8cd8f..07cad012ec 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -3,6 +3,7 @@ # See LICENSE for license information. """FP8 utilities for TransformerEngine""" +import os from contextlib import contextmanager from collections import deque from typing import Callable, List, Optional, Dict, Any, Tuple, Union @@ -30,6 +31,9 @@ _amax_reduce_handle_fwd = None _is_fp8_available = None _reason_for_no_fp8 = "" +_dp_amax_reduce_interval = None +_dp_amax_reduce_forward_idx = 0 +_dp_amax_reduce_backward_idx = 0 def _check_fp8_support() -> Tuple[bool, str]: @@ -545,6 +549,8 @@ def reduce_tensor_across_group_op_max( def global_amax_reduction( fp8_meta: Dict[str, Any], + tp_group: dist_group_type, + tp_size: int, forward: bool = True, ) -> None: """Concatenate, reduce, and split amaxes in the global buffer.""" @@ -555,12 +561,37 @@ def global_amax_reduction( if amax_buffer_key not in _global_fp8_buffer: return None + # Reduce AMAX in DP-domain at an interval. 
+ global _dp_amax_reduce_interval, _dp_amax_reduce_forward_idx, _dp_amax_reduce_backward_idx + if _dp_amax_reduce_interval is None: + _dp_amax_reduce_interval = int(os.getenv("NVTE_DP_AMAX_REDUCE_INTERVAL", "1")) + + tp_amax_reduce = False + if forward: + if _dp_amax_reduce_forward_idx == 0: + reduce_group = fp8_meta["fp8_group"] + else: + tp_amax_reduce = True + _dp_amax_reduce_forward_idx = (_dp_amax_reduce_forward_idx + 1) % _dp_amax_reduce_interval + else: + if _dp_amax_reduce_backward_idx == 0: + reduce_group = fp8_meta["fp8_group"] + else: + tp_amax_reduce = True + _dp_amax_reduce_backward_idx = (_dp_amax_reduce_backward_idx + 1) % _dp_amax_reduce_interval + + if tp_amax_reduce: + if tp_size > 1: + reduce_group = tp_group + else: + return None + chunk_sizes = [x.numel() for x in _global_fp8_buffer[amax_buffer_key]] contiguous_amax = torch.cat(_global_fp8_buffer[amax_buffer_key]) wait_handle = reduce_tensor_across_group_op_max( contiguous_amax, - fp8_meta["fp8_group"], + reduce_group, fp8_meta["async_amax_reduction"], ) diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index 7c25619485..dff37497d6 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -105,7 +105,13 @@ def get_workspace() -> torch.Tensor: return _cublas_workspace @contextmanager -def _prepare_backward(fp8: bool, fp8_meta: Dict[str, Any], name: str = "") -> None: +def _prepare_backward( + fp8: bool, + fp8_meta: Dict[str, Any], + tp_group: dist_group_type, + tp_size: int, + name: str = "" +) -> None: """Checks and prep for BWD.""" if fp8: global _amax_reduce_handle_bwd @@ -132,7 +138,12 @@ def _prepare_backward(fp8: bool, fp8_meta: Dict[str, Any], name: str = "") -> N if fp8 and fp8_meta["recipe"].reduce_amax: if fp8_meta["first_module"]: - _amax_reduce_handle_bwd = global_amax_reduction(fp8_meta, forward=False) + _amax_reduce_handle_bwd = global_amax_reduction( + fp8_meta, + tp_group, + tp_size, + forward=False + ) 
delete_key_from_amax_buffer(forward=False) @@ -186,7 +197,6 @@ def __init__(self) -> None: self.fp8_meta["recipe"] = get_default_fp8_recipe() self.fp8_meta_tensors_initialized = False self.tp_group = None - self.tp_group_initialized = False self.tp_size = 1 self.sequence_parallel = False self.fp8_weight_shapes = [] @@ -541,7 +551,13 @@ def prepare_forward( if self.fp8 and self.training and self.fp8_meta["recipe"].reduce_amax: set_fp8_context_id(self.fp8_meta["autocast_id_fwd"]) - reduce_func = partial(global_amax_reduction, self.fp8_meta, forward=True) + reduce_func = partial( + global_amax_reduction, + self.fp8_meta, + self.tp_group, + self.tp_size, + forward=True + ) setup_amax_forward_global_reduce_func(reduce_func) def set_nccl_overlap_warning_if_tp(self) -> None: @@ -692,6 +708,7 @@ def forward( fp8_meta: Dict[str, Any], fuse_wgrad_accumulation: bool, tp_group: Union[dist_group_type, None], + tp_size: int, sequence_parallel: bool, tensor_parallel: bool, activation_dtype: torch.dtype, @@ -867,6 +884,7 @@ def forward( ctx.inp_shape = inp.shape ctx.parallel_mode = parallel_mode ctx.tp_group = tp_group + ctx.tp_size = tp_size ctx.return_layernorm_output = return_layernorm_output ctx.bwd_ln_sm_margin = bwd_ln_sm_margin ctx.zero_centered_gamma = zero_centered_gamma @@ -890,7 +908,9 @@ def forward( def backward( ctx, *grad_outputs: Tuple[torch.Tensor, ...] 
) -> Tuple[Union[torch.Tensor, None], ...]: - with _prepare_backward(ctx.fp8, ctx.fp8_meta, name="_LayerNormLinear"): + with _prepare_backward( + ctx.fp8, ctx.fp8_meta, ctx.tp_group, ctx.tp_size, name="_LayerNormLinear" + ): ( inputmat, ln_weight, @@ -1065,6 +1085,7 @@ def backward( None, None, None, + None, ) @@ -1381,6 +1402,7 @@ def forward( self.fp8_meta, self.fuse_wgrad_accumulation, self.tp_group, + self.tp_size, self.sequence_parallel, self.tp_size > 1, self.activation_dtype, @@ -1427,6 +1449,7 @@ def forward( fp8_meta: Dict[str, Any], fuse_wgrad_accumulation: bool, tp_group: Union[dist_group_type, None], + tp_size: int, sequence_parallel: bool, tensor_parallel: bool, activation_dtype: torch.dtype, @@ -1563,6 +1586,7 @@ def forward( ctx.inp_shape = inp.shape ctx.parallel_mode = parallel_mode ctx.tp_group = tp_group + ctx.tp_size = tp_size ctx.requires_dgrad = inp.requires_grad # Row Parallel Linear @@ -1579,7 +1603,9 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: - with _prepare_backward(ctx.fp8, ctx.fp8_meta, name="_Linear"): + with _prepare_backward( + ctx.fp8, ctx.fp8_meta, ctx.tp_group, ctx.tp_size, name="_Linear" + ): ( inputmat, inputmat_t, @@ -1730,6 +1756,7 @@ def backward( None, None, None, + None, ) @@ -1995,6 +2022,7 @@ def forward( self.fp8_meta, self.fuse_wgrad_accumulation, self.tp_group, + self.tp_size, self.sequence_parallel, self.tp_size > 1, self.activation_dtype, @@ -2039,6 +2067,7 @@ def forward( fp8_meta: Dict[str, Any], fuse_wgrad_accumulation: bool, tp_group: Union[dist_group_type, None], + tp_size: int, sequence_parallel: bool, tensor_parallel: bool, activation_dtype: torch.dtype, @@ -2282,6 +2311,7 @@ def forward( ctx.tensor_parallel = tensor_parallel ctx.inp_shape = inp.shape ctx.tp_group = tp_group + ctx.tp_size = tp_size ctx.bias_gelu_nvfusion = bias_gelu_nvfusion ctx.return_layernorm_output = return_layernorm_output ctx.set_parallel_mode = set_parallel_mode @@ -2307,7 
+2337,9 @@ def forward( def backward( ctx, *grad_outputs: Tuple[torch.Tensor, ...] ) -> Tuple[Union[torch.Tensor, None], ...]: - with _prepare_backward(ctx.fp8, ctx.fp8_meta, name="_LayerNormMLP"): + with _prepare_backward( + ctx.fp8, ctx.fp8_meta, ctx.tp_group, ctx.tp_size, name="_LayerNormMLP" + ): ( inputmat, ln_weight, @@ -2610,6 +2642,7 @@ def backward( None, None, None, + None, ) @@ -2904,6 +2937,7 @@ def forward( self.fp8_meta, self.fuse_wgrad_accumulation, self.tp_group, + self.tp_size, self.sequence_parallel, self.tp_size > 1, self.activation_dtype, From e64fc3be6a7dacf21e992ec4f1ddd5ea6fb6ce21 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 19 Apr 2023 10:52:31 -0700 Subject: [PATCH 018/427] TP communication overlap with userbuffers (#147) * Port initial changes Co-authored-by: Sangkug Lym Co-authored-by: Vasudevan Rengasamy Signed-off-by: Kirthi Shankar Sivamani * readd FA include for PyTorch Signed-off-by: Kirthi Shankar Sivamani * Re-enable sm_70 + cleanup Signed-off-by: Kirthi Shankar Sivamani * LICENSE, cleanup header Signed-off-by: Kirthi Shankar Sivamani * 5k -> 173 errors Signed-off-by: Kirthi Shankar Sivamani * license and fixes in userbuffers-host Signed-off-by: Kirthi Shankar Sivamani * next round fixes Signed-off-by: Kirthi Shankar Sivamani * final cpp cleanup Signed-off-by: Kirthi Shankar Sivamani * pylinting Signed-off-by: Kirthi Shankar Sivamani * fix from linting Signed-off-by: Kirthi Shankar Sivamani * Turn off default async amax reduction (#148) Signed-off-by: Kirthi Shankar Sivamani * remove unused code path Signed-off-by: Sangkug Lym * cleanup Macros Signed-off-by: Sangkug Lym * fix conflict resolution bug Signed-off-by: Sangkug Lym * Fix gencode flags in setup (#145) * Fix gencode flags based on cuda version Signed-off-by: Kirthi Shankar Sivamani * review suggestions Signed-off-by: Kirthi Shankar Sivamani * revert append_nvcc_threads change Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi 
Shankar Sivamani * Change overlap config dict error message Signed-off-by: Sangkug Lym * simplify ub initialization Signed-off-by: Sangkug Lym * lint Signed-off-by: Kirthi Shankar Sivamani * fix sanity imports Signed-off-by: Kirthi Shankar Sivamani * cpplint Signed-off-by: Kirthi Shankar Sivamani * fix TensorFlow build Signed-off-by: Kirthi Shankar Sivamani * fix TE macros in public header Signed-off-by: Kirthi Shankar Sivamani * fix lint Signed-off-by: Kirthi Shankar Sivamani * More fixes Signed-off-by: Kirthi Shankar Sivamani * compiles with and w/o MPI Signed-off-by: Kirthi Shankar Sivamani * fixes for python side annotations for conditional compile Signed-off-by: Kirthi Shankar Sivamani * link gdrAPI only when MPI found Signed-off-by: Kirthi Shankar Sivamani * fix comments for dummy var Signed-off-by: Kirthi Shankar Sivamani * Fix linking Signed-off-by: Kirthi Shankar Sivamani * Review comments Signed-off-by: Kirthi Shankar Sivamani * load MPI before TE Signed-off-by: Kirthi Shankar Sivamani * Add Py side argument checks Signed-off-by: Kirthi Shankar Sivamani * remove unused code and catch silent failures Signed-off-by: Kirthi Shankar Sivamani * Fix cpp tests Signed-off-by: Kirthi Shankar Sivamani * fix find_lib path for tests Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym Co-authored-by: Vasudevan Rengasamy --- qa/L0_cppunittest/test.sh | 7 +- qa/L0_jax_lint/CPPLINT.cfg | 1 + qa/L0_lint/CPPLINT.cfg | 1 + qa/L0_tensorflow_lint/CPPLINT.cfg | 1 + setup.py | 30 +- tests/cpp/CMakeLists.txt | 6 + tests/cpp/operator/CMakeLists.txt | 8 +- transformer_engine/__init__.py | 1 - transformer_engine/common/CMakeLists.txt | 58 +- transformer_engine/common/__init__.py | 23 + .../comm_gemm_overlap/userbuffers-host.cpp | 464 +++++ .../common/comm_gemm_overlap/userbuffers.cu | 1734 +++++++++++++++++ .../common/gemm/cublaslt_gemm.cu | 11 + .../common/include/transformer_engine/gemm.h | 
2 + .../include/transformer_engine/userbuffers.h | 227 +++ transformer_engine/jax/csrc/modules.cpp | 2 +- transformer_engine/pytorch/cpp_extensions.py | 89 +- .../pytorch/csrc/comm_gemm_overlap.h | 579 ++++++ transformer_engine/pytorch/csrc/extensions.cu | 187 +- transformer_engine/pytorch/csrc/extensions.h | 34 +- transformer_engine/pytorch/csrc/ts_fp8_op.cpp | 3 +- transformer_engine/pytorch/module.py | 517 ++++- transformer_engine/pytorch/transformer.py | 38 + .../tensorflow/csrc/extensions.cu | 2 +- 24 files changed, 3942 insertions(+), 83 deletions(-) create mode 100644 transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp create mode 100644 transformer_engine/common/comm_gemm_overlap/userbuffers.cu create mode 100644 transformer_engine/common/include/transformer_engine/userbuffers.h create mode 100644 transformer_engine/pytorch/csrc/comm_gemm_overlap.h diff --git a/qa/L0_cppunittest/test.sh b/qa/L0_cppunittest/test.sh index 55406c2089..73a27a1fcd 100644 --- a/qa/L0_cppunittest/test.sh +++ b/qa/L0_cppunittest/test.sh @@ -4,11 +4,16 @@ set -e +# Find TE : ${TE_PATH:=/opt/transformerengine} TE_LIB_PATH=`pip show transformer-engine | grep Location | cut -d ' ' -f 2` export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH +# Find MPI +MPI_HOME=${MPI_HOME:-/usr/local/mpi} +NVTE_MPI_INCLUDE="$MPI_HOME/lib" + cd $TE_PATH/tests/cpp -cmake -GNinja -Bbuild . +cmake -GNinja -Bbuild -DNVTE_MPI_INCLUDE=$NVTE_MPI_INCLUDE . 
cmake --build build ctest --test-dir build -j4 diff --git a/qa/L0_jax_lint/CPPLINT.cfg b/qa/L0_jax_lint/CPPLINT.cfg index 9eb7b734bb..a2a06602c1 100644 --- a/qa/L0_jax_lint/CPPLINT.cfg +++ b/qa/L0_jax_lint/CPPLINT.cfg @@ -14,3 +14,4 @@ filter=-build/namespaces filter=-readability/todo filter=-build/header_guard filter=-build/include +filter=-build/c++11 diff --git a/qa/L0_lint/CPPLINT.cfg b/qa/L0_lint/CPPLINT.cfg index 9eb7b734bb..a2a06602c1 100644 --- a/qa/L0_lint/CPPLINT.cfg +++ b/qa/L0_lint/CPPLINT.cfg @@ -14,3 +14,4 @@ filter=-build/namespaces filter=-readability/todo filter=-build/header_guard filter=-build/include +filter=-build/c++11 diff --git a/qa/L0_tensorflow_lint/CPPLINT.cfg b/qa/L0_tensorflow_lint/CPPLINT.cfg index 9eb7b734bb..a2a06602c1 100644 --- a/qa/L0_tensorflow_lint/CPPLINT.cfg +++ b/qa/L0_tensorflow_lint/CPPLINT.cfg @@ -14,3 +14,4 @@ filter=-build/namespaces filter=-readability/todo filter=-build/header_guard filter=-build/include +filter=-build/c++11 diff --git a/setup.py b/setup.py index 55552294e4..decdce51a4 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,11 @@ path = os.path.dirname(os.path.realpath(__file__)) with open(path + "/VERSION", "r") as f: te_version = f.readline() + CUDA_HOME = os.environ.get("CUDA_HOME", "/usr/local/cuda") +MPI_HOME = os.environ.get("MPI_HOME", "/usr/local/mpi") +NVTE_MPI_FOUND = os.path.exists(MPI_HOME) +NVTE_MPI_INCLUDE = os.path.join(MPI_HOME, "include") def get_cuda_bare_metal_version(cuda_dir): raw_output = subprocess.check_output( @@ -51,7 +55,7 @@ def extra_gencodes(cc_flag): def extra_compiler_flags(): - return [ + extra_flags = [ "-O3", "-gencode", "arch=compute_70,code=sm_70", @@ -66,6 +70,9 @@ def extra_compiler_flags(): "--expt-extended-lambda", "--use_fast_math", ] + if NVTE_MPI_FOUND: + extra_flags.append("-DNVTE_MPI_FOUND") + return extra_flags cc_flag = [] @@ -76,12 +83,6 @@ def make_abs_path(l): return [os.path.join(path, p) for p in l] -include_dirs = [ - "transformer_engine/common/include", 
- "transformer_engine/pytorch/csrc", -] -include_dirs = make_abs_path(include_dirs) - pytorch_sources = [ "transformer_engine/pytorch/csrc/extensions.cu", "transformer_engine/pytorch/csrc/common.cu", @@ -100,6 +101,14 @@ def make_abs_path(l): framework = os.environ.get("NVTE_FRAMEWORK", "pytorch") +include_dirs = [ + "transformer_engine/common/include", + "transformer_engine/pytorch/csrc", +] +if (framework in ("all", "pytorch")) and NVTE_MPI_FOUND: + include_dirs.append(NVTE_MPI_INCLUDE) +include_dirs = make_abs_path(include_dirs) + args = sys.argv.copy() for s in args: if s.startswith("--framework="): @@ -155,10 +164,16 @@ def run(self, extensions): print("Building pyTorch extensions!") self.pytorch_build_extensions.run() + def cmake_flags(self): + if not NVTE_MPI_FOUND: + return [] + return ["-DNVTE_MPI_FOUND=1", f"-DNVTE_MPI_INCLUDE={NVTE_MPI_INCLUDE}"] + @staticmethod def install_requires(): return ["flash-attn>=1.0.2",] + class TensorFlowBuilder(FrameworkBuilderBase): def cmake_flags(self): p = [d for d in sys.path if 'dist-packages' in d][0] @@ -167,6 +182,7 @@ def cmake_flags(self): def run(self, extensions): print("Building TensorFlow extensions!") + class JaxBuilder(FrameworkBuilderBase): def cmake_flags(self): p = [d for d in sys.path if 'dist-packages' in d][0] diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 75a9d13a20..631b356fec 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -27,6 +27,12 @@ if(NOT DEFINED TE_LIB_PATH) endif() find_library(TE_LIB NAMES transformer_engine PATHS ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED) + +if(EXISTS ${NVTE_MPI_INCLUDE}) + find_library(MPI_LIB NAMES mpi PATHS ${NVTE_MPI_INCLUDE} REQUIRED) + message(STATUS "Found MPI library: ${MPI_LIB}") +endif() + message(STATUS "Found transformer_engine library: ${TE_LIB}") include_directories(../../transformer_engine/common/include) include_directories(${CMAKE_SOURCE_DIR}) diff --git a/tests/cpp/operator/CMakeLists.txt 
b/tests/cpp/operator/CMakeLists.txt index d720798db5..a77cf98a73 100644 --- a/tests/cpp/operator/CMakeLists.txt +++ b/tests/cpp/operator/CMakeLists.txt @@ -17,7 +17,13 @@ add_executable(test_operator test_multi_cast_transpose.cu ../test_common.cu) -target_link_libraries(test_operator PUBLIC CUDA::cudart GTest::gtest_main ${TE_LIB}) +list(APPEND test_operator_LINKER_LIBS CUDA::cudart GTest::gtest_main ${TE_LIB}) + +if(EXISTS ${NVTE_MPI_INCLUDE}) + list(APPEND test_operator_LINKER_LIBS ${MPI_LIB}) +endif() + +target_link_libraries(test_operator PUBLIC ${test_operator_LINKER_LIBS}) target_compile_options(test_operator PRIVATE -O2) include(GoogleTest) diff --git a/transformer_engine/__init__.py b/transformer_engine/__init__.py index bbe18df6db..6d89b9aad5 100644 --- a/transformer_engine/__init__.py +++ b/transformer_engine/__init__.py @@ -5,7 +5,6 @@ """Top level package""" from . import common - try: from . import pytorch except ImportError as e: diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index cee3cad71d..7459f77e4f 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -1,35 +1,55 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. 
-add_library(transformer_engine SHARED - transformer_engine.cpp - transpose/cast_transpose.cu - transpose/transpose.cu - transpose/cast_transpose_fusion.cu - transpose/transpose_fusion.cu - transpose/multi_cast_transpose.cu - activation/gelu.cu - gemm/cublaslt_gemm.cu - layer_norm/ln_api.cpp - layer_norm/ln_bwd_semi_cuda_kernel.cu - layer_norm/ln_fwd_cuda_kernel.cu - rmsnorm/rmsnorm_api.cpp - rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu - rmsnorm/rmsnorm_fwd_cuda_kernel.cu - util/cast.cu - fused_softmax/scaled_masked_softmax.cu - fused_softmax/scaled_upper_triang_masked_softmax.cu) + +set(transformer_engine_SOURCES) +list(APPEND transformer_engine_SOURCES transformer_engine.cpp + transpose/cast_transpose.cu + transpose/transpose.cu + transpose/cast_transpose_fusion.cu + transpose/transpose_fusion.cu + transpose/multi_cast_transpose.cu + activation/gelu.cu + gemm/cublaslt_gemm.cu + layer_norm/ln_api.cpp + layer_norm/ln_bwd_semi_cuda_kernel.cu + layer_norm/ln_fwd_cuda_kernel.cu + rmsnorm/rmsnorm_api.cpp + rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu + rmsnorm/rmsnorm_fwd_cuda_kernel.cu + util/cast.cu + fused_softmax/scaled_masked_softmax.cu + fused_softmax/scaled_upper_triang_masked_softmax.cu) + +if(NVTE_MPI_FOUND) + list(APPEND transformer_engine_SOURCES comm_gemm_overlap/userbuffers.cu + comm_gemm_overlap/userbuffers-host.cpp) +endif() + +add_library(transformer_engine SHARED ${transformer_engine_SOURCES}) target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") list(APPEND transformer_engine_LINKER_LIBS CUDA::cublas CUDA::cudart CUDA::nvToolsExt) -target_link_libraries(transformer_engine PUBLIC ${transformer_engine_LINKER_LIBS}) +if(NVTE_MPI_FOUND) + list(APPEND transformer_engine_LINKER_LIBS gdrapi) +endif() +target_link_libraries(transformer_engine PUBLIC ${transformer_engine_LINKER_LIBS}) target_include_directories(transformer_engine PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 
set_source_files_properties(fused_softmax/scaled_masked_softmax.cu fused_softmax/scaled_upper_triang_masked_softmax.cu PROPERTIES COMPILE_OPTIONS "--use_fast_math") + +if(NVTE_MPI_FOUND) + set_source_files_properties(comm_gemm_overlap/userbuffers.cu + comm_gemm_overlap/userbuffers-host.cpp + PROPERTIES + INCLUDE_DIRECTORIES ${NVTE_MPI_INCLUDE} + COMPILE_OPTIONS "$<$:-maxrregcount=64>") +endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 7dfcdc96bb..0a8924f8ed 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -37,4 +37,27 @@ def _load_library(): return ctypes.CDLL(dll_path, mode=ctypes.RTLD_GLOBAL) +def _load_mpi(): + """Load MPI shared library""" + + system = platform.system() + if system == "Linux": + extension = "so" + elif system == "Darwin": + extension = "dylib" + elif system == "Windows": + extension = "dll" + else: + raise RuntimeError(f"Unsupported operating system ({system})") + lib_name = "libmpi." + extension + MPI_HOME = os.environ.get("MPI_HOME", "/usr/local/mpi") + NVTE_MPI_FOUND = os.path.exists(MPI_HOME) + dll_path = os.path.join(MPI_HOME, "lib", lib_name) + + if NVTE_MPI_FOUND: + return ctypes.CDLL(dll_path, mode=ctypes.RTLD_GLOBAL) + return None + + +_TE_LIB_CTYPES = _load_mpi() _TE_LIB_CTYPES = _load_library() diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp new file mode 100644 index 0000000000..14928ed5a1 --- /dev/null +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp @@ -0,0 +1,464 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int oob_bcast(void *comm_context, void *buf, int size, int root) { + MPI_Bcast(buf, size, MPI_BYTE, root, + (reinterpret_cast(comm_context))->comm_inter); + return 0; +} + +static int oob_barrier(void *comm_context) { + MPI_Barrier((reinterpret_cast(comm_context))->comm_inter); + return 0; +} + +static int oob_gather(void *comm_context, int root, void *sbuf, void *rbuf, int len) { + MPI_Gather(sbuf, len, MPI_BYTE, rbuf, len, MPI_BYTE, root, + (reinterpret_cast(comm_context))->comm_inter); + return 0; +} + +int stringCmp(const void *a, const void *b) { return strcmp((const char *)a, (const char *)b); } + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int pipe_rank(communicator *comm, int step) { + int mynode = comm->myrank / comm->nvsize; + int mylocal = comm->nvrank; + int numlocal = comm->nvsize; + + int newlocal1 = mylocal + step * comm->ar_nvsize * comm->ar2_nvsize; + int newlocal = (numlocal + (newlocal1 % numlocal)) % numlocal; + int newnode = mynode; + newnode += (newlocal1 - newlocal) / numlocal * comm->num_nodes * comm->num2_nodes; + int allnodes = comm->nranks / comm->nvsize; + newnode = (allnodes + (newnode % allnodes)) % allnodes; + return newnode * numlocal + newlocal; +} + +int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenodes, int tensorgpus, + int tensornodes) { + *comm = reinterpret_cast(malloc(sizeof(communicator))); + + int myrank, nranks, cur_dev, ndev; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + (*comm)->nranks = nranks; + (*comm)->myrank = myrank; + 
(*comm)->free_region = 0; + (*comm)->launch_mode = NVTE_LAUNCH_GPU | NVTE_LAUNCH_CPU; + + cudaDeviceProp device_prop; + CUDACHECK(cudaGetDevice(&cur_dev)); + CUDACHECK(cudaGetDeviceCount(&ndev)); + CUDACHECK(cudaGetDeviceProperties(&device_prop, cur_dev)); + (*comm)->sm_arch = device_prop.major; + // (*comm)->use_rr_kernel = device_prop.major == 8; + (*comm)->use_rr_kernel = 0; + (*comm)->push = 1; + (*comm)->use_ce = 0; + (*comm)->cga_size = 2; + for (int i = 0; i < userbuffers_op_types; i++) (*comm)->basecounter[i] = 0; + (*comm)->head = 0; + (*comm)->tail = 0; + (*comm)->activeproxy = 1; + (*comm)->active_nreqs = 0; + for (int i = 0; i < userbuffers_op_types; i++) (*comm)->active_req[i].active = -1; + + int ret = 0; + // split communicator + char host_name[MPI_MAX_PROCESSOR_NAME]; + char(*host_names)[MPI_MAX_PROCESSOR_NAME]; + int namelen, bytes, color, my_node, mylocal, numlocal, num_nodes; + int rank = (*comm)->myrank, size = (*comm)->nranks; + MPI_Get_processor_name(host_name, &namelen); + bytes = size * sizeof(char[MPI_MAX_PROCESSOR_NAME]); + host_names = (char(*)[MPI_MAX_PROCESSOR_NAME])malloc(bytes); + strcpy(host_names[rank], host_name); // NOLINT(*) + for (int n = 0; n < size; n++) + MPI_Bcast(&(host_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD); + qsort(host_names, size, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp); + + color = 0; + for (int n = 0; n < size; n++) { + if (n > 0 && strcmp(host_names[n - 1], host_names[n])) color++; + if (strcmp(host_name, host_names[n]) == 0) break; + } + free(host_names); + + MPI_Comm_split(MPI_COMM_WORLD, color, rank, &(*comm)->comm_intra); + // find intranode numbers and make internode communicator + // figure out mylocal + MPI_Comm_rank((*comm)->comm_intra, &mylocal); + MPI_Comm_size((*comm)->comm_intra, &numlocal); + (*comm)->nvrank = mylocal; + (*comm)->nvsize = numlocal; + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + int core; + if (mylocal == 0) core = 50; + if (mylocal == 1) core = 58; + if 
(mylocal == 2) core = 18; + if (mylocal == 3) core = 26; + if (mylocal == 4) core = 114; + if (mylocal == 5) core = 122; + if (mylocal == 6) core = 82; + if (mylocal == 7) core = 90; + + CPU_SET(core, &cpuset); + if (!getenv("NVTE_NODOUBLE")) { + if (core > 128) + CPU_SET(core - 128, &cpuset); + else + CPU_SET(core + 128, &cpuset); + } + if (getenv("NVTE_DOPIN")) pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + + if (ndev == numlocal) { // all visible devices + if (cur_dev != mylocal) + printf("%d: device used %d[%d] ,resetting device to %d\n", rank, cur_dev, ndev, mylocal); + CUDACHECK(cudaSetDevice(mylocal)); + } + (*comm)->mydev = cur_dev; + // FIXME need to check that numlocal is multiple of pipegpus x tensorgpus + // ar1 is data + int divgpus = pipegpus * tensorgpus; + int datagpus = numlocal / divgpus; + (*comm)->ar_nvsize = datagpus; + (*comm)->ar_firstgpu = mylocal - ((mylocal / tensorgpus) % datagpus) * tensorgpus; + (*comm)->ar_nvrank = (mylocal - (*comm)->ar_firstgpu) / tensorgpus; + // ar2 is tensor + (*comm)->ar2_nvsize = tensorgpus; + (*comm)->ar2_firstgpu = mylocal - mylocal % tensorgpus; + (*comm)->ar2_nvrank = mylocal - (*comm)->ar2_firstgpu; + // ar2 has step equal to ar_nvsize + int allnodes = nranks / numlocal; + int mynode = myrank / numlocal; + int datanodes = allnodes / pipenodes / tensornodes; + int pipenodegroup_id = myrank / numlocal / (datanodes * tensornodes); + + (*comm)->pipe_id = pipegpus * pipenodegroup_id + mylocal / (datagpus * tensorgpus); + + CUDACHECK(cudaFree(0)); + int datanodegroup_id = + myrank / numlocal / datanodes; // data reduction group node belongs, equals 0 for all if both + // pipenodes=1 and tensornodes=1 + // mpi communicator only needed for SHARP which is always allreduce1/data-parallel + MPI_Comm_split(MPI_COMM_WORLD, mylocal + numlocal * datanodegroup_id, rank, &(*comm)->comm_inter); + // different rails from same group are in different subcommunicators + + MPI_Comm_size((*comm)->comm_inter, 
&num_nodes); + MPI_Comm_rank((*comm)->comm_inter, &my_node); + (*comm)->first_node = mynode - my_node; + (*comm)->num_nodes = num_nodes; + (*comm)->my_node = my_node; + + (*comm)->num2_nodes = tensornodes; + (*comm)->my2_node = (mynode / datanodes) % tensornodes; + (*comm)->first2_node = mynode - (*comm)->my2_node * datanodes; + + char *ib_dev_list; + int ZIONROCE = getenv("NVTE_ZIONROCE") ? atoi(getenv("NVTE_ZIONROCE")) : 0; + int ROCE = getenv("NVTE_ROCE") ? atoi(getenv("NVTE_ROCE")) : 0; + if (ZIONROCE) ROCE = 1; + int DGX_H100 = device_prop.major == 9; + + switch (mylocal) { + case 0:ib_dev_list = "mlx5_0:1"; break; // NOLINT(*) + case 1:ib_dev_list = (char*)(DGX_H100?"mlx5_3:1":"mlx5_1:1"); break; // NOLINT(*) + case 2:ib_dev_list = (char*)(ZIONROCE?"mlx5_4:1":DGX_H100?"mlx5_4:1":"mlx5_2:1"); break; // NOLINT(*) + case 3:ib_dev_list = (char*)(DGX_H100?"mlx5_5:1":"mlx5_3:1"); break; // NOLINT(*) + case 4:ib_dev_list = (char*)(DGX_H100?"mlx5_6:1":"mlx5_6:1"); break; // NOLINT(*) + case 5:ib_dev_list = (char*)(DGX_H100?"mlx5_9:1":"mlx5_7:1"); break; // NOLINT(*) + case 6:ib_dev_list = (char*)(ZIONROCE?"mlx5_10:1":DGX_H100?"mlx5_10:1":"mlx5_8:1"); break; // NOLINT(*) + case 7:ib_dev_list = (char*)(DGX_H100?"mlx5_11:1":"mlx5_9:1"); break; // NOLINT(*) + default: break; + } + + (*comm)->fifo = reinterpret_cast(malloc(sizeof(ub_request) * NVTE_MAX_REQUESTS)); + (*comm)->nblocks = 8; + (*comm)->alignblock = 1024 * 512; + (*comm)->minblock = 1024 * 2 * 1024; + (*comm)->asyncblocks = 16; + + CUDACHECK(cudaMallocHost((void **)&(*comm)->hostflags, // NOLINT(*) + (NVTE_MAX_SMS + 100) * sizeof(int))); + for (int i = 0; i < 100 + NVTE_MAX_SMS; i++) (*comm)->hostflags[i] = 0; + _mm_mfence(); + sleep(1); + + // init_p2p_transport(); + (*comm)->ibnvsize = (*comm)->nvsize; + +#define NBUF 2 +#define LOCALSIZE 4 * (NVTE_REG0_OFFSET(*comm) + NVTE_REG0_FLAGS + NVTE_REG0_COMMBUFFER * NBUF) + // peer pointers + op flags + comm buffer + + CUDACHECK(cudaMalloc(&(*comm)->gpu_ptrs, 
LOCALSIZE)); // flags and pointers, no block data yet + CUDACHECK(cudaMemset((*comm)->gpu_ptrs, 0, LOCALSIZE)); + CUDACHECK(cudaDeviceSynchronize()); + register_user_buffer_collective(&((*comm)->gpu_ptrs), LOCALSIZE, *comm); // will use handler 0 + CUDACHECK(cudaMalloc(&(*comm)->send_id, (*comm)->nranks * sizeof(int))); + CUDACHECK(cudaMalloc(&(*comm)->recv_id, NVTE_MAX_REGIONS * (*comm)->nranks * sizeof(int))); + CUDACHECK(cudaMemset((*comm)->send_id, 0, (*comm)->nranks * sizeof(int))); + CUDACHECK(cudaMemset((*comm)->recv_id, 0, NVTE_MAX_REGIONS * (*comm)->nranks * sizeof(int))); + (*comm)->sms = 16; + (*comm)->threads = 1024; + +#define GPU_PAGE_SHIFT 16 +#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) +#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1) +#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) + CUDACHECK(cudaMalloc(&(*comm)->flags, 2 * GPU_PAGE_SIZE)); + unsigned int flag = 1; + // cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)(*comm)->flags); + CUDACHECK(cudaMemset((*comm)->flags, 0, 2 * GPU_PAGE_SIZE)); + (*comm)->flags = + reinterpret_cast(((CUdeviceptr)(*comm)->flags + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK); + + using namespace std; + (*comm)->g = gdr_open(); + if ((*comm)->g == NULL) { + fprintf(stderr, "gdrcopy open failed\n"); + return -1; + } + gdr_mh_t mh; + ret = gdr_pin_buffer((*comm)->g, (CUdeviceptr)(*comm)->flags, GPU_PAGE_SIZE, 0, 0, &mh); + if (ret) { + fprintf(stderr, "gdr_pin_buffer failed\n"); + return -1; + } + ret = gdr_map((*comm)->g, mh, (void **)&((*comm)->map_flags), GPU_PAGE_SIZE); // NOLINT(*) + + if (ret) { + fprintf(stderr, "gdr_map failed\n"); + return -1; + } + sched_param param; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_getschedparam(&attr, ¶m); + param.sched_priority = sched_get_priority_max(SCHED_FIFO); + + pthread_attr_setschedparam(&attr, ¶m); + + if (getenv("NVTE_UBDEBUG")) + printf("%d/%d:(%d x %d): DP %d x %d TP %d x %d, DPGROUP %dx%d TPGROUP %dx%d PIPE_ID %d/%d\n", + myrank, 
nranks, myrank / numlocal, myrank % numlocal, (*comm)->my_node, + (*comm)->ar_nvrank, (*comm)->my2_node, (*comm)->ar2_nvrank, (*comm)->num_nodes, + (*comm)->ar_nvsize, (*comm)->num2_nodes, (*comm)->ar2_nvsize, (*comm)->pipe_id, + pipegpus * pipenodes); + fflush(NULL); + + return 0; +} +int create_communicator_grouped(communicator **comm, int pipegpus, int pipenodes) { + return create_communicator_grouped2(comm, pipegpus, pipenodes, 1, 1); +} + +int create_communicator(communicator **comm) { + return create_communicator_grouped2(comm, 1, 1, 1, 1); +} + +void destroy_communicator(communicator *comm) { + comm->activeproxy = 0; + if (!comm->myrank && getenv("NVTE_UBDEBUG")) + printf("waiting for userbuffers proxy thread to exit()\n"); + gdr_close(comm->g); +} + +int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *comm, bool alloc) { + if (comm->free_region > NVTE_MAX_REGIONS) return -1; + int hndl = comm->free_region; + // printf("%d register %d size %lld\n",comm->myrank,hndl,bytes);fflush(NULL); + comm->peer_ptr[hndl] = reinterpret_cast(malloc(sizeof(void *) * (comm->nvsize))); + + if (alloc) { + CUDACHECK(cudaMalloc(gpubuff, bytes)); + } + assert(comm->nvsize <= 8); + cudaIpcMemHandle_t *memhndl = + reinterpret_cast(malloc(sizeof(cudaIpcMemHandle_t) * (comm->nvsize))); + + CUDACHECK(cudaIpcGetMemHandle(&memhndl[comm->nvrank], *gpubuff)); + + MPI_Allgather(&memhndl[comm->nvrank], sizeof(cudaIpcMemHandle_t), MPI_BYTE, memhndl, + sizeof(cudaIpcMemHandle_t), MPI_BYTE, comm->comm_intra); + + for (int i = 0; i < comm->nvsize; i++) + if (i != comm->nvrank) + CUDACHECK(cudaIpcOpenMemHandle((void **)&(comm->peer_ptr[hndl][i]), // NOLINT(*) + memhndl[i], cudaIpcMemLazyEnablePeerAccess)); + comm->peer_ptr[hndl][comm->nvrank] = *gpubuff; + CUDACHECK(cudaDeviceSynchronize()); + + CUDACHECK( + cudaMemcpy(reinterpret_cast(comm->gpu_ptrs) + (hndl * comm->nvsize * sizeof(void *)), + comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), 
cudaMemcpyHostToDevice)); + + CUDACHECK(cudaDeviceSynchronize()); + free(memhndl); + + comm->mem_ptr[hndl] = *gpubuff; + return comm->free_region++; +} + +int allreduce_userbuff_inplace_gpu(const int handler, const int offset, const int elements, + const int blocksize, communicator *comm, cudaStream_t stream); + +int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op); + +int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op); + +int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op); + +void allreduce_nonsharp_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream, int op) { + if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + // if(comm->myrank==0) fprintf(stderr,"AR2(%d) user call launch_mode=%d\n",op,comm->launch_mode); + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + int blocksize = elements * 2; + int maxcredit = 0; + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? 
comm->num_nodes : comm->num2_nodes; + blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) / + comm->nblocks; // FIXME TUNING + blocksize *= comm->alignblock; + if (blocksize < comm->minblock) blocksize = comm->minblock; + + maxcredit = (elements * 2 + blocksize - 1) / blocksize; + // if(maxcredit>4) maxcredit=4; + // if(maxcredit>4 && ar_nvsize==1) maxcredit=4; + size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit + if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize; + // blocksize=elements*2; + int sms = allreduce2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, + stream, op); + + if (num_nodes > 1 && comm->launch_mode & NVTE_LAUNCH_CPU) { + if (!sms) return; + comm->fifo[comm->head].optype = op; + comm->fifo[comm->head].basecounter = comm->basecounter[op]; + comm->fifo[comm->head].blocksize = blocksize; + comm->fifo[comm->head].maxcredit = maxcredit; + comm->fifo[comm->head].handler = handler; + comm->fifo[comm->head].offset = offset; + comm->fifo[comm->head].elements = elements; + + int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1); + while (newhead == comm->tail) { + } + comm->head = newhead; + + comm->basecounter[op] += (elements * 2 + blocksize - 1) / blocksize; + } +} + +void allreduce2_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream) { + allreduce_nonsharp_inplace(handler, offset, elements, comm, stream, + userbuffers_allreduceop_nonsharp2); +} + +void allreduce_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream) { + if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + allreduce_nonsharp_inplace(handler, offset, elements, comm, stream, + userbuffers_allreduceop_nonsharp); + return; +} + +void reducescatter_userbuff_inplace(const int handler, const int offset, 
const int elements, + communicator *comm, cudaStream_t stream) { + if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + + int op = userbuffers_allreduceop_nonsharp; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + int blocksize = elements * 2; + int maxcredit = 0; + + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes; + blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) / + comm->nblocks; // FIXME TUNING + blocksize *= comm->alignblock; + if (blocksize < comm->minblock) blocksize = comm->minblock; + + maxcredit = (elements * 2 + blocksize - 1) / blocksize; + size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit + if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize; + + int sms = reducescatter2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, + comm, stream, op); + + if (num_nodes > 1 && comm->launch_mode & NVTE_LAUNCH_CPU) { + if (!sms) return; + comm->fifo[comm->head].optype = op; + comm->fifo[comm->head].basecounter = comm->basecounter[op]; + comm->fifo[comm->head].blocksize = blocksize; + comm->fifo[comm->head].maxcredit = maxcredit; + comm->fifo[comm->head].handler = handler; + comm->fifo[comm->head].offset = offset; + comm->fifo[comm->head].elements = elements; + + int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1); + while (newhead == comm->tail) { + } + comm->head = newhead; + + comm->basecounter[op] += (elements * 2 + blocksize - 1) / blocksize; + } +} + +void allgather_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream) { + if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + int op = userbuffers_allreduceop_nonsharp; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? 
comm->ar_nvsize : comm->ar2_nvsize; + int blocksize = elements * 2; + int maxcredit = 0; + + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes; + blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) / + comm->nblocks; // FIXME TUNING + blocksize *= comm->alignblock; + if (blocksize < comm->minblock) blocksize = comm->minblock; + + maxcredit = (elements * 2 + blocksize - 1) / blocksize; + size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit + if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize; + + int sms = allgather2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, + stream, op); +} diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers.cu b/transformer_engine/common/comm_gemm_overlap/userbuffers.cu new file mode 100644 index 0000000000..684771801b --- /dev/null +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers.cu @@ -0,0 +1,1734 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include +#include +#if __CUDA_ARCH__ >= 800 +#include +#define half nv_bfloat16 +#else +#include +#endif +#include +#include +#include + +#define MAX_THREADS 1024 +#define TIMEOUT 200000000000ull + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rw(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, const int lineoffset, + const int numlines, void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + int *flagptr, physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + // if(blockIdx.x==0 && threadIdx.x==0) printf("%d/%d(phys %d gpustep %d firstrank %d):RRkernel(d) + // start, size %lld\n",myrank,RANKS,gpustep*myrank+firstrank,gpustep,firstrank,numlines*16ull); + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; + myptr += blockflagoffset; + + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + reduce_id++; + } + __syncthreads(); + + int warp = blockIdx.x + (threadIdx.x >> 5); + int 
dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + + __syncthreads(); + for (int line = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); line < numlines; + line += blockDim.x * gridDim.x * RANKS) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + // int dest = (i+myrank+warp)&(RANKS-1); + val[i] = userptr[dest[i]][lineoffset + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) s[j] += x[j]; + } +#pragma unroll + for (int i = 0; i < RANKS; i++) { + // int dest = (i+myrank+warp)&(RANKS-1); + userptr[dest[i]][lineoffset + line] = sum; + } + } + + __syncthreads(); + if (threadIdx.x == 0) __threadfence_system(); + __syncthreads(); + + if (threadIdx.x < RANKS) { + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&myptr[targetgpu]; + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > 2ull * TIMEOUT) { + printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } + if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; +} // fp16 inplace reduce kernel (Volta,Hopper) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, const int lineoffset, + const int numlines, void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + int *flagptr, physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = 
myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; + myptr += blockflagoffset; + + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + reduce_id++; + } + __syncthreads(); + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + + __syncthreads(); + for (int line = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); line < numlines; + line += blockDim.x * gridDim.x * RANKS) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][lineoffset + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) s[j] += x[j]; + } + + userptr[myrank][lineoffset + line] = sum; + } + __syncthreads(); + if (threadIdx.x == 0) __threadfence(); + __syncthreads(); + + if (threadIdx.x < RANKS) { + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&myptr[targetgpu]; + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > 2ull * TIMEOUT) { + printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } + + int skipmy = 0; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + int dst = (i + warp + myrank) & (RANKS - 1); + if (dst == myrank) { + skipmy++; + continue; + } + dest[i - skipmy] = dst; + } + __syncthreads(); + + for (int line = threadIdx.x + blockDim.x * RANKS * 
blockIdx.x; line < numlines; + line += blockDim.x * gridDim.x * RANKS) { + int4 val[RANKS - 1]; + +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + val[i] = userptr[dest[i]][lineoffset + line + blockDim.x * dest[i]]; + } + +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[myrank][lineoffset + line + blockDim.x * dest[i]] = val[i]; + } + } + if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; +} // fp16 inplace reduce kernel (Ampere) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_rs(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + int *flagptr, physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; + myptr += blockflagoffset; + + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } + __syncthreads(); + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + + __syncthreads(); + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * 
gridDim.x) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][mylineoffset + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) s[j] += x[j]; + } + + userptr[myrank][mylineoffset + line] = sum; + } + + if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; +} // fp16 inplace reduce-scatter kernel + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop(const int op, const int flagoffset, + const int firstrank, const int myrank, + const int gpustep, const int mylineoffset, + const int totallines, const int rowlines, + const int skiplines, void **commbuff, + const int handleridx, void *outbuf) { + __shared__ int4 *userptr[RANKS]; + int *flagptr, physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; + myptr += blockflagoffset; + + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } + __syncthreads(); + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + + 
__syncthreads(); + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][mylineoffset + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) s[j] += x[j]; + } + + (reinterpret_cast(outbuf))[(line / rowlines) * skiplines + (line % rowlines)] = sum; + } + + if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_ag(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + int *flagptr, physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; + myptr += blockflagoffset; + + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + } + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; + + int skipmy = 0; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + int dst = (i + warp + myrank) & (RANKS - 1); + if (dst == myrank) { + skipmy++; + continue; + } + dest[i - skipmy] = dst; + } + 
__syncthreads(); + + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS - 1]; + +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + val[i] = userptr[dest[i]][mylineoffset + line + totallines * dest[i]]; + } + +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[myrank][mylineoffset + line + totallines * dest[i]] = val[i]; + } + } + if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; +} // fp16 inplace reduce kernel (Ampere) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rw_ag(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + int *flagptr, physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int4 *localptr; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; + myptr += blockflagoffset; + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + reduce_id++; + } + __syncthreads(); + localptr = userptr[myrank]; + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS - 1]; + int skipmy = 0; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + int dst = (i + warp + myrank) & (RANKS - 1); + if (dst == myrank) { + skipmy++; + continue; + } + dest[i - skipmy] = dst; + } +#define UNROLLAG 4 + __syncthreads(); + const int loop_step0 = blockDim.x * gridDim.x; + const int loop_step = loop_step0 * UNROLLAG; + const int start_elem = threadIdx.x + blockDim.x * 
blockIdx.x; + const int end_elem = max(start_elem, totallines); + const int aligned_elem = ((end_elem - start_elem) / loop_step) * loop_step; + const int end_aligned = start_elem + aligned_elem; + + for (int line = start_elem; line < end_aligned; line += loop_step) { + int4 val[UNROLLAG]; +#pragma unroll + for (int j = 0; j < UNROLLAG; j++) val[j] = localptr[mylineoffset + line + loop_step0 * j]; + +#pragma unroll + for (int j = 0; j < UNROLLAG; j++) +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[dest[i]][mylineoffset + line + j * loop_step0] = val[j]; + } + } + + for (int line = end_aligned; line < end_elem; line += loop_step0) { + int4 sum = localptr[mylineoffset + line]; +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[dest[i]][mylineoffset + line] = sum; + } + } + + __syncthreads(); + if (threadIdx.x == 0) __threadfence_system(); + __syncthreads(); + + if (threadIdx.x < RANKS) { + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&myptr[targetgpu]; + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > 2ull * TIMEOUT) { + printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } + if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; +} // fp16 inplace allgather kernel (Volta,Hopper) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_blocked(const int op, const int flagoffset, + const int firstrank, const int myrank, + const int lineoffset, const int numlines, + void **commbuff, const int handleridx, + const int peerblocklines, int *hostflags, + int *gpuflag, const int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op]; + +#define REDUCETHREADS (blockDim.x - 32) + + if (threadIdx.x < 32) { + int *flagptr; + if (threadIdx.x < RANKS) { + if (!blockIdx.x) { + flagptr = reinterpret_cast(commbuff[threadIdx.x + firstrank]); + flagptr[flagoffset + myrank 
+ firstrank] = basecounter; + } + volatile int *flag = (volatile int *)&((reinterpret_cast( + commbuff[myrank + firstrank]))[flagoffset + threadIdx.x + firstrank]); + while (*flag < basecounter) { + } + } + __syncthreads(); + + int startblock = 0, endblock = numblocks; + + for (int nblock = 0; nblock < endblock; nblock++) { + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + + if (threadIdx.x == 0) { + __threadfence(); + if (blockIdx.x) gpuflag[op * NVTE_MAX_SMS * 2 + blockIdx.x] = nblock + basecounter + 1; + } else if (blockIdx.x == 0) { + int expecting = (basecounter + nblock + 1); + if (threadIdx.x < gridDim.x) + while (((volatile int *)gpuflag)[op * NVTE_MAX_SMS * 2 + threadIdx.x] < expecting) { + } + } + if (!blockIdx.x) { + asm volatile("bar.sync 15, %0;" ::"r"(32)); + if (!threadIdx.x) hostflags[0] = nblock + basecounter + 1; + } + } + + int cachedflag = basecounter; + +#define ALLGATHERFLAG NVTE_GF_IBSHARPDONE + + if (blockIdx.x == 0 && threadIdx.x < RANKS) { + while (cachedflag < basecounter + numblocks) { + int newflag = ((volatile int *)gpuflag)[ALLGATHERFLAG]; + if (newflag == cachedflag) continue; + cachedflag = newflag; + flagptr[flagoffset + myrank + 32 + firstrank] = cachedflag; + } + } + + if (blockIdx.x == 0 && threadIdx.x == 0) gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; + } else { + const int warp = blockIdx.x + (threadIdx.x >> 5); + int4 *userptr[RANKS]; + int4 *userptrmyrank; +#pragma unroll + for (int i = 0; i < RANKS; i++) + userptr[i] = reinterpret_cast( + commbuff[((i + myrank + warp) & (RANKS - 1)) + handleridx + firstrank]); + userptrmyrank = reinterpret_cast(commbuff[myrank + handleridx + firstrank]); + __syncthreads(); + + int blocklineoffset = 0; + + while (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset + blocklines * myrank; + + for (int line = 
threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines; + line += REDUCETHREADS * gridDim.x) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[i][blockstart + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < sizeof(int4) / sizeof(half); j++) s[j] += x[j]; + } + + userptrmyrank[blockstart + line] = sum; + } // single block loop + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + + blocklineoffset += peerblocklines * RANKS; + } // block loop NVLINK-REDUCESCATTER + const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1); + const int myblockDim = nwarps << 5; + const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1); + const int maxthreadIdx = myblockDim * (RANKS - 1) + 32; + const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1); + const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31); + volatile int *flag = (volatile int *)&((reinterpret_cast( + commbuff[myrank + firstrank]))[flagoffset + mydest + 32 + firstrank]); + + int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)]; + + blocklineoffset = 0; + int gathercounter = basecounter + 1; + while (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset; + +#define UNROLL 6 + int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest]; + int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest]; + + if (threadIdx.x < maxthreadIdx) { + const int start_elem = mythreadIdx + myblockDim * blockIdx.x; + const int end_elem = max(start_elem, blocklines); + const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) * + (myblockDim * gridDim.x * UNROLL); + 
const int end_aligned = start_elem + aligned_elem; + + if (mythreadIdx == 0) { + while (*flag < gathercounter) { + } + gathercounter++; + } + + asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim)); + + for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { + int4 val[UNROLL]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) val[i] = peerptr[line + i * myblockDim * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) myptr[line + i * myblockDim * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) + myptr[line] = peerptr[line]; + } + blocklineoffset += peerblocklines * RANKS; + } // block loop for NVLINK-ALLGATHER + } // worker warps else block +} // fp16 inplace reduce kernel with SHARP / in blocks + +// threadfence and SMs sync to SM0 +#define SMBAR(offset, block) \ + asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); \ + if (threadIdx.x == 0) { \ + __threadfence_system(); \ + if (blockIdx.x) gpuflag[offset + blockIdx.x] = block + basecounter + 1; \ + } else if (blockIdx.x == 0) { \ + int expecting = (basecounter + block + 1); \ + if (threadIdx.x < gridDim.x) \ + while (((volatile int *)gpuflag)[offset + threadIdx.x] < expecting) { \ + } \ + } \ + if (blockIdx.x == 0) asm volatile("bar.sync 15, %0;" ::"r"(32)); + +template +__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2( + const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks, + const int commbufoffset, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int lineoffset, const int numlines, void **commbuff, + const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag, + const int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op]; + if (threadIdx.x < 32) { + int *flagptr; + volatile int *localflag = (volatile int *)&( + ((int *)commbuff[gpustep * 
myrank + firstrank])[flagoffset]); // NOLINT(*) + // initial intranode barrier - once + if (threadIdx.x < RANKS) { + if (!blockIdx.x) { + flagptr = reinterpret_cast(commbuff[gpustep * threadIdx.x + firstrank]); + flagptr[flagoffset + gpustep * myrank + firstrank] = basecounter; + } + volatile int *flag = &localflag[gpustep * threadIdx.x + firstrank]; + while (*flag < basecounter) { + } + } + __syncthreads(); + + for (int nblock = 0; nblock < numblocks + headstart; nblock++) { + if (nblock < numblocks) { + // RS happens here + SMBAR(op * 2 * NVTE_MAX_SMS, nblock); + if (!blockIdx.x && !threadIdx.x) + hostflags[NVTE_HF_NVRSDONE + (op & 1)] = nblock + basecounter + 1; + } + + if (nblock >= headstart) { + for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) + if (ibflag != myibrank) + while (localflag[NVTE_REG0_IBRS + ibflag] < basecounter + nblock - headstart + 1) { + } + asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); + // REDUCE happens here + SMBAR(op * 2 * NVTE_MAX_SMS + NVTE_MAX_SMS, nblock - headstart); + if (!blockIdx.x && !threadIdx.x) + hostflags[NVTE_HF_NVREDUCEDONE + (op & 1)] = nblock + basecounter + 1 - headstart; + } + } + // final part doing NVAG based on responses from NIC-RMW:IBAG + + if (blockIdx.x == 0) { + for (int nblock = 0; nblock < numblocks; nblock++) { + const int expected = basecounter + nblock + 1; + for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) + if (ibflag != myibrank) + while (localflag[NVTE_REG0_IBAG + ibflag] < expected) { + } + asm volatile("bar.sync 15, %0;" ::"r"(32)); + if (threadIdx.x < RANKS) + flagptr[flagoffset + gpustep * myrank + NVTE_MAX_NVLINK + firstrank] = expected; + } + } + + if (blockIdx.x == 0 && threadIdx.x == 0) gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; + } else { // sync warp + // reducethreads + const int warp = blockIdx.x + (threadIdx.x >> 5); + int4 *userptr[RANKS]; + int4 *userptrmyrank; +#pragma unroll + for (int i = 0; i < RANKS; i++) + userptr[i] = 
reinterpret_cast( + commbuff[((i + myrank + warp) & (RANKS - 1)) * gpustep + handleridx + firstrank]); + userptrmyrank = reinterpret_cast(commbuff[gpustep * myrank + handleridx + firstrank]); + int4 *internalbuf = reinterpret_cast(commbuff[myrank * gpustep + firstrank] + + commbufoffset * sizeof(int)); + __syncthreads(); + + int blocklineoffset = 0, rblocklineoffset = 0; + + for (int nblock = 0; nblock < numblocks + headstart; nblock++) { + // NVRS part(only first numblocks steps) + if (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset + blocklines * myrank; + if (RANKS > 1) { + for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines; + line += REDUCETHREADS * gridDim.x) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[i][blockstart + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < sizeof(int4) / sizeof(half); j++) s[j] += x[j]; + } + + userptrmyrank[blockstart + line] = sum; + } // single block loop + } + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + blocklineoffset += peerblocklines * RANKS; + } + if (nblock >= headstart) { +#define UNROLLRS 2 + const int remainder = min(numlines - rblocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + rblocklineoffset += peerblocklines * RANKS; + const int ibblocklines = blocklines / ibranks; + int4 *tempbufptr = &internalbuf[((nblock - headstart) % maxcredit) * peerblocklines]; + const int tempstart = lineoffset + (nblock - headstart) * peerblocklines * RANKS + + myrank * blocklines + ibblocklines * myibrank; + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + + for (int line = 
threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < ibblocklines; + line += REDUCETHREADS * gridDim.x) { + int4 val[UNROLLRS]; + +#pragma unroll + for (int i = 0; i < UNROLLRS; i++) + val[i] = i == myibrank ? userptrmyrank[tempstart + line] + : tempbufptr[i * ibblocklines + line]; + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + + for (int i = 0; i < ibranks - UNROLLRS; i++) { + val[i % UNROLLRS] = i == myibrank ? userptrmyrank[tempstart + line] + : tempbufptr[i * ibblocklines + line]; + half *x = reinterpret_cast(&val[(i + 1) % UNROLLRS]); +#pragma unroll + for (int j = 0; j < 16; j++) s[j] += x[j]; + } +#pragma unroll + for (int i = 1; i < UNROLLRS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 16; j++) s[j] += x[j]; + } + userptrmyrank[tempstart + line] = sum; + } + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + } + } // nblock loop NVLINK-REDUCESCATTER + IBREDUCE LOCAL COMPUTE + + if (RANKS != 1) { + const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1); + const int myblockDim = nwarps << 5; + const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1); + const int maxthreadIdx = myblockDim * (RANKS - 1) + 32; + const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1); + const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31); + volatile int *flag = (volatile int *)&((reinterpret_cast( + commbuff[gpustep * myrank + firstrank]))[flagoffset + gpustep * mydest + NVTE_MAX_NVLINK + + firstrank]); + + int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)]; + + blocklineoffset = 0; + int gathercounter = basecounter + 1; + while (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset; + +#define UNROLL 6 + int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest]; + int4 *peerptr = 
&userptrmydest[blockstart + blocklines * mydest]; + + if (threadIdx.x < maxthreadIdx) { + const int start_elem = mythreadIdx + myblockDim * blockIdx.x; + const int end_elem = max(start_elem, blocklines); + const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) * + (myblockDim * gridDim.x * UNROLL); + const int end_aligned = start_elem + aligned_elem; + + if (mythreadIdx == 0) { + while (*flag < gathercounter) { + } + gathercounter++; + } + + asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim)); + + for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { + int4 val[UNROLL]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) val[i] = peerptr[line + i * myblockDim * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) myptr[line + i * myblockDim * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) + myptr[line] = peerptr[line]; + } + blocklineoffset += peerblocklines * RANKS; + } // block loop for NVLINK-ALLGATHER + } // RANKS!=1 + } // worker warps else block +} // fp16 inplace reduce kernel with SHARP / in blocks + +template +__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2_rs( + const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks, + const int commbufoffset, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int lineoffset, const int numlines, void **commbuff, + const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag, + const int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op]; + if (threadIdx.x < 32) { + int *flagptr; + volatile int *localflag = (volatile int *)&( + ((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); // NOLINT(*) + // initial intranode barrier - once + if (threadIdx.x < RANKS) { + if (!blockIdx.x) { + flagptr = 
reinterpret_cast(commbuff[gpustep * threadIdx.x + firstrank]); + flagptr[flagoffset + gpustep * myrank + firstrank] = basecounter; + } + volatile int *flag = &localflag[gpustep * threadIdx.x + firstrank]; + while (*flag < basecounter) { + } + } + __syncthreads(); + + for (int nblock = 0; nblock < numblocks + headstart; nblock++) { + if (nblock < numblocks) { + // RS happens here + SMBAR(op * 2 * NVTE_MAX_SMS, nblock); + if (!blockIdx.x && !threadIdx.x) + hostflags[NVTE_HF_NVRSDONE + (op & 1)] = nblock + basecounter + 1; + } + + if (nblock >= headstart) { + for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) + if (ibflag != myibrank) + while (localflag[NVTE_REG0_IBRS + ibflag] < basecounter + nblock - headstart + 1) { + } + asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); + // REDUCE happens here + SMBAR(op * 2 * NVTE_MAX_SMS + NVTE_MAX_SMS, nblock - headstart); + } + } + } else { // sync warp + // reducethreads + const int warp = blockIdx.x + (threadIdx.x >> 5); + int4 *userptr[RANKS]; + int4 *userptrmyrank; +#pragma unroll + for (int i = 0; i < RANKS; i++) + userptr[i] = reinterpret_cast( + commbuff[((i + myrank + warp) & (RANKS - 1)) * gpustep + handleridx + firstrank]); + userptrmyrank = reinterpret_cast(commbuff[gpustep * myrank + handleridx + firstrank]); + int4 *internalbuf = reinterpret_cast(commbuff[myrank * gpustep + firstrank] + + commbufoffset * sizeof(int)); + __syncthreads(); + + int blocklineoffset = 0, rblocklineoffset = 0; + + for (int nblock = 0; nblock < numblocks + headstart; nblock++) { + // NVRS part(only first numblocks steps) + if (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset + blocklines * myrank; + if (RANKS > 1) { + for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines; + line += REDUCETHREADS * gridDim.x) { + int4 val[RANKS]; + +#pragma 
unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[i][blockstart + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < sizeof(int4) / sizeof(half); j++) s[j] += x[j]; + } + + userptrmyrank[blockstart + line] = sum; + } // single block loop + } + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + blocklineoffset += peerblocklines * RANKS; + } + if (nblock >= headstart) { +#define UNROLLRS 2 + const int remainder = min(numlines - rblocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + rblocklineoffset += peerblocklines * RANKS; + const int ibblocklines = blocklines / ibranks; + int4 *tempbufptr = &internalbuf[((nblock - headstart) % maxcredit) * peerblocklines]; + const int tempstart = lineoffset + (nblock - headstart) * peerblocklines * RANKS + + myrank * blocklines + ibblocklines * myibrank; + // if(threadIdx.x==32) printf("[%d] block%d thread %d offset %d line %d ibblocklines %d ptr + // %lx commbufoffset + // %d\n",myrank,blockIdx.x,threadIdx.x,tempstart,0,ibblocklines,(void*)&tempbufptr[(1-myibrank)*ibblocklines],(1-myibrank)*ibblocklines*16); + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + + for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < ibblocklines; + line += REDUCETHREADS * gridDim.x) { + int4 val[UNROLLRS]; + +#pragma unroll + for (int i = 0; i < UNROLLRS; i++) + val[i] = i == myibrank ? userptrmyrank[tempstart + line] + : tempbufptr[i * ibblocklines + line]; + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + + for (int i = 0; i < ibranks - UNROLLRS; i++) { + val[i % UNROLLRS] = i == myibrank ? 
userptrmyrank[tempstart + line] + : tempbufptr[i * ibblocklines + line]; + half *x = reinterpret_cast(&val[(i + 1) % UNROLLRS]); +#pragma unroll + for (int j = 0; j < 16; j++) s[j] += x[j]; + } +#pragma unroll + for (int i = 1; i < UNROLLRS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 16; j++) s[j] += x[j]; + } + userptrmyrank[tempstart + line] = sum; + } + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + } + } // nblock loop NVLINK-REDUCESCATTER + IBREDUCE LOCAL COMPUTE + } // worker warps else block +} // fp16 inplace reduce kernel with SHARP / in blocks + +template +__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2_ag( + const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks, + const int commbufoffset, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int lineoffset, const int numlines, void **commbuff, + const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag, + const int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op]; + if (threadIdx.x < 32) { + int *flagptr; + volatile int *localflag = (volatile int *)&( + ((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); // NOLINT(*) + if (threadIdx.x < RANKS) { + if (!blockIdx.x) { + flagptr = reinterpret_cast(commbuff[gpustep * threadIdx.x + firstrank]); + } + } + __syncthreads(); + if (!blockIdx.x && !threadIdx.x) + hostflags[NVTE_HF_NVREDUCEDONE + (op & 1)] = numblocks + basecounter; + // tell CPU proxy all blocks are done and ready for NVAG + + // final part doing NVAG based on responses from NIC-RMW:IBAG + + if (blockIdx.x == 0) { + for (int nblock = 0; nblock < numblocks; nblock++) { + const int expected = basecounter + nblock + 1; + for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) + if (ibflag != myibrank) + while (localflag[NVTE_REG0_IBAG + ibflag] < expected) { + } + asm 
volatile("bar.sync 15, %0;" ::"r"(32)); + if (threadIdx.x < RANKS) + flagptr[flagoffset + gpustep * myrank + NVTE_MAX_NVLINK + firstrank] = expected; + } + } + + if (blockIdx.x == 0 && threadIdx.x == 0) gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; + } else { // sync warp + // reducethreads + const int warp = blockIdx.x + (threadIdx.x >> 5); + int4 *userptr[RANKS]; + int4 *userptrmyrank; +#pragma unroll + for (int i = 0; i < RANKS; i++) + userptr[i] = reinterpret_cast( + commbuff[((i + myrank + warp) & (RANKS - 1)) * gpustep + handleridx + firstrank]); + userptrmyrank = reinterpret_cast(commbuff[gpustep * myrank + handleridx + firstrank]); + __syncthreads(); + + int blocklineoffset = 0, rblocklineoffset = 0; + + if (RANKS != 1) { + const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1); + const int myblockDim = nwarps << 5; + const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1); + const int maxthreadIdx = myblockDim * (RANKS - 1) + 32; + const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1); + const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31); + volatile int *flag = (volatile int *)&((reinterpret_cast( + commbuff[gpustep * myrank + firstrank]))[flagoffset + gpustep * mydest + NVTE_MAX_NVLINK + + firstrank]); + + int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)]; + + blocklineoffset = 0; + int gathercounter = basecounter + 1; + while (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset; + +#define UNROLL 6 + int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest]; + int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest]; + + if (threadIdx.x < maxthreadIdx) { + const int start_elem = mythreadIdx + myblockDim * blockIdx.x; + const int end_elem = max(start_elem, blocklines); + const int aligned_elem = 
((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) * + (myblockDim * gridDim.x * UNROLL); + const int end_aligned = start_elem + aligned_elem; + + if (mythreadIdx == 0) { + while (*flag < gathercounter) { + } + gathercounter++; + } + + asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim)); + + for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { + int4 val[UNROLL]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) val[i] = peerptr[line + i * myblockDim * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) myptr[line + i * myblockDim * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) + myptr[line] = peerptr[line]; + } + blocklineoffset += peerblocklines * RANKS; + } // block loop for NVLINK-ALLGATHER + } // RANKS!=1 + } // worker warps else block +} // fp16 inplace reduce kernel with SHARP / in blocks + +__global__ void userbuffers_fp16_sum_inplace_gpu_null(const int op, int *hostflags, int *gpuflag, + int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op] + numblocks; + hostflags[0] = basecounter; + gpuflag[NVTE_GF_STATE + op] = basecounter; + while (((volatile int *)gpuflag)[NVTE_GF_IBSHARPDONE] < basecounter) { + } +} + +#define callranks_block(x) \ + if (comm->ar_nvsize == x) \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked<<>>( \ + userbuffers_allreduceop_sharp, NVTE_REG0_OFFSET(comm), comm->ar_firstgpu, comm->ar_nvrank, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / comm->ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, \ + (elements * 2 + blocksize - 1) / blocksize); + +#define callranks2_block(x) \ + if (ar_nvsize == x) { \ + int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ + int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ + if (headstart > maxcredit) headstart = maxcredit; \ + if (x == 1) headstart = 
maxcredit; \ + if (headstart > numblocks) headstart = numblocks; \ + if (headstart == 0) headstart = 1; \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked2<<>>( \ + op, maxcredit, headstart, my_node, num_nodes, \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ + (op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ + } + +#define callranks2_block_rs(x) \ + if (ar_nvsize == x) { \ + int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ + int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ + if (headstart > maxcredit) headstart = maxcredit; \ + if (x == 1) headstart = maxcredit; \ + if (headstart > numblocks) headstart = numblocks; \ + if (headstart == 0) headstart = 1; \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked2_rs<<>>( \ + op, maxcredit, headstart, my_node, num_nodes, \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ + (op == userbuffers_allreduceop_nonsharp ? 
NVTE_REG0_COMMBUFFER : 0), \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ + } + +#define callranks2_block_ag(x) \ + if (ar_nvsize == x) { \ + int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ + int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ + if (headstart > maxcredit) headstart = maxcredit; \ + if (x == 1) headstart = maxcredit; \ + if (headstart > numblocks) headstart = numblocks; \ + if (headstart == 0) headstart = 1; \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked2_ag<<>>( \ + op, maxcredit, headstart, my_node, num_nodes, \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ + (op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ + } + +#define callranks(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg6 = offset / 8, \ + arg7 = elements / 8; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast(comm->use_rr_kernel ? 
userbuffers_fp16_sum_inplace_gpu_rr \ + : userbuffers_fp16_sum_inplace_gpu_rw), \ + kernelArgs)); \ + } + +#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \ + cudaLaunchConfig_t cfg = {sms, threads, 0, stream, NULL, 0}; \ + cudaLaunchAttribute attribute_ub[2]; \ + attribute_ub[1].id = cudaLaunchAttributeClusterDimension; \ + attribute_ub[1].val.clusterDim.x = sms % comm->cga_size == 0 ? comm->cga_size : 1; \ + attribute_ub[1].val.clusterDim.y = 1; \ + attribute_ub[1].val.clusterDim.z = 1; \ + attribute_ub[0].id = cudaLaunchAttributeCooperative; \ + cfg.attrs = attribute_ub; \ + cfg.numAttrs = comm->sm_arch >= 9 ? 2 : 1; + +int allreduce_userbuff_inplace_gpu(const int handler, const int offset, const int elements, + const int blocksize, communicator *comm, cudaStream_t stream) { + // schedule GPU kernel only + // CPU/SHARP part is responsibility of caller + const int ar_step = comm->ar2_nvsize; + const int op = userbuffers_allreduceop_nonsharp; + const int ar_nvsize = comm->nvsize; + const int ar_firstgpu = comm->ar_firstgpu; + const int ar_nvrank = comm->ar_nvrank; + if (elements < 8) return 0; + int sms = sms = comm->sms; + int warps = comm->threads / 32; + if (warps < comm->ar_nvsize) warps = comm->ar_nvsize; + + if (comm->launch_mode & NVTE_LAUNCH_GPU) { + if (comm->ar_nvsize == 1) + userbuffers_fp16_sum_inplace_gpu_null<<<1, 1, 0, stream>>>( + userbuffers_allreduceop_sharp, reinterpret_cast(comm->hostflags), comm->flags, + (elements * 2 + blocksize - 1) / blocksize); + callranks_block(2) callranks_block(4) callranks_block(8) + } + return sms; +} + +int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op) { + // schedule GPU kernel only + // CPU/SHARP part is responsibility of caller + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? 
comm->num_nodes : comm->num2_nodes; + const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 8) return 0; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) warps = ar_nvsize; + if (num_nodes > 1) { + callranks2_block(1) callranks2_block(2) callranks2_block(4) callranks2_block(8) + } else { + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks(2) callranks(4) callranks(8) + } + return sms; +} + +#define callranks_ag(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + (comm->use_rr_kernel ? 0 : arg4 * arg7); \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast(comm->use_rr_kernel ? 
userbuffers_fp16_sum_inplace_gpu_rr_ag \ + : userbuffers_fp16_sum_inplace_gpu_rw_ag), \ + kernelArgs)); \ + } + +#define callranks_rs(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs), kernelArgs)); \ + } + +#define callranks_rs_oop(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7, arg8 = rowelements / 8, arg9 = strideelements / 8; \ + void **arg10 = reinterpret_cast(comm->gpu_ptrs); \ + int arg11 = handler * comm->nvsize; \ + void *arg12 = output; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop), \ + kernelArgs)); \ + } + +int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op) { + // schedule GPU kernel only + // CPU/SHARP part is responsibility of caller + + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes; + const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 8) return 0; + int sms = ar_nvsize == 1 ? 
2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) warps = ar_nvsize; + + if (num_nodes > 1) { + callranks2_block_rs(1) callranks2_block_rs(2) callranks2_block_rs(4) callranks2_block_rs(8) + } else { + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs(2) callranks_rs(4) callranks_rs(8) + } + return sms; +} + +int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op) { + // schedule GPU kernel only + // CPU/SHARP part is responsibility of caller + + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes; + const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 8) return 0; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) warps = ar_nvsize; + + if (num_nodes > 1) { + callranks2_block_ag(1) callranks2_block_ag(2) callranks2_block_ag(4) callranks2_block_ag(8) + } else { + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_ag(2) callranks_ag(4) callranks_ag(8) + } + return sms; +} + +void allgather2_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream) { + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements * 2; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? 
comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 64) return; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_ag(2) callranks_ag(4) callranks_ag(8) +} + +void allgather2_userbuff_inplace_sliced(const int handler, const int offset, const int elements, + communicator *comm, const int slice_id, const int nslices, + cudaStream_t stream) { + const int op = userbuffers_allreduceop_nonsharp2; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + int peerelements = elements / ar_nvsize; + int saverrkernel = comm->use_rr_kernel; + comm->use_rr_kernel = 0; + allgather2_userbuff_inplace( + handler, offset + ar_nvrank * peerelements * (nslices - 1) + slice_id * peerelements, + elements, comm, stream); + comm->use_rr_kernel = saverrkernel; +} + +void reducescatter2_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream) { + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements * 2; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? 
comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 64) return; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs(2) callranks_rs(4) callranks_rs(8) +} +void reducescatter2_userbuff_stridedoutput(void *output, const int handler, const int offset, + const int rowelements, const int colelements, + const int strideelements, communicator *comm, + cudaStream_t stream) { + const int elements = rowelements * colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements * 2; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 64) return; + int sms = ar_nvsize == 1 ? 
2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs_oop(2) callranks_rs_oop(4) callranks_rs_oop(8) +} +void reducescatter2_userbuff(void *output, const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream) { + reducescatter2_userbuff_stridedoutput(output, handler, offset, elements, 1, 0, comm, stream); +} + +__global__ void kuserbuffers_pullsend(int myrank, int peer, int *send_id, int *flagptr) { + atomicAdd(flagptr, 1); +} + +__global__ void kuserbuffers_inc(int *id) { + const int signal_id = (*id) + 1; + *id = signal_id; +} + +__global__ void kuserbuffers_proxysend(int *id, int *hostflag) { + const int signal_id = (*id) + 1; + *hostflag = signal_id; + *id = signal_id; +} + +__global__ void kuserbuffers_dummy(void) {} + +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_pullrecv(int myrank, int peer, int *recv_id, int *flagptr, int4 *srcptr, + int4 *dstptr, const int lines) { +#define UNROLLCOPY 8 + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = lines; + const int aligned_elem = (end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1)); + const int end_aligned = start_elem + aligned_elem; + + if (threadIdx.x == 0) { + const int signal_id = (*recv_id) + 1; + volatile int *flag = (volatile int *)flagptr; + clock_t s = clock64(); + while (*flag < signal_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d from %d] pullrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, + *flag); + break; + } + } + if (lines == 0) { + *recv_id = signal_id; + return; + } // otherwise need an extra kernel + } + __syncthreads(); + + if (end_elem <= start_elem) return; + + for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) val[i] = srcptr[line + i * 
blockDim.x * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) + dstptr[line] = srcptr[line]; +} + +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_pushsend(int *send_id, int *flagptr, int4 *srcptr, int4 *dstptr, const int lines) { + if (lines) { + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = lines; + const int aligned_elem = + ((end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1))); + const int end_aligned = start_elem + aligned_elem; + if (end_elem > start_elem) { + for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) val[i] = srcptr[line + i * blockDim.x * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) + dstptr[line] = srcptr[line]; + } + __syncthreads(); + if (threadIdx.x) return; + __threadfence_system(); + atomicAdd(flagptr, 1); // otherwise need local SM sync before sending flag + } else { // 0 bytes and 1 SM only + atomicAdd(flagptr, 1); + } +} + +__global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *flagptr, int adder) { + const int signal_id = (*recv_id) + adder; + *recv_id = signal_id; + volatile int *flag = (volatile int *)flagptr; + if (*flag >= signal_id) return; + clock_t s = clock64(); + while (*flag < signal_id) { + if (clock64() - s > TIMEOUT) { + printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, *flag); + return; + } + } +} + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, 
cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define INTRANODE(peer) ((peer / comm->nvsize) == (comm->myrank / comm->nvsize)) + +void userbuffers_send(const int srchandler, const size_t srcoffset, const int dsthandler, + const size_t dstoffset, const size_t bytes, communicator *comm, + const int peer, cudaStream_t stream) { + int peerlocal = peer % comm->nvsize; + void *flagptr = + (comm->peer_ptr[0][peerlocal]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0); + bool intranode = INTRANODE(peer); + if (!intranode && (comm->launch_mode & NVTE_LAUNCH_CPU)) { + comm->fifo[comm->head].optype = userbuffers_sendop; + comm->fifo[comm->head].basecounter = comm->basecounter[userbuffers_sendop]; + comm->fifo[comm->head].handler = srchandler; + comm->fifo[comm->head].offset = srcoffset; + comm->fifo[comm->head].handler2 = dsthandler; + comm->fifo[comm->head].offset2 = dstoffset; + comm->fifo[comm->head].elements = bytes; + comm->fifo[comm->head].peer = peer; + + int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1); + while (newhead == comm->tail) { + } + comm->head = newhead; + comm->basecounter[userbuffers_sendop] += 1; + } + if (!intranode && (comm->launch_mode & NVTE_LAUNCH_GPU)) { + kuserbuffers_proxysend<<<1, 1, 0, stream>>>(&(comm->flags[NVTE_GF_STATE + userbuffers_sendop]), + comm->hostflags + userbuffers_sendop); + return; + } + if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) return; + if (comm->push == 0) { + kuserbuffers_pullsend<<<1, 1, 0, stream>>>(comm->myrank, peer, &(comm->send_id[peer]), + reinterpret_cast(flagptr)); + } else { + void *srcptr = (comm->mem_ptr[srchandler]) + srcoffset; + void *dstptr = (comm->peer_ptr[dsthandler][peerlocal]) + dstoffset; + + if (comm->use_ce) + CUDACHECK(cudaMemcpyAsync(dstptr, srcptr, bytes, cudaMemcpyDeviceToDevice, stream)); + SETUP_LAUNCH_CONFIG(signalonly ? 
1 : comm->sms, signalonly ? 1 : 1024, stream); + int *arg1 = &comm->send_id[peer], *arg2 = reinterpret_cast(flagptr); + int4 *arg3 = reinterpret_cast(srcptr), *arg4 = reinterpret_cast(dstptr); + int arg5 = signalonly ? 0 : bytes / 16; + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), + reinterpret_cast(&arg3), reinterpret_cast(&arg4), + reinterpret_cast(&arg5)}; + CUDACHECK( + cudaLaunchKernelExC(&cfg, reinterpret_cast(kuserbuffers_pushsend), kernelArgs)); + } +} + +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_alltoall(void **baseflagptrs, int flagoffset, int4 *basesrcptr, void **dstptrs, + size_t dstoffset, const int lines, const int myrank) { + if (blockIdx.x == myrank) return; + int4 *dstptr = reinterpret_cast(dstptrs[blockIdx.x] + dstoffset); + int *flagptr = reinterpret_cast(baseflagptrs[blockIdx.x] + flagoffset); + const size_t myblockoffset = blockIdx.x * lines; + int4 *srcptr = basesrcptr + myblockoffset; + dstptr += myblockoffset; + + if (lines) { + const int start_elem = threadIdx.x; + const int end_elem = lines; + const int aligned_elem = ((end_elem - start_elem) & (~(blockDim.x * UNROLLCOPY - 1))); + const int end_aligned = start_elem + aligned_elem; + if (end_elem > start_elem) { + for (int line = start_elem; line < end_aligned; line += blockDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) val[i] = srcptr[line + i * blockDim.x]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) dstptr[line + i * blockDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += blockDim.x) dstptr[line] = srcptr[line]; + } + __syncthreads(); + if (threadIdx.x) return; + __threadfence_system(); + atomicAdd(flagptr, 1); + + } else { + atomicAdd(flagptr, 1); + } +} + +void userbuffers_alltoall_send(const int srchandler, const size_t srcoffset, const int dsthandler, + const size_t dstoffset, const size_t bytes, communicator *comm, + cudaStream_t stream) { + if 
(comm->launch_mode & NVTE_LAUNCH_CPU) { + comm->fifo[comm->head].optype = userbuffers_alltoall; + comm->fifo[comm->head].basecounter = comm->basecounter[userbuffers_alltoall]; + comm->fifo[comm->head].handler = srchandler; + comm->fifo[comm->head].offset = srcoffset; + comm->fifo[comm->head].handler2 = dsthandler; + comm->fifo[comm->head].offset2 = dstoffset; + comm->fifo[comm->head].elements = bytes; + + int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1); + while (newhead == comm->tail) { + } + comm->head = newhead; + comm->basecounter[userbuffers_alltoall] += 1; + } + if (comm->launch_mode & NVTE_LAUNCH_GPU) + kuserbuffers_proxysend<<<1, 1, 0, stream>>>( + &(comm->flags[NVTE_GF_STATE + userbuffers_alltoall]), + comm->hostflags + userbuffers_alltoall); +} + +void userbuffers_recv(const int srchandler, const size_t srcoffset, const int dsthandler, + const size_t dstoffset, const size_t bytes, communicator *comm, + const int peer, cudaStream_t stream) { + int peerlocal = peer % comm->nvsize; + void *flagptr = + (comm->mem_ptr[0]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + peer * NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0); + bool intranode = INTRANODE(peer); + if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) return; + if (comm->push == 0 && intranode) { + void *dstptr = (comm->mem_ptr[dsthandler]) + dstoffset; + void *srcptr = (comm->peer_ptr[srchandler][peerlocal]) + srcoffset; + + kuserbuffers_pullrecv<<sms, signalonly ? 1 : 1024, 0, stream>>>( + comm->myrank, peer, &(comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler]), + reinterpret_cast(flagptr), reinterpret_cast(srcptr), + reinterpret_cast(dstptr), signalonly ? 
0 : bytes / 16); + if (!signalonly) + kuserbuffers_inc<<<1, 1, 0, stream>>>(&(comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler])); + if (comm->use_ce) { + CUDACHECK(cudaMemcpyAsync(dstptr, srcptr, bytes, cudaMemcpyDeviceToDevice, stream)); + } + } else { + kuserbuffers_pushrecv<<<1, 1, 0, stream>>>( + comm->myrank, peer, &comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler], + reinterpret_cast(flagptr), signalonly || !intranode ? 1 : comm->sms); + } +} + +void userbuffers_alltoall_recv(communicator *comm, cudaStream_t stream) { + void *flagptr = + (comm->mem_ptr[0]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * userbuffers_alltoall) * sizeof(int)); + + if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) return; + kuserbuffers_pushrecv<<<1, 1, 0, stream>>>(comm->myrank, -1, reinterpret_cast(flagptr + 4), + reinterpret_cast(flagptr), comm->nranks - 1); +} diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 1e28cec70e..a216799a5c 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -49,6 +49,7 @@ void cublas_gemm(const Tensor *inputA, size_t workspaceSize, bool accumulate, bool use_split_accumulator, + int math_sm_count, cudaStream_t stream ) { void *A = inputA->data.dptr; @@ -124,6 +125,13 @@ void cublas_gemm(const Tensor *inputA, &transa, sizeof(transa))); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb))); + // Set math SM count + if (math_sm_count != 0) { + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + operationDesc, CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET, + &math_sm_count, sizeof(math_sm_count))); + } + // set fp8 attributes -- input and output types should already be set to fp8 as appropriate // Note: gelu fusion isn't available right now, and we don't need @@ -227,6 +235,7 @@ void cublas_gemm(const Tensor *inputA, if (returnedResults == 0) throw 
std::runtime_error("Unable to find any suitable algorithms"); // D = alpha * (A * B) + beta * C + NVTE_CHECK_CUBLAS(cublasLtMatmul(handle, operationDesc, static_cast(&one), /* alpha */ @@ -266,6 +275,7 @@ void nvte_cublas_gemm(const NVTETensor A, NVTETensor workspace, bool accumulate, bool use_split_accumulator, + int math_sm_count, cudaStream_t stream) { NVTE_API_CALL(nvte_cublas_gemm); using namespace transformer_engine; @@ -308,5 +318,6 @@ void nvte_cublas_gemm(const NVTETensor A, grad, wspace->data.dptr, wspace->data.shape[0], accumulate, use_split_accumulator, + math_sm_count, stream); } diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 035f467adb..8cd549b658 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -36,6 +36,7 @@ extern "C" { * \param[out] workspace Workspace tensor. * \param[in] accumulate Whether to accumulate the result into the D matrix. * \param[in] use_split_accumulator Whether to use split accumulator in the FP8 GEMM. + * \param[in] math_sm_count Number of GPU SMs to use (default=0: use cuBLAS heuristics) * \param[in] stream CUDA stream used for the operation. */ void nvte_cublas_gemm(const NVTETensor A, @@ -49,6 +50,7 @@ void nvte_cublas_gemm(const NVTETensor A, NVTETensor workspace, bool accumulate, bool use_split_accumulator, + int math_sm_count, cudaStream_t stream ); diff --git a/transformer_engine/common/include/transformer_engine/userbuffers.h b/transformer_engine/common/include/transformer_engine/userbuffers.h new file mode 100644 index 0000000000..cd5b1ec382 --- /dev/null +++ b/transformer_engine/common/include/transformer_engine/userbuffers.h @@ -0,0 +1,227 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * + * See LICENSE for license information. + ************************************************************************/ + +#ifndef TRANSFORMER_ENGINE_USERBUFFERS_H_ +#define TRANSFORMER_ENGINE_USERBUFFERS_H_ + +#include +#include +#include "cuda_runtime.h" +#include +#include +#include "gdrapi.h" +#include + +#define NVTE_MAX_REGIONS 16 +#define NVTE_MAX_SMS 32 +#define NVTE_MAX_OPS 32 +#define NVTE_MAX_PEERS 8192 +#define NVTE_MAX_REQUESTS 1024 +#define NVTE_LAUNCH_GPU 1 +#define NVTE_LAUNCH_CPU 2 +#define NVTE_MAX_NVLINK 8 + +// region 0 flag offsets +#define NVTE_REG0_OPFLAGS 1024 +#define NVTE_REG0_RECV (NVTE_REG0_OPFLAGS * userbuffers_op_types) +#define NVTE_REG0_SINGLENODE (2 * NVTE_MAX_NVLINK * NVTE_MAX_SMS + NVTE_MAX_OPS) +#define NVTE_REG0_OFFSET(comm) ((2 * NVTE_MAX_REGIONS) * NVTE_MAX_NVLINK \ + + NVTE_REG0_SINGLENODE * 2 + NVTE_MAX_PEERS) +#define NVTE_REG0_COMMBUFFER 0 +#define NVTE_REG0_FLAGS (NVTE_REG0_RECV + NVTE_MAX_PEERS * NVTE_MAX_REGIONS) +#define NVTE_REG0_IBRS 32 +#define NVTE_REG0_IBAG 512 +#undef NVTE_REG0_COMMBUFFER +#define NVTE_REG0_COMMBUFFER (1024 * 1024 * 16) + +// gpuflags map offsets +#define NVTE_GF_STATE 16000 +#define NVTE_GF_IBSHARPDONE 0 +#define NVTE_HF_NVRSDONE (userbuffers_op_types + 1) +#define NVTE_HF_NVREDUCEDONE (userbuffers_op_types + 3) +#define NVTE_MAX_SHARP 16 + +typedef struct ub_request { + int optype; + int blocksize; + int basecounter; + int elements; + int handler; + int handler2; + size_t offset; + size_t offset2; + int peer; + // ----execution states + int active, maxcredit; + int nblock, numblocks, unconfirmed_ib_in_flight; +} ub_request; + +enum req_type { + userbuffers_allreduceop_sharp, + userbuffers_sendop, + userbuffers_allreduceop_nonsharp, + userbuffers_allreduceop_nonsharp2, + userbuffers_alltoall, + userbuffers_op_types +}; + +struct communicator { + int myrank, nranks; // global job communicator + int nvrank, nvsize; // single node comm_intra + int free_region; + + int launch_mode; + + void 
*gpu_ptrs; + int sms, threads; + int use_rr_kernel; // Whether to use RR (or RW) for NVLink-only kernel + int cga_size; + int push, use_ce; + + void *mem_ptr[NVTE_MAX_REGIONS]; + void **peer_ptr[NVTE_MAX_REGIONS]; + int ar_nvsize, ar_firstgpu, + ar_nvrank; // number of gpus(and first gpu in a group) of gpus per node in reduction subgroup + // (_splitar init used) would be equal to (nvsize,0) for regular comm_create + int ar2_nvsize, ar2_firstgpu, ar2_nvrank; // with ar_nvsize as a step + int pipe_id; // which allreduce set of groups (pipeline rank in range of 0..pipeline_size) + int sm_arch; + int num_nodes, my_node, + first_node; // comm_inter communicator, per-rail allreduce (might have subset of nodes) + int num2_nodes, my2_node, first2_node; // with num_nodes as a stride + // max value for running block counters in hostflags + int basecounter[userbuffers_op_types]; // NOLINT(*) + + int *hostflags; + int *flags, *map_flags; + gdr_t g; + + struct sharp_coll_context *sharp_coll_context; + struct sharp_coll_comm *sharp_coll_comm; + void *mem_mr[NVTE_MAX_REGIONS]; + + ub_request *fifo; + volatile int activeproxy; + int nblocks, alignblock, minblock, asyncblocks, active_nreqs; + ub_request active_req[userbuffers_op_types]; // NOLINT(*) + int padding[7]; + volatile int head; + int padding2[15]; + volatile int tail; + + MPI_Request mpihndl[NVTE_MAX_SHARP]; + MPI_Comm comm_inter, // reduction group communicator (subset of the nodes) along GPU rail + comm_intra; // full intranode (all ndev GPUS) + int ibnvsize; // can be used to fake smaller or larger nvlink domain to use ib instead of nvlink + // or force MNNVL + int *send_id, *recv_id; + int mydev; +}; +typedef struct communicator communicator; + +int create_communicator(communicator **comm); +/* creates communicator, allocates all internal buffers if necessary */ + +int create_communicator_grouped(communicator **comm, int pipegpus, int pipenodes); +int create_communicator_grouped2(communicator **comm, int pipegpus, 
int pipenodes, int tensorgpus, + int tensornodes); +/* creates communicator with + allreduce1 to happen in datagpus x datanodes groups, + allreduce2 to happen in tensorgpus x tensor nodes, + where num_nodes = pipenodes x tensornodes x datanodes + nvlink_size = pipegpus x tensorgpus x datagpus + */ + +// int check_user_buffer_registration(void* gpubuff, int bytes, communicator* comm, size_t* offset); +/* + local calls, doesnt communicate between peers + returns handler if buffer is registered already, or -1 if not. + returned offset is offset of gpubuff relative to buffer registered +*/ + +int pipe_rank(communicator *comm, + int step); // helper function to help walk across allreduce1 x allreduce2 groups + // data-parallel and tensor-parallel position within data and tensor + // groups would be preserved + +int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *comm, + bool alloc = false); +/* returns handler and registers buffers. assumed to be collective i.e. you use same groups and + dont mix buffers for different operations returns -1 if cant register (too many preregistered + regions already) if alloc==true will allocate memory and fill the pointers (required for NVL + SHARP and NSO/MNNVL) +*/ + +void allreduce_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream = 0); +// for DP distributed optimizer, only nonSHARP multinode is implemented & calls must come in pairs +// ordered +void allgather_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream = 0); +void reducescatter_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream = 0); + +void allreduce2_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream = 0); +// for TP-parallelism, only single node is implemented +void 
allgather2_userbuff_inplace(const int handler, const int offset, const int elements, + communicator *comm, cudaStream_t stream = 0); +void allgather2_userbuff_inplace_sliced(const int handler, const int offset, const int elements, + communicator *comm, const int slice_id, const int nslices, + cudaStream_t stream = 0); +/* +each Rank input is +allgather2_userbuff_inplace: offset+myrank*elements +allgather2_userbuff_inplace_sliced: offset+myrank*elements*nslices+slice_id*elements + +equivalent codes would be: +for(int slice=0;slice torch.Tensor: """TN layout GEMM with fp8 inputs.""" @@ -55,7 +58,7 @@ def fp8_gemm( out_dtype = TE_DType[out.dtype] if D_dtype is None else D_dtype - _ = torch.ops.tex_ts.te_gemm_ts( + args = ( A, A_scale_inv, A_fp8_tensor, @@ -77,8 +80,29 @@ def fp8_gemm( workspace, workspace.shape[0], accumulate, - use_split_accumulator, - ) + use_split_accumulator) + fn = torch.ops.tex_ts.te_gemm_ts + if ub_algo is not None: + assert ub is not None, 'ub object is None!' + if ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_AG: + fn = ub.bulk_overlap + args = tuple(args + (1,)) + elif ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_RS: + fn = ub.bulk_overlap + args = tuple(args + (0,)) + elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG: + fn = ub.split_overlap_ag + extra_output_tensor = ( + empty_tensor if extra_output_tensor is None else extra_output_tensor + ) + args = tuple(args + (extra_output_tensor,)) + elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS: + fn = ub.split_overlap_rs + assert ( + extra_output_tensor is not None + ), 'SPLIT_PIPELINED_RS requires extra output tensor' + args = tuple(args + (True, extra_output_tensor,)) + _ = fn(*args) if return_output: if gelu: @@ -102,6 +126,9 @@ def gemm( out: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, use_bias: bool = False, + ub_algo: tex.UbufOverlapAlgo = None, + ub: tex.UbufCommOverlap = None, + extra_output_tensor: torch.Tensor = None, ) -> Tuple[Union[torch.Tensor, 
None], ...]: """Non FP8 GEMM.""" @@ -142,7 +169,7 @@ def gemm( else: bias_dtype = output_dtype - _ = torch.ops.tex_ts.te_gemm_ts( + args = ( A, empty_tensor, fp8_index, @@ -166,6 +193,28 @@ def gemm( accumulate, False, # use_split_accumulator ) + fn = torch.ops.tex_ts.te_gemm_ts + if ub_algo is not None: + assert ub is not None, 'ub object is None!' + if ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_AG: + fn = ub.bulk_overlap + args = tuple(args + (1,)) + elif ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_RS: + fn = ub.bulk_overlap + args = tuple(args + (0,)) + elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG: + fn = ub.split_overlap_ag + extra_output_tensor = ( + empty_tensor if extra_output_tensor is None else extra_output_tensor + ) + args = tuple(args + (extra_output_tensor,)) + elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS: + fn = ub.split_overlap_rs + assert ( + extra_output_tensor is not None + ), 'SPLIT_PIPELINED_RS requires extra output tensor' + args = tuple(args + (False, extra_output_tensor,)) + _ = fn(*args) if return_output: return out, grad_bias, gelu_input @@ -283,9 +332,25 @@ def layernorm_fwd_fp8( fp8_tensor: Union[tex.FP8FwdTensors, tex.FP8BwdTensors], otype: tex.DType, sm_margin: int, - zero_centered_gamma: bool + zero_centered_gamma: bool, + ln_out: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """LayerNorm with FP8 output""" + if ln_out is not None: + return tex.layernorm_fwd_fp8_noalloc( + inp, + weight, + bias, + eps, + fp8_meta_tensor.scale[fp8_tensor], + ln_out, + fp8_meta_tensor.amax_history[0][fp8_tensor], + fp8_meta_tensor.scale_inv[fp8_tensor], + otype, + sm_margin, + zero_centered_gamma + ) + return tex.layernorm_fwd_fp8( inp, weight, @@ -351,8 +416,20 @@ def cast_to_fp8( fp8_meta_tensor: tex.FP8TensorMeta, fp8_tensor: Union[tex.FP8FwdTensors, tex.FP8BwdTensors], otype: tex.DType, -) -> torch.Tensor: + out: Optional[torch.Tensor] = None, +) -> Optional[torch.Tensor]: """Cast input to 
FP8""" + + if out is not None: + tex.cast_to_fp8_noalloc( + inp, + fp8_meta_tensor.scale[fp8_tensor], + out, + fp8_meta_tensor.amax_history[0][fp8_tensor], + fp8_meta_tensor.scale_inv[fp8_tensor], + otype + ) + return None return torch.ops.tex_ts.cast_to_fp8_ts( inp, fp8_meta_tensor.scale, diff --git a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h new file mode 100644 index 0000000000..18863a7858 --- /dev/null +++ b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h @@ -0,0 +1,579 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HALF_BYTES 2 + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA Error at line %d: %s\n", __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ + } while (0) + +namespace ubuf { + +enum class COMM_TYPE { RS = 0, AG = 1 }; + +enum class UBOverlapAlgo { + BULK_OVERLAP_AG = 0, + BULK_OVERLAP_RS = 1, + SPLIT_PIPELINED_AG = 2, + SPLIT_PIPELINED_RS = 3 +}; + +struct UbufCommOverlap : torch::CustomClassHolder { + communicator *_ub_comm; + int _tp_id; + int _tp_size; + int _num_splits; + int _math_sms; + int _ub_reg; + void *_ubuf_ptr; + torch::Tensor _ubuf; + torch::Tensor output_tensor; + at::cuda::CUDAStream _stream_comm = at::cuda::getStreamFromPool(true); + std::vector _stream_compute; + cudaEvent_t _start_compute, _stop_compute, _start_d2dcopy, _start_comm, _stop_comm; + + UbufCommOverlap(torch::Tensor sample, int rank, int tp_size, int num_comm_sm, int comm_cga_size, + int num_splits, bool set_sm_margin, int num_max_streams) { + // Initialize 
userbuf communicator + create_communicator_grouped2(&_ub_comm, 1, 1, tp_size, 1); + _ub_comm->use_ce = 0; + _ub_comm->sms = num_comm_sm; + _ub_comm->cga_size = comm_cga_size; + + // Allocate and register extra userbuffers + int ubuf_bytes = sample.numel() * sample.element_size(); + _ub_reg = register_user_buffer_collective(reinterpret_cast(&_ubuf_ptr), ubuf_bytes, + _ub_comm, true); + _ubuf = torch::from_blob(_ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options()); + + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + for (int i = 0; i < std::min(num_max_streams, num_splits); i++) { + cudaStream_t stream; + cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, -1); + _stream_compute.push_back( + at::cuda::getStreamFromExternal(stream, stream_main.device_index())); + } + + _num_splits = num_splits; + _tp_size = tp_size; + _tp_id = (rank % tp_size); + + // Set the number of SMs for GEMM with margin + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + _math_sms = (set_sm_margin) ? 
prop.multiProcessorCount - num_comm_sm : prop.multiProcessorCount; + + output_tensor = torch::Tensor(); + // CUDA event creation + cudaEventCreateWithFlags(&_start_compute, 0); + cudaEventCreateWithFlags(&_stop_compute, 0); + cudaEventCreateWithFlags(&_start_d2dcopy, 0); + cudaEventCreateWithFlags(&_start_comm, 0); + cudaEventCreateWithFlags(&_stop_comm, 0); + } + + /* + ** Bulk GEMM + COMM + ** This function assumes the communication input is pre-copied to _ubuf + */ + std::vector bulk_overlap( + at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, + transformer_engine::DType A_type, bool transa, at::Tensor B, at::Tensor B_scale_inverse, + int64_t B_fp8_tensor, transformer_engine::DType B_type, bool transb, at::Tensor D, + at::Tensor D_scale, transformer_engine::DType D_type, at::Tensor D_amax, at::Tensor bias, + transformer_engine::DType bias_type, at::Tensor pre_gelu_out, bool grad, at::Tensor workspace, + size_t workspaceSize, bool accumulate, bool use_split_accumulator, int comm_type) { + // Get the current userbuf offset + char *ubuf_wt_ptr = reinterpret_cast(_ubuf.data_ptr()); + int comm_elements = (_ubuf.numel() / 2) * _ubuf.element_size(); // UBUF uses 2Byte element size + COMM_TYPE _comm_type = static_cast(comm_type); + if (_comm_type == COMM_TYPE::RS) { + ubuf_wt_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); + } + + // Catch up the default torch stream + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + CHECK_CUDA(cudaEventRecord(_start_comm, (cudaStream_t)stream_main)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); + + // Communication: AG and RS + if (_comm_type == COMM_TYPE::AG) { + allgather2_userbuff_inplace(_ub_reg, 0, comm_elements, _ub_comm, (cudaStream_t)_stream_comm); + } else if (_comm_type == COMM_TYPE::RS) { + reducescatter2_userbuff_inplace(_ub_reg, 0, comm_elements, _ub_comm, + (cudaStream_t)_stream_comm); + } else { + NVTE_ERROR("Not supported 
communication type."); + } + + if (A_scale_inverse.numel()) A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + + if (B_scale_inverse.numel()) B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + + assert(pre_gelu_out.numel() == 0); + te_gemm(A, A_scale_inverse, A_type, transa, B, B_scale_inverse, B_type, transb, D, D_scale, + D_type, D_amax, bias, bias_type, pre_gelu_out, grad, workspace, workspaceSize, + accumulate, use_split_accumulator, _math_sms); + + CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_comm, 0)); + + // Generate output tensor from userbuf data pointer + int output_c_dim0 = (_comm_type == COMM_TYPE::AG) ? _ubuf.size(0) : _ubuf.size(0) / _tp_size; + int output_c_dim1 = _ubuf.size(1); + output_tensor = torch::from_blob(ubuf_wt_ptr, {output_c_dim0, output_c_dim1}, _ubuf.options()); + + return {D, output_tensor}; + } // bulk_overlap + + /* + ** Split FPROP GEMM + ReduceScatter + */ + void split_overlap_rs(at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, + transformer_engine::DType A_type, bool transa, at::Tensor B, + at::Tensor B_scale_inverse, int64_t B_fp8_tensor, + transformer_engine::DType B_type, bool transb, at::Tensor D, + at::Tensor D_scale, transformer_engine::DType D_type, at::Tensor D_amax, + at::Tensor bias, transformer_engine::DType bias_type, + at::Tensor pre_gelu_out, bool grad, at::Tensor workspace, + size_t workspaceSize, bool accumulate, bool use_split_accumulator, + bool gemm_overlap, at::Tensor rs_output) { + // Get GEMM dimensions + int m = A.size(0); + int k = A.size(1); + int n = B.size(0); + int m_chunk = m / _num_splits; + int input_a_chunk_size = m_chunk * k; + int output_chunk_size = n * m_chunk; + int workspace_size_chunk = workspaceSize / _stream_compute.size(); + + // Get input, output, and workspace data pointers + char *input_a_chunk_ptr = reinterpret_cast(A.data_ptr()); + char *output_buf_chunk_ptr = 
reinterpret_cast(_ubuf.data_ptr()); + char *workspace_ptr = reinterpret_cast(workspace.data_ptr()); + + char *rs_output_ptr = reinterpret_cast(rs_output.data_ptr()); + int ubuf_offset = 0; + + // Catch up the default torch stream + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + CHECK_CUDA(cudaEventRecord(_start_compute, stream_main)); + for (int i = 0; i < _stream_compute.size(); i++) { + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[i], _start_compute, 0)); + } + + if (A_scale_inverse.numel()) A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + + if (B_scale_inverse.numel()) B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + + assert(pre_gelu_out.numel() == 0); + + if (gemm_overlap) { + torch::Tensor input_a_chunk = torch::from_blob(input_a_chunk_ptr, {m_chunk, k}, A.options()); + torch::Tensor output_chunk = + torch::from_blob(output_buf_chunk_ptr, {n, m_chunk}, _ubuf.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr, {workspace_size_chunk}, workspace.options()); + at::cuda::setCurrentCUDAStream(_stream_compute[0]); + te_gemm(input_a_chunk, A_scale_inverse, A_type, transa, B, B_scale_inverse, B_type, transb, + output_chunk, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, _math_sms); + + for (int i = 1; i < _num_splits; i++) { + input_a_chunk_ptr += input_a_chunk_size * B.element_size(); + output_buf_chunk_ptr += output_chunk_size * _ubuf.element_size(); + + torch::Tensor input_a_chunk = + torch::from_blob(input_a_chunk_ptr, {m_chunk, k}, A.options()); + torch::Tensor output_chunk = + torch::from_blob(output_buf_chunk_ptr, {n, m_chunk}, _ubuf.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr + (i % _stream_compute.size()) * workspace_size_chunk, + {workspace_size_chunk}, workspace.options()); + at::cuda::setCurrentCUDAStream(_stream_compute[i % _stream_compute.size()]); + 
te_gemm(input_a_chunk, A_scale_inverse, A_type, transa, B, B_scale_inverse, B_type, transb, + output_chunk, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, + _math_sms); + + CHECK_CUDA(cudaEventRecord( + _start_comm, (cudaStream_t)_stream_compute[(i - 1) % _stream_compute.size()])); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); + + // Communication chunk + reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, (i - 1) * output_chunk_size, + m_chunk, n, m, _ub_comm, (cudaStream_t)_stream_comm); + + rs_output_ptr += m_chunk * _ubuf.element_size(); + } + int last_compute_stream_id = + (_num_splits + _stream_compute.size() - 1) % _stream_compute.size(); + CHECK_CUDA( + cudaEventRecord(_start_comm, (cudaStream_t)_stream_compute[last_compute_stream_id])); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); + + // Communication chunk + reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, + (_num_splits - 1) * output_chunk_size, m_chunk, n, m, + _ub_comm, (cudaStream_t)_stream_comm); + } else { + for (int i = 0; i < _num_splits; i++) { + torch::Tensor input_a_chunk = + torch::from_blob(input_a_chunk_ptr, {m_chunk, k}, A.options()); + torch::Tensor output_chunk = + torch::from_blob(output_buf_chunk_ptr, {n, m_chunk}, _ubuf.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr + (i % _stream_compute.size()) * workspace_size_chunk, + {workspace_size_chunk}, workspace.options()); + at::cuda::setCurrentCUDAStream(_stream_compute[i % _stream_compute.size()]); + te_gemm(input_a_chunk, A_scale_inverse, A_type, transa, B, B_scale_inverse, B_type, transb, + output_chunk, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, + _math_sms); + + CHECK_CUDA(cudaEventRecord(_start_comm, + 
(cudaStream_t)_stream_compute[i % _stream_compute.size()])); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); + + // Communication chunk + reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, i * output_chunk_size, + m_chunk, n, m, _ub_comm, (cudaStream_t)_stream_comm); + + rs_output_ptr += m_chunk * _ubuf.element_size(); + input_a_chunk_ptr += input_a_chunk_size * B.element_size(); + output_buf_chunk_ptr += output_chunk_size * _ubuf.element_size(); + } + } + int last_compute_stream_id = + (_num_splits + _stream_compute.size() - 1) % _stream_compute.size(); + CHECK_CUDA( + cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[last_compute_stream_id])); + CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_comm, 0)); + at::cuda::setCurrentCUDAStream(stream_main); + + return; + } // split_overlap_rs + + /* + ** Helper function to copy input to _ubuf + */ + void copy_input_to_ubuf(torch::Tensor input, int comm_type) { + char *ubuf_ptr = reinterpret_cast(_ubuf.data_ptr()); + COMM_TYPE _comm_type = static_cast(comm_type); + if (_comm_type == COMM_TYPE::AG) { + if ((input.numel() * _tp_size) != _ubuf.numel() || + input.element_size() != _ubuf.element_size()) { + NVTE_ERROR("input and ubuf size do not match!"); + } + ubuf_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); + } else { + if (input.numel() != _ubuf.numel() || input.element_size() != _ubuf.element_size()) { + NVTE_ERROR("input and ubuf size do not match!"); + } + } + + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + CHECK_CUDA(cudaEventRecord(_start_d2dcopy, (cudaStream_t)stream_main)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_d2dcopy, 0)); + CHECK_CUDA(cudaMemcpyAsync(ubuf_ptr, input.data_ptr(), input.numel() * input.element_size(), + 
cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_comm)); + } + + torch::Tensor &get_ubuf_output(int comm_type) { + char *ubuf_wt_ptr = reinterpret_cast(_ubuf.data_ptr()); + COMM_TYPE _comm_type = static_cast(comm_type); + if (_comm_type != COMM_TYPE::AG && _comm_type != COMM_TYPE::RS) NVTE_ERROR("Invalid comm_type"); + if (_comm_type == COMM_TYPE::RS) + ubuf_wt_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); + int output_c_dim0 = (_comm_type == COMM_TYPE::AG) ? _ubuf.size(0) : _ubuf.size(0) / _tp_size; + int output_c_dim1 = _ubuf.size(1); + output_tensor = torch::from_blob(ubuf_wt_ptr, {output_c_dim0, output_c_dim1}, _ubuf.options()); + return output_tensor; + } +}; // UbufCommOverlap + +struct UbufP2PCommOverlap : torch::CustomClassHolder { + communicator *_ub_comm; + int _tp_id; + int _tp_size; + int _ub_reg; + int _next_rank, _prev_rank, _rank, _rank_round_tp; + int _aggregate2; + int _math_sms; + void *_ubuf_ptr; + torch::Tensor _ubuf; + std::vector _ubufs; + at::cuda::CUDAStream _stream_comm = at::cuda::getStreamFromPool(true); + std::vector _stream_compute; + cudaEvent_t _start_compute, _stop_compute, _start_comm, _stop_comm, _start_accum, _stop_accum; + + UbufP2PCommOverlap(torch::Tensor sample, int rank, int tp_size, bool aggregate2, + int num_max_streams) { + // Initialize userbuf communicator + create_communicator_grouped2(&_ub_comm, 1, 1, tp_size, 1); + _ub_comm->use_ce = 1; + _ub_comm->sms = 1; + _ub_comm->cga_size = 1; + + // Create workspace tensor with userbuffer + int ubuf_bytes = sample.numel() * sample.element_size(); + int ubuf_chunk_bytes = ubuf_bytes / tp_size; + _ub_reg = register_user_buffer_collective(reinterpret_cast(&_ubuf_ptr), ubuf_bytes, + _ub_comm, true); + _ubuf = torch::from_blob(_ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options()); + + // Create tensor chunks for easy management + char *ubuf_byte_ptr = reinterpret_cast(_ubuf.data_ptr()); + for (int i = 0; i < tp_size; i++) { + torch::Tensor ubuf_chunk = 
torch::from_blob( + ubuf_byte_ptr, {sample.size(0) / tp_size, sample.size(1)}, sample.options()); + _ubufs.push_back(ubuf_chunk); + ubuf_byte_ptr += ubuf_chunk_bytes; + } + + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + for (int i = 0; i < std::min(num_max_streams, tp_size); i++) { + cudaStream_t stream; + cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, -1); + _stream_compute.push_back( + at::cuda::getStreamFromExternal(stream, stream_main.device_index())); + } + + // Set the number of SMs for GEMM with margin + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + _math_sms = prop.multiProcessorCount; + + _tp_size = tp_size; + _aggregate2 = aggregate2; + + _rank = rank; + _tp_id = (rank % tp_size); + _rank_round_tp = (rank / tp_size) * tp_size; + _next_rank = (tp_size + rank + 1) % tp_size + _rank_round_tp; + _prev_rank = (tp_size + rank + -1) % tp_size + _rank_round_tp; + + // CUDA event creation + cudaEventCreateWithFlags(&_start_compute, 0); + cudaEventCreateWithFlags(&_stop_compute, 0); + cudaEventCreateWithFlags(&_start_comm, 0); + cudaEventCreateWithFlags(&_stop_comm, 0); + cudaEventCreateWithFlags(&_start_accum, 0); + cudaEventCreateWithFlags(&_stop_accum, 0); + } + + /* + ** Split AllGather + GEMM using P2P communication + ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. This is needed to have AG + *outputs + ** in each rank to be in the contiguous memory space after all ring exchange phases. 
+ */ + torch::Tensor split_overlap_ag(at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, + transformer_engine::DType A_type, bool transa, at::Tensor B, + at::Tensor B_scale_inverse, int64_t B_fp8_tensor, + transformer_engine::DType B_type, bool transb, at::Tensor D, + at::Tensor D_scale, transformer_engine::DType D_type, + at::Tensor D_amax, at::Tensor bias, + transformer_engine::DType bias_type, at::Tensor pre_gelu_out, + bool grad, at::Tensor workspace, size_t workspaceSize, + bool accumulate, bool use_split_accumulator, at::Tensor B_copy) { + // Get GEMM dimensions between TN and NN input layouts + const int m = (transa) ? A.size(0) : A.size(1); + const int k = (transa) ? A.size(1) : A.size(0); + const int n_chunk = _ubufs[0].size(0); + + // Get communication and GEMM output chunk sizes + const int comm_bytes = _ubufs[0].numel() * _ubufs[0].element_size(); + const int output_chunk_bytes = (n_chunk * m) * HALF_BYTES; + + // Get output and workspace data pointers + char *output_ptr = reinterpret_cast(D.data_ptr()); + char *workspace_ptr = reinterpret_cast(workspace.data_ptr()); + int workspace_size_chunk = workspaceSize / _stream_compute.size(); + + if (A_scale_inverse.numel()) A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + + if (B_scale_inverse.numel()) B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + CHECK_CUDA(cudaEventRecord(_start_compute, (cudaStream_t)stream_main)); + + assert(pre_gelu_out.numel() == 0); + if (_aggregate2) { + // Catch up the default torch stream + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_compute, 0)); + + const int num_steps = _tp_size / 2; + char *input_b_ptr = reinterpret_cast(_ubuf.data_ptr()); + + // Initial 1X input chunk exchange between neighboring peers + int send_chunk_id = _tp_id; + int recv_chunk_id = (_tp_id % 2 == 0) ? 
_tp_id + 1 : _tp_id - 1; + int send_offset = comm_bytes * send_chunk_id; + int recv_offset = comm_bytes * recv_chunk_id; + int peer_rank = (_tp_id % 2 == 0) ? _next_rank : _prev_rank; + userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset, comm_bytes, _ub_comm, peer_rank, + (cudaStream_t)_stream_comm); + userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, comm_bytes, _ub_comm, peer_rank, + (cudaStream_t)_stream_comm); + CHECK_CUDA(cudaEventRecord(_start_compute, (cudaStream_t)_stream_comm)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _start_compute, 0)); + + int local_rank_round2 = (_tp_id % 2 == 0) ? _tp_id : _tp_id - 1; + const int next_rank = (_tp_size + _tp_id + 2) % _tp_size + _rank_round_tp; + const int prev_rank = (_tp_size + _tp_id - 2) % _tp_size + _rank_round_tp; + + // Ring exchange of 2X inputs chunks + for (int i = 0; i < num_steps; i++) { + send_chunk_id = (_tp_size + local_rank_round2 - i * 2) % _tp_size; + recv_chunk_id = (_tp_size + local_rank_round2 - i * 2 - 2) % _tp_size; + send_offset = comm_bytes * send_chunk_id; + recv_offset = comm_bytes * recv_chunk_id; + + // GEMM + torch::Tensor input_b_chunk = + torch::from_blob(input_b_ptr + send_offset, {n_chunk * 2, k}, _ubuf.options()); + torch::Tensor output_chunk = torch::from_blob( + output_ptr + (send_chunk_id * output_chunk_bytes), {n_chunk * 2, m}, D.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr + (i % _stream_compute.size()) * workspace_size_chunk, + {workspace_size_chunk}, workspace.options()); + at::cuda::setCurrentCUDAStream(_stream_compute[i % _stream_compute.size()]); + te_gemm(A, A_scale_inverse, A_type, transa, input_b_chunk, B_scale_inverse, B_type, transb, + output_chunk, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, + _math_sms); + + if (i < num_steps - 1) { + // P2P communication + userbuffers_send(_ub_reg, send_offset, 
_ub_reg, send_offset, comm_bytes * 2, _ub_comm, + next_rank, (cudaStream_t)_stream_comm); + userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, comm_bytes * 2, _ub_comm, + prev_rank, (cudaStream_t)_stream_comm); + CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + CHECK_CUDA(cudaStreamWaitEvent( + (cudaStream_t)_stream_compute[(i + 1) % _stream_compute.size()], _stop_comm, 0)); + } else if (B_copy.numel() > 0) { + assert(B_copy.numel() == _ubufs[_tp_id].numel()); + assert(B_copy.element_size() == _ubufs[_tp_id].element_size()); + CHECK_CUDA(cudaMemcpyAsync(B_copy.data_ptr(), _ubufs[_tp_id].data_ptr(), + _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(), + cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_comm)); + } + } + at::cuda::setCurrentCUDAStream(stream_main); + int last_compute_stream_id = + (num_steps + _stream_compute.size() - 1) % _stream_compute.size(); + CHECK_CUDA( + cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[last_compute_stream_id])); + } else { + // Catch up the default torch stream + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _start_compute, 0)); + + for (int i = 0; i < _tp_size; i++) { + // Set the userbuffer id. Buffer under send is the input for the current GEMM chunk + // The initial input chunk is stored _ubuf[rank]. 
This is to have the AG output in all ranks + // to be contiguous after the ring exchanges + int send_chunk_id = (_tp_size + _tp_id - i) % _tp_size; + int recv_chunk_id = (_tp_size + _tp_id - i - 1) % _tp_size; + int send_offset = comm_bytes * send_chunk_id; + int recv_offset = comm_bytes * recv_chunk_id; + + // GEMM + torch::Tensor output_chunk = torch::from_blob( + output_ptr + (send_chunk_id * output_chunk_bytes), {n_chunk, m}, D.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr + (i % _stream_compute.size()) * workspace_size_chunk, + {workspace_size_chunk}, workspace.options()); + at::cuda::setCurrentCUDAStream(_stream_compute[i % _stream_compute.size()]); + te_gemm(A, A_scale_inverse, A_type, transa, _ubufs[send_chunk_id], B_scale_inverse, B_type, + transb, output_chunk, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, + _math_sms); + + if (i < _tp_size - 1) { + // P2P communication + userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset, comm_bytes, _ub_comm, + _next_rank, (cudaStream_t)_stream_comm); + userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, comm_bytes, _ub_comm, + _prev_rank, (cudaStream_t)_stream_comm); + CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + CHECK_CUDA(cudaStreamWaitEvent( + (cudaStream_t)_stream_compute[(i + 1) % _stream_compute.size()], _stop_comm, 0)); + } else if (B_copy.numel() > 0) { + assert(B_copy.numel() == _ubufs[_tp_id].numel()); + assert(B_copy.element_size() == _ubufs[_tp_id].element_size()); + CHECK_CUDA(cudaMemcpyAsync(B_copy.data_ptr(), _ubufs[_tp_id].data_ptr(), + _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(), + cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_comm)); + } + } + at::cuda::setCurrentCUDAStream(stream_main); + int last_compute_stream_id = (_tp_size + _stream_compute.size() - 1) % _stream_compute.size(); + CHECK_CUDA( + 
cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[last_compute_stream_id])); + } + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _stop_compute, 0)); + + return D; + } // split_overlap_ag + + /* + ** Copy input to _ubufs[0] + */ + void copy_input_to_ubuf(torch::Tensor input, bool chunk) { + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + if (chunk) { + // Copy input to the target ubuf chunk by rank offset + if (input.numel() != _ubufs[0].numel() || input.element_size() != _ubufs[0].element_size()) { + NVTE_ERROR("input and ubuf size do not match!"); + } + CHECK_CUDA(cudaMemcpyAsync(_ubufs[_tp_id].data_ptr(), input.data_ptr(), + input.numel() * input.element_size(), cudaMemcpyDeviceToDevice, + (cudaStream_t)stream_main)); + } else { + if (input.numel() != _ubuf.numel() || input.element_size() != _ubuf.element_size()) { + NVTE_ERROR("input and ubuf size do not match!"); + } + CHECK_CUDA(cudaMemcpyAsync(_ubuf.data_ptr(), input.data_ptr(), + input.numel() * input.element_size(), cudaMemcpyDeviceToDevice, + (cudaStream_t)stream_main)); + } + } + torch::Tensor get_ubuf_output(int comm_type) { + char *ubuf_wt_ptr = reinterpret_cast(_ubuf.data_ptr()); + COMM_TYPE _comm_type = static_cast(comm_type); + if (_comm_type != COMM_TYPE::AG && _comm_type != COMM_TYPE::RS) NVTE_ERROR("Invalid comm_type"); + if (_comm_type == COMM_TYPE::RS) + ubuf_wt_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); + int output_c_dim0 = (_comm_type == COMM_TYPE::AG) ? 
_ubuf.size(0) : _ubuf.size(0) / _tp_size; + int output_c_dim1 = _ubuf.size(1); + return torch::from_blob(ubuf_wt_ptr, {output_c_dim0, output_c_dim1}, _ubuf.options()); + } +}; // UbufP2PCommOverlap + +} // namespace ubuf diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu index ede0a5ef6c..e34c79d980 100644 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ b/transformer_engine/pytorch/csrc/extensions.cu @@ -5,7 +5,9 @@ ************************************************************************/ #include "extensions.h" - +#ifdef NVTE_MPI_FOUND +#include "comm_gemm_overlap.h" +#endif // NVTE_MPI_FOUND void te_gemm(at::Tensor A, at::Tensor A_scale_inverse, @@ -26,7 +28,8 @@ void te_gemm(at::Tensor A, at::Tensor workspace, size_t workspaceSize, bool accumulate, - bool use_split_accumulator + bool use_split_accumulator, + int math_sm_count ) { using namespace transformer_engine; auto te_A = makeTransformerEngineTensor(A.data_ptr(), @@ -70,6 +73,7 @@ void te_gemm(at::Tensor A, te_workspace.data(), accumulate, use_split_accumulator, + math_sm_count, at::cuda::getCurrentCUDAStream()); } @@ -536,6 +540,67 @@ std::vector layernorm_fwd_fp8(const at::Tensor &input, } +std::vector layernorm_fwd_fp8_noalloc(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + at::Tensor scale, + at::Tensor ln_out, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(0)); + size_t H = static_cast(input.size(1)); + + DType itype = GetTransformerEngineDType(input.scalar_type()); + + auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto input_cu = makeTransformerEngineTensor(input); + auto gamma_cu = makeTransformerEngineTensor(weight); + auto beta_cu = 
makeTransformerEngineTensor(bias); + auto z_cu = makeTransformerEngineTensor(ln_out.data_ptr(), {N, H}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + auto mu_cu = makeTransformerEngineTensor(mu); + auto rsigma_cu = makeTransformerEngineTensor(rsigma); + transformer_engine::TensorWrapper workspace, barrier; + + // This call populates workspace and barrier tensors with the required config + const auto func = zero_centered_gamma ? nvte_layernorm1p_fwd : nvte_layernorm_fwd; + func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), + mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + // Fill workspace and barrier + auto workspace_data = allocateSpace(workspace.shape(), + workspace.dtype()); + auto barrier_data = allocateSpace(barrier.shape(), + barrier.dtype(), + true); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), + barrier.shape(), + barrier.dtype()); + + // Actual call to fwd kernel + func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), + mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + return {ln_out, mu, rsigma}; +} + + at::Tensor layernorm_fwd_fp8_inf(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, @@ -609,6 +674,61 @@ std::vector layernorm_fwd(const at::Tensor &input, } +std::vector layernorm_fwd_noalloc(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + at::Tensor ln_out, + float eps, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(0)); + size_t H = 
static_cast(input.size(1)); + + DType itype = GetTransformerEngineDType(input.scalar_type()); + + auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto input_cu = makeTransformerEngineTensor(input); + auto gamma_cu = makeTransformerEngineTensor(weight); + auto beta_cu = makeTransformerEngineTensor(bias); + auto z_cu = makeTransformerEngineTensor(ln_out); + auto mu_cu = makeTransformerEngineTensor(mu); + auto rsigma_cu = makeTransformerEngineTensor(rsigma); + transformer_engine::TensorWrapper workspace, barrier; + + // This call populates workspace and barrier tensors with the required config + const auto func = zero_centered_gamma ? nvte_layernorm1p_fwd : nvte_layernorm_fwd; + func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), + mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + // Fill workspace and barrier + auto workspace_data = allocateSpace(workspace.shape(), + workspace.dtype()); + auto barrier_data = allocateSpace(barrier.shape(), + barrier.dtype(), + true); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), + barrier.shape(), + barrier.dtype()); + + // Actual call to fwd kernel + func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), + mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + return {ln_out, mu, rsigma}; +} + + at::Tensor layernorm_fwd_inf(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, @@ -646,6 +766,29 @@ at::Tensor cast_to_fp8(const at::Tensor &input, } +void cast_to_fp8_noalloc(const at::Tensor &input, + const 
at::Tensor &scale, + at::Tensor output, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + size_t N = static_cast(input.size(0)); + size_t H = static_cast(input.size(1)); + + auto input_cu = makeTransformerEngineTensor(input); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {N, H}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_fp8_quantize(input_cu.data(), output_cu.data(), + at::cuda::getCurrentCUDAStream()); + + return; +} + + at::Tensor cast_from_fp8(const at::Tensor &input, const at::Tensor &scale_inv, transformer_engine::DType itype, @@ -878,6 +1021,17 @@ size_t get_cublasLt_version() { } +bool userbuf_comm_available() { // TODO(ksivamani) check on python side +#ifdef NVTE_MPI_FOUND + return true; +#else + return false; +#endif +} + +void placeholder() {} // TODO(ksivamani) clean this up + + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Softmax functions m.def("scaled_softmax_forward", &scaled_softmax_forward, "Scaled Softmax FWD"); @@ -895,8 +1049,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Other granular functions m.def("layernorm_fwd_fp8", &layernorm_fwd_fp8, "LN FWD FP8"); + m.def("layernorm_fwd_fp8_noalloc", &layernorm_fwd_fp8_noalloc, "LN FWD FP8"); m.def("layernorm_bwd", &layernorm_bwd, "LN BWD"); m.def("layernorm_fwd", &layernorm_fwd, "LN FWD"); + m.def("layernorm_fwd_noalloc", &layernorm_fwd_noalloc, "LN FWD"); m.def("fused_cast_transpose", &fused_cast_transpose, "Fused Cast + Transpose"); m.def("fused_cast_transpose_bgrad", &fused_cast_transpose_bgrad, "Fused Cast + Transpose + BGRAD"); @@ -907,6 +1063,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("fused_multi_cast_transpose", &fused_multi_cast_transpose, "Fused Multi-tensor Cast + Transpose"); m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8"); + m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8"); m.def("cast_from_fp8", &cast_from_fp8, "Cast from 
FP8"); m.def("te_gemm", &te_gemm, "CublasLt GEMM"); m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); @@ -914,6 +1071,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Misc m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); + m.def("userbuf_comm_available", &userbuf_comm_available, "If userbuf backend is available"); // Data structures py::class_(m, "FP8TensorMeta") @@ -922,6 +1080,31 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { .def_readwrite("scale_inv", &transformer_engine::FP8TensorMeta::scale_inv) .def_readwrite("amax_history", &transformer_engine::FP8TensorMeta::amax_history); +#ifdef NVTE_MPI_FOUND + py::enum_(m, "UbufOverlapAlgo") + .value("BULK_OVERLAP_AG", ubuf::UBOverlapAlgo::BULK_OVERLAP_AG) + .value("BULK_OVERLAP_RS", ubuf::UBOverlapAlgo::BULK_OVERLAP_RS) + .value("SPLIT_PIPELINED_RS", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_RS) + .value("SPLIT_PIPELINED_AG", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_AG); + + py::class_(m, "UbufCommOverlap") + .def(py::init()) + .def("bulk_overlap", &ubuf::UbufCommOverlap::bulk_overlap) + .def("split_overlap_rs", &ubuf::UbufCommOverlap::split_overlap_rs) + .def("copy_input_to_ubuf", &ubuf::UbufCommOverlap::copy_input_to_ubuf) + .def("get_ubuf_output", &ubuf::UbufCommOverlap::get_ubuf_output); + + py::class_(m, "UbufP2PCommOverlap") + .def(py::init()) + .def("split_overlap_ag", &ubuf::UbufP2PCommOverlap::split_overlap_ag) + .def("copy_input_to_ubuf", &ubuf::UbufP2PCommOverlap::copy_input_to_ubuf) + .def("get_ubuf_output", &ubuf::UbufP2PCommOverlap::get_ubuf_output); +#else // NVTE_MPI_FOUND + m.def("UbufOverlapAlgo", &placeholder, "Dummy function for python side annotations"); + m.def("UbufCommOverlap", &placeholder, "Dummy function for python side annotations"); + m.def("UbufP2PCommOverlap", &placeholder, "Dummy function for python side annotations"); +#endif // NVTE_MPI_FOUND + py::enum_(m, "DType", py::module_local()) .value("kByte", transformer_engine::DType::kByte) 
.value("kInt32", transformer_engine::DType::kInt32) diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 99849c15fe..6be404226e 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -26,7 +26,8 @@ void te_gemm(at::Tensor A, at::Tensor workspace, size_t workspaceSize, bool accumulate, - bool use_split_accumulator + bool use_split_accumulator, + int math_sm_count ); @@ -111,6 +112,19 @@ std::vector layernorm_fwd_fp8(const at::Tensor &input, const bool zero_centered_gamma ); +std::vector layernorm_fwd_fp8_noalloc(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + at::Tensor scale, + at::Tensor ln_out, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +); + at::Tensor layernorm_fwd_fp8_inf(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, @@ -130,6 +144,15 @@ std::vector layernorm_fwd(const at::Tensor &input, const bool zero_centered_gamma ); +std::vector layernorm_fwd_noalloc(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + at::Tensor ln_out, + float eps, + const int sm_margin, + const bool zero_centered_gamma +); + at::Tensor layernorm_fwd_inf(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, @@ -145,6 +168,15 @@ at::Tensor cast_to_fp8(const at::Tensor &input, ); +void cast_to_fp8_noalloc(const at::Tensor &input, + const at::Tensor &scale, + at::Tensor output, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +); + + at::Tensor cast_from_fp8(const at::Tensor &input, const at::Tensor &scale_inv, transformer_engine::DType itype, diff --git a/transformer_engine/pytorch/csrc/ts_fp8_op.cpp b/transformer_engine/pytorch/csrc/ts_fp8_op.cpp index b0085de04e..e3d1ef4d7b 100755 --- a/transformer_engine/pytorch/csrc/ts_fp8_op.cpp +++ 
b/transformer_engine/pytorch/csrc/ts_fp8_op.cpp @@ -121,7 +121,8 @@ at::Tensor te_gemm_ts(at::Tensor A, workspace, workspaceSize_arg, accumulate_arg, - use_split_accumulator_arg); + use_split_accumulator_arg, + 0); return D; } diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index dff37497d6..3e0a868047 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -85,6 +85,8 @@ _2X_ACC_DGRAD = True _2X_ACC_WGRAD = True _cublas_workspace = None +_ub_communicators = None +_NUM_MAX_UB_STREAMS = 3 _amax_reduce_handle_bwd = None @@ -147,6 +149,105 @@ def _prepare_backward( delete_key_from_amax_buffer(forward=False) +def initialize_ub( + shape: list, + tp_size: int, + use_fp8: bool = False, + ub_cfgs: Optional[dict] = None +) -> None: + """Initialize communicators for TP comm overlap using userbuffers.""" + global _ub_communicators + assert _ub_communicators is None, "UB communicators are already initialized." + _ub_communicators = {} + rank_id = torch.distributed.get_rank() + + # Increase the workspace by the number of maximum concurrent streams + global _cublas_workspace + _cublas_workspace = get_workspace().repeat(_NUM_MAX_UB_STREAMS) + + # Default buffer precision: AllGather buffers use fp8 when using fp8 recipe + fp8_buf = [ + "qkv_fprop", "qkv_dgrad", "proj_dgrad", "fc1_fprop", "fc1_dgrad", "fc2_dgrad" + ] + # Default overlap methods for layers + methods = { + "ring_exchange":["qkv_fprop", "fc1_fprop", "proj_dgrad", "fc2_dgrad"], + "pipeline":["proj_fprop", "fc2_fprop"], + "bulk":["qkv_dgrad", "qkv_wgrad", "fc1_dgrad", "fc1_wgrad"], + } + + def get_method(name): + for method, names in methods.items(): + if name in names: + return method + raise KeyError(f"Given layer name {name} does not exist.") + + def add_ub( + name: str, + method: str, + num_sm: int = 16, + cga_size: int = 2, + set_sm_margin: int = 0, + num_splits: int = 4, + aggregate: int = 0, + ) -> None: + dtype = torch.uint8 if 
(use_fp8 and name in fp8_buf) else torch.bfloat16 + sample_buffer = torch.empty(shape, dtype=dtype, device='cuda') + if method == 'ring_exchange': + ub_obj = tex.UbufP2PCommOverlap( + sample_buffer, # Sample userbuffer + rank_id, # Rank id + tp_size, # TP size + aggregate, # Aggregate 2X GEMM chunks + _NUM_MAX_UB_STREAMS, # Max concurrent GEMM streams + ) + else: + ub_obj = tex.UbufCommOverlap( + sample_buffer, # Sample userbuffer + rank_id, # Rank id + tp_size, # TP size + num_sm, # Number of communication SMs + cga_size, # CGA cluster size + num_splits, # Number of communication splits + set_sm_margin, # Set SM margin + _NUM_MAX_UB_STREAMS, # Max concurrent GEMM streams + ) + _ub_communicators[name] = ub_obj + + for name in (methods["ring_exchange"]+methods["pipeline"]+methods["bulk"]): + if ub_cfgs is not None and name in ub_cfgs: + ub_cfg = ub_cfgs[name] + method = ub_cfg["method"] if "method" in ub_cfg else get_method(name) + num_sm = ub_cfg["num_sm"] if "num_sm" in ub_cfg else 16 + cga_size = ub_cfg["cga_size"] if "cga_size" in ub_cfg else 2 + num_splits = ub_cfg["num_splits"] if "num_splits" in ub_cfg else 0 + set_sm_margin = ub_cfg["set_sm_margin"] if "set_sm_margin" in ub_cfg else 0 + aggregate = ub_cfg["aggregate"] if "aggregate" in ub_cfg else 0 + add_ub( + name, + method, + num_sm, + cga_size, + set_sm_margin, + num_splits, + aggregate + ) + else: + method = get_method(name) + if method == "pipeline": + add_ub(name, method) + else: + add_ub(name, method, num_splits=0) + + +def get_ub(name: str): + """Get userbuffer communicator corresponding to give key.""" + global _ub_communicators + assert _ub_communicators is not None, "UB manager is not initialized." + assert name in _ub_communicators, f"UB for {name} is not registered." 
+ return _ub_communicators[name] + + class _NoopCat(torch.autograd.Function): """This class is a no-op replacement for `torch.cat`.""" @@ -596,9 +697,13 @@ def grad_output_preprocess( # No-FP8 case: bgrad is fused with wgrad for this case. if not ctx.fp8: if gather_grad_output: - grad_output_mat, _ = gather_along_first_dim( - grad_output_mat, ctx.tp_group - ) + if not ctx.ub_split_ag: + grad_output_mat, _ = gather_along_first_dim( + grad_output_mat, ctx.tp_group + ) + else: + ctx.ub_obj_gradout.copy_input_to_ubuf(grad_output, True) + grad_output_mat = ctx.ub_obj_gradout.get_ubuf_output(1) return grad_output_mat, None, None, None fp8_dtype_backward = get_fp8_te_dtype( @@ -610,6 +715,9 @@ def grad_output_preprocess( gather_grad_output and ctx.fp8_meta["recipe"].override_linear_precision.wgrad ): + assert ( + not ctx.ub_split_ag + ), "override_linear_precision.wgrad not supported with ub_split_ag" grad_output_mat, _ = gather_along_first_dim(grad_output_mat, ctx.tp_group) # FP8 case with gather: unfused bgrad, cast, transpose for efficient gather elif gather_grad_output: @@ -617,14 +725,23 @@ def grad_output_preprocess( grad_bias = grad_output_mat.sum(dim=0) else: grad_bias = None - grad_output_c = cast_to_fp8( + if ctx.ub_split_ag: + grad_output_c = ctx.ub_obj_gradout.get_ubuf_output(0) + else: + grad_output_c = torch.empty_like(grad_output_mat, dtype=torch.uint8) + cast_to_fp8( grad_output_mat, ctx.fp8_meta["scaling_bwd"], tex.FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, + out=grad_output_c, ) - grad_output_c, _ = gather_along_first_dim(grad_output_c, ctx.tp_group) - grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) + if not ctx.ub_split_ag: + grad_output_c, _ = gather_along_first_dim(grad_output_c, ctx.tp_group) + grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) + else: + grad_output_c = ctx.ub_obj_gradout.get_ubuf_output(1) + grad_output_t = None return grad_output_mat, grad_output_c, grad_output_t, grad_bias @@ -718,6 
+835,9 @@ def forward( fwd_ln_sm_margin: int, bwd_ln_sm_margin: int, zero_centered_gamma: bool, + ub_bulk_wgrad: bool, + ub_bulk_dgrad: bool, + ub_split_ag: bool, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible in_features = ln_weight.numel() @@ -733,16 +853,26 @@ def forward( inputmat = cast_if_needed(inputmat, activation_dtype) ln_weight = cast_if_needed(ln_weight, activation_dtype) ln_bias = cast_if_needed(ln_bias, activation_dtype) - # If residual connection is after LN, we need `ln_out` # tensor in higher precision, this comes at the cost # of an extra fp8 cast. + if ub_split_ag: + tp_world_size = get_distributed_world_size(tp_group) + if tp_world_size == 1 or (not is_grad_enabled) or return_layernorm_output: + ub_split_ag = False + if ub_split_ag: + dim_size = list(inputmat.size()) + dim_size[0] = dim_size[0] * tp_world_size + ub_obj_lnout = get_ub("qkv_fprop") + ln_out = ub_obj_lnout.get_ubuf_output(0) if fp8: fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) if not return_layernorm_output: if is_grad_enabled: - ln_out, mu, rsigma = layernorm_fwd_fp8( + if not ub_split_ag: + ln_out = torch.empty_like(inputmat, dtype=torch.uint8) + _, mu, rsigma = layernorm_fwd_fp8( inputmat, ln_weight, ln_bias, @@ -752,6 +882,7 @@ def forward( fp8_dtype_forward, fwd_ln_sm_margin, zero_centered_gamma, + ln_out = ln_out ) else: mu = rsigma = None @@ -783,17 +914,25 @@ def forward( ) else: if is_grad_enabled: - ln_out, mu, rsigma = tex.layernorm_fwd( - inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma - ) + if ub_split_ag: + _, mu, rsigma = tex.layernorm_fwd_noalloc( + inputmat, ln_weight, ln_bias, ln_out, eps, + fwd_ln_sm_margin, zero_centered_gamma + ) + else: + ln_out, mu, rsigma = tex.layernorm_fwd( + inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma + ) else: ln_out, mu, rsigma = layernorm_fwd_inf( inputmat, ln_weight, ln_bias, eps, zero_centered_gamma ), 
None, None ln_out_return = ln_out - # Column Parallel Linear - if parallel_mode == "column" and sequence_parallel: + if ub_split_ag: + ln_out_total = ub_obj_lnout.get_ubuf_output(1) + ln_out = torch.empty_like(ln_out) + elif parallel_mode == "column" and sequence_parallel: ln_out_total, _ = gather_along_first_dim(ln_out, tp_group) else: ln_out_total = ln_out @@ -838,6 +977,9 @@ def forward( bias=bias, use_bias=use_bias, use_split_accumulator=_2X_ACC_FPROP, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None, + ub=ub_obj_lnout if ub_split_ag else None, + extra_output_tensor=ln_out if ub_split_ag else None, ) else: # Cast for native AMP @@ -859,6 +1001,9 @@ def forward( get_workspace(), bias=bias, use_bias=use_bias, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None, + ub=ub_obj_lnout if ub_split_ag else None, + extra_output_tensor=ln_out if ub_split_ag else None, ) if is_grad_enabled: @@ -888,6 +1033,8 @@ def forward( ctx.return_layernorm_output = return_layernorm_output ctx.bwd_ln_sm_margin = bwd_ln_sm_margin ctx.zero_centered_gamma = zero_centered_gamma + ctx.ub_bulk_wgrad = ub_bulk_wgrad + ctx.ub_bulk_dgrad = ub_bulk_dgrad ctx.requires_dgrad = inp.requires_grad # Row Parallel Linear @@ -922,6 +1069,15 @@ def backward( fwd_scale_inverses, ) = ctx.saved_tensors + if ctx.ub_bulk_dgrad: + tp_world_size = get_distributed_world_size(ctx.tp_group) + if tp_world_size == 1: + ctx.ub_bulk_dgrad = False + if ctx.ub_bulk_dgrad: + dim_size = list(ln_out.size()) + dim_size[0] = dim_size[0] * tp_world_size + ub_obj_lnout = get_ub("qkv_dgrad") + ub_obj_lnout.copy_input_to_ubuf(ln_out, 1) ( grad_output, grad_output_c, @@ -931,9 +1087,14 @@ def backward( ctx, grad_outputs[0], ctx.parallel_mode == "row" ) + if ctx.ub_bulk_wgrad: + tp_world_size = get_distributed_world_size(ctx.tp_group) + if tp_world_size == 1: + ctx.ub_bulk_wgrad = False + # Column Parallel Linear # Overlap input AG with dgrad - if ctx.parallel_mode == "column" and 
ctx.sequence_parallel: + if (not ctx.ub_bulk_dgrad) and ctx.parallel_mode == "column" and ctx.sequence_parallel: ln_out_total, handle = gather_along_first_dim( ln_out, ctx.tp_group, async_op=True ) @@ -947,6 +1108,15 @@ def backward( else: accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation + + dgrad_size = list(grad_output.size()) + dgrad_size[1] = weight.size(1) + if ctx.ub_bulk_wgrad: # allocate dgrad output + ub_obj_dgrad = get_ub("qkv_wgrad") + dgrad = ub_obj_dgrad.get_ubuf_output(1) # AllGather output + else: + dgrad = torch.empty (dgrad_size, dtype=ctx.activation_dtype, device=weight.device) + if ctx.fp8: fp8_dtype_forward = get_fp8_te_dtype( ctx.fp8_meta["recipe"], fprop_tensor=True @@ -956,7 +1126,7 @@ def backward( ) # DGRAD: Evaluated unconditionally to feed into Linear backward - dgrad = fp8_gemm( + _ = fp8_gemm( weight_t_fp8, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, @@ -967,25 +1137,35 @@ def backward( fp8_dtype_backward, ctx.activation_dtype, get_workspace(), + out=dgrad, use_split_accumulator=_2X_ACC_DGRAD, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_AG if ctx.ub_bulk_dgrad else None, + ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None ) else: # DGRAD: Evaluated unconditionally to feed into Linear backward - dgrad, _, _ = gemm( + _, _, _ = gemm( weight, grad_output, ctx.activation_dtype, get_workspace(), + out=dgrad, layout="NN", grad=True, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_AG if ctx.ub_bulk_dgrad else None, + ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None ) + if ctx.ub_bulk_dgrad: + ln_out_total = ub_obj_lnout.get_ubuf_output(1) # Overlap dgrad-RS/AR with wgrad if ctx.parallel_mode == "column" and ctx.sequence_parallel: - handle.wait() - dgrad, handle = reduce_scatter_along_first_dim( - dgrad, ctx.tp_group, async_op=True - ) + if not ctx.ub_bulk_dgrad: + handle.wait() + if not ctx.ub_bulk_wgrad: + dgrad, handle = reduce_scatter_along_first_dim( + dgrad, ctx.tp_group, async_op=True + ) elif ctx.parallel_mode == 
"column" and ctx.tensor_parallel: dgrad, handle = allreduce(dgrad, ctx.tp_group, async_op=True) @@ -1008,6 +1188,9 @@ def backward( accumulate=accumulate_wgrad_into_param_main_grad, out=weight.main_grad if ctx.fuse_wgrad_accumulation else None, use_split_accumulator=_2X_ACC_WGRAD, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS + if ctx.ub_bulk_wgrad else None, + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None ) else: ln_out_total_c = cast_from_fp8( @@ -1026,6 +1209,9 @@ def backward( grad=True, accumulate=accumulate_wgrad_into_param_main_grad, out=weight.main_grad if ctx.fuse_wgrad_accumulation else None, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS + if ctx.ub_bulk_wgrad else None, + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None ) else: # WGRAD @@ -1039,10 +1225,15 @@ def backward( use_bias=ctx.use_bias, accumulate=accumulate_wgrad_into_param_main_grad, out=weight.main_grad if ctx.fuse_wgrad_accumulation else None, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None ) + + if ctx.ub_bulk_wgrad: + dgrad = ub_obj_dgrad.get_ubuf_output(0) # Reduce-scatter output # Column Parallel Linear - if ctx.parallel_mode == "column" and ctx.tensor_parallel and handle is not None: + elif ctx.parallel_mode == "column" and ctx.tensor_parallel and handle is not None: handle.wait() # LayerNorm gradient @@ -1086,6 +1277,9 @@ def backward( None, None, None, + None, + None, + None, ) @@ -1179,6 +1373,9 @@ def __init__( skip_weight_param_allocation: bool = False, parameters_split: Optional[Tuple[str, ...]] = None, zero_centered_gamma: bool = False, + ub_bulk_wgrad: bool = False, + ub_bulk_dgrad: bool = False, + ub_split_ag: bool = False, ) -> None: super().__init__() self.in_features = in_features @@ -1190,6 +1387,14 @@ def __init__( self.return_layernorm_output = return_layernorm_output self.parameters_split = parameters_split self.zero_centered_gamma = zero_centered_gamma + self.ub_bulk_wgrad = ub_bulk_wgrad + 
self.ub_bulk_dgrad = ub_bulk_dgrad + self.ub_split_ag = ub_split_ag + + if ub_bulk_wgrad or ub_bulk_dgrad or ub_split_ag: + assert ( + tex.userbuf_comm_available() + ), "Userbuffer communication backend not available." if tp_group is None: self.tp_size = tp_size @@ -1308,6 +1513,7 @@ def __init__( self.fp8_weight_shapes.append(torch.Size((self.out_features, self.in_features))) + # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM if self.parallel_mode == "row" and self.apply_bias: @@ -1412,6 +1618,9 @@ def forward( self.fwd_ln_sm_margin, self.bwd_ln_sm_margin, self.zero_centered_gamma, + self.ub_bulk_wgrad, + self.ub_bulk_dgrad, + self.ub_split_ag, ) out = fwd_fn(*args) @@ -1455,6 +1664,8 @@ def forward( activation_dtype: torch.dtype, parallel_mode: Union[str, None], is_grad_enabled: bool, + ub_split_rs: bool, + ub_split_ag: bool, ) -> torch.Tensor: # Make sure input dimensions are compatible in_features = weight.shape[-1] @@ -1466,6 +1677,10 @@ def forward( update_fp8_weights = is_first_microbatch is None or is_first_microbatch + if ub_split_rs: + tp_world_size = get_distributed_world_size(tp_group) + if tp_world_size == 1: + ub_split_rs = False # Cast for native AMP inputmat = cast_if_needed(inputmat, activation_dtype) inputmat_no_fp8 = inputmat @@ -1529,7 +1744,19 @@ def forward( fp8_dtype_forward, ) - out = fp8_gemm( + if ub_split_rs: + ub_obj_projout = get_ub("proj_fprop") + out = ub_obj_projout.get_ubuf_output(1) + dim_size = list(inputmat_total.size()) + dim_size[0] = dim_size[0] // tp_world_size + dim_size[1] = weight.size(0) + rs_out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + else: + dim_size = list(inputmat_total.size()) + dim_size[1] = weight.size(0) + out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + + _ = fp8_gemm( weight_fp8, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, @@ -1543,6 +1770,10 @@ def forward( bias=bias, 
use_bias=use_bias, use_split_accumulator=_2X_ACC_FPROP, + out=out, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else None, + ub=ub_obj_projout if ub_split_rs else None, + extra_output_tensor=rs_out if ub_split_rs else None, ) else: # Cast for native AMP @@ -1557,13 +1788,29 @@ def forward( fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_WEIGHT] = \ torch.amax(weight).float() - out, _, _ = gemm( + if ub_split_rs: + ub_obj_projout = get_ub("proj_fprop") + out = ub_obj_projout.get_ubuf_output(1) + dim_size = list(inputmat_total.size()) + dim_size[0] = dim_size[0] // tp_world_size + dim_size[1] = weight.size(0) + rs_out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + else: + dim_size = list(inputmat_total.size()) + dim_size[1] = weight.size(0) + out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + + _, _, _ = gemm( weight, inputmat_total, activation_dtype, get_workspace(), bias=bias, use_bias=use_bias, + out=out, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else None, + ub=ub_obj_projout if ub_split_rs else None, + extra_output_tensor=rs_out if ub_split_rs else None, ) if is_grad_enabled: @@ -1586,11 +1833,14 @@ def forward( ctx.inp_shape = inp.shape ctx.parallel_mode = parallel_mode ctx.tp_group = tp_group + ctx.ub_split_ag = ub_split_ag ctx.tp_size = tp_size ctx.requires_dgrad = inp.requires_grad # Row Parallel Linear - if parallel_mode == "row" and sequence_parallel: + if ub_split_rs: + out = rs_out + elif parallel_mode == "row" and sequence_parallel: out, _ = reduce_scatter_along_first_dim(out, tp_group) elif parallel_mode == "row" and tensor_parallel: out, _ = allreduce(out, tp_group) @@ -1614,6 +1864,14 @@ def backward( fwd_scale_inverses, ) = ctx.saved_tensors + if ctx.ub_split_ag: + tp_world_size = get_distributed_world_size(ctx.tp_group) + if tp_world_size == 1: + ctx.ub_split_ag = False + if ctx.ub_split_ag: + dim_size = 
list(grad_output.size()) + dim_size[0] = dim_size[0] * tp_world_size + ctx.ub_obj_gradout = get_ub("proj_dgrad") ( grad_output, grad_output_c, @@ -1667,6 +1925,8 @@ def backward( ctx.activation_dtype, get_workspace(), use_split_accumulator=_2X_ACC_DGRAD, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None, + ub=ctx.ub_obj_gradout if ctx.ub_split_ag else None, ) else: dgrad, _, _ = gemm( @@ -1676,6 +1936,8 @@ def backward( get_workspace(), layout="NN", grad=True, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None, + ub=ctx.ub_obj_gradout if ctx.ub_split_ag else None, ) # Overlap dgrad-RS/AR with wgrad @@ -1691,6 +1953,8 @@ def backward( if ctx.fp8: # WGRAD if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: + if ctx.ub_split_ag: + grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) wgrad = fp8_gemm( inputmat_t_total, fwd_scale_inverses, @@ -1757,6 +2021,8 @@ def backward( None, None, None, + None, + None, ) @@ -1838,6 +2104,8 @@ def __init__( parallel_mode: Optional[str] = None, skip_weight_param_allocation: bool = False, parameters_split: Optional[Tuple[str, ...]] = None, + ub_split_rs: bool = False, + ub_split_ag: bool = False, ) -> None: super().__init__() self.in_features = in_features @@ -1847,6 +2115,13 @@ def __init__( self.return_bias = return_bias self.apply_bias = bias and not return_bias self.parameters_split = parameters_split + self.ub_split_rs = ub_split_rs + self.ub_split_ag = ub_split_ag + + if ub_split_rs or ub_split_ag: + assert ( + tex.userbuf_comm_available() + ), "Userbuffer communication backend not available." 
if tp_group is None: self.tp_size = tp_size @@ -2028,6 +2303,8 @@ def forward( self.activation_dtype, self.parallel_mode, torch.is_grad_enabled(), + self.ub_split_rs, + self.ub_split_ag, ) out = linear_fn(*args) @@ -2078,6 +2355,10 @@ def forward( fwd_ln_sm_margin: int, bwd_ln_sm_margin: int, zero_centered_gamma: bool, + ub_bulk_wgrad: bool, + ub_bulk_dgrad: bool, + ub_split_rs: bool, + ub_split_ag: bool, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible in_features = ln_weight.numel() @@ -2094,6 +2375,18 @@ def forward( ln_weight = cast_if_needed(ln_weight, activation_dtype) ln_bias = cast_if_needed(ln_bias, activation_dtype) + if ub_split_ag: + tp_world_size = get_distributed_world_size(tp_group) + if tp_world_size == 1 or (not is_grad_enabled) or return_layernorm_output: + ub_split_ag = False + if ub_split_ag: + ub_obj_lnout = get_ub("fc1_fprop") + ln_out = ub_obj_lnout.get_ubuf_output(0) + if ub_split_rs: + tp_world_size = get_distributed_world_size(tp_group) + if tp_world_size == 1: + ub_split_rs = False + # If residual connection is after LN, we need `ln_out` # tensor in higher precision, this comes at the cost # of an extra fp8 cast. 
@@ -2101,7 +2394,9 @@ def forward( fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) if not return_layernorm_output: if is_grad_enabled: - ln_out, mu, rsigma = layernorm_fwd_fp8( + if not ub_split_ag: + ln_out = torch.empty_like(inputmat, dtype=torch.uint8) + _, mu, rsigma = layernorm_fwd_fp8( inputmat, ln_weight, ln_bias, @@ -2111,6 +2406,7 @@ def forward( fp8_dtype_forward, fwd_ln_sm_margin, zero_centered_gamma, + ln_out = ln_out, ) else: ln_out = layernorm_fwd_fp8_inf( @@ -2135,9 +2431,15 @@ def forward( ) else: if is_grad_enabled: - ln_out, mu, rsigma = tex.layernorm_fwd( - inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma - ) + if ub_split_ag: + _, mu, rsigma = tex.layernorm_fwd_noalloc( + inputmat, ln_weight, ln_bias, ln_out, eps, + fwd_ln_sm_margin, zero_centered_gamma + ) + else: + ln_out, mu, rsigma = tex.layernorm_fwd( + inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma + ) else: ln_out, mu, rsigma = layernorm_fwd_inf( inputmat, ln_weight, ln_bias, eps, zero_centered_gamma @@ -2145,7 +2447,10 @@ def forward( ln_out_return = ln_out # Column Parallel Linear - if set_parallel_mode and sequence_parallel: + if ub_split_ag: + ln_out_total = ub_obj_lnout.get_ubuf_output(1) + ln_out = torch.empty_like(ln_out) + elif set_parallel_mode and sequence_parallel: ln_out_total, _ = gather_along_first_dim(ln_out, tp_group) else: ln_out_total = ln_out @@ -2208,6 +2513,9 @@ def forward( bias=fc1_bias, use_bias=use_fc1_bias, use_split_accumulator=_2X_ACC_FPROP, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None, + ub=ub_obj_lnout if ub_split_ag else None, + extra_output_tensor=ln_out if ub_split_ag else None, ) gelu_out = fp8_gelu( @@ -2217,7 +2525,19 @@ def forward( fp8_dtype_forward, ) - fc2_out = fp8_gemm( + if ub_split_rs: + ub_obj_fc2out = get_ub("fc2_fprop") + fc2_out = ub_obj_fc2out.get_ubuf_output(1) + dim_size = list(gelu_out.size()) + dim_size[0] = dim_size[0] // 
tp_world_size + dim_size[1] = fc2_weight.size(0) + rs_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + else: + dim_size = list(gelu_out.size()) + dim_size[1] = fc2_weight.size(0) + fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + + _ = fp8_gemm( fc2_weight_fp8, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM2_WEIGHT, @@ -2231,6 +2551,10 @@ def forward( bias=fc2_bias, use_bias=use_fc2_bias, use_split_accumulator=_2X_ACC_FPROP, + out=fc2_out, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else None, + ub=ub_obj_fc2out if ub_split_rs else None, + extra_output_tensor=rs_out if ub_split_rs else None, ) else: # Cast for native AMP @@ -2259,6 +2583,9 @@ def forward( bias=fc1_bias, use_bias=(not bias_gelu_nvfusion) and use_fc1_bias, gelu=not bias_gelu_nvfusion, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None, + ub=ub_obj_lnout if ub_split_ag else None, + extra_output_tensor=ln_out if ub_split_ag else None, ) if bias_gelu_nvfusion: @@ -2276,14 +2603,30 @@ def forward( fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM2_WEIGHT] = \ torch.amax(fc2_weight).float() - fc2_out, _, _ = gemm( + if ub_split_rs: + ub_obj_fc2out = get_ub("fc2_fprop") + fc2_out = ub_obj_fc2out.get_ubuf_output(1) + dim_size = list(gelu_out.size()) + dim_size[0] = dim_size[0] // tp_world_size + dim_size[1] = fc2_weight.size(0) + rs_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + else: + dim_size = list(gelu_out.size()) + dim_size[1] = fc2_weight.size(0) + fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + _, _, _ = gemm( fc2_weight, gelu_out, activation_dtype, get_workspace(), bias=fc2_bias, use_bias=use_fc2_bias, + out=fc2_out, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else None, + ub=ub_obj_fc2out if ub_split_rs else None, + extra_output_tensor=rs_out if ub_split_rs else None, ) + if 
is_grad_enabled: ctx.save_for_backward( inputmat, @@ -2317,10 +2660,15 @@ def forward( ctx.set_parallel_mode = set_parallel_mode ctx.bwd_ln_sm_margin = bwd_ln_sm_margin ctx.zero_centered_gamma = zero_centered_gamma + ctx.ub_bulk_wgrad = ub_bulk_wgrad + ctx.ub_bulk_dgrad = ub_bulk_dgrad + ctx.ub_split_ag = ub_split_ag ctx.requires_dgrad = inp.requires_grad # Row Parallel Linear - if set_parallel_mode and sequence_parallel: + if ub_split_rs: + fc2_out = rs_out + elif set_parallel_mode and sequence_parallel: fc2_out, _ = reduce_scatter_along_first_dim(fc2_out, tp_group) elif set_parallel_mode and tensor_parallel: fc2_out, _ = allreduce(fc2_out, tp_group) @@ -2356,6 +2704,24 @@ def backward( fwd_scale_inverses, ) = ctx.saved_tensors + if ctx.ub_bulk_dgrad: + tp_world_size = get_distributed_world_size(ctx.tp_group) + if tp_world_size == 1: + ctx.ub_bulk_dgrad = False + if ctx.ub_bulk_dgrad: + dim_size = list(ln_out.size()) + dim_size[0] = dim_size[0] * tp_world_size + ub_obj_lnout = get_ub("fc1_dgrad") + ub_obj_lnout.copy_input_to_ubuf(ln_out, 1) + if ctx.ub_split_ag: + tp_world_size = get_distributed_world_size(ctx.tp_group) + if tp_world_size == 1: + ctx.ub_split_ag = False + if ctx.ub_split_ag: + dim_size = list(grad_outputs[0].size()) + dim_size[0] = dim_size[0] * tp_world_size + ctx.ub_obj_gradout = get_ub("fc2_dgrad") + ctx.use_bias = ctx.use_fc2_bias # For grad_output_preprocess ( grad_output, @@ -2365,10 +2731,13 @@ def backward( ) = TransformerEngineBaseModule.grad_output_preprocess( ctx, grad_outputs[0], True ) - + if ctx.ub_bulk_wgrad: + tp_world_size = get_distributed_world_size(ctx.tp_group) + if tp_world_size == 1: + ctx.ub_bulk_wgrad = False # Column Parallel Linear # Overlap input AG with dgrad - if ctx.set_parallel_mode and ctx.sequence_parallel: + if (not ctx.ub_bulk_dgrad) and ctx.set_parallel_mode and ctx.sequence_parallel: ln_out_total, handle = gather_along_first_dim( ln_out, ctx.tp_group, async_op=True ) @@ -2403,8 +2772,11 @@ def backward( 
ctx.activation_dtype, get_workspace(), use_split_accumulator=_2X_ACC_DGRAD, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None, + ub=ctx.ub_obj_gradout if ctx.ub_split_ag else None, ) - + if ctx.ub_split_ag: + grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) # FC2 WGRAD if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: if fc2_weight.requires_grad: @@ -2469,8 +2841,17 @@ def backward( ) dgelu_t = None + fc1_dgrad_size = list(dgelu.size()) + fc1_dgrad_size[1] = fc1_weight.size(1) + if ctx.ub_bulk_wgrad: # allocate dgrad output + ub_obj_dgrad = get_ub("fc1_wgrad") + fc1_dgrad = ub_obj_dgrad.get_ubuf_output(1) # AllGather output + else: + fc1_dgrad = torch.empty( + fc1_dgrad_size, dtype=ctx.activation_dtype, device=fc1_weight.device + ) # FC1 DGRAD: Unconditional - fc1_dgrad = fp8_gemm( + _ = fp8_gemm( fc1_weight_t_fp8, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, @@ -2481,7 +2862,10 @@ def backward( fp8_dtype_backward, ctx.activation_dtype, get_workspace(), + out=fc1_dgrad, use_split_accumulator=_2X_ACC_DGRAD, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_AG if ctx.ub_bulk_dgrad else None, + ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None ) else: # FC2 DGRAD; Unconditional @@ -2494,6 +2878,8 @@ def backward( gelu=not ctx.bias_gelu_nvfusion, grad=True, gelu_input=fc1_out, + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None, + ub=ctx.ub_obj_gradout if ctx.ub_split_ag else None, ) # FC2 WGRAD @@ -2515,22 +2901,38 @@ def backward( else: dgelu = fc2_dgrad + fc1_dgrad_size = list(dgelu.size()) + fc1_dgrad_size[1] = fc1_weight.size(1) + if ctx.ub_bulk_wgrad: # allocate dgrad output + ub_obj_dgrad = get_ub("fc1_wgrad") + fc1_dgrad = ub_obj_dgrad.get_ubuf_output(1) # AllGather output + else: + fc1_dgrad = torch.empty( + fc1_dgrad_size, dtype=ctx.activation_dtype, device=fc1_weight.device + ) # FC1 DGRAD: Unconditional - fc1_dgrad, _, _ = gemm( + _, _, _ = gemm( fc1_weight, dgelu, 
ctx.activation_dtype, get_workspace(), + out=fc1_dgrad, layout="NN", grad=True, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_AG if ctx.ub_bulk_dgrad else None, + ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None ) + if ctx.ub_bulk_dgrad: + ln_out_total = ub_obj_lnout.get_ubuf_output(1) # Overlap dgrad-RS/AR with wgrad if ctx.set_parallel_mode and ctx.sequence_parallel: - handle.wait() - fc1_dgrad, handle = reduce_scatter_along_first_dim( - fc1_dgrad, ctx.tp_group, async_op=True - ) + if not ctx.ub_bulk_dgrad: + handle.wait() + if not ctx.ub_bulk_wgrad: + fc1_dgrad, handle = reduce_scatter_along_first_dim( + fc1_dgrad, ctx.tp_group, async_op=True + ) elif ctx.set_parallel_mode and ctx.tensor_parallel: fc1_dgrad, handle = allreduce(fc1_dgrad, ctx.tp_group, async_op=True) @@ -2555,6 +2957,9 @@ def backward( if ctx.fuse_wgrad_accumulation else None, use_split_accumulator=_2X_ACC_WGRAD, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS + if ctx.ub_bulk_wgrad else None, + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None, ) else: ln_out_total_c = cast_from_fp8( @@ -2575,6 +2980,9 @@ def backward( out=fc1_weight.main_grad if ctx.fuse_wgrad_accumulation else None, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS + if ctx.ub_bulk_wgrad else None, + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None, ) else: # FC1 WGRAD @@ -2588,6 +2996,8 @@ def backward( use_bias=not ctx.bias_gelu_nvfusion, accumulate=accumulate_wgrad_into_param_main_grad, out=fc1_weight.main_grad if ctx.fuse_wgrad_accumulation else None, + ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None ) if ctx.bias_gelu_nvfusion: @@ -2596,7 +3006,9 @@ def backward( fc1_wgrad, fc1_bias_grad, _ = fc1_wgrad_outputs # Column Parallel Linear - if ctx.set_parallel_mode and ctx.tensor_parallel and handle is not None: + if ctx.ub_bulk_wgrad: + fc1_dgrad = ub_obj_dgrad.get_ubuf_output(0) # Reduce-scatter output + elif ctx.set_parallel_mode and ctx.tensor_parallel and 
handle is not None: handle.wait() # LayerNorm gradient @@ -2643,6 +3055,10 @@ def backward( None, None, None, + None, + None, + None, + None, ) @@ -2741,6 +3157,10 @@ def __init__( micro_batch_size: Optional[int] = None, set_parallel_mode: bool = False, zero_centered_gamma: bool = False, + ub_bulk_wgrad: bool = False, + ub_bulk_dgrad: bool = False, + ub_split_rs: bool = False, + ub_split_ag: bool = False, ) -> None: super().__init__() @@ -2752,6 +3172,15 @@ def __init__( self.bias_gelu_nvfusion = bool(int(os.getenv("NVTE_BIAS_GELU_NVFUSION", "1"))) self.set_parallel_mode = set_parallel_mode self.zero_centered_gamma = zero_centered_gamma + self.ub_bulk_wgrad = ub_bulk_wgrad + self.ub_bulk_dgrad = ub_bulk_dgrad + self.ub_split_rs = ub_split_rs + self.ub_split_ag = ub_split_ag + + if ub_bulk_wgrad or ub_bulk_dgrad or ub_split_rs or ub_split_ag: + assert ( + tex.userbuf_comm_available() + ), "Userbuffer communication backend not available." if tp_group is None: self.tp_size = tp_size @@ -2948,6 +3377,10 @@ def forward( self.fwd_ln_sm_margin, self.bwd_ln_sm_margin, self.zero_centered_gamma, + self.ub_bulk_wgrad, + self.ub_bulk_dgrad, + self.ub_split_rs, + self.ub_split_ag, ) out = fwd_fn(*args) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 83582e2aae..52d303e8f4 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -15,6 +15,7 @@ from flash_attn.flash_attn_interface import flash_attn_unpadded_func +import transformer_engine_extensions as tex from transformer_engine.pytorch.module import LayerNormLinear, Linear, LayerNormMLP, LayerNorm from transformer_engine.pytorch.jit import ( set_jit_fusion_options, @@ -495,6 +496,10 @@ def __init__( fuse_qkv_params: bool = False, zero_centered_gamma: bool = False, qkv_weight_interleaved: bool = True, + ub_bulk_wgrad: bool = False, + ub_bulk_dgrad: bool = False, + ub_split_rs: bool = False, + ub_split_ag: bool = False, bias: 
bool = True, ) -> None: super().__init__() @@ -547,6 +552,9 @@ def __init__( return_layernorm_output=return_layernorm_output, parameters_split=("query_", "key_", "value_") if not fuse_qkv_params else None, zero_centered_gamma=zero_centered_gamma, + ub_bulk_wgrad=ub_bulk_wgrad, + ub_bulk_dgrad=ub_bulk_dgrad, + ub_split_ag=ub_split_ag, **common_gemm_kwargs, ) else: @@ -572,6 +580,9 @@ def __init__( parallel_mode=qkv_parallel_mode, return_layernorm_output=return_layernorm_output, zero_centered_gamma=zero_centered_gamma, + ub_bulk_wgrad=ub_bulk_wgrad, + ub_bulk_dgrad=ub_bulk_dgrad, + ub_split_ag=ub_split_ag, **common_gemm_kwargs, ) else: @@ -616,6 +627,8 @@ def __init__( bias=bias, return_bias=True, parallel_mode="row" if set_parallel_mode else None, + ub_split_rs=ub_split_rs, + ub_split_ag=ub_split_ag, **common_gemm_kwargs, ) @@ -911,6 +924,12 @@ class TransformerLayer(torch.nn.Module): `set_tensor_parallel_group(tp_group)` method on the initialized module before the forward pass to supply the tensor parallel group needed for tensor and sequence parallel collectives. + ub_bulk_wgrad: bool, default = False + Bulk overlap UserBuffer ReduceScatter | WGRAD GEMM + ub_bulk_dgrad: bool, default = False + Bulk overlap UserBuffer AllGather | DGRAD GEMM + ub_split_ag: bool, default = False + Split pipelined overlap UserBuffer AllGather -> GEMM Optimization parameters ----------------------- @@ -970,6 +989,7 @@ def __init__( fuse_qkv_params: bool = False, zero_centered_gamma: bool = False, qkv_weight_interleaved: bool = True, + ub_tp_comm_overlap: bool = False, bias: bool = True, ) -> None: super().__init__() @@ -980,6 +1000,16 @@ def __init__( category=DeprecationWarning, ) + if ub_tp_comm_overlap: + assert ( + tex.userbuf_comm_available() + ), "Userbuffer communication backend not available." 
+ + ub_tp_comm_overlap = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_OVERLAP", "1"))) + ub_bulk_wgrad = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_BULK_WGRAD", "1"))) + ub_bulk_dgrad = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_BULK_DGRAD", "1"))) + ub_split_ag = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_SPLIT_AG", "1"))) + ub_split_rs = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_SPLIT_RS", "1"))) bias_dropout_fusion = bool(int(os.getenv("NVTE_BIAS_DROPOUT_FUSION", "1"))) self.layer_number = layer_number self.output_layernorm = output_layernorm @@ -1037,6 +1067,10 @@ def __init__( "fuse_qkv_params": fuse_qkv_params, "zero_centered_gamma": zero_centered_gamma, "qkv_weight_interleaved" : qkv_weight_interleaved, + "ub_bulk_wgrad" : ub_bulk_wgrad, + "ub_bulk_dgrad" : ub_bulk_dgrad, + "ub_split_ag" : ub_split_ag, + "ub_split_rs" : ub_split_rs, } self.self_attention = MultiHeadAttention( @@ -1080,6 +1114,10 @@ def __init__( micro_batch_size=micro_batch_size, set_parallel_mode=set_parallel_mode, zero_centered_gamma=zero_centered_gamma, + ub_bulk_wgrad=ub_bulk_wgrad, + ub_bulk_dgrad=ub_bulk_dgrad, + ub_split_rs=ub_split_rs, + ub_split_ag=ub_split_ag, ) self.hidden_dropout = hidden_dropout diff --git a/transformer_engine/tensorflow/csrc/extensions.cu b/transformer_engine/tensorflow/csrc/extensions.cu index aa2ad0b3ba..8cda79a7ed 100644 --- a/transformer_engine/tensorflow/csrc/extensions.cu +++ b/transformer_engine/tensorflow/csrc/extensions.cu @@ -568,7 +568,7 @@ py::object TFE_Py_TeGemm_wrapper( nvte_cublas_gemm(a_tensor.data(), b_tensor.data(), d_tensor.data(), bias_tensor.data(), gelu_input_tensor.data(), transa, transb, grad, workspace_tensor.data(), accumulate, - use_split_accumulate, stream); + use_split_accumulate, 0, stream); auto d_eager = CreateTensor(d_ptr, d_shape, otype); if (use_gelu && !grad) { From 7bf886d1e9cfc23146f0d6da4db7edfcabad3338 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 21 Apr 2023 
00:25:12 -0700 Subject: [PATCH 019/427] Move userbuffer to PyTorch (#162) * Initial refactor; linker error Signed-off-by: Kirthi Shankar Sivamani * Fix linking issue and make mpi conditional Signed-off-by: Kirthi Shankar Sivamani * Fix TF/JAX build Signed-off-by: Kirthi Shankar Sivamani * Use max SMs at the last RS chunk in pipelined overlap Co-authored-by: Sangkug Lym Signed-off-by: Kirthi Shankar Sivamani * lint Signed-off-by: Kirthi Shankar Sivamani * Make userbuffers support opt-in Decouple userbuffers from MPI. Refactor MPI handling in build system. Standardize names to "userbuffers". Signed-off-by: Tim Moon * Lint Signed-off-by: Tim Moon --------- Signed-off-by: Kirthi Shankar Sivamani Signed-off-by: Tim Moon Co-authored-by: Sangkug Lym Co-authored-by: Tim Moon --- qa/L0_cppunittest/test.sh | 6 +- setup.py | 22 +++--- tests/cpp/CMakeLists.txt | 7 +- tests/cpp/operator/CMakeLists.txt | 4 - transformer_engine/CMakeLists.txt | 4 +- transformer_engine/common/CMakeLists.txt | 74 ++++++++----------- transformer_engine/common/__init__.py | 15 ++-- .../pytorch/csrc/comm_gemm_overlap.h | 13 +++- transformer_engine/pytorch/csrc/extensions.cu | 12 +-- .../pytorch/csrc/userbuffers/CMakeLists.txt | 33 +++++++++ .../csrc/userbuffers}/userbuffers-host.cpp | 17 +++-- .../csrc/userbuffers}/userbuffers.cu | 2 +- .../csrc/userbuffers}/userbuffers.h | 2 +- 13 files changed, 117 insertions(+), 94 deletions(-) create mode 100644 transformer_engine/pytorch/csrc/userbuffers/CMakeLists.txt rename transformer_engine/{common/comm_gemm_overlap => pytorch/csrc/userbuffers}/userbuffers-host.cpp (96%) rename transformer_engine/{common/comm_gemm_overlap => pytorch/csrc/userbuffers}/userbuffers.cu (99%) rename transformer_engine/{common/include/transformer_engine => pytorch/csrc/userbuffers}/userbuffers.h (99%) diff --git a/qa/L0_cppunittest/test.sh b/qa/L0_cppunittest/test.sh index 73a27a1fcd..6333f33fb1 100644 --- a/qa/L0_cppunittest/test.sh +++ b/qa/L0_cppunittest/test.sh @@ -9,11 +9,7 @@ 
set -e TE_LIB_PATH=`pip show transformer-engine | grep Location | cut -d ' ' -f 2` export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH -# Find MPI -MPI_HOME=${MPI_HOME:-/usr/local/mpi} -NVTE_MPI_INCLUDE="$MPI_HOME/lib" - cd $TE_PATH/tests/cpp -cmake -GNinja -Bbuild -DNVTE_MPI_INCLUDE=$NVTE_MPI_INCLUDE . +cmake -GNinja -Bbuild . cmake --build build ctest --test-dir build -j4 diff --git a/setup.py b/setup.py index decdce51a4..cb0c37fe3a 100644 --- a/setup.py +++ b/setup.py @@ -21,9 +21,10 @@ te_version = f.readline() CUDA_HOME = os.environ.get("CUDA_HOME", "/usr/local/cuda") -MPI_HOME = os.environ.get("MPI_HOME", "/usr/local/mpi") -NVTE_MPI_FOUND = os.path.exists(MPI_HOME) -NVTE_MPI_INCLUDE = os.path.join(MPI_HOME, "include") +NVTE_WITH_USERBUFFERS = int(os.environ.get("NVTE_WITH_USERBUFFERS", "0")) +if NVTE_WITH_USERBUFFERS: + MPI_HOME = os.environ.get("MPI_HOME", "") + assert MPI_HOME, "MPI_HOME must be set if NVTE_WITH_USERBUFFERS=1" def get_cuda_bare_metal_version(cuda_dir): raw_output = subprocess.check_output( @@ -70,8 +71,8 @@ def extra_compiler_flags(): "--expt-extended-lambda", "--use_fast_math", ] - if NVTE_MPI_FOUND: - extra_flags.append("-DNVTE_MPI_FOUND") + if NVTE_WITH_USERBUFFERS: + extra_flags.append("-DNVTE_WITH_USERBUFFERS") return extra_flags @@ -105,8 +106,9 @@ def make_abs_path(l): "transformer_engine/common/include", "transformer_engine/pytorch/csrc", ] -if (framework in ("all", "pytorch")) and NVTE_MPI_FOUND: - include_dirs.append(NVTE_MPI_INCLUDE) +if NVTE_WITH_USERBUFFERS: + if MPI_HOME: + include_dirs.append(os.path.join(MPI_HOME, "include")) include_dirs = make_abs_path(include_dirs) args = sys.argv.copy() @@ -165,9 +167,7 @@ def run(self, extensions): self.pytorch_build_extensions.run() def cmake_flags(self): - if not NVTE_MPI_FOUND: - return [] - return ["-DNVTE_MPI_FOUND=1", f"-DNVTE_MPI_INCLUDE={NVTE_MPI_INCLUDE}"] + return [] @staticmethod def install_requires(): @@ -338,6 +338,8 @@ def __init__(self, *args, **kwargs) -> None: 
self.dlfw_builder.append(functor(*args, **kwargs)) flags = [] + if NVTE_WITH_USERBUFFERS: + flags.append('-DNVTE_WITH_USERBUFFERS=ON') for builder in self.dlfw_builder: flags = flags + builder.cmake_flags() diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 631b356fec..8bdfb89df2 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -19,7 +19,7 @@ add_subdirectory(../../3rdparty/googletest ${PROJECT_BINARY_DIR}/googletest) enable_testing() -include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) +include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) if(NOT DEFINED TE_LIB_PATH) execute_process(COMMAND bash -c "pip show transformer-engine | grep Location | cut -d ' ' -f 2 | tr -d '\n'" @@ -28,11 +28,6 @@ endif() find_library(TE_LIB NAMES transformer_engine PATHS ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED) -if(EXISTS ${NVTE_MPI_INCLUDE}) - find_library(MPI_LIB NAMES mpi PATHS ${NVTE_MPI_INCLUDE} REQUIRED) - message(STATUS "Found MPI library: ${MPI_LIB}") -endif() - message(STATUS "Found transformer_engine library: ${TE_LIB}") include_directories(../../transformer_engine/common/include) include_directories(${CMAKE_SOURCE_DIR}) diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt index a77cf98a73..65a7ccaebd 100644 --- a/tests/cpp/operator/CMakeLists.txt +++ b/tests/cpp/operator/CMakeLists.txt @@ -19,10 +19,6 @@ add_executable(test_operator list(APPEND test_operator_LINKER_LIBS CUDA::cudart GTest::gtest_main ${TE_LIB}) -if(EXISTS ${NVTE_MPI_INCLUDE}) - list(APPEND test_operator_LINKER_LIBS ${MPI_LIB}) -endif() - target_link_libraries(test_operator PUBLIC ${test_operator_LINKER_LIBS}) target_compile_options(test_operator PRIVATE -O2) diff --git a/transformer_engine/CMakeLists.txt b/transformer_engine/CMakeLists.txt index a03cd42806..336f41be70 100644 --- a/transformer_engine/CMakeLists.txt +++ b/transformer_engine/CMakeLists.txt @@ -8,7 +8,6 @@ if(NOT DEFINED 
CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90) endif() - set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -26,6 +25,9 @@ find_package(Python COMPONENTS Interpreter Development REQUIRED) include_directories(${PROJECT_SOURCE_DIR}) add_subdirectory(common) +if(NVTE_WITH_USERBUFFERS) + add_subdirectory(pytorch/csrc/userbuffers) +endif() option(ENABLE_JAX "Enable JAX in the building workflow." OFF) if(ENABLE_JAX) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 7459f77e4f..c5bc6bb0f1 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -2,54 +2,42 @@ # # See LICENSE for license information. +# Configure Transformer Engine library set(transformer_engine_SOURCES) -list(APPEND transformer_engine_SOURCES transformer_engine.cpp - transpose/cast_transpose.cu - transpose/transpose.cu - transpose/cast_transpose_fusion.cu - transpose/transpose_fusion.cu - transpose/multi_cast_transpose.cu - activation/gelu.cu - gemm/cublaslt_gemm.cu - layer_norm/ln_api.cpp - layer_norm/ln_bwd_semi_cuda_kernel.cu - layer_norm/ln_fwd_cuda_kernel.cu - rmsnorm/rmsnorm_api.cpp - rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu - rmsnorm/rmsnorm_fwd_cuda_kernel.cu - util/cast.cu - fused_softmax/scaled_masked_softmax.cu - fused_softmax/scaled_upper_triang_masked_softmax.cu) - -if(NVTE_MPI_FOUND) - list(APPEND transformer_engine_SOURCES comm_gemm_overlap/userbuffers.cu - comm_gemm_overlap/userbuffers-host.cpp) -endif() - +list(APPEND transformer_engine_SOURCES + transformer_engine.cpp + transpose/cast_transpose.cu + transpose/transpose.cu + transpose/cast_transpose_fusion.cu + transpose/transpose_fusion.cu + transpose/multi_cast_transpose.cu + activation/gelu.cu + gemm/cublaslt_gemm.cu + layer_norm/ln_api.cpp + layer_norm/ln_bwd_semi_cuda_kernel.cu + layer_norm/ln_fwd_cuda_kernel.cu + rmsnorm/rmsnorm_api.cpp + 
rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu + rmsnorm/rmsnorm_fwd_cuda_kernel.cu + util/cast.cu + fused_softmax/scaled_masked_softmax.cu + fused_softmax/scaled_upper_triang_masked_softmax.cu) add_library(transformer_engine SHARED ${transformer_engine_SOURCES}) - -target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") - -list(APPEND transformer_engine_LINKER_LIBS CUDA::cublas CUDA::cudart CUDA::nvToolsExt) -if(NVTE_MPI_FOUND) - list(APPEND transformer_engine_LINKER_LIBS gdrapi) -endif() - -target_link_libraries(transformer_engine PUBLIC ${transformer_engine_LINKER_LIBS}) -target_include_directories(transformer_engine PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - +target_include_directories(transformer_engine PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}/include") + +# Configure dependencies +target_link_libraries(transformer_engine PUBLIC + CUDA::cublas + CUDA::cudart + CUDA::nvToolsExt) +target_include_directories(transformer_engine PRIVATE + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +# Compiler options set_source_files_properties(fused_softmax/scaled_masked_softmax.cu fused_softmax/scaled_upper_triang_masked_softmax.cu PROPERTIES COMPILE_OPTIONS "--use_fast_math") - -if(NVTE_MPI_FOUND) - set_source_files_properties(comm_gemm_overlap/userbuffers.cu - comm_gemm_overlap/userbuffers-host.cpp - PROPERTIES - INCLUDE_DIRECTORIES ${NVTE_MPI_INCLUDE} - COMPILE_OPTIONS "$<$:-maxrregcount=64>") -endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 0a8924f8ed..220bec7003 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -37,8 +37,8 @@ def _load_library(): return ctypes.CDLL(dll_path, mode=ctypes.RTLD_GLOBAL) -def _load_mpi(): - """Load MPI shared library""" +def _load_userbuffers(): + """Load shared library with userbuffers""" 
system = platform.system() if system == "Linux": @@ -49,15 +49,14 @@ def _load_mpi(): extension = "dll" else: raise RuntimeError(f"Unsupported operating system ({system})") - lib_name = "libmpi." + extension - MPI_HOME = os.environ.get("MPI_HOME", "/usr/local/mpi") - NVTE_MPI_FOUND = os.path.exists(MPI_HOME) - dll_path = os.path.join(MPI_HOME, "lib", lib_name) + lib_name = "libtransformer_engine_userbuffers." + extension + dll_path = get_te_path() + dll_path = os.path.join(dll_path, lib_name) - if NVTE_MPI_FOUND: + if os.path.exists(dll_path): return ctypes.CDLL(dll_path, mode=ctypes.RTLD_GLOBAL) return None -_TE_LIB_CTYPES = _load_mpi() _TE_LIB_CTYPES = _load_library() +_UB_LIB_CTYPES = _load_userbuffers() diff --git a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h index 18863a7858..1e8b96f46b 100644 --- a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h +++ b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h @@ -14,9 +14,10 @@ #include #include #include -#include +#include "userbuffers/userbuffers.h" #define HALF_BYTES 2 +#define UB_MAX_SM 32 #define CHECK_CUDA(call) \ do { \ @@ -174,6 +175,7 @@ struct UbufCommOverlap : torch::CustomClassHolder { char *rs_output_ptr = reinterpret_cast(rs_output.data_ptr()); int ubuf_offset = 0; + int ori_sms = _ub_comm->sms; // Catch up the default torch stream at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); @@ -232,7 +234,8 @@ struct UbufCommOverlap : torch::CustomClassHolder { cudaEventRecord(_start_comm, (cudaStream_t)_stream_compute[last_compute_stream_id])); CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); - // Communication chunk + // Last communication chunk with max SM + _ub_comm->sms = UB_MAX_SM; reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, (_num_splits - 1) * output_chunk_size, m_chunk, n, m, _ub_comm, (cudaStream_t)_stream_comm); @@ -255,7 +258,10 @@ struct UbufCommOverlap : 
torch::CustomClassHolder { (cudaStream_t)_stream_compute[i % _stream_compute.size()])); CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); - // Communication chunk + // Communication chunk. Uses MAX_SM at the last chunk + if (i == _num_splits-1) { + _ub_comm->sms = UB_MAX_SM; + } reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, i * output_chunk_size, m_chunk, n, m, _ub_comm, (cudaStream_t)_stream_comm); @@ -264,6 +270,7 @@ struct UbufCommOverlap : torch::CustomClassHolder { output_buf_chunk_ptr += output_chunk_size * _ubuf.element_size(); } } + _ub_comm->sms = ori_sms; int last_compute_stream_id = (_num_splits + _stream_compute.size() - 1) % _stream_compute.size(); CHECK_CUDA( diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu index e34c79d980..23330efbf0 100644 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ b/transformer_engine/pytorch/csrc/extensions.cu @@ -5,9 +5,9 @@ ************************************************************************/ #include "extensions.h" -#ifdef NVTE_MPI_FOUND +#ifdef NVTE_WITH_USERBUFFERS #include "comm_gemm_overlap.h" -#endif // NVTE_MPI_FOUND +#endif // NVTE_WITH_USERBUFFERS void te_gemm(at::Tensor A, at::Tensor A_scale_inverse, @@ -1022,7 +1022,7 @@ size_t get_cublasLt_version() { bool userbuf_comm_available() { // TODO(ksivamani) check on python side -#ifdef NVTE_MPI_FOUND +#ifdef NVTE_WITH_USERBUFFERS return true; #else return false; @@ -1080,7 +1080,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { .def_readwrite("scale_inv", &transformer_engine::FP8TensorMeta::scale_inv) .def_readwrite("amax_history", &transformer_engine::FP8TensorMeta::amax_history); -#ifdef NVTE_MPI_FOUND +#ifdef NVTE_WITH_USERBUFFERS py::enum_(m, "UbufOverlapAlgo") .value("BULK_OVERLAP_AG", ubuf::UBOverlapAlgo::BULK_OVERLAP_AG) .value("BULK_OVERLAP_RS", ubuf::UBOverlapAlgo::BULK_OVERLAP_RS) @@ -1099,11 +1099,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 
.def("split_overlap_ag", &ubuf::UbufP2PCommOverlap::split_overlap_ag) .def("copy_input_to_ubuf", &ubuf::UbufP2PCommOverlap::copy_input_to_ubuf) .def("get_ubuf_output", &ubuf::UbufP2PCommOverlap::get_ubuf_output); -#else // NVTE_MPI_FOUND +#else // NVTE_WITH_USERBUFFERS m.def("UbufOverlapAlgo", &placeholder, "Dummy function for python side annotations"); m.def("UbufCommOverlap", &placeholder, "Dummy function for python side annotations"); m.def("UbufP2PCommOverlap", &placeholder, "Dummy function for python side annotations"); -#endif // NVTE_MPI_FOUND +#endif // NVTE_WITH_USERBUFFERS py::enum_(m, "DType", py::module_local()) .value("kByte", transformer_engine::DType::kByte) diff --git a/transformer_engine/pytorch/csrc/userbuffers/CMakeLists.txt b/transformer_engine/pytorch/csrc/userbuffers/CMakeLists.txt new file mode 100644 index 0000000000..fde8632ec6 --- /dev/null +++ b/transformer_engine/pytorch/csrc/userbuffers/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +# Configure userbuffers library +add_library(transformer_engine_userbuffers SHARED + userbuffers.cu + userbuffers-host.cpp) +target_include_directories(transformer_engine_userbuffers PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}") + +# Configure dependencies +find_package(MPI REQUIRED) +find_library(GDRCOPY_LIBRARY gdrapi + HINTS "${GDRCOPY_LIBRARY_DIR}" "$ENV{GDRCOPY_LIBRARY_DIR}") +if(NOT GDRCOPY_LIBRARY) + message(FATAL_ERROR "Could not find GDRCopy, please set GDRCOPY_LIBRARY_DIR") +endif() +message(STATUS "Found GDRCopy: ${GDRCOPY_LIBRARY}") +target_link_libraries(transformer_engine_userbuffers PUBLIC + CUDA::cudart + MPI::MPI_CXX + ${GDRCOPY_LIBRARY}) +target_include_directories(transformer_engine_userbuffers PRIVATE + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +# Compiler options +set_source_files_properties(userbuffers.cu + userbuffers-host.cpp + PROPERTIES + COMPILE_OPTIONS "$<$:-maxrregcount=64>") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp b/transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp similarity index 96% rename from transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp rename to transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp index 14928ed5a1..59afc4b452 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers-host.cpp +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp @@ -13,12 +13,11 @@ #include #include #include -#include -#include #include #include #include #include +#include "userbuffers.h" static int oob_bcast(void *comm_context, void *buf, int size, int root) { MPI_Bcast(buf, size, MPI_BYTE, root, @@ -48,6 +47,12 @@ int stringCmp(const void *a, const void *b) { return strcmp((const char *)a, (co } \ } while (0) +#define NVTE_UB_ERROR(x) \ + do { \ + throw std::runtime_error(std::string(__FILE__ ":") + 
std::to_string(__LINE__) + \ + " in function " + __func__ + ": " + x); \ + } while (false) + int pipe_rank(communicator *comm, int step) { int mynode = comm->myrank / comm->nvsize; int mylocal = comm->nvrank; @@ -347,7 +352,7 @@ int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons void allreduce_nonsharp_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream, int op) { - if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); // if(comm->myrank==0) fprintf(stderr,"AR2(%d) user call launch_mode=%d\n",op,comm->launch_mode); const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; int blocksize = elements * 2; @@ -394,7 +399,7 @@ void allreduce2_userbuff_inplace(const int handler, const int offset, const int void allreduce_userbuff_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { - if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); allreduce_nonsharp_inplace(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp); return; @@ -402,7 +407,7 @@ void allreduce_userbuff_inplace(const int handler, const int offset, const int e void reducescatter_userbuff_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { - if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); int op = userbuffers_allreduceop_nonsharp; const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? 
comm->ar_nvsize : comm->ar2_nvsize; @@ -443,7 +448,7 @@ void reducescatter_userbuff_inplace(const int handler, const int offset, const i void allgather_userbuff_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { - if (elements < 64) NVTE_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); int op = userbuffers_allreduceop_nonsharp; const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; int blocksize = elements * 2; diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers.cu b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu similarity index 99% rename from transformer_engine/common/comm_gemm_overlap/userbuffers.cu rename to transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu index 684771801b..9144e9e739 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers.cu +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu @@ -14,7 +14,7 @@ #endif #include #include -#include +#include "userbuffers.h" #define MAX_THREADS 1024 #define TIMEOUT 200000000000ull diff --git a/transformer_engine/common/include/transformer_engine/userbuffers.h b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h similarity index 99% rename from transformer_engine/common/include/transformer_engine/userbuffers.h rename to transformer_engine/pytorch/csrc/userbuffers/userbuffers.h index cd5b1ec382..1d4c1d4024 100644 --- a/transformer_engine/common/include/transformer_engine/userbuffers.h +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h @@ -8,7 +8,7 @@ #define TRANSFORMER_ENGINE_USERBUFFERS_H_ #include -#include +#include // TODO (tym): Removing will remove PyT extension dependence on MPI #include "cuda_runtime.h" #include #include From ac5d44ecf7cdcf9896f04f7326ce9514b4f39aeb Mon Sep 17 00:00:00 2001 From: cyanguwa Date: Fri, 21 Apr 
2023 16:22:39 -0700 Subject: [PATCH 020/427] Add FP8 fused attention (#155) * Add FP8 fused attention to TE for PyTorch Signed-off-by: Charlene Yang * add license for cudnn-frontend, modify installation requirements, and refactor some headers for aesthetics Signed-off-by: Charlene Yang * add c api docs for fused attention Signed-off-by: Charlene Yang * add exception for unsupported precision/sequence length combinations Signed-off-by: Charlene Yang * fix installation requirement for non fused attn use cases Signed-off-by: Charlene Yang * fix docs for fused-attn Signed-off-by: Kirthi Shankar Sivamani * prefix enums with NVTE_ and replace old MHA_Matrix with NVTE_QKV_Matrix Signed-off-by: Charlene Yang * minor fixes based on PR comments Signed-off-by: Charlene Yang * fix description for kvpacked fwd Signed-off-by: Charlene Yang * fix description of Bias in C api Signed-off-by: Charlene Yang * minor fixes for cudnn requirement and description for QKV tensors Signed-off-by: Charlene Yang * fix QKV layout description and support matrix for C api Signed-off-by: Charlene Yang * add asserts to cpp_extensions for qkv layout/bias type/attn mask type Signed-off-by: Charlene Yang * fix typo precision Signed-off-by: Charlene Yang --------- Signed-off-by: Charlene Yang Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Charlene Yang Co-authored-by: Kirthi Shankar Sivamani --- .github/workflows/build.yml | 6 + .gitmodules | 3 + 3rdparty/cudnn-frontend | 1 + Acknowledgements.txt | 22 + docs/api/c/fused_attn.rst | 9 + docs/api/c/index.rst | 1 + docs/installation.rst | 2 + setup.py | 1 + tests/cpp/test_common.cu | 1 + tests/cpp/test_common.h | 8 + transformer_engine/CMakeLists.txt | 2 + transformer_engine/cmake/FindCUDNN.cmake | 78 + transformer_engine/common/CMakeLists.txt | 7 +- .../common/fused_attn/fused_attn.cpp | 232 ++ .../common/fused_attn/fused_attn_fp8.cu | 2138 +++++++++++++++++ .../common/fused_attn/fused_attn_fp8.h | 46 + 
transformer_engine/common/fused_attn/utils.cu | 167 ++ transformer_engine/common/fused_attn/utils.h | 90 + .../include/transformer_engine/fused_attn.h | 262 ++ .../include/transformer_engine/logging.h | 9 + .../transformer_engine/transformer_engine.h | 35 +- .../common/transformer_engine.cpp | 13 + transformer_engine/pytorch/constants.py | 2 +- transformer_engine/pytorch/cpp_extensions.py | 730 +++++- transformer_engine/pytorch/csrc/common.cu | 13 + transformer_engine/pytorch/csrc/common.h | 15 + transformer_engine/pytorch/csrc/extensions.cu | 756 +++++- transformer_engine/pytorch/csrc/extensions.h | 90 +- transformer_engine/pytorch/module.py | 6 +- 29 files changed, 4720 insertions(+), 25 deletions(-) create mode 160000 3rdparty/cudnn-frontend create mode 100644 docs/api/c/fused_attn.rst create mode 100644 transformer_engine/cmake/FindCUDNN.cmake create mode 100644 transformer_engine/common/fused_attn/fused_attn.cpp create mode 100644 transformer_engine/common/fused_attn/fused_attn_fp8.cu create mode 100644 transformer_engine/common/fused_attn/fused_attn_fp8.h create mode 100644 transformer_engine/common/fused_attn/utils.cu create mode 100644 transformer_engine/common/fused_attn/utils.h create mode 100644 transformer_engine/common/include/transformer_engine/fused_attn.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ff64f1de72..24d87c0416 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,6 +17,8 @@ jobs: steps: - name: 'Checkout' uses: actions/checkout@v3 + with: + submodules: recursive - name: 'Build' run: | mkdir -p wheelhouse && \ @@ -41,6 +43,8 @@ jobs: steps: - name: 'Checkout' uses: actions/checkout@v3 + with: + submodules: recursive - name: 'Build' run: | pip install ninja pybind11 && \ @@ -66,6 +70,8 @@ jobs: steps: - name: 'Checkout' uses: actions/checkout@v3 + with: + submodules: recursive - name: 'Build' run: | pip install ninja pybind11 && \ diff --git a/.gitmodules b/.gitmodules index 
85675ac0bc..21492db5ef 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "3rdparty/googletest"] path = 3rdparty/googletest url = https://github.com/google/googletest.git +[submodule "3rdparty/cudnn-frontend"] + path = 3rdparty/cudnn-frontend + url = https://github.com/NVIDIA/cudnn-frontend.git diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend new file mode 160000 index 0000000000..e7f64390e9 --- /dev/null +++ b/3rdparty/cudnn-frontend @@ -0,0 +1 @@ +Subproject commit e7f64390e9bb4a3db622ffe11c973834f572b609 diff --git a/Acknowledgements.txt b/Acknowledgements.txt index 7eec81a9ce..ad11acc047 100644 --- a/Acknowledgements.txt +++ b/Acknowledgements.txt @@ -138,3 +138,25 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +======================== +cudnn-frontend + +Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/docs/api/c/fused_attn.rst b/docs/api/c/fused_attn.rst new file mode 100644 index 0000000000..c2384b7e12 --- /dev/null +++ b/docs/api/c/fused_attn.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +fused_attn.h +============ + +.. doxygenfile:: fused_attn.h diff --git a/docs/api/c/index.rst b/docs/api/c/index.rst index 0f83b8dc02..f98a419088 100644 --- a/docs/api/c/index.rst +++ b/docs/api/c/index.rst @@ -17,6 +17,7 @@ directly from C/C++, without Python. activation.h cast.h gemm.h + fused_attn.h layer_norm.h softmax.h transformer_engine.h diff --git a/docs/installation.rst b/docs/installation.rst index 088d65f9ca..9aded82d0f 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -14,6 +14,8 @@ Prerequisites 1. Linux x86_64 2. `CUDA 11.8 `__ 3. |driver link|_ supporting CUDA 11.8 or later. +4. `cuDNN 8 `__ or later. +5. For FP8 fused attention, `CUDA 12.1 `__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9 `__ or later. 
Transformer Engine in NGC Containers diff --git a/setup.py b/setup.py index cb0c37fe3a..b88e4fbcc4 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,7 @@ def make_abs_path(l): include_dirs = [ "transformer_engine/common/include", "transformer_engine/pytorch/csrc", + "3rdparty/cudnn-frontend/include", ] if NVTE_WITH_USERBUFFERS: if MPI_HOME: diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 151eddb9f9..bbb25bb2fc 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -42,6 +42,7 @@ const std::string &typeName(DType type) { static const std::unordered_map name_map = { {DType::kByte, "byte"}, {DType::kInt32, "int32"}, + {DType::kInt64, "int64"}, {DType::kFloat32, "float32"}, {DType::kFloat16, "float16"}, {DType::kBFloat16, "bfloat16"}, diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index f35d494c8d..7278f1827b 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -44,6 +44,7 @@ struct BytesToType<8> { using byte = uint8_t; using int32 = int32_t; +using int64 = int64_t; using fp32 = float; using fp16 = half; using bf16 = nv_bfloat16; @@ -54,6 +55,7 @@ template struct TypeInfo{ using types = std::tuple + $ +) + +target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn_adv_train + CUDNN::cudnn_ops_train + CUDNN::cudnn_cnn_train + CUDNN::cudnn_adv_infer + CUDNN::cudnn_cnn_infer + CUDNN::cudnn_ops_infer + CUDNN::cudnn +) + diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index c5bc6bb0f1..7b844540ae 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -12,6 +12,9 @@ list(APPEND transformer_engine_SOURCES transpose/transpose_fusion.cu transpose/multi_cast_transpose.cu activation/gelu.cu + fused_attn/fused_attn_fp8.cu + fused_attn/fused_attn.cpp + fused_attn/utils.cu gemm/cublaslt_gemm.cu layer_norm/ln_api.cpp layer_norm/ln_bwd_semi_cuda_kernel.cu @@ -30,9 +33,11 @@ 
target_include_directories(transformer_engine PUBLIC target_link_libraries(transformer_engine PUBLIC CUDA::cublas CUDA::cudart - CUDA::nvToolsExt) + CUDA::nvToolsExt + CUDNN::cudnn) target_include_directories(transformer_engine PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +target_include_directories(transformer_engine PRIVATE "${CMAKE_SOURCE_DIR}/../3rdparty/cudnn-frontend/include") # Compiler options set_source_files_properties(fused_softmax/scaled_masked_softmax.cu diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp new file mode 100644 index 0000000000..17b6505038 --- /dev/null +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -0,0 +1,232 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "transformer_engine/fused_attn.h" +#include "../common.h" +#include "utils.h" +#include "fused_attn_fp8.h" + +// NVTE fused attention FWD FP8 with packed QKV +void nvte_fused_attn_fwd_qkvpacked( + const NVTETensor QKV, + const NVTETensor Bias, + NVTETensor S, + NVTETensor O, + NVTETensorPack* Aux_Output_Tensors, + const NVTETensor cu_seqlens, + const NVTETensor rng_state, + size_t max_seqlen, + bool is_training, float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream) { + NVTE_API_CALL(nvte_flash_attn_fwd_qkvpacked); + using namespace transformer_engine; + const Tensor *input_cu_seqlens = reinterpret_cast(cu_seqlens); + const Tensor *input_rng_state = reinterpret_cast(rng_state); + const Tensor *input_QKV = reinterpret_cast(QKV); + const Tensor *input_Bias = reinterpret_cast(Bias); + Tensor *input_output_S = reinterpret_cast(S); + Tensor 
*output_O = reinterpret_cast(O); + Tensor *wkspace = reinterpret_cast(workspace); + + // QKV shape is [total_seqs, 3, h, d] + size_t b = input_cu_seqlens->data.shape[0] - 1; + size_t h = input_QKV->data.shape[2]; + size_t d = input_QKV->data.shape[3]; + const DType QKV_type = input_QKV->data.dtype; + + if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) + && (max_seqlen <= 512)) { +#if (CUDNN_VERSION >= 8900) + auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); + // FP8 API doesn't use input_Bias, bias_type or attn_mask_type + fused_attn_fwd_fp8_qkvpacked( + b, max_seqlen, h, d, + is_training, attn_scale, dropout, qkv_layout, + input_QKV, input_output_S, output_O, + Aux_Output_Tensors, + input_cu_seqlens, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9 is required to run FP8 fused attention. \n"); +#endif + } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) + && (max_seqlen <= 512)) { + NVTE_ERROR("TBD: No support for BF16/FP16 fused attention currently. \n"); + } else if (max_seqlen > 512) { + NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); + } else { + NVTE_ERROR("Invalid combination of data type and sequence length! 
\n"); + } +} +// NVTE fused attention BWD FP8 with packed QKV +void nvte_fused_attn_bwd_qkvpacked( + const NVTETensor QKV, + const NVTETensor dBias, + const NVTETensor O, + const NVTETensor dO, + const NVTETensor S, + NVTETensor dP, + const NVTETensorPack* Aux_CTX_Tensors, + NVTETensor dQKV, + const NVTETensor cu_seqlens, + size_t max_seqlen, + float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream) { + NVTE_API_CALL(nvte_flash_attn_bwd_qkvpacked); + using namespace transformer_engine; + const Tensor *input_cu_seqlens = reinterpret_cast(cu_seqlens); + const Tensor *input_QKV = reinterpret_cast(QKV); + const Tensor *input_dBias = reinterpret_cast(dBias); + const Tensor *input_O = reinterpret_cast(O); + const Tensor *input_dO = reinterpret_cast(dO); + const Tensor *input_S = reinterpret_cast(S); + Tensor *input_output_dP = reinterpret_cast(dP); + Tensor *output_dQKV = reinterpret_cast(dQKV); + Tensor *wkspace = reinterpret_cast(workspace); + + // QKV shape is [total_seqs, 3, h, d] + size_t b = input_cu_seqlens->data.shape[0] - 1; + size_t h = input_QKV->data.shape[2]; + size_t d = input_QKV->data.shape[3]; + const DType QKV_type = input_QKV->data.dtype; + + if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) + && (max_seqlen <= 512)) { +#if (CUDNN_VERSION >= 8900) + // Aux_CTX_Tensors contain [M, ZInv, rng_state] generated by the forward pass + const Tensor *input_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + const Tensor *input_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); + // FP8 API doesn't use input_dBias, bias_type or attn_mask_type + fused_attn_bwd_fp8_qkvpacked( + b, max_seqlen, h, d, + attn_scale, dropout, qkv_layout, + input_QKV, input_O, input_dO, + input_M, 
input_ZInv, + input_S, input_output_dP, + output_dQKV, + input_cu_seqlens, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9 is required to run FP8 fused attention. \n"); +#endif + } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) + && (max_seqlen <= 512)) { + NVTE_ERROR("TBD: No support for BF16/FP16 fused attention currently. \n"); + } else if (max_seqlen > 512) { + NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); + } else { + NVTE_ERROR("Invalid combination of data type and sequence length! \n"); + } +} +// NVTE fused attention FWD FP8 with packed KV +void nvte_fused_attn_fwd_kvpacked( + const NVTETensor Q, + const NVTETensor KV, + const NVTETensor Bias, + NVTETensor S, + NVTETensor O, + NVTETensorPack* Aux_Output_Tensors, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + const NVTETensor rng_state, + size_t max_seqlen_q, size_t max_seqlen_kv, + bool is_training, float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream) { + NVTE_API_CALL(nvte_flash_attn_fwd_kvpacked); + using namespace transformer_engine; + const Tensor *input_cu_seqlens_q = reinterpret_cast(cu_seqlens_q); + const Tensor *input_cu_seqlens_kv = reinterpret_cast(cu_seqlens_kv); + const Tensor *input_rng_state = reinterpret_cast(rng_state); + const Tensor *input_Q = reinterpret_cast(Q); + const Tensor *input_KV = reinterpret_cast(KV); + const Tensor *input_Bias = reinterpret_cast(Bias); + Tensor *input_output_S = reinterpret_cast(S); + Tensor *output_O = reinterpret_cast(O); + Tensor *wkspace = reinterpret_cast(workspace); + + // Q shape is [total_seqs, h, d] + size_t b = input_cu_seqlens_q->data.shape[0] - 1; + size_t h = input_Q->data.shape[1]; + size_t d = input_Q->data.shape[2]; + const DType QKV_type = input_Q->data.dtype; + + if (((QKV_type == DType::kFloat8E4M3) || 
(QKV_type == DType::kFloat8E5M2)) + && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { + NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. \n"); + } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) + && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { + NVTE_ERROR("TBD: No support for BF16/FP16 fused attention currently. \n"); + } else if ((max_seqlen_q > 512) || (max_seqlen_kv > 512)) { + NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); + } else { + NVTE_ERROR("Invalid combination of data type and sequence length! \n"); + } +} +// NVTE fused attention BWD FP8 with packed KV +void nvte_fused_attn_bwd_kvpacked( + const NVTETensor Q, + const NVTETensor KV, + const NVTETensor dBias, + const NVTETensor O, + const NVTETensor dO, + const NVTETensor S, + NVTETensor dP, + const NVTETensorPack* Aux_CTX_Tensors, + NVTETensor dQ, + NVTETensor dKV, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + size_t max_seqlen_q, size_t max_seqlen_kv, + float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream) { + NVTE_API_CALL(nvte_flash_attn_bwd_kvpacked); + using namespace transformer_engine; + const Tensor *input_cu_seqlens_q = reinterpret_cast(cu_seqlens_q); + const Tensor *input_cu_seqlens_kv = reinterpret_cast(cu_seqlens_kv); + const Tensor *input_Q = reinterpret_cast(Q); + const Tensor *input_KV = reinterpret_cast(KV); + const Tensor *input_dBias = reinterpret_cast(dBias); + const Tensor *input_O = reinterpret_cast(O); + const Tensor *input_dO = reinterpret_cast(dO); + const Tensor *input_S = reinterpret_cast(S); + Tensor *input_output_dP = reinterpret_cast(dP); + Tensor *output_dQ = reinterpret_cast(dQ); + Tensor *output_dKV = reinterpret_cast(dKV); + Tensor *wkspace = reinterpret_cast(workspace); + + // Q shape is [total_seqs, h, d] + size_t b = 
input_cu_seqlens_q->data.shape[0] - 1; + size_t h = input_Q->data.shape[1]; + size_t d = input_Q->data.shape[2]; + const DType QKV_type = input_Q->data.dtype; + if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) + && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { + NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. \n"); + } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) + && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { + NVTE_ERROR("TBD: No support for BF16/FP16 fused attention currently. \n"); + } else if ((max_seqlen_q > 512) || (max_seqlen_kv > 512)) { + NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); + } else { + NVTE_ERROR("Invalid combination of data type and sequence length! \n"); + } +} diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu new file mode 100644 index 0000000000..633f46c51f --- /dev/null +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -0,0 +1,2138 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include "transformer_engine/fused_attn.h" +#include "../common.h" +#include "utils.h" +#include "fused_attn_fp8.h" + +namespace transformer_engine { +namespace fused_attn { + +using namespace transformer_engine; + +#if (CUDNN_VERSION >= 8900) +std::unordered_map tensor_name_to_uid = { + {"Q", 1}, + {"K", 2}, + {"V", 3}, + {"O", 4}, + {"S", 5}, + {"B", 6}, + {"DROPOUT_SCALE", 7}, + {"S_CONST", 8}, + {"MNK_OVERRIDE", 9}, + {"dQ", 11}, + {"dK", 12}, + {"dV", 13}, + {"dO", 14}, + {"MASK_VAL", 15}, + {"dS", 16}, + {"O_SEQLEN", 17}, + {"M", 18}, + {"Z", 19}, + {"descaleQ", 20}, + {"descaleK", 21}, + {"descaleV", 22}, + {"descaleS", 23}, + {"scaleS", 24}, + {"amaxS", 25}, + {"amaxO", 26}, + {"QKV_RAGGED", 27}, + {"O_RAGGED", 28}, + {"K_TRANSPOSE", 29}, + {"AttnScale", 30}, + {"scaleO", 31}, + {"Z_INV", 32}, + {"descaleO", 33}, + {"descaledO", 34}, + {"descaledS", 35}, + {"descaledQ", 36}, + {"descaledK", 37}, + {"descaledV", 38}, + {"scaledS", 39}, + {"scaledQ", 40}, + {"scaledK", 41}, + {"scaledV", 42}, + {"amaxdS", 43}, + {"amaxdQ", 44}, + {"amaxdK", 45}, + {"amaxdV", 46}, + {"V_TRANSPOSE", 47}, + {"AttnScale_dS_K", 48}, + {"AttnScale_dSTranspose_Q", 49}, + {"DROPOUT_SCALE_dOVt_OdO", 50}, + {"DROPOUT_OFFSET", 51}, + {"DROPOUT_SEED", 52}, + {"VIRTUAL", 80} +}; + +bool allowAllConfig(cudnnBackendDescriptor_t engine_config) { + (void)engine_config; + return false; +} + +static cudnn_frontend::Tensor tensor_create( + cudnnDataType_t type, int64_t id, + int64_t const * dim, int64_t const * stride, + bool is_virtual, bool is_value) { + int nbDims = 4; + auto tensor_created = cudnn_frontend::TensorBuilder() + .setDim(nbDims, dim) + .setStride(nbDims, stride) + .setId(id) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(type) + .setVirtual(is_virtual) + .setByValue(is_value) + .build(); + return tensor_created; +} + +static cudnn_frontend::Tensor 
tensor_create_with_offset( + cudnnDataType_t type, int64_t id, + int64_t const * dim, int64_t const * stride, + bool is_virtual, bool is_value, + std::shared_ptr raggedOffset) { + int nbDims = 4; + auto tensor_created = cudnn_frontend::TensorBuilder() + .setDim(nbDims, dim) + .setStride(nbDims, stride) + .setId(id) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(type) + .setVirtual(is_virtual) + .setByValue(is_value) + .setRaggedOffset(raggedOffset) + .build(); + return tensor_created; +} + +static cudnn_frontend::PointWiseDesc pw_desc_create( + cudnnDataType_t type, cudnnPointwiseMode_t mode) { + auto pw_desc_created = cudnn_frontend::PointWiseDescBuilder() + .setMode(mode) + .setComputeType(type) + .build(); + return pw_desc_created; +} + +static cudnn_frontend::Operation unary_pw_op_create( + cudnn_frontend::Tensor const &xDesc, + cudnn_frontend::Tensor const &yDesc, + cudnn_frontend::PointWiseDesc const &pwDesc) { + auto pw_op_created = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(xDesc) + .setyDesc(yDesc) + .setpwDesc(pwDesc) + .build(); + return pw_op_created; +} + +static cudnn_frontend::Operation binary_pw_op_create( + cudnn_frontend::Tensor const &xDesc, + cudnn_frontend::Tensor const &bDesc, + cudnn_frontend::Tensor const &yDesc, + cudnn_frontend::PointWiseDesc const &pwDesc) { + auto pw_op_created = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(xDesc) + .setbDesc(bDesc) + .setyDesc(yDesc) + .setpwDesc(pwDesc) + .build(); + return pw_op_created; +} + +static cudnn_frontend::Operation ternary_pw_op_create( + cudnn_frontend::Tensor const &xDesc, + cudnn_frontend::Tensor const &bDesc, + cudnn_frontend::Tensor const &tDesc, + cudnn_frontend::Tensor const &yDesc, + cudnn_frontend::PointWiseDesc const &pwDesc) { + auto pw_op_created = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + 
.setxDesc(xDesc) + .setbDesc(bDesc) + .settDesc(tDesc) + .setyDesc(yDesc) + .setpwDesc(pwDesc) + .build(); + return pw_op_created; +} + +static cudnn_frontend::Tensor createAmax( + const std::string& amax_tensor_name, + const cudnn_frontend::Tensor& prevBlockOutputTensor, + std::vector* ops) { + int64_t amax_dim[4] = {1, 1, 1, 1}; + int64_t amax_stride[4] = {1, 1, 1, 1}; + auto amaxTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid[amax_tensor_name], + amax_dim, amax_stride, false, false); + + // Define the amax descriptor + auto reductionDesc = cudnn_frontend::ReductionDescBuilder() + .setMathPrecision(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_AMAX) + .build(); + + // Create a reduction amax Node + auto reduction_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(prevBlockOutputTensor) + .setyDesc(amaxTensor) + .setreductionDesc(reductionDesc) + .build(); + ops->push_back(std::move(reduction_op)); + return amaxTensor; +} + +static cudnn_frontend::Tensor createScale( + const cudnn_frontend::Tensor& prevBlockOutputTensor, + const std::string& scale_tensor_name, + cudnnDataType_t tensorType, + bool isOutputVirtual, bool isScaleByValue, + std::vector* ops, + const std::string& output_tensor_name ="") { + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + int64_t output_dim[4]; + int64_t output_stride[4]; + + for (int i = 0; i < 4; i++) { + output_dim[i] = prevBlockOutputTensor.getDim()[i]; + output_stride[i] = prevBlockOutputTensor.getStride()[i]; + } + + auto scaleTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid[scale_tensor_name], + scale_dim, scale_stride, false, isScaleByValue); // is by value + + int64_t outputUID = isOutputVirtual ? 
tensor_name_to_uid["VIRTUAL"] + + tensor_name_to_uid[scale_tensor_name] + 5000 : + tensor_name_to_uid[output_tensor_name]; + auto afterScaleKTensor = tensor_create( + tensorType, outputUID, output_dim, + output_stride, isOutputVirtual, false); // is virtual + + // Define the scale descriptor + auto scaleDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a Scale Node + auto scale_op = binary_pw_op_create( + prevBlockOutputTensor, scaleTensor, afterScaleKTensor, scaleDesc); + + ops->push_back(std::move(scale_op)); + return afterScaleKTensor; +} + +static cudnn_frontend::Tensor createScale( + const cudnn_frontend::Tensor& prevBlockOutputTensor, + const cudnn_frontend::Tensor& scaleTensor, + cudnnDataType_t tensorType, + bool isOutputVirtual, bool isScaleByValue, + std::vector* ops, + int UID_offset, const std::string& output_tensor_name ="") { + int64_t output_dim[4]; + int64_t output_stride[4]; + for (int i = 0; i < 4; i++) { + output_dim[i] = prevBlockOutputTensor.getDim()[i]; + output_stride[i] = prevBlockOutputTensor.getStride()[i]; + } + + int64_t outputUID = isOutputVirtual ? 
+ tensor_name_to_uid["VIRTUAL"] + UID_offset : + tensor_name_to_uid[output_tensor_name]; + auto afterScaleTensor = tensor_create( + tensorType, outputUID, output_dim, + output_stride, isOutputVirtual, false); // is virtual + + // Define the scale descriptor + auto scaleDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a Scale Node + auto scale_op = binary_pw_op_create( + prevBlockOutputTensor, scaleTensor, afterScaleTensor, scaleDesc); + + ops->push_back(std::move(scale_op)); + return afterScaleTensor; +} + +static cudnn_frontend::Tensor createScaleWithOffset( + const cudnn_frontend::Tensor& prevBlockOutputTensor, + const std::string& scale_tensor_name, + cudnnDataType_t tensorType, + bool isOutputVirtual, + bool isScaleByValue, + std::vector* ops, + std::shared_ptr offsetTensor, + const std::string& output_tensor_name ="") { + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + int64_t output_dim[4]; + int64_t output_stride[4]; + // If output tensor is dQ, dK, or dV, we need to generate QKV interleaved strides + if (output_tensor_name == "dQ" || output_tensor_name == "dK" || output_tensor_name == "dV") { + for (int i = 0; i < 4; i++) { + output_dim[i] = prevBlockOutputTensor.getDim()[i]; + } + generateMatrixStrides(output_dim[0], output_dim[1], output_dim[2], + 0 /*s_kv = 0 for placeholder*/, + output_dim[3], output_stride, + NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, NVTE_QKV_Matrix::NVTE_Q_Matrix); + } else { + // Otherwise output dim and stride should be the same as prev block dim and stride + for (int i = 0; i < 4; i++) { + output_dim[i] = prevBlockOutputTensor.getDim()[i]; + output_stride[i] = prevBlockOutputTensor.getStride()[i]; + } + } + + auto scaleTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid[scale_tensor_name], + scale_dim, scale_stride, false, isScaleByValue); // is by value + + cudnnDataType_t outputDataType = isOutputVirtual ? 
CUDNN_DATA_FLOAT : tensorType; + int64_t outputUID = isOutputVirtual ? + tensor_name_to_uid["VIRTUAL"] + tensor_name_to_uid[scale_tensor_name] + 7000 : + tensor_name_to_uid[output_tensor_name]; + auto afterScaleTensor = tensor_create_with_offset( + outputDataType, outputUID, output_dim, + output_stride, isOutputVirtual, false, offsetTensor); // is virtual + + // Define the scale descriptor + auto scaleDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a Scale Node + auto scale_op = binary_pw_op_create( + prevBlockOutputTensor, scaleTensor, afterScaleTensor, scaleDesc); + + ops->push_back(std::move(scale_op)); + return afterScaleTensor; +} + +static cudnn_frontend::Tensor createSoftmaxForward( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, + std::vector* ops, + const cudnn_frontend::Tensor& prevBlockOutputTensor, + bool isTraining) { + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t afterReduction_dim[4] = {b, h, s_q, 1}; + int64_t afterReduction_stride[4] = {h * s_q, s_q, 1, 1}; + + // max (x) (M tensor) + auto afterMaxReductionTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["M"], + afterReduction_dim, afterReduction_stride, + !isTraining, false); // not virtual if training is true, + // virtual if training is false + // x - max(x) + auto afterSubtractionTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 151, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + // e^(x - max(x)) + auto afterExponentTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 152, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual; + // sum (e^(x - max(x))) (Z tensor) + auto zTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["Z"], + afterReduction_dim, afterReduction_stride, true, false); // is virtual + // 1 / sum (e^(x - max(x))) (Z_INV tensor) + auto zInvTensor = tensor_create( + 
CUDNN_DATA_FLOAT, tensor_name_to_uid["Z_INV"], + afterReduction_dim, afterReduction_stride, + !isTraining, false); // not virtual if training is true, + // virtual if training is false + // Final softmax output (After exponent * Z_INV) + auto beforeDropoutTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 153, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + + // Define the reduction descriptor + auto reductionMaxDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_MAX) + .build(); + + // Create a reduction max Node + auto reductionMax_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(prevBlockOutputTensor) + .setyDesc(afterMaxReductionTensor) + .setreductionDesc(reductionMaxDesc) + .build(); + + // Define the subtract descriptor + auto subtractDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + + // Create a subtract Node + auto subtract_op = binary_pw_op_create( + prevBlockOutputTensor, afterMaxReductionTensor, + afterSubtractionTensor, subtractDesc); + + // Define the exponent descriptor + auto exponentDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_EXP); + + // Create a exponent Node + auto exponent_op = unary_pw_op_create( + afterSubtractionTensor, afterExponentTensor, exponentDesc); + + // Define the reduction descriptor + auto reductionAddDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_ADD) + .build(); + + // Create a reduction add Node + auto reductionAdd_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(afterExponentTensor) + .setyDesc(zTensor) + .setreductionDesc(reductionAddDesc) + .build(); + + // Define the reciprocal descriptor + auto reciprocalDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_RECIPROCAL); + + // Create a reciprocal Node + auto 
reciprocal_op = unary_pw_op_create(zTensor, zInvTensor, reciprocalDesc); + + // Define the pw multiply descriptor + auto multiplyDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply Node + auto mutliply_op = binary_pw_op_create( + afterExponentTensor, zInvTensor, beforeDropoutTensor, multiplyDesc); + + ops->push_back(std::move(reductionMax_op)); + ops->push_back(std::move(subtract_op)); + ops->push_back(std::move(exponent_op)); + ops->push_back(std::move(reductionAdd_op)); + ops->push_back(std::move(reciprocal_op)); + ops->push_back(std::move(mutliply_op)); + + return beforeDropoutTensor; +} + +static cudnn_frontend::Tensor createDropoutForward( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, + double probability, + std::vector* ops, + const cudnn_frontend::Tensor& beforeDropoutTensor) { + cudnn_frontend::throw_if(ops->size() == 0, + "Dropout DAG constructed incorrectly as the first one", + CUDNN_STATUS_BAD_PARAM); + + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + // Mask for the dropout + auto dropoutMaskTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 250, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + auto dropoutSeedTensor = tensor_create( + CUDNN_DATA_INT64, tensor_name_to_uid["DROPOUT_SEED"], + scale_dim, scale_stride, false, false); // is by value + auto dropoutOffsetTensor = tensor_create( + CUDNN_DATA_INT64, tensor_name_to_uid["DROPOUT_OFFSET"], + scale_dim, scale_stride, false, false); // is by value + + // After dropout tensor befor scale + auto beforeDropoutScaleTensor = cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(tensor_name_to_uid["VIRTUAL"] + 201) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(CUDNN_DATA_FLOAT) + 
.setVirtual(true) + .setByValue(false) + .setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t:: + CUDNN_TENSOR_REORDERING_F16x16) + .build(); + // Scale after dropout + auto scaleDropoutTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["DROPOUT_SCALE"], + scale_dim, scale_stride, false, true); // is by value + // After Scale + auto afterDropout_before_quan_S = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 202, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + + // Define the reduction descriptor + auto rngDesc = cudnn_frontend::RngDescBuilder() + .setRngDistribution(CUDNN_RNG_DISTRIBUTION_BERNOULLI) + .setBernoulliDistProbability(1.0 - probability) + .build(); + + // Create a rng Node + auto rng_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) + .setyDesc(dropoutMaskTensor) + .setSeedDesc(dropoutSeedTensor) + .setOffsetDesc(dropoutOffsetTensor) + .setRngDesc(rngDesc) + .build(); + + + // Define the multiply mask descriptor + auto maskMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask Node + auto maskMul_op = binary_pw_op_create( + beforeDropoutTensor, dropoutMaskTensor, + beforeDropoutScaleTensor, maskMulDesc); + + // Define the multiply scale descriptor + auto scaleMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask Node + auto scaleMul_op = binary_pw_op_create( + beforeDropoutScaleTensor, scaleDropoutTensor, + afterDropout_before_quan_S, scaleMulDesc); + + ops->push_back(std::move(rng_op)); + ops->push_back(std::move(maskMul_op)); + ops->push_back(std::move(scaleMul_op)); + + return afterDropout_before_quan_S; +} + +static cudnn_frontend::Tensor createDropoutBackward( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, + double probability, + std::vector* ops, + const cudnn_frontend::Tensor& beforeDropoutTensor, + const cudnn_frontend::Tensor& dropoutMaskTensor) { + 
cudnn_frontend::throw_if(ops->size() == 0, + "Dropout DAG constructed incorrectly as the first one", + CUDNN_STATUS_BAD_PARAM); + + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + auto dropoutSeedTensor = tensor_create( + CUDNN_DATA_INT64, tensor_name_to_uid["DROPOUT_SEED"], + scale_dim, scale_stride, false, false); // is by value + auto dropoutOffsetTensor = tensor_create( + CUDNN_DATA_INT64, tensor_name_to_uid["DROPOUT_OFFSET"], + scale_dim, scale_stride, false, false); // is by value + + // After dropout tensor befor scale + auto beforeDropoutScaleTensor = cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(tensor_name_to_uid["VIRTUAL"] + 201) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(CUDNN_DATA_FLOAT) + .setVirtual(true) + .setByValue(false) + .setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t:: + CUDNN_TENSOR_REORDERING_F16x16) + .build(); + // Scale after dropout (1 / (1 - p)) + auto scaleDropoutTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["DROPOUT_SCALE"], + scale_dim, scale_stride, false, true); // is by value + // After Scale + auto afterDropout_before_quan_S = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 202, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + + // Define the reduction descriptor + auto rngDesc = cudnn_frontend::RngDescBuilder() + .setRngDistribution(CUDNN_RNG_DISTRIBUTION_BERNOULLI) + .setBernoulliDistProbability(1.0 - probability) + .build(); + + // Create a rng Node + auto rng_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) + .setyDesc(dropoutMaskTensor) + .setSeedDesc(dropoutSeedTensor) + .setOffsetDesc(dropoutOffsetTensor) + .setRngDesc(rngDesc) + .build(); + + // Define the multiply mask 
descriptor + auto maskMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask Node + auto maskMul_op = binary_pw_op_create( + beforeDropoutTensor, dropoutMaskTensor, + beforeDropoutScaleTensor, maskMulDesc); + + // Define the multiply scale descriptor + auto scaleMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask Node + auto scaleMul_op = binary_pw_op_create( + beforeDropoutScaleTensor, scaleDropoutTensor, + afterDropout_before_quan_S, scaleMulDesc); + + ops->push_back(std::move(rng_op)); + ops->push_back(std::move(maskMul_op)); + ops->push_back(std::move(scaleMul_op)); + + return afterDropout_before_quan_S; +} + +static cudnn_frontend::Tensor createSoftmaxBackward( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, + std::vector* ops, + const cudnn_frontend::Tensor& dyTensor) { + cudnn_frontend::throw_if(ops->size() == 0, + "Softmax backward constructed incorrectly as the first one", + CUDNN_STATUS_BAD_PARAM); + + int64_t dx_dim[4] = {b, h, s_q, s_kv}; + int64_t dx_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t M_Z_dim[4] = {b, h, s_q, 1}; + int64_t M_Z_stride[4] = {h * s_q, s_q, 1, 1}; + + // Creating all tensors + auto MTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["M"], + M_Z_dim, M_Z_stride, false, false); // not virtual + auto ZInvTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["Z_INV"], + M_Z_dim, M_Z_stride, false, false); // not virtual + auto dxAfterSubtractionTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 252, + dx_dim, dx_stride, true, false); // is virtual + auto dxAfterExponentiation = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 253, + dx_dim, dx_stride, true, false); // is virtual + auto dxBeforeDropout_QKt_Tensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 254, + dx_dim, dx_stride, true, false); // is virtual + + // Creating all ops + // sub (dy 
- M) + auto subtractionDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + auto subtractionOp = binary_pw_op_create( + dyTensor, MTensor, dxAfterSubtractionTensor, subtractionDesc); + + // Define the exponent descriptor + auto exponentDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_EXP); + + // Create a exponent Node. (exp(dy - M)) + auto exponentOp = unary_pw_op_create( + dxAfterSubtractionTensor, dxAfterExponentiation, exponentDesc); + + // Define the pw multiply descriptor + auto multiplyDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply Node + auto mutliplyOp = binary_pw_op_create( + dxAfterExponentiation, ZInvTensor, dxBeforeDropout_QKt_Tensor, multiplyDesc); + + ops->push_back(std::move(subtractionOp)); + ops->push_back(std::move(exponentOp)); + ops->push_back(std::move(mutliplyOp)); + + return dxBeforeDropout_QKt_Tensor; +} + +static cudnn_frontend::Tensor createQKBMM( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, + cudnnDataType_t tensorType, + std::vector* ops, + const cudnn_frontend::Tensor &qTensor, + const cudnn_frontend::Tensor &kTensor, + const cudnn_frontend::Tensor &mnkOverride, + std::shared_ptr QKVRaggedOffsetTensor) { + // Creates the necessary tensor descriptors + int64_t k_transpose_dim[4] = {b, h, d, s_kv}; + int64_t k_transpose_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, + k_transpose_stride, layout, NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose); + + int64_t s_dim[4] = {b, h, s_q, s_kv}; + int64_t s_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, s_stride, layout, NVTE_QKV_Matrix::NVTE_S_Matrix); + + auto kTransposeTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["K_TRANSPOSE"], + k_transpose_dim, k_transpose_stride, + false, false, QKVRaggedOffsetTensor); // is virtual + + // First GEMM output + auto afterQKTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 1, + s_dim, s_stride, true, 
false); // is virtual + + // Define the matmul desc + auto matmulDesc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(-2000000) + .build(); + + // Create reshape node for K -> K.T + auto reshape_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(kTensor) + .setyDesc(kTransposeTensor) + .build(); + + // Create a matmul Node + auto matmulOp = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(qTensor) + .setbMatDesc(kTransposeTensor) + .setcMatDesc(afterQKTensor) + .setmOverrideDesc(mnkOverride) + .setnOverrideDesc(mnkOverride) + .setmatmulDesc(matmulDesc) + .build(); + + ops->push_back(std::move(reshape_op)); + ops->push_back(std::move(matmulOp)); + + return afterQKTensor; +} + +static cudnn_frontend::Tensor createSVBMM( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, + cudnnDataType_t tensorType, + std::vector* ops, + const cudnn_frontend::Tensor &softmaxTensor, + const cudnn_frontend::Tensor &mnkOverride, + std::shared_ptr QKVRaggedOffsetTensor) { + cudnn_frontend::throw_if(ops->size() == 0, + "BMM2 op constructed incorrectly as the first one", + CUDNN_STATUS_BAD_PARAM); + + int64_t v_dim[4] = {b, h, s_kv, d}; + int64_t v_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, v_stride, layout, NVTE_QKV_Matrix::NVTE_V_Matrix); + + int64_t o_dim[4] = {b, h, s_q, d}; + int64_t o_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, o_stride, layout, NVTE_QKV_Matrix::NVTE_O_Matrix); + + auto vTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["V"], + v_dim, v_stride, false, false, QKVRaggedOffsetTensor); + // Second fprop GEMM output + auto oTensor = tensor_create( + tensorType, tensor_name_to_uid["VIRTUAL"] + 300, + o_dim, o_stride, true, false); // is virtual + + // Define the matmul desc + auto matmulDesc = cudnn_frontend::MatMulDescBuilder() + 
.setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a matmul Node + auto matmulOp = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(softmaxTensor) + .setbMatDesc(vTensor) + .setcMatDesc(oTensor) + .setmOverrideDesc(mnkOverride) + .setkOverrideDesc(mnkOverride) + .setmatmulDesc(matmulDesc) + .build(); + + ops->push_back(std::move(matmulOp)); + + return oTensor; +} + +static cudnn_frontend::Tensor createSdOBMM( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + cudnnDataType_t tensorType, + std::vector* ops, + const cudnn_frontend::Tensor &softmaxTensor, + const cudnn_frontend::Tensor &dOTensor, + const cudnn_frontend::Tensor &mnkOverride) { + cudnn_frontend::throw_if(ops->size() == 0, + "BMM2 op constructed incorrectly as the first one", + CUDNN_STATUS_BAD_PARAM); + + int64_t s_dim_transpose[4] = {b, h, s_kv, s_q}; + int64_t s_stride_transpose[4] = {h * s_kv * s_q, s_kv * s_q, 1, s_kv}; + + int64_t v_dim[4] = {b, h, s_kv, d}; + int64_t v_stride[4] = {h * s_kv * d, d, h * d, 1}; + + auto sTransposeTensor = tensor_create( + tensorType, tensor_name_to_uid["VIRTUAL"] + 499, + s_dim_transpose, s_stride_transpose, + true, false); // is virtual + // S.T * dO + auto dVTensor_before_dequan_S = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 500, + v_dim, v_stride, + true, false); // is virtual + + // Create reshape node for softmax -> softmax.T + auto reshape_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(softmaxTensor) + .setyDesc(sTransposeTensor) + .build(); + + // Define the matmul desc + auto matmulDesc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0) + .build(); + + // Create a matmul Node + auto matmulOp = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(sTransposeTensor) + .setbMatDesc(dOTensor) + .setcMatDesc(dVTensor_before_dequan_S) + 
.setmOverrideDesc(mnkOverride) + .setkOverrideDesc(mnkOverride) + .setmatmulDesc(matmulDesc) + .build(); + + ops->push_back(std::move(reshape_op)); + ops->push_back(std::move(matmulOp)); + + return dVTensor_before_dequan_S; +} + +static cudnn_frontend::Tensor createdOVBMM( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, + cudnnDataType_t tensorType, + std::vector* ops, + const cudnn_frontend::Tensor &dOTensor, + const cudnn_frontend::Tensor &mnkOverride, + std::shared_ptr QKVRaggedOffsetTensor) { + // Creates the necessary tensor descriptors + int64_t v_dim[4] = {b, h, s_kv, d}; + int64_t v_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, v_stride, layout, NVTE_QKV_Matrix::NVTE_V_Matrix); + + int64_t v_transpose_dim[4] = {b, h, d, s_kv}; + int64_t v_transpose_stride[4]; + v_transpose_stride[0] = v_stride[0]; + v_transpose_stride[1] = v_stride[1]; + v_transpose_stride[2] = v_stride[3]; + v_transpose_stride[3] = v_stride[2]; + + int64_t s_dim[4] = {b, h, s_q, s_kv}; + int64_t s_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, s_stride, layout, NVTE_QKV_Matrix::NVTE_S_Matrix); + + auto vTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["V"], + v_dim, v_stride, + false, false, QKVRaggedOffsetTensor); + auto vTransposeTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["V_TRANSPOSE"], + v_transpose_dim, v_transpose_stride, + false, false, QKVRaggedOffsetTensor); // is virtual + + // dO * V.T + auto afterdOVTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 600, + s_dim, s_stride, true, false); // is virtual + + // Define the matmul desc + auto matmulDesc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0) + .build(); + + // Create reshape node for V -> V.T + auto reshape_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(vTensor) + .setyDesc(vTransposeTensor) + .build(); 
+ + // Create a matmul Node + auto matmulOp = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(dOTensor) + .setbMatDesc(vTransposeTensor) + .setcMatDesc(afterdOVTensor) + .setmOverrideDesc(mnkOverride) + .setnOverrideDesc(mnkOverride) + .setmatmulDesc(matmulDesc) + .build(); + + ops->push_back(std::move(reshape_op)); + ops->push_back(std::move(matmulOp)); + + return afterdOVTensor; +} + +static cudnn_frontend::Tensor createdOAndORowReductionChain( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, + std::vector* ops, + const cudnn_frontend::Tensor &O_after_dequan, + const cudnn_frontend::Tensor &dO_after_dequan, + const cudnn_frontend::Tensor &dropoutScale_dOVt_OdO_Tensor) { + int64_t o_dim[4] = {b, h, s_q, d}; + int64_t o_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, o_stride, layout, NVTE_QKV_Matrix::NVTE_O_Matrix); + int64_t o_dim_row_sum[4] = {b, h, s_q, 1}; + int64_t o_dim_row_sum_stride[4] = {s_q * h, s_q, 1, 1}; + + auto O_dO_after_pointwise_multiply = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 700, + o_dim, o_stride, true, false); // is virtual + auto O_dO_after_dropout_scale = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 701, + o_dim, o_stride, true, false); // is virtual + auto O_dO_after_rowsum = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 702, + o_dim_row_sum, o_dim_row_sum_stride, true, false); // is virtual + + // Define the pw multiply descriptor + auto multiplyDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply Node + auto mutliply_op = binary_pw_op_create( + O_after_dequan, dO_after_dequan, + O_dO_after_pointwise_multiply, multiplyDesc); + + // Create multiply node with dropout scale + auto dropout_scale_multiply_op = binary_pw_op_create( + O_dO_after_pointwise_multiply, dropoutScale_dOVt_OdO_Tensor, + O_dO_after_dropout_scale, multiplyDesc); + + // Define 
the reduction descriptor + auto reductionAddDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_ADD) + .build(); + + // Create a reduction add Node + auto reductionAdd_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(O_dO_after_dropout_scale) + .setyDesc(O_dO_after_rowsum) + .setreductionDesc(reductionAddDesc) + .build(); + + ops->push_back(std::move(mutliply_op)); + ops->push_back(std::move(dropout_scale_multiply_op)); + ops->push_back(std::move(reductionAdd_op)); + + return O_dO_after_rowsum; +} + +static cudnn_frontend::Tensor createBiasSubtractionSoftmaxMulChain( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, + std::vector* ops, + const cudnn_frontend::Tensor &dS_after_dropout, + const cudnn_frontend::Tensor &AfterDropout_before_quan_S, + const cudnn_frontend::Tensor &O_dO_after_rowsum, + const cudnn_frontend::Tensor &attnScale) { + int64_t o_dim[4] = {b, h, s_q, s_kv}; + int64_t o_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, o_stride, layout, NVTE_QKV_Matrix::NVTE_S_Matrix); + auto dS_minus_O_dO = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 800, + o_dim, o_stride, true, false); // is virtual + auto AfterAttnScale_before_dS = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 801, + o_dim, o_stride, true, false); // is virtual + auto S_mul_dS_minus_O_dO = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 802, + o_dim, o_stride, true, false); // is virtual + + // Define the pw subtraction descriptor + auto subDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + + // Create a subtraction Node + auto sub_op = binary_pw_op_create( + dS_after_dropout, O_dO_after_rowsum, dS_minus_O_dO, subDesc); + + // Define the pw multiplication descriptor + auto multiplyDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // 
dS_minus_O_dO * attnScale + auto mutliply_attn_scale_op = binary_pw_op_create( + dS_minus_O_dO, attnScale, + AfterAttnScale_before_dS, multiplyDesc); + + // AfterDropout_before_quan_S * AfterAttnScale_before_dS + auto mutliply_op = binary_pw_op_create( + AfterDropout_before_quan_S, AfterAttnScale_before_dS, + S_mul_dS_minus_O_dO, multiplyDesc); + + ops->push_back(std::move(sub_op)); + ops->push_back(std::move(mutliply_attn_scale_op)); + ops->push_back(std::move(mutliply_op)); + + return S_mul_dS_minus_O_dO; +} + +static cudnn_frontend::Tensor createdSKBMM( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + std::vector* ops, + const cudnn_frontend::Tensor &dSTensor, + const cudnn_frontend::Tensor &kTensor, + const cudnn_frontend::Tensor &mnkOverride) { + // Creates the necessary tensor descriptors + int64_t after_dSK_dim[4] = {b, h, s_kv, d}; + int64_t after_dSK_stride[4] = {h * s_kv * d, d, h * d, 1}; + // dS * K + auto After_dS_K = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 875, + after_dSK_dim, after_dSK_stride, true, false); // is virtual + + // Define the matmul desc + auto matmulDesc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0) + .build(); + + // Create a matmul Node + auto matmulOp = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(dSTensor) + .setbMatDesc(kTensor) + .setcMatDesc(After_dS_K) + .setmOverrideDesc(mnkOverride) + .setkOverrideDesc(mnkOverride) + .setmatmulDesc(matmulDesc) + .build(); + + ops->push_back(std::move(matmulOp)); + + return After_dS_K; +} + +static cudnn_frontend::Tensor createdSQBMM( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, + std::vector* ops, + const cudnn_frontend::Tensor &dSTensor, + const cudnn_frontend::Tensor &qTensor, + const cudnn_frontend::Tensor &mnkOverride) { + // Creates the necessary tensor descriptors + int64_t dS_stride[4]; + 
generateMatrixStrides(b, h, s_q, s_kv, d, dS_stride, layout, NVTE_QKV_Matrix::NVTE_S_Matrix); + + int64_t dS_transpose_dim[4] = {b, h, s_kv, s_q}; + int64_t dS_transpose_stride[4]; + dS_transpose_stride[0] = dS_stride[0]; + dS_transpose_stride[1] = dS_stride[1]; + dS_transpose_stride[2] = dS_stride[3]; + dS_transpose_stride[3] = dS_stride[2]; + + int64_t after_dSTranspose_Q_dim[4] = {b, h, s_kv, d}; + int64_t after_dSTranspose_Q_stride[4] = {h * s_kv * d, d, h * d, 1}; + + auto dSTransposeTensor = tensor_create( + CUDNN_DATA_FP8_E5M2, tensor_name_to_uid["VIRTUAL"] + 650, + dS_transpose_dim, dS_transpose_stride, true, false); // is virtual + + // dS.T * Q + auto After_dSTranspose_Q = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 651, + after_dSTranspose_Q_dim, after_dSTranspose_Q_stride, + true, false); // is virtual + + // Create reshape node for V -> V.T + auto reshape_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(dSTensor) + .setyDesc(dSTransposeTensor) + .build(); + + // Define the matmul desc + auto matmulDesc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0) + .build(); + + // Create a matmul Node + auto matmulOp = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(dSTransposeTensor) + .setbMatDesc(qTensor) + .setcMatDesc(After_dSTranspose_Q) + .setmOverrideDesc(mnkOverride) + .setkOverrideDesc(mnkOverride) + .setmatmulDesc(matmulDesc) + .build(); + + ops->push_back(std::move(reshape_op)); + ops->push_back(std::move(matmulOp)); + + return After_dSTranspose_Q; +} + +// fused attention FWD FP8 +void fa_fwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, + bool isTraining, float attnScale, + float dropoutProbability, NVTE_QKV_Layout layout, + void* devPtrQ, void* devPtrK, void* devPtrV, + void* devPtrM, void* devPtrZInv, + void* devPtrO, + void* devPtrDescaleQ, void* devPtrDescaleK, void* 
devPtrDescaleV, + void* devPtrDescaleS, void* devPtrScaleS, void* devPtrScaleO, + void* devPtrAmaxO, void* devPtrAmaxS, + void* devPtrcuSeqlensQ, void* devPtrcuSeqlensKV, + void* devPtrDropoutSeed, void* devPtrDropoutOffset, + cudnnDataType_t tensorType, + void* workspace_ptr, + size_t* workspace_size, + cudaStream_t stream, + cudnnHandle_t handle_) { + try { + NVTE_CHECK_CUDNN(cudnnSetStream(handle_, stream)); + + FADescriptor descriptor{ + b, h, s_q, s_kv, d, + attnScale, isTraining, dropoutProbability, layout, tensorType}; + + using CacheType = std::map; + static CacheType fa_fprop_cache; + + // Get plan from cache if cache is available, otherwise create one + auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { + // If hit, return + auto it = cache.find(descriptor); + if (it != cache.end()) { + auto plan = it->second; + return plan; + } + + // Otherwise, build the op_graph and the plan. Then update cache + std::vector all_ops; + std::vector ops; + + cudnn_frontend::throw_if(dropoutProbability != 0.0f && !isTraining, + "Dropout probability should be 0.0f for inference mode", + CUDNN_STATUS_BAD_PARAM); + cudnn_frontend::throw_if(dropoutProbability == 1.0f, + "Dropout probability cannot be 1.0", + CUDNN_STATUS_BAD_PARAM); + + int64_t raggedDim[4] = {b + 1, 1, 1, 1}; + int64_t raggedStride[4] = {1, 1, 1, 1}; + // Create offset tensors + auto QKVOffsetTensor = tensor_create( + CUDNN_DATA_INT32, tensor_name_to_uid["QKV_RAGGED"], + raggedDim, raggedStride, false, false); + auto ORaggedOffsetTensor = tensor_create( + CUDNN_DATA_INT32, tensor_name_to_uid["O_RAGGED"], + raggedDim, raggedStride, false, false); + + int64_t seqlen_dim[4] = {b, 1, 1, 1}; + int64_t seqlen_stride[4] = {1, 1, 1, 1}; + // Create override tensors + auto seqlenMNKTensor = tensor_create( + CUDNN_DATA_INT32, tensor_name_to_uid["MNK_OVERRIDE"], + seqlen_dim, seqlen_stride, false, false); + + // Create shared ptrs to ragged offset tensors + // for multiple tensors to use ragged 
offset + std::shared_ptr QKVRaggedOffsetTensorPtr = + std::make_shared(std::move(QKVOffsetTensor)); + std::shared_ptr ORaggedOffsetTensorPtr = + std::make_shared(std::move(ORaggedOffsetTensor)); + + // Create Q and K tensors that are used in different places + int64_t q_dim[4] = {b, h, s_q, d}; + int64_t q_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, q_stride, layout, + NVTE_QKV_Matrix::NVTE_Q_Matrix); + + int64_t k_dim[4] = {b, h, s_kv, d}; + int64_t k_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, k_stride, layout, + NVTE_QKV_Matrix::NVTE_K_Matrix); + + auto qTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["Q"], + q_dim, q_stride, false, false, + QKVRaggedOffsetTensorPtr); + auto kTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["K"], + k_dim, k_stride, false, false, + QKVRaggedOffsetTensorPtr); + + // Q * K.T + auto afterQKTensor = createQKBMM( + b, h, s_q, s_kv, d, layout, tensorType, + &ops, qTensor, kTensor, + seqlenMNKTensor, QKVRaggedOffsetTensorPtr); + + // QK.T * attn scale + auto AfterAttnScale_before_dequan_Q_tensor = createScale( + afterQKTensor, // input tensor + "AttnScale", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + true, // scale is by value + &ops); + + // QK.T * attn scale * dequant_Q + auto AfterAttnScale_before_dequan_K_tensor = createScale( + AfterAttnScale_before_dequan_Q_tensor, // input tensor + "descaleQ", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // QK.T * attn scale * dequant_Q * dequant_K + auto AfterAttnScale_tensor = createScale( + AfterAttnScale_before_dequan_K_tensor, // input tensor + "descaleK", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + auto BeforeDropoutTensor = createSoftmaxForward( + b, h, s_q, s_kv, &ops, + AfterAttnScale_tensor, isTraining); + + auto 
AfterDropout_before_quan_S = createDropoutForward( + b, h, s_q, s_kv, dropoutProbability, + &ops, BeforeDropoutTensor); + + // Amax for S + createAmax("amaxS", BeforeDropoutTensor, &ops); + + // After softmax * dropout * scale S -> fp8 input to next bmm with V + auto AfterMultiplyDropout = createScale( + AfterDropout_before_quan_S, // input tensor + "scaleS", // scale tensor + tensorType, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // After softmax * Dropout * V + auto OTensor_before_dequan_S_tensor = createSVBMM( + b, h, s_q, s_kv, d, layout, tensorType, + &ops, AfterMultiplyDropout, + seqlenMNKTensor, QKVRaggedOffsetTensorPtr); + + // O * dequant_S + auto OTensor_before_dequan_V_tensor = createScale( + OTensor_before_dequan_S_tensor, // input tensor + "descaleS", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // O * dequant_S * dequant_V + auto OTensor_before_quan_O_tensor = createScale( + OTensor_before_dequan_V_tensor, // input tensor + "descaleV", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // O * dequant_S * dequant_V * scale O + auto OTensor = createScaleWithOffset( + OTensor_before_quan_O_tensor, // input tensor + "scaleO", // scale tensor + tensorType, // output tensor type + false, // output not virtual + false, // scale is by value + &ops, + ORaggedOffsetTensorPtr, // ragged offset + "O"); + + // Amax for O + createAmax("amaxO", OTensor_before_quan_O_tensor, &ops); + + for (unsigned int i = 0; i < ops.size(); i++) { + all_ops.push_back(&ops[i]); + } + + // Create an Operation Graph + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle_) + .setOperationGraph(all_ops.size(), all_ops.data()) + .build(); + + cudnn_frontend::EngineConfigList filtered_configs; + auto statuses = cudnn_frontend::get_heuristics_list<1>( + 
{"heuristics_instant"}, opGraph, + allowAllConfig, filtered_configs, true); + + if (filtered_configs.size() == 0) { + cudnn_frontend::set_error_and_throw_exception( + nullptr, + CUDNN_STATUS_NOT_SUPPORTED, + "run_mha_fprop: No config returned by the heuristics"); + } + + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle_) + .setEngineConfig(filtered_configs[0], opGraph.getTag()) + .build(); + cache.insert({descriptor, plan}); + return plan; + }; // end of get_plan + + auto plan = get_plan(fa_fprop_cache, descriptor); + size_t wkspace_size = static_cast(plan.getWorkspaceSize()); + + // Exit to request upper level API to allocate memory if needed + if (workspace_ptr == nullptr) { + *workspace_size = wkspace_size + ((b + 1) * 2 + b) * sizeof(int32_t); + return; + } + + int32_t* qkv_ragged_offset = reinterpret_cast( + reinterpret_cast(workspace_ptr) + wkspace_size); + int32_t* o_ragged_offset = reinterpret_cast( + reinterpret_cast(workspace_ptr) + + wkspace_size + (b + 1) * sizeof(int32_t)); + int32_t* actual_seqlens_q = reinterpret_cast( + reinterpret_cast(workspace_ptr) + + wkspace_size + (b + 1) * 2 * sizeof(int32_t)); + // FP8 currently only supports self-attention, so doesn't use devPtrcuSeqlensKV + dim3 blockDims(128); + dim3 gridDims((b + blockDims.x)/blockDims.x); + cu_seqlens_to_offsets<<>>( + b, h, d, reinterpret_cast(devPtrcuSeqlensQ), + actual_seqlens_q, qkv_ragged_offset, o_ragged_offset); + void* devPtrQKVRaggedOffset = reinterpret_cast(qkv_ragged_offset); + void* devPtrORaggedOffset = reinterpret_cast(o_ragged_offset); + void* devPtrMNKOverride = reinterpret_cast(actual_seqlens_q); + + float dropoutScale = 1.0f/(1.0f - dropoutProbability); + + std::set> data_ptrs; + data_ptrs.emplace(std::pair(tensor_name_to_uid["Q"], devPtrQ)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["K"], devPtrK)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["K_TRANSPOSE"], devPtrK)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["V"], devPtrV)); 
+ data_ptrs.emplace(std::pair(tensor_name_to_uid["AttnScale"], &attnScale)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_SCALE"], &dropoutScale)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_SEED"], devPtrDropoutSeed)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_OFFSET"], devPtrDropoutOffset)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["O"], devPtrO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleQ"], devPtrDescaleQ)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleK"], devPtrDescaleK)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleV"], devPtrDescaleV)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleS"], devPtrDescaleS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaleS"], devPtrScaleS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaleO"], devPtrScaleO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["amaxO"], devPtrAmaxO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["amaxS"], devPtrAmaxS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["QKV_RAGGED"], devPtrQKVRaggedOffset)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["O_RAGGED"], devPtrORaggedOffset)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["MNK_OVERRIDE"], devPtrMNKOverride)); + + // If training, then we need to write out M and Z_INV + if (isTraining) { + data_ptrs.emplace(std::pair( + tensor_name_to_uid["M"], devPtrM)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["Z_INV"], devPtrZInv)); + } + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(data_ptrs) + .build(); + cudnnStatus_t status = cudnnBackendExecute( + handle_, plan.get_raw_desc(), variantPack.get_raw_desc()); + + cudnn_frontend::throw_if( + [status]() { return (status != CUDNN_STATUS_SUCCESS); }, + "Plan execute error", status); + } catch (cudnn_frontend::cudnnException& e) { + struct cudaDeviceProp 
prop; + NVTE_CHECK_CUDA(cudaGetDeviceProperties(&prop, 0)); + + // This example is only for GH100 cards (cudnn Version >= 8900) + if (!((prop.major == 9 && prop.minor == 0 && CUDNN_VERSION >= 8900)) + && (e.getCudnnStatus() == CUDNN_STATUS_ARCH_MISMATCH + || e.getCudnnStatus() == CUDNN_STATUS_NOT_SUPPORTED)) { + std::cout << "Example is only supported for GH100 (cuDNN >= 8900) GPUs" << std::endl; + } else { + std::cout << "[ERROR] Exception " << e.what() << std::endl; + } + } +} + +// fused attention BWD FP8 +void fa_bwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, + float attnScale, float dropoutProbability, NVTE_QKV_Layout layout, + void* devPtrQ, void* devPtrK, void* devPtrV, + void* devPtrM, void* devPtrZInv, + void* devPtrO, void* devPtrdO, + void* devPtrdQ, void* devPtrdK, void* devPtrdV, + void* devPtrDescaleQ, void* devPtrDescaleK, void* devPtrDescaleV, + void* devPtrDescaleO, void* devPtrDescaledO, + void* devPtrDescaleS, void* devPtrDescaledS, + void* devPtrScaleS, void* devPtrScaledS, + void* devPtrScaledQ, void* devPtrScaledK, void* devPtrScaledV, + void* devPtrAmaxdS, + void* devPtrAmaxdQ, void* devPtrAmaxdK, void* devPtrAmaxdV, + void* devPtrcuSeqlensQ, void* devPtrcuSeqlensKV, + void* devPtrDropoutSeed, void* devPtrDropoutOffset, + cudnnDataType_t tensorType, + void* workspace_ptr, + size_t* workspace_size, + cudaStream_t stream, + cudnnHandle_t handle_) { + try { + NVTE_CHECK_CUDNN(cudnnSetStream(handle_, stream)); + + FADescriptor descriptor{ + b, h, s_q, s_kv, d, + attnScale, false, dropoutProbability, layout, tensorType}; + + using CacheType = std::map; + static CacheType fa_bprop_cache; + + // Get plan from cache if cache is available, otherwise create one + auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { + // If hit, return + auto it = cache.find(descriptor); + if (it != cache.end()) { + auto plan = it->second; + return plan; + } + + // Otherwise, build the op_graph and the plan. 
Then update cache + std::vector all_ops; + std::vector ops; + + cudnn_frontend::throw_if(dropoutProbability == 1.0f, + "Dropout probability cannot be 1.0", + CUDNN_STATUS_BAD_PARAM); + + int64_t raggedDim[4] = {b + 1, 1, 1, 1}; + int64_t raggedStride[4] = {1, 1, 1, 1}; + // Create offset tensors + auto QKVOffsetTensor = tensor_create( + CUDNN_DATA_INT32, tensor_name_to_uid["QKV_RAGGED"], + raggedDim, raggedStride, false, false); + auto ORaggedOffsetTensor = tensor_create( + CUDNN_DATA_INT32, tensor_name_to_uid["O_RAGGED"], + raggedDim, raggedStride, false, false); + + // Create shared ptrs to ragged offset tensors for multiple tensors + std::shared_ptr QKVRaggedOffsetTensorPtr = + std::make_shared(std::move(QKVOffsetTensor)); + std::shared_ptr ORaggedOffsetTensorPtr = + std::make_shared(std::move(ORaggedOffsetTensor)); + + // Create Q and K tensors that are used in different places + int64_t q_dim[4] = {b, h, s_q, d}; + int64_t q_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, q_stride, layout, + NVTE_QKV_Matrix::NVTE_Q_Matrix); + + int64_t k_dim[4] = {b, h, s_kv, d}; + int64_t k_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, k_stride, layout, + NVTE_QKV_Matrix::NVTE_K_Matrix); + + auto qTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["Q"], + q_dim, q_stride, false, false, QKVRaggedOffsetTensorPtr); + auto kTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["K"], + k_dim, k_stride, false, false, QKVRaggedOffsetTensorPtr); + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + // Create attnScale tensor for multiple ops to use + auto attnScaleTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["AttnScale"], + scale_dim, scale_stride, false, true); // is by value + + // Create descale Q K dO dS global tensors since they are used in multiple places + auto descaleQTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["descaleQ"], + scale_dim, scale_stride, false, 
false); + auto descaleKTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["descaleK"], + scale_dim, scale_stride, false, false); + auto descaledOTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["descaledO"], + scale_dim, scale_stride, false, false); + auto descaledSTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["descaledS"], + scale_dim, scale_stride, false, false); + + int64_t seqlen_dim[4] = {b, 1, 1, 1}; + int64_t seqlen_stride[4] = {1, 1, 1, 1}; + // Create MNK override tensor + auto seqlenMNKTensor = tensor_create( + CUDNN_DATA_INT32, tensor_name_to_uid["MNK_OVERRIDE"], + seqlen_dim, seqlen_stride, false, false); + + int64_t O_dim[4] = {b, h, s_q, d}; + int64_t O_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, O_stride, layout, + NVTE_QKV_Matrix::NVTE_O_Matrix); + // Create O and loss tensor + auto OTensor = tensor_create_with_offset( + tensorType, tensor_name_to_uid["O"], + O_dim, O_stride, false, false, ORaggedOffsetTensorPtr); + // dO is used in multiple places and E5M2 + auto dOTensor = tensor_create_with_offset( + CUDNN_DATA_FP8_E5M2, tensor_name_to_uid["dO"], + O_dim, O_stride, false, false, ORaggedOffsetTensorPtr); + + // Q * K.T + auto afterQKTensor = createQKBMM( + b, h, s_q, s_kv, d, layout, tensorType, + &ops, qTensor, kTensor, + seqlenMNKTensor, QKVRaggedOffsetTensorPtr); + + // QK.T * attn scale + auto AfterAttnScale_before_dequan_Q_tensor = createScale( + afterQKTensor, // input tensor + attnScaleTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + true, // scale is by value + &ops, + 1999 /*UID offset*/); + + // QK.T * attn scale * dequant_Q + auto AfterAttnScale_before_dequan_K_tensor = createScale( + AfterAttnScale_before_dequan_Q_tensor, // input tensor + descaleQTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2000 /*UID offset*/); + + // QK.T * attn scale * 
dequant_Q * dequant_K + auto AfterAttnScale_tensor = createScale( + AfterAttnScale_before_dequan_K_tensor, // input tensor + descaleKTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2001 /*UID offset*/); + + auto beforeDropout_QKt_Tensor = createSoftmaxBackward( + b, h, s_q, s_kv, &ops, AfterAttnScale_tensor); + + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + // mask for the dropout. Used in different places + auto dropoutMaskTensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 200, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + + auto AfterDropout_before_quan_S = createDropoutBackward( + b, h, s_q, s_kv, dropoutProbability, + &ops, beforeDropout_QKt_Tensor, dropoutMaskTensor); + + // After softmax * scale S -> fp8 input to next bmm with V + auto AfterMultiply = createScale( + AfterDropout_before_quan_S, // input tensor + "scaleS", // scale tensor + tensorType, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // After softmax * dO + auto dVTensor_before_dequan_S = createSdOBMM( + b, h, s_q, s_kv, d, tensorType, + &ops, AfterMultiply, dOTensor, seqlenMNKTensor); + + // O * dequant_S + auto dVTensor_before_dequan_dO = createScale( + dVTensor_before_dequan_S, // input tensor + "descaleS", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // O * dequant_S * dequant_dO + auto dVTensor_before_quan_dV = createScale( + dVTensor_before_dequan_dO, // input tensor + descaledOTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2002 /*UID offset*/); + + // O * dequant_S * dequant_dO * scale dV + auto dVTensor = createScaleWithOffset( + dVTensor_before_quan_dV, // input tensor 
+ "scaledV", // scale tensor + CUDNN_DATA_FP8_E5M2, // output tensor type + false, // output not virtual + false, // scale is by value + &ops, + QKVRaggedOffsetTensorPtr, // ragged offset + "dV" /*Output tensor name*/); + + // Amax for dV + createAmax("amaxdV", dVTensor_before_quan_dV, &ops); + + auto dS_before_dequan_dO_Tensor = createdOVBMM( + b, h, s_q, s_kv, d, layout, tensorType, + &ops, dOTensor, seqlenMNKTensor, QKVRaggedOffsetTensorPtr); + + // dS * dequant_dO + auto dS_before_dequan_V = createScale( + dS_before_dequan_dO_Tensor, // input tensor + descaledOTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2003 /*UID offset*/); + + // O * dequant_S * dequant_dV + auto dS_after_dequan = createScale( + dS_before_dequan_V, // input tensor + "descaleV", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // RNG Multiply + auto beforeDropoutScale_dOVt_Tensor = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 350, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + // After dropout mask and scale + auto dS_after_dropout = tensor_create( + CUDNN_DATA_FLOAT, tensor_name_to_uid["VIRTUAL"] + 351, + afterBMM1_dim, afterBMM1_stride, true, false); // is virtual + + // Define the multiply mask descriptor + auto mulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask Node + auto maskMul_op = binary_pw_op_create( + dS_after_dequan, dropoutMaskTensor, + beforeDropoutScale_dOVt_Tensor, mulDesc); + + ops.push_back(std::move(maskMul_op)); + + // scale after dropout for dO and O chain + auto dropoutScale_dOVt_OdO_Tensor = tensor_create( + tensorType, tensor_name_to_uid["DROPOUT_SCALE_dOVt_OdO"], + scale_dim, scale_stride, false, true); // is by value + + // Create a multiply dropout scale Node + auto mul_dropout_scale_op = binary_pw_op_create( + 
beforeDropoutScale_dOVt_Tensor, + dropoutScale_dOVt_OdO_Tensor, + dS_after_dropout, mulDesc); + + ops.push_back(std::move(mul_dropout_scale_op)); + + // O * dequant_O + auto O_after_dequan_Tensor = createScale(OTensor, // input tensor + "descaleO", // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // dO * dequant_dO + auto dO_after_dequan_Tensor = createScale(dOTensor, // input tensor + descaledOTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2004 /*UID offset*/); + + // row reduction sum[(dO * dequant_dO) * (O * dequant_O) * (1 - p)] + auto O_dO_after_rowsum = createdOAndORowReductionChain( + b, h, s_q, s_kv, d, layout, + &ops, O_after_dequan_Tensor, + dO_after_dequan_Tensor, dropoutScale_dOVt_OdO_Tensor); + + // (dS_after_dropout - O_dO_after_rowsum) * AfterDropout_before_quan_S * attnScale + auto S_mul_dS_minus_O_dO = createBiasSubtractionSoftmaxMulChain( + b, h, s_q, s_kv, d, layout, + &ops, dS_after_dropout, + AfterDropout_before_quan_S, O_dO_after_rowsum, + attnScaleTensor); + + + // S_mul_dS_minus_O_dO * scaledS + auto S_mul_dS_minus_O_dO_after_quan_dS = createScale( + S_mul_dS_minus_O_dO, // input tensor + "scaledS", // scale tensor + CUDNN_DATA_FP8_E5M2, // output tensor type + true, // output is virtual + false, // scale is by value + &ops); + + // Amax for dS + createAmax("amaxdS", S_mul_dS_minus_O_dO, &ops); + + // dS @ K + auto After_dS_K = createdSKBMM( + b, h, s_q, s_kv, d, &ops, + S_mul_dS_minus_O_dO_after_quan_dS, + kTensor, seqlenMNKTensor); + + // (dS * K) * descale dS + auto After_dS_K_before_dequan_K = createScale( + After_dS_K, // input tensor + descaledSTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2006 /*UID offset*/); + + // (dS * K) * descale dS * descale K + auto 
After_dS_K_before_quan_dQ = createScale( + After_dS_K_before_dequan_K, // input tensor + descaleKTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2007 /*UID offset*/); + + // (dS * K) * descale dS * descale K * scale dQ + auto dQ = createScaleWithOffset( + After_dS_K_before_quan_dQ, // input tensor + "scaledQ", // scale tensor + CUDNN_DATA_FP8_E5M2, // output tensor type + false, // output not virtual + false, // scale is by value + &ops, + QKVRaggedOffsetTensorPtr, // ragged offset + "dQ"); + + // Amax for dQ + createAmax("amaxdQ", After_dS_K_before_quan_dQ, &ops); + + // dS.T @ Q + auto After_dSTranspose_Q = createdSQBMM( + b, h, s_q, s_kv, d, layout, &ops, + S_mul_dS_minus_O_dO_after_quan_dS, + qTensor, seqlenMNKTensor); + + // (dS.T * Q) * descale dS + auto After_dSTranspose_Q_before_dequan_Q = createScale( + After_dSTranspose_Q, // input tensor + descaledSTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2009 /*UID offset*/); + + // (dS.T * Q) * descale dS * descale Q + auto After_dSTranspose_Q_before_quan_dK = createScale( + After_dSTranspose_Q_before_dequan_Q, // input tensor + descaleQTensor, // scale tensor + CUDNN_DATA_FLOAT, // output tensor type + true, // output is virtual + false, // scale is by value + &ops, + 2010 /*UID offset*/); + + // (dS.T * Q) * descale dS * descale Q * scale dK + auto dK = createScaleWithOffset( + After_dSTranspose_Q_before_quan_dK, // input tensor + "scaledK", // scale tensor + CUDNN_DATA_FP8_E5M2, // output tensor type + false, // output not virtual + false, // scale is by value + &ops, + QKVRaggedOffsetTensorPtr, // ragged offset + "dK"); + + // Amax for dK + createAmax("amaxdK", After_dSTranspose_Q_before_quan_dK, &ops); + + for (unsigned int i = 0; i < ops.size(); i++) { + all_ops.push_back(&ops[i]); + } + + // Create an Operation Graph + auto opGraph = 
cudnn_frontend::OperationGraphBuilder() + .setHandle(handle_) + .setOperationGraph(all_ops.size(), all_ops.data()) + .build(); + + cudnn_frontend::EngineConfigList filtered_configs; + auto statuses = cudnn_frontend::get_heuristics_list<1>( + {"heuristics_instant"}, opGraph, + allowAllConfig, filtered_configs, true); + + if (filtered_configs.size() == 0) { + cudnn_frontend::set_error_and_throw_exception( + nullptr, + CUDNN_STATUS_NOT_SUPPORTED, + "run_mha_bprop: No config returned by the heuristics"); + } + + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle_) + .setEngineConfig(filtered_configs[0], opGraph.getTag()) + .build(); + cache.insert({descriptor, plan}); + return plan; + }; + + auto plan = get_plan(fa_bprop_cache, descriptor); + size_t wkspace_size = static_cast(plan.getWorkspaceSize()); + + // Exit to request upper level API to allocate memory if needed + if (workspace_ptr == nullptr) { + *workspace_size = wkspace_size + ((b + 1) * 2 + b) * sizeof(int32_t); + return; + } + + int32_t* qkv_ragged_offset = reinterpret_cast( + reinterpret_cast(workspace_ptr) + wkspace_size); + int32_t* o_ragged_offset = reinterpret_cast( + reinterpret_cast(workspace_ptr) + + wkspace_size + (b + 1) * sizeof(int32_t)); + int32_t* actual_seqlens_q = reinterpret_cast( + reinterpret_cast(workspace_ptr) + + wkspace_size + (b + 1) * 2 * sizeof(int32_t)); + // FP8 currently only supports self-attention, so doesn't use devPtrcuSeqlensKV + dim3 blockDims(128); + dim3 gridDims((b + blockDims.x)/blockDims.x); + cu_seqlens_to_offsets<<>>( + b, h, d, reinterpret_cast(devPtrcuSeqlensQ), + actual_seqlens_q, qkv_ragged_offset, o_ragged_offset); + void* devPtrQKVRaggedOffset = reinterpret_cast(qkv_ragged_offset); + void* devPtrORaggedOffset = reinterpret_cast(o_ragged_offset); + void* devPtrMNKOverride = reinterpret_cast(actual_seqlens_q); + + std::set> data_ptrs; + float dropoutScale = 1.0f/(1.0f - dropoutProbability); + float dropoutScale_dOVt_OdO = 1.0f - 
dropoutProbability; + data_ptrs.emplace(std::pair(tensor_name_to_uid["Q"], devPtrQ)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["K"], devPtrK)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["K_TRANSPOSE"], devPtrK)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["V"], devPtrV)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["V_TRANSPOSE"], devPtrV)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["dQ"], devPtrdQ)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["dK"], devPtrdK)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["dV"], devPtrdV)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["dO"], devPtrdO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["AttnScale"], &attnScale)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_SCALE"], &dropoutScale)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_SCALE_dOVt_OdO"], + &dropoutScale_dOVt_OdO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_SEED"], devPtrDropoutSeed)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["DROPOUT_OFFSET"], devPtrDropoutOffset)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["M"], devPtrM)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["Z_INV"], devPtrZInv)); + data_ptrs.emplace(std::pair(tensor_name_to_uid["O"], devPtrO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleQ"], devPtrDescaleQ)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleK"], devPtrDescaleK)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleV"], devPtrDescaleV)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleS"], devPtrDescaleS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaledS"], devPtrDescaledS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaleO"], devPtrDescaleO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["descaledO"], devPtrDescaledO)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaleS"], devPtrScaleS)); + 
data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaledS"], devPtrScaledS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaledQ"], devPtrScaledQ)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaledK"], devPtrScaledK)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["scaledV"], devPtrScaledV)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["amaxdS"], devPtrAmaxdS)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["amaxdQ"], devPtrAmaxdQ)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["amaxdK"], devPtrAmaxdK)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["amaxdV"], devPtrAmaxdV)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["QKV_RAGGED"], devPtrQKVRaggedOffset)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["O_RAGGED"], devPtrORaggedOffset)); + data_ptrs.emplace(std::pair( + tensor_name_to_uid["MNK_OVERRIDE"], devPtrMNKOverride)); + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(data_ptrs) + .build(); + cudnnStatus_t status = cudnnBackendExecute( + handle_, plan.get_raw_desc(), variantPack.get_raw_desc()); + + cudnn_frontend::throw_if( + [status]() { return (status != CUDNN_STATUS_SUCCESS); }, + "Plan execute error", status); + } catch (cudnn_frontend::cudnnException& e) { + struct cudaDeviceProp prop; + NVTE_CHECK_CUDA(cudaGetDeviceProperties(&prop, 0)); + + // This example is only for GH100 cards (cudnn Version >= 8900) + if (!((prop.major == 9 && prop.minor == 0 && CUDNN_VERSION >= 8900)) + && (e.getCudnnStatus() == CUDNN_STATUS_ARCH_MISMATCH + || e.getCudnnStatus() == CUDNN_STATUS_NOT_SUPPORTED)) { + std::cout << "Example is only supported for GH100 (cuDNN >= 8900) GPUs" << std::endl; + } else { + std::cout << "[ERROR] Exception " << e.what() << std::endl; + } + } +} + +#endif + +} // namespace fused_attn + +#if (CUDNN_VERSION >= 8900) +// fused attention FWD FP8 with packed QKV +void fused_attn_fwd_fp8_qkvpacked( + size_t b, 
size_t max_seqlen, + size_t h, size_t d, + bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_QKV, + Tensor *input_output_S, + Tensor *output_O, + NVTETensorPack* Aux_Output_Tensors, + const Tensor *cu_seqlens, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + // QKV shape is [total_seqs, 3, h, d] + void* devPtrQKV = input_QKV->data.dptr; + void* devPtrQ = reinterpret_cast(devPtrQKV); + void* devPtrK = reinterpret_cast(reinterpret_cast(devPtrQKV) + h * d); + void* devPtrV = reinterpret_cast(reinterpret_cast(devPtrQKV) + 2 * h * d); + void* devPtrDescaleQ = input_QKV->scale_inv.dptr; + void* devPtrDescaleK = input_QKV->scale_inv.dptr; + void* devPtrDescaleV = input_QKV->scale_inv.dptr; + + void* devPtrO = output_O->data.dptr; + void* devPtrAmaxO = output_O->amax.dptr; + void* devPtrScaleO = output_O->scale.dptr; + + void* devPtrM = nullptr; + void* devPtrZInv = nullptr; + if (Aux_Output_Tensors->size == 0) { + if (is_training) { + Aux_Output_Tensors->size = 2; + Tensor *output_M = reinterpret_cast(Aux_Output_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_Output_Tensors->tensors[1]); + output_M->data.dptr = nullptr; + output_M->data.shape = {b, h, max_seqlen, 1}; + output_M->data.dtype = DType::kFloat32; + output_ZInv->data.dptr = nullptr; + output_ZInv->data.shape = {b, h, max_seqlen, 1}; + output_ZInv->data.dtype = DType::kFloat32; + } + } else if (Aux_Output_Tensors->size == 2) { + Tensor *output_M = reinterpret_cast(Aux_Output_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_Output_Tensors->tensors[1]); + devPtrM = output_M->data.dptr; + devPtrZInv = output_ZInv->data.dptr; + } + + void* devPtrAmaxS = input_output_S->amax.dptr; + void* devPtrScaleS = input_output_S->scale.dptr; + void* devPtrDescaleS = input_output_S->scale_inv.dptr; + + void* devPtrcuSeqlens = reinterpret_cast( + 
reinterpret_cast(cu_seqlens->data.dptr)); + void* devPtrDropoutSeed = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr)); + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + const DType QKV_type = input_QKV->data.dtype; + size_t workspace_size = 0; + + fused_attn::fa_fwd_fp8( + b, max_seqlen, max_seqlen, h, d, + is_training, attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleS, devPtrScaleS, devPtrScaleO, + devPtrAmaxO, devPtrAmaxS, + devPtrcuSeqlens, devPtrcuSeqlens, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = { workspace_size }; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = { 1 }; + workspace->data.dtype = DType::kByte; + return; + } +} +// fused attention BWD FP8 with packed QKV +void fused_attn_bwd_fp8_qkvpacked( + size_t b, size_t max_seqlen, + size_t h, size_t d, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_QKV, + const Tensor *input_O, + const Tensor *input_dO, + const Tensor *input_M, + const Tensor *input_ZInv, + const Tensor *input_S, + Tensor *input_output_dP, + const Tensor *output_dQKV, + const Tensor *cu_seqlens, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + // QKV shape is [total_seqs, 3, h, d] + void* devPtrQKV = input_QKV->data.dptr; + void* devPtrQ = reinterpret_cast(devPtrQKV); + void* devPtrK = reinterpret_cast(reinterpret_cast(devPtrQKV) + h * d); + void* devPtrV = reinterpret_cast(reinterpret_cast(devPtrQKV) + 2 * h * d); + void* devPtrDescaleQ = input_QKV->scale_inv.dptr; + void* devPtrDescaleK = 
input_QKV->scale_inv.dptr; + void* devPtrDescaleV = input_QKV->scale_inv.dptr; + + void* devPtrO = input_O->data.dptr; + void* devPtrDescaleO = input_O->scale_inv.dptr; + void* devPtrdO = input_dO->data.dptr; + void* devPtrDescaledO = input_dO->scale_inv.dptr; + + void* devPtrM = input_M->data.dptr; + void* devPtrZInv = input_ZInv->data.dptr; + + void* devPtrScaleS = input_S->scale.dptr; + void* devPtrDescaleS = input_S->scale_inv.dptr; + void* devPtrAmaxdS = input_output_dP->amax.dptr; + void* devPtrScaledS = input_output_dP->scale.dptr; + void* devPtrDescaledS = input_output_dP->scale_inv.dptr; + + // dQKV shape is [total_seqs, 3, h, d] + void* devPtrdQKV = output_dQKV->data.dptr; + void* devPtrdQ = reinterpret_cast(devPtrdQKV); + void* devPtrdK = reinterpret_cast(reinterpret_cast(devPtrdQKV) + h * d); + void* devPtrdV = reinterpret_cast(reinterpret_cast(devPtrdQKV) + 2 * h * d); + void* devPtrAmaxdQ = output_dQKV->amax.dptr; + void* devPtrAmaxdK = output_dQKV->amax.dptr; + void* devPtrAmaxdV = output_dQKV->amax.dptr; + void* devPtrScaledQ = output_dQKV->scale.dptr; + void* devPtrScaledK = output_dQKV->scale.dptr; + void* devPtrScaledV = output_dQKV->scale.dptr; + + void* devPtrcuSeqlens = reinterpret_cast( + reinterpret_cast(cu_seqlens->data.dptr)); + void* devPtrDropoutSeed = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr)); + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + const DType QKV_type = input_QKV->data.dtype; + size_t workspace_size = 0; + + fused_attn::fa_bwd_fp8( + b, max_seqlen, max_seqlen, h, d, + attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, devPtrdO, + devPtrdQ, devPtrdK, devPtrdV, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleO, devPtrDescaledO, + devPtrDescaleS, devPtrDescaledS, + devPtrScaleS, devPtrScaledS, + devPtrScaledQ, devPtrScaledK, devPtrScaledV, + devPtrAmaxdS, + devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, 
+ devPtrcuSeqlens, devPtrcuSeqlens, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = { workspace_size }; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = { 1 }; + workspace->data.dtype = DType::kByte; + return; + } +} +#endif // end of CUDNN>=8900 +} // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h new file mode 100644 index 0000000000..928e128737 --- /dev/null +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -0,0 +1,46 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "transformer_engine/transformer_engine.h" + +namespace transformer_engine { +#if (CUDNN_VERSION >= 8900) +// fused attention FWD FP8 with packed QKV +void fused_attn_fwd_fp8_qkvpacked( + size_t b, size_t max_seqlen, + size_t h, size_t d, + bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_QKV, + Tensor *input_output_S, + Tensor *output_O, + NVTETensorPack* Aux_Output_Tensors, + const Tensor *cu_seqlens, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle); + +// fused attention BWD FP8 with packed QKV +void fused_attn_bwd_fp8_qkvpacked( + size_t b, size_t max_seqlen, + size_t h, size_t d, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_QKV, + const Tensor *input_O, + const Tensor *input_dO, + const Tensor *input_M, + const Tensor *input_ZInv, + const Tensor *input_S, + 
Tensor *input_output_dP, + const Tensor *output_dQKV, + const Tensor *cu_seqlens, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle); +#endif // end of CUDNN>=8900 +} // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/utils.cu b/transformer_engine/common/fused_attn/utils.cu new file mode 100644 index 0000000000..5b0b03cb3e --- /dev/null +++ b/transformer_engine/common/fused_attn/utils.cu @@ -0,0 +1,167 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "transformer_engine/fused_attn.h" +#include "../common.h" +#include "utils.h" + +namespace transformer_engine { +namespace fused_attn { + +using namespace transformer_engine; + +// get matrix strides based on matrix type +void generateMatrixStrides( + int64_t b, int64_t h, + int64_t s_q, int64_t s_kv, + int64_t d, int64_t* strideA, + NVTE_QKV_Layout layout, NVTE_QKV_Matrix matrix) { + constexpr int batch_dim_idx = 0; + constexpr int head_dim_idx = 1; + constexpr int seqlen_dim_idx = 2; + constexpr int hidden_dim_idx = 3; + + constexpr int seqlen_transpose_dim_idx = 3; + constexpr int hidden_transpose_dim_idx = 2; + + constexpr int seqlen_q_dim_idx = 2; + constexpr int seqlen_kv_dim_idx = 3; + + switch (matrix) { + case NVTE_QKV_Matrix::NVTE_Q_Matrix: + if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_q * 3 * h * d; + } else { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_q * h * d; + } + break; + case NVTE_QKV_Matrix::NVTE_K_Matrix: + if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { + 
strideA[seqlen_dim_idx] = 3 * h * d; + strideA[hidden_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) { + strideA[seqlen_transpose_dim_idx] = 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else { + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose: + if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { + strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) { + strideA[seqlen_transpose_dim_idx] = 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else { + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case NVTE_QKV_Matrix::NVTE_V_Matrix: + if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = 2* h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose: + if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + 
strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = 2* h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case NVTE_QKV_Matrix::NVTE_S_Matrix: + strideA[seqlen_kv_dim_idx] = 1; + strideA[seqlen_q_dim_idx] = s_kv; + strideA[head_dim_idx] = s_q * s_kv; + strideA[batch_dim_idx] = h * s_q * s_kv; + break; + case NVTE_QKV_Matrix::NVTE_O_Matrix: + strideA[seqlen_kv_dim_idx] = 1; + strideA[seqlen_q_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_q * h * d; + break; + } +} + +// convert cu_seqlens_q to qkv/o_ragged_offset and actual_seqlens_q +__global__ void cu_seqlens_to_offsets(size_t b, size_t h, size_t d, + int32_t *cu_seqlens_q, int32_t *actual_seqlens_q, + int32_t *qkv_ragged_offset, int32_t *o_ragged_offset) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < b) { + actual_seqlens_q[tid] = cu_seqlens_q[tid + 1] - cu_seqlens_q[tid]; + } + if (tid < b + 1) { + qkv_ragged_offset[tid] = cu_seqlens_q[tid] * 3 * h * d; + o_ragged_offset[tid] = cu_seqlens_q[tid] * h * d; + } +} +} // namespace fused_attn + +// get cuDNN data type +cudnnDataType_t get_cudnn_dtype(const transformer_engine::DType t) { + using namespace transformer_engine; + switch (t) { + case DType::kFloat16: + return CUDNN_DATA_HALF; + case DType::kFloat32: + return CUDNN_DATA_FLOAT; + case DType::kBFloat16: + return CUDNN_DATA_BFLOAT16; + case DType::kFloat8E4M3: + return CUDNN_DATA_FP8_E4M3; + case DType::kFloat8E5M2: + return CUDNN_DATA_FP8_E5M2; + default: + NVTE_ERROR("Invalid cuDNN data type. 
\n"); + } +} +} // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/utils.h b/transformer_engine/common/fused_attn/utils.h new file mode 100644 index 0000000000..371a19990e --- /dev/null +++ b/transformer_engine/common/fused_attn/utils.h @@ -0,0 +1,90 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#ifndef TRANSFORMER_ENGINE_FUSED_ATTN_UTILS_H_ +#define TRANSFORMER_ENGINE_FUSED_ATTN_UTILS_H_ + +#include "transformer_engine/transformer_engine.h" +#include + +namespace transformer_engine { +namespace fused_attn { + +using namespace transformer_engine; + +enum NVTE_QKV_Matrix { + NVTE_Q_Matrix = 0, // queries + NVTE_K_Matrix = 1, // keys + NVTE_K_Matrix_Transpose = 2, // keys transposed + NVTE_V_Matrix = 3, // values + NVTE_V_Matrix_Transpose = 4, // value matrix transposed + NVTE_S_Matrix = 5, // output of GEMM1 + NVTE_O_Matrix = 6, // final output +}; + +void generateMatrixStrides( + int64_t b, int64_t h, + int64_t s_q, int64_t s_kv, + int64_t d, int64_t* strideA, + NVTE_QKV_Layout layout, NVTE_QKV_Matrix matrix); + +struct FADescriptor { + std::int64_t b; + std::int64_t h; + std::int64_t s_q; + std::int64_t s_kv; + std::int64_t d; + float attnScale; + bool isTraining; + float dropoutProbability; + NVTE_QKV_Layout layout; + cudnnDataType_t tensor_type; + + bool operator<(const FADescriptor &rhs) const { + return std::tie(b, h, s_q, s_kv, d, + attnScale, isTraining, dropoutProbability, + layout, tensor_type) < std::tie( + rhs.b, rhs.h, rhs.s_q, rhs.s_kv, rhs.d, + rhs.attnScale, rhs.isTraining, + rhs.dropoutProbability, rhs.layout, rhs.tensor_type); + } +}; + +__global__ void cu_seqlens_to_offsets(size_t b, size_t h, size_t d, + int32_t *cu_seqlens_q, int32_t *actual_seqlens_q, + int32_t 
*qkv_ragged_offset, int32_t *o_ragged_offset); + +} // namespace fused_attn + +cudnnDataType_t get_cudnn_dtype(const transformer_engine::DType t); + +class cudnnExecutionPlanManager { + public: + static cudnnExecutionPlanManager &Instance() { + static thread_local cudnnExecutionPlanManager instance; + return instance; + } + + cudnnHandle_t GetCudnnHandle() { + static thread_local std::once_flag flag; + std::call_once(flag, [&] { cudnnCreate(&handle_); }); + return handle_; + } + + ~cudnnExecutionPlanManager() { + static thread_local std::once_flag flag; + std::call_once(flag, [&] { + if (handle_ != nullptr) { + cudnnDestroy(handle_); + }}); + } + + private: + cudnnHandle_t handle_ = nullptr; +}; +} // namespace transformer_engine + +#endif diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h new file mode 100644 index 0000000000..bb9262de18 --- /dev/null +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -0,0 +1,262 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#ifndef TRANSFORMER_ENGINE_FUSED_ATTN_FP8_H_ +#define TRANSFORMER_ENGINE_FUSED_ATTN_FP8_H_ + +#include "transformer_engine.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum NVTE_QKV_Layout { +/*!< separate Q, K, V tensors: + Q: [total_seqs_q, num_heads, head_dim] + | Q Q Q ... Q + | \___________ _____________/ + total_seqs_q <| \/ + | num_heads * head_dim + K: [total_seqs_kv, num_heads, head_dim] + | K K K ... K + | \___________ _____________/ + total_seqs_kv <| \/ + | num_heads * head_dim + V: [total_seqs_kv, num_heads, head_dim] + | V V V ... 
V + | \___________ _____________/ + total_seqs_kv <| \/ + | num_heads * head_dim + */ + NVTE_NOT_INTERLEAVED = 0, + +/*!< packed QKV tensor: + QKV: [total_seqs, 3, num_heads, head_dim] + | Q Q Q ... Q K K K ... K V V V ... V + | \___________ _____________/ + total_seqs <| \/ + | num_heads * head_dim + */ + NVTE_QKV_INTERLEAVED = 1, + +/*!< Q and packed KV tensor: + Q: [total_seqs_q, num_heads, head_dim] + | Q Q Q ... Q + | \___________ _____________/ + total_seqs_q <| \/ + | num_heads * head_dim + KV: [total_seqs_kv, 2, num_heads, head_dim] + | K K K ... K V V V ... V + | \___________ _____________/ + total_seqs_kv <| \/ + | num_heads * head_dim + */ + NVTE_KV_INTERLEAVED = 2 +}; + +enum NVTE_Bias_Type { + NVTE_NO_BIAS = 0, /*!< no bias */ + NVTE_PRE_SCALE_BIAS = 1, /*!< bias before scale */ + NVTE_POST_SCALE_BIAS = 2 /*!< bias after scale */ +}; + +enum NVTE_Mask_Type { + NVTE_PADDING_MASK = 0, /*!< padding attention mask */ + NVTE_CAUSAL_MASK = 1, /*!< causal attention mask */ + NVTE_NO_MASK = 2 /*!< no masking */ +}; + +/*! \brief Compute dot product attention with packed QKV input. + * + * Computes: + * - P = Q * K.T + Bias + * - S = ScaleMaskSoftmax(P) + * - D = Dropout(S) + * - O = D * V.T + * + * Support Matrix: + * | precision | qkv layout | bias | mask | sequence length | head_dim | + * | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | <= 512 | 64 | + * + * + * \param[in] QKV The QKV tensor in packed format, + * [total_seqs, 3, num_heads, head_dim]. + * \param[in] Bias The Bias tensor. + * \param[in,out] S The S tensor. + * \param[out] O The output O tensor. + * \param[out] Aux_Output_Tensors Auxiliary output tensors when training, e.g. M, ZInv. + * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen Max sequence length used for computing, + * it may be >= max(cu_seqlens). 
+ * \param[in] is_training Whether this is in training mode or inference. + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. + */ +void nvte_fused_attn_fwd_qkvpacked( + const NVTETensor QKV, + const NVTETensor Bias, + NVTETensor S, + NVTETensor O, + NVTETensorPack* Aux_Output_Tensors, + const NVTETensor cu_seqlens, + const NVTETensor rng_state, + size_t max_seqlen, + bool is_training, float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream); + +/*! \brief Compute the backward of the dot product attention with packed QKV input. + * + * Support Matrix: + * | precision | qkv layout | bias | mask | sequence length | head_dim | + * | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | <= 512 | 64 | + * + * + * \param[in] QKV The QKV tensor in packed format, + * [total_seqs, 3, num_heads, head_dim]. + * \param[in] dBias The gradient of the Bias tensor. + * \param[in] O The O tensor from forward. + * \param[in] dO The gradient of the O tensor. + * \param[in] S The S tensor. + * \param[in,out] dP The gradient of the P tensor. + * \param[in] Aux_CTX_Tensors Auxiliary tensors from forward when in training mode. + * \param[out] dQKV The gradient of the QKV tensor. + * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen Max sequence length used for computing, + * it may be >= max(cu_seqlens). + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. 
+ * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. + */ +void nvte_fused_attn_bwd_qkvpacked( + const NVTETensor QKV, + const NVTETensor dBias, + const NVTETensor O, + const NVTETensor dO, + const NVTETensor S, + NVTETensor dP, + const NVTETensorPack* Aux_CTX_Tensors, + NVTETensor dQKV, + const NVTETensor cu_seqlens, + size_t max_seqlen, + float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream); + +/*! \brief Compute dot product attention with packed KV input. + * + * Computes: + * - P = Q * K.T + Bias + * - S = ScaleMaskSoftmax(P) + * - D = Dropout(S) + * - O = D * V.T + * + * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. + * \param[in] KV The KV tensor, [total_seqs_kv, 2, num_heads, head_dim]. + * \param[in] Bias The Bias tensor. + * \param[in,out] S The S tensor. + * \param[out] O The output O tensor. + * \param[out] Aux_Output_Tensors Auxiliary output tensors when training, e.g. M, ZInv. + * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. + * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(cu_seqlens_q). + * \param[in] max_seqlen_kv Max sequence length used for computing for KV. + * it may be >= max(cu_seqlens_kv). + * \param[in] is_training Whether this is in training mode or inference. + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. 
+ * \param[in] stream CUDA stream used for this operation. + */ +void nvte_fused_attn_fwd_kvpacked( + const NVTETensor Q, + const NVTETensor KV, + const NVTETensor Bias, + NVTETensor S, + NVTETensor O, + NVTETensorPack* Aux_Output_Tensors, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + const NVTETensor rng_state, + size_t max_seqlen_q, size_t max_seqlen_kv, + bool is_training, float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream); + +/*! \brief Compute the backward of the dot product attention with packed KV input. + * + * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. + * \param[in] KV The KV tensor, [total_seqs_kv, 2, num_heads, head_dim]. + * \param[in] dBias The gradient of the Bias tensor. + * \param[in] O The O tensor from forward. + * \param[in] dO The gradient of the O tensor. + * \param[in] S The S tensor. + * \param[in,out] dP The gradient of the P tensor. + * \param[in] Aux_CTX_Tensors Auxiliary tensors from forward when in training mode. + * \param[out] dQ The gradient of the Q tensor. + * \param[out] dKV The gradient of the KV tensor. + * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. + * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(cu_seqlens_q). + * \param[in] max_seqlen_kv Max sequence length used for computing for KV. + * it may be >= max(cu_seqlens_kv). + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. 
+ * \param[in] stream CUDA stream used for this operation. + */ +void nvte_fused_attn_bwd_kvpacked( + const NVTETensor Q, + const NVTETensor KV, + const NVTETensor dBias, + const NVTETensor O, + const NVTETensor dO, + const NVTETensor S, + NVTETensor dP, + const NVTETensorPack* Aux_CTX_Tensors, + NVTETensor dQ, + NVTETensor dKV, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + size_t max_seqlen_q, size_t max_seqlen_kv, + float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/transformer_engine/common/include/transformer_engine/logging.h b/transformer_engine/common/include/transformer_engine/logging.h index 36fd614f59..d488274579 100644 --- a/transformer_engine/common/include/transformer_engine/logging.h +++ b/transformer_engine/common/include/transformer_engine/logging.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -39,10 +40,18 @@ inline void check_cublas_(cublasStatus_t status) { } } +inline void check_cudnn_(cudnnStatus_t status) { + if ( status != CUDNN_STATUS_SUCCESS ) { + NVTE_ERROR("CUDNN Error: " + std::string(cudnnGetErrorString(status))); + } +} + } // namespace #define NVTE_CHECK_CUDA(ans) { check_cuda_(ans); } #define NVTE_CHECK_CUBLAS(ans) { check_cublas_(ans); } +#define NVTE_CHECK_CUDNN(ans) { check_cudnn_(ans); } + #endif // TRANSFORMER_ENGINE_LOGGING_H_ diff --git a/transformer_engine/common/include/transformer_engine/transformer_engine.h b/transformer_engine/common/include/transformer_engine/transformer_engine.h index 0f17a4926a..72383c36bc 100644 --- a/transformer_engine/common/include/transformer_engine/transformer_engine.h +++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h @@ -24,11 +24,12 @@ extern "C" { enum NVTEDType { kNVTEByte = 0, /*!< Byte */ kNVTEInt32 = 1, /*!< 32-bit integer */ - 
kNVTEFloat32 = 2, /*!< 32-bit float */ - kNVTEFloat16 = 3, /*!< 16-bit float (E5M10) */ - kNVTEBFloat16 = 4, /*!< 16-bit bfloat (E8M7) */ - kNVTEFloat8E4M3 = 5, /*!< 8-bit float (E4M3) */ - kNVTEFloat8E5M2 = 6, /*!< 8-bit float (E5M2) */ + kNVTEInt64 = 2, /*!< 64-bit integer */ + kNVTEFloat32 = 3, /*!< 32-bit float */ + kNVTEFloat16 = 4, /*!< 16-bit float (E5M10) */ + kNVTEBFloat16 = 5, /*!< 16-bit bfloat (E8M7) */ + kNVTEFloat8E4M3 = 6, /*!< 8-bit float (E4M3) */ + kNVTEFloat8E5M2 = 7, /*!< 8-bit float (E5M2) */ kNVTENumTypes /*!< Number of supported types */ }; @@ -129,6 +130,19 @@ float *nvte_tensor_scale(const NVTETensor tensor); */ float *nvte_tensor_scale_inv(const NVTETensor tensor); +struct NVTETensorPack { + static const int MAX_SIZE = 10; /*!< we expect <10 matrices in auxiliary outputs */ + NVTETensor tensors[MAX_SIZE]; /*!< wrappers to tensors, do not hold memory */ + size_t size = 0; /*!< actual size of the tensor pack, 0 <= size <= MAX_SIZE */ +}; + +/*! \brief Create NVTETensors in NVTETensorPack. + */ +void nvte_tensor_pack_create(NVTETensorPack* pack); + +/*! \brief Destroy NVTETensors in NVTETensorPack. 
+ */ +void nvte_tensor_pack_destroy(NVTETensorPack* pack); #ifdef __cplusplus } // extern "C" @@ -146,11 +160,12 @@ namespace transformer_engine { enum class DType { kByte = 0, kInt32 = 1, - kFloat32 = 2, - kFloat16 = 3, - kBFloat16 = 4, - kFloat8E4M3 = 5, - kFloat8E5M2 = 6, + kInt64 = 2, + kFloat32 = 3, + kFloat16 = 4, + kBFloat16 = 5, + kFloat8E4M3 = 6, + kFloat8E5M2 = 7, kNumTypes }; diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index 679d1e93c4..708712ff9a 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -133,3 +133,16 @@ float *nvte_tensor_scale_inv(const NVTETensor tensor) { "Tensor's inverse of scale must have Float32 type!"); return reinterpret_cast(t.scale_inv.dptr); } + +void nvte_tensor_pack_create(NVTETensorPack* pack) { + for (int i = 0; i < pack->MAX_SIZE; i++) { + pack->tensors[i] = reinterpret_cast(new transformer_engine::Tensor); + } +} + +void nvte_tensor_pack_destroy(NVTETensorPack* pack) { + for (int i = 0; i < pack->MAX_SIZE; i++) { + auto *t = reinterpret_cast(pack->tensors[i]); + delete t; + } +} diff --git a/transformer_engine/pytorch/constants.py b/transformer_engine/pytorch/constants.py index 271c70fcab..cc8b063245 100644 --- a/transformer_engine/pytorch/constants.py +++ b/transformer_engine/pytorch/constants.py @@ -14,7 +14,7 @@ with enum in transformer_engine.h """ TE_DType = { - torch.int8: tex.DType.kByte, + torch.uint8: tex.DType.kByte, torch.int32: tex.DType.kInt32, torch.float32: tex.DType.kFloat32, torch.half: tex.DType.kFloat16, diff --git a/transformer_engine/pytorch/cpp_extensions.py b/transformer_engine/pytorch/cpp_extensions.py index fae64445f0..1353f1513e 100644 --- a/transformer_engine/pytorch/cpp_extensions.py +++ b/transformer_engine/pytorch/cpp_extensions.py @@ -3,11 +3,735 @@ # See LICENSE for license information. 
"""TE FP8 extensions and GEMMs""" -from typing import Optional, Tuple, Union +import math +from typing import Optional, Tuple, List, Union import torch import transformer_engine_extensions as tex from .constants import TE_DType +TORCH_DType = { + tex.DType.kFloat8E4M3: torch.uint8, + tex.DType.kFloat8E5M2: torch.uint8, + tex.DType.kFloat16: torch.half, + tex.DType.kBFloat16: torch.bfloat16, + tex.DType.kFloat32: torch.float32, + tex.DType.kInt32: torch.int32, +} + +def check_tensor(x: torch.Tensor): + """Check tensor properties.""" + assert (x.is_cuda and x.is_contiguous() + ), "Tensor should be a GPU tensor and contiguous." + +def check_qkv(qkv: torch.Tensor, dtype: torch.dtype): + """Check tensor properties.""" + check_tensor(qkv) + assert (qkv.dtype is dtype + and qkv.dim() == 4 + and qkv.shape[1] == 3 + ), """QKV should be in [total_seqs, 3, num_heads, head_dim] shape + and {dtype} dtype.""" + +def check_q(q: torch.Tensor, dtype: torch.dtype): + """Check tensor properties.""" + check_tensor(q) + assert (q.dtype is dtype + and q.dim() == 3 + ), """Q should be in [total_seqs, num_heads, head_dim] shape + and {dtype} dtype.""" + +def check_kv(kv: torch.Tensor, dtype: torch.dtype): + """Check tensor properties.""" + check_tensor(kv) + assert (kv.dtype is dtype + and kv.dim() == 4 + and kv.shape[1] == 2 + ), """KV should be in [total_seqs, 2, num_heads, head_dim] shape + and {dtype} dtype.""" + +def check_o(o: torch.Tensor, dtype: torch.dtype): + """Check tensor properties.""" + check_tensor(o) + assert (o.dtype is dtype + and o.dim() == 3 + ), """O and dO should be in [total_seqs, num_heads, head_dim] shape + and {dtype} dtype.""" + +def check_stats(stats: torch.Tensor, b: int, h: int, s: int): + """Check tensor properties.""" + check_tensor(stats) + assert (stats.dtype is torch.float32 + and stats.dim() == 4 + and stats.shape == torch.Size([b, h, s, 1]) + ), """M and ZInv should be in [batch_size, num_heads, max_seqlen_q, 1] + shape and float32 dtype.""" + +def 
check_cu_seqlens(cu_seqlens: torch.Tensor): + """Check tensor properties.""" + check_tensor(cu_seqlens) + assert (cu_seqlens.dtype is torch.int32 + and cu_seqlens.dim() == 1 + ), """cu_seqlens should be in [batch_size +1] shape and int32 dtype.""" + +def check_scalar(scalar: torch.Tensor): + """Check tensor properties.""" + check_tensor(scalar) + assert (scalar.dtype is torch.float32 + and scalar.dim() <= 1 + and scalar.numel() == 1 + ), "amax/scale/descale tensors should be scalars in float32 dtype." + +def check_rng_state(rng_state: torch.Tensor): + """Check tensor properties.""" + check_tensor(rng_state) + assert (rng_state.dtype is torch.int64 + and rng_state.numel() == 2 + ), "rng_state should be [seed, offset] and in int64 dtype." + +def fused_attn_fwd_qkvpacked( + is_training: bool, + max_seqlen: int, + cu_seqlens: torch.Tensor, + qkv: torch.Tensor, + qkv_dtype: tex.DType, + bias: torch.Tensor = None, + d_scale_qkv: torch.Tensor = None, + q_scale_s: torch.Tensor = None, + q_scale_o: torch.Tensor = None, + amax_s: torch.Tensor = None, + amax_o: torch.Tensor = None, + attn_scale: float = None, + dropout: float = 0.0, + set_zero: bool = True, + qkv_layout: str = "qkv_interleaved", + bias_type: str = "no_bias", + attn_mask_type: str = "padding", + rng_gen: torch.Generator = None, +) -> Tuple[Union[torch.Tensor, None], ...]: + """Fused Attention FWD for packed QKV input. 
+ + Parameters + ---------- + is_training: bool + if True, runs training and produces auxiliary tensors aux_ctx_tensors + for the backward; if False, runs inference and doesn't produce aux_ctx_tensors + max_seqlen: int + max sequence length for QKV, used for padding; may be larger than max(cu_seqlens) + cu_seqlens: torch.Tensor + accumulative sequence lengths for QKV; shape [batch_size + 1] + qkv: torch.Tensor + input tensor QKV; + shape [total_seqs, 3, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + qkv_dtype: tex.DType + data type of QKV; in tex.DType, not torch.dtype + bias: torch.Tensor, default = None + input tensor Bias; + shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + d_scale_qkv: torch.Tensor, default = None + input tensor for the dequantization of QKV in FP8 computations + q_scale_s: torch.Tensor, default = None + input tensor for the quantization of S in FP8 computations, S = Softmax(Q * K.T) + q_scale_o: torch.Tensor, default = None + input tensor for the quantization of O in FP8 computations + amax_s: torch.Tensor, default = None + output tensor, amax of S, used by the next iteration in FP8 computations + amax_o: torch.Tensor, default = None + output tensor, amax of O, used by the next iteration in FP8 computations + attn_scale: float, default = None + if not None, use attn_scale as the attention scale for Q*K.T BMM; + if None, use 1.0/sqrt(head_dim) as the default + dropout: float, default = 0.0 + dropout probability, 0.0 means no dropout, 1.0 means no output; + dropout must be 0.0 if is_training is False + set_zero: bool, default = True + if True, initializes the output tensor O to zero using the mha_fill method; + if False, doesn't initialize O after its allocation + qkv_layout: str, default = "qkv_interleaved" + layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} + bias_type: str, default = "no_bias" + type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} + attn_mask_type: str, 
default = "padding" + type of the attention mask; {"padding", "causal", "no_mask"} + rng_gen: torch.Generator, default = None + random number generator; + if None, uses the default CUDA generator from PyTorch; otherwise, uses rng_gen + + Returns + ---------- + o: torch.Tensor + output tensor O, of the attention calculation; same data type as QKV; + shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + aux_ctx_tensors: List[torch.Tensor] + auxiliary output tensors used for the backward; + if is_training is True, aux_ctx_tensors = [M, ZInv, rng_state] + if is_training is False, aux_ctx_tensors = [rng_state] + M: torch.Tensor + max(Q*K.T) + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + ZInv: torch.Tensor + 1/sum(e^(x - max(x))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + rng_state: torch.Tensor + state of the random number generator; + [seed, offset], dtype uint64 + """ + + check_cu_seqlens(cu_seqlens) + b = cu_seqlens.numel() - 1 + qkv_type = TORCH_DType[qkv_dtype] + check_qkv(qkv, qkv_type) + + total_seqs = qkv.size(0) + h = qkv.size(2) + d = qkv.size(3) + + if attn_scale is None: + attn_scale = 1.0 / math.sqrt(d) + + # FP8 fused attention API + if (qkv_type is torch.uint8) and (max_seqlen <= 512) and (d == 64): + assert (qkv_layout == "qkv_interleaved" + and bias_type == "no_bias" + and attn_mask_type == "padding" + ), """The FP8 fused attention API currently only supports qkv_interleaved layout, + no_bias type, and padding attention mask type.""" + assert (d_scale_qkv is not None), "d_scale_qkv is required for the FP8 API." + assert (q_scale_s is not None), "q_scale_s is required for the FP8 API." + assert (q_scale_o is not None), "q_scale_o is required for the FP8 API." + assert (amax_s is not None), "amax_s is required for the FP8 API." + assert (amax_o is not None), "amax_o is required for the FP8 API." 
+ check_scalar(d_scale_qkv) + check_scalar(q_scale_s) + check_scalar(q_scale_o) + check_scalar(amax_s) + check_scalar(amax_o) + + # BF16/FP16 fused attention API from fmha_v2 + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen > 512): + # add BF/FP16 support for >512 sequence length + assert False, "The BF16/FP16 support for >512 sequence length is coming!" + + # BF16/FP16 fused attention API from fmha_v1 apex + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen <= 512): + # add BF/FP16 support for <=512 sequence length + assert False, "The BF16/FP16 support for <=512 sequence length is coming!" + + else: + assert False, "No support for this dtype and max_seqlen combination." + + # execute kernel + output_tensors = tex.fused_attn_fwd_qkvpacked( + b, max_seqlen, total_seqs, h, d, + is_training, attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, + cu_seqlens, + qkv, + qkv_dtype, + d_scale_qkv, + q_scale_s, + q_scale_o, + amax_s, + amax_o, + bias, + rng_gen, + ) + + return output_tensors[0], output_tensors[1:] + + +def fused_attn_bwd_qkvpacked( + max_seqlen: int, + cu_seqlens: torch.Tensor, + qkv: torch.Tensor, + o: torch.Tensor, + d_o: torch.Tensor, + qkv_dtype: tex.DType, + aux_ctx_tensors: List[torch.Tensor] = None, + d_bias: torch.Tensor = None, + d_scale_qkv: torch.Tensor = None, + d_scale_s: torch.Tensor = None, + d_scale_o: torch.Tensor = None, + d_scale_do: torch.Tensor = None, + q_scale_s: torch.Tensor = None, + q_scale_dp: torch.Tensor = None, + q_scale_dqkv: torch.Tensor = None, + amax_dp: torch.Tensor = None, + amax_dqkv: torch.Tensor = None, + attn_scale: float = None, + dropout: float = 0.0, + set_zero: bool = True, + qkv_layout: str = "qkv_interleaved", + bias_type: str = "no_bias", + attn_mask_type: str = "padding", +) -> Tuple[Union[torch.Tensor, None], ...]: + """Fused Attention BWD for packed QKV input. 
+ + Parameters + ---------- + max_seqlen: int + max sequence length for QKV, used for padding; may be larger than max(cu_seqlens_q) + cu_seqlens: torch.Tensor + accumulative sequence lengths for QKV; shape [batch_size + 1] + qkv: torch.Tensor + input tensor QKV; + shape [total_seqs, 3, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + o: torch.Tensor + input tensor O (output of forward); + shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + d_o: torch.Tensor + input tensor dO (gradient of O); + shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + qkv_dtype: tex.DType + data type of QKV; in tex.DType, not torch.dtype + aux_ctx_tensors: List[torch.Tensor] + auxiliary output tensors of the forward pass when its is_training is True, + e.g. aux_ctx_tensors = [M, ZInv, rng_state] + d_bias: torch.Tensor, default = None + input tensor Bias; + shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + d_scale_qkv: torch.Tensor, default = None + input tensor for the dequantization of QKV in FP8 computations + d_scale_s: torch.Tensor, default = None + input tensor for the dequantization of S in FP8 computations, S = Softmax(Q * K.T) + d_scale_o: torch.Tensor, default = None + input tensor for the dequantization of O in FP8 computations + d_scale_do: torch.Tensor, default = None + input tensor for the dequantization of dO in FP8 computations + q_scale_s: torch.Tensor, default = None + input tensor for the quantization of S in FP8 computations + q_scale_dp: torch.Tensor, default = None + input tensor for the quantization of dP in FP8 computations, P = Q * K.T + q_scale_dqkv: torch.Tensor, default = None + input tensor for the quantization of dQKV in FP8 computations + amax_dp: torch.Tensor, default = None + output tensor, amax of dP, used by the next iteration in FP8 computations + amax_dqkv: torch.Tensor, default = None + output tensor, amax of dQKV, used by the next iteration in FP8 computations + 
attn_scale: float, default = None + if not None, use attn_scale as the attention scale for Q*K.T BMM; + if None, use 1.0/sqrt(head_dim) as the default + dropout: float, default = 0.0 + dropout probability, 0.0 means no dropout, 1.0 means no output; + dropout must be 0.0 if is_training is False + set_zero: bool, default = True + if True, initializes the output tensor O to zero using the mha_fill method; + if False, doesn't initialize O after its allocation + qkv_layout: str, default = "qkv_interleaved" + layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} + bias_type: str, default = "no_bias" + type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} + attn_mask_type: str, default = "padding" + type of the attention mask; {"padding", "causal", "no_mask"} + + Returns + ---------- + d_qkv: torch.Tensor + gradient tensor of QKV; same data type and shape as QKV + """ + + check_cu_seqlens(cu_seqlens) + b = cu_seqlens.numel() - 1 + qkv_type = TORCH_DType[qkv_dtype] + check_qkv(qkv, qkv_type) + check_o(o, qkv_type) + check_o(d_o, qkv_type) + + total_seqs = qkv.size(0) + h = qkv.size(2) + d = qkv.size(3) + + if attn_scale is None: + attn_scale = 1.0 / math.sqrt(d) + + assert (len(aux_ctx_tensors) >= 1 + ), "aux_ctx_tensors must contain rng_state as its last element." + rng_state = aux_ctx_tensors[-1] + check_rng_state(rng_state) + + # FP8 fused attention API + if (qkv_type is torch.uint8) and (max_seqlen <= 512) and d == 64: + assert (qkv_layout == "qkv_interleaved" + and bias_type == "no_bias" + and attn_mask_type == "padding" + ), """The FP8 fused attention API currently only supports qkv_interleaved layout, + no_bias type, and padding attention mask type.""" + assert (d_scale_qkv is not None), "d_scale_qkv is required for the FP8 API." + assert (d_scale_s is not None), "d_scale_s is required for the FP8 API." + assert (d_scale_o is not None), "d_scale_o is required for the FP8 API." 
+ assert (d_scale_do is not None), "d_scale_do is required for the FP8 API." + assert (q_scale_s is not None), "q_scale_s is required for the FP8 API." + assert (q_scale_dp is not None), "q_scale_dp is required for the FP8 API." + assert (q_scale_dqkv is not None), "q_scale_dqkv is required for the FP8 API." + assert (amax_dp is not None), "amax_dp is required for the FP8 API." + assert (amax_dqkv is not None), "amax_dqkv is required for the FP8 API." + assert (len(aux_ctx_tensors) == 3 + ), "aux_ctx_tensors is required to be [M, ZInv, rng_state] for the FP8 API." + check_scalar(d_scale_qkv) + check_scalar(d_scale_s) + check_scalar(d_scale_o) + check_scalar(d_scale_do) + check_scalar(q_scale_s) + check_scalar(q_scale_dp) + check_scalar(q_scale_dqkv) + check_scalar(amax_dp) + check_scalar(amax_dqkv) + m, z_inv = aux_ctx_tensors[:2] + check_stats(m, b, h, max_seqlen) + check_stats(z_inv, b, h, max_seqlen) + + # BF16/FP16 fused attention API from fmha_v2 + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen > 512): + # add BF/FP16 support for >512 sequence length + assert False, "The BF16/FP16 support for >512 sequence length is coming!" + + # BF16/FP16 fused attention API from fmha_v1 apex + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen <= 512): + # add BF/FP16 support for <=512 sequence length + assert False, "The BF16/FP16 support for <=512 sequence length is coming!" + + else: + assert False, "No support for this dtype and max_seqlen combination." 
+ + # execute kernel + output_tensors = tex.fused_attn_bwd_qkvpacked( + b, max_seqlen, total_seqs, h, d, + attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, + cu_seqlens, + qkv, o, d_o, + qkv_dtype, + aux_ctx_tensors, + d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, + q_scale_s, q_scale_dp, q_scale_dqkv, + amax_dp, amax_dqkv, + d_bias, + ) + + return output_tensors[0] + + +def fused_attn_fwd_kvpacked( + is_training: bool, + max_seqlen_q: int, + max_seqlen_kv: int, + cu_seqlens_q: torch.Tensor, + cu_seqlens_kv: torch.Tensor, + q: torch.Tensor, + kv: torch.Tensor, + qkv_dtype: tex.DType, + bias: torch.Tensor = None, + d_scale_qkv: torch.Tensor = None, + q_scale_s: torch.Tensor = None, + q_scale_o: torch.Tensor = None, + amax_s: torch.Tensor = None, + amax_o: torch.Tensor = None, + attn_scale: float = None, + dropout: float = 0.0, + set_zero: bool = True, + qkv_layout: str = "qkv_interleaved", + bias_type: str = "no_bias", + attn_mask_type: str = "padding", + rng_gen: torch.Generator = None, +) -> Tuple[Union[torch.Tensor, None], ...]: + """Fused Attention FWD for packed KV input. 
+ + Parameters + ---------- + is_training: bool + if True, runs training and produces auxiliary tensors aux_ctx_tensors + for the backward; if False, runs inference and doesn't produce aux_ctx_tensors + max_seqlen_q: int + max sequence length for Q, used for padding; may be larger than max(cu_seqlens_q) + max_seqlen_kv: int + max sequence length for KV, used for padding; may be larger than max(cu_seqlens_kv) + cu_seqlens_q: torch.Tensor + accumulative sequence lengths for Q; shape [batch_size + 1] + cu_seqlens_kv: torch.Tensor + accumulative sequence lengths for KV; shape [batch_size + 1] + q: torch.Tensor + input tensor Q; + shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] + kv: torch.Tensor + packed input tensor KV; + shape [total_seqs_kv, 2, num_heads, head_dim], + where total_seqs_kv = cu_seqlens_kv[-1] + qkv_dtype: tex.DType + data type of QKV; in tex.DType, not torch.dtype + bias: torch.Tensor, default = None + input tensor Bias; + shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] + d_scale_qkv: torch.Tensor, default = None + input tensor for the dequantization of QKV in FP8 computations + q_scale_s: torch.Tensor, default = None + input tensor for the quantization of S in FP8 computations, S = Softmax(Q * K.T) + q_scale_o: torch.Tensor, default = None + input tensor for the quantization of O in FP8 computations + amax_s: torch.Tensor, default = None + output tensor, amax of S, used by the next iteration in FP8 computations + amax_o: torch.Tensor, default = None + output tensor, amax of O, used by the next iteration in FP8 computations + attn_scale: float, default = None + if not None, use attn_scale as the attention scale for Q*K.T BMM; + if None, use 1.0/sqrt(head_dim) as the default + dropout: float, default = 0.0 + dropout probability, 0.0 means no dropout, 1.0 means no output; + dropout must be 0.0 if is_training is False + set_zero: bool, default = True + if True, initializes the output 
tensor O to zero using the mha_fill method; + if False, doesn't initialize O after its allocation + qkv_layout: str, default = "qkv_interleaved" + layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} + bias_type: str, default = "no_bias" + type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} + attn_mask_type: str, default = "padding" + type of the attention mask; {"padding", "causal", "no_mask"} + rng_gen: torch.Generator, default = None + random number generator; + if None, uses the default CUDA generator from PyTorch; otherwise, uses rng_gen + + Returns + ---------- + o: torch.Tensor + output tensor O, of the attention calculation; same data type as QKV; + shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] + aux_ctx_tensors: List[torch.Tensor] + auxiliary output tensors used for the backward; + if is_training is True, aux_ctx_tensors = [M, ZInv, rng_state] + if is_training is False, aux_ctx_tensors = [rng_state] + M: torch.Tensor + max(Q*K.T) + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + ZInv: torch.Tensor + 1/sum(e^(x - max(x))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + rng_state: torch.Tensor + state of the random number generator; + [seed, offset], dtype uint64 + """ + + check_cu_seqlens(cu_seqlens_q) + check_cu_seqlens(cu_seqlens_kv) + assert (cu_seqlens_q.numel() == cu_seqlens_kv.numel() + ), "cu_seqlens_q and cu_seqlens_kv must have the same length." + b = cu_seqlens_q.numel() - 1 + qkv_type = TORCH_DType[qkv_dtype] + check_q(q, qkv_type) + check_kv(kv, qkv_type) + + assert (q.size(1) == kv.size(2) + and q.size(2) == kv.size(3) + ), "Q and KV must have the same num_heads and head_dim." 
+ total_seqs_q = q.size(0) + total_seqs_kv = kv.size(0) + h = q.size(1) + d = q.size(2) + + if attn_scale is None: + attn_scale = 1.0 / math.sqrt(d) + + # FP8 fused attention API + if (qkv_type is torch.uint8) and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512) \ + and (d == 64): + assert False, "The FP8 fused attention API currently only supports packed QKV input." + + # BF16/FP16 fused attention API from fmha_v2 + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ + and (max_seqlen_q > 512) and (max_seqlen_kv > 512): + # add BF/FP16 support for >512 sequence length + assert False, "The BF16/FP16 support for >512 sequence length is coming!" + + # BF16/FP16 fused attention API from fmha_v1 apex + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ + and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512): + # add BF/FP16 support for <=512 sequence length + assert False, "The BF16/FP16 support for <=512 sequence length is coming!" + + else: + assert False, "No support for this dtype and max_seqlen combination." 
+ + # execute kernel + output_tensors = tex.fused_attn_fwd_kvpacked( + b, max_seqlen_q, max_seqlen_kv, total_seqs_q, total_seqs_kv, h, d, + is_training, attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, + cu_seqlens_q, cu_seqlens_kv, + q, kv, + qkv_dtype, + d_scale_qkv, + q_scale_s, + q_scale_o, + amax_s, + amax_o, + bias, + rng_gen, + ) + + return output_tensors[0], output_tensors[1:] + + +def fused_attn_bwd_kvpacked( + max_seqlen_q: int, + max_seqlen_kv: int, + cu_seqlens_q: torch.Tensor, + cu_seqlens_kv: torch.Tensor, + q: torch.Tensor, + kv: torch.Tensor, + o: torch.Tensor, + d_o: torch.Tensor, + qkv_dtype: tex.DType, + aux_ctx_tensors: List[torch.Tensor] = None, + d_bias: torch.Tensor = None, + d_scale_qkv: torch.Tensor = None, + d_scale_s: torch.Tensor = None, + d_scale_o: torch.Tensor = None, + d_scale_do: torch.Tensor = None, + q_scale_s: torch.Tensor = None, + q_scale_dp: torch.Tensor = None, + q_scale_dqkv: torch.Tensor = None, + amax_dp: torch.Tensor = None, + amax_dqkv: torch.Tensor = None, + attn_scale: float = None, + dropout: float = 0.0, + set_zero: bool = True, + qkv_layout: str = "qkv_interleaved", + bias_type: str = "no_bias", + attn_mask_type: str = "padding", +) -> Tuple[Union[torch.Tensor, None], ...]: + """Fused Attention BWD for packed KV input. 
+ + Parameters + ---------- + max_seqlen_q: int + max sequence length for Q, used for padding; may be larger than max(cu_seqlens_q) + max_seqlen_kv: int + max sequence length for KV, used for padding; may be larger than max(cu_seqlens_kv) + cu_seqlens_q: torch.Tensor + accumulative sequence lengths for Q; shape [batch_size + 1] + cu_seqlens_kv: torch.Tensor + accumulative sequence lengths for KV; shape [batch_size + 1] + q: torch.Tensor + input tensor Q; + shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] + kv: torch.Tensor + packed input tensor KV; + shape [total_seqs_kv, 2, num_heads, head_dim], + where total_seqs_kv = cu_seqlens_kv[-1] + o: torch.Tensor + input tensor O (output of forward); + shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] + d_o: torch.Tensor + input tensor dO (gradient of O); + shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] + qkv_dtype: tex.DType + data type of QKV; in tex.DType, not torch.dtype + aux_ctx_tensors: List[torch.Tensor] + auxiliary output tensors of the forward pass when its is_training is True, + e.g. 
aux_ctx_tensors = [M, ZInv, rng_state] + bias: torch.Tensor, default = None + input tensor Bias; + shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] + d_scale_qkv: torch.Tensor, default = None + input tensor for the dequantization of QKV in FP8 computations + d_scale_s: torch.Tensor, default = None + input tensor for the dequantization of S in FP8 computations, S = Softmax(Q * K.T) + d_scale_o: torch.Tensor, default = None + input tensor for the dequantization of O in FP8 computations + d_scale_do: torch.Tensor, default = None + input tensor for the dequantization of dO in FP8 computations + q_scale_s: torch.Tensor, default = None + input tensor for the quantization of S in FP8 computations + q_scale_dp: torch.Tensor, default = None + input tensor for the quantization of dP in FP8 computations, P = Q * K.T + q_scale_dqkv: torch.Tensor, default = None + input tensor for the quantization of dQKV in FP8 computations + amax_dp: torch.Tensor, default = None + output tensor, amax of dP, used by the next iteration in FP8 computations, + P = Q * K.T + amax_dqkv: torch.Tensor, default = None + output tensor, amax of dQKV, used by the next iteration in FP8 computations + attn_scale: float, default = None + if not None, use attn_scale as the attention scale for Q*K.T BMM; + if None, use 1.0/sqrt(head_dim) as the default + dropout: float, default = 0.0 + dropout probability, 0.0 means no dropout, 1.0 means no output; + dropout must be 0.0 if is_training is False + set_zero: bool, default = True + if True, initializes the output tensor O to zero using the mha_fill method; + if False, doesn't initialize O after its allocation + qkv_layout: str, default = "qkv_interleaved" + layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} + bias_type: str, default = "no_bias" + type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} + attn_mask_type: str, default = "padding" + type of the attention mask; {"padding", "causal", 
"no_mask"} + + Returns + ---------- + d_q: torch.Tensor + gradient tensor of Q; same data type and shape as Q + d_kv: torch.Tensor + gradient tensor of KV; same data type and shape as KV + """ + + check_cu_seqlens(cu_seqlens_q) + check_cu_seqlens(cu_seqlens_kv) + assert (cu_seqlens_q.numel() == cu_seqlens_kv.numel() + ), "cu_seqlens_q and cu_seqlens_kv must have the same length." + b = cu_seqlens_q.numel() - 1 + qkv_type = TORCH_DType[qkv_dtype] + check_q(q, qkv_type) + check_kv(kv, qkv_type) + check_o(o, qkv_type) + check_o(d_o, qkv_type) + + assert (q.size(1) == kv.size(2) + and q.size(2) == kv.size(3) + ), "Q and KV must have the same num_heads and head_dim." + total_seqs_q = q.size(0) + total_seqs_kv = q.size(0) + h = q.size(1) + d = q.size(2) + + if attn_scale is None: + attn_scale = 1.0 / math.sqrt(d) + + assert (len(aux_ctx_tensors) >= 1 + ), "aux_ctx_tensors must contain rng_state as its last element." + rng_state = aux_ctx_tensors[-1] + check_rng_state(rng_state) + + # FP8 fused attention API + if (qkv_type is torch.uint8) and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512) \ + and d == 64: + assert False, "The FP8 fused attention API currently only supports packed QKV input." + + ############### BF16/FP16 fused attention API from fmha_v2 ################ + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ + and (max_seqlen_q > 512) and (max_seqlen_kv > 512): + # add BF/FP16 support for >512 sequence length + assert False, "The BF16/FP16 support for >512 sequence length is coming!" + + ############### BF16/FP16 fused attention API from fmha_v1 apex ################ + elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ + and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512): + # add BF/FP16 support for <=512 sequence length + assert False, "The BF16/FP16 support for <=512 sequence length is coming!" + + else: + assert False, "No support for this dtype and max_seqlen combination." 
+ + # execute kernel + output_tensors = tex.fused_attn_bwd_kvpacked( + b, max_seqlen_q, max_seqlen_kv, total_seqs_q, total_seqs_kv, h, d, + attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, + cu_seqlens_q, cu_seqlens_kv, + q, kv, o, d_o, + qkv_dtype, + aux_ctx_tensors, + d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, + q_scale_s, q_scale_dp, q_scale_dqkv, + amax_dp, amax_dqkv, + d_bias, + ) + + return output_tensors def fp8_gemm( A: torch.Tensor, @@ -233,9 +957,9 @@ def fp8_cast_transpose_fused( return_outputs = False if cast_out is None or transpose_out is None: - cast_out = torch.empty_like(inp, dtype=torch.int8) + cast_out = torch.empty_like(inp, dtype=torch.uint8) transpose_out = torch.empty( - inp.shape[1], inp.shape[0], device="cuda", dtype=torch.int8 + inp.shape[1], inp.shape[0], device="cuda", dtype=torch.uint8 ) return_outputs = True diff --git a/transformer_engine/pytorch/csrc/common.cu b/transformer_engine/pytorch/csrc/common.cu index 2146118382..1d20607940 100644 --- a/transformer_engine/pytorch/csrc/common.cu +++ b/transformer_engine/pytorch/csrc/common.cu @@ -88,6 +88,19 @@ size_t product(const std::vector &shape) { } +at::Tensor allocateSpace(const std::vector& shape, + const transformer_engine::DType type, + bool init_to_zeros) { + std::vector shape_int64(shape.begin(), shape.end()); + c10::IntArrayRef ar_shape(shape_int64); + if (init_to_zeros) { + return at::zeros(ar_shape, at::CUDA(GetATenDType(type))); + } else { + return at::empty(ar_shape, at::CUDA(GetATenDType(type))); + } +} + + at::Tensor allocateSpace(const NVTEShape &shape, const transformer_engine::DType type, bool init_to_zeros) { diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index f6c9898601..1d59fc7c43 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -15,9 +15,15 @@ #include #include #include +#include #include #include #include +#include +#include +#include 
+#include +#include #include #include #include @@ -101,6 +107,12 @@ inline transformer_engine::DType GetTransformerEngineDType(at::ScalarType t) { return transformer_engine::DType::kBFloat16; case at::kBool: return transformer_engine::DType::kByte; + case torch::kByte: + return transformer_engine::DType::kByte; + case torch::kInt32: + return transformer_engine::DType::kInt32; + case torch::kInt64: + return transformer_engine::DType::kInt64; default: NVTE_ERROR("Invalid type"); } @@ -141,6 +153,9 @@ transformer_engine::TensorWrapper makeTransformerEngineTensor(at::Tensor tensor, size_t product(const std::vector &shape); +at::Tensor allocateSpace(const std::vector& shape, + const transformer_engine::DType type, + bool init_to_zeros); at::Tensor allocateSpace(const NVTEShape &shape, const transformer_engine::DType type, diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu index 23330efbf0..75d4abd031 100644 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ b/transformer_engine/pytorch/csrc/extensions.cu @@ -9,6 +9,742 @@ #include "comm_gemm_overlap.h" #endif // NVTE_WITH_USERBUFFERS +constexpr int block_size = 512; +constexpr int ctas_per_sm = 4; + +// convert QKV layout to enum +NVTE_QKV_Layout get_nvte_qkv_layout(const std::string qkv_layout) { + if (qkv_layout == "not_interleaved") { + return NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED; + } else if (qkv_layout == "qkv_interleaved") { + return NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED; + } else if (qkv_layout == "kv_interleaved") { + return NVTE_QKV_Layout::NVTE_KV_INTERLEAVED; + } else { + NVTE_ERROR("Invalid QKV layout. 
\n"); + } +} + +// convert bias type to enum +NVTE_Bias_Type get_nvte_bias_type(const std::string bias_type) { + if (bias_type == "no_bias") { + return NVTE_Bias_Type::NVTE_NO_BIAS; + } else if (bias_type == "pre_scale_bias") { + return NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS; + } else if (bias_type == "post_scale_bias") { + return NVTE_Bias_Type::NVTE_POST_SCALE_BIAS; + } else { + NVTE_ERROR("Invalid bias type. \n"); + } +} + +// convert attn mask type to enum +NVTE_Mask_Type get_nvte_mask_type(const std::string mask_type) { + if (mask_type == "padding") { + return NVTE_Mask_Type::NVTE_PADDING_MASK; + } else if (mask_type == "causal") { + return NVTE_Mask_Type::NVTE_CAUSAL_MASK; + } else if (mask_type == "no_mask") { + return NVTE_Mask_Type::NVTE_NO_MASK; + } else { + NVTE_ERROR("Invalid attention mask type. \n"); + } +} + +// fast zero-fills of tensors +template +__global__ void __launch_bounds__(block_size) mha_fill_kernel(scalar_t* out_tensor, + const int32_t* const start_row, + const size_t num_rows) { + size_t row_stride = gridDim.y * blockDim.x; + size_t row_index = blockIdx.x + static_cast(start_row[0]); + size_t col_index = blockIdx.y * blockDim.x + threadIdx.x; + while (row_index < num_rows) { + out_tensor[row_index*row_stride + col_index] = 0; + row_index += gridDim.x; + } +} + +// fast zero-fills of tensors +void mha_fill(const at::Tensor &self, const at::Tensor &start_index) { + auto max_tokens = self.size(0); + auto self_2d = self.view({max_tokens, -1}); + auto fcd_size = self_2d.size(1); + TORCH_CHECK(self.is_contiguous(), "input not contiguous"); + TORCH_CHECK(fcd_size % block_size == 0, "input size not aligned to block size"); + const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + uint64_t num_blk_y = (uint64_t)(fcd_size / block_size); + uint64_t num_blk_x = (uint64_t)((num_mp * ctas_per_sm + num_blk_y - 1) / num_blk_y); + dim3 dim_grid(num_blk_x, num_blk_y); + dim3 dim_block(block_size); + 
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + self_2d.scalar_type(), "mha_fill", [&]() { + mha_fill_kernel<<>>( + self_2d.data_ptr(), + static_cast(start_index.data_ptr()), + max_tokens); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +// extract seed and offset from PhiloxCudaState +__global__ void unpack(at::PhiloxCudaState arg, int64_t* rng_state_ptr) { + if (arg.captured_) { + rng_state_ptr[0] = static_cast(*arg.seed_.ptr); + rng_state_ptr[1] = static_cast( + *(arg.offset_.ptr) + static_cast(arg.offset_intragraph_)); + } else { + rng_state_ptr[0] = static_cast(arg.seed_.val); + rng_state_ptr[1] = static_cast(arg.offset_.val); + } +} + +// extract PhiloxCudaState from CUDA random number generator +at::PhiloxCudaState init_philox_state( + at::CUDAGeneratorImpl* gen, + size_t max_seq_len, + size_t threads_per_cta) { + at::PhiloxCudaState philox_args; + size_t elts_per_thread = (max_seq_len * max_seq_len + threads_per_cta - 1)/threads_per_cta; + std::lock_guard lock(gen->mutex_); + philox_args = gen->philox_cuda_state(elts_per_thread); + return philox_args; +} + +// fused attention FWD with packed QKV +std::vector fused_attn_fwd_qkvpacked( + size_t b, size_t max_seqlen, size_t total_seqs, + size_t h, size_t d, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens, + const at::Tensor QKV, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen) { + using namespace transformer_engine; + + // create output tensor O + auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto O = torch::empty({static_cast(total_seqs), + static_cast(h), static_cast(d)}, options); + if (set_zero) { 
+ mha_fill(O, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } + + // construct NVTE tensors + TensorWrapper te_QKV, te_S, te_O, te_Bias, te_cu_seqlens; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + at::Tensor descale_S = torch::empty_like(scale_S.value()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_S.value().data_ptr(), + scale_S.value().data_ptr(), descale_S.data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); + } + if (Bias.has_value()) { + auto bias_shape = Bias.value().sizes().vec(); + std::vector shape{bias_shape.begin(), bias_shape.end()}; + te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, + DType::kFloat32, nullptr, nullptr, nullptr); + } + te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // convert strings to enums + NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); + NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); + NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); + + // extract random number generator seed and offset + auto gen = at::get_generator_or_default( + rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); + size_t threads_per_cta = 128; + at::PhiloxCudaState philox_args = init_philox_state(gen, max_seqlen, threads_per_cta); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( + philox_args, static_cast(rng_state.data_ptr())); + auto te_rng_state = makeTransformerEngineTensor(rng_state); + + // create auxiliary output tensors + // if training, tensors are [M, ZInv] + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_fwd_qkvpacked( + te_QKV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens.data(), + te_rng_state.data(), + max_seqlen, + is_training, attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace and auxiliary output tensors + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), 
workspace.dtype()); + + // output_tensors = [O, nvte_aux_tensor_pack.tensors, rng_state] + std::vector output_tensors; + output_tensors.push_back(O); + // nvte_aux_tensor_pack.size is 0 if inference + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + // allocate memory for nvte_aux_tensor_pack.tensors + auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + output_tensors.push_back(output_tensor); + tensor->data.dptr = output_tensor.data_ptr(); + } + if (is_training) { + output_tensors.push_back(rng_state); + } + + // execute the kernel + nvte_fused_attn_fwd_qkvpacked( + te_QKV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens.data(), + te_rng_state.data(), + max_seqlen, + is_training, attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers, but not allocated memory + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + // if training, [O, M, ZInv, rng_state]; if inference, [O] + return output_tensors; +} + +// fused attention BWD with packed QKV +std::vector fused_attn_bwd_qkvpacked( + size_t b, size_t max_seqlen, size_t total_seqs, + size_t h, size_t d, + float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens, + const at::Tensor QKV, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV, + const c10::optional dBias) { + using namespace transformer_engine; 
+ + // create output tensor dQKV + at::Tensor dQKV = torch::empty_like(QKV); + if (set_zero) { + mha_fill(dQKV, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } + + // construct NVTE tensors + TensorWrapper te_QKV, te_O, te_dO, te_S, te_dP, te_dQKV, te_dBias; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!scale_S.has_value()) || (!scale_dP.has_value()) + || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; + err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, + nullptr, scale_S.value().data_ptr(), descale_S.value().data_ptr()); + at::Tensor descale_dP = torch::empty_like(scale_dP.value()); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), + descale_dP.data_ptr()); + te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, 
nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); + } + if (dBias.has_value()) { + auto bias_shape = dBias.value().sizes().vec(); + std::vector shape{bias_shape.begin(), bias_shape.end()}; + te_dBias = makeTransformerEngineTensor( + dBias.value().data_ptr(), shape, DType::kFloat32, + nullptr, nullptr, nullptr); + } + + // convert strings to enums + NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); + NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); + NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); + + // convert auxiliary tensors from forward into NVTETensors + // aux_ctx_tensors are [M, ZInv, rng_state] + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); + std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); + tensor->data.shape = std::vector(tmp.begin(), tmp.end()); + tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); + } + + // create cu_seqlens tensorwrappers + TensorWrapper te_cu_seqlens; + te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // 
create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_bwd_qkvpacked( + te_QKV.data(), + te_dBias.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQKV.data(), + te_cu_seqlens.data(), + max_seqlen, + attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // execute kernel + nvte_fused_attn_bwd_qkvpacked( + te_QKV.data(), + te_dBias.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQKV.data(), + te_cu_seqlens.data(), + max_seqlen, + attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + return {dQKV}; +} + +// fused attention FWD with packed KV +std::vector fused_attn_fwd_kvpacked( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t total_seqs_q, size_t total_seqs_kv, + size_t h, size_t d, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor KV, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen) { + using namespace transformer_engine; + + // create output tensor O + auto options = 
torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto O = torch::empty({static_cast(total_seqs_q), + static_cast(h), static_cast(d)}, options); + if (set_zero) { + mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } + + // construct NVTE tensors + TensorWrapper te_Q, te_KV, te_S, te_O, te_Bias, te_cu_seqlens_q, te_cu_seqlens_kv; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + at::Tensor descale_S = torch::empty_like(scale_S.value()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_S.value().data_ptr(), + scale_S.value().data_ptr(), descale_S.data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention 
only supports FP8 and BF16/FP16 data types. \n"); + } + if (Bias.has_value()) { + auto bias_shape = Bias.value().sizes().vec(); + std::vector shape{bias_shape.begin(), bias_shape.end()}; + te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, + DType::kFloat32, nullptr, nullptr, nullptr); + } + te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // convert strings to enums + NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); + NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); + NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); + + // extract rng seed and offset + auto gen = at::get_generator_or_default( + rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); + size_t threads_per_cta = 128; + at::PhiloxCudaState philox_args = init_philox_state( + gen, max(max_seqlen_q, max_seqlen_kv), threads_per_cta); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( + philox_args, static_cast(rng_state.data_ptr())); + auto te_rng_state = makeTransformerEngineTensor(rng_state); + + // create auxiliary output tensors + // if training, tensors are [M, ZInv] + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_fwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + te_rng_state.data(), + max_seqlen_q, max_seqlen_kv, + is_training, attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + 
// allocate memory for workspace and auxiliary output tensors + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // output_tensors = [O, nvte_aux_tensor_pack.tensors, rng_state] + std::vector output_tensors; + output_tensors.push_back(O); + // nvte_aux_tensor_pack.size is 0 if inference + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + // allocate memory for nvte_aux_tensor_pack.tensors + auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + output_tensors.push_back(output_tensor); + tensor->data.dptr = output_tensor.data_ptr(); + } + if (is_training) { + output_tensors.push_back(rng_state); + } + + // execute the kernel + nvte_fused_attn_fwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + te_rng_state.data(), + max_seqlen_q, max_seqlen_kv, + is_training, attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers, but not allocated memory + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + // if training, [O, M, ZInv, rng_state]; if inference, [O] + return output_tensors; +} + +// fused attention BWD with packed KV +std::vector fused_attn_bwd_kvpacked( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t total_seqs_q, size_t total_seqs_kv, + size_t h, size_t d, + float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor KV, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType 
qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV, + const c10::optional dBias) { + using namespace transformer_engine; + + // create output tensors dQ and dKV + at::Tensor dQ = torch::empty_like(Q); + at::Tensor dKV = torch::empty_like(KV); + if (set_zero) { + mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + mha_fill(dKV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } + + // construct NVTE tensors + TensorWrapper te_Q, te_KV, te_O, te_dO, te_S, te_dP, te_dQ, te_dKV, te_dBias; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!scale_S.has_value()) || (!scale_dP.has_value()) + || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; + err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); + } + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, + scale_S.value().data_ptr(), descale_S.value().data_ptr()); + at::Tensor descale_dP = torch::empty_like(scale_dP.value()); + te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, + amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), + descale_dP.data_ptr()); + te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), {total_seqs_q, h, d}, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), {total_seqs_kv, 2, h, d}, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dQ = 
makeTransformerEngineTensor(dQ.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); + } + if (dBias.has_value()) { + auto bias_shape = dBias.value().sizes().vec(); + std::vector shape{bias_shape.begin(), bias_shape.end()}; + te_dBias = makeTransformerEngineTensor( + dBias.value().data_ptr(), shape, DType::kFloat32, + nullptr, nullptr, nullptr); + } + + // create cu_seqlens tensorwrappers + TensorWrapper te_cu_seqlens_q, te_cu_seqlens_kv; + te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // convert strings to enums + NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); + NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); + NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); + + // convert auxiliary tensors from forward to NVTETensors + // aux_ctx_tensors are [M, ZInv, rng_state] + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); + std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); + tensor->data.shape = std::vector(tmp.begin(), tmp.end()); + tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); + } + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_bwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_dBias.data(), + 
te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQ.data(), + te_dKV.data(), + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + max_seqlen_q, max_seqlen_kv, + attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // execute kernel + nvte_fused_attn_bwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_dBias.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQ.data(), + te_dKV.data(), + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + max_seqlen_q, max_seqlen_kv, + attn_scale, p_dropout, + qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + return {dQ, dKV}; +} + void te_gemm(at::Tensor A, at::Tensor A_scale_inverse, transformer_engine::DType A_type, @@ -749,13 +1485,13 @@ at::Tensor cast_to_fp8(const at::Tensor &input, transformer_engine::DType otype ) { using namespace transformer_engine; - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); + auto input_shape = input.sizes().vec(); + std::vector shape{input_shape.begin(), input_shape.end()}; auto output = at::empty_like(input, at::CUDA(GetATenDType(otype))); auto input_cu = makeTransformerEngineTensor(input); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {N, H}, otype, + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), shape, otype, amax.data_ptr(), scale.data_ptr(), scale_inv.data_ptr()); @@ -795,12 +1531,12 @@ at::Tensor cast_from_fp8(const at::Tensor &input, transformer_engine::DType otype ) { using 
namespace transformer_engine; - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); + auto input_shape = input.sizes().vec(); + std::vector shape{input_shape.begin(), input_shape.end()}; auto output = at::empty_like(input, at::CUDA(GetATenDType(otype))); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {N, H}, itype, + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), shape, itype, nullptr, nullptr, scale_inv.data_ptr()); auto output_cu = makeTransformerEngineTensor(output); @@ -1066,6 +1802,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8"); m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8"); m.def("te_gemm", &te_gemm, "CublasLt GEMM"); + m.def("fused_attn_fwd_qkvpacked", &fused_attn_fwd_qkvpacked, + "Fused Attention FP8/BF16/FP16 FWD with packed QKV"); + m.def("fused_attn_bwd_qkvpacked", &fused_attn_bwd_qkvpacked, + "Fused Attention FP8/BF16/FP16 BWD with packed QKV"); + m.def("fused_attn_fwd_kvpacked", &fused_attn_fwd_kvpacked, + "Fused Attention FP8/BF16/FP16 FWD with packed KV"); + m.def("fused_attn_bwd_kvpacked", &fused_attn_bwd_kvpacked, + "Fused Attention FP8/BF16/FP16 BWD with packed KV"); m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); m.def("fp8_gelu", &fp8_gelu, "GeLU with FP8 output"); diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 6be404226e..561ba417e6 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -5,7 +5,95 @@ ************************************************************************/ #include "common.h" - +#include "../common.h" + +NVTE_QKV_Layout get_nvte_qkv_layout(const std::string qkv_layout); + +NVTE_Bias_Type get_nvte_bias_type(const std::string bias_type); + +NVTE_Mask_Type get_nvte_mask_type(const std::string mask_type); + +std::vector fused_attn_fwd_qkvpacked( + size_t b, 
size_t max_seqlen, size_t total_seqs, + size_t h, size_t d, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens, + const at::Tensor QKV, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen); + +std::vector fused_attn_bwd_qkvpacked( + size_t b, size_t max_seqlen, size_t total_seqs, + size_t h, size_t d, + float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens, + const at::Tensor QKV, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV, + const c10::optional dBias); + +std::vector fused_attn_fwd_kvpacked( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t total_seqs_q, size_t total_seqs_kv, + size_t h, size_t d, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor KV, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen); + +std::vector fused_attn_bwd_kvpacked( + size_t b, size_t max_seqlen_q, size_t 
max_seqlen_kv, + size_t total_seqs_q, size_t total_seqs_kv, + size_t h, size_t d, + float attn_scale, float p_dropout, bool set_zero, + std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor KV, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV, + const c10::optional dBias); void te_gemm(at::Tensor A, at::Tensor A_scale_inverse, diff --git a/transformer_engine/pytorch/module.py b/transformer_engine/pytorch/module.py index 3e0a868047..07805088b2 100644 --- a/transformer_engine/pytorch/module.py +++ b/transformer_engine/pytorch/module.py @@ -102,7 +102,7 @@ def get_workspace() -> torch.Tensor: global _cublas_workspace if _cublas_workspace is None: _cublas_workspace = torch.empty( - get_cublas_workspace_size_bytes(), dtype=torch.int8, device="cuda" + get_cublas_workspace_size_bytes(), dtype=torch.uint8, device="cuda" ) return _cublas_workspace @@ -520,7 +520,7 @@ def set_fp8_weights(self) -> None: torch.empty( shape, device=torch.cuda.current_device(), - dtype=torch.int8, + dtype=torch.uint8, ), ) setattr( @@ -530,7 +530,7 @@ def set_fp8_weights(self) -> None: shape[1], shape[0], device=torch.cuda.current_device(), - dtype=torch.int8, + dtype=torch.uint8, ), ) From e1ef756590fb3e73043c1f17b1f6783d9b40b016 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Fri, 21 Apr 2023 16:22:56 -0700 Subject: [PATCH 021/427] zero inter-node communication buffer (#163) Signed-off-by: Sangkug Lym Co-authored-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/csrc/userbuffers/userbuffers.h | 2 -- 1 
file changed, 2 deletions(-) diff --git a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h index 1d4c1d4024..d6ec23c40d 100644 --- a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h @@ -34,8 +34,6 @@ #define NVTE_REG0_FLAGS (NVTE_REG0_RECV + NVTE_MAX_PEERS * NVTE_MAX_REGIONS) #define NVTE_REG0_IBRS 32 #define NVTE_REG0_IBAG 512 -#undef NVTE_REG0_COMMBUFFER -#define NVTE_REG0_COMMBUFFER (1024 * 1024 * 16) // gpuflags map offsets #define NVTE_GF_STATE 16000 From 9d90eb477974182c196b556a1fea79b81c368603 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 21 Apr 2023 16:40:15 -0700 Subject: [PATCH 022/427] Remove userbuf docs (#164) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/transformer.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 52d303e8f4..dfa28846af 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -924,12 +924,6 @@ class TransformerLayer(torch.nn.Module): `set_tensor_parallel_group(tp_group)` method on the initialized module before the forward pass to supply the tensor parallel group needed for tensor and sequence parallel collectives. 
- ub_bulk_wgrad: bool, default = False - Bulk overlap UserBuffer ReduceScatter | WGRAD GEMM - ub_bulk_dgrad: bool, default = False - Bulk overlap UserBuffer AllGather | DGRAD GEMM - ub_split_ag: bool, default = False - Split pipelined overlap UserBuffer AllGather -> GEMM Optimization parameters ----------------------- From 71488dbec80899d2ce5e1730b08a6feb9451f0ec Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 27 Apr 2023 17:09:24 -0700 Subject: [PATCH 023/427] Faster split of QKV for FlashAttention (#166) * Faster split of QKV for FlashAttention Signed-off-by: Przemek Tredak * CI fixes Signed-off-by: Przemek Tredak * Fix Signed-off-by: Przemek Tredak * review comments Signed-off-by: Kirthi Shankar Sivamani * Message with assert Co-authored-by: Przemyslaw Tredak Signed-off-by: Kirthi Shankar Sivamani * Review comments Signed-off-by: Kirthi Shankar Sivamani * review Signed-off-by: Kirthi Shankar Sivamani * fix misalignment error Signed-off-by: Kirthi Shankar Sivamani * make clarifying comment and check strides Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Przemek Tredak Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/csrc/extensions.cu | 171 ++++++++++++++++++ transformer_engine/pytorch/transformer.py | 119 +++++++++++- 2 files changed, 284 insertions(+), 6 deletions(-) diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu index 75d4abd031..4cb6c50c34 100644 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ b/transformer_engine/pytorch/csrc/extensions.cu @@ -1767,6 +1767,175 @@ bool userbuf_comm_available() { // TODO(ksivamani) check on python side void placeholder() {} // TODO(ksivamani) clean this up +namespace flash_attention { + +constexpr int warp_size = 32; +constexpr int type_size = 2; // FP16 or BF16 +constexpr int nvec = sizeof(uint64_t) / type_size; +constexpr int 
load_size = warp_size * nvec; +constexpr int block_size = 512; + +template +__launch_bounds__(block_size) +__global__ void prepare_kernel_fwd(const T *qkvi, + T *qkv, + const size_t B, + const size_t S, + const size_t Z, + const size_t W) { + const int warpid = (blockDim.x * blockIdx.x + threadIdx.x) / warp_size; + const int id_in_warp = threadIdx.x % warp_size; + const size_t offset_input = blockIdx.y * W + warpid * 3 * W * Z + id_in_warp * nvec; + const T *my_input = qkvi + offset_input; + + const size_t s = warpid / B; + if (s >= S) return; + + const size_t b = warpid % B; + + const size_t offset_output = blockIdx.y * B * S * Z * W + + (s + b * S) * W * Z + + id_in_warp * nvec; + + T *my_output = qkv + offset_output; + + for (int i = 0; i < Z; ++i) { + uint64_t *out = reinterpret_cast(my_output + i * load_size); + *out = *reinterpret_cast(my_input + i * load_size * 3); + } +} + +template +__launch_bounds__(block_size) +__global__ void prepare_kernel_bwd(const T *q, const T *k, const T *v, + T *qkv, const size_t B, const size_t S, + const size_t Z, const size_t W) { + const T *input = blockIdx.y == 0 ? q : (blockIdx.y == 1 ? 
k : v); + + const int warpid = (blockDim.x * blockIdx.x + threadIdx.x) / warp_size; + const int id_in_warp = threadIdx.x % warp_size; + const size_t offset_input = warpid * W * Z + id_in_warp * nvec; + const T *my_input = input + offset_input; + + const size_t b = warpid / S; + if (b >= B) return; + + const size_t s = warpid % S; + + const size_t offset_output = (b + s * B) * 3 * W * Z + + id_in_warp * nvec + blockIdx.y * W; + + T *my_output = qkv + offset_output; + + for (int i = 0; i < Z; ++i) { + uint64_t *out = reinterpret_cast(my_output + i * load_size * 3); + *out = *reinterpret_cast(my_input + i * load_size); + } +} + +} // namespace flash_attention + +at::Tensor fa_prepare_fwd(at::Tensor qkvi) { + NVTE_CHECK(qkvi.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(qkvi.scalar_type() == at::ScalarType::Half || + qkvi.scalar_type() == at::ScalarType::BFloat16); + NVTE_CHECK(qkvi.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(qkvi.size(3) == flash_attention::load_size); + NVTE_CHECK(qkvi.stride(3) == 1, "Wrong stride."); + NVTE_CHECK(qkvi.stride(2) == 3 * qkvi.size(3), "Wrong stride."); + NVTE_CHECK(qkvi.stride(1) == 3 * qkvi.size(3) * qkvi.size(2), "Wrong stride."); + NVTE_CHECK(qkvi.stride(0) == 3 * qkvi.size(3) * qkvi.size(2) * qkvi.size(1), "Wrong stride."); + + // [s, b, n, h * 3] -> [3, b, s, n, h] + std::vector shape = {3, qkvi.size(1), qkvi.size(0), qkvi.size(2), qkvi.size(3)}; + at::Tensor qkv = at::empty(shape, at::CUDA(qkvi.scalar_type())); + + size_t warps = qkvi.size(0) * qkvi.size(1); + size_t warps_per_block = flash_attention::block_size / flash_attention::warp_size; + size_t blocks = (warps + warps_per_block - 1) / warps_per_block; + dim3 grid(blocks, 3); + int threads = flash_attention::block_size; + if (qkvi.scalar_type() == at::ScalarType::Half) { + using dtype = at::Half; + flash_attention::prepare_kernel_fwd<<>>( + qkvi.data_ptr(), + qkv.data_ptr(), + shape[1], + shape[2], + shape[3], + shape[4]); + } else { + using dtype = 
at::BFloat16; + flash_attention::prepare_kernel_fwd<<>>( + qkvi.data_ptr(), + qkv.data_ptr(), + shape[1], + shape[2], + shape[3], + shape[4]); + } + + return qkv; +} + +at::Tensor fa_prepare_bwd(at::Tensor q, at::Tensor k, at::Tensor v) { + NVTE_CHECK(q.is_contiguous()); + NVTE_CHECK(k.is_contiguous()); + NVTE_CHECK(v.is_contiguous()); + NVTE_CHECK(q.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(k.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(v.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(q.scalar_type() == at::ScalarType::Half || + q.scalar_type() == at::ScalarType::BFloat16); + NVTE_CHECK(k.scalar_type() == q.scalar_type()); + NVTE_CHECK(v.scalar_type() == q.scalar_type()); + NVTE_CHECK(q.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(q.size(3) == flash_attention::load_size); + NVTE_CHECK(k.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(k.size(3) == flash_attention::load_size); + NVTE_CHECK(v.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(v.size(3) == flash_attention::load_size); + + // 3 x [s, b, n, h] -> [b, s, n, 3 * h] + + std::vector shape = {q.size(1), q.size(0), q.size(2), 3 * q.size(3)}; + at::Tensor qkv = at::empty(shape, at::CUDA(q.scalar_type())); + + size_t warps = q.size(0) * q.size(1); + size_t warps_per_block = flash_attention::block_size / flash_attention::warp_size; + size_t blocks = (warps + warps_per_block - 1) / warps_per_block; + dim3 grid(blocks, 3); + int threads = flash_attention::block_size; + if (q.scalar_type() == at::ScalarType::Half) { + using dtype = at::Half; + flash_attention::prepare_kernel_bwd<<>>( + q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + qkv.data_ptr(), + q.size(0), + q.size(1), + q.size(2), + q.size(3)); + } else { + using dtype = at::BFloat16; + flash_attention::prepare_kernel_bwd<<>>( + q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + qkv.data_ptr(), + q.size(0), + q.size(1), + q.size(2), + q.size(3)); + } + + return qkv; +} PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // 
Softmax functions @@ -1812,6 +1981,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Fused Attention FP8/BF16/FP16 BWD with packed KV"); m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); m.def("fp8_gelu", &fp8_gelu, "GeLU with FP8 output"); + m.def("fa_prepare_fwd", &fa_prepare_fwd, "Prepare QKV for Flash Attention"); + m.def("fa_prepare_bwd", &fa_prepare_bwd, "Backward of QKV preparation for Flash Attention"); // Misc m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index dfa28846af..7071378b61 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -77,6 +77,48 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: output = hidden_state.div(keep_prob) * random_tensor return output +class _SplitLastDim(torch.autograd.Function): + """""" + + @staticmethod + def forward(ctx, + mixed_x_layer: torch.Tensor, + num_parts: int + ) -> Tuple[torch.Tensor, ...]: + return split_tensor_along_dim(mixed_x_layer, -1, num_parts) + + @staticmethod + def backward(ctx, + *grad_outputs): + assert len(grad_outputs) > 0, "No gradients received for backprop!" 
+ + noop_ok = True + strides = grad_outputs[0].stride() + data_ptr = grad_outputs[0].untyped_storage().data_ptr() + shape = grad_outputs[0].shape + last_dim_size = grad_outputs[0].shape[-1] + for i, tensor in enumerate(grad_outputs): + if (tensor.stride() != strides or + tensor.shape != shape or + tensor.untyped_storage().data_ptr() != data_ptr or + tensor.storage_offset() != i * last_dim_size): + noop_ok = False + break + + if noop_ok: + ret = torch.Tensor().to(grad_outputs[0].dtype) + ret = torch.Tensor().to(device=grad_outputs[0].device, + dtype=grad_outputs[0].dtype) + new_shape = list(shape) + new_shape[-1] = new_shape[-1] * len(grad_outputs) + ret.set_(grad_outputs[0].untyped_storage(), + grad_outputs[0].storage_offset(), + new_shape, + grad_outputs[0].stride() + ) + return ret, None + + return torch.cat(grad_outputs, dim = -1), None class UnfusedDotProductAttention(torch.nn.Module): """Parallel attention w/o QKV and Proj Gemms @@ -204,6 +246,56 @@ def forward( return context_layer +class _PrepareQKVForFA(torch.autograd.Function): + """This class converts QKV from interleaved (s, b, ...) layout + to separate contiguous q, k, v tensors in (b, s, ...) layout.""" + + @staticmethod + def forward(ctx, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor + ) -> torch.Tensor: + # All inputs received are non-contiguous tensors. + # The `query_layer` tensor is used to access the + # full memory region of the QKV tensor. 
+ qkv = tex.fa_prepare_fwd(query_layer) + q, k, v = split_tensor_along_dim(qkv, 0, 3) + query_layer = torch.squeeze(q, 0) + key_layer = torch.squeeze(k, 0) + value_layer = torch.squeeze(v, 0) + return query_layer, key_layer, value_layer + + @staticmethod + def backward(ctx, + dq: torch.Tensor, + dk: torch.Tensor, + dv: torch.Tensor + ) -> Tuple[Union[torch.Tensor, None], ...]: + dqkv = tex.fa_prepare_bwd(dq, dk, dv) + dq, dk, dv = split_tensor_along_dim(dqkv, -1, 3) + return dq, dk, dv + +def _check_if_interleaved(q, k, v): + data_ptr = q.untyped_storage().data_ptr() + check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [q, k, v]) + if not check_ptrs: + return False + + stride = q.stride() + check_strides = all(stride == x.stride() for x in [q, k, v]) + if not check_strides: + return False + + shape = q.shape + check_shapes = all(shape == x.shape for x in [q, k, v]) + if not check_shapes: + return False + + last_dim_size = shape[-1] + check_offsets = all(i * last_dim_size == x.storage_offset() + for i, x in enumerate([q, k, v])) + return check_offsets class FlashAttention(torch.nn.Module): """Dot product attention implementation by using the flash-attn package. @@ -252,8 +344,17 @@ def forward( attention_mask is None ), 'FlashAttention currently does not support external attention mask.' 
- query_layer, key_layer, value_layer = [x.transpose(0,1).contiguous() - for x in (query_layer, key_layer, value_layer)] + # For now just 128, will make it more general in the future + + if (query_layer.shape[-1] == 128 and + query_layer.shape[0] * query_layer.shape[1] >= 512 and + _check_if_interleaved(query_layer, key_layer, value_layer)): + query_layer, key_layer, value_layer = _PrepareQKVForFA.apply(query_layer, + key_layer, + value_layer) + else: + query_layer, key_layer, value_layer = [x.transpose(0,1).contiguous() + for x in (query_layer, key_layer, value_layer)] batch_size, seqlen = query_layer.shape[0], query_layer.shape[1] @@ -731,9 +832,12 @@ def forward( mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # mixed_x_layer --> 3 [sq, b, np, hn] - query_layer, key_layer, value_layer = split_tensor_along_dim( - mixed_x_layer, split_dim, 3 - ) + if split_dim == -1 and not is_in_onnx_export_mode(): + query_layer, key_layer, value_layer = _SplitLastDim.apply(mixed_x_layer, 3) + else: + query_layer, key_layer, value_layer = split_tensor_along_dim( + mixed_x_layer, split_dim, 3 + ) else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer = self.key_value( @@ -761,7 +865,10 @@ def forward( mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) # mixed_kv_layer --> 2 [sk, b, np, hn] - key_layer, value_layer = split_tensor_along_dim(mixed_kv_layer, split_dim, 2) + if split_dim == -1 and not is_in_onnx_export_mode(): + key_layer, value_layer = _SplitLastDim.apply(mixed_kv_layer, 2) + else: + key_layer, value_layer = split_tensor_along_dim(mixed_kv_layer, split_dim, 2) # Attention head [sq, b, h] --> [sq, b, hp] if self.input_layernorm: From 87706dc6a65e7d5e44acf801527ceb898e990ecd Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 27 Apr 2023 17:09:59 -0700 Subject: [PATCH 024/427] Remove the nonexistent parameter from fused attention documentation (#181) * Remove the nonexistent parameter from fused attention documentation 
Signed-off-by: Przemek Tredak * Remove the second instance Signed-off-by: Przemek Tredak --------- Signed-off-by: Przemek Tredak Signed-off-by: Kirthi Shankar Sivamani --- .../common/include/transformer_engine/fused_attn.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index bb9262de18..967fc62724 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -133,7 +133,6 @@ void nvte_fused_attn_fwd_qkvpacked( * \param[in] Aux_CTX_Tensors Auxiliary tensors from forward when in training mode. * \param[out] dQKV The gradient of the QKV tensor. * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. - * \param[in] rng_state Seed and offset of CUDA random number generator. * \param[in] max_seqlen Max sequence length used for computing, * it may be >= max(cu_seqlens). * \param[in] attn_scale Scaling factor for Q * K.T. @@ -222,7 +221,6 @@ void nvte_fused_attn_fwd_kvpacked( * \param[out] dKV The gradient of the KV tensor. * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] rng_state Seed and offset of CUDA random number generator. * \param[in] max_seqlen_q Max sequence length used for computing for Q. * it may be >= max(cu_seqlens_q). * \param[in] max_seqlen_kv Max sequence length used for computing for KV. 
From 2ce7f0c8b06498a41eb90192bef28021b46ffb26 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 27 Apr 2023 17:12:07 -0700 Subject: [PATCH 025/427] Re-add support for PyTorch version 1.x (#180) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/transformer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 7071378b61..fae4ff595d 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -94,13 +94,13 @@ def backward(ctx, noop_ok = True strides = grad_outputs[0].stride() - data_ptr = grad_outputs[0].untyped_storage().data_ptr() + data_ptr = grad_outputs[0].storage().data_ptr() shape = grad_outputs[0].shape last_dim_size = grad_outputs[0].shape[-1] for i, tensor in enumerate(grad_outputs): if (tensor.stride() != strides or tensor.shape != shape or - tensor.untyped_storage().data_ptr() != data_ptr or + tensor.storage().data_ptr() != data_ptr or tensor.storage_offset() != i * last_dim_size): noop_ok = False break @@ -111,7 +111,7 @@ def backward(ctx, dtype=grad_outputs[0].dtype) new_shape = list(shape) new_shape[-1] = new_shape[-1] * len(grad_outputs) - ret.set_(grad_outputs[0].untyped_storage(), + ret.set_(grad_outputs[0].storage(), grad_outputs[0].storage_offset(), new_shape, grad_outputs[0].stride() @@ -277,8 +277,8 @@ def backward(ctx, return dq, dk, dv def _check_if_interleaved(q, k, v): - data_ptr = q.untyped_storage().data_ptr() - check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [q, k, v]) + data_ptr = q.storage().data_ptr() + check_ptrs = all(x.storage().data_ptr() == data_ptr for x in [q, k, v]) if not check_ptrs: return False From 00707bbd13429d40ee1eec0f11b09c9cff743b83 Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Fri, 28 Apr 2023 10:04:35 +0800 Subject: [PATCH 026/427] [JAX] Adjust Module Structure. (#169) * Adjust Module Structure. 
1. Collect Flax related modules to a sub-folder, flax. 2. Add a function to unify scale_init for zero-centered-gamma LN. Signed-off-by: Ming Huang * Make changes be compatible to previous versions. Signed-off-by: Ming Huang * Adapt jax/examples to the new module structure. Signed-off-by: Ming Huang * Update jax/docs and Add deprecated warning. Signed-off-by: Ming Huang * Update README Signed-off-by: Ming Huang * Adding deprecated_wrapper Signed-off-by: Ming Huang * Adding deprecated warning to flax modules which imported via transformer_engine.jax Signed-off-by: Ming Huang * Fix CI errors and update docs. Signed-off-by: Ming Huang * Removing unnecessary deprecated warning in docs. Signed-off-by: Ming Huang * Implementing __iter__ to DeprecatedEnum. Signed-off-by: Ming Huang --------- Signed-off-by: Ming Huang Co-authored-by: Kirthi Shankar Sivamani --- README.rst | 5 +- docs/api/jax.rst | 25 +++++---- .../encoder/test_model_parallel_encoder.py | 24 ++++----- examples/jax/encoder/test_multigpu_encoder.py | 8 +-- .../jax/encoder/test_single_gpu_encoder.py | 6 +-- examples/jax/mnist/test_single_gpu_mnist.py | 2 +- tests/jax/test_layer.py | 2 +- tests/jax/test_sharding.py | 2 +- transformer_engine/common/utils.py | 53 +++++++++++++++++++ transformer_engine/jax/__init__.py | 41 ++++++++++++-- transformer_engine/jax/flax/__init__.py | 9 ++++ transformer_engine/jax/{ => flax}/module.py | 46 ++++++++-------- .../jax/{ => flax}/transformer.py | 6 +-- 13 files changed, 162 insertions(+), 67 deletions(-) create mode 100644 transformer_engine/common/utils.py create mode 100644 transformer_engine/jax/flax/__init__.py rename transformer_engine/jax/{ => flax}/module.py (97%) rename transformer_engine/jax/{ => flax}/transformer.py (99%) diff --git a/README.rst b/README.rst index fe576f3498..6964f219d0 100644 --- a/README.rst +++ b/README.rst @@ -69,6 +69,9 @@ pyTorch JAX ^^^ +Flax +~~~~ + .. 
code-block:: python import jax @@ -90,7 +93,7 @@ JAX # Enable autocasting for the forward pass with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe): - model = te.DenseGeneral(features=HIDDEN) + model = te.flax.DenseGeneral(features=HIDDEN) def loss_fn(params, other_vars, inp): out = model.apply({'params':params, **other_vars}, inp) diff --git a/docs/api/jax.rst b/docs/api/jax.rst index e049c70e50..13b276c3a1 100644 --- a/docs/api/jax.rst +++ b/docs/api/jax.rst @@ -9,34 +9,33 @@ Jax .. autoapiclass:: transformer_engine.jax.MajorShardingType .. autoapiclass:: transformer_engine.jax.ShardingType .. autoapiclass:: transformer_engine.jax.TransformerLayerType +.. autoapiclass:: transformer_engine.jax.ShardingResource(dp_resource=None, tp_resource=None) -.. autoapiclass:: transformer_engine.jax.ShardingResource(dp_resource=None, tp_resource=None) +.. autoapifunction:: transformer_engine.jax.fp8_autocast +.. autoapifunction:: transformer_engine.jax.update_collections +.. autoapifunction:: transformer_engine.jax.update_fp8_metas -.. autoapiclass:: transformer_engine.jax.LayerNorm(epsilon=1e-6, layernorm_type='layernorm', **kwargs) +.. autoapiclass:: transformer_engine.jax.flax.LayerNorm(epsilon=1e-6, layernorm_type='layernorm', **kwargs) :members: __call__ -.. autoapiclass:: transformer_engine.jax.DenseGeneral(features, layernorm_type='layernorm', use_bias=False, **kwargs) +.. autoapiclass:: transformer_engine.jax.flax.DenseGeneral(features, layernorm_type='layernorm', use_bias=False, **kwargs) :members: __call__ -.. autoapiclass:: transformer_engine.jax.LayerNormDenseGeneral(features, layernorm_type='layernorm', epsilon=1e-6, use_bias=False, **kwargs) +.. autoapiclass:: transformer_engine.jax.flax.LayerNormDenseGeneral(features, layernorm_type='layernorm', epsilon=1e-6, use_bias=False, **kwargs) :members: __call__ -.. autoapiclass:: transformer_engine.jax.LayerNormMLP(intermediate_dim=2048, layernorm_type='layernorm', epsilon=1e-6, use_bias=False, **kwargs) +.. 
autoapiclass:: transformer_engine.jax.flax.LayerNormMLP(intermediate_dim=2048, layernorm_type='layernorm', epsilon=1e-6, use_bias=False, **kwargs) :members: __call__ -.. autoapiclass:: transformer_engine.jax.RelativePositionBiases(num_buckets, max_distance, num_heads, **kwargs) +.. autoapiclass:: transformer_engine.jax.flax.RelativePositionBiases(num_buckets, max_distance, num_heads, **kwargs) :members: __call__ -.. autoapiclass:: transformer_engine.jax.MultiHeadAttention(head_dim, num_heads, **kwargs) +.. autoapiclass:: transformer_engine.jax.flax.MultiHeadAttention(head_dim, num_heads, **kwargs) :members: __call__ -.. autoapiclass:: transformer_engine.jax.TransformerLayer(hidden_size=512, mlp_hidden_size=2048, num_attention_heads=8, **kwargs) +.. autoapiclass:: transformer_engine.jax.flax.TransformerLayer(hidden_size=512, mlp_hidden_size=2048, num_attention_heads=8, **kwargs) :members: __call__ - -.. autoapifunction:: transformer_engine.jax.extend_logical_axis_rules -.. autoapifunction:: transformer_engine.jax.fp8_autocast -.. autoapifunction:: transformer_engine.jax.update_collections -.. autoapifunction:: transformer_engine.jax.update_fp8_metas \ No newline at end of file +.. 
autoapifunction:: transformer_engine.jax.flax.extend_logical_axis_rules diff --git a/examples/jax/encoder/test_model_parallel_encoder.py b/examples/jax/encoder/test_model_parallel_encoder.py index 10c880710e..ff09f1b84e 100644 --- a/examples/jax/encoder/test_model_parallel_encoder.py +++ b/examples/jax/encoder/test_model_parallel_encoder.py @@ -59,7 +59,7 @@ class Net(nn.Module): def __call__(self, x, mask, disable_dropout=False): x = nn.Embed(num_embeddings=self.num_embed, features=256, dtype=jnp.bfloat16)(x) - te_Encoder = partial(te.TransformerLayer, + te_Encoder = partial(te.flax.TransformerLayer, hidden_size=256, mlp_hidden_size=1024, num_attention_heads=8, @@ -73,17 +73,17 @@ def __call__(self, x, mask, disable_dropout=False): x = x.reshape(x.shape[0], -1) - x = te.DenseGeneral(features=256, - kernel_axes=(NAMED_BROADCAST_AXIS, NAMED_TP_AXIS), - bias_axes=(NAMED_TP_AXIS,), - sharding_type=te.ShardingType.DP_TP_COL, - dtype=jnp.bfloat16)(x) - - x = te.DenseGeneral(features=256, - kernel_axes=(NAMED_TP_AXIS, NAMED_BROADCAST_AXIS), - bias_axes=(NAMED_BROADCAST_AXIS,), - sharding_type=te.ShardingType.DP_TP_ROW, - dtype=jnp.bfloat16)(x) + x = te.flax.DenseGeneral(features=256, + kernel_axes=(NAMED_BROADCAST_AXIS, NAMED_TP_AXIS), + bias_axes=(NAMED_TP_AXIS,), + sharding_type=te.ShardingType.DP_TP_COL, + dtype=jnp.bfloat16)(x) + + x = te.flax.DenseGeneral(features=256, + kernel_axes=(NAMED_TP_AXIS, NAMED_BROADCAST_AXIS), + bias_axes=(NAMED_BROADCAST_AXIS,), + sharding_type=te.ShardingType.DP_TP_ROW, + dtype=jnp.bfloat16)(x) x = nn.Dense(features=2, dtype=jnp.bfloat16)(x) return x diff --git a/examples/jax/encoder/test_multigpu_encoder.py b/examples/jax/encoder/test_multigpu_encoder.py index 9cb420b0c8..5f06ddf879 100644 --- a/examples/jax/encoder/test_multigpu_encoder.py +++ b/examples/jax/encoder/test_multigpu_encoder.py @@ -56,7 +56,7 @@ class Net(nn.Module): def __call__(self, x, mask, disable_dropout=False): x = nn.Embed(num_embeddings=self.num_embed, 
features=256, dtype=jnp.bfloat16)(x) - te_Encoder = partial(te.TransformerLayer, + te_Encoder = partial(te.flax.TransformerLayer, hidden_size=256, mlp_hidden_size=1024, num_attention_heads=8, @@ -70,9 +70,11 @@ def __call__(self, x, mask, disable_dropout=False): x = x.reshape(x.shape[0], -1) - x = te.DenseGeneral(features=256, sharding_type=te.ShardingType.DP, dtype=jnp.bfloat16)(x) + x = te.flax.DenseGeneral(features=256, sharding_type=te.ShardingType.DP, + dtype=jnp.bfloat16)(x) - x = te.DenseGeneral(features=256, sharding_type=te.ShardingType.DP, dtype=jnp.bfloat16)(x) + x = te.flax.DenseGeneral(features=256, sharding_type=te.ShardingType.DP, + dtype=jnp.bfloat16)(x) x = nn.Dense(features=2, dtype=jnp.bfloat16)(x) return x diff --git a/examples/jax/encoder/test_single_gpu_encoder.py b/examples/jax/encoder/test_single_gpu_encoder.py index bac1469b5b..ea6c0abd51 100644 --- a/examples/jax/encoder/test_single_gpu_encoder.py +++ b/examples/jax/encoder/test_single_gpu_encoder.py @@ -46,7 +46,7 @@ class Net(nn.Module): def __call__(self, x, mask, disable_dropout=False): x = nn.Embed(num_embeddings=self.num_embed, features=256, dtype=jnp.bfloat16)(x) - te_Encoder = partial(te.TransformerLayer, + te_Encoder = partial(te.flax.TransformerLayer, hidden_size=256, mlp_hidden_size=1024, num_attention_heads=8, @@ -60,9 +60,9 @@ def __call__(self, x, mask, disable_dropout=False): x = x.reshape(x.shape[0], -1) - x = te.DenseGeneral(features=256, dtype=jnp.bfloat16)(x) + x = te.flax.DenseGeneral(features=256, dtype=jnp.bfloat16)(x) - x = te.DenseGeneral(features=256, dtype=jnp.bfloat16)(x) + x = te.flax.DenseGeneral(features=256, dtype=jnp.bfloat16)(x) x = nn.Dense(features=2, dtype=jnp.bfloat16)(x) return x diff --git a/examples/jax/mnist/test_single_gpu_mnist.py b/examples/jax/mnist/test_single_gpu_mnist.py index 0b16dd8b98..3b8e2d0bd9 100644 --- a/examples/jax/mnist/test_single_gpu_mnist.py +++ b/examples/jax/mnist/test_single_gpu_mnist.py @@ -47,7 +47,7 @@ class 
Net(nn.Module): @nn.compact def __call__(self, x, disable_dropout=False): if self.use_te: - nn_Dense = te.DenseGeneral + nn_Dense = te.flax.DenseGeneral else: nn_Dense = nn.Dense diff --git a/tests/jax/test_layer.py b/tests/jax/test_layer.py index 1522fa198b..c959f7abcf 100644 --- a/tests/jax/test_layer.py +++ b/tests/jax/test_layer.py @@ -10,7 +10,7 @@ import pytest from transformer_engine.common.recipe import Format -from transformer_engine.jax import TransformerLayer, TransformerLayerType +from transformer_engine.jax.flax import TransformerLayer, TransformerLayerType from transformer_engine.jax.fp8 import FP8Helper from utils import assert_allclose, is_fp8_supported from utils import DecoderLayer as RefDecoderLayer diff --git a/tests/jax/test_sharding.py b/tests/jax/test_sharding.py index 458e10ffac..cd135752c0 100644 --- a/tests/jax/test_sharding.py +++ b/tests/jax/test_sharding.py @@ -7,7 +7,7 @@ import pytest from jax.experimental import maps -from transformer_engine.jax import extend_logical_axis_rules +from transformer_engine.jax.flax import extend_logical_axis_rules from transformer_engine.jax.sharding import get_dot_sharding_meta from transformer_engine.jax.sharding import get_elementwise_sharding_meta from transformer_engine.jax.sharding import get_fp8_meta_sharding_meta diff --git a/transformer_engine/common/utils.py b/transformer_engine/common/utils.py new file mode 100644 index 0000000000..cf35108673 --- /dev/null +++ b/transformer_engine/common/utils.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""The utilities for Transformer Engine""" +import inspect +import warnings +from enum import Enum + +warnings.simplefilter('default') + + +class DeprecatedEnum: # pylint: disable=too-few-public-methods + """DeprecatedEnum""" + + def __init__(self, enum_cls, msg): + self.enum_cls = enum_cls + self.msg = msg + + def __iter__(self): + return iter(list(self.enum_cls.__members__.values())) + + def __getattr__(self, name): + if name in self.enum_cls.__members__: + warnings.warn(self.msg, DeprecationWarning) + return self.enum_cls.__members__[name] + raise AttributeError(f"{self.enum_cls} does not contain {name}") + + +def deprecate_wrapper(obj, msg): + """Deprecate wrapper""" + if inspect.isclass(obj): + if issubclass(obj, Enum): + return DeprecatedEnum(obj, msg) + + class DeprecatedCls(obj): # pylint: disable=too-few-public-methods + """DeprecatedCls""" + + def __init__(self, *args, **kwargs): + warnings.warn(msg, DeprecationWarning) + super().__init__(*args, **kwargs) + + return DeprecatedCls + + if inspect.isfunction(obj): + + def deprecated(*args, **kwargs): + warnings.warn(msg, DeprecationWarning) + return obj(*args, **kwargs) + + return deprecated + + raise NotImplementedError( + f"deprecate_cls_wrapper only support Class and Function, but got {type(obj)}.") diff --git a/transformer_engine/jax/__init__.py b/transformer_engine/jax/__init__.py index 750a34fb5b..9b7c2f224f 100644 --- a/transformer_engine/jax/__init__.py +++ b/transformer_engine/jax/__init__.py @@ -2,10 +2,41 @@ # # See LICENSE for license information. """Transformer Engine bindings for JAX""" + +from . 
import flax from .fp8 import fp8_autocast, update_collections, update_fp8_metas, get_delayed_scaling -from .module import DenseGeneral, LayerNorm -from .module import LayerNormDenseGeneral, LayerNormMLP, TransformerEngineBase -from .transformer import extend_logical_axis_rules -from .transformer import MultiHeadAttention, RelativePositionBiases -from .transformer import TransformerLayer, TransformerLayerType from .sharding import MajorShardingType, ShardingResource, ShardingType +from ..common.utils import deprecate_wrapper + +extend_logical_axis_rules = deprecate_wrapper( + flax.extend_logical_axis_rules, + "extend_logical_axis_rules is moving to transformer_engine.jax.flax module") +DenseGeneral = deprecate_wrapper(flax.DenseGeneral, + "DenseGeneral is moving to transformer_engine.jax.flax module") +LayerNorm = deprecate_wrapper(flax.LayerNorm, + "LayerNorm is moving to transformer_engine.jax.flax module") +LayerNormDenseGeneral = deprecate_wrapper( + flax.LayerNormDenseGeneral, + "LayerNormDenseGeneral is moving to transformer_engine.jax.flax module") +LayerNormMLP = deprecate_wrapper(flax.LayerNormMLP, + "LayerNormMLP is moving to transformer_engine.jax.flax module") +TransformerEngineBase = deprecate_wrapper( + flax.TransformerEngineBase, + "TransformerEngineBase is moving to transformer_engine.jax.flax module") +MultiHeadAttention = deprecate_wrapper( + flax.MultiHeadAttention, "MultiHeadAttention is moving to transformer_engine.jax.flax module") +RelativePositionBiases = deprecate_wrapper( + flax.RelativePositionBiases, + "RelativePositionBiases is moving to transformer_engine.jax.flax module") +TransformerLayer = deprecate_wrapper( + flax.TransformerLayer, "TransformerLayer is moving to transformer_engine.jax.flax module") +TransformerLayerType = deprecate_wrapper( + flax.TransformerLayerType, + "TransformerLayerType is moving to transformer_engine.jax.flax module") + +__all__ = [ + 'fp8_autocast', 'update_collections', 'update_fp8_metas', 
'get_delayed_scaling', + 'MajorShardingType', 'ShardingResource', 'ShardingType', 'flax', 'DenseGeneral', 'LayerNorm', + 'LayerNormDenseGeneral', 'LayerNormMLP', 'TransformerEngineBase', 'MultiHeadAttention', + 'RelativePositionBiases', 'TransformerLayer', 'TransformerLayerType' +] diff --git a/transformer_engine/jax/flax/__init__.py b/transformer_engine/jax/flax/__init__.py new file mode 100644 index 0000000000..5dd8f9bdf1 --- /dev/null +++ b/transformer_engine/jax/flax/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Transformer Engine bindings for JAX""" +from .module import DenseGeneral, LayerNorm +from .module import LayerNormDenseGeneral, LayerNormMLP, TransformerEngineBase +from .transformer import extend_logical_axis_rules +from .transformer import MultiHeadAttention, RelativePositionBiases +from .transformer import TransformerLayer, TransformerLayerType diff --git a/transformer_engine/jax/module.py b/transformer_engine/jax/flax/module.py similarity index 97% rename from transformer_engine/jax/module.py rename to transformer_engine/jax/flax/module.py index af96b95ada..f9924c600f 100644 --- a/transformer_engine/jax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -16,15 +16,15 @@ from jax import nn as jax_nn from jax import random as jax_random -from .dot import fp8_dot -from .fp8 import FP8GemmPackage, FP8Helper -from .layernorm import canonicalize_layernorm_type -from .layernorm import layernorm, layernorm_fp8_dot -from .mlp import fp8_ln_mlp, geglu -from .sharding import infer_sharding_type -from .softmax import is_softmax_kernel_available -from .sharding import MajorShardingType, ShardingType -from .softmax import softmax, SoftmaxType +from ..dot import fp8_dot +from ..fp8 import FP8GemmPackage, FP8Helper +from ..layernorm import canonicalize_layernorm_type +from ..layernorm import layernorm, layernorm_fp8_dot +from ..mlp import fp8_ln_mlp, 
geglu +from ..sharding import infer_sharding_type +from ..softmax import is_softmax_kernel_available +from ..sharding import MajorShardingType, ShardingType +from ..softmax import softmax, SoftmaxType PRNGKey = Any Shape = Tuple[int, ...] @@ -46,6 +46,13 @@ def _canonicalize_tuple(x): return (x,) +def _obtain_default_layernorm_scale_init_if_need(original_init, zero_centered_gamma): + if original_init is None: + if not zero_centered_gamma: + return nn.initializers.ones + return nn.initializers.zeros + + def _create_layernorm_parameters(layernorm_type, shape, scale_init, scale_axes, bias_init, bias_axes, dtype): scale = nn_partitioning.param_with_axes('scale', @@ -250,11 +257,8 @@ class LayerNorm(nn.Module): sharding_type: ShardingType = ShardingType.SINGLE def __post_init__(self): - if self.scale_init is None: - if not self.zero_centered_gamma: - self.scale_init = nn.initializers.ones - else: - self.scale_init = nn.initializers.zeros + self.scale_init = _obtain_default_layernorm_scale_init_if_need( + self.scale_init, self.zero_centered_gamma) super().__post_init__() @nn.compact @@ -549,11 +553,8 @@ class LayerNormDenseGeneral(TransformerEngineBase): def __post_init__(self): if self.kernel_init is None: self.kernel_init = nn.initializers.variance_scaling(1.0, 'fan_in', 'truncated_normal') - if self.scale_init is None: - if not self.zero_centered_gamma: - self.scale_init = nn.initializers.ones - else: - self.scale_init = nn.initializers.zeros + self.scale_init = _obtain_default_layernorm_scale_init_if_need( + self.scale_init, self.zero_centered_gamma) super().__post_init__() @nn.compact @@ -781,11 +782,8 @@ class LayerNormMLP(TransformerEngineBase): def __post_init__(self): if self.kernel_init is None: self.kernel_init = nn.initializers.variance_scaling(1.0, 'fan_in', 'truncated_normal') - if self.scale_init is None: - if not self.zero_centered_gamma: - self.scale_init = nn.initializers.ones - else: - self.scale_init = nn.initializers.zeros + self.scale_init = 
_obtain_default_layernorm_scale_init_if_need( + self.scale_init, self.zero_centered_gamma) super().__post_init__() @nn.compact diff --git a/transformer_engine/jax/transformer.py b/transformer_engine/jax/flax/transformer.py similarity index 99% rename from transformer_engine/jax/transformer.py rename to transformer_engine/jax/flax/transformer.py index 2ec33cf5b6..aaecab7b51 100644 --- a/transformer_engine/jax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -18,9 +18,9 @@ from .module import DenseGeneral, LayerNormDenseGeneral, LayerNormMLP from .module import LayerNorm, Softmax -from .softmax import SoftmaxType -from .sharding import infer_major_sharding_type, infer_sharding_type -from .sharding import global_shard_resource, ShardingType +from ..softmax import SoftmaxType +from ..sharding import infer_major_sharding_type, infer_sharding_type +from ..sharding import global_shard_resource, ShardingType PRNGKey = Any Shape = Tuple[int, ...] From 550da28957304219a25deffcef007e88ec86ba10 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Sat, 29 Apr 2023 10:41:14 -0700 Subject: [PATCH 027/427] Correct cuDNN version requirement (#184) correct cuDNN version requirement Signed-off-by: Charlene Yang --- docs/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.rst b/docs/installation.rst index 9aded82d0f..89f9fd549d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -14,7 +14,7 @@ Prerequisites 1. Linux x86_64 2. `CUDA 11.8 `__ 3. |driver link|_ supporting CUDA 11.8 or later. -4. `cuDNN 8 `__ or later. +4. `cuDNN 8.1 `__ or later. 5. For FP8 fused attention, `CUDA 12.1 `__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9 `__ or later. 
From d3d419117f28af637968c7c1f175656eb72ec94d Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 2 May 2023 07:20:52 -0700 Subject: [PATCH 028/427] Use separate streams for pushsend/recv kernels in UB p2p exchanges (#188) * using different streams for pushsend and pushrecv Signed-off-by: Sangkug Lym * fix stream dependency Signed-off-by: Sangkug Lym * add wait from main_stream to memcpy stream Signed-off-by: Sangkug Lym --------- Signed-off-by: Sangkug Lym Co-authored-by: Kirthi Shankar Sivamani --- .../pytorch/csrc/comm_gemm_overlap.h | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h index 1e8b96f46b..5dd71e4758 100644 --- a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h +++ b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h @@ -332,9 +332,10 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { void *_ubuf_ptr; torch::Tensor _ubuf; std::vector _ubufs; - at::cuda::CUDAStream _stream_comm = at::cuda::getStreamFromPool(true); + at::cuda::CUDAStream _stream_send = at::cuda::getStreamFromPool(true); + at::cuda::CUDAStream _stream_recv = at::cuda::getStreamFromPool(true); std::vector _stream_compute; - cudaEvent_t _start_compute, _stop_compute, _start_comm, _stop_comm, _start_accum, _stop_accum; + cudaEvent_t _start_compute, _stop_compute, _stop_send, _stop_recv; UbufP2PCommOverlap(torch::Tensor sample, int rank, int tp_size, bool aggregate2, int num_max_streams) { @@ -385,10 +386,8 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { // CUDA event creation cudaEventCreateWithFlags(&_start_compute, 0); cudaEventCreateWithFlags(&_stop_compute, 0); - cudaEventCreateWithFlags(&_start_comm, 0); - cudaEventCreateWithFlags(&_stop_comm, 0); - cudaEventCreateWithFlags(&_start_accum, 0); - cudaEventCreateWithFlags(&_stop_accum, 0); + cudaEventCreateWithFlags(&_stop_send, 0); + cudaEventCreateWithFlags(&_stop_recv, 0);
} /* @@ -430,7 +429,8 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { assert(pre_gelu_out.numel() == 0); if (_aggregate2) { // Catch up the default torch stream - CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_recv, _start_compute, 0)); const int num_steps = _tp_size / 2; char *input_b_ptr = reinterpret_cast(_ubuf.data_ptr()); @@ -442,11 +442,12 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { int recv_offset = comm_bytes * recv_chunk_id; int peer_rank = (_tp_id % 2 == 0) ? _next_rank : _prev_rank; userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset, comm_bytes, _ub_comm, peer_rank, - (cudaStream_t)_stream_comm); + (cudaStream_t)_stream_send); userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, comm_bytes, _ub_comm, peer_rank, - (cudaStream_t)_stream_comm); - CHECK_CUDA(cudaEventRecord(_start_compute, (cudaStream_t)_stream_comm)); - CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _start_compute, 0)); + (cudaStream_t)_stream_recv); + CHECK_CUDA(cudaEventRecord(_stop_recv, (cudaStream_t)_stream_recv)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, _stop_recv, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _stop_recv, 0)); int local_rank_round2 = (_tp_id % 2 == 0) ? 
_tp_id : _tp_id - 1; const int next_rank = (_tp_size + _tp_id + 2) % _tp_size + _rank_round_tp; @@ -476,18 +477,21 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { if (i < num_steps - 1) { // P2P communication userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset, comm_bytes * 2, _ub_comm, - next_rank, (cudaStream_t)_stream_comm); + next_rank, (cudaStream_t)_stream_send); userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, comm_bytes * 2, _ub_comm, - prev_rank, (cudaStream_t)_stream_comm); - CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + prev_rank, (cudaStream_t)_stream_recv); + CHECK_CUDA(cudaEventRecord(_stop_recv, (cudaStream_t)_stream_recv)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, _stop_recv, 0)); CHECK_CUDA(cudaStreamWaitEvent( - (cudaStream_t)_stream_compute[(i + 1) % _stream_compute.size()], _stop_comm, 0)); + (cudaStream_t)_stream_compute[(i + 1) % _stream_compute.size()], _stop_recv, 0)); } else if (B_copy.numel() > 0) { assert(B_copy.numel() == _ubufs[_tp_id].numel()); assert(B_copy.element_size() == _ubufs[_tp_id].element_size()); CHECK_CUDA(cudaMemcpyAsync(B_copy.data_ptr(), _ubufs[_tp_id].data_ptr(), _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(), - cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_comm)); + cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_send)); + CHECK_CUDA(cudaEventRecord(_stop_send, (cudaStream_t)_stream_send)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_send, 0)); } } at::cuda::setCurrentCUDAStream(stream_main); @@ -497,7 +501,8 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[last_compute_stream_id])); } else { // Catch up the default torch stream - CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, _start_compute, 0)); + 
CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_recv, _start_compute, 0)); CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _start_compute, 0)); for (int i = 0; i < _tp_size; i++) { @@ -524,18 +529,21 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { if (i < _tp_size - 1) { // P2P communication userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset, comm_bytes, _ub_comm, - _next_rank, (cudaStream_t)_stream_comm); + _next_rank, (cudaStream_t)_stream_send); userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, comm_bytes, _ub_comm, - _prev_rank, (cudaStream_t)_stream_comm); - CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + _prev_rank, (cudaStream_t)_stream_recv); + CHECK_CUDA(cudaEventRecord(_stop_recv, (cudaStream_t)_stream_recv)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, _stop_recv, 0)); CHECK_CUDA(cudaStreamWaitEvent( - (cudaStream_t)_stream_compute[(i + 1) % _stream_compute.size()], _stop_comm, 0)); + (cudaStream_t)_stream_compute[(i + 1) % _stream_compute.size()], _stop_recv, 0)); } else if (B_copy.numel() > 0) { assert(B_copy.numel() == _ubufs[_tp_id].numel()); assert(B_copy.element_size() == _ubufs[_tp_id].element_size()); CHECK_CUDA(cudaMemcpyAsync(B_copy.data_ptr(), _ubufs[_tp_id].data_ptr(), _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(), - cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_comm)); + cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_send)); + CHECK_CUDA(cudaEventRecord(_stop_send, (cudaStream_t)_stream_send)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_send, 0)); } } at::cuda::setCurrentCUDAStream(stream_main); @@ -544,7 +552,6 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[last_compute_stream_id])); } CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_compute, 0)); - CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _stop_compute, 0)); 
return D; } // split_overlap_ag From 8e5f00f203ee518961bfb8febb017a2ffcc1d6b3 Mon Sep 17 00:00:00 2001 From: Shriya Palsamudram <69161273+ShriyaPalsamudram@users.noreply.github.com> Date: Wed, 10 May 2023 13:22:10 -0400 Subject: [PATCH 029/427] Shriya/tp overlap patch (#205) userbuffer pushsend/recv fix with atomicAdd_system Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym --- transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu index 9144e9e739..2c8e9dc61d 100644 --- a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu @@ -1551,7 +1551,7 @@ __global__ void __launch_bounds__(MAX_THREADS) __threadfence_system(); atomicAdd(flagptr, 1); // otherwise need local SM sync before sending flag } else { // 0 bytes and 1 SM only - atomicAdd(flagptr, 1); + atomicAdd_system(flagptr, 1); } } @@ -1561,7 +1561,7 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *f volatile int *flag = (volatile int *)flagptr; if (*flag >= signal_id) return; clock_t s = clock64(); - while (*flag < signal_id) { + while (atomicAdd_system(flagptr, 0) < signal_id) { if (clock64() - s > TIMEOUT) { printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, *flag); return; From f92c430e56c7f74de389a2a55f79d186b06ceeb5 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Mon, 22 May 2023 13:55:33 -0700 Subject: [PATCH 030/427] Relax checks for attn_mask_type in FlashAttention (#226) * relax attn mask type checks for FlashAttention Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * disable flash attn if mask tensor is not None Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix the logic for 
flash attn Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * minor fix for lint Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- transformer_engine/pytorch/attention.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 41b4d5fcd4..29e6412b02 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -281,9 +281,6 @@ def __init__( assert ( _flash_attn_version >= _flash_attn_version_required ), f"FlashAttention minimum version {_flash_attn_version_required} is required." - assert ( - attn_mask_type == "causal" - ), 'FlashAttention currently only supports causal attention mask.' self.attn_causal_mask = attn_mask_type == "causal" self.norm_factor = norm_factor @@ -296,7 +293,6 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """flash-attn fprop""" @@ -308,9 +304,6 @@ def forward( assert ( query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda ), 'FlashAttention currently only supports CUDA tensors.' - assert ( - attention_mask is None - ), 'FlashAttention currently does not support external attention mask.' 
# For now just 128, will make it more general in the future @@ -428,7 +421,6 @@ def __init__( self.device_compute_capability = get_device_compute_capability() self.use_flash_attention = ( int(os.getenv("NVTE_FLASH_ATTN", "1")) - and attn_mask_type == "causal" and self.device_compute_capability >= 8.0 ) @@ -437,6 +429,7 @@ def __init__( "attention_dropout_ctx": attention_dropout_ctx, "attn_mask_type": attn_mask_type, } + self.attn_mask_type = attn_mask_type if self.use_flash_attention: self.flash_attention = FlashAttention(norm_factor, **attn_kwargs) @@ -514,6 +507,9 @@ def forward( ): use_flash_attention = False + if self.attn_mask_type == "padding" and attention_mask is not None: + use_flash_attention = False + if is_in_onnx_export_mode(): use_flash_attention = False From 06cacd205e317d9ce804a87b686ada89e967912d Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Tue, 23 May 2023 13:14:32 +0800 Subject: [PATCH 031/427] Jax bug fixes for the dot product attention (#236) * Unfused scale+softmax if bias is present Signed-off-by: Reese Wang * WAR a causal masking + no_bias bug and add the unittests Signed-off-by: Reese Wang * Fix the optional args (bias) sharding Signed-off-by: Reese Wang * Disable fused attn in JAX by default, enable it with NVTE_USE_FUSED_ATTN Signed-off-by: Reese Wang * Add thread local for the plan cache Signed-off-by: Reese Wang * Rename dbeta to dbias for the readability Signed-off-by: Reese Wang * Add scaled softmax with dropout test cases Signed-off-by: Reese Wang * Updated NVTE_FUSED_ATTN variable name Signed-off-by: Reese Wang --------- Signed-off-by: Reese Wang --- tests/jax/test_fused_attn.py | 67 ++++++++++++------- tests/jax/test_layer.py | 6 ++ .../fused_attn_fp16_bf16_max_seqlen_512.cu | 8 +-- .../common/fused_attn/fused_attn_fp8.cu | 4 +- transformer_engine/jax/flax/transformer.py | 17 ++++- transformer_engine/jax/sharding.py | 4 +- 6 files changed, 71 insertions(+), 35 deletions(-) diff --git a/tests/jax/test_fused_attn.py 
b/tests/jax/test_fused_attn.py index fb333275bb..2504960705 100644 --- a/tests/jax/test_fused_attn.py +++ b/tests/jax/test_fused_attn.py @@ -113,7 +113,7 @@ def customcall_cross_fused_attn(q, kv, q_token, kv_token, dropout_rng, **kwargs) reason="Fused attention kernel is not supported.") class TestSelfFusedAttnMax512(): - def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): + def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): key = jax.random.PRNGKey(0) subkeys = jax.random.split(key, 2) @@ -125,7 +125,8 @@ def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): min_val, max_val = -1, 1 self.qkv = jax.random.uniform(subkeys[0], qkv_shape, dtype, min_val, max_val) - self.bias = jax.random.uniform(subkeys[1], bias_shape, dtype, min_val, max_val) + self.bias = jax.random.uniform(subkeys[1], bias_shape, dtype, min_val, + max_val) if with_bias else None self.q_token = jnp.concatenate((jnp.ones((b, self.valid_len)), jnp.zeros((b, pad_len))), axis=-1) @@ -133,8 +134,8 @@ def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): self.scaling_factor = 1. / math.sqrt(d) self.dropout_probability = 0. 
- self.dropout_rng = jax.random.PRNGKey(0) - self.attn_bias_type = AttnBiasType.POST_SCALE_BIAS + self.dropout_rng = jax.random.PRNGKey(0) if self.dropout_probability > 0 else None + self.attn_bias_type = AttnBiasType.NO_BIAS if self.bias is None else AttnBiasType.POST_SCALE_BIAS # deterministic = not is_training self.deterministic = False @@ -143,9 +144,17 @@ def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): @pytest.mark.parametrize('attn_mask_type', [AttnMaskType.PADDING_MASK, AttnMaskType.CAUSAL_MASK]) @pytest.mark.parametrize('pad_ratio', PAD_RATIO) - def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): + @pytest.mark.parametrize('with_bias', [True, False]) + def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): - self.set_input(b, s, h, d, dtype=dtype, attn_mask_type=attn_mask_type, pad_ratio=pad_ratio) + self.set_input(b, + s, + h, + d, + dtype=dtype, + attn_mask_type=attn_mask_type, + pad_ratio=pad_ratio, + with_bias=with_bias) primitive_out = customcall_self_fused_attn(self.qkv, self.bias, @@ -183,8 +192,16 @@ def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): [AttnMaskType.PADDING_MASK, AttnMaskType.CAUSAL_MASK]) @pytest.mark.parametrize('dtype', DTYPES) @pytest.mark.parametrize('pad_ratio', PAD_RATIO) - def test_forward_backward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio): - self.set_input(b, s, h, d, dtype=dtype, attn_mask_type=attn_mask_type, pad_ratio=pad_ratio) + @pytest.mark.parametrize('with_bias', [True, False]) + def test_forward_backward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): + self.set_input(b, + s, + h, + d, + dtype=dtype, + attn_mask_type=attn_mask_type, + pad_ratio=pad_ratio, + with_bias=with_bias) def grad_func(fused_attn_max_512_func, *args, **kwargs): # Gradient is small, use a gradient multiplier to amplify the graident @@ -221,11 +238,11 @@ def grad_func(fused_attn_max_512_func, *args, **kwargs): (0, 1))) primitive_out, 
(primitive_dqkv, - primitive_dbeta) = jitted_primitive(self.qkv, self.bias, self.q_token, + primitive_dbias) = jitted_primitive(self.qkv, self.bias, self.q_token, self.kv_token, self.dropout_rng) reference_out, (reference_dqkv, - reference_dbeta) = jitted_reference(self.qkv, self.bias, self.q_token, + reference_dbias) = jitted_reference(self.qkv, self.bias, self.q_token, self.kv_token, self.dropout_rng) np.testing.assert_allclose(jnp.asarray(primitive_out, np.float32), @@ -261,20 +278,22 @@ def grad_func(fused_attn_max_512_func, *args, **kwargs): # Padded part should be 0s assert jnp.allclose(invalid_primitive_dqkv, jnp.zeros_like(invalid_primitive_dqkv)) - # dbeta valid part - np.testing.assert_allclose( - jnp.asarray(primitive_dbeta[:, :, :self.valid_len, :self.valid_len], np.float32), - jnp.asarray(reference_dbeta[:, :, :self.valid_len, :self.valid_len], np.float32), - rtol=1e-4, - atol=3e-5) - - # dbeta padded part - np.testing.assert_allclose( - jnp.asarray(primitive_dbeta[:, :, self.valid_len:, self.valid_len:], np.float32), - jnp.asarray(reference_dbeta[:, :, self.valid_len:, self.valid_len:], np.float32)) - - assert jnp.allclose(primitive_dbeta[:, :, self.valid_len:, self.valid_len:], - jnp.zeros_like(primitive_dbeta[:, :, self.valid_len:, self.valid_len:])) + if self.attn_bias_type != AttnBiasType.NO_BIAS: + # dbias valid part + np.testing.assert_allclose( + jnp.asarray(primitive_dbias[:, :, :self.valid_len, :self.valid_len], np.float32), + jnp.asarray(reference_dbias[:, :, :self.valid_len, :self.valid_len], np.float32), + rtol=1e-4, + atol=3e-5) + + # dbias padded part + np.testing.assert_allclose( + jnp.asarray(primitive_dbias[:, :, self.valid_len:, self.valid_len:], np.float32), + jnp.asarray(reference_dbias[:, :, self.valid_len:, self.valid_len:], np.float32)) + + assert jnp.allclose( + primitive_dbias[:, :, self.valid_len:, self.valid_len:], + jnp.zeros_like(primitive_dbias[:, :, self.valid_len:, self.valid_len:])) @pytest.mark.skipif(not 
is_fused_attn_kernel_available(), diff --git a/tests/jax/test_layer.py b/tests/jax/test_layer.py index 9cce15aa70..30143e5f75 100644 --- a/tests/jax/test_layer.py +++ b/tests/jax/test_layer.py @@ -102,6 +102,12 @@ def compare_frozen_dict(ref_fd, test_fd, rtol=1e-05, atol=1e-08): _KEY_OF_DROPOUT_RATE: 0.0, _KEY_OF_MLP_ACTIVATIONS: (('gelu', 'linear')), _KEY_OF_FUSE_MLP_WI: True +}, { + _KEY_OF_SCALE_ATTN_LOGITS: True, + _KEY_OF_LAYERNORM_TYPE: 'rmsnorm', + _KEY_OF_DROPOUT_RATE: 0.8, + _KEY_OF_MLP_ACTIVATIONS: (('gelu', 'linear')), + _KEY_OF_FUSE_MLP_WI: True }, { _KEY_OF_TRANSPOSE_BS: False, _KEY_OF_SCALE_ATTN_LOGITS: True, diff --git a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu b/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu index c01018137b..53f4f72636 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu @@ -327,7 +327,6 @@ static cudnn_frontend::Tensor createSoftmaxForward( // NOLINTNEXTLINE(runtime/references) std::vector &ops, cudnn_frontend::Tensor const &prevBlockOutputTensor) { - int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; @@ -645,7 +644,7 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv mask_type, tensorType}; using CacheType = std::map; - static CacheType fmha_fprop_cache; + static thread_local CacheType fmha_fprop_cache; bool enable_dropout = (dropout_probability != 0.0f); @@ -668,7 +667,8 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv createScale(b, h, s_q, s_kv, d, layout, tensorType, ops); // if bias, we need to memset the S buffer to correctly computate dbias - auto zero_s = (bias_type != NVTE_Bias_Type::NVTE_NO_BIAS); + auto zero_s = (bias_type != NVTE_Bias_Type::NVTE_NO_BIAS) || + (mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK); 
auto bmm1_output = createBMM1(b, h, s_q, s_kv, d, layout, tensorType, zero_s, ops); NVTE_CHECK(bias_type != NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS, @@ -814,7 +814,7 @@ void fused_attn_max_512_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv layout, bias_type, mask_type, tensorType}; using CacheType = std::map; - static CacheType fmha_bprop_cache; + static thread_local CacheType fmha_bprop_cache; auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { auto it = cache.find(descriptor); diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index be483b8af5..768ac8eb20 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -1016,7 +1016,7 @@ void fa_fwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, NVTE_Bias_Type::NVTE_NO_BIAS, NVTE_Mask_Type::NVTE_PADDING_MASK, tensorType}; using CacheType = std::map; - static CacheType fa_fprop_cache; + static thread_local CacheType fa_fprop_cache; // Get plan from cache if cache is available, otherwise create one auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { @@ -1332,7 +1332,7 @@ void fa_bwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, NVTE_Bias_Type::NVTE_NO_BIAS, NVTE_Mask_Type::NVTE_PADDING_MASK, tensorType}; using CacheType = std::map; - static CacheType fa_bprop_cache; + static thread_local CacheType fa_bprop_cache; // Get plan from cache if cache is available, otherwise create one auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index a6b9f92b6f..3b4a61f3aa 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -7,6 +7,7 @@ import functools from enum import Enum from math import sqrt +import os from typing import Any, 
Callable, Optional, Sequence, Tuple, Union import warnings @@ -165,8 +166,17 @@ def core_attention(query: Array, else: attn_weights = jnp.einsum('bqhd,bkhd->bhqk', query, key) + # When a bias is present, the computation is performed as Softmax(attn_weights * scale + bias). + # In this case, the scale can not fused into the Softmax module. + if bias is not None: + attn_weights = attn_weights * scale_factor + fused_scale_factor = 1. + else: + # If no bias, the scale can be fused into Softmax module + fused_scale_factor = scale_factor + attn_weights = Softmax(softmax_type=softmax_type, - scale_factor=scale_factor, + scale_factor=fused_scale_factor, sharding_type=softmax_sharding_type)(attn_weights, mask, bias) if not deterministic and dropout_rate > 0.: @@ -360,12 +370,13 @@ def kv_init(key, shape, dtype): q_seqlen = inputs_q.shape[0] if self.transpose_batch_sequence else inputs_q.shape[1] kv_seqlen = inputs_kv.shape[0] if self.transpose_batch_sequence else inputs_kv.shape[1] fused_attn_supported_seqlen = [128, 256, 384, 512] + enable_fused_attn = int(os.getenv("NVTE_FUSED_ATTN", "0")) use_fused_attn = not decode and not self.transpose_batch_sequence and self.fuse_qkv and \ self.dropout_rate == 0 and canonicalize_dtype in [jnp.bfloat16, jnp.float16] and \ q_seqlen in fused_attn_supported_seqlen and kv_seqlen in fused_attn_supported_seqlen \ - and is_fused_attn_kernel_available() + and is_fused_attn_kernel_available() and enable_fused_attn - if not use_fused_attn: + if enable_fused_attn and not use_fused_attn: reason = "" if decode: reason += f"decode=False is required but got {decode}, " diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py index 939072cfd4..f93a3c0983 100644 --- a/transformer_engine/jax/sharding.py +++ b/transformer_engine/jax/sharding.py @@ -386,7 +386,7 @@ def _get_dptp_sharding_meta(input_shapes: Tuple[Tuple[int, ...]], for input_shape, dp_dim, tp_dim in zip(input_shapes, input_dp_dims, input_tp_dims): in_axis = {} - 
if dp_dim is not None: + if dp_dim is not None and input_shape is not None: in_axis[dp_dim] = dp_axis_name assert input_shape[dp_dim] % dp_size == 0, \ f"The dimension of batch in input_shape should be a multiple of " \ @@ -398,7 +398,7 @@ def _get_dptp_sharding_meta(input_shapes: Tuple[Tuple[int, ...]], if tp_dim is not None and tp_dim >= dp_dim: tp_dim = tp_dim + 1 - if tp_dim is not None: + if tp_dim is not None and input_shape is not None: in_axis[tp_dim] = tp_axis_name assert input_shape[tp_dim] % tp_size == 0, \ f"The dimension of tensor parallel in input_shape should be a multiple of " \ From 84a4a7504221e671efdf9d582d994250c3cdf465 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Tue, 20 Jun 2023 23:44:44 +0800 Subject: [PATCH 032/427] [JAX] Add self_attn_mask_type and replace attn_type (#273) * Add self_attn_mask_type and replace attn_type Signed-off-by: Reese Wang * Refine the keyword style for the better readability Signed-off-by: Reese Wang * Replace attn_type with attn_mask_type in praxis transformer Signed-off-by: Reese Wang * Fix typos Signed-off-by: Reese Wang --------- Signed-off-by: Reese Wang Co-authored-by: Kirthi Shankar Sivamani --- tests/jax/test_praxis_layers.py | 13 ++- transformer_engine/jax/flax/transformer.py | 105 +++++++++++++++---- transformer_engine/jax/praxis/transformer.py | 12 ++- 3 files changed, 96 insertions(+), 34 deletions(-) diff --git a/tests/jax/test_praxis_layers.py b/tests/jax/test_praxis_layers.py index 3adec948bd..de44b3a163 100644 --- a/tests/jax/test_praxis_layers.py +++ b/tests/jax/test_praxis_layers.py @@ -20,7 +20,6 @@ from transformer_engine.jax.flax import RelativePositionBiases as flax_RelativePositionBiases from transformer_engine.jax.flax import TransformerLayer as flax_TransformerLayer from transformer_engine.jax.flax.module import Softmax -from transformer_engine.jax.flax.transformer import AttentionType from transformer_engine.jax.fp8 import FP8Helper, is_fp8_available from transformer_engine.jax.praxis 
import LayerNorm from transformer_engine.jax.praxis import FusedSoftmax, LayerNorm @@ -666,32 +665,32 @@ class MultiHeadAttnAttr: USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: False, - ATTN_TYPE: AttentionType.PADDING + ATTN_TYPE: 'padding' }, { USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: True, - ATTN_TYPE: AttentionType.PADDING + ATTN_TYPE: 'padding' }, { USE_BIAS: True, LN_TYPE: 'rmsnorm', ZERO_CEN: False, - ATTN_TYPE: AttentionType.PADDING + ATTN_TYPE: 'padding' }, { USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: False, - ATTN_TYPE: AttentionType.CAUSAL + ATTN_TYPE: 'causal' }, { USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: True, - ATTN_TYPE: AttentionType.CAUSAL + ATTN_TYPE: 'causal' }, { USE_BIAS: True, LN_TYPE: 'rmsnorm', ZERO_CEN: False, - ATTN_TYPE: AttentionType.CAUSAL + ATTN_TYPE: 'causal' }] diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index c8a949c90e..563b15d526 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -197,17 +197,16 @@ def core_attention(query: Array, dynamic_vector_slice_in_dim = vmap(lax.dynamic_slice_in_dim, in_axes=(None, 0, None, None)) -class AttentionType(Enum): - """TransformerLayerType.""" - PADDING = AttnMaskType.PADDING_MASK - CAUSAL = AttnMaskType.CAUSAL_MASK - - class MultiHeadAttention(nn.Module): r""" Multi-head Attention (MHA), including Query, Key, Value and Output projection. + .. warning:: + + Argument :attr:`attn_type` is deprecated and superseded by :attr:`attn_mask_type`. + :attr:`attn_type` is ignored in version 0.10 and will be fully removed in version 0.11. + Parameters ---------- head_dim : int @@ -245,8 +244,11 @@ class MultiHeadAttention(nn.Module): Indicate if apply residual connection with the output of layer normalization. output_layernorm : bool, default = False Indicate if apply a layer normalization at the end of MHA. 
- attn_type: AttentionType, defult = AttentionType.PADDING - Indicate the format of the attention mask in the core attention. + attn_type: Any, defult = None + *Deprecated*, will be ignored in v0.10 and be fully removed in v0.11. + Please use `attn_mask_type` to config the attention mask. + attn_mask_type: {'causal', 'padding'}, default = 'causal' + Type of attention mask passed into softmax operation. Optimization parameters ----------------------- @@ -282,7 +284,9 @@ class MultiHeadAttention(nn.Module): bias_init: Initializer = nn.initializers.zeros apply_residual_connection_post_layernorm: bool = False output_layernorm: bool = False - attn_type: AttentionType = AttentionType.PADDING + # TODO(rewang): remove attn_type and the related doc after v0.11 + attn_type: Any = None + attn_mask_type: str = 'causal' dtype: DType = jnp.float32 fuse_qkv: bool = True transpose_batch_sequence: bool = True @@ -293,6 +297,14 @@ class MultiHeadAttention(nn.Module): def __post_init__(self): if self.kernel_init is None: self.kernel_init = nn.initializers.variance_scaling(1.0, 'fan_in', 'normal') + # TODO(rewang): remove attn_type after v0.11 + if self.attn_type is not None: + warnings.warn( + "The 'attn_type' argument in the 'MultiHeadAttention' is" + " deprecated in version 0.10 and will be removed in version 0.11." 
+ " Passing value in attn_type will be ignored, please use `attn_mask_type`" + " to config the attention mask type.", + category=DeprecationWarning) super().__post_init__() @nn.compact @@ -570,9 +582,23 @@ def kv_init(key, shape, dtype): if use_fused_attn: assert mask is not None and mask.ndim == 4 # (b, 1, s_q, s_kv) assert not self.transpose_batch_sequence + # TODO(rewang): make it configurable for pre_scale_bias attn_bias_type = AttnBiasType.NO_BIAS if bias is None else AttnBiasType.POST_SCALE_BIAS + def canonicalize_attn_mask_type(attn_mask_type): + """ + Convert the string to AttnMaskType + """ + if attn_mask_type == 'causal': + return AttnMaskType.CAUSAL_MASK + if attn_mask_type == 'padding': + return AttnMaskType.PADDING_MASK + raise ValueError(f"Unsupported {attn_mask_type=}, " + "supported attn_mask_type = {'causal', 'padding'}") + + attn_mask_type = canonicalize_attn_mask_type(self.attn_mask_type) + if inputs_q is inputs_kv: qkv_proj = qkv_proj.reshape((*qkv_proj.shape[:-1], self.num_heads, self.head_dim)) qkv_sharding_constraint = ('batch', 'length', 'qkv_dim', 'heads', 'kv') @@ -583,7 +609,7 @@ def kv_init(key, shape, dtype): mask, dropout_rng, attn_bias_type=attn_bias_type, - attn_mask_type=self.attn_type.value, + attn_mask_type=attn_mask_type, scaling_factor=scale_factor, dropout_probability=self.dropout_rate, is_training=not deterministic, @@ -602,18 +628,27 @@ def kv_init(key, shape, dtype): mask, dropout_rng, attn_bias_type=attn_bias_type, - attn_mask_type=self.attn_type.value, + attn_mask_type=attn_mask_type, scaling_factor=scale_factor, dropout_probability=self.dropout_rate, is_training=not deterministic, sharding_type=first_sharding_type) else: - softmax_type = SoftmaxType.SCALED - if self.attn_type is AttentionType.PADDING: - if mask is not None: - softmax_type = SoftmaxType.SCALED_MASKED - else: - softmax_type = SoftmaxType.SCALED_UPPER_TRIANG_MASKED + + def convert_to_softmax_type(attn_mask_type, mask): + """ + Convert the string to 
SoftmaxType + """ + if attn_mask_type == 'causal': + return SoftmaxType.SCALED_UPPER_TRIANG_MASKED + if attn_mask_type == 'padding': + if mask is not None: + return SoftmaxType.SCALED_MASKED + return SoftmaxType.SCALED + raise ValueError(f"Unsupported {attn_mask_type=}, " + "supported attn_mask_type = {'causal', 'padding'}") + + softmax_type = convert_to_softmax_type(self.attn_mask_type, mask) x = core_attention(query, key, @@ -765,6 +800,18 @@ class TransformerLayer(nn.Module): an attention block and a feedforward network (MLP). This standard layer is based on the paper “Attention Is All You Need”. + .. warning:: + + Argument :attr:`self_attn_mask_type` is introduced in version 0.10. + Starting from version 0.11, the default value will be `"causal"`. + However, to ensure compatibility with earlier versions, before 0.11, + the default value will be `"padding"` for the encoder and `"causal"` for the decoder. + + .. note:: + + Argument :attr:`attention_mask` will be ignored when + :attr:`self_attn_mask_type` is set to `"causal"`. + Parameters ---------- hidden_size: int, default = 512 @@ -825,6 +872,8 @@ class TransformerLayer(nn.Module): If set to TransformerLayerType.DECODER, an additional cross-attention block is added after self-attention.this can be used for structures like `T5` Transformer in conjunction with the TransformerLayerType.ENCODER option. + self_attn_mask_type: {'causal', 'padding'}, default = 'causal' + Type of attention mask passed into softmax operation. enable_relative_embedding: bool, default = True Whether to enable relative embedding as shifting of attention logits. 
relative_embedding: flax.linen.Module, default = None @@ -878,6 +927,7 @@ class TransformerLayer(nn.Module): output_layernorm: bool = False float32_attention_logits: bool = False layer_type: TransformerLayerType = TransformerLayerType.ENCODER + self_attn_mask_type: str = None # TODO(rewang): default to 'causal' after 0.11 enable_relative_embedding: bool = True relative_embedding: nn.Module = None dtype: DType = jnp.float32 @@ -893,6 +943,19 @@ def __post_init__(self): if self.mlp_kernel_init is None: self.mlp_kernel_init = nn.initializers.variance_scaling(1.0, 'fan_in', 'truncated_normal') + # TODO(rewang): default to 'causal' in 0.11 (also updated the doc after 0.11) + if self.self_attn_mask_type is None: + warnings.warn( + "The 'self_attn_mask_type' argument in the 'TransformerLayer' is" + " introduced in version 0.10. Starting from version 0.11, the default" + " value will be 'causal'. However, to ensure compatibility with earlier" + " versions, before 0.11, the default value will be 'padding' for the" + " encoder and 'causal' for the decoder.", + category=FutureWarning) + if self.layer_type == TransformerLayerType.ENCODER: + self.self_attn_mask_type = 'padding' + else: + self.self_attn_mask_type = 'causal' super().__post_init__() @nn.compact @@ -975,16 +1038,12 @@ def __call__(self, assert inputs.ndim == 3 - self_attn_type = None # Make name be the exactly same as T5X, since names would affect # RNGKey during init and apply. Myabe no need in the feature. 
if self.layer_type == TransformerLayerType.ENCODER: mha_name = 'attention' - self_attn_type = AttentionType.PADDING else: mha_name = 'self_attention' - self_attn_type = AttentionType.CAUSAL - assert self_attn_type is not None # [batch, length, emb_dim] -> [batch, length, emb_dim] x, residual = MultiHeadAttention( @@ -1002,7 +1061,7 @@ def __call__(self, zero_centered_gamma=self.zero_centered_gamma, apply_residual_connection_post_layernorm=self.apply_residual_connection_post_layernorm, output_layernorm=self.output_layernorm, - attn_type=self_attn_type, + attn_mask_type=self.self_attn_mask_type, fuse_qkv=self.fuse_qkv_params, kernel_init=self.mha_kernel_init, use_bias=self.use_bias, @@ -1049,7 +1108,7 @@ def hidden_dropout(x, deterministic): apply_residual_connection_post_layernorm=self. apply_residual_connection_post_layernorm, output_layernorm=False, # Must do LayerNorm before MHA. - attn_type=AttentionType.PADDING, + attn_mask_type='padding', float32_logits=self.float32_attention_logits, scale_attn_logits=self.scale_attn_logits, scaled_query_init=self.scaled_query_init, diff --git a/transformer_engine/jax/praxis/transformer.py b/transformer_engine/jax/praxis/transformer.py index 32facd04aa..1260c266b5 100644 --- a/transformer_engine/jax/praxis/transformer.py +++ b/transformer_engine/jax/praxis/transformer.py @@ -5,14 +5,14 @@ Praxis Modules related Transformer """ from functools import partial -from typing import Optional, Sequence, Tuple +from typing import Any, Optional, Sequence, Tuple from praxis import pax_fiddle from praxis.base_layer import WeightInit from praxis.pytypes import JTensor from .module import TransformerEngineBaseLayer -from ..flax.transformer import AttentionType, TransformerLayerType +from ..flax.transformer import TransformerLayerType from ..flax.transformer import MultiHeadAttention as flax_MultiHeadAttention from ..flax.transformer import RelativePositionBiases as flax_RelativePositionBiases from ..flax.transformer import TransformerLayer 
as flax_TransformerLayer @@ -73,7 +73,9 @@ class MultiHeadAttention(TransformerEngineBaseLayer): bias_init: WeightInit = WeightInit.Constant(0.0) apply_residual_connection_post_layernorm: bool = False output_layernorm: bool = False - attn_type: AttentionType = AttentionType.PADDING + # TODO(rewang): remove attn_type and the related doc after v0.11 + attn_type: Any = None + attn_mask_type: str = 'causal' fuse_qkv: bool = True transpose_batch_sequence: bool = True scale_attn_logits: bool = False @@ -99,7 +101,7 @@ def setup(self) -> None: bias_init=TransformerEngineBaseLayer.generate_params_init("bias", self.bias_init), apply_residual_connection_post_layernorm=self.apply_residual_connection_post_layernorm, output_layernorm=self.output_layernorm, - attn_type=self.attn_type, + attn_mask_type=self.attn_mask_type, fuse_qkv=self.fuse_qkv, transpose_batch_sequence=self.transpose_batch_sequence, scale_attn_logits=self.scale_attn_logits, @@ -145,6 +147,7 @@ class TransformerLayer(TransformerEngineBaseLayer): output_layernorm: bool = False float32_attention_logits: bool = False layer_type: TransformerLayerType = TransformerLayerType.ENCODER + self_attn_mask_type: str = None # TODO(rewang): default to 'causal' after 0.11 enable_relative_embedding: bool = True relative_embedding: pax_fiddle.Config[RelativePositionBiases] = pax_fiddle.template_field(None) drop_path: float = 0.0 @@ -201,6 +204,7 @@ def setup(self) -> None: output_layernorm=self.output_layernorm, float32_attention_logits=self.float32_attention_logits, layer_type=self.layer_type, + self_attn_mask_type=self.self_attn_mask_type, enable_relative_embedding=self.enable_relative_embedding, relative_embedding=relative_embedding_flax_module, drop_path=self.drop_path, From 4244ba91390a41a849a4188cc2c9a434609045dc Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Wed, 21 Jun 2023 02:45:27 +0800 Subject: [PATCH 033/427] Support dropout for the fused attention when max seqlen <= 512 (#227) * Enable fused attention dropout 
Signed-off-by: Reese Wang * Cast the uint32 key/counter to int64 Signed-off-by: Reese Wang * Update dropout support in fused attention docs Signed-off-by: Reese Wang * Revise devPtrCuSeqlen* to align the naming Signed-off-by: Reese Wang * Support different Jax PRNG impls Signed-off-by: Reese Wang * Revert CastAsync since it is not used Signed-off-by: Reese Wang * Implement is_training for 16-bit fused attn Signed-off-by: Reese Wang * Add fused attn with dropout sanity unit tests Signed-off-by: Reese Wang * Enhance the comments readability and rng_state checker Signed-off-by: Reese Wang * Change the attention dropout shape to align other frameworks Signed-off-by: Reese Wang * Make encoder tests deterministic Signed-off-by: Reese Wang * Change the default seed for the jax encoder tests Signed-off-by: Reese Wang * Maintain offset in TE Signed-off-by: Reese Wang * Enhance the resource safety Signed-off-by: Reese Wang * Revert rng_state type to allow only i64 Signed-off-by: Reese Wang * Handle the corner case for elts_per_threads calculation Signed-off-by: Reese Wang * Populate rng state by kernels Signed-off-by: Reese Wang * Rename rng_state as seed in cpp_extensions Signed-off-by: Reese Wang * Update the attention dropout comment Signed-off-by: Reese Wang --------- Signed-off-by: Reese Wang Co-authored-by: Kirthi Shankar Sivamani --- .../encoder/test_model_parallel_encoder.py | 2 +- examples/jax/encoder/test_multigpu_encoder.py | 2 +- .../encoder/test_multiprocessing_encoder.py | 2 +- .../jax/encoder/test_single_gpu_encoder.py | 2 +- qa/L0_jax_unittest/test.sh | 7 +- tests/jax/test_fused_attn.py | 138 +++++++++++---- tests/jax/utils.py | 2 - .../fused_attn_fp16_bf16_max_seqlen_512.cu | 161 ++++++++++-------- transformer_engine/common/fused_attn/utils.cu | 4 + .../include/transformer_engine/fused_attn.h | 8 +- transformer_engine/jax/CMakeLists.txt | 2 +- transformer_engine/jax/cpp_extensions.py | 81 ++++++--- transformer_engine/jax/csrc/modules.cpp | 48 ++++-- 
.../jax/csrc/{utils.cpp => utils.cu} | 18 ++ transformer_engine/jax/csrc/utils.h | 24 +++ transformer_engine/jax/flax/transformer.py | 19 ++- transformer_engine/jax/fused_attn.py | 39 +++-- 17 files changed, 375 insertions(+), 184 deletions(-) rename transformer_engine/jax/csrc/{utils.cpp => utils.cu} (52%) diff --git a/examples/jax/encoder/test_model_parallel_encoder.py b/examples/jax/encoder/test_model_parallel_encoder.py index 0a2af0623e..4a26244fff 100644 --- a/examples/jax/encoder/test_model_parallel_encoder.py +++ b/examples/jax/encoder/test_model_parallel_encoder.py @@ -377,7 +377,7 @@ def encoder_parser(args): default=False, help="quickly check a single pass", ) - parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--seed", type=int, default=0, metavar="S", help="random seed (default: 0)") parser.add_argument("--use-fp8", action="store_true", default=False, diff --git a/examples/jax/encoder/test_multigpu_encoder.py b/examples/jax/encoder/test_multigpu_encoder.py index 48f858af58..ef3837c8d4 100644 --- a/examples/jax/encoder/test_multigpu_encoder.py +++ b/examples/jax/encoder/test_multigpu_encoder.py @@ -359,7 +359,7 @@ def encoder_parser(args): default=False, help="quickly check a single pass", ) - parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--seed", type=int, default=0, metavar="S", help="random seed (default: 0)") parser.add_argument("--use-fp8", action="store_true", default=False, diff --git a/examples/jax/encoder/test_multiprocessing_encoder.py b/examples/jax/encoder/test_multiprocessing_encoder.py index 61e5bda9df..a21346458c 100644 --- a/examples/jax/encoder/test_multiprocessing_encoder.py +++ b/examples/jax/encoder/test_multiprocessing_encoder.py @@ -459,7 +459,7 @@ def encoder_parser(args): default=False, help="quickly check a single pass", ) - parser.add_argument("--seed", type=int, default=1, metavar="S", 
help="random seed (default: 1)") + parser.add_argument("--seed", type=int, default=0, metavar="S", help="random seed (default: 0)") parser.add_argument("--use-fp8", action="store_true", default=False, diff --git a/examples/jax/encoder/test_single_gpu_encoder.py b/examples/jax/encoder/test_single_gpu_encoder.py index 3db264daf7..62798eed82 100644 --- a/examples/jax/encoder/test_single_gpu_encoder.py +++ b/examples/jax/encoder/test_single_gpu_encoder.py @@ -294,7 +294,7 @@ def encoder_parser(args): default=False, help="quickly check a single pass", ) - parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument("--seed", type=int, default=0, metavar="S", help="random seed (default: 0)") parser.add_argument("--use-fp8", action="store_true", default=False, diff --git a/qa/L0_jax_unittest/test.sh b/qa/L0_jax_unittest/test.sh index 62242ba075..72d2817456 100644 --- a/qa/L0_jax_unittest/test.sh +++ b/qa/L0_jax_unittest/test.sh @@ -9,5 +9,10 @@ pytest -Wignore -v $TE_PATH/tests/jax pip install -r $TE_PATH/examples/jax/mnist/requirements.txt pip install -r $TE_PATH/examples/jax/encoder/requirements.txt -pytest -Wignore -v $TE_PATH/examples/jax --ignore=$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py + +pytest -Wignore -v $TE_PATH/examples/jax/mnist + +# Make encoder tests to have run-to-run deterministic to have the stable CI results +export XLA_FLAGS="--xla_gpu_deterministic_ops" +pytest -Wignore -v $TE_PATH/examples/jax/encoder --ignore=$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py pytest -Wignore -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py index 2504960705..8e4d59a9e2 100644 --- a/tests/jax/test_fused_attn.py +++ b/tests/jax/test_fused_attn.py @@ -54,6 +54,7 @@ def jax_self_fused_attn(qkv, bias, q_token, kv_token, dropout_rng, **kwargs): value, bias=bias, mask=mask, + deterministic=not 
kwargs['is_training'], dropout_rate=kwargs['dropout_probability'], dropout_rng=dropout_rng, dtype=qkv.dtype) @@ -78,6 +79,7 @@ def jax_cross_fused_attn(q, kv, q_token, kv_token, dropout_rng, **kwargs): value, bias=None, mask=mask, + deterministic=not kwargs['is_training'], dropout_rate=kwargs['dropout_probability'], dropout_rng=dropout_rng, dtype=q.dtype) @@ -113,7 +115,8 @@ def customcall_cross_fused_attn(q, kv, q_token, kv_token, dropout_rng, **kwargs) reason="Fused attention kernel is not supported.") class TestSelfFusedAttnMax512(): - def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): + def set_input(self, b, s, h, d, *, attn_bias_type, attn_mask_type, dropout_probability, dtype, + is_training, pad_ratio): key = jax.random.PRNGKey(0) subkeys = jax.random.split(key, 2) @@ -125,6 +128,8 @@ def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): min_val, max_val = -1, 1 self.qkv = jax.random.uniform(subkeys[0], qkv_shape, dtype, min_val, max_val) + + with_bias = attn_bias_type != AttnBiasType.NO_BIAS self.bias = jax.random.uniform(subkeys[1], bias_shape, dtype, min_val, max_val) if with_bias else None @@ -133,28 +138,81 @@ def set_input(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): self.kv_token = self.q_token self.scaling_factor = 1. / math.sqrt(d) - self.dropout_probability = 0. 
+ self.dropout_probability = dropout_probability self.dropout_rng = jax.random.PRNGKey(0) if self.dropout_probability > 0 else None - self.attn_bias_type = AttnBiasType.NO_BIAS if self.bias is None else AttnBiasType.POST_SCALE_BIAS - # deterministic = not is_training - self.deterministic = False + self.attn_bias_type = attn_bias_type + self.is_training = is_training @pytest.mark.parametrize('b, s, h, d', SELF_CASES) - @pytest.mark.parametrize('dtype', DTYPES) + @pytest.mark.parametrize('attn_bias_type', [AttnBiasType.NO_BIAS, AttnBiasType.POST_SCALE_BIAS]) @pytest.mark.parametrize('attn_mask_type', [AttnMaskType.PADDING_MASK, AttnMaskType.CAUSAL_MASK]) + @pytest.mark.parametrize('dropout_probability', [0., 0.1]) + @pytest.mark.parametrize('dtype', DTYPES) + @pytest.mark.parametrize('is_training', [True, False]) @pytest.mark.parametrize('pad_ratio', PAD_RATIO) - @pytest.mark.parametrize('with_bias', [True, False]) - def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): + def test_sanity(self, b, s, h, d, attn_bias_type, attn_mask_type, dropout_probability, dtype, + is_training, pad_ratio): + + def grad_func(func, *args, **kwargs): + # Keep only valid result for the gradient + # fused_attn_max_512 output has shape (b, s, h, d) + valid_ret, _ = jnp.split(func(*args, **kwargs), (self.valid_len,), axis=1) + return jnp.mean(valid_ret, dtype=jnp.float32).astype(dtype) self.set_input(b, s, h, d, + attn_bias_type=attn_bias_type, + attn_mask_type=attn_mask_type, + dropout_probability=dropout_probability, dtype=dtype, + is_training=is_training, + pad_ratio=pad_ratio) + + kwargs = { + 'attn_bias_type': self.attn_bias_type, + 'attn_mask_type': attn_mask_type, + 'scaling_factor': self.scaling_factor, + 'dropout_probability': self.dropout_probability, + 'is_training': self.is_training + } + + jitted_primitive = jit( + value_and_grad( + lambda qkv, bias, q_token, kv_token, dropout_rng: grad_func( + customcall_self_fused_attn, qkv, bias, q_token, 
kv_token, dropout_rng, **kwargs + ), (0, 1))) + + primitive_out, (primitive_dqkv, + primitive_dbias) = jitted_primitive(self.qkv, self.bias, self.q_token, + self.kv_token, self.dropout_rng) + + @pytest.mark.parametrize('b, s, h, d', SELF_CASES) + @pytest.mark.parametrize('attn_bias_type', [AttnBiasType.NO_BIAS, AttnBiasType.POST_SCALE_BIAS]) + @pytest.mark.parametrize('attn_mask_type', + [AttnMaskType.PADDING_MASK, AttnMaskType.CAUSAL_MASK]) + @pytest.mark.parametrize('dropout_probability', [0., 0.1]) + @pytest.mark.parametrize('dtype', DTYPES) + @pytest.mark.parametrize('is_training', [True, False]) + @pytest.mark.parametrize('pad_ratio', PAD_RATIO) + def test_forward(self, b, s, h, d, attn_bias_type, attn_mask_type, dropout_probability, dtype, + is_training, pad_ratio): + # dropout can't get the bitmatch result + if is_training and dropout_probability > 0.: + return + + self.set_input(b, + s, + h, + d, + attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, - pad_ratio=pad_ratio, - with_bias=with_bias) + dropout_probability=dropout_probability, + dtype=dtype, + is_training=is_training, + pad_ratio=pad_ratio) primitive_out = customcall_self_fused_attn(self.qkv, self.bias, @@ -165,7 +223,7 @@ def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): attn_mask_type=attn_mask_type, scaling_factor=self.scaling_factor, dropout_probability=self.dropout_probability, - is_training=not self.deterministic) + is_training=self.is_training) reference_out = jax_self_fused_attn(self.qkv, self.bias, @@ -174,7 +232,8 @@ def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): self.dropout_rng, attn_mask_type=attn_mask_type, scaling_factor=self.scaling_factor, - dropout_probability=self.dropout_probability) + dropout_probability=self.dropout_probability, + is_training=self.is_training) ref_valid, _ = jnp.split(reference_out, (self.valid_len,), axis=1) pri_valid, pri_invalid = jnp.split(primitive_out, (self.valid_len,), axis=1) 
@@ -188,20 +247,25 @@ def test_forward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): jnp.zeros_like(pri_invalid, jnp.float32)) @pytest.mark.parametrize('b, s, h, d', SELF_CASES) + @pytest.mark.parametrize('attn_bias_type', [AttnBiasType.NO_BIAS, AttnBiasType.POST_SCALE_BIAS]) @pytest.mark.parametrize('attn_mask_type', [AttnMaskType.PADDING_MASK, AttnMaskType.CAUSAL_MASK]) + @pytest.mark.parametrize('dropout_probability', [0.]) # dropout can't get the bitmatch result @pytest.mark.parametrize('dtype', DTYPES) + @pytest.mark.parametrize('is_training', [True]) # backward is only used when is_training @pytest.mark.parametrize('pad_ratio', PAD_RATIO) - @pytest.mark.parametrize('with_bias', [True, False]) - def test_forward_backward(self, b, s, h, d, dtype, attn_mask_type, pad_ratio, with_bias): + def test_forward_backward(self, b, s, h, d, attn_bias_type, attn_mask_type, dropout_probability, + dtype, is_training, pad_ratio): self.set_input(b, s, h, d, - dtype=dtype, + attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, - pad_ratio=pad_ratio, - with_bias=with_bias) + dropout_probability=dropout_probability, + dtype=dtype, + is_training=is_training, + pad_ratio=pad_ratio) def grad_func(fused_attn_max_512_func, *args, **kwargs): # Gradient is small, use a gradient multiplier to amplify the graident @@ -221,7 +285,7 @@ def grad_func(fused_attn_max_512_func, *args, **kwargs): 'attn_mask_type': attn_mask_type, 'scaling_factor': self.scaling_factor, 'dropout_probability': self.dropout_probability, - 'is_training': not self.deterministic + 'is_training': self.is_training } # Use FP16/BF16 to sum the results may cause overflow, use FP32 for the summation @@ -300,7 +364,8 @@ def grad_func(fused_attn_max_512_func, *args, **kwargs): reason="Fused attention kernel is not supported.") class TestCrossFusedAttnMax512(): - def set_input(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): + def set_input(self, b, s_q, s_kv, h, d, *, attn_mask_type, 
dropout_probability, dtype, + is_training, pad_ratio): key = jax.random.PRNGKey(0) subkeys = jax.random.split(key, 2) @@ -321,25 +386,32 @@ def set_input(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): (b, kv_pad_len))), axis=-1) self.scaling_factor = 1. / math.sqrt(d) - self.dropout_probability = 0. - self.dropout_rng = jax.random.PRNGKey(0) + self.dropout_probability = dropout_probability + self.dropout_rng = jax.random.PRNGKey(0) if self.dropout_probability > 0 else None self.attn_bias_type = AttnBiasType.NO_BIAS - # deterministic = not is_training - self.deterministic = False + self.is_training = is_training @pytest.mark.parametrize('b, s_q, s_kv, h, d', CROSS_CASES) @pytest.mark.parametrize('attn_mask_type', [AttnMaskType.PADDING_MASK]) + @pytest.mark.parametrize('dropout_probability', [0., 0.1]) @pytest.mark.parametrize('dtype', DTYPES) + @pytest.mark.parametrize('is_training', [True, False]) @pytest.mark.parametrize('pad_ratio', PAD_RATIO) - def test_forward(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): + def test_forward(self, b, s_q, s_kv, h, d, attn_mask_type, dropout_probability, dtype, + is_training, pad_ratio): + # dropout can't get the bitmatch result + if is_training and dropout_probability > 0.: + return self.set_input(b, s_q, s_kv, h, d, - dtype=dtype, attn_mask_type=attn_mask_type, + dropout_probability=dropout_probability, + dtype=dtype, + is_training=is_training, pad_ratio=pad_ratio) primitive_out = customcall_cross_fused_attn(self.q, @@ -351,7 +423,7 @@ def test_forward(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): attn_mask_type=attn_mask_type, scaling_factor=self.scaling_factor, dropout_probability=self.dropout_probability, - is_training=not self.deterministic) + is_training=self.is_training) reference_out = jax_cross_fused_attn(self.q, self.kv, @@ -360,7 +432,8 @@ def test_forward(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): self.dropout_rng, attn_mask_type=attn_mask_type, 
scaling_factor=self.scaling_factor, - dropout_probability=self.dropout_probability) + dropout_probability=self.dropout_probability, + is_training=self.is_training) ref_valid, _ = jnp.split(reference_out, (self.q_valid_len,), axis=1) pri_valid, pri_invalid = jnp.split(primitive_out, (self.q_valid_len,), axis=1) @@ -375,16 +448,21 @@ def test_forward(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): @pytest.mark.parametrize('b, s_q, s_kv, h, d', CROSS_CASES) @pytest.mark.parametrize('attn_mask_type', [AttnMaskType.PADDING_MASK]) + @pytest.mark.parametrize('dropout_probability', [0.]) # dropout can't get the bitmatch result @pytest.mark.parametrize('dtype', DTYPES) + @pytest.mark.parametrize('is_training', [True]) # backward is only used when is_training @pytest.mark.parametrize('pad_ratio', PAD_RATIO) - def test_forward_backward(self, b, s_q, s_kv, h, d, dtype, attn_mask_type, pad_ratio): + def test_forward_backward(self, b, s_q, s_kv, h, d, attn_mask_type, dropout_probability, dtype, + is_training, pad_ratio): self.set_input(b, s_q, s_kv, h, d, - dtype=dtype, attn_mask_type=attn_mask_type, + dropout_probability=dropout_probability, + dtype=dtype, + is_training=is_training, pad_ratio=pad_ratio) def grad_func(fused_attn_max_512_func, *args, **kwargs): @@ -405,7 +483,7 @@ def grad_func(fused_attn_max_512_func, *args, **kwargs): 'attn_mask_type': attn_mask_type, 'scaling_factor': self.scaling_factor, 'dropout_probability': self.dropout_probability, - 'is_training': not self.deterministic + 'is_training': self.is_training } # Use FP16/BF16 to sum the results may cause overflow, use FP32 for the summation diff --git a/tests/jax/utils.py b/tests/jax/utils.py index dc5ef2bb13..893a5afcbe 100644 --- a/tests/jax/utils.py +++ b/tests/jax/utils.py @@ -167,9 +167,7 @@ def dot_product_attention(query: Array, # T5 broadcasts along the "length" dim, but unclear which one that # corresponds to in positional dimensions here, assuming query dim. 
dropout_shape = list(attn_weights.shape) - dropout_shape[-2] = 1 keep = jax_random.bernoulli(dropout_rng, keep_prob, dropout_shape) - keep = jnp.broadcast_to(keep, attn_weights.shape) multiplier = (keep.astype(attn_weights.dtype) / jnp.asarray(keep_prob, dtype=dtype)) attn_weights = attn_weights * multiplier diff --git a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu b/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu index 53f4f72636..e8906b31c4 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu @@ -22,7 +22,7 @@ #define O_ID 4 #define S_ID 5 #define B_ID 6 -#define D_CONST_ID 7 +#define DROPOUT_CONST_ID 7 #define S_CONST_ID 8 #define Q_SEQLEN_ID 9 #define K_SEQLEN_ID 10 @@ -33,6 +33,8 @@ #define MASK_VAL_ID 15 #define dS_ID 16 #define dBias_ID 17 +#define DROPOUT_SEED_ID 18 +#define DROPOUT_OFFSET_ID 19 #define VIRTUAL_ID 20 @@ -333,8 +335,7 @@ static cudnn_frontend::Tensor createSoftmaxForward( int64_t afterReduction_dim[4] = {b, h, s_q, 1}; int64_t afterReduction_stride[4] = {h * s_q, s_q, 1, 1}; - cudnnDataType_t softmaxOutputType = - (enable_dropout || softmax_output_virtual) ? CUDNN_DATA_FLOAT : tensorType; + cudnnDataType_t softmaxOutputType = enable_dropout ? CUDNN_DATA_FLOAT : tensorType; uint64_t softmaxOutputName = softmax_output_virtual ? 
VIRTUAL_ID + 154 : S_ID; // max (x) @@ -427,7 +428,7 @@ static cudnn_frontend::Tensor createSoftmaxForward( } static cudnn_frontend::Tensor createDropout(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, - int64_t d, int64_t seed, double probability, + int64_t d, double probability, cudnnDataType_t tensorType, // NOLINTNEXTLINE(runtime/references) std::vector &ops, @@ -460,8 +461,9 @@ static cudnn_frontend::Tensor createDropout(int64_t b, int64_t h, int64_t s_q, i .setReorderType(reorder_type) .build(); // scale after dropout - auto scaleDropoutTensor = tensor_create(tensorType, D_CONST_ID, scale_dim, scale_stride, false, - true); // is by value + auto scaleDropoutTensor = + tensor_create(tensorType, DROPOUT_CONST_ID, scale_dim, scale_stride, false, + true); // is by value // after Scale auto afterScaleTensor = tensor_create(tensorType, VIRTUAL_ID + 201, afterBMM1_dim, afterBMM1_stride, true, false); // is virtual @@ -472,10 +474,16 @@ static cudnn_frontend::Tensor createDropout(int64_t b, int64_t h, int64_t s_q, i .setBernoulliDistProbability(1.0 - probability) .build(); + auto dropoutSeed = + tensor_create(CUDNN_DATA_INT64, DROPOUT_SEED_ID, scale_dim, scale_stride, false, false); + auto dropoutOffset = + tensor_create(CUDNN_DATA_INT64, DROPOUT_OFFSET_ID, scale_dim, scale_stride, false, false); + // Create a rng Node. 
auto rng_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) .setyDesc(dropoutMaskTensor) - .setSeed(seed) + .setSeedDesc(dropoutSeed) + .setOffsetDesc(dropoutOffset) .setRngDesc(rngDesc) .build(); @@ -624,16 +632,14 @@ static cudnn_frontend::Tensor createSoftmaxBackward(int64_t b, int64_t h, int64_ return dxTensor; } -void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, - bool is_training, float scaling_factor, float dropout_probability, - NVTE_QKV_Layout layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type mask_type, void *devPtrQ, void *devPtrK, - void *devPtrV, void *devPtrS, void *devPtrO, void *devPtrBias, - void *devCuSeqlenQ, void *devCuSeqlenK, void *workspace, - size_t *workspace_size, cudnnDataType_t tensorType, - cudaStream_t stream, cudnnHandle_t handle) { +void fused_attn_max_512_fwd_impl( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, bool is_training, + float scaling_factor, float dropout_probability, NVTE_QKV_Layout layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, void *devPtrQ, void *devPtrK, void *devPtrV, + void *devPtrS, void *devPtrO, void *devPtrBias, void *devPtrCuSeqlenQ, void *devPtrCuSeqlenKV, + void *devPtrDropoutSeed, void *devPtrDropoutOffset, void *workspace, size_t *workspace_size, + cudnnDataType_t tensorType, cudaStream_t stream, cudnnHandle_t handle) { try { - constexpr int64_t seed = 0; // TODO(rewang): replace this with device seed/offset NVTE_CHECK_CUDNN(cudnnSetStream(handle, stream)); FADescriptor descriptor{b, h, @@ -646,10 +652,13 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv using CacheType = std::map; static thread_local CacheType fmha_fprop_cache; - bool enable_dropout = (dropout_probability != 0.0f); + // softmax auxiliary is only used in the training mode + bool enable_dropout = is_training && (dropout_probability != 0.0f); - NVTE_CHECK(!enable_dropout, - "dropout probability > 0 in 
fused_attn_max_512 has not been implemented."); + // two conditions that make softmax auxiliary in virtual + // 1. inference mode (not is_training) + // 2. dropout enabled: the auxiliary becomes the dropout output + bool softmax_output_virtual = !is_training || enable_dropout; // Get plan from cache if cache is available, otherwise create one auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { @@ -667,8 +676,10 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv createScale(b, h, s_q, s_kv, d, layout, tensorType, ops); // if bias, we need to memset the S buffer to correctly computate dbias + // WAR: causal_mask without bias needs memset the S buffer + // inference mode doesn't need the S auxiliary auto zero_s = (bias_type != NVTE_Bias_Type::NVTE_NO_BIAS) || - (mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK); + (mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) && is_training; auto bmm1_output = createBMM1(b, h, s_q, s_kv, d, layout, tensorType, zero_s, ops); NVTE_CHECK(bias_type != NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS, @@ -683,14 +694,12 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv NVTE_CHECK(dropout_probability != 1.0f, "Dropout probability cannot be 1.0."); - // TODO(rewang): check whether devPtrS can be removed - bool softmax_output_virtual = enable_dropout; // || devPtrS == nullptr; auto softmax_output = createSoftmaxForward(b, h, s_q, s_kv, d, layout, enable_dropout, softmax_output_virtual, tensorType, ops, mask_output); - if (dropout_probability != 0.0f) { - auto dropout_output = createDropout(b, h, s_q, s_kv, d, seed, dropout_probability, + if (enable_dropout) { + auto dropout_output = createDropout(b, h, s_q, s_kv, d, dropout_probability, tensorType, ops, softmax_output); createBMM2(b, h, s_q, s_kv, d, layout, tensorType, ops, dropout_output); } else { @@ -741,9 +750,10 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv void 
*devActualSeqlenQ = static_cast(workspace) + plan_workspace_size; void *devActualSeqlenK = static_cast(devActualSeqlenQ) + b * sizeof(int32_t); cu_seqlens_to_actual_seqlens<<>>( - b, static_cast(devCuSeqlenQ), - static_cast(devCuSeqlenK), static_cast(devActualSeqlenQ), - static_cast(devActualSeqlenK)); + b, static_cast(devPtrCuSeqlenQ), + static_cast(devPtrCuSeqlenKV), + static_cast(devActualSeqlenQ), static_cast(devActualSeqlenK)); + NVTE_CHECK_CUDA(cudaGetLastError()); // change this if you have access to float_min float negInfinity = -1.0E+10; @@ -758,16 +768,17 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv data_ptrs.insert(std::pair(K_SEQLEN_ID, devActualSeqlenK)); data_ptrs.insert(std::pair(MASK_VAL_ID, &negInfinity)); + __half half_cast_scaling_factor{scaling_factor}; + __nv_bfloat16 bfloat_cast_scaling_factor{scaling_factor}; + if (tensorType == CUDNN_DATA_FLOAT) { data_ptrs.insert(std::pair(S_CONST_ID, &scaling_factor)); } else if (tensorType == CUDNN_DATA_HALF) { - __half cast_scaling_factor{scaling_factor}; - data_ptrs.insert(std::pair(S_CONST_ID, &cast_scaling_factor)); + data_ptrs.insert(std::pair(S_CONST_ID, &half_cast_scaling_factor)); } else if (tensorType == CUDNN_DATA_BFLOAT16) { - __nv_bfloat16 cast_scaling_factor{scaling_factor}; - data_ptrs.insert(std::pair(S_CONST_ID, &cast_scaling_factor)); + data_ptrs.insert(std::pair(S_CONST_ID, &bfloat_cast_scaling_factor)); } else { - std::cerr << "Not supported tensorType." 
<< std::endl; + NVTE_ERROR("Unsupported tensor type."); } data_ptrs.insert(std::pair(O_ID, devPtrO)); @@ -776,12 +787,30 @@ void fused_attn_max_512_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv data_ptrs.insert(std::pair(B_ID, devPtrBias)); } - if (devPtrS != nullptr) { + // if enable_dropout, S is the result after dropout + // if not enable dropout, S is the result after softmax + if (enable_dropout || !softmax_output_virtual) { data_ptrs.insert(std::pair(S_ID, devPtrS)); } + __half half_cast_scale_dropout{scale_dropout}; + __nv_bfloat16 bfloat16_cast_scale_dropout{scale_dropout}; + if (enable_dropout) { - data_ptrs.insert(std::pair(D_CONST_ID, &scale_dropout)); + // TODO(rewang): make a util func + if (tensorType == CUDNN_DATA_FLOAT) { + data_ptrs.insert(std::pair(DROPOUT_CONST_ID, &scale_dropout)); + } else if (tensorType == CUDNN_DATA_HALF) { + data_ptrs.insert( + std::pair(DROPOUT_CONST_ID, &half_cast_scale_dropout)); + } else if (tensorType == CUDNN_DATA_BFLOAT16) { + data_ptrs.insert( + std::pair(DROPOUT_CONST_ID, &bfloat16_cast_scale_dropout)); + } else { + NVTE_ERROR("Unsupported tensor type."); + } + data_ptrs.insert(std::pair(DROPOUT_SEED_ID, devPtrDropoutSeed)); + data_ptrs.insert(std::pair(DROPOUT_OFFSET_ID, devPtrDropoutOffset)); } auto variantPack = cudnn_frontend::VariantPackBuilder() @@ -802,7 +831,7 @@ void fused_attn_max_512_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv NVTE_Bias_Type bias_type, void *devPtrQ, void *devPtrK, void *devPtrV, void *devPtrS, void *devPtrdQ, void *devPtrdK, void *devPtrdV, void *devPtrdO, void *devPtrdS, void *devPtrdBias, - void *devCuSeqlenQ, void *devCuSeqlenK, void *workspace, + void *devPtrCuSeqlenQ, void *devPtrCuSeqlenKV, void *workspace, size_t *workspace_size, cudnnDataType_t tensorType, cudaStream_t stream, cudnnHandle_t handle) { try { @@ -915,7 +944,7 @@ void fused_attn_max_512_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv ops.push_back(std::move(reshape_op)); // scale 
dropout - auto dropoutScaleTensor = tensor_create(CUDNN_DATA_FLOAT, D_CONST_ID, scale_dim, + auto dropoutScaleTensor = tensor_create(CUDNN_DATA_FLOAT, DROPOUT_CONST_ID, scale_dim, scale_stride, false, true); // is by value auto pAfterScaleTensor = tensor_create(tensorType, VIRTUAL_ID + 301, p_transpose_dim, p_transpose_stride, true, false); @@ -1160,9 +1189,10 @@ void fused_attn_max_512_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv void *devActualSeqlenQ = static_cast(workspace) + plan_workspace_size; void *devActualSeqlenK = static_cast(devActualSeqlenQ) + b * sizeof(int32_t); cu_seqlens_to_actual_seqlens<<>>( - b, static_cast(devCuSeqlenQ), - static_cast(devCuSeqlenK), static_cast(devActualSeqlenQ), - static_cast(devActualSeqlenK)); + b, static_cast(devPtrCuSeqlenQ), + static_cast(devPtrCuSeqlenKV), + static_cast(devActualSeqlenQ), static_cast(devActualSeqlenK)); + NVTE_CHECK_CUDA(cudaGetLastError()); std::set> data_ptrs; // add all the data pointers to be used in the variant pack @@ -1183,13 +1213,10 @@ void fused_attn_max_512_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv data_ptrs.insert(std::pair(dBias_ID, devPtrdBias)); } - NVTE_CHECK(dropout_probability == 0.f, - "dropout probability > 0 in fused_attn_max_512 has not been implemented."); - float zeroVal = 0.0f; float dropoutScale = 1.0f / (1.0f - dropout_probability); - data_ptrs.insert(std::pair(D_CONST_ID, &dropoutScale)); + data_ptrs.insert(std::pair(DROPOUT_CONST_ID, &dropoutScale)); data_ptrs.insert(std::pair(S_CONST_ID, &scaling_factor)); data_ptrs.insert(std::pair(MASK_VAL_ID, &zeroVal)); @@ -1216,8 +1243,6 @@ void fused_attn_max_512_fwd_qkvpacked( Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - // Only is_training is verified - NVTE_CHECK(is_training, "is_training=False is not implemented in fused_attn_max_512."); NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, "qkv_layout must be 
NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED."); @@ -1246,23 +1271,22 @@ void fused_attn_max_512_fwd_qkvpacked( devPtrS = output_S->data.dptr; } - void *devCuSeqlen = cu_seqlens->data.dptr; + void *devPtrCuSeqlen = cu_seqlens->data.dptr; - // TODO(rewang): dropout seed - // void* devPtrDropoutSeed = reinterpret_cast( - // reinterpret_cast(rng_state->data.dptr)); - // void* devPtrDropoutOffset = reinterpret_cast( - // reinterpret_cast(rng_state->data.dptr) + 1); + const DType rng_state_type = rng_state->data.dtype; + NVTE_CHECK(rng_state_type == DType::kInt64); + void *devPtrDropoutSeed = rng_state->data.dptr; + void *devPtrDropoutOffset = + static_cast(static_cast(rng_state->data.dptr) + 1); const DType QKV_type = input_QKV->data.dtype; size_t workspace_size = 0; - // TODO(rewang): replace CPU seed - fused_attn_max_512_fwd_impl(batch, num_head, max_seqlen, max_seqlen, head_dim, is_training, - attn_scale, p_dropout, qkv_layout, bias_type, mask_type, devPtrQ, - devPtrK, devPtrV, devPtrS, devPtrO, devPtrBias, devCuSeqlen, - devCuSeqlen, workspace->data.dptr, &workspace_size, - get_cudnn_dtype(QKV_type), stream, handle); + fused_attn_max_512_fwd_impl( + batch, num_head, max_seqlen, max_seqlen, head_dim, is_training, attn_scale, p_dropout, + qkv_layout, bias_type, mask_type, devPtrQ, devPtrK, devPtrV, devPtrS, devPtrO, devPtrBias, + devPtrCuSeqlen, devPtrCuSeqlen, devPtrDropoutSeed, devPtrDropoutOffset, + workspace->data.dptr, &workspace_size, get_cudnn_dtype(QKV_type), stream, handle); if (workspace_size > 0) { if (workspace->data.dptr == nullptr) { @@ -1288,8 +1312,6 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - // Only is_training is verified - NVTE_CHECK(is_training, "is_training=False is not implemented in fused_attn_max_512."); NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED, "qkv_layout must be 
NVTE_QKV_Layout::NVTE_KV_INTERLEAVED."); NVTE_CHECK(bias_type == NVTE_Bias_Type::NVTE_NO_BIAS || @@ -1328,20 +1350,19 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k void *devQCuSeqlen = q_cu_seqlens->data.dptr; void *devKVCuSeqlen = kv_cu_seqlens->data.dptr; - // TODO(rewang): dropout seed - // void* devPtrDropoutSeed = reinterpret_cast( - // reinterpret_cast(rng_state->data.dptr)); - // void* devPtrDropoutOffset = reinterpret_cast( - // reinterpret_cast(rng_state->data.dptr) + 1); + const DType rng_state_type = rng_state->data.dtype; + NVTE_CHECK(rng_state_type == DType::kInt64); + void *devPtrDropoutSeed = rng_state->data.dptr; + void *devPtrDropoutOffset = + static_cast(static_cast(rng_state->data.dptr) + 1); size_t workspace_size = 0; - // TODO(rewang): replace CPU seed - fused_attn_max_512_fwd_impl(batch, num_head, q_max_seqlen, kv_max_seqlen, head_dim, is_training, - attn_scale, p_dropout, qkv_layout, bias_type, mask_type, devPtrQ, - devPtrK, devPtrV, devPtrS, devPtrO, devPtrBias, devQCuSeqlen, - devKVCuSeqlen, workspace->data.dptr, &workspace_size, - get_cudnn_dtype(q_type), stream, handle); + fused_attn_max_512_fwd_impl( + batch, num_head, q_max_seqlen, kv_max_seqlen, head_dim, is_training, attn_scale, p_dropout, + qkv_layout, bias_type, mask_type, devPtrQ, devPtrK, devPtrV, devPtrS, devPtrO, devPtrBias, + devQCuSeqlen, devKVCuSeqlen, devPtrDropoutSeed, devPtrDropoutOffset, workspace->data.dptr, + &workspace_size, get_cudnn_dtype(q_type), stream, handle); if (workspace_size > 0) { if (workspace->data.dptr == nullptr) { diff --git a/transformer_engine/common/fused_attn/utils.cu b/transformer_engine/common/fused_attn/utils.cu index 5ae4b42c16..cae42bafa0 100644 --- a/transformer_engine/common/fused_attn/utils.cu +++ b/transformer_engine/common/fused_attn/utils.cu @@ -256,6 +256,10 @@ __global__ void cu_seqlens_to_actual_seqlens(size_t b, cudnnDataType_t get_cudnn_dtype(const transformer_engine::DType t) { using namespace 
transformer_engine; switch (t) { + case DType::kInt32: + return CUDNN_DATA_INT32; + case DType::kInt64: + return CUDNN_DATA_INT64; case DType::kFloat16: return CUDNN_DATA_HALF; case DType::kFloat32: diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 6311da2465..ed6dd4c041 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -106,7 +106,7 @@ enum NVTE_Mask_Type { \verbatim | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | Yes | <= 512 | 64 | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | No | <= 512 | 64 | + | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | \endverbatim * * \param[in] QKV The QKV tensor in packed format, @@ -149,7 +149,7 @@ void nvte_fused_attn_fwd_qkvpacked( \verbatim | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | Yes | <= 512 | 64 | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | No | <= 512 | 64 | + | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | \endverbatim * * \param[in] QKV The QKV tensor in packed format, @@ -200,7 +200,7 @@ void nvte_fused_attn_bwd_qkvpacked( * Support Matrix: \verbatim | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | No | <= 512 | 64 | + | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | \endverbatim * * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. 
@@ -247,7 +247,7 @@ void nvte_fused_attn_fwd_kvpacked( * Support Matrix: \verbatim | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | No | <= 512 | 64 | + | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | \endverbatim * * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. diff --git a/transformer_engine/jax/CMakeLists.txt b/transformer_engine/jax/CMakeLists.txt index 9e8efa2c60..cf9a48244d 100644 --- a/transformer_engine/jax/CMakeLists.txt +++ b/transformer_engine/jax/CMakeLists.txt @@ -6,7 +6,7 @@ pybind11_add_module( transformer_engine_jax ${CMAKE_CURRENT_SOURCE_DIR}/csrc/extensions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/modules.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/csrc/utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/utils.cu ) target_link_libraries(transformer_engine_jax PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt transformer_engine) diff --git a/transformer_engine/jax/cpp_extensions.py b/transformer_engine/jax/cpp_extensions.py index 566b95ff63..b8dc0447c7 100644 --- a/transformer_engine/jax/cpp_extensions.py +++ b/transformer_engine/jax/cpp_extensions.py @@ -8,6 +8,8 @@ from typing import Tuple from functools import partial, reduce import operator +import warnings + import numpy as np from jaxlib.hlo_helpers import custom_call import jax.numpy as jnp @@ -1679,7 +1681,7 @@ def lowering(ctx, grad_outputs, softmax_outputs, *, scale_factor): grad_outputs, softmax_outputs, scale_factor) - return out # out is iterable already + return out # out is iterable already _scaled_softmax_bwd_p = register_primitive(ScaledSoftmaxBwdPrimitive) @@ -1828,7 +1830,7 @@ def lowering(ctx, grad_outputs, softmax_outputs, *, scale_factor): grad_outputs, softmax_outputs, scale_factor) - return out # out is iterable already + return out # out is iterable already _scaled_masked_softmax_bwd_p = 
register_primitive(ScaledMaskedSoftmaxBwdPrimitive) @@ -1962,7 +1964,7 @@ def lowering(ctx, grad_outputs, softmax_outputs, *, scale_factor): ScaledUpperTriangMaskedSoftmaxBwdPrimitive.name, ctx, grad_outputs, softmax_outputs, scale_factor) - return out # out is iterable already + return out # out is iterable already _scaled_upper_triang_masked_softmax_bwd_p = \ register_primitive(ScaledUpperTriangMaskedSoftmaxBwdPrimitive) @@ -1979,6 +1981,27 @@ def scaled_upper_triang_masked_softmax_bwd(grad_outputs: jnp.ndarray, softmax_ou scale_factor=scale_factor) +def _check_seed(seed, dropout_probability, is_training): + # Jax can't bind None, create a dummy tensor for None + if seed is None: + dropout_enabled = dropout_probability > 0 and is_training + assert not dropout_enabled, "seed is not allowed to be None when dropout is enabled." + seed = jnp.zeros(2, dtype=jnp.uint32) + + if seed.dtype != jnp.uint32: + warnings.warn( + f"Requested {seed.dtype=} is not available, and will be " + f"casted to dtype uint32. 
" + f"Please use threefry/rbg/unsafe_rbg PRNG implementations to remove this warning.") + seed = seed.astype(jnp.uint32) + + assert seed.dtype == jnp.uint32 + # Only the first 2 u32 elements are taken + assert seed.size >= 2 + + return seed + + class SelfFusedAttnMax512FwdPrimitive(BasePrimitive): """ Self Fused Attention Max Seqlen 512 Forward Primitive @@ -1991,7 +2014,7 @@ def abstract( qkv, bias, cu_seqlen, # pylint: disable=unused-argument - rng_state, # pylint: disable=unused-argument + seed, # pylint: disable=unused-argument *, attn_bias_type, # pylint: disable=unused-argument attn_mask_type, # pylint: disable=unused-argument @@ -2020,8 +2043,8 @@ def abstract( ) @staticmethod - def lowering(ctx, qkv, bias, cu_seqlen, rng_state, *, attn_bias_type, attn_mask_type, - scaling_factor, dropout_probability, is_training): + def lowering(ctx, qkv, bias, cu_seqlen, seed, *, attn_bias_type, attn_mask_type, scaling_factor, + dropout_probability, is_training): """ Self fused attention max seqlen 512 fwd lowering rules """ @@ -2036,8 +2059,8 @@ def lowering(ctx, qkv, bias, cu_seqlen, rng_state, *, attn_bias_type, attn_mask_ ir_cu_seqlen_type = ir.RankedTensorType(cu_seqlen.type) ir_cu_seqlen_shape = ir_cu_seqlen_type.shape - ir_rng_state_type = ir.RankedTensorType(rng_state.type) - ir_rng_state_shape = ir_rng_state_type.shape + ir_seed_type = ir.RankedTensorType(seed.type) + ir_seed_shape = ir_seed_type.shape batch, max_seqlen, nqkv, num_head, head_dim = ir_qkv_shape assert nqkv == 3 @@ -2049,8 +2072,8 @@ def lowering(ctx, qkv, bias, cu_seqlen, rng_state, *, attn_bias_type, attn_mask_ ir.RankedTensorType.get(output_shape, ir_qkv_type.element_type), ir.RankedTensorType.get(softmax_aux_shape, ir_qkv_type.element_type) ] - operands = [qkv, bias, cu_seqlen, rng_state] - operand_shapes = [ir_qkv_shape, ir_bias_shape, ir_cu_seqlen_shape, ir_rng_state_shape] + operands = [qkv, bias, cu_seqlen, seed] + operand_shapes = [ir_qkv_shape, ir_bias_shape, ir_cu_seqlen_shape, 
ir_seed_shape] args = CustomCallArgsWrapper(out_types, operands, operand_shapes) opaque = transformer_engine_jax.pack_fused_attn_descriptor( @@ -2069,23 +2092,22 @@ def lowering(ctx, qkv, bias, cu_seqlen, rng_state, *, attn_bias_type, attn_mask_ def self_fused_attn_max_512_fwd(qkv: jnp.ndarray, bias: jnp.ndarray, cu_seqlen: jnp.ndarray, - rng_state: jnp.ndarray, attn_bias_type: NVTE_Bias_Type, + seed: jnp.ndarray, attn_bias_type: NVTE_Bias_Type, attn_mask_type: NVTE_Mask_Type, scaling_factor: float, dropout_probability: float, is_training: bool): """ Wrapper for TE self fused attention max seqlen 512 fwd Return BMM1 -> (PreBias) -> ScaleMaskSoftmax -> (PostBias) -> (Dropout) -> BMM2 """ - # Jax can't bind None, create a dummy tensor for None - if rng_state is None: - rng_state = jnp.zeros(2, dtype=jnp.int32) + seed = _check_seed(seed, dropout_probability, is_training) + if bias is None: assert attn_bias_type == NVTE_Bias_Type.NVTE_NO_BIAS bias = jnp.zeros(0, dtype=qkv.dtype) return _self_fused_attn_max_512_fwd_p.bind(qkv, bias, cu_seqlen, - rng_state, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scaling_factor, @@ -2161,6 +2183,9 @@ def lowering(ctx, qkv, softmax_aux, doutput, cu_seqlen, *, attn_bias_type, attn_ operand_shapes = [ir_qkv_shape, ir_softmax_aux_shape, ir_doutput_shape, ir_cu_seqlen_shape] args = CustomCallArgsWrapper(out_types, operands, operand_shapes) + + # the dropout elements are encoded in the forward auxiliary tensor + # so seed is not needed in backward opaque = transformer_engine_jax.pack_fused_attn_descriptor( batch, num_head, max_seqlen, max_seqlen, head_dim, scaling_factor, dropout_probability, attn_bias_type, attn_mask_type, jax_dtype_to_te_dtype(qkv_aval.dtype), is_training) @@ -2208,7 +2233,7 @@ def abstract( kv, q_cu_seqlen, kv_cu_seqlen, - rng_state, # pylint: disable=unused-argument + seed, # pylint: disable=unused-argument *, attn_bias_type, # pylint: disable=unused-argument attn_mask_type, # 
pylint: disable=unused-argument @@ -2243,8 +2268,8 @@ def abstract( ) @staticmethod - def lowering(ctx, q, kv, q_cu_seqlen, kv_cu_seqlen, rng_state, *, attn_bias_type, - attn_mask_type, scaling_factor, dropout_probability, is_training): + def lowering(ctx, q, kv, q_cu_seqlen, kv_cu_seqlen, seed, *, attn_bias_type, attn_mask_type, + scaling_factor, dropout_probability, is_training): """ Cross fused attention max seqlen 512 fwd lowering rules """ @@ -2260,8 +2285,8 @@ def lowering(ctx, q, kv, q_cu_seqlen, kv_cu_seqlen, rng_state, *, attn_bias_type ir_q_cu_seqlen_shape = ir.RankedTensorType(q_cu_seqlen.type).shape ir_kv_cu_seqlen_shape = ir.RankedTensorType(kv_cu_seqlen.type).shape - ir_rng_state_type = ir.RankedTensorType(rng_state.type) - ir_rng_state_shape = ir_rng_state_type.shape + ir_seed_type = ir.RankedTensorType(seed.type) + ir_seed_shape = ir_seed_type.shape batch, q_max_seqlen, num_head, head_dim = ir_q_shape kv_max_seqlen = ir_kv_shape[1] @@ -2273,9 +2298,9 @@ def lowering(ctx, q, kv, q_cu_seqlen, kv_cu_seqlen, rng_state, *, attn_bias_type ir.RankedTensorType.get(output_shape, ir_q_type.element_type), ir.RankedTensorType.get(softmax_aux_shape, ir_q_type.element_type) ] - operands = [q, kv, q_cu_seqlen, kv_cu_seqlen, rng_state] + operands = [q, kv, q_cu_seqlen, kv_cu_seqlen, seed] operand_shapes = [ - ir_q_shape, ir_kv_shape, ir_q_cu_seqlen_shape, ir_kv_cu_seqlen_shape, ir_rng_state_shape + ir_q_shape, ir_kv_shape, ir_q_cu_seqlen_shape, ir_kv_cu_seqlen_shape, ir_seed_shape ] args = CustomCallArgsWrapper(out_types, operands, operand_shapes) @@ -2296,7 +2321,7 @@ def lowering(ctx, q, kv, q_cu_seqlen, kv_cu_seqlen, rng_state, *, attn_bias_type def cross_fused_attn_max_512_fwd(q: jnp.ndarray, kv: jnp.ndarray, q_cu_seqlen: jnp.ndarray, - kv_cu_seqlen: jnp.ndarray, rng_state: jnp.ndarray, + kv_cu_seqlen: jnp.ndarray, seed: jnp.ndarray, attn_bias_type: NVTE_Bias_Type, attn_mask_type: NVTE_Mask_Type, scaling_factor: float, dropout_probability: float, is_training: 
bool): @@ -2304,14 +2329,13 @@ def cross_fused_attn_max_512_fwd(q: jnp.ndarray, kv: jnp.ndarray, q_cu_seqlen: j Wrapper for TE cross fused attention max seqlen 512 fwd Return BMM1 -> (PreBias) -> ScaleMaskSoftmax -> (PostBias) -> (Dropout) -> BMM2 """ - # Jax can't bind None, create a dummy tensor for None - if rng_state is None: - rng_state = jnp.zeros(2, dtype=jnp.int32) + seed = _check_seed(seed, dropout_probability, is_training) + return _cross_fused_attn_max_512_fwd_p.bind(q, kv, q_cu_seqlen, kv_cu_seqlen, - rng_state, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scaling_factor, @@ -2391,6 +2415,9 @@ def lowering(ctx, q, kv, softmax_aux, doutput, q_cu_seqlen, kv_cu_seqlen, *, att ] args = CustomCallArgsWrapper(out_types, operands, operand_shapes) + + # the dropout elements are encoded in the forward auxiliary tensor + # so seed is not needed in backward opaque = transformer_engine_jax.pack_fused_attn_descriptor( batch, num_head, q_max_seqlen, kv_max_seqlen, head_dim, scaling_factor, dropout_probability, attn_bias_type, attn_mask_type, diff --git a/transformer_engine/jax/csrc/modules.cpp b/transformer_engine/jax/csrc/modules.cpp index b1c9d5d21a..d6d3caf4ba 100644 --- a/transformer_engine/jax/csrc/modules.cpp +++ b/transformer_engine/jax/csrc/modules.cpp @@ -749,7 +749,7 @@ void SelfFusedAttnMax512Forward(cudaStream_t stream, void **buffers, const char void *qkv = buffers[0]; void *bias = buffers[1]; void *cu_seqlens = buffers[2]; - void *rng_state = buffers[3]; + void *seed = buffers[3]; // output void *output = buffers[4]; @@ -778,30 +778,37 @@ void SelfFusedAttnMax512Forward(cudaStream_t stream, void **buffers, const char auto cu_seqlens_tensor = TensorWrapper(cu_seqlens, std::vector{batch + 1}, DType::kInt32); - auto rng_state_tensor = TensorWrapper(rng_state, std::vector{1}, DType::kInt64); + + auto dummy_rng_state_tensor = TensorWrapper(nullptr, std::vector{2}, DType::kInt64); NVTETensorPack aux_output_tensors; 
nvte_tensor_pack_create(&aux_output_tensors); TensorWrapper query_workspace_tensor; - nvte_fused_attn_fwd_qkvpacked(qkv_tensor.data(), bias_tensor.data(), s_tensor.data(), - o_tensor.data(), &aux_output_tensors, cu_seqlens_tensor.data(), - rng_state_tensor.data(), q_max_seqlen, descriptor.is_training, - descriptor.scaling_factor, descriptor.dropout_probability, - NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, descriptor.bias_type, - descriptor.mask_type, query_workspace_tensor.data(), stream); + nvte_fused_attn_fwd_qkvpacked( + qkv_tensor.data(), bias_tensor.data(), s_tensor.data(), o_tensor.data(), + &aux_output_tensors, cu_seqlens_tensor.data(), dummy_rng_state_tensor.data(), q_max_seqlen, + descriptor.is_training, descriptor.scaling_factor, descriptor.dropout_probability, + NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, descriptor.bias_type, descriptor.mask_type, + query_workspace_tensor.data(), stream); auto *output_s = reinterpret_cast(aux_output_tensors.tensors[0]); output_s->data.dptr = softmax_aux; - size_t workspace_size = + // fused attn workspace + workspace for rng_state + auto plan_workspace_size = query_workspace_tensor.shape().data[0] * typeToSize(query_workspace_tensor.dtype()); - auto *workspace = cublasLtMetaManager::Instance().GetWorkspace(workspace_size); - + auto rng_workspace_size = 2 * sizeof(int64_t); + auto total_workspace_size = plan_workspace_size + rng_workspace_size; + auto *workspace = cublasLtMetaManager::Instance().GetWorkspace(total_workspace_size); auto workspace_tensor = TensorWrapper(workspace, query_workspace_tensor.shape(), query_workspace_tensor.dtype()); + auto rng_state = static_cast(workspace) + plan_workspace_size; + auto rng_state_tensor = TensorWrapper(rng_state, std::vector{2}, DType::kInt64); + PopulateRngStateAsync(rng_state, seed, q_max_seqlen, kv_max_seqlen, stream); + nvte_fused_attn_fwd_qkvpacked(qkv_tensor.data(), bias_tensor.data(), s_tensor.data(), o_tensor.data(), &aux_output_tensors, cu_seqlens_tensor.data(), 
rng_state_tensor.data(), q_max_seqlen, descriptor.is_training, @@ -907,7 +914,7 @@ void CrossFusedAttnMax512Forward(cudaStream_t stream, void **buffers, const char void *kv = buffers[1]; void *q_cu_seqlens = buffers[2]; void *kv_cu_seqlens = buffers[3]; - void *rng_state = buffers[4]; + void *seed = buffers[4]; // output void *output = buffers[5]; @@ -939,7 +946,8 @@ void CrossFusedAttnMax512Forward(cudaStream_t stream, void **buffers, const char TensorWrapper(q_cu_seqlens, std::vector{batch + 1}, DType::kInt32); auto kv_cu_seqlens_tensor = TensorWrapper(kv_cu_seqlens, std::vector{batch + 1}, DType::kInt32); - auto rng_state_tensor = TensorWrapper(rng_state, std::vector{1}, DType::kInt64); + + auto dummy_rng_state_tensor = TensorWrapper(nullptr, std::vector{2}, DType::kInt64); NVTETensorPack aux_output_tensors; nvte_tensor_pack_create(&aux_output_tensors); @@ -949,7 +957,7 @@ void CrossFusedAttnMax512Forward(cudaStream_t stream, void **buffers, const char nvte_fused_attn_fwd_kvpacked( q_tensor.data(), kv_tensor.data(), bias_tensor.data(), s_tensor.data(), o_tensor.data(), &aux_output_tensors, q_cu_seqlens_tensor.data(), kv_cu_seqlens_tensor.data(), - rng_state_tensor.data(), q_max_seqlen, kv_max_seqlen, descriptor.is_training, + dummy_rng_state_tensor.data(), q_max_seqlen, kv_max_seqlen, descriptor.is_training, descriptor.scaling_factor, descriptor.dropout_probability, NVTE_QKV_Layout::NVTE_KV_INTERLEAVED, descriptor.bias_type, descriptor.mask_type, query_workspace_tensor.data(), stream); @@ -957,13 +965,19 @@ void CrossFusedAttnMax512Forward(cudaStream_t stream, void **buffers, const char auto *output_s = reinterpret_cast(aux_output_tensors.tensors[0]); output_s->data.dptr = softmax_aux; - size_t workspace_size = + // fused attn workspace + workspace for rng_state + auto plan_workspace_size = query_workspace_tensor.shape().data[0] * typeToSize(query_workspace_tensor.dtype()); - auto *workspace = cublasLtMetaManager::Instance().GetWorkspace(workspace_size); - + 
auto rng_workspace_size = 2 * sizeof(int64_t); + auto total_workspace_size = plan_workspace_size + rng_workspace_size; + auto *workspace = cublasLtMetaManager::Instance().GetWorkspace(total_workspace_size); auto workspace_tensor = TensorWrapper(workspace, query_workspace_tensor.shape(), query_workspace_tensor.dtype()); + auto rng_state = static_cast(workspace) + plan_workspace_size; + auto rng_state_tensor = TensorWrapper(rng_state, std::vector{2}, DType::kInt64); + PopulateRngStateAsync(rng_state, seed, q_max_seqlen, kv_max_seqlen, stream); + nvte_fused_attn_fwd_kvpacked( q_tensor.data(), kv_tensor.data(), bias_tensor.data(), s_tensor.data(), o_tensor.data(), &aux_output_tensors, q_cu_seqlens_tensor.data(), kv_cu_seqlens_tensor.data(), diff --git a/transformer_engine/jax/csrc/utils.cpp b/transformer_engine/jax/csrc/utils.cu similarity index 52% rename from transformer_engine/jax/csrc/utils.cpp rename to transformer_engine/jax/csrc/utils.cu index f8440e2625..0970076838 100644 --- a/transformer_engine/jax/csrc/utils.cpp +++ b/transformer_engine/jax/csrc/utils.cu @@ -32,5 +32,23 @@ int GetDeviceComputeCapability(int gpu_id) { return gpu_arch; } +__global__ void populate_rng_state_kernel(int64_t *rng_state_dst, const int64_t *const seed, + int64_t offset) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid > 0) return; + rng_state_dst[0] = seed[0]; + rng_state_dst[1] = offset; +} + +void PopulateRngStateAsync(void *rng_state_dst, const void *const seed, size_t q_max_seqlen, + size_t kv_max_seqlen, cudaStream_t stream) { + constexpr int threads_per_cta = 128; + const size_t increment = (q_max_seqlen * kv_max_seqlen + threads_per_cta - 1) / threads_per_cta; + auto offset = FusedAttnOffsetManager::Instance().GetAndUpdateOffset(increment); + populate_rng_state_kernel<<<1, 1, 0, stream>>>(reinterpret_cast(rng_state_dst), + reinterpret_cast(seed), offset); + NVTE_CHECK_CUDA(cudaGetLastError()); +} + } // namespace jax } // namespace transformer_engine diff --git 
a/transformer_engine/jax/csrc/utils.h b/transformer_engine/jax/csrc/utils.h index 448c6706c7..baa014d6cb 100644 --- a/transformer_engine/jax/csrc/utils.h +++ b/transformer_engine/jax/csrc/utils.h @@ -21,6 +21,9 @@ namespace jax { int GetCudaRuntimeVersion(); int GetDeviceComputeCapability(int gpu_id); +void PopulateRngStateAsync(void *rng_state_dst, const void *const seed, size_t q_max_seqlen, + size_t kv_max_seqlen, cudaStream_t stream); + class cublasLtMetaManager { public: static cublasLtMetaManager &Instance() { @@ -93,6 +96,27 @@ class cudaDevicePropertiesManager { cudaDeviceProp prop_; }; +class FusedAttnOffsetManager { + public: + static FusedAttnOffsetManager &Instance() { + static thread_local FusedAttnOffsetManager instance; + return instance; + } + + size_t GetAndUpdateOffset(size_t increment) { + size_t ret = offset_; + offset_ += increment; + return ret; + } + + FusedAttnOffsetManager(FusedAttnOffsetManager const &) = delete; + void operator=(FusedAttnOffsetManager const &) = delete; + + private: + FusedAttnOffsetManager() {} + size_t offset_ = 0; +}; + } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index 563b15d526..14ad7f02e8 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -11,6 +11,7 @@ from typing import Any, Callable, Optional, Sequence, Tuple, Union import warnings +import jax import jax.numpy as jnp import numpy as np from flax import linen as nn @@ -182,9 +183,8 @@ def core_attention(query: Array, if not deterministic and dropout_rate > 0.: keep_prob = 1.0 - dropout_rate dropout_shape = list(attn_weights.shape) - dropout_shape[-2] = 1 + # TODO(rewang): add attention dropout broadcast dimension arguments for users keep = jax_random.bernoulli(dropout_rng, keep_prob, dropout_shape) - keep = jnp.broadcast_to(keep, attn_weights.shape) multiplier = (keep.astype(attn_weights.dtype) / 
jnp.asarray(keep_prob, dtype=dtype)) attn_weights = attn_weights * multiplier @@ -384,7 +384,7 @@ def kv_init(key, shape, dtype): fused_attn_supported_seqlen = [128, 256, 384, 512] enable_fused_attn = int(os.getenv("NVTE_FUSED_ATTN", "0")) use_fused_attn = not decode and not self.transpose_batch_sequence and self.fuse_qkv and \ - self.dropout_rate == 0 and canonicalize_dtype in [jnp.bfloat16, jnp.float16] and \ + canonicalize_dtype in [jnp.bfloat16, jnp.float16] and \ q_seqlen in fused_attn_supported_seqlen and kv_seqlen in fused_attn_supported_seqlen \ and is_fused_attn_kernel_available() and (self.head_dim == 64) and enable_fused_attn @@ -397,9 +397,6 @@ def kv_init(key, shape, dtype): f"but got {self.transpose_batch_sequence}, " if not self.fuse_qkv: reason += f"fuse_qkv=True is required but got {self.fuse_qkv}, " - if self.dropout_rate != 0: - # TODO(rewang): add dropout support - reason += f"no dropout is required but got dropout_rate={self.dropout_rate}, " if canonicalize_dtype not in [jnp.bfloat16, jnp.float16]: reason += f"dtype in [BF16, FP16] is required " \ f"but got dtype={canonicalize_dtype}, " @@ -583,6 +580,12 @@ def kv_init(key, shape, dtype): assert mask is not None and mask.ndim == 4 # (b, 1, s_q, s_kv) assert not self.transpose_batch_sequence + seed = None + if dropout_rng is not None: + seed = jax.random.split(dropout_rng, len(jax.devices())) + # ensure the old key never used + del dropout_rng + # TODO(rewang): make it configurable for pre_scale_bias attn_bias_type = AttnBiasType.NO_BIAS if bias is None else AttnBiasType.POST_SCALE_BIAS @@ -607,7 +610,7 @@ def canonicalize_attn_mask_type(attn_mask_type): x = self_fused_attn(qkv_proj, bias, mask, - dropout_rng, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scale_factor, @@ -626,7 +629,7 @@ def canonicalize_attn_mask_type(attn_mask_type): x = cross_fused_attn(query, kv_proj, mask, - dropout_rng, + seed, attn_bias_type=attn_bias_type, 
attn_mask_type=attn_mask_type, scaling_factor=scale_factor, diff --git a/transformer_engine/jax/fused_attn.py b/transformer_engine/jax/fused_attn.py index 3eb516e3bb..ce34ca2670 100644 --- a/transformer_engine/jax/fused_attn.py +++ b/transformer_engine/jax/fused_attn.py @@ -46,7 +46,7 @@ class AttnMaskType(Enum): def self_fused_attn(qkv: jnp.ndarray, bias: jnp.ndarray, mask: jnp.ndarray, - rng_state: jnp.ndarray, + seed: jnp.ndarray, attn_bias_type: AttnBiasType, attn_mask_type: AttnMaskType, scaling_factor: float, @@ -63,7 +63,7 @@ def self_fused_attn(qkv: jnp.ndarray, output = _self_fused_attn_max_512(qkv, bias, mask, - rng_state, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scaling_factor, @@ -73,13 +73,13 @@ def self_fused_attn(qkv: jnp.ndarray, dp_axis_name = "batch" tp_axis_name = "model" - inputs = [qkv, bias, mask, rng_state] + inputs = [qkv, bias, mask, seed] batch, seqlen, _, num_head, head_dim = qkv.shape output_shape = [batch, seqlen, num_head, head_dim] sharding_meta = get_fused_attn_sharding_meta( sharding_type, [x.shape if x is not None else None for x in inputs], [output_shape], - dp_dims=([0, None, 0, None], [0]), - tp_dims=([3, 1, None, None], [2]), + dp_dims=([0, None, 0, 0], [0]), + tp_dims=([3, 1, None, 0], [2]), dp_axis_name=dp_axis_name, tp_axis_name=tp_axis_name) @@ -104,13 +104,13 @@ def self_fused_attn(qkv: jnp.ndarray, @partial(jax.custom_vjp, nondiff_argnums=(4, 5, 6, 7, 8)) def _self_fused_attn_max_512(qkv: jnp.ndarray, bias: jnp.ndarray, mask: jnp.ndarray, - rng_state: jnp.ndarray, attn_bias_type: AttnBiasType, + seed: jnp.ndarray, attn_bias_type: AttnBiasType, attn_mask_type: AttnMaskType, scaling_factor: float, dropout_probability: float, is_training: bool): output, _ = _self_fused_attn_max_512_fwd(qkv, bias, mask, - rng_state, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scaling_factor, @@ -119,7 +119,7 @@ def _self_fused_attn_max_512(qkv: jnp.ndarray, 
bias: jnp.ndarray, mask: jnp.ndar return output -def _self_fused_attn_max_512_fwd(qkv, bias, mask, rng_state, attn_bias_type, attn_mask_type, +def _self_fused_attn_max_512_fwd(qkv, bias, mask, seed, attn_bias_type, attn_mask_type, scaling_factor, dropout_probability, is_training): seqlen = jnp.sum(mask[:, :, :, 0] == 0, axis=(-1, -2), dtype=jnp.int32) @@ -129,7 +129,7 @@ def _self_fused_attn_max_512_fwd(qkv, bias, mask, rng_state, attn_bias_type, att output, softmax_aux = self_fused_attn_max_512_fwd(qkv, bias, cu_seqlen, - rng_state, + seed, attn_bias_type=attn_bias_type.value, attn_mask_type=attn_mask_type.value, scaling_factor=scaling_factor, @@ -163,7 +163,7 @@ def _self_fused_attn_max_512_bwd(attn_bias_type, attn_mask_type, scaling_factor, def cross_fused_attn(q: jnp.ndarray, kv: jnp.ndarray, mask: jnp.ndarray, - rng_state: jnp.ndarray, + seed: jnp.ndarray, attn_bias_type: AttnBiasType, attn_mask_type: AttnMaskType, scaling_factor: float, @@ -180,7 +180,7 @@ def cross_fused_attn(q: jnp.ndarray, output = _cross_fused_attn_max_512(q, kv, mask, - rng_state, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scaling_factor, @@ -190,7 +190,7 @@ def cross_fused_attn(q: jnp.ndarray, dp_axis_name = "batch" tp_axis_name = "model" - inputs = [q, kv, mask, rng_state] + inputs = [q, kv, mask, seed] output_shape = q.shape sharding_meta = get_fused_attn_sharding_meta( sharding_type, [x.shape if x is not None else None for x in inputs], [output_shape], @@ -219,15 +219,14 @@ def cross_fused_attn(q: jnp.ndarray, @partial(jax.custom_vjp, nondiff_argnums=(4, 5, 6, 7, 8)) -def _cross_fused_attn_max_512(q: jnp.ndarray, kv: jnp.ndarray, mask: jnp.ndarray, - rng_state: jnp.ndarray, attn_bias_type: AttnBiasType, - attn_mask_type: AttnMaskType, scaling_factor: float, - dropout_probability: float, is_training: bool): +def _cross_fused_attn_max_512(q: jnp.ndarray, kv: jnp.ndarray, mask: jnp.ndarray, seed: jnp.ndarray, + attn_bias_type: AttnBiasType, 
attn_mask_type: AttnMaskType, + scaling_factor: float, dropout_probability: float, is_training: bool): output, _ = _cross_fused_attn_max_512_fwd(q, kv, mask, - rng_state, + seed, attn_bias_type=attn_bias_type, attn_mask_type=attn_mask_type, scaling_factor=scaling_factor, @@ -236,8 +235,8 @@ def _cross_fused_attn_max_512(q: jnp.ndarray, kv: jnp.ndarray, mask: jnp.ndarray return output -def _cross_fused_attn_max_512_fwd(q, kv, mask, rng_state, attn_bias_type, attn_mask_type, - scaling_factor, dropout_probability, is_training): +def _cross_fused_attn_max_512_fwd(q, kv, mask, seed, attn_bias_type, attn_mask_type, scaling_factor, + dropout_probability, is_training): q_seqlen = jnp.sum(mask[:, :, :, 0] == 0, axis=(-1, -2), dtype=jnp.int32) q_cu_seqlen = jnp.cumsum(q_seqlen) @@ -251,7 +250,7 @@ def _cross_fused_attn_max_512_fwd(q, kv, mask, rng_state, attn_bias_type, attn_m kv, q_cu_seqlen, kv_cu_seqlen, - rng_state, + seed, attn_bias_type=attn_bias_type.value, attn_mask_type=attn_mask_type.value, scaling_factor=scaling_factor, From 92eabc339e159c50cda00fdd2de356ed43aba115 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 22 Jun 2023 11:41:36 -0700 Subject: [PATCH 034/427] Add long sequence support for fused attention (#237) * add long sequence support and unify three backends for fused attention Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * update cudnn-frontend to v0.9.1 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * replace cpu_float2half_rn with __float2half_rn Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix backend selection and NVTEDType Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * minor fixes Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix ci Signed-off-by: Kirthi Shankar Sivamani * make cudnn plan caches thread_local Signed-off-by: Charlene Yang 
<8636796+cyanguwa@users.noreply.github.com> * fix CI Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * replace cuDNN throw with NVTE_CHECK Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix replacement of cuDNN throw with NVTE_CHECK Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * force dropout probablity to 0 in inference mode Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * change negInfinity to be consistent with m512 fused attn Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove float2half conversion for scale_dropout Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add back runtime api for sm detection Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add gemm3 to enums FP8Fwd/BwdTensors Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * change dropout from no to yes for fmha_v1 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove output_rng_state in m512 kernels Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix elts_per_thread calculation in kvpacked fwd Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove dropout=0.0 restriction for m512 fused attn Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove output_rng_state completely from m512 kernels Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- 3rdparty/cudnn-frontend | 2 +- tests/pytorch/test_fused_attn.py | 626 ++++++++ transformer_engine/common/CMakeLists.txt | 3 +- .../common/fused_attn/fused_attn.cpp | 355 +++-- .../fused_attn_f16_arbitrary_seqlen.cu | 1304 +++++++++++++++++ 
.../fused_attn_f16_arbitrary_seqlen.h | 44 + ...512.cu => fused_attn_f16_max512_seqlen.cu} | 46 +- ...n_512.h => fused_attn_f16_max512_seqlen.h} | 8 +- .../common/fused_attn/fused_attn_fp8.cu | 34 +- .../common/fused_attn/fused_attn_fp8.h | 6 +- transformer_engine/common/fused_attn/utils.cu | 1 - .../include/transformer_engine/fused_attn.h | 224 +-- transformer_engine/pytorch/attention.py | 410 +++++- transformer_engine/pytorch/constants.py | 2 +- .../pytorch/cpp_extensions/fused_attn.py | 470 +++--- transformer_engine/pytorch/csrc/common.h | 12 +- transformer_engine/pytorch/csrc/extensions.cu | 198 ++- transformer_engine/pytorch/csrc/extensions.h | 51 +- transformer_engine/pytorch/transformer.py | 15 + 19 files changed, 3172 insertions(+), 639 deletions(-) create mode 100644 tests/pytorch/test_fused_attn.py create mode 100644 transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu create mode 100644 transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h rename transformer_engine/common/fused_attn/{fused_attn_fp16_bf16_max_seqlen_512.cu => fused_attn_f16_max512_seqlen.cu} (98%) rename transformer_engine/common/fused_attn/{fused_attn_fp16_bf16_max_seqlen_512.h => fused_attn_f16_max512_seqlen.h} (91%) diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index e7f64390e9..a4f05c1edc 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit e7f64390e9bb4a3db622ffe11c973834f572b609 +Subproject commit a4f05c1edcef453f5fd52f96218c29c7d420e511 diff --git a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py new file mode 100644 index 0000000000..831c2d7c79 --- /dev/null +++ b/tests/pytorch/test_fused_attn.py @@ -0,0 +1,626 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +import torch +import pytest + +from transformer_engine.pytorch.utils import ( + init_method_normal, + scaled_init_method_normal, +) +from transformer_engine.pytorch import TransformerLayer +from transformer_engine.pytorch.attention import DotProductAttention +import os + +class ModelConfig: + def __init__( + self, num_layers, hidden_size, num_attention_heads, head_dim, seq_len, + dropout_p, attn_mask_type, + ): + self.num_layers = num_layers + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + assert (hidden_size == num_attention_heads * head_dim + ), """hidden_size must be = num_heads x head_dim.""" + self.seq_len = seq_len + self.dropout_p = dropout_p + self.attn_mask_type = attn_mask_type + +model_configs = { + "test1": ModelConfig(1, 1024, 16, 64, 128, 0.0, "causal"), + "test2": ModelConfig(1, 1024, 16, 64, 2048, 0.0, "causal"), + "test3": ModelConfig(1, 2048, 16, 128, 128, 0.0, "causal"), + "test4": ModelConfig(1, 2048, 16, 128, 2048, 0.0, "causal"), + "test5": ModelConfig(1, 1024, 16, 64, 128, 0.0, "no_mask"), +} + +param_types = [torch.float16] +if torch.cuda.is_bf16_supported(): + param_types.append(torch.bfloat16) + +batch_sizes = [1, 2] + +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +def test_dot_product_attention(dtype, bs, model): + """Test DotProductAttention module with three backends, + FlashAttention, FusedAttention and UnfusedDotProductAttention""" + + config = model_configs[model] + + flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention( + dtype, bs, config, "FlashAttention") + fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention( + dtype, bs, config, "FusedAttention") + unfused_attn_fwd, unfused_attn_bwd = _run_dot_product_attention( + dtype, bs, config, "UnfusedDotProductAttention") + + atol, rtol = (2.5e-2, 2.5e-2) if dtype == torch.bfloat16 else (2.5e-3, 2.5e-3) + 
assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + +def _run_dot_product_attention(dtype, bs, config, backend): + + torch.manual_seed(1234) + torch.cuda.manual_seed(1234) + os.environ["NVTE_FLASH_ATTN"] = "0" + os.environ["NVTE_FUSED_ATTN"] = "0" + if backend == "FlashAttention": + os.environ["NVTE_FLASH_ATTN"] = "1" + if backend == "FusedAttention": + os.environ["NVTE_FUSED_ATTN"] = "1" + + inp = 0.1 * torch.randn( + config.seq_len, bs, 3, config.num_attention_heads, config.head_dim, + dtype = dtype).cuda() + inp.requires_grad=True + seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens.fill_(config.seq_len) + cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + op_grad = 0.001 * torch.randint(0, 200, ( + config.seq_len, bs, config.num_attention_heads * config.head_dim + ), dtype = dtype).cuda() + + block = ( + DotProductAttention( + config.num_attention_heads, + config.head_dim, + attention_dropout = config.dropout_p, + attn_mask_type = config.attn_mask_type, + sequence_parallel = False, + tp_size = 1, + get_rng_state_tracker = None, + tp_group = None, + layer_number = 1, + attention_type = "self" + ).to(dtype = dtype).cuda() + ) + + q = inp[:, :,0,:,:] + k = inp[:, :,1,:,:] + v = inp[:, :,2,:,:] + op = block(q, k, v) + op.backward(op_grad) + + return op, inp.grad + +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +def test_transformer_layer(dtype, bs, model): + """Test TransformerLayer module when its DotProductAttention is enabled with + FlashAttention, FusedAttention, or UnfusedDotProductAttention 
backend""" + + config = model_configs[model] + + flash_attn_fwd, flash_attn_bwd = _run_transformer_layer( + dtype, bs, config, "FlashAttention") + fused_attn_fwd, fused_attn_bwd = _run_transformer_layer( + dtype, bs, config, "FusedAttention") + unfused_attn_fwd, unfused_attn_bwd = _run_transformer_layer( + dtype, bs, config, "UnfusedDotProductAttention") + + atol, rtol = (5e-1, 5e-1) if dtype == torch.bfloat16 else (5e-1, 5e-1) + assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + +def _run_transformer_layer(dtype, bs, config, backend): + + torch.manual_seed(1234) + torch.cuda.manual_seed(1234) + os.environ["NVTE_FLASH_ATTN"] = "0" + os.environ["NVTE_FUSED_ATTN"] = "0" + if backend == "FlashAttention": + os.environ["NVTE_FLASH_ATTN"] = "1" + if backend == "FusedAttention": + os.environ["NVTE_FUSED_ATTN"] = "1" + + inp = 0.1 * torch.randn( + config.seq_len, bs, config.num_attention_heads * config.head_dim, + dtype = dtype).cuda() + inp.requires_grad=True + seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens.fill_(config.seq_len) + cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + op_grad = 0.001 * torch.randint(0, 200, ( + config.seq_len, bs, config.num_attention_heads * config.head_dim + ), dtype = dtype).cuda() + + sigma = 0.02 + init_method = init_method_normal(sigma) + output_layer_init_method = scaled_init_method_normal(sigma, config.num_layers) + + layer_number = 1 + drop_path_rate = 0.0 + drop_path_rates = [ + rate.item() for rate in torch.linspace(0, drop_path_rate, config.num_layers)] + + block = ( + TransformerLayer( + config.hidden_size, + 4 * config.hidden_size, + config.num_attention_heads, 
+ layernorm_epsilon = 1e-5, + hidden_dropout = 0.0, + attention_dropout = config.dropout_p, + init_method = init_method, + output_layer_init_method = output_layer_init_method, + layer_number = layer_number, + kv_channels = config.head_dim, + self_attn_mask_type = config.attn_mask_type, + tp_group = None, + tp_size = 1, + params_dtype = dtype, + get_rng_state_tracker = None, + fuse_wgrad_accumulation = False, + seq_length = config.seq_len, + micro_batch_size = bs, + sequence_parallel = False, + apply_residual_connection_post_layernorm = False, + output_layernorm = False, + layer_type = "encoder", + drop_path_rate = drop_path_rates[layer_number - 1], + set_parallel_mode = True, + fuse_qkv_params = True, + zero_centered_gamma = False, + qkv_weight_interleaved = False, + ub_tp_comm_overlap = False, + bias = True, + ) + .to(dtype = dtype) + .cuda() + ) + + op = block(inp) + op.backward(op_grad) + + return op, inp.grad + +model_configs_fp8 = { + "test1": ModelConfig(1, 1024, 16, 64, 512, 0.0, "no_mask"), +} +batch_sizes_fp8 = [1, 4] +param_types_fp8 = [torch.float16] + +@pytest.mark.parametrize("dtype", param_types_fp8) +@pytest.mark.parametrize("bs", batch_sizes_fp8) +@pytest.mark.parametrize("model", model_configs_fp8.keys()) +def test_dpa_fp8(dtype, bs, model): + """Test DotProductAttention module with FP8, + using cpp_extensions import fused_attn_fwd/bwd_qkvpacked and UnfusedDotProductAttention""" + + config = model_configs_fp8[model] + + fused_attn_fwd, fused_attn_bwd = _run_dpa_fp8( + dtype, bs, config, "FusedAttention") + unfused_attn_fwd, unfused_attn_bwd = _run_dpa_fp8_ref( + dtype, bs, config, "UnfusedDotProductAttention") + + atol, rtol = (5e-2, 1e-1) + assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + +def _run_dpa_fp8(dtype, bs, config, backend): + + torch.manual_seed(1234) + torch.cuda.manual_seed(1234) + os.environ["NVTE_FLASH_ATTN"] = "0" 
+ os.environ["NVTE_FUSED_ATTN"] = "0" + + inp = 0.01 * torch.randn( + bs * config.seq_len, config.num_attention_heads * config.head_dim, + dtype = dtype).cuda() + inp.requires_grad=True + seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens.fill_(config.seq_len) + cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + op_grad = 0.001 * torch.randint(0, 200, ( + bs * config.seq_len, config.num_attention_heads * config.head_dim + ), dtype = dtype).cuda() + torch.save(op_grad, 'op_grad.pt') + + fp8_recipe = recipe.DelayedScaling( + margin=0, + interval=1, + fp8_format=recipe.Format.HYBRID, + amax_history_len=1, + amax_compute_algo="most_recent", + ) + + dpa = DPA_FP8(config).to(dtype = torch.float16).cuda() + with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe): + op = dpa(inp, cu_seqlens, config.seq_len) + op.backward(op_grad) + + context = torch.load("ctx.pt") + dqkv = torch.load('dqkv.pt') + return (context.view(bs, config.seq_len, -1).transpose(0,1), + dqkv.view(bs, config.seq_len, 3, config.num_attention_heads, config.head_dim).transpose(0,1).contiguous()) + +def _run_dpa_fp8_ref(dtype, bs, config, backend): + + os.environ["NVTE_FLASH_ATTN"] = "0" + os.environ["NVTE_FUSED_ATTN"] = "0" + if backend == "FlashAttention": + os.environ["NVTE_FLASH_ATTN"] = "1" + if backend == "FusedAttention": + os.environ["NVTE_FUSED_ATTN"] = "1" + + inp = torch.load('qkv.pt').cuda() + inp.requires_grad=True + seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens.fill_(config.seq_len) + cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + op_grad = torch.load('op_grad.pt').cuda().view(bs, config.seq_len, -1).transpose(0,1) + + block = ( + DotProductAttention( + config.num_attention_heads, + config.head_dim, + attention_dropout = config.dropout_p, + attn_mask_type = config.attn_mask_type, + sequence_parallel = False, + 
tp_size = 1, + get_rng_state_tracker = None, + tp_group = None, + layer_number = 1, + attention_type = "self" + ).to(dtype = dtype).cuda() + ) + + q = inp[:, :,0,:,:] + k = inp[:, :,1,:,:] + v = inp[:, :,2,:,:] + op = block(q, k, v) + op.backward(op_grad) + torch.save(op,'ctx_ref.pt') + torch.save(inp.grad,'dqkv_ref.pt') + + return op, inp.grad + +from torch.nn.parameter import Parameter +import transformer_engine.pytorch.cpp_extensions as ext +import transformer_engine_extensions as tex +import transformer_engine.pytorch.fp8 as fp8 +from transformer_engine.pytorch import fp8_autocast +from transformer_engine.pytorch.module.base import TransformerEngineBaseModule, _prepare_backward +from transformer_engine.common import recipe +from typing import Union, Dict, Any, Tuple, List +from transformer_engine.pytorch.cpp_extensions.fused_attn import ( + fused_attn_fwd_qkvpacked, + fused_attn_bwd_qkvpacked, + FusedAttnBackend) + +_CUBLASLT_WORKSPACE_SIZE_BYTES = 33_554_432 # 32MiB +_2X_ACC_FPROP = False +_2X_ACC_DGRAD = False +_2X_ACC_WGRAD = False + +META_QKV = tex.FP8FwdTensors.GEMM1_OUTPUT +META_O = tex.FP8FwdTensors.GEMM2_INPUT +META_DO = tex.FP8BwdTensors.GRAD_INPUT2 +META_DQKV = tex.FP8BwdTensors.GRAD_OUTPUT1 + +META_S = tex.FP8FwdTensors.GEMM3_WEIGHT +META_DS = tex.FP8BwdTensors.GRAD_INPUT3 + +class _dpa_fp8(torch.autograd.Function): + @staticmethod + def forward( + ctx, + inp: torch.Tensor, + qkv_weight: torch.Tensor, + qkv_bias: torch.Tensor, + cu_seqlens: torch.Tensor, + num_attention_heads: int, + p_dropout: float, + max_s: int, + fast_zero_fill: bool, + fp8_meta: Dict[str, Any], + workspace: torch.Tensor, + is_training: bool, + ) -> torch.Tensor: + + assert inp.dim() == 2 + in_features = qkv_weight.shape[-1] + h = num_attention_heads + d = in_features // h + b = cu_seqlens.numel() - 1 + is_nl = False + if b < 4 and b > 1: + max_s = 512 + is_nl = True + + fp8_dtype_forward = fp8.get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + + inputmat, inputmat_t = 
ext.fp8_cast_transpose_fused( + inp, + fp8_meta["scaling_fwd"], + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + ) + + qkv_weight_fp8, qkv_weight_t_fp8 = ext.fp8_cast_transpose_fused( + qkv_weight, + fp8_meta["scaling_fwd"], + tex.FP8FwdTensors.GEMM1_WEIGHT, + fp8_dtype_forward, + ) + + M = None + ZInv = None + philox_unpacked = None + + qkv_out = ext.fp8_gemm( + qkv_weight_fp8, + fp8_meta["scaling_fwd"].scale_inv, + tex.FP8FwdTensors.GEMM1_WEIGHT, + fp8_dtype_forward, + inputmat, + fp8_meta["scaling_fwd"].scale_inv, + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + torch.uint8, + workspace, + bias=qkv_bias, + use_bias=True, + out_index = META_QKV, + fp8_meta_tensor = fp8_meta["scaling_fwd"], + use_split_accumulator=_2X_ACC_FPROP, + D_dtype=fp8_dtype_forward, + ) + qkv_out = qkv_out.view(-1, 3, h, d) + qkv_out_fp16 = ext.cast_from_fp8(qkv_out, fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, + tex.DType.kFloat16).view(b, max_s, 3, h, d).transpose(0,1).contiguous() + torch.save(qkv_out_fp16, 'qkv.pt') + + # FMHA + context_, aux_ctx_tensors, *rest = fused_attn_fwd_qkvpacked( + is_training, + max_s, + cu_seqlens, + qkv_out, + fp8_dtype_forward, + FusedAttnBackend["FP8"], + None, + fp8_meta["scaling_fwd"].scale_inv[META_QKV], + fp8_meta["scaling_fwd"].scale[META_S], + fp8_meta["scaling_fwd"].scale[META_O], + fp8_meta["scaling_fwd"].amax_history[0][META_S], + fp8_meta["scaling_fwd"].amax_history[0][META_O], + attn_scale = None, + dropout = p_dropout, + fast_zero_fill = fast_zero_fill, + qkv_layout = "qkv_interleaved", + attn_bias_type = "no_bias", + attn_mask_type = "padding", + rng_gen = None, + ) + M, ZInv, philox_unpacked = aux_ctx_tensors + + context = context_.view(-1, in_features) + context_t = tex.fp8_transpose(context, fp8_dtype_forward) + + ctx.save_for_backward( + inputmat_t, qkv_weight_t_fp8, workspace, + qkv_out, + context_, context_t, + fp8_meta["scaling_fwd"].scale, + fp8_meta["scaling_fwd"].scale_inv, + ) + ctx.aux_ctx_tensors = 
aux_ctx_tensors + ctx.fp8_meta = fp8_meta + ctx.cu_seqlens = cu_seqlens + ctx.p_dropout = p_dropout + ctx.max_s = max_s + ctx.fast_zero_fill = fast_zero_fill + ctx.is_nl = is_nl + ctx.hidden_size = in_features + ctx.num_attention_heads = num_attention_heads + + context_fp16 = ext.cast_from_fp8(context, fp8_meta["scaling_fwd"], + META_O, fp8_dtype_forward, tex.DType.kFloat16) + torch.save(context_fp16, 'ctx.pt') + return context_fp16 + + + @staticmethod + def backward( + ctx, grad_output: torch.Tensor + ) -> Tuple[Union[torch.Tensor, None], ...]: + + with _prepare_backward(True, ctx.fp8_meta, None, 1, name="_DPA"): + ( + inputmat_t, + qkv_weight_t_fp8, + workspace, + qkv_out, + context, context_t, + fwd_scales, + fwd_scale_inverses, + ) = ctx.saved_tensors + fp8_dtype_forward = fp8.get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=True + ) + fp8_dtype_backward = fp8.get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=False + ) + + proj_dgrad = ext.cast_to_fp8( + grad_output, ctx.fp8_meta["scaling_bwd"], META_DO, fp8_dtype_backward + ) + + dqkv, *rest = fused_attn_bwd_qkvpacked( + ctx.max_s, + ctx.cu_seqlens, + qkv_out, + context, + proj_dgrad.view_as(context), + fp8_dtype_forward, + ctx.aux_ctx_tensors, + FusedAttnBackend["FP8"], + fwd_scale_inverses[META_QKV], # d_scale_qkv, + fwd_scale_inverses[META_S], # d_scale_s, + fwd_scale_inverses[META_O], # d_scale_o, + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO], # d_scale_do + fwd_scales[META_S], # q_scale_s + ctx.fp8_meta['scaling_bwd'].scale[META_DS], # q_scale_ds + ctx.fp8_meta['scaling_bwd'].scale[META_DQKV], # q_scale_dqkv + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DS], # amax_ds + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DQKV], # amax_dqkv + None, + ctx.p_dropout, + ctx.fast_zero_fill, + "qkv_interleaved", + "no_bias", + "padding", + ) + + dqkv_grad_output_c = dqkv.view(-1, 3*ctx.hidden_size) + dqkv_grad_output_c_fp16 = ext.cast_from_fp8(dqkv_grad_output_c, + 
ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, tex.DType.kFloat16) + torch.save(dqkv_grad_output_c_fp16, 'dqkv.pt') + + qkv_bgrad, dqkv_grad_output_t = ext.fp8_transpose_bgrad_fused( + dqkv_grad_output_c, + ctx.fp8_meta["scaling_bwd"], + META_DQKV, + fp8_dtype_backward, + torch.float16, + ) + + # QKV DGRAD + qkv_dgrad = ext.fp8_gemm( + qkv_weight_t_fp8, + fwd_scale_inverses, + tex.FP8FwdTensors.GEMM1_WEIGHT, + fp8_dtype_forward, + dqkv_grad_output_c, + ctx.fp8_meta["scaling_bwd"].scale_inv, + META_DQKV, + fp8_dtype_backward, + torch.float16, + workspace, + use_split_accumulator=_2X_ACC_DGRAD, + ) + # QKV WGRAD + qkv_wgrad = ext.fp8_gemm( + inputmat_t, + fwd_scale_inverses, + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + dqkv_grad_output_t, + ctx.fp8_meta["scaling_bwd"].scale_inv, + META_DQKV, + fp8_dtype_backward, + torch.float16, + workspace, + use_split_accumulator=_2X_ACC_WGRAD, + ) + + return (qkv_dgrad, + qkv_wgrad, + qkv_bgrad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None) + +class DPA_FP8(TransformerEngineBaseModule): + def __init__( + self, + config, + params_dtype: torch.dtype = torch.float32): + super().__init__() + self.p_dropout = config.dropout_p + self.h = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_dim = config.head_dim + self.fast_zero_fill = True + + self.qkv_weight = Parameter( + torch.empty( + self.hidden_size * 3, + self.hidden_size, + device=torch.cuda.current_device(), + dtype=params_dtype, + ) + ) + self.fp8_weight_shapes.append(self.qkv_weight.shape) + self.qkv_bias = Parameter( + torch.empty( + self.hidden_size * 3, + device=torch.cuda.current_device(), + dtype=params_dtype, + ) + ) + with torch.no_grad(): + self.qkv_bias.zero_() + self.qkv_weight.fill_(1.0) + self.workspace = torch.empty( + _CUBLASLT_WORKSPACE_SIZE_BYTES, dtype=torch.int8, device="cuda" + ) + + def forward( + self, inp: torch.Tensor, + cu_seqlens, max_s, + ) -> torch.Tensor: + with 
self.prepare_forward(inp, None, num_gemms=3) as inp: + out = _dpa_fp8.apply( + inp, + self.qkv_weight, + self.qkv_bias, + cu_seqlens, + self.h, + self.p_dropout, + max_s, + self.fast_zero_fill, + self.fp8_meta, + self.workspace, + self.training) + return out + + def get_fp8_weights_scratchpad( + self, + is_first_microbatch: Union[bool, None], + ) -> List[torch.Tensor]: + """Needs override.""" diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index a7653355db..481e1677ee 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -12,9 +12,10 @@ list(APPEND transformer_engine_SOURCES transpose/transpose_fusion.cu transpose/multi_cast_transpose.cu activation/gelu.cu + fused_attn/fused_attn_f16_max512_seqlen.cu + fused_attn/fused_attn_f16_arbitrary_seqlen.cu activation/relu.cu activation/swiglu.cu - fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu fused_attn/fused_attn_fp8.cu fused_attn/fused_attn.cpp fused_attn/utils.cu diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index f1846c49d5..25f62cad09 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -7,8 +7,80 @@ #include "transformer_engine/fused_attn.h" #include "../common.h" #include "utils.h" -#include "fused_attn_fp16_bf16_max_seqlen_512.h" +#include "fused_attn_f16_max512_seqlen.h" +#include "fused_attn_f16_arbitrary_seqlen.h" #include "fused_attn_fp8.h" +#include "../util/cuda_runtime.h" + +// select a backend for fused attention +NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( + NVTEDType q_dtype, + NVTEDType kv_dtype, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + float dropout, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim) { + using namespace transformer_engine; + NVTE_Fused_Attn_Backend backend = 
NVTE_Fused_Attn_Backend::NVTE_No_Backend;
+  const int device_id = cuda::current_device();
+  const int sm_arch_ = cuda::sm_arch(device_id);
+  NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type.");
+  if (((q_dtype == NVTEDType::kNVTEFloat8E4M3) || (q_dtype == NVTEDType::kNVTEFloat8E5M2))
+          && (sm_arch_ >= 90)  // dtype test is parenthesized: && binds tighter than ||
+          && (max_seqlen_q == max_seqlen_kv)
+          && (max_seqlen_q <= 512)
+          && (head_dim == 64)
+          && (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS)
+          && (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)
+          && (qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED)) {
+    backend = NVTE_Fused_Attn_Backend::NVTE_FP8;
+  } else if ((q_dtype == NVTEDType::kNVTEFloat16) || (q_dtype == NVTEDType::kNVTEBFloat16)) {
+    bool flag_m512 = false;  // config is eligible for the F16 max-512-seqlen backend
+    bool flag_arb = false;  // config is eligible for the F16 arbitrary-seqlen backend
+    if ((sm_arch_ >= 80)
+            && (head_dim == 64)
+            && ((bias_type == NVTE_Bias_Type::NVTE_NO_BIAS)
+                || (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS))
+            && ((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK)
+                || (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK)
+                || (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK))
+            && ((qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED)
+                || (qkv_layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED))) {
+      flag_m512 = true;
+    }
+    if ((sm_arch_ >= 80)
+            && (max_seqlen_q == max_seqlen_kv)
+            && ((head_dim == 64) || (head_dim == 128))
+            && (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS)
+            && (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK)
+            && (qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED)) {
+      flag_arb = true;
+    }
+    if (((max_seqlen_q > 512) || (max_seqlen_kv > 512))
+            && (flag_arb == true)) {
+      backend = NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen;
+    }
+    if ((max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) {
+      if (flag_m512 == true) {
+        backend = NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen;
+      } else if ((flag_m512 == false) && (flag_arb == true)) {  // max512 not eligible
+        backend = NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen;
+      }
+    }
+    const char* env_backend = 
std::getenv("NVTE_FUSED_ATTN_BACKEND"); + if ((max_seqlen_q <= 512) && (max_seqlen_kv <= 512) + && (flag_arb == true) + && (env_backend != nullptr) + && (std::string(env_backend) == std::to_string( + NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen))) { + backend = NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; + } + } else { + backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + return backend; +} // NVTE fused attention FWD FP8 with packed QKV void nvte_fused_attn_fwd_qkvpacked( @@ -16,7 +88,7 @@ void nvte_fused_attn_fwd_qkvpacked( const NVTETensor Bias, NVTETensor S, NVTETensor O, - NVTETensorPack* Aux_Output_Tensors, + NVTETensorPack* Aux_CTX_Tensors, const NVTETensor cu_seqlens, const NVTETensor rng_state, size_t max_seqlen, @@ -43,54 +115,56 @@ void nvte_fused_attn_fwd_qkvpacked( size_t d = input_QKV->data.shape[ndim - 1]; auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); - const DType QKV_type = input_QKV->data.dtype; + const NVTEDType QKV_type = static_cast(input_QKV->data.dtype); - if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) - && (max_seqlen <= 512)) { + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + QKV_type, QKV_type, + qkv_layout, bias_type, attn_mask_type, + dropout, max_seqlen, max_seqlen, d); + + if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { +#if (CUDNN_VERSION >= 8901) + fused_attn_max_512_fwd_qkvpacked( + b, max_seqlen, h, d, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_QKV, input_Bias, output_O, + Aux_CTX_Tensors, + input_cu_seqlens, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. 
\n"); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { #if (CUDNN_VERSION >= 8900) - // FP8 API doesn't use input_Bias, bias_type or attn_mask_type - fused_attn_fwd_fp8_qkvpacked( + fused_attn_arbitrary_seqlen_fwd_qkvpacked( + b, max_seqlen, h, d, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_QKV, input_Bias, output_O, + Aux_CTX_Tensors, + input_cu_seqlens, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR( + "cuDNN 8.9.0 is required for BF16/FP16 fused attention with arbitrary sequence length. \n"); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { +#if (CUDNN_VERSION >= 8900) + fused_attn_fp8_fwd_qkvpacked( b, max_seqlen, h, d, is_training, attn_scale, dropout, qkv_layout, input_QKV, input_output_S, output_O, - Aux_Output_Tensors, + Aux_CTX_Tensors, input_cu_seqlens, input_rng_state, wkspace, stream, handle); #else - NVTE_ERROR("cuDNN 8.9 is required to run FP8 fused attention. \n"); + NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); #endif - } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) - && (max_seqlen <= 512)) { -#if (CUDNN_VERSION >= 8901) - fused_attn_max_512_fwd_qkvpacked( - b, - max_seqlen, - h, - d, - is_training, - attn_scale, - dropout, - qkv_layout, - bias_type, - attn_mask_type, - input_QKV, - input_Bias, - output_O, - Aux_Output_Tensors, - input_cu_seqlens, - input_rng_state, - wkspace, - stream, - handle); -#else - NVTE_ERROR( - "cuDNN 8.9.1 is required to run BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (max_seqlen > 512) { - NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); } else { - NVTE_ERROR("Invalid combination of data type and sequence length! \n"); + NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. 
\n"); } } // NVTE fused attention BWD FP8 with packed QKV @@ -130,18 +204,52 @@ void nvte_fused_attn_bwd_qkvpacked( size_t d = input_QKV->data.shape[ndim - 1]; auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); - const DType QKV_type = input_QKV->data.dtype; + const NVTEDType QKV_type = static_cast(input_QKV->data.dtype); - if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) - && (max_seqlen <= 512)) { + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + QKV_type, QKV_type, + qkv_layout, bias_type, attn_mask_type, + dropout, max_seqlen, max_seqlen, d); + + if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { +#if (CUDNN_VERSION >= 8901) + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + fused_attn_max_512_bwd_qkvpacked( + b, max_seqlen, h, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_QKV, input_dO, + output_S, + output_dQKV, output_dBias, + input_cu_seqlens, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { +#if (CUDNN_VERSION >= 8900) + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + fused_attn_arbitrary_seqlen_bwd_qkvpacked( + b, max_seqlen, h, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_QKV, input_O, input_dO, + output_S, + output_dQKV, output_dBias, + input_cu_seqlens, input_rng_state, + wkspace, stream, handle); +#else + const char *err_msg = + "cuDNN 8.9.0 is required for BF16/FP16 fused attention " + "with arbitrary sequence length. 
\n"; + NVTE_ERROR(err_msg); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { #if (CUDNN_VERSION >= 8900) - // Aux_CTX_Tensors contain [M, ZInv, rng_state] generated by the forward pass const Tensor *input_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); const Tensor *input_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); - - // FP8 API doesn't use input_dBias, bias_type or attn_mask_type - fused_attn_bwd_fp8_qkvpacked( + fused_attn_fp8_bwd_qkvpacked( b, max_seqlen, h, d, attn_scale, dropout, qkv_layout, input_QKV, input_O, input_dO, @@ -152,38 +260,10 @@ void nvte_fused_attn_bwd_qkvpacked( input_rng_state, wkspace, stream, handle); #else - NVTE_ERROR("cuDNN 8.9 is required to run FP8 fused attention. \n"); -#endif - } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) - && (max_seqlen <= 512)) { -#if (CUDNN_VERSION >= 8901) - fused_attn_max_512_bwd_qkvpacked( - b, - max_seqlen, - h, - d, - attn_scale, - dropout, - qkv_layout, - bias_type, - attn_mask_type, - input_QKV, - input_dO, - Aux_CTX_Tensors, - output_dQKV, - output_dBias, - input_cu_seqlens, - wkspace, - stream, - handle); -#else - NVTE_ERROR( - "cuDNN 8.9.1 is required to run BF16/FP16 fused attention with max_seqlen<=512. \n"); + NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); #endif - } else if (max_seqlen > 512) { - NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); } else { - NVTE_ERROR("Invalid combination of data type and sequence length! \n"); + NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. 
\n"); } } // NVTE fused attention FWD FP8 with packed KV @@ -193,7 +273,7 @@ void nvte_fused_attn_fwd_kvpacked( const NVTETensor Bias, NVTETensor S, NVTETensor O, - NVTETensorPack* Aux_Output_Tensors, + NVTETensorPack* Aux_CTX_Tensors, const NVTETensor cu_seqlens_q, const NVTETensor cu_seqlens_kv, const NVTETensor rng_state, @@ -223,45 +303,37 @@ void nvte_fused_attn_fwd_kvpacked( size_t d = input_Q->data.shape[ndim - 1]; auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); - const DType QKV_type = input_Q->data.dtype; + const NVTEDType Q_type = static_cast(input_Q->data.dtype); + const NVTEDType KV_type = static_cast(input_KV->data.dtype); - if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) - && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { - NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. \n"); - } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) - && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + Q_type, KV_type, + qkv_layout, bias_type, attn_mask_type, + dropout, max_seqlen_q, max_seqlen_kv, d); + + if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { #if (CUDNN_VERSION >= 8901) - fused_attn_max_512_fwd_kvpacked( - b, - max_seqlen_q, - max_seqlen_kv, - h, - d, - is_training, - attn_scale, - dropout, - qkv_layout, - bias_type, - attn_mask_type, - input_Q, - input_KV, - input_Bias, - output_O, - Aux_Output_Tensors, - input_cu_seqlens_q, - input_cu_seqlens_kv, - input_rng_state, - wkspace, - stream, - handle); + fused_attn_max_512_fwd_kvpacked( + b, max_seqlen_q, max_seqlen_kv, h, d, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_KV, input_Bias, output_O, + Aux_CTX_Tensors, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); #else - NVTE_ERROR( - "cuDNN 8.9.1 is required 
to run BF16/FP16 fused attention with max_seqlen<=512. \n"); + NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); #endif - } else if ((max_seqlen_q > 512) || (max_seqlen_kv > 512)) { - NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { + const char* err_msg = + "The FP16/BF16 fused attention (arbitrary seqlen) currently " + "only supports packed QKV input.\n"; + NVTE_ERROR(err_msg); + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { + NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. \n"); } else { - NVTE_ERROR("Invalid combination of data type and sequence length! \n"); + NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); } } // NVTE fused attention BWD FP8 with packed KV @@ -307,44 +379,37 @@ void nvte_fused_attn_bwd_kvpacked( size_t d = input_Q->data.shape[ndim - 1]; auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); - const DType QKV_type = input_Q->data.dtype; + const NVTEDType Q_type = static_cast(input_Q->data.dtype); + const NVTEDType KV_type = static_cast(input_KV->data.dtype); - if (((QKV_type == DType::kFloat8E4M3) || (QKV_type == DType::kFloat8E5M2)) - && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { - NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. 
\n"); - } else if (((QKV_type == DType::kFloat16) || (QKV_type == DType::kBFloat16)) - && (max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + Q_type, KV_type, + qkv_layout, bias_type, attn_mask_type, + dropout, max_seqlen_q, max_seqlen_kv, d); + + if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { #if (CUDNN_VERSION >= 8901) - fused_attn_max_512_bwd_kvpacked( - b, - max_seqlen_q, - max_seqlen_kv, - h, - d, - attn_scale, - dropout, - qkv_layout, - bias_type, - attn_mask_type, - input_Q, - input_KV, - input_dO, - Aux_CTX_Tensors, - output_dQ, - output_dKV, - output_dBias, - input_cu_seqlens_q, - input_cu_seqlens_kv, - wkspace, - stream, - handle); + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + fused_attn_max_512_bwd_kvpacked( + b, max_seqlen_q, max_seqlen_kv, h, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_KV, input_dO, + output_S, + output_dQ, output_dKV, output_dBias, + input_cu_seqlens_q, input_cu_seqlens_kv, + wkspace, stream, handle); #else - NVTE_ERROR( - "cuDNN 8.9.1 is required to run BF16/FP16 fused attention with max_seqlen<=512. \n"); + NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); #endif - } else if ((max_seqlen_q > 512) || (max_seqlen_kv > 512)) { - NVTE_ERROR("TBD: No support for fused attention with >512 seqlence length currently. \n"); + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { + const char* err_msg = + "The FP16/BF16 fused attention (arbitrary seqlen) currently " + "only supports packed QKV input.\n"; + NVTE_ERROR(err_msg); + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { + NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. \n"); } else { - NVTE_ERROR("Invalid combination of data type and sequence length! 
\n"); + NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); } } diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu new file mode 100644 index 0000000000..88e006fb4e --- /dev/null +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -0,0 +1,1304 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "fused_attn_f16_arbitrary_seqlen.h" + +#include +#include +#include +#include +#include + +#include "../common.h" +#include "utils.h" + +#if (CUDNN_VERSION >= 8900) +#define Q_ID 1 +#define K_ID 2 +#define V_ID 3 +#define O_ID 4 +#define S_ID 5 +#define B_ID 6 +#define D_CONST_ID 7 +#define S_CONST_ID 8 +#define Q_SEQLEN_ID 9 +#define K_SEQLEN_ID 10 +#define dQ_ID 11 +#define dK_ID 12 +#define dV_ID 13 +#define dO_ID 14 +#define MASK_VAL_ID 15 +#define dS_ID 16 +#define D_SEED_ID 17 +#define D_OFFSET_ID 18 +#define S_STATS_ID 19 +#define S_SUM_ID 20 +#define SCALE_PROB 21 +#define K_TRANSPOSE_ID 22 +#define dQ_ACCUM_ID 23 + +#define VIRTUAL_ID 30 + +namespace transformer_engine { +namespace fused_attn { + +static cudnn_frontend::Tensor +createScale(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, cudnnDataType_t tensorType, + const cudnn_frontend::Tensor& sTensor, + std::vector* ops) { + // scale + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + int64_t s_dim[4] = {b, h, s_q, s_kv}; + int64_t s_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, s_stride, layout, NVTE_QKV_Matrix::NVTE_S_Matrix); + + auto scaleTensor = tensor_create( + tensorType, S_CONST_ID, scale_dim, + scale_stride, 
false, true);  // is by value
+    auto sScaleTensor = tensor_create(
+            tensorType, VIRTUAL_ID + 2000, s_dim,
+            s_stride, true, false);  // is virtual
+
+    // Define the scale descriptor
+    auto scaleDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL);
+
+    // Create a scale node
+    auto scale_op = binary_pw_op_create(sTensor, scaleTensor, sScaleTensor, scaleDesc);
+
+    ops->push_back(std::move(scale_op));
+    return sScaleTensor;
+}
+
+static cudnn_frontend::Tensor
+createQKBMM(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d,
+            NVTE_QKV_Layout layout, cudnnDataType_t tensorType,
+            std::vector* ops) {
+    // Q is (b, h, s_q, d) and K^T is (b, h, d, s_kv), so S = Q * K^T is (b, h, s_q, s_kv)
+    int64_t q_dim[4] = {b, h, s_q, d};
+    int64_t q_stride[4];
+    generateMatrixStrides(b, h, s_q, s_kv, d, q_stride, layout, NVTE_QKV_Matrix::NVTE_Q_Matrix);
+
+    int64_t k_dim[4] = {b, h, d, s_kv};
+    int64_t k_stride[4];
+    generateMatrixStrides(
+            b, h, s_q, s_kv, d, k_stride, layout, NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose);
+
+    int64_t s_dim[4] = {b, h, s_q, s_kv};
+    int64_t s_stride[4];
+    generateMatrixStrides(b, h, s_q, s_kv, d, s_stride, layout, NVTE_QKV_Matrix::NVTE_S_Matrix);
+
+    auto qTensor = tensor_create(tensorType, Q_ID, q_dim, q_stride, false, false);
+    auto kTransposeTensor = tensor_create(
+            tensorType, K_ID, k_dim, k_stride, false, false);  // not virtual
+    // first GEMM output, appended to ops and returned to the caller
+    auto sTensor = tensor_create(
+            CUDNN_DATA_FLOAT, VIRTUAL_ID + 1, s_dim, s_stride, true, false);  // is virtual
+
+    // Define the matmul 1 desc
+    auto matmul_1_Desc = cudnn_frontend::MatMulDescBuilder()
+                            .setComputeType(CUDNN_DATA_FLOAT)
+                            .build();
+
+    // Create a matmul 1 node
+    auto matmul_op1 = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR)
+                            .setaMatDesc(qTensor)
+                            .setbMatDesc(kTransposeTensor)
+                            .setcMatDesc(sTensor)
+                            .setmatmulDesc(matmul_1_Desc)
+                            .build();
+
+    ops->push_back(std::move(matmul_op1));
+
+    return sTensor;
+}
+
+static cudnn_frontend::Tensor
+createCausalMask(int64_t b, 
int64_t h, int64_t s_q, int64_t s_kv, int64_t d,
+                 NVTE_QKV_Layout layout, cudnnDataType_t tensorType,
+                 std::vector* ops,
+                 const cudnn_frontend::Tensor& prevBlockOutputTensor) {
+    CUDNN_FRONTEND_UNUSED(d);
+    CUDNN_FRONTEND_UNUSED(layout);
+    CUDNN_FRONTEND_UNUSED(tensorType);
+
+    NVTE_CHECK(ops->size() != 0, "Causal Mask constructed incorrectly as the first one");
+
+    // dims and packed strides of the (b, h, s_q, s_kv) tensor being masked
+    int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv};
+    int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1};
+
+    int64_t maskVal_dim[4] = {1, 1, 1, 1};
+    int64_t maskVal_stride[4] = {1, 1, 1, 1};
+
+    // mask value to put in the masked pixels
+    auto maskValTensor = tensor_create(
+            CUDNN_DATA_FLOAT, MASK_VAL_ID, maskVal_dim,
+            maskVal_stride, false, true);  // is by value
+    // gen index row output
+    auto rowIndexTensor = tensor_create(
+            CUDNN_DATA_FLOAT, VIRTUAL_ID + 100, afterBMM1_dim,
+            afterBMM1_stride, true, false);  // is virtual
+    // gen index column output
+    auto columnIndexTensor = tensor_create(
+            CUDNN_DATA_FLOAT, VIRTUAL_ID + 101, afterBMM1_dim,
+            afterBMM1_stride, true, false);  // is virtual
+    // create causal mask (row >= col)
+    auto causalMaskTensor = tensor_create(
+            CUDNN_DATA_BOOLEAN, VIRTUAL_ID + 106, afterBMM1_dim,
+            afterBMM1_stride, true, false);  // is virtual
+
+    // output after masking
+    auto maskOutputTensor = tensor_create(
+            CUDNN_DATA_FLOAT, VIRTUAL_ID + 107, afterBMM1_dim,
+            afterBMM1_stride, true, false);  // is virtual
+
+    // Define the gen index for row descriptor (axis 2 is s_q)
+    auto genIndexRowDesc = cudnn_frontend::PointWiseDescBuilder()
+                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
+                            .setAxis(2)
+                            .setComputeType(CUDNN_DATA_FLOAT)
+                            .build();
+
+    // Create a gen index node
+    auto genIndexRow_op = unary_pw_op_create(
+            prevBlockOutputTensor, rowIndexTensor, genIndexRowDesc);
+
+    // Define the gen index for column descriptor (axis 3 is s_kv)
+    auto genIndexColumnDesc = cudnn_frontend::PointWiseDescBuilder()
+                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
+                            .setAxis(3) +
.setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a gen index node + auto genIndexColumn_op = unary_pw_op_create( + prevBlockOutputTensor, columnIndexTensor, genIndexColumnDesc); + + // Define the greater than equal to comparison descriptor + auto rowGreaterColDesc = pw_desc_create(CUDNN_DATA_BOOLEAN, CUDNN_POINTWISE_CMP_GE); + + // Create a greater than equal to node + auto rowGreaterCol_op = binary_pw_op_create( + rowIndexTensor, columnIndexTensor, causalMaskTensor, rowGreaterColDesc); + + // Define the binary select to perform masking descriptor + auto maskDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_BINARY_SELECT); + + // Create a binary select node + auto mask_op = ternary_pw_op_create( + prevBlockOutputTensor, maskValTensor, + causalMaskTensor, maskOutputTensor, maskDesc); + + ops->push_back(std::move(genIndexRow_op)); + ops->push_back(std::move(genIndexColumn_op)); + ops->push_back(std::move(rowGreaterCol_op)); + ops->push_back(std::move(mask_op)); + + return maskOutputTensor; +} + +static cudnn_frontend::Tensor +createSoftmaxForward(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, bool isTraining, + std::vector* ops, + const cudnn_frontend::Tensor& sAfterMaskTensor) { + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t afterReduction_dim[4] = {b, h, s_q, 1}; + int64_t afterReduction_stride[4] = {h * s_q, s_q, 1, 1}; + + // max (x) + auto afterMaxReductionTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 150, afterReduction_dim, + afterReduction_stride, true, false); // is virtual + + // x - max(x) + auto afterSubtractionTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 151, afterBMM1_dim, + afterBMM1_stride, true, false); // is virtual + + // e^(x - max(x)) + auto afterExponentTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 152, afterBMM1_dim, + afterBMM1_stride, true, false); // is virtual; + + // sum (e^(x - max(x))) + auto 
afterAddReductionTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 153, afterReduction_dim, + afterReduction_stride, true, false); // is virtual + + // log (sum (e^(x - max(x)))) + auto afterLogLTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 154, afterReduction_dim, + afterReduction_stride, true, false); + + // M + log (sum (e^(x - max(x)))) + auto softmaxStatsTensor = tensor_create( + CUDNN_DATA_FLOAT, S_STATS_ID, afterReduction_dim, + afterReduction_stride, !isTraining, false); + // not virtual if training is true, virtual if training is false + + // divide (e/ sum(e)) + auto afterSoftmaxTensor = cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(VIRTUAL_ID + 156) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(CUDNN_DATA_FLOAT) + .setVirtual(true) + .setByValue(false) + .setReorderType( + cudnn_frontend::cudnnBackendTensorReordering_t::CUDNN_TENSOR_REORDERING_F16x16) + .build(); + + // Define the reduction descriptor + auto reductionMaxDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_MAX) + .build(); + + // Create a reduction max node + auto reductionMax_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(sAfterMaskTensor) + .setyDesc(afterMaxReductionTensor) + .setreductionDesc(reductionMaxDesc) + .build(); + + // Define the subtract descriptor + auto subtractDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + + // Create a subtract node + auto subtract_op = binary_pw_op_create( + sAfterMaskTensor, afterMaxReductionTensor, + afterSubtractionTensor, subtractDesc); + + // Define the exponent descriptor + auto exponentDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_EXP); + + // Create a exponent node + auto exponent_op = unary_pw_op_create( + afterSubtractionTensor, afterExponentTensor, exponentDesc); + + // Define the 
reduction descriptor
+    auto reductionAddDesc = cudnn_frontend::ReductionDescBuilder()
+                            .setComputeType(CUDNN_DATA_FLOAT)
+                            .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
+                            .build();
+
+    // Create a reduction add node
+    auto reductionAdd_op = cudnn_frontend::OperationBuilder(
+                                CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
+                            .setxDesc(afterExponentTensor)
+                            .setyDesc(afterAddReductionTensor)
+                            .setreductionDesc(reductionAddDesc)
+                            .build();
+
+    // Create log descriptor
+    auto logDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_LOG);
+
+    // Create log node
+    auto log_op = unary_pw_op_create(afterAddReductionTensor, afterLogLTensor, logDesc);
+
+    // Create add descriptor
+    auto addDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_ADD);
+
+    // Create add node: softmax stats = max(x) + log(sum(e^(x - max(x))))
+    auto add_op = binary_pw_op_create(
+            afterMaxReductionTensor, afterLogLTensor,
+            softmaxStatsTensor, addDesc);
+
+    // Define the division descriptor
+    auto divisionDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_DIV);
+
+    // Create a division node: e^(x - max(x)) / sum(e^(x - max(x)))
+    auto division_op = binary_pw_op_create(
+            afterExponentTensor, afterAddReductionTensor,
+            afterSoftmaxTensor, divisionDesc);
+
+    ops->push_back(std::move(reductionMax_op));
+    ops->push_back(std::move(subtract_op));
+    ops->push_back(std::move(exponent_op));
+    ops->push_back(std::move(reductionAdd_op));
+    ops->push_back(std::move(log_op));
+    ops->push_back(std::move(add_op));
+    ops->push_back(std::move(division_op));
+
+    return afterSoftmaxTensor;
+}
+
+static cudnn_frontend::Tensor
+createDropoutForward(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d,
+                     double probability, cudnnDataType_t tensorType,
+                     std::vector* ops,
+                     const cudnn_frontend::Tensor& afterSoftmaxTensor) {
+    CUDNN_FRONTEND_UNUSED(d);
+
+    NVTE_CHECK(ops->size() != 0, "Dropout DAG constructed incorrectly as the first one");
+
+    int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv};
+    int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1};
+
+    int64_t scale_dim[4] = {1, 1, 
1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + auto dropoutSeed = tensor_create( + CUDNN_DATA_INT64, D_SEED_ID, scale_dim, + scale_stride, false, false); // not virtual + auto dropoutOffset = tensor_create( + CUDNN_DATA_INT64, D_OFFSET_ID, scale_dim, + scale_stride, false, false); // not virtual + + // mask for the dropout + auto dropoutMaskTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 200, afterBMM1_dim, + afterBMM1_stride, true, false); // is virtual + // after dropout tensor + auto afterDropoutTensor = cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(VIRTUAL_ID + 201) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(tensorType) + .setVirtual(true) + .setByValue(false) + .setReorderType( + cudnn_frontend::cudnnBackendTensorReordering_t::CUDNN_TENSOR_REORDERING_F16x16) + .build(); + // scale after dropout + auto scaleDropoutTensor = tensor_create( + tensorType, D_CONST_ID, scale_dim, + scale_stride, false, true); // is by value + // after Scale + auto afterScaleTensor = tensor_create( + tensorType, VIRTUAL_ID + 202, afterBMM1_dim, + afterBMM1_stride, true, false); // is virtual + + // Define the reduction descriptor + auto rngDesc = cudnn_frontend::RngDescBuilder() + .setRngDistribution(CUDNN_RNG_DISTRIBUTION_BERNOULLI) + .setBernoulliDistProbability(1.0 - probability) + .build(); + + // Create a rng node + auto rng_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) + .setyDesc(dropoutMaskTensor) + .setSeedDesc(dropoutSeed) + .setOffsetDesc(dropoutOffset) + .setRngDesc(rngDesc) + .build(); + + // Define the multiply mask descriptor + auto maskMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask node + auto maskMul_op = binary_pw_op_create( + afterSoftmaxTensor, dropoutMaskTensor, + afterDropoutTensor, maskMulDesc); + + // Define the multiply scale descriptor + auto scaleMulDesc = 
pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply scale node + auto scaleMul_op = binary_pw_op_create( + afterDropoutTensor, scaleDropoutTensor, + afterScaleTensor, scaleMulDesc); + + ops->push_back(std::move(rng_op)); + ops->push_back(std::move(maskMul_op)); + ops->push_back(std::move(scaleMul_op)); + + return afterScaleTensor; +} + +static cudnn_frontend::Tensor +createDropoutBackward(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + double probability, cudnnDataType_t tensorType, + std::vector* ops, + const cudnn_frontend::Tensor& afterSoftmaxTensor, + const cudnn_frontend::Tensor& dropoutMaskTensor) { + CUDNN_FRONTEND_UNUSED(d); + + NVTE_CHECK(ops->size() != 0, "Dropout DAG constructed incorrectly as the first one"); + + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + auto dropoutSeed = tensor_create( + CUDNN_DATA_INT64, D_SEED_ID, scale_dim, + scale_stride, false, false); // not virtual + auto dropoutOffset = tensor_create( + CUDNN_DATA_INT64, D_OFFSET_ID, scale_dim, + scale_stride, false, false); // not virtual + + // after dropout tensor + auto afterDropoutTensor = cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(VIRTUAL_ID + 201) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine + .setDataType(tensorType) + .setVirtual(true) + .setByValue(false) + .setReorderType( + cudnn_frontend::cudnnBackendTensorReordering_t::CUDNN_TENSOR_REORDERING_F16x16) + .build(); + // scale after dropout + auto scaleDropoutTensor = tensor_create( + tensorType, D_CONST_ID, scale_dim, + scale_stride, false, true); // is by value + // after Scale + auto afterScaleTensor = tensor_create( + tensorType, VIRTUAL_ID + 202, afterBMM1_dim, + afterBMM1_stride, true, false); // is virtual + + // Define the reduction 
descriptor + auto rngDesc = cudnn_frontend::RngDescBuilder() + .setRngDistribution(CUDNN_RNG_DISTRIBUTION_BERNOULLI) + .setBernoulliDistProbability(1.0 - probability) + .build(); + + // Create a rng node + auto rng_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) + .setyDesc(dropoutMaskTensor) + .setSeedDesc(dropoutSeed) + .setOffsetDesc(dropoutOffset) + .setRngDesc(rngDesc) + .build(); + + // Define the multiply mask descriptor + auto maskMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask node + auto maskMul_op = binary_pw_op_create( + afterSoftmaxTensor, dropoutMaskTensor, + afterDropoutTensor, maskMulDesc); + + // Define the multiply scale descriptor + auto scaleMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply scale node + auto scaleMul_op = binary_pw_op_create( + afterDropoutTensor, scaleDropoutTensor, + afterScaleTensor, scaleMulDesc); + + ops->push_back(std::move(rng_op)); + ops->push_back(std::move(maskMul_op)); + ops->push_back(std::move(scaleMul_op)); + + return afterScaleTensor; +} + +static void +createSVBMM(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + NVTE_QKV_Layout layout, cudnnDataType_t tensorType, + std::vector* ops, + cudnn_frontend::Tensor const &afterScaleDropoutTensor) { + NVTE_CHECK(ops->size() != 0, "BMM2 op constructed incorrectly as the first one"); + + int64_t v_dim[4] = {b, h, s_kv, d}; + int64_t v_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, v_stride, layout, NVTE_QKV_Matrix::NVTE_V_Matrix); + + int64_t o_dim[4] = {b, h, s_q, d}; + int64_t o_stride[4]; + generateMatrixStrides(b, h, s_q, s_kv, d, o_stride, layout, NVTE_QKV_Matrix::NVTE_O_Matrix); + + auto vTensor = tensor_create(tensorType, V_ID, v_dim, v_stride, false, false); + // second GEMM output + auto oTensor = tensor_create(tensorType, O_ID, o_dim, o_stride, false, false); + + // Define the matmul 2 desc + auto matmul_2_Desc = 
cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a matmul 2 node + auto matmul_op2 = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(afterScaleDropoutTensor) + .setbMatDesc(vTensor) + .setcMatDesc(oTensor) + .setmatmulDesc(matmul_2_Desc) + .build(); + + ops->push_back(std::move(matmul_op2)); +} + +void fused_attn_arbitrary_seqlen_fwd_impl( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + bool is_training, float scaling_factor, float dropout_probability, + NVTE_QKV_Layout layout, + void *devPtrQ, void *devPtrK, void *devPtrV, + void *devPtrSoftmaxStats, void *devPtrO, + void* devPtrDropoutSeed, void* devPtrDropoutOffset, + cudnnDataType_t tensorType, + void *workspace, size_t *workspace_size, + cudaStream_t stream, cudnnHandle_t handle) { + try { + NVTE_CHECK_CUDNN(cudnnSetStream(handle, stream)); + + if (!is_training) { + dropout_probability = 0.0f;  // inference: force dropout off (assignment, not comparison) + } + + FADescriptor descriptor{b, h, + s_q, s_kv, + d, scaling_factor, + is_training, dropout_probability, + layout, NVTE_Bias_Type::NVTE_NO_BIAS, + NVTE_Mask_Type::NVTE_CAUSAL_MASK, tensorType}; + + using CacheType = std::map; + static thread_local CacheType fmha_fprop_cache; + + // Get plan from cache if cache is available, otherwise create one + auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { + // if hit, return + auto it = cache.find(descriptor); + if (it != cache.end()) { + auto plan = it->second; + return plan; + } + + // otherwise, build the op_graph and the plan. 
Then update cache + std::vector all_ops; + std::vector ops; + + // Q * K^T + auto sTensor = createQKBMM(b, h, s_q, s_kv, d, layout, tensorType, &ops); + + // Q * K^T * bmmScale + auto sScaleTensor = createScale( + b, h, s_q, s_kv, d, layout, CUDNN_DATA_FLOAT, sTensor, &ops); + + // Causual mask + auto sAfterMaskTensor = createCausalMask( + b, h, s_q, s_kv, d, layout, tensorType, &ops, sScaleTensor); + + NVTE_CHECK(dropout_probability != 1.0f, + "Dropout probability cannot be 1.0"); + + auto softmax_output = createSoftmaxForward( + b, h, s_q, s_kv, is_training, &ops, sAfterMaskTensor); + + // Dropout(softmax) + auto dropout_output = createDropoutForward( + b, h, s_q, s_kv, d, + dropout_probability, tensorType, &ops, softmax_output); + createSVBMM(b, h, s_q, s_kv, d, layout, tensorType, &ops, dropout_output); + + for (unsigned int i = 0; i < ops.size(); i++) { + all_ops.push_back(&ops[i]); + } + + // Create an Operation Graph + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(all_ops.size(), all_ops.data()) + .build(); + + cudnn_frontend::EngineConfigList filtered_configs; + auto statuses = cudnn_frontend::get_heuristics_list<1>( + {"heuristics_instant"}, opGraph, allowAllConfig, + filtered_configs, true); + + if (filtered_configs.size() == 0) { + cudnn_frontend::set_error_and_throw_exception( + nullptr, + CUDNN_STATUS_NOT_SUPPORTED, + "run_mha_fprop: No config returned by the heuristics"); + } + + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(filtered_configs[0], opGraph.getTag()) + .build(); + + cache.insert({descriptor, plan}); + return plan; + }; + + auto plan = get_plan(fmha_fprop_cache, descriptor); + + auto plan_workspace_size = plan.getWorkspaceSize(); + + // Exit to request upper level API to allocate memory if needed + if (workspace == nullptr) { + *workspace_size = plan_workspace_size; + return; + } + + std::set> data_ptrs; + // Add all the data pointers to be used 
in the variant pack + float negInfinity = -1.0E+10f; + float scale_dropout = 1.0f/(1.0f - dropout_probability); + + data_ptrs.insert(std::pair(Q_ID, devPtrQ)); + data_ptrs.insert(std::pair(K_ID, devPtrK)); + data_ptrs.insert(std::pair(V_ID, devPtrV)); + data_ptrs.insert(std::pair(MASK_VAL_ID, &negInfinity)); + data_ptrs.insert(std::pair(S_CONST_ID, &scaling_factor)); + data_ptrs.insert(std::pair(O_ID, devPtrO)); + data_ptrs.insert(std::pair(D_SEED_ID, devPtrDropoutSeed)); + data_ptrs.insert(std::pair(D_OFFSET_ID, devPtrDropoutOffset)); + data_ptrs.insert(std::pair(D_CONST_ID, &scale_dropout)); + + // If training mode, we write out softmax stats + if (is_training) { + data_ptrs.insert(std::pair(S_STATS_ID, devPtrSoftmaxStats)); + } + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace) + .setDataPointers(data_ptrs) + .build(); + + NVTE_CHECK_CUDNN( + cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + } catch (cudnn_frontend::cudnnException &e) { + NVTE_ERROR(e.what()); + } +} + +void fused_attn_arbitrary_seqlen_bwd_impl( + int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, + float scaling_factor, float dropout_probability, NVTE_QKV_Layout layout, + void* devPtrQ, void* devPtrKTranspose, void* devPtrVTranspose, + void* devPtrO, void* devPtrSoftmaxStats, + void* devPtrdQ, void* devPtrdK, void* devPtrdV, void* devPtrdO, + void* devPtrDropoutSeed, void* devPtrDropoutOffset, + cudnnDataType_t tensorType, void *workspace, size_t *workspace_size, + cudaStream_t stream, cudnnHandle_t handle) { + try { + NVTE_CHECK_CUDNN(cudnnSetStream(handle, stream)); + + FADescriptor descriptor{b, h, + s_q, s_kv, + d, scaling_factor, + true, dropout_probability, + layout, NVTE_Bias_Type::NVTE_NO_BIAS, + NVTE_Mask_Type::NVTE_CAUSAL_MASK, tensorType}; + + using CacheType = std::map; + static thread_local CacheType fmha_bprop_cache; + + auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { + auto 
it = cache.find(descriptor); + if (it != cache.end()) { + return it->second; + } + + std::vector all_ops; + std::vector ops; + + // Creates the necessary tensor descriptors + int64_t q_dim[4] = {b, h, s_q, d}; + int64_t q_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, q_stride, + layout, NVTE_QKV_Matrix::NVTE_Q_Matrix); + + int64_t k_transpose_dim[4] = {b, h, d, s_kv}; + int64_t k_transpose_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, k_transpose_stride, + layout, NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose); + + int64_t v_transpose_dim[4] = {b, h, d, s_kv}; + int64_t v_transpose_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, v_transpose_stride, + layout, NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose); + + int64_t p_dim[4] = {b, h, s_q, s_kv}; + int64_t p_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, p_stride, + layout, NVTE_QKV_Matrix::NVTE_S_Matrix); + + int64_t p_transpose_dim[4] = {b, h, s_kv, s_q}; + int64_t p_transpose_stride[4]; + p_transpose_stride[0] = p_stride[0]; + p_transpose_stride[1] = p_stride[1]; + p_transpose_stride[2] = p_stride[3]; + p_transpose_stride[3] = p_stride[2]; + + int64_t o_dim[4] = {b, h, s_q, d}; + int64_t o_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, o_stride, + layout, NVTE_QKV_Matrix::NVTE_O_Matrix); + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + /******************************************************************************* + * Dot product dO * O */ + + // output and gradient of the output + auto oTensor = tensor_create(tensorType, O_ID, o_dim, o_stride, false, false); + auto dOTensor = tensor_create(tensorType, dO_ID, o_dim, o_stride, false, false); + + auto dotProductTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID, o_dim, + o_stride, true, false); // is virtual + + // Create pointwise mul + auto multiplyDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // do * O + auto dotProductOp = binary_pw_op_create( + 
dOTensor, oTensor, dotProductTensor, multiplyDesc); + ops.push_back(std::move(dotProductOp)); + + /******************************************************************************* + * Reduction(dO * O) */ + + int64_t reduction_dim[4] = {b, h, s_q, 1}; + int64_t reduction_stride[4] = {h * s_q, s_q, 1, 1}; + + // reduction(dO * O): row-sum over the head dimension, feeds softmaxSum + auto afterReductionTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 1, reduction_dim, + reduction_stride, true, false); // is virtual + auto reductionAddDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_ADD) + .build(); + + // Create a reduction add node + auto reductionAdd_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(dotProductTensor) + .setyDesc(afterReductionTensor) + .setreductionDesc(reductionAddDesc) + .build(); + ops.push_back(std::move(reductionAdd_op)); + + + /******************************************************************************* + * reduction(dO * O) * scale prob -> softmaxSum */ + + auto softmaxSumTensor = tensor_create( + CUDNN_DATA_FLOAT, S_SUM_ID, reduction_dim, + reduction_stride, false, false); // not virtual + auto scaleProbTensor = tensor_create( + CUDNN_DATA_FLOAT, SCALE_PROB, scale_dim, + scale_stride, false, true); // not virtual and by value + auto softmaxSumOp = binary_pw_op_create( + afterReductionTensor, scaleProbTensor, + softmaxSumTensor, multiplyDesc); + ops.push_back(std::move(softmaxSumOp)); + + /******************************************************************************* + * Q @ K.T -> P */ + + // Inputs from fprop + auto qTensor = tensor_create(tensorType, Q_ID, q_dim, q_stride, false, false); + auto kTransposeTensor = tensor_create( + tensorType, K_ID, k_transpose_dim, + k_transpose_stride, false, false); + auto pTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 2, p_dim, + p_stride, true, false); // is virtual + + // matmul to calculate pTensor (Q @ K.T) + auto matmul_0_Desc = 
cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + auto matmul_op0 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(qTensor) + .setbMatDesc(kTransposeTensor) + .setcMatDesc(pTensor) + .setmatmulDesc(matmul_0_Desc) + .build(); + + ops.push_back(std::move(matmul_op0)); + + /******************************************************************************* + * P * bmmScale -> pAfterScale */ + + auto bmmScaleTensor = tensor_create( + CUDNN_DATA_FLOAT, S_CONST_ID, scale_dim, + scale_stride, false, true); // not virtual and by value + auto pAfterScaleTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 2000, p_dim, + p_stride, true, false); // virtual + auto scaleOp = binary_pw_op_create( + pTensor, bmmScaleTensor, pAfterScaleTensor, multiplyDesc); + ops.push_back(std::move(scaleOp)); + + /******************************************************************************* + * Causal masking -> pAfterMaskTensor */ + + auto pAfterMaskTensor = createCausalMask( + b, h, s_q, s_kv, d, layout, tensorType, &ops, pAfterScaleTensor); + + /******************************************************************************* + * pAfterMaskTensor - softmaxStats -> pAfterSubtract */ + + auto pAfterSubtractTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 3, p_dim, + p_stride, true, false); // is virtual + auto softmaxStatsTensor = tensor_create( + CUDNN_DATA_FLOAT, S_STATS_ID, reduction_dim, + reduction_stride, false, false); // not virtual + auto subtractDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + auto subtract_op = binary_pw_op_create( + pAfterMaskTensor, softmaxStatsTensor, + pAfterSubtractTensor, subtractDesc); + ops.push_back(std::move(subtract_op)); + + /******************************************************************************* + * e^(pAfterSubtract) -> pAfterSoftmax */ + + auto pAfterSoftmaxTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 4, p_dim, + p_stride, 
true, false); // is virtual + auto expDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_EXP); + auto exp_op = unary_pw_op_create( + pAfterSubtractTensor, pAfterSoftmaxTensor, expDesc); + ops.push_back(std::move(exp_op)); + + /******************************************************************************* + * Dropout -> afterScaleDropout */ + + auto dropoutMaskTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 5, p_dim, + p_stride, true, false); // is virtual + auto afterScaleDropoutTensor = createDropoutBackward( + b, h, s_q, s_kv, d, dropout_probability, tensorType, + &ops, pAfterSoftmaxTensor, dropoutMaskTensor); + + /******************************************************************************* + * afterScaleDropout -> sTransposeTensor */ + + auto sTransposeTensor = tensor_create( + tensorType, VIRTUAL_ID + 6, p_transpose_dim, + p_transpose_stride, true, false); // is virtual + auto reshape_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(afterScaleDropoutTensor) + .setyDesc(sTransposeTensor) + .build(); + ops.push_back(std::move(reshape_op)); + + // Outputs of bprop + int64_t dqkv_dim[4] = {b, h, s_kv, d}; + int64_t dqkv_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, dqkv_stride, + layout, NVTE_QKV_Matrix::NVTE_Q_Matrix); + + // Outputs of backprop + auto dQTensor = tensor_create(tensorType, dQ_ID, dqkv_dim, dqkv_stride, false, false); + auto dKTensor = tensor_create(tensorType, dK_ID, dqkv_dim, dqkv_stride, false, false); + auto dVTensor = tensor_create(tensorType, dV_ID, dqkv_dim, dqkv_stride, false, false); + // not virtual + + /******************************************************************************* + * sTransposeTensor @ dO -> dV */ + + auto matmul_1_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + auto matmul_op1 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(sTransposeTensor) + 
.setbMatDesc(dOTensor) + .setcMatDesc(dVTensor) + .setmatmulDesc(matmul_1_Desc) + .build(); + + ops.push_back(std::move(matmul_op1)); + + /******************************************************************************* + * dO @ V.T -> dS */ + + auto vTransposeTensor = tensor_create( + tensorType, V_ID, v_transpose_dim, + v_transpose_stride, false, false); + auto dSTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 7, p_dim, + p_stride, true, false); // is virtual + + auto matmul_2_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + auto matmul_op2 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(dOTensor) + .setbMatDesc(vTransposeTensor) + .setcMatDesc(dSTensor) + .setmatmulDesc(matmul_2_Desc) + .build(); + + ops.push_back(std::move(matmul_op2)); + + /******************************************************************************* + * dS * dropoutMask -> dSAfterDropout */ + + auto dSAfterDropoutTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 8, p_dim, + p_stride, true, false); // is virtual + auto multiply_op = binary_pw_op_create( + dSTensor, dropoutMaskTensor, + dSAfterDropoutTensor, multiplyDesc); + ops.push_back(std::move(multiply_op)); + + /******************************************************************************* + * dSAfterDropout - softmaxSum -> dsAfterSubtract */ + + auto dsAfterSubtractTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 9, p_dim, + p_stride, true, false); // is virtual + auto subtract_op2 = binary_pw_op_create( + dSAfterDropoutTensor, softmaxSumTensor, + dsAfterSubtractTensor, subtractDesc); + ops.push_back(std::move(subtract_op2)); + + /******************************************************************************* + * dsAfterSubtract * afterSoftmax -> dP */ + + auto dPTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 10, p_dim, + p_stride, true, false); // is virtual + auto multiply_op2 = binary_pw_op_create( + 
dsAfterSubtractTensor, pAfterSoftmaxTensor, + dPTensor, multiplyDesc); + ops.push_back(std::move(multiply_op2)); + + /******************************************************************************* + * dP * scaleDropout -> dPAfterDropoutScale */ + + auto dPAfterDropoutScaleTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 11, p_dim, + p_stride, true, false); // is virtual + auto scaleDropoutTensor = tensor_create( + CUDNN_DATA_FLOAT, D_CONST_ID, scale_dim, + scale_stride, false, true); // is by value + auto multiply_op3 = binary_pw_op_create( + dPTensor, scaleDropoutTensor, + dPAfterDropoutScaleTensor, multiplyDesc); + ops.push_back(std::move(multiply_op3)); + + /******************************************************************************* + * dPAfterDropoutScale * bmmScale -> dPScaledTensor */ + + auto dPScaledTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 12, p_dim, + p_stride, true, false); // is virtual + auto multiply_op4 = binary_pw_op_create( + dPAfterDropoutScaleTensor, bmmScaleTensor, + dPScaledTensor, multiplyDesc); + ops.push_back(std::move(multiply_op4)); + + /******************************************************************************* + * K.T -> K */ + + int64_t kDim[4] = {b, h, s_kv, d}; + int64_t kStride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, kStride, + layout, NVTE_QKV_Matrix::NVTE_K_Matrix); + auto kTensor = tensor_create( + tensorType, VIRTUAL_ID + 13, kDim, + kStride, true, false); // is virtual + auto reshape_op2 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(kTransposeTensor) + .setyDesc(kTensor) + .build(); + ops.push_back(std::move(reshape_op2)); + + /******************************************************************************* + * dP @ K -> dqAccumTensor */ + + auto dqAccumTensor = cudnn_frontend::TensorBuilder() + .setDim(4, dqkv_dim) + .setStride(4, dqkv_stride) + .setId(dQ_ACCUM_ID) + .setAlignment(16) // 16B alignment is needed to run a tensor core engine 
+ .setDataType(CUDNN_DATA_FLOAT) + .setVirtual(false) + .setByValue(false) + .setReorderType( + cudnn_frontend::cudnnBackendTensorReordering_t::CUDNN_TENSOR_REORDERING_F16x16) + .build(); + + auto matmul_3_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + auto matmul_op3 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(dPScaledTensor)  // dP with dropout scale and bmm scale applied + .setbMatDesc(kTensor) + .setcMatDesc(dqAccumTensor) + .setmatmulDesc(matmul_3_Desc) + .build(); + + ops.push_back(std::move(matmul_op3)); + + /******************************************************************************* + * dPScaled.T @ Q -> dK */ + + auto dPTransposeTensor = tensor_create( + CUDNN_DATA_FLOAT, VIRTUAL_ID + 14, p_transpose_dim, + p_transpose_stride, true, false); // is virtual + auto reshape_op3 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(dPScaledTensor)  // transpose the scaled dP so dK carries the same scaling as dQ + .setyDesc(dPTransposeTensor) + .build(); + ops.push_back(std::move(reshape_op3)); + + auto matmul_4_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + auto matmul_op4 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(dPTransposeTensor) + .setbMatDesc(qTensor) + .setcMatDesc(dKTensor) + .setmatmulDesc(matmul_4_Desc) + .build(); + + ops.push_back(std::move(matmul_op4)); + + /******************************************************************************* + * dqAccumTensor @ identity -> dqTensor */ + + auto identityDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_IDENTITY); + auto identity_op = unary_pw_op_create(dqAccumTensor, dQTensor, identityDesc); + ops.push_back(std::move(identity_op)); + + for (unsigned int i = 0; i < ops.size(); i++) { + all_ops.push_back(&ops[i]); + } + + // Create an Operation Graph + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(all_ops.size(), all_ops.data()) + .build(); + + 
cudnn_frontend::EngineConfigList filtered_configs; + auto statuses = cudnn_frontend::get_heuristics_list<1>( + {"heuristics_instant"}, opGraph, allowAllConfig, filtered_configs, true); + + if (filtered_configs.size() == 0) { + cudnn_frontend::set_error_and_throw_exception( + nullptr, CUDNN_STATUS_NOT_SUPPORTED, + "run_mha_bprop: No config returned by the heuristics"); + } + + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(filtered_configs[0], opGraph.getTag()) + .build(); + + cache.insert({descriptor, plan}); + return plan; + }; + + auto plan = get_plan(fmha_bprop_cache, descriptor); + + auto plan_workspace_size = plan.getWorkspaceSize(); + + // Exit to request upper level API to allocate memory if needed + size_t softmaxSum_workspace_size = b * h * s_q * sizeof(float); + size_t dqAccum_workspace_size = b * s_q * h * d * sizeof(float); + if (workspace == nullptr) { + *workspace_size = plan_workspace_size + softmaxSum_workspace_size + + dqAccum_workspace_size; + return; + } + + void *devPtrSoftmaxSum = static_cast(workspace) + plan_workspace_size; + void *devPtrdQAccumulator = static_cast(devPtrSoftmaxSum) + + softmaxSum_workspace_size; + NVTE_CHECK_CUDA(cudaMemset(devPtrdQAccumulator, 0, dqAccum_workspace_size)); + + std::set> data_ptrs; + // add all the data pointers to be used in the variant pack + float negInfinity = -1.0E+10f; + float scale_dropout = 1.0f/(1.0f - dropout_probability); + data_ptrs.insert(std::pair(dQ_ID, devPtrdQ)); + data_ptrs.insert(std::pair(dQ_ACCUM_ID, devPtrdQAccumulator)); + data_ptrs.insert(std::pair(dK_ID, devPtrdK)); + data_ptrs.insert(std::pair(dV_ID, devPtrdV)); + + data_ptrs.insert(std::pair(Q_ID, devPtrQ)); + data_ptrs.insert(std::pair(K_ID, devPtrKTranspose)); + data_ptrs.insert(std::pair(V_ID, devPtrVTranspose)); + data_ptrs.insert(std::pair(O_ID, devPtrO)); + data_ptrs.insert(std::pair(dO_ID, devPtrdO)); + data_ptrs.insert(std::pair(S_STATS_ID, devPtrSoftmaxStats)); + 
data_ptrs.insert(std::pair(S_SUM_ID, devPtrSoftmaxSum)); + data_ptrs.insert(std::pair(D_SEED_ID, devPtrDropoutSeed)); + data_ptrs.insert(std::pair(D_OFFSET_ID, devPtrDropoutOffset)); + data_ptrs.insert(std::pair(MASK_VAL_ID, &negInfinity)); + + float scaleProb = 1.0f - dropout_probability; + data_ptrs.insert(std::pair(D_CONST_ID, &scale_dropout)); + data_ptrs.insert(std::pair(S_CONST_ID, &scaling_factor)); + data_ptrs.insert(std::pair(SCALE_PROB, &scaleProb)); + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace) + .setDataPointers(data_ptrs) + .build(); + + NVTE_CHECK_CUDNN( + cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + } catch (cudnn_frontend::cudnnException &e) { + NVTE_ERROR(e.what()); + } +} + +} // namespace fused_attn + +using namespace transformer_engine::fused_attn; +void fused_attn_arbitrary_seqlen_fwd_qkvpacked( + size_t batch, size_t max_seqlen, size_t num_head, size_t head_dim, bool is_training, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, const Tensor *input_QKV, const Tensor *input_Bias, Tensor *output_O, + NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { + using namespace transformer_engine; + + NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, + "qkv_layout must be NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED."); + + // QKV shape is [b, s, 3, h, d] + void *devPtrQKV = input_QKV->data.dptr; + const auto stride = num_head * head_dim; + + void *devPtrQ = static_cast(devPtrQKV); + void *devPtrK = static_cast(static_cast(devPtrQKV) + stride); + void *devPtrV = static_cast(static_cast(devPtrQKV) + 2 * stride); + + void *devPtrO = output_O->data.dptr; + + void *devPtrS = nullptr; + + if (Aux_CTX_Tensors->size == 0) { + Aux_CTX_Tensors->size = 2; + Tensor *output_S = 
reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + output_S->data.dptr = nullptr; + output_S->data.shape = {batch, num_head, max_seqlen, 1}; + output_S->data.dtype = DType::kFloat32; + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + output_rng_state->data.dptr = nullptr; + output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; + } else if (Aux_CTX_Tensors->size == 2) { + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + devPtrS = output_S->data.dptr; + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + output_rng_state->data.dptr = rng_state->data.dptr; + } + + void* devPtrDropoutSeed = rng_state->data.dptr; + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + const DType QKV_type = input_QKV->data.dtype; + size_t workspace_size = 0; + + fused_attn_arbitrary_seqlen_fwd_impl(batch, num_head, max_seqlen, max_seqlen, head_dim, + is_training, attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, devPtrS, devPtrO, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = {workspace_size}; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = {1}; + workspace->data.dtype = DType::kByte; + return; + } +} + +void fused_attn_arbitrary_seqlen_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t num_head, + size_t head_dim, float attn_scale, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, + const Tensor *input_QKV, const Tensor *input_O, + const Tensor *input_dO, Tensor *output_S, + Tensor *output_dQKV, Tensor *output_dBias, + const Tensor *cu_seqlens, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { + 
using namespace transformer_engine; + + NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, + "qkv_layout must be NVTE_QKV_INTERLEAVED."); + + // QKV shape is [b, s, 3, h, d] + void *devPtrQKV = input_QKV->data.dptr; + + auto stride = num_head * head_dim; + void *devPtrQ = devPtrQKV; + void *devPtrK = static_cast(static_cast(devPtrQKV) + stride); + void *devPtrV = static_cast(static_cast(devPtrQKV) + 2 * stride); + + void* devPtrO = input_O->data.dptr; + void *devPtrdO = input_dO->data.dptr; + + // dQKV shape is [b, s, 3, h, d] + void *devPtrdQKV = output_dQKV->data.dptr; + void *devPtrdQ = devPtrdQKV; + void *devPtrdK = static_cast(static_cast(devPtrdQKV) + stride); + void *devPtrdV = static_cast(static_cast(devPtrdQKV) + 2 * stride); + + void *devPtrSoftmaxStats = nullptr; + devPtrSoftmaxStats = output_S->data.dptr; + + void* devPtrDropoutSeed = rng_state->data.dptr; + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + const auto qkv_type = input_QKV->data.dtype; + size_t workspace_size = 0; + + fused_attn_arbitrary_seqlen_bwd_impl(batch, num_head, max_seqlen, max_seqlen, head_dim, + attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, devPtrO, devPtrSoftmaxStats, + devPtrdQ, devPtrdK, devPtrdV, devPtrdO, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(qkv_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = {workspace_size}; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = {1}; + workspace->data.dtype = DType::kByte; + return; + } +} +} // namespace transformer_engine +#endif // CUDNN_VERSION >= 8900 diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h new file mode 100644 index 0000000000..68ebe0c7c0 --- 
/dev/null +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -0,0 +1,44 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +/*! \file fused_attn_arbitrary_seqlen.h + * \brief Functions for fused attention with seqlen > 512 + */ + +#ifndef TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ +#define TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ + +#include "transformer_engine/fused_attn.h" + +#include + +#include "common/common.h" + +namespace transformer_engine { +#if (CUDNN_VERSION >= 8900) +void fused_attn_arbitrary_seqlen_fwd_qkvpacked(size_t batch, size_t max_seqlen, size_t num_head, + size_t head_size, bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_QKV, const Tensor *input_Bias, + Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors, + const Tensor *cu_seqlens, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +void fused_attn_arbitrary_seqlen_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t num_head, + size_t head_dim, float attn_scale, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, const Tensor *input_QKV, + const Tensor *input_O, + const Tensor *input_dO, Tensor *output_S, + Tensor *output_dQKV, Tensor *output_dBias, + const Tensor *cu_seqlens, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +#endif // CUDNN_VERSION >= 8900 +} // namespace transformer_engine + +#endif // TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ diff --git 
a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu similarity index 98% rename from transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu rename to transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu index e8906b31c4..932414ffc0 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu @@ -4,7 +4,7 @@ * See LICENSE for license information. ************************************************************************/ -#include "fused_attn_fp16_bf16_max_seqlen_512.h" +#include "fused_attn_f16_max512_seqlen.h" #include #include @@ -1239,7 +1239,7 @@ void fused_attn_max_512_fwd_qkvpacked( size_t batch, size_t max_seqlen, size_t num_head, size_t head_dim, bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, const Tensor *input_Bias, Tensor *output_O, - NVTETensorPack *Aux_Output_Tensors, const Tensor *cu_seqlens, const Tensor *rng_state, + NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; @@ -1260,14 +1260,14 @@ void fused_attn_max_512_fwd_qkvpacked( void *devPtrS = nullptr; - if (Aux_Output_Tensors->size == 0) { - Aux_Output_Tensors->size = 1; - Tensor *output_S = reinterpret_cast(Aux_Output_Tensors->tensors[0]); + if (Aux_CTX_Tensors->size == 0) { + Aux_CTX_Tensors->size = 1; + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); output_S->data.dptr = nullptr; output_S->data.shape = {batch, num_head, max_seqlen, max_seqlen}; output_S->data.dtype = input_QKV->data.dtype; - } else if (Aux_Output_Tensors->size == 1) { - Tensor *output_S = reinterpret_cast(Aux_Output_Tensors->tensors[0]); + 
} else if (Aux_CTX_Tensors->size == 1) { + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); devPtrS = output_S->data.dptr; } @@ -1307,7 +1307,7 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_KV, const Tensor *input_Bias, Tensor *output_O, - NVTETensorPack *Aux_Output_Tensors, const Tensor *q_cu_seqlens, + NVTETensorPack *Aux_CTX_Tensors, const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; @@ -1336,14 +1336,14 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k const DType kv_type = input_KV->data.dtype; NVTE_CHECK(q_type == kv_type, "data type of Q must be equal to data type of KV."); - if (Aux_Output_Tensors->size == 0) { - Aux_Output_Tensors->size = 1; - Tensor *output_S = reinterpret_cast(Aux_Output_Tensors->tensors[0]); + if (Aux_CTX_Tensors->size == 0) { + Aux_CTX_Tensors->size = 1; + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); output_S->data.dptr = nullptr; output_S->data.shape = {batch, num_head, q_max_seqlen, kv_max_seqlen}; output_S->data.dtype = q_type; - } else if (Aux_Output_Tensors->size == 1) { - Tensor *output_S = reinterpret_cast(Aux_Output_Tensors->tensors[0]); + } else if (Aux_CTX_Tensors->size == 1) { + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); devPtrS = output_S->data.dptr; } @@ -1381,7 +1381,7 @@ void fused_attn_max_512_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t nu size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, - const Tensor *input_dO, const NVTETensorPack *Aux_CTX_Tensors, + const Tensor *input_dO, Tensor *output_S, Tensor *output_dQKV, Tensor *output_dBias, const Tensor 
*cu_seqlens, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { @@ -1408,12 +1408,8 @@ void fused_attn_max_512_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t nu void *devPtrdBias = output_dBias->data.dptr; - NVTE_CHECK(Aux_CTX_Tensors->size == 1); - void *devPtrS = nullptr; - if (Aux_CTX_Tensors->size == 1) { - Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); - devPtrS = output_S->data.dptr; - } + void *devPtrS = output_S->data.dptr; + // devPtrdS reuses the memory of devPtrS void *devPtrdS = devPtrS; @@ -1446,7 +1442,7 @@ void fused_attn_max_512_bwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_KV, - const Tensor *input_dO, const NVTETensorPack *Aux_CTX_Tensors, + const Tensor *input_dO, Tensor *output_S, Tensor *output_dQ, Tensor *output_dKV, Tensor *output_dBias, const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { @@ -1472,12 +1468,8 @@ void fused_attn_max_512_bwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k void *devPtrdBias = output_dBias->data.dptr; - NVTE_CHECK(Aux_CTX_Tensors->size == 1); - void *devPtrS = nullptr; - if (Aux_CTX_Tensors->size == 1) { - Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); - devPtrS = output_S->data.dptr; - } + void *devPtrS = output_S->data.dptr; + // devPtrdS reuses the memory of devPtrS void *devPtrdS = devPtrS; diff --git a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.h b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h similarity index 91% rename from transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.h rename to transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h index 3e11a1f02a..75545d0b40 100644 --- 
a/transformer_engine/common/fused_attn/fused_attn_fp16_bf16_max_seqlen_512.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h @@ -24,7 +24,7 @@ void fused_attn_max_512_fwd_qkvpacked(size_t batch, size_t max_seqlen, size_t nu float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, const Tensor *input_Bias, - Tensor *output_O, NVTETensorPack *Aux_Output_Tensors, + Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); @@ -34,7 +34,7 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_KV, const Tensor *input_Bias, Tensor *output_O, - NVTETensorPack *Aux_Output_Tensors, const Tensor *q_cu_seqlens, + NVTETensorPack *Aux_CTX_Tensors, const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); @@ -42,7 +42,7 @@ void fused_attn_max_512_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t nu size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, - const Tensor *input_dO, const NVTETensorPack *Aux_CTX_Tensors, + const Tensor *input_dO, Tensor *output_S, Tensor *output_dQKV, Tensor *output_dBias, const Tensor *cu_seqlens, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); @@ -52,7 +52,7 @@ void fused_attn_max_512_bwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_KV, - const Tensor *input_dO, const NVTETensorPack *Aux_CTX_Tensors, + const Tensor *input_dO, Tensor *output_S, Tensor *output_dQ, Tensor *output_dKV, 
Tensor *output_dBias, const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index 768ac8eb20..8fc208bfcd 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -991,7 +991,7 @@ static cudnn_frontend::Tensor createdSQBMM( } // fused attention FWD FP8 -void fa_fwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, +void fused_attn_fp8_fwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, bool isTraining, float attnScale, float dropoutProbability, NVTE_QKV_Layout layout, void* devPtrQ, void* devPtrK, void* devPtrV, @@ -1303,7 +1303,7 @@ void fa_fwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, } // fused attention BWD FP8 -void fa_bwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, +void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, float attnScale, float dropoutProbability, NVTE_QKV_Layout layout, void* devPtrQ, void* devPtrK, void* devPtrV, void* devPtrM, void* devPtrZInv, @@ -1858,7 +1858,7 @@ void fa_bwd_fp8(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, int64_t d, #if (CUDNN_VERSION >= 8900) // fused attention FWD FP8 with packed QKV -void fused_attn_fwd_fp8_qkvpacked( +void fused_attn_fp8_fwd_qkvpacked( size_t b, size_t max_seqlen, size_t h, size_t d, bool is_training, float attn_scale, @@ -1866,7 +1866,7 @@ void fused_attn_fwd_fp8_qkvpacked( const Tensor *input_QKV, Tensor *input_output_S, Tensor *output_O, - NVTETensorPack* Aux_Output_Tensors, + NVTETensorPack* Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *rng_state, Tensor *workspace, @@ -1888,23 +1888,29 @@ void fused_attn_fwd_fp8_qkvpacked( void* devPtrM = nullptr; void* devPtrZInv = nullptr; - if (Aux_Output_Tensors->size == 0) { + 
if (Aux_CTX_Tensors->size == 0) { if (is_training) { - Aux_Output_Tensors->size = 2; - Tensor *output_M = reinterpret_cast(Aux_Output_Tensors->tensors[0]); - Tensor *output_ZInv = reinterpret_cast(Aux_Output_Tensors->tensors[1]); + Aux_CTX_Tensors->size = 3; + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); output_M->data.dptr = nullptr; output_M->data.shape = {b, h, max_seqlen, 1}; output_M->data.dtype = DType::kFloat32; output_ZInv->data.dptr = nullptr; output_ZInv->data.shape = {b, h, max_seqlen, 1}; output_ZInv->data.dtype = DType::kFloat32; + output_rng_state->data.dptr = nullptr; + output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; } - } else if (Aux_Output_Tensors->size == 2) { - Tensor *output_M = reinterpret_cast(Aux_Output_Tensors->tensors[0]); - Tensor *output_ZInv = reinterpret_cast(Aux_Output_Tensors->tensors[1]); + } else if (Aux_CTX_Tensors->size == 3) { + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); devPtrM = output_M->data.dptr; devPtrZInv = output_ZInv->data.dptr; + output_rng_state->data.dptr = rng_state->data.dptr; } void* devPtrAmaxS = input_output_S->amax.dptr; @@ -1921,7 +1927,7 @@ void fused_attn_fwd_fp8_qkvpacked( const DType QKV_type = input_QKV->data.dtype; size_t workspace_size = 0; - fused_attn::fa_fwd_fp8( + fused_attn::fused_attn_fp8_fwd_impl( b, max_seqlen, max_seqlen, h, d, is_training, attn_scale, p_dropout, qkv_layout, devPtrQ, devPtrK, devPtrV, @@ -1948,7 +1954,7 @@ void fused_attn_fwd_fp8_qkvpacked( } } // fused attention BWD FP8 with packed QKV -void fused_attn_bwd_fp8_qkvpacked( +void fused_attn_fp8_bwd_qkvpacked( size_t b, size_t max_seqlen, size_t h, size_t d, float 
attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, @@ -2011,7 +2017,7 @@ void fused_attn_bwd_fp8_qkvpacked( const DType QKV_type = input_QKV->data.dtype; size_t workspace_size = 0; - fused_attn::fa_bwd_fp8( + fused_attn::fused_attn_fp8_bwd_impl( b, max_seqlen, max_seqlen, h, d, attn_scale, p_dropout, qkv_layout, devPtrQ, devPtrK, devPtrV, diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index e43683d338..111dfddd10 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -13,7 +13,7 @@ namespace transformer_engine { #if (CUDNN_VERSION >= 8900) // fused attention FWD FP8 with packed QKV -void fused_attn_fwd_fp8_qkvpacked( +void fused_attn_fp8_fwd_qkvpacked( size_t b, size_t max_seqlen, size_t h, size_t d, bool is_training, float attn_scale, @@ -21,7 +21,7 @@ void fused_attn_fwd_fp8_qkvpacked( const Tensor *input_QKV, Tensor *input_output_S, Tensor *output_O, - NVTETensorPack* Aux_Output_Tensors, + NVTETensorPack* Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *rng_state, Tensor *workspace, @@ -29,7 +29,7 @@ void fused_attn_fwd_fp8_qkvpacked( cudnnHandle_t handle); // fused attention BWD FP8 with packed QKV -void fused_attn_bwd_fp8_qkvpacked( +void fused_attn_fp8_bwd_qkvpacked( size_t b, size_t max_seqlen, size_t h, size_t d, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, diff --git a/transformer_engine/common/fused_attn/utils.cu b/transformer_engine/common/fused_attn/utils.cu index cae42bafa0..ebba6efa21 100644 --- a/transformer_engine/common/fused_attn/utils.cu +++ b/transformer_engine/common/fused_attn/utils.cu @@ -249,7 +249,6 @@ __global__ void cu_seqlens_to_actual_seqlens(size_t b, kv_seqlens[tid] = kv_cu_seqlens[tid + 1] - kv_cu_seqlens[tid]; } } - } // namespace fused_attn // get cuDNN data type diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h 
b/transformer_engine/common/include/transformer_engine/fused_attn.h index ed6dd4c041..447b1f9d6a 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -94,6 +94,38 @@ enum NVTE_Mask_Type { NVTE_CAUSAL_MASK = 2, }; +enum NVTE_Fused_Attn_Backend { + /*! No supported backend */ + NVTE_No_Backend = -1, + /*! cuDNN-based FP16/BF16 fused attention for <= 512 sequence length */ + NVTE_F16_max512_seqlen = 0, + /*! cuDNN-based FP16/BF16 fused attention for any sequence length */ + NVTE_F16_arbitrary_seqlen = 1, + /*! cuDNN-based FP8 fused attention for <= 512 sequence length */ + NVTE_FP8 = 2, +}; + +/*! \brief Get fused attention backend based on input parameters. + * + * \param[in] q_dtype The data type of Tensor Q. + * \param[in] kv_dtype The data type of Tensors K, V. + * \param[in] qkv_layout The layout of Tensors Q, K, V. + * \param[in] bias_type The attention bias type. + * \param[in] attn_mask_type The attention mask type. + * \param[in] dropout The dropout probability. + * \param[in] max_seqlen_q The sequence length of Q. + * \param[in] max_seqlen_kv The sequence length of K, V. + * \param[in] head_dim The head dimension of Q, K, V. + */ +NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( + NVTEDType q_dtype, + NVTEDType kv_dtype, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + float dropout, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim); + /*! \brief Compute dot product attention with packed QKV input. 
* * Computes: @@ -104,36 +136,38 @@ enum NVTE_Mask_Type { * * Support Matrix: \verbatim - | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | Yes | <= 512 | 64 | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | + | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | + | 0 | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | + | 1 | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS | CAUSAL | Yes | > 512 | 64, 128 | + | 2 | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | Yes | <= 512 | 64 | \endverbatim * - * \param[in] QKV The QKV tensor in packed format, - * [total_seqs, 3, num_heads, head_dim]. - * \param[in] Bias The Bias tensor. - * \param[in,out] S The S tensor. - * \param[out] O The output O tensor. - * \param[out] Aux_Output_Tensors Auxiliary output tensors when training, e.g. M, ZInv. - * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. - * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen Max sequence length used for computing. - * It may be >= max(cu_seqlens). - * \param[in] is_training Whether this is in training mode or inference. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. + * \param[in] QKV The QKV tensor in packed format, + * [total_seqs, 3, num_heads, head_dim]. + * \param[in] Bias The Bias tensor. + * \param[in,out] S The S tensor. + * \param[out] O The output O tensor. + * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, + * e.g. M, ZInv, rng_state. 
+ * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen Max sequence length used for computing, + * it may be >= max(cu_seqlens). + * \param[in] is_training Whether this is in training mode or inference. + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. */ void nvte_fused_attn_fwd_qkvpacked( const NVTETensor QKV, const NVTETensor Bias, NVTETensor S, NVTETensor O, - NVTETensorPack* Aux_Output_Tensors, + NVTETensorPack* Aux_CTX_Tensors, const NVTETensor cu_seqlens, const NVTETensor rng_state, size_t max_seqlen, @@ -147,30 +181,32 @@ void nvte_fused_attn_fwd_qkvpacked( * * Support Matrix: \verbatim - | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | Yes | <= 512 | 64 | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | + | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | + | 0 | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | + | 1 | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS | CAUSAL | Yes | > 512 | 64, 128 | + | 2 | FP8 | QKV_INTERLEAVED | NO_BIAS | PADDING | Yes | <= 512 | 64 | \endverbatim * - * \param[in] QKV The QKV tensor in packed format, - * [total_seqs, 3, num_heads, head_dim]. - * \param[in] O The O tensor from forward. - * \param[in] dO The gradient of the O tensor. - * \param[in] S The S tensor. - * \param[in,out] dP The gradient of the P tensor. - * \param[in] Aux_CTX_Tensors Auxiliary tensors from forward when in training mode. 
- * \param[out] dQKV The gradient of the QKV tensor. - * \param[out] dBias The gradient of the Bias tensor. - * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. - * \param[in] max_seqlen Max sequence length used for computing. - * It may be >= max(cu_seqlens). - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. + * \param[in] QKV The QKV tensor in packed format, + * [total_seqs, 3, num_heads, head_dim]. + * \param[in] O The O tensor from forward. + * \param[in] dO The gradient of the O tensor. + * \param[in] S The S tensor. + * \param[in,out] dP The gradient of the P tensor. + * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, + * e.g. M, ZInv, rng_state. + * \param[out] dQKV The gradient of the QKV tensor. + * \param[out] dBias The gradient of the Bias tensor. + * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. + * \param[in] max_seqlen Max sequence length used for computing, + * it may be >= max(cu_seqlens). + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. 
*/ void nvte_fused_attn_bwd_qkvpacked( const NVTETensor QKV, @@ -199,31 +235,32 @@ void nvte_fused_attn_bwd_qkvpacked( * * Support Matrix: \verbatim - | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | + | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | + | 0 | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | \endverbatim * - * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. - * \param[in] KV The KV tensor, [total_seqs_kv, 2, num_heads, head_dim]. - * \param[in] Bias The Bias tensor. - * \param[in,out] S The S tensor. - * \param[out] O The output O tensor. - * \param[out] Aux_Output_Tensors Auxiliary output tensors when training, e.g. M, ZInv. - * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen_q Max sequence length used for computing - * for Q. It may be >= max(cu_seqlens_q). - * \param[in] max_seqlen_kv Max sequence length used for computing - * for KV. It may be >= max(cu_seqlens_kv). - * \param[in] is_training Whether this is in training mode or inference. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. + * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. + * \param[in] KV The KV tensor, [total_seqs_kv, 2, num_heads, head_dim]. + * \param[in] Bias The Bias tensor. + * \param[in,out] S The S tensor. 
+ * \param[out] O The output O tensor. + * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, + * e.g. M, ZInv, rng_state. + * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. + * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(cu_seqlens_q). + * \param[in] max_seqlen_kv Max sequence length used for computing for KV. + * it may be >= max(cu_seqlens_kv). + * \param[in] is_training Whether this is in training mode or inference. + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. */ void nvte_fused_attn_fwd_kvpacked( const NVTETensor Q, @@ -231,7 +268,7 @@ void nvte_fused_attn_fwd_kvpacked( const NVTETensor Bias, NVTETensor S, NVTETensor O, - NVTETensorPack* Aux_Output_Tensors, + NVTETensorPack* Aux_CTX_Tensors, const NVTETensor cu_seqlens_q, const NVTETensor cu_seqlens_kv, const NVTETensor rng_state, @@ -246,33 +283,34 @@ void nvte_fused_attn_fwd_kvpacked( * * Support Matrix: \verbatim - | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | + | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | + | 0 | FP16/BF16 | QKV_INTERLEAVED | NO_BIAS/POST_SCALE_BIAS | PADDING/CAUSAL | Yes | <= 512 | 64 | \endverbatim * - * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. - * \param[in] KV The KV tensor, [total_seqs_kv, 2, num_heads, head_dim]. 
- * \param[in] O The O tensor from forward. - * \param[in] dO The gradient of the O tensor. - * \param[in] S The S tensor. - * \param[in,out] dP The gradient of the P tensor. - * \param[in] Aux_CTX_Tensors Auxiliary tensors from forward when in training mode. - * \param[out] dQ The gradient of the Q tensor. - * \param[out] dKV The gradient of the KV tensor. - * \param[out] dBias The gradient of the Bias tensor. - * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] max_seqlen_q Max sequence length used for computing - * for Q. It may be >= max(cu_seqlens_q). - * \param[in] max_seqlen_kv Max sequence length used for computing - * for KV. It may be >= max(cu_seqlens_kv). - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. + * \param[in] Q The Q tensor, [total_seqs_q, num_heads, head_dim]. + * \param[in] KV The KV tensor, [total_seqs_kv, 2, num_heads, head_dim]. + * \param[in] O The O tensor from forward. + * \param[in] dO The gradient of the O tensor. + * \param[in] S The S tensor. + * \param[in,out] dP The gradient of the P tensor. + * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, + * e.g. M, ZInv, rng_state. + * \param[out] dQ The gradient of the Q tensor. + * \param[out] dKV The gradient of the KV tensor. + * \param[out] dBias The gradient of the Bias tensor. + * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. + * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. + * \param[in] max_seqlen_q Max sequence length used for computing for Q. 
+ * it may be >= max(cu_seqlens_q). + * \param[in] max_seqlen_kv Max sequence length used for computing for KV. + * it may be >= max(cu_seqlens_kv). + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensor's layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. */ void nvte_fused_attn_bwd_kvpacked( const NVTETensor Q, diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index f81b37cbc7..492ebe5cb6 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -15,6 +15,16 @@ from flash_attn.flash_attn_interface import flash_attn_unpadded_func import transformer_engine_extensions as tex +from transformer_engine.pytorch.cpp_extensions.fused_attn import ( + fused_attn_fwd_qkvpacked, + fused_attn_bwd_qkvpacked, + fused_attn_fwd_kvpacked, + fused_attn_bwd_kvpacked, + QKVLayout, + AttnBiasType, + AttnMaskType, + FusedAttnBackend, +) from transformer_engine.pytorch.module import LayerNormLinear, Linear from transformer_engine.pytorch.utils import ( divide, @@ -26,6 +36,7 @@ AttnMaskTypes, AttnTypes, dist_group_type, + TE_DType, ) from transformer_engine.pytorch.softmax import FusedScaleMaskSoftmax from transformer_engine.pytorch.distributed import ( @@ -267,9 +278,9 @@ def backward(ctx, return dq, dk, dv -def _check_if_interleaved(q, k, v): - data_ptr = q.storage().data_ptr() - check_ptrs = all(x.storage().data_ptr() == data_ptr for x in [q, k, v]) +def _check_if_interleaved_qkv(q, k, v): + data_ptr = q.untyped_storage().data_ptr() + check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [q, k, v]) if not check_ptrs: return False @@ -288,9 +299,32 @@ def _check_if_interleaved(q, k, v): for i, x in enumerate([q, k, v])) return check_offsets +def 
_check_if_interleaved_kv(k, v): + data_ptr = k.untyped_storage().data_ptr() + check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [k, v]) + if not check_ptrs: + return False + + stride = k.stride() + check_strides = all(stride == x.stride() for x in [k, v]) + if not check_strides: + return False + + shape = k.shape + check_shapes = all(shape == x.shape for x in [k, v]) + if not check_shapes: + return False + + last_dim_size = shape[-1] + check_offsets = all(i * last_dim_size == x.storage_offset() + for i, x in enumerate([k, v])) + return check_offsets + + class FlashAttention(torch.nn.Module): - """Dot product attention implementation by using the flash-attn package. + """Dot product attention, using HazyResearch flash-attn package: + https://github.com/HazyResearch/flash-attention """ def __init__( @@ -321,9 +355,9 @@ def forward( """flash-attn fprop""" assert ( - (query_layer.dtype in [torch.float16, torch.bfloat16]) - and (key_layer.dtype in [torch.float16, torch.bfloat16]) - and (value_layer.dtype in [torch.float16, torch.bfloat16]) + query_layer.dtype in [torch.float16, torch.bfloat16] + and key_layer.dtype in [torch.float16, torch.bfloat16] + and value_layer.dtype in [torch.float16, torch.bfloat16] ), 'FlashAttention currently only supports FP16 and BF16.' 
assert ( query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda @@ -333,7 +367,7 @@ def forward( if (query_layer.shape[-1] == 128 and query_layer.shape[0] * query_layer.shape[1] >= 512 and - _check_if_interleaved(query_layer, key_layer, value_layer)): + _check_if_interleaved_qkv(query_layer, key_layer, value_layer)): query_layer, key_layer, value_layer = _PrepareQKVForFA.apply(query_layer, key_layer, value_layer) @@ -369,6 +403,286 @@ def forward( return output.view(batch_size, seqlen, -1).transpose(0, 1).contiguous() +class FusedAttnFunc_qkvpacked(torch.autograd.Function): + """Function for FusedAttention with packed QKV input""" + + @staticmethod + def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, attn_scale, + dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen, fused_attention_backend): + out, aux_ctx_tensors = fused_attn_fwd_qkvpacked( + is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, + fused_attention_backend, attn_bias, + None, None, None, None, None, + attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen) + + ctx.save_for_backward(qkv, out, cu_seqlens) + ctx.aux_ctx_tensors = aux_ctx_tensors + ctx.max_seqlen = max_seqlen + ctx.qkv_dtype = qkv_dtype + ctx.attn_scale = attn_scale + ctx.dropout_p = dropout_p + ctx.fast_zero_fill = fast_zero_fill + ctx.qkv_layout = qkv_layout + ctx.attn_bias_type = attn_bias_type + ctx.attn_mask_type = attn_mask_type + ctx.fused_attention_backend = fused_attention_backend + + return out + + @staticmethod + def backward(ctx, d_out): + qkv, out, cu_seqlens = ctx.saved_tensors + dqkv, *rest = fused_attn_bwd_qkvpacked( + ctx.max_seqlen, cu_seqlens, qkv, out, d_out, + ctx.qkv_dtype, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + None, None, None, None, None, None, None, None, None, + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + + # if no_bias, 
return dqkv + if ctx.attn_bias_type == "no_bias": + return (None, None, None, dqkv, None, None, None, + None, None, None, None, None, None, + None, None, None, None, None, None) + # else, return (dqkv, dbias) + return (None, None, None, dqkv, None, rest[0], None, + None, None, None, None, None, None, + None, None, None, None, None, None) + +class FusedAttnFunc_kvpacked(torch.autograd.Function): + """Function for FusedAttention with packed KV input""" + + @staticmethod + def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, kv, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, + qkv_layout, attn_bias_type, attn_mask_type, + rng_gen, fused_attention_backend): + out, aux_ctx_tensors = fused_attn_fwd_kvpacked( + is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, kv, qkv_dtype, fused_attention_backend, attn_bias, + None, None, None, None, None, + attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen) + + ctx.save_for_backward(q, kv, out, cu_seqlens_q, cu_seqlens_kv) + ctx.aux_ctx_tensors = aux_ctx_tensors + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_kv = max_seqlen_kv + ctx.qkv_dtype = qkv_dtype + ctx.attn_scale = attn_scale + ctx.dropout_p = dropout_p + ctx.fast_zero_fill = fast_zero_fill + ctx.qkv_layout = qkv_layout + ctx.attn_bias_type = attn_bias_type + ctx.attn_mask_type = attn_mask_type + ctx.fused_attention_backend = fused_attention_backend + + return out + + @staticmethod + def backward(ctx, d_out): + q, kv, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors + dq, dkv, *rest = fused_attn_bwd_kvpacked( + ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, kv, out, d_out, + ctx.qkv_dtype, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + None, None, None, None, None, None, None, None, None, + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + + # if no_bias, 
return dqkv + if ctx.attn_bias_type == "no_bias": + return (None, None, None, None, None, dq, dkv, None, None, None, + None, None, None, None, None, None, + None, None, None, None, None, None) + # else, return (dqkv, dbias) + return (None, None, None, None, None, dq, dkv, None, rest[0], None, + None, None, None, None, None, None, + None, None, None, None, None, None) + +class FusedAttention(torch.nn.Module): + """Dot product attention, with multiple backends: + + 1. FusedAttnBackend["F16_max512_seqlen"] + cuDNN based fused attention for FP16/BF16 and <=512 sequence length. + 2. FusedAttnBackend["F16_arbitrary_seqlen"] + cuDNN based fused attention for FP16/BF16 and any sequence length. + + Support matrix: + + | backend | 1 | 2 | + | flash based | no | yes | + | cuDNN based | yes | yes | + | qkv dtype | fp16/bf16 | fp16/bf16 | + | attn_type | self/cross | self | + | qkv_layout | | | + | - qkv | qkv_interleaved | qkv_interleaved | + | - (q,kv) | kv_interleaved | | + | mask_type | causal/no_mask | causal | + | bias_type | no_bias/post_scale_bias | no_bias | + | dropout | yes | yes | + | max_seqlen | <=512 | any | + | head_dim | 64 | 64,128 | + | output dtype | fp16/bf16 | fp16/bf16 | + """ + + def __init__( + self, + norm_factor: float, + attention_dropout: float = 0.0, + attention_dropout_ctx: Optional[Callable] = nullcontext, + attn_mask_type: str = "causal", + attention_type: str = "self", + ) -> None: + super().__init__() + + self.norm_factor = norm_factor + self.attention_dropout = attention_dropout + self.attention_dropout_ctx = attention_dropout_ctx + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type + + def forward( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + fused_attention_backend: tex.NVTE_Fused_Attn_Backend, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[torch.Tensor] = None, + fast_zero_fill: bool = True, + ) -> torch.Tensor: + """fused attention 
fprop""" + + assert ( + (query_layer.dtype in [torch.float16, torch.bfloat16]) + and (key_layer.dtype in [torch.float16, torch.bfloat16]) + and (value_layer.dtype in [torch.float16, torch.bfloat16]) + ), 'FusedAttention only supports FP16 and BF16 data types.' + assert ( + query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda + ), 'FusedAttention only supports CUDA tensors.' + + qkv_dtype = TE_DType[query_layer.dtype] + seqlen_q, batch_size = query_layer.shape[0], query_layer.shape[1] + seqlen_kv = key_layer.shape[0] + max_seqlen_q = seqlen_q + max_seqlen_kv = seqlen_kv + + if self.attention_type == "self": + if _check_if_interleaved_qkv(query_layer, key_layer, value_layer): + query_layer = query_layer.unsqueeze(3) + key_layer = key_layer.unsqueeze(3) + value_layer = value_layer.unsqueeze(3) + # [s, b, h, 3, d] + mixed_layer = torch.cat([query_layer, key_layer, value_layer], dim = 3) + # [b, s, 3, h, d] + mixed_layer = mixed_layer.transpose(2, 3).transpose(0, 1).contiguous() + else: + query_layer = query_layer.unsqueeze(2) + key_layer = key_layer.unsqueeze(2) + value_layer = value_layer.unsqueeze(2) + # [s, b, 3, h, d] + mixed_layer = torch.cat([query_layer, key_layer, value_layer], dim = 2) + # [b, s, 3, h, d] + mixed_layer = mixed_layer.transpose(0, 1).contiguous() + + # [total_seqs, 3, h, d] + mixed_layer = mixed_layer.view( + mixed_layer.shape[0] * mixed_layer.shape[1], *mixed_layer.shape[2:]).contiguous() + + qkv_layout = "qkv_interleaved" + max_seqlen = seqlen_q + cu_seqlens = torch.arange( + 0, + (batch_size + 1) * seqlen_q, + step=seqlen_q, + dtype=torch.int32, + device=query_layer.device) + + with self.attention_dropout_ctx(): + output = FusedAttnFunc_qkvpacked.apply( + self.training, + max_seqlen, + cu_seqlens, + mixed_layer, + qkv_dtype, + core_attention_bias, + 1.0/self.norm_factor, + self.attention_dropout if self.training else 0.0, + fast_zero_fill, + qkv_layout, + core_attention_bias_type, + self.attn_mask_type, + None, # rng_gen + 
fused_attention_backend, + ) + output = output.view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous() + + if self.attention_type == "cross": + if _check_if_interleaved_kv(key_layer, value_layer): + # [s, b, h, 2, d] + key_layer = key_layer.unsqueeze(3) + value_layer = value_layer.unsqueeze(3) + key_value = torch.cat([key_layer, value_layer], dim = 3) + # [b, s, 2, h, d] + key_value = key_value.transpose(2, 3).transpose(0, 1).contiguous() + else: + # [s, b, 2, h, d] + key_layer = key_layer.unsqueeze(2) + value_layer = value_layer.unsqueeze(2) + key_value = torch.cat([key_layer, value_layer], dim = 2) + # [b, s, 2, h, d] + key_value = key_value.transpose(0, 1).contiguous() + + # [total_seqs, 2, h, d] + query_layer = query_layer.transpose(0, 1).contiguous() + query_layer = query_layer.view( + query_layer.shape[0] * query_layer.shape[1], *query_layer.shape[2:]) + key_value = key_value.view([key_value.shape[0] * key_value.shape[1]] + + key_value.shape[2:]).contiguous() + + qkv_layout = "kv_interleaved" + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * seqlen_q, + step=seqlen_q, + dtype=torch.int32, + device=query_layer.device) + cu_seqlens_kv = torch.arange( + 0, + (batch_size + 1) * seqlen_kv, + step=seqlen_kv, + dtype=torch.int32, + device=key_layer.device) + + with self.attention_dropout_ctx(): + outputs = FusedAttnFunc_kvpacked.apply( + self.training, + max_seqlen_q, max_seqlen_kv, + cu_seqlens_q, cu_seqlens_kv, + query_layer, key_value, + qkv_dtype, + core_attention_bias, + 1.0/self.norm_factor, + self.attention_dropout if self.training else 0.0, + fast_zero_fill, + qkv_layout, + core_attention_bias_type, + self.attn_mask_type, + None, # rng_gen + fused_attention_backend, + ) + + output = (outputs[0].view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous(), + outputs[1].view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous()) + return output + + class DotProductAttention(torch.nn.Module): """Allows the model to jointly attend to information from 
different representation subspaces as described in the paper: @@ -422,15 +736,16 @@ def __init__( get_rng_state_tracker: Optional[Callable] = None, tp_group: Optional[dist_group_type] = None, layer_number: Optional[int] = None, + attention_type: str = "self", ) -> None: super().__init__() - tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group) + self.tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group) self.tp_group = tp_group self.get_rng_state_tracker = get_rng_state_tracker projection_size = kv_channels * num_attention_heads - self.hidden_size_per_partition = divide(projection_size, tp_size) + self.hidden_size_per_partition = divide(projection_size, self.tp_size) self.hidden_size_per_attention_head = divide( projection_size, num_attention_heads ) @@ -447,18 +762,28 @@ def __init__( int(os.getenv("NVTE_FLASH_ATTN", "1")) and self.device_compute_capability >= 8.0 ) + self.use_fused_attention = ( + int(os.getenv("NVTE_FUSED_ATTN", "1")) + and self.device_compute_capability >= 8.0 + ) attn_kwargs = { "attention_dropout": attention_dropout, "attention_dropout_ctx": attention_dropout_ctx, "attn_mask_type": attn_mask_type, } + self.attention_type = attention_type self.attn_mask_type = attn_mask_type + self.attention_dropout = attention_dropout if self.use_flash_attention: self.flash_attention = FlashAttention(norm_factor, **attn_kwargs) - # Instantiating both types since use of flash-attn + # Instantiating three types since use of flash-attn and FusedAttention # might be ruled out due to forward inputs. 
+ if self.use_fused_attention: + self.fused_attention = FusedAttention( + norm_factor, **attn_kwargs, + attention_type = attention_type) self.unfused_attention = UnfusedDotProductAttention( norm_factor, **attn_kwargs, layer_number=layer_number) @@ -489,6 +814,9 @@ def forward( value_layer: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, checkpoint_core_attention: bool = False, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[torch.Tensor] = None, + fast_zero_fill: bool = True, ) -> torch.Tensor: """ Dot Product Attention Layer. @@ -506,6 +834,17 @@ def forward( (:attr:`sequence_length`, :attr:`batch_size`, :attr:`num_attention_heads` * :attr:`kv_channels`) is returned. + .. note:: + + `DotProductAttention` supports three backends: 1) `FlashAttention` which calls + HazyResearch's FlashAttention PyTorch API, 2) `FusedAttention` which has multiple + fused attention implementations as its backends (see `FusedAttention` for + more details), and 3) `UnfusedDotProductAttention` which is the native PyTorch + implementation with fused scaled masked softmax. Users can use environment variables + `NVTE_FLASH_ATTN`, `NVTE_FUSED_ATTN`, and `NVTE_FUSED_ATTN_BACKEND` to control + which DotProductAttention backend, and FusedAttention backend if applicable, to use. + The default DotProductAttention backend is 1. + Parameters ---------- query_layer : torch.Tensor @@ -521,9 +860,17 @@ def forward( during the backward pass in order to save memory that would otherwise be occupied to store the forward activations until backprop. + core_attention_bias_type: str, default = `no_bias` + Bias type, {`no_bias`, `pre_scale_bias`, 'post_scale_bias`} + core_attention_bias: Optional[torch.Tensor], default = `None` + Bias tensor for Q * K.T + fast_zero_fill: bool, defautl = `True` + Whether to use the fast path to set output tensors to 0 or not. 
""" use_flash_attention = self.use_flash_attention + use_fused_attention = self.use_fused_attention + if (query_layer.dtype not in [torch.bfloat16, torch.float16] or key_layer.dtype not in [torch.bfloat16, torch.float16] or value_layer.dtype not in [torch.bfloat16, torch.float16] @@ -533,9 +880,26 @@ def forward( if self.attn_mask_type == "padding" and attention_mask is not None: use_flash_attention = False + use_fused_attention = False if is_in_onnx_export_mode(): use_flash_attention = False + use_fused_attention = False + + qkv_layout = "qkv_interleaved" if self.attention_type == "self" else "kv_interleaved" + fused_attention_backend = tex.get_fused_attn_backend( + TE_DType[query_layer.dtype], + TE_DType[key_layer.dtype], + QKVLayout[qkv_layout], + AttnBiasType[core_attention_bias_type], + AttnMaskType[self.attn_mask_type], + self.attention_dropout, + query_layer.shape[0], key_layer.shape[0], + query_layer.shape[-1]) + # DPA does not support FP8; for FP8, use cpp_extensions modules directly + is_backend_avail = (fused_attention_backend in + [FusedAttnBackend["F16_max512_seqlen"], FusedAttnBackend["F16_arbitrary_seqlen"]]) + use_fused_attention = use_fused_attention and is_backend_avail if use_flash_attention: if checkpoint_core_attention: @@ -545,6 +909,22 @@ def forward( value_layer) return self.flash_attention(query_layer, key_layer, value_layer) + if use_fused_attention: + if checkpoint_core_attention: + return self._checkpointed_attention_forward(self.fused_attention, + query_layer, + key_layer, + value_layer, + fused_attention_backend, + core_attention_bias_type, + core_attention_bias, + fast_zero_fill) + return self.fused_attention(query_layer, key_layer, value_layer, + fused_attention_backend, + core_attention_bias_type, + core_attention_bias, + fast_zero_fill) + if checkpoint_core_attention: return self._checkpointed_attention_forward( self.unfused_attention, @@ -747,6 +1127,9 @@ def forward( checkpoint_core_attention: bool = False, inference_params: 
Optional[Any] = None, rotary_pos_emb: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[torch.Tensor] = None, + fast_zero_fill: bool = True, ) -> Tuple[Union[torch.Tensor, None], ...]: """MultiHeadAttention FWD""" # hidden_states: [sq, b, h] @@ -947,7 +1330,10 @@ def forward( key_layer, value_layer, attention_mask, - checkpoint_core_attention=checkpoint_core_attention, + checkpoint_core_attention = checkpoint_core_attention, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias, + fast_zero_fill = fast_zero_fill, ) # ================= diff --git a/transformer_engine/pytorch/constants.py b/transformer_engine/pytorch/constants.py index b8495b58f3..8d109026fb 100644 --- a/transformer_engine/pytorch/constants.py +++ b/transformer_engine/pytorch/constants.py @@ -22,7 +22,7 @@ torch.bfloat16: tex.DType.kBFloat16, } -AttnMaskTypes = ("causal", "padding") +AttnMaskTypes = ("causal", "padding", "no_mask") AttnTypes = ("self", "cross") diff --git a/transformer_engine/pytorch/cpp_extensions/fused_attn.py b/transformer_engine/pytorch/cpp_extensions/fused_attn.py index 51eb6b6774..35a1fa72f3 100644 --- a/transformer_engine/pytorch/cpp_extensions/fused_attn.py +++ b/transformer_engine/pytorch/cpp_extensions/fused_attn.py @@ -7,6 +7,12 @@ from typing import Tuple, List, Union import torch import transformer_engine_extensions as tex +from transformer_engine_extensions import ( + NVTE_QKV_Layout, + NVTE_Bias_Type, + NVTE_Mask_Type, + NVTE_Fused_Attn_Backend +) __all__ = ['fused_attn_fwd_qkvpacked', @@ -24,6 +30,34 @@ tex.DType.kInt32: torch.int32, } +QKVLayout = { + "not_interleaved": NVTE_QKV_Layout.NVTE_NOT_INTERLEAVED, + "qkv_interleaved": NVTE_QKV_Layout.NVTE_QKV_INTERLEAVED, + "kv_interleaved": NVTE_QKV_Layout.NVTE_KV_INTERLEAVED, + } + +AttnBiasType = { + "no_bias": NVTE_Bias_Type.NVTE_NO_BIAS, + "pre_scale_bias": 
NVTE_Bias_Type.NVTE_PRE_SCALE_BIAS, + "post_scale_bias": NVTE_Bias_Type.NVTE_POST_SCALE_BIAS, + } + +AttnMaskType = { + "no_mask": NVTE_Mask_Type.NVTE_NO_MASK, + "padding": NVTE_Mask_Type.NVTE_PADDING_MASK, + "causal": NVTE_Mask_Type.NVTE_CAUSAL_MASK, + } + +FusedAttnBackend = { + "F16_max512_seqlen": NVTE_Fused_Attn_Backend.NVTE_F16_max512_seqlen, + "F16_arbitrary_seqlen": NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen, + "FP8": NVTE_Fused_Attn_Backend.NVTE_FP8, + "No_Backend": NVTE_Fused_Attn_Backend.NVTE_No_Backend, + } + +BACKEND_F16m512_FP8_THREADS_PER_CTA = 128 +BACKEND_F16arb_ELTS_PER_THREADS = 16 + def check_tensor(x: torch.Tensor): """Check tensor properties.""" @@ -109,7 +143,8 @@ def fused_attn_fwd_qkvpacked( cu_seqlens: torch.Tensor, qkv: torch.Tensor, qkv_dtype: tex.DType, - bias: torch.Tensor = None, + fused_attention_backend: tex.NVTE_Fused_Attn_Backend, + attn_bias: torch.Tensor = None, d_scale_qkv: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_o: torch.Tensor = None, @@ -117,9 +152,9 @@ def fused_attn_fwd_qkvpacked( amax_o: torch.Tensor = None, attn_scale: float = None, dropout: float = 0.0, - set_zero: bool = True, + fast_zero_fill: bool = True, qkv_layout: str = "qkv_interleaved", - bias_type: str = "no_bias", + attn_bias_type: str = "no_bias", attn_mask_type: str = "padding", rng_gen: torch.Generator = None, ) -> Tuple[Union[torch.Tensor, None], ...]: @@ -139,8 +174,10 @@ def fused_attn_fwd_qkvpacked( shape [total_seqs, 3, num_heads, head_dim], where total_seqs = cu_seqlens[-1] qkv_dtype: tex.DType data type of QKV; in tex.DType, not torch.dtype - bias: torch.Tensor, default = None - input tensor Bias when bias_type is "pre_scale_bias" or "post_scale_bias"; + fused_attention_backend: tex.NVTE_Fused_Attn_Backend + please see FusedAttention module for details on supported backends. 
+ attn_bias: torch.Tensor, default = None + input tensor Bias when attn_bias_type is "pre_scale_bias" or "post_scale_bias"; shape [1, num_heads, max_seqlen, max_seqlen], same data type as qkv d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of QKV in FP8 computations @@ -158,12 +195,12 @@ def fused_attn_fwd_qkvpacked( dropout: float, default = 0.0 dropout probability, 0.0 means no dropout, 1.0 means no output; dropout must be 0.0 if is_training is False - set_zero: bool, default = True - if True, initializes the output tensor O to zero using the mha_fill method; - if False, doesn't initialize O after its allocation + fast_zero_fill: bool, default = True + if True, initializes the output tensor O to zero using the fast filling method; + if False, uses PyTorch's .fill_() method qkv_layout: str, default = "qkv_interleaved" layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} - bias_type: str, default = "no_bias" + attn_bias_type: str, default = "no_bias" type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} attn_mask_type: str, default = "padding" type of the attention mask; {"padding", "causal", "no_mask"} @@ -178,15 +215,26 @@ def fused_attn_fwd_qkvpacked( shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors used for the backward; - if is_training is True, aux_ctx_tensors = [M, ZInv, rng_state] - if is_training is False, aux_ctx_tensors = [rng_state] - M: torch.Tensor - max(Q*K.T) - shape [batch_size, num_heads, max_seqlen, 1], dtype float32 - ZInv: torch.Tensor - 1/sum(e^(x - max(x))), where x=Q*K.T - shape [batch_size, num_heads, max_seqlen, 1], dtype float32 - rng_state: torch.Tensor + if is_training is True, aux_ctx_tensors = [softmax-related tensors, rng_state] + if is_training is False, aux_ctx_tensors = None + + softmax-related tensors: + 1. 
if fused_attention_backend == FusedAttnBackend["F16_max512_seqlen"] + softmax: torch.Tensor + Softmax(Q*K.T) + shape [batch_size, num_heads, max_seqlen, max_seqlen], dtype float32 + 2. if fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"] + softmaxStats: torch.Tensor + log(sum(e^(x - max(x)))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + 3. if fused_attention_backend == FusedAttnBackend["FP8"] + M: torch.Tensor + max(Q*K.T) + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + ZInv: torch.Tensor + 1/sum(e^(x - max(x))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen, 1], dtype float32 + rng_state: torch.Tensor, optional, if backend is not F16_max512_seqlen state of the random number generator; [seed, offset], dtype uint64 """ @@ -203,60 +251,58 @@ def fused_attn_fwd_qkvpacked( if attn_scale is None: attn_scale = 1.0 / math.sqrt(d) - if bias_type != "no_bias": - assert bias is not None, "bias tensor cannot be None when bias_type is not no_bias." - assert (bias.shape == [1, h, max_seqlen, max_seqlen] - ), "bias tensor must be in [1, h, max_seqlen, max_seqlen] shape." - assert (bias.dtype == qkv.dtype - ), "bias tensor must be in the same dtype as qkv." - - # FP8 fused attention API - if (qkv_type is torch.uint8) and (max_seqlen <= 512) and (d == 64): - assert (qkv_layout == "qkv_interleaved" - and bias_type == "no_bias" - and attn_mask_type == "padding" - ), """The FP8 fused attention API currently only supports qkv_interleaved layout, - no_bias type, and padding attention mask type.""" - assert (d_scale_qkv is not None), "d_scale_qkv is required for the FP8 API." - assert (q_scale_s is not None), "q_scale_s is required for the FP8 API." - assert (q_scale_o is not None), "q_scale_o is required for the FP8 API." - assert (amax_s is not None), "amax_s is required for the FP8 API." - assert (amax_o is not None), "amax_o is required for the FP8 API." 
+ if attn_bias_type != "no_bias": + assert (attn_bias is not None + ), "attn_bias tensor cannot be None when attn_bias_type is not no_bias." + assert (attn_bias.shape == [1, h, max_seqlen, max_seqlen] + ), "attn_bias tensor must be in [1, h, max_seqlen, max_seqlen] shape." + assert (attn_bias.dtype == qkv.dtype + ), "attn_bias tensor must be in the same dtype as qkv." + + assert (fused_attention_backend != FusedAttnBackend["No_Backend"] + ), "Fused attention does not support this input combination." + + # BF16/FP16 fused attention API from fmha_v1 apex + if fused_attention_backend == FusedAttnBackend["F16_max512_seqlen"]: + rng_elts_per_thread = (max_seqlen * max_seqlen + + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA + + # BF16/FP16 fused attention API from fmha_v2 + if fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"]: + rng_elts_per_thread = BACKEND_F16arb_ELTS_PER_THREADS + + # FP8 fused attention API from fmha_v2 + if fused_attention_backend == FusedAttnBackend["FP8"]: + rng_elts_per_thread = (max_seqlen * max_seqlen + + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA + + assert (d_scale_qkv is not None + ), "d_scale_qkv is required as an input for FP8 fused attention." + assert (q_scale_s is not None + ), "q_scale_s is required as an input for FP8 fused attention." + assert (q_scale_o is not None + ), "q_scale_o is required as an input for FP8 fused attention." + assert (amax_s is not None + ), "amax_s is required as an input for FP8 fused attention." + assert (amax_o is not None + ), "amax_o is required as an input for FP8 fused attention." 
check_scalar(d_scale_qkv) check_scalar(q_scale_s) check_scalar(q_scale_o) check_scalar(amax_s) check_scalar(amax_o) - # BF16/FP16 fused attention API from fmha_v2 - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen > 512): - # add BF/FP16 support for >512 sequence length - assert False, "The BF16/FP16 support for >512 sequence length is coming!" - - # BF16/FP16 fused attention API from fmha_v1 apex - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen <= 512): - # add BF/FP16 support for <=512 sequence length - assert False, "The BF16/FP16 support for <=512 sequence length is coming!" - - else: - assert False, "No support for this dtype and max_seqlen combination." - # execute kernel output_tensors = tex.fused_attn_fwd_qkvpacked( b, max_seqlen, total_seqs, h, d, - is_training, attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, - cu_seqlens, - qkv, - qkv_dtype, - d_scale_qkv, - q_scale_s, - q_scale_o, - amax_s, - amax_o, - bias, - rng_gen, + is_training, attn_scale, dropout, fast_zero_fill, + QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], + cu_seqlens, qkv, qkv_dtype, + d_scale_qkv, q_scale_s, q_scale_o, amax_s, amax_o, attn_bias, + rng_gen, rng_elts_per_thread, ) + # out, aux_ctx_tensors return output_tensors[0], output_tensors[1:] @@ -267,7 +313,8 @@ def fused_attn_bwd_qkvpacked( o: torch.Tensor, d_o: torch.Tensor, qkv_dtype: tex.DType, - aux_ctx_tensors: List[torch.Tensor] = None, + aux_ctx_tensors: List[torch.Tensor], + fused_attention_backend: tex.NVTE_Fused_Attn_Backend, d_scale_qkv: torch.Tensor = None, d_scale_s: torch.Tensor = None, d_scale_o: torch.Tensor = None, @@ -279,9 +326,9 @@ def fused_attn_bwd_qkvpacked( amax_dqkv: torch.Tensor = None, attn_scale: float = None, dropout: float = 0.0, - set_zero: bool = True, + fast_zero_fill: bool = True, qkv_layout: str = "qkv_interleaved", - bias_type: str = "no_bias", + attn_bias_type: str = "no_bias", 
attn_mask_type: str = "padding", ) -> Tuple[Union[torch.Tensor, None], ...]: """Fused Attention BWD for packed QKV input. @@ -306,6 +353,8 @@ def fused_attn_bwd_qkvpacked( aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors of the forward pass when its is_training is True, e.g. aux_ctx_tensors = [M, ZInv, rng_state] + fused_attention_backend: tex.NVTE_Fused_Attn_Backend + please see FusedAttention module for details on supported backends. d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of QKV in FP8 computations d_scale_s: torch.Tensor, default = None @@ -330,12 +379,12 @@ def fused_attn_bwd_qkvpacked( dropout: float, default = 0.0 dropout probability, 0.0 means no dropout, 1.0 means no output; dropout must be 0.0 if is_training is False - set_zero: bool, default = True - if True, initializes the output tensor O to zero using the mha_fill method; - if False, doesn't initialize O after its allocation + fast_zero_fill: bool, default = True + if True, initializes the output tensor O to zero using the fast filling method; + if False, uses PyTorch's .fill_() method qkv_layout: str, default = "qkv_interleaved" layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} - bias_type: str, default = "no_bias" + attn_bias_type: str, default = "no_bias" type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} attn_mask_type: str, default = "padding" type of the attention mask; {"padding", "causal", "no_mask"} @@ -345,8 +394,8 @@ def fused_attn_bwd_qkvpacked( d_qkv: torch.Tensor gradient tensor of QKV; same data type and shape as QKV d_bias: torch.Tensor, optional - gradient tensor of Bias when bias_type is "pre_scale_bias" or "post_scale_bias"; - same data type and shape as Bias + gradient tensor of Bias when attn_bias_type is "pre_scale_bias" + or "post_scale_bias"; same data type and shape as Bias """ check_cu_seqlens(cu_seqlens) @@ -363,29 +412,27 @@ def fused_attn_bwd_qkvpacked( if attn_scale is None: attn_scale 
= 1.0 / math.sqrt(d) - assert (len(aux_ctx_tensors) >= 1 - ), "aux_ctx_tensors must contain rng_state as its last element." - rng_state = aux_ctx_tensors[-1] - check_rng_state(rng_state) - - # FP8 fused attention API - if (qkv_type is torch.uint8) and (max_seqlen <= 512) and d == 64: - assert (qkv_layout == "qkv_interleaved" - and bias_type == "no_bias" - and attn_mask_type == "padding" - ), """The FP8 fused attention API currently only supports qkv_interleaved layout, - no_bias type, and padding attention mask type.""" - assert (d_scale_qkv is not None), "d_scale_qkv is required for the FP8 API." - assert (d_scale_s is not None), "d_scale_s is required for the FP8 API." - assert (d_scale_o is not None), "d_scale_o is required for the FP8 API." - assert (d_scale_do is not None), "d_scale_do is required for the FP8 API." - assert (q_scale_s is not None), "q_scale_s is required for the FP8 API." - assert (q_scale_dp is not None), "q_scale_dp is required for the FP8 API." - assert (q_scale_dqkv is not None), "q_scale_dqkv is required for the FP8 API." - assert (amax_dp is not None), "amax_dp is required for the FP8 API." - assert (amax_dqkv is not None), "amax_dqkv is required for the FP8 API." + assert (fused_attention_backend != FusedAttnBackend["No_Backend"] + ), "Fused attention does not support this input combination." + + if fused_attention_backend != FusedAttnBackend["F16_max512_seqlen"]: + assert (len(aux_ctx_tensors) >= 1 + ), "aux_ctx_tensors must contain rng_state as its last element." + rng_state = aux_ctx_tensors[-1] + check_rng_state(rng_state) + + if fused_attention_backend == FusedAttnBackend["FP8"]: + assert (d_scale_qkv is not None), "d_scale_qkv is required for FP8 fused attention." + assert (d_scale_s is not None), "d_scale_s is required for FP8 fused attention." + assert (d_scale_o is not None), "d_scale_o is required for FP8 fused attention." + assert (d_scale_do is not None), "d_scale_do is required for FP8 fused attention." 
+ assert (q_scale_s is not None), "q_scale_s is required for FP8 fused attention." + assert (q_scale_dp is not None), "q_scale_dp is required for FP8 fused attention." + assert (q_scale_dqkv is not None), "q_scale_dqkv is required for FP8 fused attention." + assert (amax_dp is not None), "amax_dp is required for FP8 fused attention." + assert (amax_dqkv is not None), "amax_dqkv is required for FP8 fused attention." assert (len(aux_ctx_tensors) == 3 - ), "aux_ctx_tensors is required to be [M, ZInv, rng_state] for the FP8 API." + ), "aux_ctx_tensors is required to be [M, ZInv, rng_state] for FP8 fused attention." check_scalar(d_scale_qkv) check_scalar(d_scale_s) check_scalar(d_scale_o) @@ -399,37 +446,21 @@ def fused_attn_bwd_qkvpacked( check_stats(m, b, h, max_seqlen) check_stats(z_inv, b, h, max_seqlen) - # BF16/FP16 fused attention API from fmha_v2 - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen > 512): - # add BF/FP16 support for >512 sequence length - assert False, "The BF16/FP16 support for >512 sequence length is coming!" - - # BF16/FP16 fused attention API from fmha_v1 apex - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) and (max_seqlen <= 512): - # add BF/FP16 support for <=512 sequence length - assert False, "The BF16/FP16 support for <=512 sequence length is coming!" - - else: - assert False, "No support for this dtype and max_seqlen combination." 
- # execute kernel output_tensors = tex.fused_attn_bwd_qkvpacked( b, max_seqlen, total_seqs, h, d, - attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, - cu_seqlens, - qkv, o, d_o, - qkv_dtype, - aux_ctx_tensors, + attn_scale, dropout, fast_zero_fill, + QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], + cu_seqlens, qkv, o, d_o, qkv_dtype, aux_ctx_tensors, d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, - q_scale_s, q_scale_dp, q_scale_dqkv, - amax_dp, amax_dqkv, + q_scale_s, q_scale_dp, q_scale_dqkv, amax_dp, amax_dqkv, ) - if bias_type == "no_bias": - # return d_qkv when bias_type is no_bias - return output_tensors[0] + if attn_bias_type == "no_bias": + # return d_qkv when attn_bias_type is no_bias + return output_tensors # otherwise return (d_qkv, d_bias) - return output_tensors + return output_tensors[0], output_tensors[1] def fused_attn_fwd_kvpacked( @@ -441,7 +472,8 @@ def fused_attn_fwd_kvpacked( q: torch.Tensor, kv: torch.Tensor, qkv_dtype: tex.DType, - bias: torch.Tensor = None, + fused_attention_backend: tex.NVTE_Fused_Attn_Backend, + attn_bias: torch.Tensor = None, d_scale_qkv: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_o: torch.Tensor = None, @@ -449,9 +481,9 @@ def fused_attn_fwd_kvpacked( amax_o: torch.Tensor = None, attn_scale: float = None, dropout: float = 0.0, - set_zero: bool = True, + fast_zero_fill: bool = True, qkv_layout: str = "qkv_interleaved", - bias_type: str = "no_bias", + attn_bias_type: str = "no_bias", attn_mask_type: str = "padding", rng_gen: torch.Generator = None, ) -> Tuple[Union[torch.Tensor, None], ...]: @@ -479,8 +511,10 @@ def fused_attn_fwd_kvpacked( where total_seqs_kv = cu_seqlens_kv[-1] qkv_dtype: tex.DType data type of Q and KV; in tex.DType, not torch.dtype - bias: torch.Tensor, default = None - input tensor Bias when bias_type is "pre_scale_bias" or "post_scale_bias"; + fused_attention_backend: tex.NVTE_Fused_Attn_Backend + please see FusedAttention 
module for details on supported backends. + attn_bias: torch.Tensor, default = None + input tensor Bias when attn_bias_type is "pre_scale_bias" or "post_scale_bias"; shape [1, num_heads, max_seqlen_q, max_seqlen_kv], same data type as q and kv d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of QKV in FP8 computations @@ -498,12 +532,12 @@ def fused_attn_fwd_kvpacked( dropout: float, default = 0.0 dropout probability, 0.0 means no dropout, 1.0 means no output; dropout must be 0.0 if is_training is False - set_zero: bool, default = True - if True, initializes the output tensor O to zero using the mha_fill method; - if False, doesn't initialize O after its allocation + fast_zero_fill: bool, default = True + if True, initializes the output tensor O to zero using the fast filling method; + if False, uses PyTorch's .fill_() method qkv_layout: str, default = "qkv_interleaved" layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} - bias_type: str, default = "no_bias" + attn_bias_type: str, default = "no_bias" type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} attn_mask_type: str, default = "padding" type of the attention mask; {"padding", "causal", "no_mask"} @@ -518,15 +552,26 @@ def fused_attn_fwd_kvpacked( shape [total_seqs, num_heads, head_dim], where total_seqs = cu_seqlens[-1] aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors used for the backward; - if is_training is True, aux_ctx_tensors = [M, ZInv, rng_state] - if is_training is False, aux_ctx_tensors = [rng_state] - M: torch.Tensor - max(Q*K.T) - shape [batch_size, num_heads, max_seqlen, 1], dtype float32 - ZInv: torch.Tensor - 1/sum(e^(x - max(x))), where x=Q*K.T - shape [batch_size, num_heads, max_seqlen, 1], dtype float32 - rng_state: torch.Tensor + if is_training is True, aux_ctx_tensors = [softmax-related tensors, rng_state] + if is_training is False, aux_ctx_tensors = None + + softmax-related tensors: + 1. 
if fused_attention_backend == FusedAttnBackend["F16_max512_seqlen"] + softmax: torch.Tensor + Softmax(Q*K.T) + shape [batch_size, num_heads, max_seqlen_q, max_seqlen_kv], dtype float32 + 2. if fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"] + softmaxStats: torch.Tensor + log(sum(e^(x - max(x)))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen_q, 1], dtype float32 + 3. if fused_attention_backend == FusedAttnBackend["FP8"] + M: torch.Tensor + max(Q*K.T) + shape [batch_size, num_heads, max_seqlen_q, 1], dtype float32 + ZInv: torch.Tensor + 1/sum(e^(x - max(x))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen_q, 1], dtype float32 + rng_state: torch.Tensor, optional, if backend is not F16_max512_seqlen state of the random number generator; [seed, offset], dtype uint64 """ @@ -551,49 +596,42 @@ def fused_attn_fwd_kvpacked( if attn_scale is None: attn_scale = 1.0 / math.sqrt(d) - if bias_type != "no_bias": - assert bias is not None, "bias tensor cannot be None when bias_type is not no_bias." - assert (bias.shape == [1, h, max_seqlen_q, max_seqlen_kv] - ), "bias tensor must be in [1, h, max_seqlen_q, max_seqlen_kv] shape." - assert (bias.dtype == q.dtype - ), "bias tensor must be in the same dtype as q and kv." + if attn_bias_type != "no_bias": + assert (attn_bias is not None + ), "attn_bias tensor cannot be None when attn_bias_type is not no_bias." + assert (attn_bias.shape == [1, h, max_seqlen_q, max_seqlen_kv] + ), "attn_bias tensor must be in [1, h, max_seqlen_q, max_seqlen_kv] shape." + assert (attn_bias.dtype == q.dtype + ), "attn_bias tensor must be in the same dtype as q and kv." - # FP8 fused attention API - if (qkv_type is torch.uint8) and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512) \ - and (d == 64): - assert False, "The FP8 fused attention API currently only supports packed QKV input." 
- - # BF16/FP16 fused attention API from fmha_v2 - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ - and (max_seqlen_q > 512) and (max_seqlen_kv > 512): - # add BF/FP16 support for >512 sequence length - assert False, "The BF16/FP16 support for >512 sequence length is coming!" + assert (fused_attention_backend != FusedAttnBackend["No_Backend"] + ), "Fused attention does not support this input combination." # BF16/FP16 fused attention API from fmha_v1 apex - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ - and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512): - # add BF/FP16 support for <=512 sequence length - assert False, "The BF16/FP16 support for <=512 sequence length is coming!" + if fused_attention_backend == FusedAttnBackend["F16_max512_seqlen"]: + rng_elts_per_thread = (max_seqlen_q * max_seqlen_kv + + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA - else: - assert False, "No support for this dtype and max_seqlen combination." 
+ # BF16/FP16 fused attention API from fmha_v2 + if fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"]: + rng_elts_per_thread = BACKEND_F16arb_ELTS_PER_THREADS + + # FP8 fused attention API from fmha_v2 + if fused_attention_backend == FusedAttnBackend["FP8"]: + rng_elts_per_thread = (max_seqlen_q * max_seqlen_q + + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA # execute kernel output_tensors = tex.fused_attn_fwd_kvpacked( b, max_seqlen_q, max_seqlen_kv, total_seqs_q, total_seqs_kv, h, d, - is_training, attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, - cu_seqlens_q, cu_seqlens_kv, - q, kv, - qkv_dtype, - d_scale_qkv, - q_scale_s, - q_scale_o, - amax_s, - amax_o, - bias, - rng_gen, + is_training, attn_scale, dropout, fast_zero_fill, + QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], + cu_seqlens_q, cu_seqlens_kv, q, kv, qkv_dtype, + d_scale_qkv, q_scale_s, q_scale_o, amax_s, amax_o, + attn_bias, rng_gen, rng_elts_per_thread, ) + # out, aux_ctx_tensors return output_tensors[0], output_tensors[1:] @@ -607,7 +645,8 @@ def fused_attn_bwd_kvpacked( o: torch.Tensor, d_o: torch.Tensor, qkv_dtype: tex.DType, - aux_ctx_tensors: List[torch.Tensor] = None, + aux_ctx_tensors: List[torch.Tensor], + fused_attention_backend: tex.NVTE_Fused_Attn_Backend, d_scale_qkv: torch.Tensor = None, d_scale_s: torch.Tensor = None, d_scale_o: torch.Tensor = None, @@ -619,9 +658,9 @@ def fused_attn_bwd_kvpacked( amax_dqkv: torch.Tensor = None, attn_scale: float = None, dropout: float = 0.0, - set_zero: bool = True, + fast_zero_fill: bool = True, qkv_layout: str = "qkv_interleaved", - bias_type: str = "no_bias", + attn_bias_type: str = "no_bias", attn_mask_type: str = "padding", ) -> Tuple[Union[torch.Tensor, None], ...]: """Fused Attention BWD for packed KV input. 
@@ -654,6 +693,8 @@ def fused_attn_bwd_kvpacked( aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors of the forward pass when its is_training is True, e.g. aux_ctx_tensors = [M, ZInv, rng_state] + fused_attention_backend: tex.NVTE_Fused_Attn_Backend + please see FusedAttention module for details on supported backends. d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of QKV in FP8 computations d_scale_s: torch.Tensor, default = None @@ -679,12 +720,12 @@ def fused_attn_bwd_kvpacked( dropout: float, default = 0.0 dropout probability, 0.0 means no dropout, 1.0 means no output; dropout must be 0.0 if is_training is False - set_zero: bool, default = True - if True, initializes the output tensor O to zero using the mha_fill method; - if False, doesn't initialize O after its allocation + fast_zero_fill: bool, default = True + if True, initializes the output tensor O to zero using the fast filling method; + if False, uses PyTorch's .fill_() method qkv_layout: str, default = "qkv_interleaved" layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} - bias_type: str, default = "no_bias" + attn_bias_type: str, default = "no_bias" type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} attn_mask_type: str, default = "padding" type of the attention mask; {"padding", "causal", "no_mask"} @@ -696,8 +737,8 @@ def fused_attn_bwd_kvpacked( d_kv: torch.Tensor gradient tensor of KV; same data type and shape as KV d_bias: torch.Tensor, optional - gradient tensor of Bias when bias_type is "pre_scale_bias" or "post_scale_bias"; - same data type and shape as Bias + gradient tensor of Bias when attn_bias_type is "pre_scale_bias" + or "post_scale_bias"; same data type and shape as Bias """ check_cu_seqlens(cu_seqlens_q) @@ -722,45 +763,52 @@ def fused_attn_bwd_kvpacked( if attn_scale is None: attn_scale = 1.0 / math.sqrt(d) - assert (len(aux_ctx_tensors) >= 1 - ), "aux_ctx_tensors must contain rng_state as its last element." 
- rng_state = aux_ctx_tensors[-1] - check_rng_state(rng_state) - - # FP8 fused attention API - if (qkv_type is torch.uint8) and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512) \ - and d == 64: - assert False, "The FP8 fused attention API currently only supports packed QKV input." - - ############### BF16/FP16 fused attention API from fmha_v2 ################ - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ - and (max_seqlen_q > 512) and (max_seqlen_kv > 512): - # add BF/FP16 support for >512 sequence length - assert False, "The BF16/FP16 support for >512 sequence length is coming!" - - ############### BF16/FP16 fused attention API from fmha_v1 apex ################ - elif (qkv_type is torch.bfloat16 or qkv_type is torch.float16) \ - and (max_seqlen_q <= 512) and (max_seqlen_kv <= 512): - # add BF/FP16 support for <=512 sequence length - assert False, "The BF16/FP16 support for <=512 sequence length is coming!" - - else: - assert False, "No support for this dtype and max_seqlen combination." + assert (fused_attention_backend != FusedAttnBackend["No_Backend"] + ), "Fused attention does not support this input combination." + + if fused_attention_backend != FusedAttnBackend["F16_max512_seqlen"]: + assert (len(aux_ctx_tensors) >= 1 + ), "aux_ctx_tensors must contain rng_state as its last element." + rng_state = aux_ctx_tensors[-1] + check_rng_state(rng_state) + + if fused_attention_backend == FusedAttnBackend["FP8"]: + assert (d_scale_qkv is not None), "d_scale_qkv is required for FP8 fused attention." + assert (d_scale_s is not None), "d_scale_s is required for FP8 fused attention." + assert (d_scale_o is not None), "d_scale_o is required for FP8 fused attention." + assert (d_scale_do is not None), "d_scale_do is required for FP8 fused attention." + assert (q_scale_s is not None), "q_scale_s is required for FP8 fused attention." + assert (q_scale_dp is not None), "q_scale_dp is required for FP8 fused attention." 
+ assert (q_scale_dqkv is not None), "q_scale_dqkv is required for FP8 fused attention." + assert (amax_dp is not None), "amax_dp is required for FP8 fused attention." + assert (amax_dqkv is not None), "amax_dqkv is required for FP8 fused attention." + assert (len(aux_ctx_tensors) == 3 + ), "aux_ctx_tensors is required to be [M, ZInv, rng_state] for FP8 fused attention." + check_scalar(d_scale_qkv) + check_scalar(d_scale_s) + check_scalar(d_scale_o) + check_scalar(d_scale_do) + check_scalar(q_scale_s) + check_scalar(q_scale_dp) + check_scalar(q_scale_dqkv) + check_scalar(amax_dp) + check_scalar(amax_dqkv) + m, z_inv = aux_ctx_tensors[:2] + check_stats(m, b, h, max_seqlen_q) + check_stats(z_inv, b, h, max_seqlen_q) # execute kernel output_tensors = tex.fused_attn_bwd_kvpacked( b, max_seqlen_q, max_seqlen_kv, total_seqs_q, total_seqs_kv, h, d, - attn_scale, dropout, set_zero, qkv_layout, bias_type, attn_mask_type, - cu_seqlens_q, cu_seqlens_kv, - q, kv, o, d_o, - qkv_dtype, - aux_ctx_tensors, + attn_scale, dropout, fast_zero_fill, + QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], + cu_seqlens_q, cu_seqlens_kv, q, kv, o, d_o, qkv_dtype, aux_ctx_tensors, d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, - q_scale_s, q_scale_dp, q_scale_dqkv, - amax_dp, amax_dqkv, + q_scale_s, q_scale_dp, q_scale_dqkv, amax_dp, amax_dqkv, ) - # returns (d_q, d_kv) when bias_type is no_bias; otherwise returns (d_q, d_kv, d_bias) - if bias_type == "no_bias": - return output_tensors[:2] - return output_tensors + if attn_bias_type == "no_bias": + # return (d_q, d_kv) when attn_bias_type is no_bias + return output_tensors + # otherwise return (d_q, d_kv), d_bias + return output_tensors[:2], output_tensors[2] diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index 4904e1ebad..17d36b9911 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -58,7 +58,10 @@ enum 
FP8FwdTensors { GEMM1_OUTPUT = 2, GEMM2_INPUT = 3, GEMM2_WEIGHT = 4, - GEMM2_OUTPUT = 5 + GEMM2_OUTPUT = 5, + GEMM3_INPUT = 6, + GEMM3_WEIGHT = 7, + GEMM3_OUTPUT = 8 }; // Used as named indices on the `scale`, `scale_inv`, @@ -67,7 +70,9 @@ enum FP8BwdTensors { GRAD_OUTPUT1 = 0, GRAD_INPUT1 = 1, GRAD_OUTPUT2 = 2, - GRAD_INPUT2 = 3 + GRAD_INPUT2 = 3, + GRAD_OUTPUT3 = 4, + GRAD_INPUT3 = 5 }; @@ -81,6 +86,9 @@ transformer_engine::DType getTransformerEngineFP8Type(bool e4m3_if_hybrid, inline at::ScalarType GetATenDType(transformer_engine::DType t) { switch (t) { case transformer_engine::DType::kInt32: + return torch::kInt32; + case transformer_engine::DType::kInt64: + return torch::kInt64; case transformer_engine::DType::kFloat32: return at::kFloat; case transformer_engine::DType::kFloat16: diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu index 6d8ec6f2bb..69248d4aa9 100644 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ b/transformer_engine/pytorch/csrc/extensions.cu @@ -12,43 +12,21 @@ constexpr int block_size = 512; constexpr int ctas_per_sm = 4; -// convert QKV layout to enum -NVTE_QKV_Layout get_nvte_qkv_layout(const std::string qkv_layout) { - if (qkv_layout == "not_interleaved") { - return NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED; - } else if (qkv_layout == "qkv_interleaved") { - return NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED; - } else if (qkv_layout == "kv_interleaved") { - return NVTE_QKV_Layout::NVTE_KV_INTERLEAVED; - } else { - NVTE_ERROR("Invalid QKV layout. \n"); - } -} - -// convert bias type to enum -NVTE_Bias_Type get_nvte_bias_type(const std::string bias_type) { - if (bias_type == "no_bias") { - return NVTE_Bias_Type::NVTE_NO_BIAS; - } else if (bias_type == "pre_scale_bias") { - return NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS; - } else if (bias_type == "post_scale_bias") { - return NVTE_Bias_Type::NVTE_POST_SCALE_BIAS; - } else { - NVTE_ERROR("Invalid bias type. 
\n"); - } -} - -// convert attn mask type to enum -NVTE_Mask_Type get_nvte_mask_type(const std::string mask_type) { - if (mask_type == "padding") { - return NVTE_Mask_Type::NVTE_PADDING_MASK; - } else if (mask_type == "causal") { - return NVTE_Mask_Type::NVTE_CAUSAL_MASK; - } else if (mask_type == "no_mask") { - return NVTE_Mask_Type::NVTE_NO_MASK; - } else { - NVTE_ERROR("Invalid attention mask type. \n"); - } +// get the fused attention backend +NVTE_Fused_Attn_Backend get_fused_attn_backend( + const transformer_engine::DType q_dtype, + const transformer_engine::DType kv_dtype, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + float p_dropout, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim) { + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + static_cast(q_dtype), static_cast(kv_dtype), + qkv_layout, bias_type, attn_mask_type, + p_dropout, max_seqlen_q, max_seqlen_kv, head_dim); + return fused_attention_backend; } // fast zero-fills of tensors @@ -103,10 +81,8 @@ __global__ void unpack(at::PhiloxCudaState arg, int64_t* rng_state_ptr) { // extract PhiloxCudaState from CUDA random number generator at::PhiloxCudaState init_philox_state( at::CUDAGeneratorImpl* gen, - size_t max_seq_len, - size_t threads_per_cta) { + size_t elts_per_thread) { at::PhiloxCudaState philox_args; - size_t elts_per_thread = (max_seq_len * max_seq_len + threads_per_cta - 1)/threads_per_cta; std::lock_guard lock(gen->mutex_); philox_args = gen->philox_cuda_state(elts_per_thread); return philox_args; @@ -117,7 +93,7 @@ std::vector fused_attn_fwd_qkvpacked( size_t b, size_t max_seqlen, size_t total_seqs, size_t h, size_t d, bool is_training, float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens, const at::Tensor QKV, 
const transformer_engine::DType qkv_type, @@ -127,15 +103,18 @@ std::vector fused_attn_fwd_qkvpacked( c10::optional amax_S, c10::optional amax_O, const c10::optional Bias, - const c10::optional rng_gen) { + const c10::optional rng_gen, + size_t rng_elts_per_thread) { using namespace transformer_engine; // create output tensor O auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); auto O = torch::empty({static_cast(total_seqs), static_cast(h), static_cast(d)}, options); - if (set_zero) { + if (set_zero && (h * d % block_size == 0)) { mha_fill(O, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + O.fill_(0); } // construct NVTE tensors @@ -166,7 +145,7 @@ std::vector fused_attn_fwd_qkvpacked( } else { NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); } - if ((bias_type != "no_bias") && (Bias.has_value())) { + if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { auto bias_shape = Bias.value().sizes().vec(); std::vector shape{bias_shape.begin(), bias_shape.end()}; te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, @@ -175,23 +154,16 @@ std::vector fused_attn_fwd_qkvpacked( te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, DType::kInt32, nullptr, nullptr, nullptr); - // convert strings to enums - NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); - NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); - NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); - // extract random number generator seed and offset auto gen = at::get_generator_or_default( rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); - size_t threads_per_cta = 128; - at::PhiloxCudaState philox_args = init_philox_state(gen, max_seqlen, threads_per_cta); + at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); unpack<<<1, 1, 0, 
at::cuda::getCurrentCUDAStream()>>>( philox_args, static_cast(rng_state.data_ptr())); auto te_rng_state = makeTransformerEngineTensor(rng_state); // create auxiliary output tensors - // if training, tensors are [M, ZInv] NVTETensorPack nvte_aux_tensor_pack; nvte_tensor_pack_create(&nvte_aux_tensor_pack); @@ -209,7 +181,7 @@ std::vector fused_attn_fwd_qkvpacked( te_rng_state.data(), max_seqlen, is_training, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); @@ -219,10 +191,9 @@ std::vector fused_attn_fwd_qkvpacked( workspace_data.data_ptr(), workspace.shape(), workspace.dtype()); - // output_tensors = [O, nvte_aux_tensor_pack.tensors, rng_state] + // output_tensors = [O, nvte_aux_tensor_pack.tensors] std::vector output_tensors; output_tensors.push_back(O); - // nvte_aux_tensor_pack.size is 0 if inference for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); // allocate memory for nvte_aux_tensor_pack.tensors @@ -230,9 +201,6 @@ std::vector fused_attn_fwd_qkvpacked( output_tensors.push_back(output_tensor); tensor->data.dptr = output_tensor.data_ptr(); } - if (is_training) { - output_tensors.push_back(rng_state); - } // execute the kernel nvte_fused_attn_fwd_qkvpacked( @@ -245,14 +213,14 @@ std::vector fused_attn_fwd_qkvpacked( te_rng_state.data(), max_seqlen, is_training, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); // destroy tensor wrappers, but not allocated memory nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); - // if training, [O, M, ZInv, rng_state]; if inference, [O] + // if training, [O, softmax-related tensors, rng_state]; if inference, [O] return output_tensors; } @@ -261,7 +229,7 @@ std::vector fused_attn_bwd_qkvpacked( size_t b, size_t 
max_seqlen, size_t total_seqs, size_t h, size_t d, float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens, const at::Tensor QKV, const at::Tensor O, @@ -281,13 +249,18 @@ std::vector fused_attn_bwd_qkvpacked( // create output tensor dQKV at::Tensor dQKV = torch::empty_like(QKV); - if (set_zero) { + auto max_tokens = dQKV.size(0); + auto self_2d = dQKV.view({max_tokens, -1}); + auto fcd_size = self_2d.size(1); + if (set_zero && (fcd_size % block_size == 0)) { mha_fill(dQKV, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + dQKV.fill_(0); } auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); at::Tensor dBias; TensorWrapper te_dBias; - if (bias_type != "no_bias") { + if (bias_type != NVTE_NO_BIAS) { dBias = torch::zeros({1, static_cast(h), static_cast(max_seqlen), static_cast(max_seqlen)}, options); @@ -341,13 +314,7 @@ std::vector fused_attn_bwd_qkvpacked( NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); } - // convert strings to enums - NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); - NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); - NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); - // convert auxiliary tensors from forward into NVTETensors - // aux_ctx_tensors are [M, ZInv, rng_state] NVTETensorPack nvte_aux_tensor_pack; nvte_tensor_pack_create(&nvte_aux_tensor_pack); nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); @@ -380,7 +347,7 @@ std::vector fused_attn_bwd_qkvpacked( te_cu_seqlens.data(), max_seqlen, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); @@ -403,7 +370,7 @@ std::vector fused_attn_bwd_qkvpacked( te_cu_seqlens.data(), max_seqlen, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); @@ -419,7 +386,7 @@ std::vector fused_attn_fwd_kvpacked( size_t total_seqs_q, size_t total_seqs_kv, size_t h, size_t d, bool is_training, float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens_q, const at::Tensor cu_seqlens_kv, const at::Tensor Q, @@ -431,15 +398,18 @@ std::vector fused_attn_fwd_kvpacked( c10::optional amax_S, c10::optional amax_O, const c10::optional Bias, - const c10::optional rng_gen) { + const c10::optional rng_gen, + size_t rng_elts_per_thread) { using namespace transformer_engine; // create output tensor O auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); auto O = torch::empty({static_cast(total_seqs_q), static_cast(h), static_cast(d)}, options); - if (set_zero) { + if (set_zero && (h * d % block_size == 0)) { 
mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + O.fill_(0); } // construct NVTE tensors @@ -474,7 +444,7 @@ std::vector fused_attn_fwd_kvpacked( } else { NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); } - if ((bias_type != "no_bias") && (Bias.has_value())) { + if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { auto bias_shape = Bias.value().sizes().vec(); std::vector shape{bias_shape.begin(), bias_shape.end()}; te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, @@ -485,24 +455,16 @@ std::vector fused_attn_fwd_kvpacked( te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, DType::kInt32, nullptr, nullptr, nullptr); - // convert strings to enums - NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); - NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); - NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); - // extract rng seed and offset auto gen = at::get_generator_or_default( rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); - size_t threads_per_cta = 128; - at::PhiloxCudaState philox_args = init_philox_state( - gen, max(max_seqlen_q, max_seqlen_kv), threads_per_cta); + at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( philox_args, static_cast(rng_state.data_ptr())); auto te_rng_state = makeTransformerEngineTensor(rng_state); // create auxiliary output tensors - // if training, tensors are [M, ZInv] NVTETensorPack nvte_aux_tensor_pack; nvte_tensor_pack_create(&nvte_aux_tensor_pack); @@ -522,7 +484,7 @@ std::vector fused_attn_fwd_kvpacked( te_rng_state.data(), max_seqlen_q, max_seqlen_kv, is_training, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), 
at::cuda::getCurrentCUDAStream()); @@ -532,10 +494,9 @@ std::vector fused_attn_fwd_kvpacked( workspace_data.data_ptr(), workspace.shape(), workspace.dtype()); - // output_tensors = [O, nvte_aux_tensor_pack.tensors, rng_state] + // output_tensors = [O, nvte_aux_tensor_pack.tensors] std::vector output_tensors; output_tensors.push_back(O); - // nvte_aux_tensor_pack.size is 0 if inference for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); // allocate memory for nvte_aux_tensor_pack.tensors @@ -543,9 +504,6 @@ std::vector fused_attn_fwd_kvpacked( output_tensors.push_back(output_tensor); tensor->data.dptr = output_tensor.data_ptr(); } - if (is_training) { - output_tensors.push_back(rng_state); - } // execute the kernel nvte_fused_attn_fwd_kvpacked( @@ -560,14 +518,14 @@ std::vector fused_attn_fwd_kvpacked( te_rng_state.data(), max_seqlen_q, max_seqlen_kv, is_training, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); // destroy tensor wrappers, but not allocated memory nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); - // if training, [O, M, ZInv, rng_state]; if inference, [O] + // if training, [O, softmax-related tensors, rng_state]; if inference, [O] return output_tensors; } @@ -577,7 +535,7 @@ std::vector fused_attn_bwd_kvpacked( size_t total_seqs_q, size_t total_seqs_kv, size_t h, size_t d, float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens_q, const at::Tensor cu_seqlens_kv, const at::Tensor Q, @@ -600,14 +558,23 @@ std::vector fused_attn_bwd_kvpacked( // create output tensors dQ and dKV at::Tensor dQ = torch::empty_like(Q); at::Tensor dKV = torch::empty_like(KV); - if (set_zero) { + auto 
max_tokens_q = dQ.size(0); + auto self_2d_q = dQ.view({max_tokens_q, -1}); + auto fcd_size_q = self_2d_q.size(1); + auto max_tokens_kv = dQ.size(0); + auto self_2d_kv = dQ.view({max_tokens_kv, -1}); + auto fcd_size_kv = self_2d_kv.size(1); + if (set_zero && (fcd_size_q % block_size == 0) && (fcd_size_kv % block_size == 0)) { mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); mha_fill(dKV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + dQ.fill_(0); + dKV.fill_(0); } auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); at::Tensor dBias; TensorWrapper te_dBias; - if (bias_type != "no_bias") { + if (bias_type != NVTE_NO_BIAS) { dBias = torch::zeros({1, static_cast(h), static_cast(max_seqlen_q), static_cast(max_seqlen_kv)}, options); @@ -674,13 +641,7 @@ std::vector fused_attn_bwd_kvpacked( te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, DType::kInt32, nullptr, nullptr, nullptr); - // convert strings to enums - NVTE_QKV_Layout qkv_layout_enum = get_nvte_qkv_layout(qkv_layout); - NVTE_Bias_Type bias_type_enum = get_nvte_bias_type(bias_type); - NVTE_Mask_Type attn_mask_type_enum = get_nvte_mask_type(attn_mask_type); - // convert auxiliary tensors from forward to NVTETensors - // aux_ctx_tensors are [M, ZInv, rng_state] NVTETensorPack nvte_aux_tensor_pack; nvte_tensor_pack_create(&nvte_aux_tensor_pack); nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); @@ -711,7 +672,7 @@ std::vector fused_attn_bwd_kvpacked( te_cu_seqlens_kv.data(), max_seqlen_q, max_seqlen_kv, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); @@ -737,7 +698,7 @@ std::vector fused_attn_bwd_kvpacked( te_cu_seqlens_kv.data(), max_seqlen_q, max_seqlen_kv, attn_scale, p_dropout, - qkv_layout_enum, bias_type_enum, attn_mask_type_enum, + 
qkv_layout, bias_type, attn_mask_type, workspace.data(), at::cuda::getCurrentCUDAStream()); @@ -2227,6 +2188,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("dswiglu", &dswiglu, "Backward of SwiGLU"); m.def("fa_prepare_fwd", &fa_prepare_fwd, "Prepare QKV for Flash Attention"); m.def("fa_prepare_bwd", &fa_prepare_bwd, "Backward of QKV preparation for Flash Attention"); + m.def("get_fused_attn_backend", &get_fused_attn_backend, "Get Fused Attention backend"); // Misc m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); @@ -2279,11 +2241,37 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { .value("GEMM1_OUTPUT", transformer_engine::FP8FwdTensors::GEMM1_OUTPUT) .value("GEMM2_INPUT", transformer_engine::FP8FwdTensors::GEMM2_INPUT) .value("GEMM2_WEIGHT", transformer_engine::FP8FwdTensors::GEMM2_WEIGHT) - .value("GEMM2_OUTPUT", transformer_engine::FP8FwdTensors::GEMM2_OUTPUT); + .value("GEMM2_OUTPUT", transformer_engine::FP8FwdTensors::GEMM2_OUTPUT) + .value("GEMM3_INPUT", transformer_engine::FP8FwdTensors::GEMM3_INPUT) + .value("GEMM3_WEIGHT", transformer_engine::FP8FwdTensors::GEMM3_WEIGHT) + .value("GEMM3_OUTPUT", transformer_engine::FP8FwdTensors::GEMM3_OUTPUT); py::enum_(m, "FP8BwdTensors") .value("GRAD_OUTPUT1", transformer_engine::FP8BwdTensors::GRAD_OUTPUT1) .value("GRAD_INPUT1", transformer_engine::FP8BwdTensors::GRAD_INPUT1) .value("GRAD_OUTPUT2", transformer_engine::FP8BwdTensors::GRAD_OUTPUT2) - .value("GRAD_INPUT2", transformer_engine::FP8BwdTensors::GRAD_INPUT2); + .value("GRAD_INPUT2", transformer_engine::FP8BwdTensors::GRAD_INPUT2) + .value("GRAD_OUTPUT3", transformer_engine::FP8BwdTensors::GRAD_OUTPUT3) + .value("GRAD_INPUT3", transformer_engine::FP8BwdTensors::GRAD_INPUT3); + + py::enum_(m, "NVTE_Bias_Type") + .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) + .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) + .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + + py::enum_(m, 
"NVTE_Mask_Type") + .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) + .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) + .value("NVTE_CAUSAL_MASK", NVTE_Mask_Type::NVTE_CAUSAL_MASK); + + py::enum_(m, "NVTE_QKV_Layout") + .value("NVTE_NOT_INTERLEAVED", NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) + .value("NVTE_QKV_INTERLEAVED", NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) + .value("NVTE_KV_INTERLEAVED", NVTE_QKV_Layout::NVTE_KV_INTERLEAVED); + + py::enum_(m, "NVTE_Fused_Attn_Backend") + .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) + .value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) + .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) + .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); } diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index a2083e5492..1467397c63 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -7,17 +7,22 @@ #include "common.h" #include "../common.h" -NVTE_QKV_Layout get_nvte_qkv_layout(const std::string qkv_layout); - -NVTE_Bias_Type get_nvte_bias_type(const std::string bias_type); - -NVTE_Mask_Type get_nvte_mask_type(const std::string mask_type); +NVTE_Fused_Attn_Backend get_fused_attn_backend( + const transformer_engine::DType q_dtype, + const transformer_engine::DType kv_dtype, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + float p_dropout, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim); std::vector fused_attn_fwd_qkvpacked( size_t b, size_t max_seqlen, size_t total_seqs, - size_t h, size_t d, - bool is_training, float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + size_t h, size_t d, bool is_training, + float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, + 
NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens, const at::Tensor QKV, const transformer_engine::DType qkv_type, @@ -27,13 +32,16 @@ std::vector fused_attn_fwd_qkvpacked( c10::optional amax_S, c10::optional amax_O, const c10::optional Bias, - const c10::optional rng_gen); + const c10::optional rng_gen, + size_t rng_elts_per_thread); std::vector fused_attn_bwd_qkvpacked( size_t b, size_t max_seqlen, size_t total_seqs, - size_t h, size_t d, - float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + size_t h, size_t d, float attn_scale, + float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens, const at::Tensor QKV, const at::Tensor O, @@ -53,9 +61,11 @@ std::vector fused_attn_bwd_qkvpacked( std::vector fused_attn_fwd_kvpacked( size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, size_t total_seqs_q, size_t total_seqs_kv, - size_t h, size_t d, - bool is_training, float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + size_t h, size_t d, bool is_training, + float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens_q, const at::Tensor cu_seqlens_kv, const at::Tensor Q, @@ -67,14 +77,17 @@ std::vector fused_attn_fwd_kvpacked( c10::optional amax_S, c10::optional amax_O, const c10::optional Bias, - const c10::optional rng_gen); + const c10::optional rng_gen, + size_t rng_elts_per_thread); std::vector fused_attn_bwd_kvpacked( size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, size_t total_seqs_q, size_t total_seqs_kv, - size_t h, size_t d, - float attn_scale, float p_dropout, bool set_zero, - std::string qkv_layout, std::string bias_type, std::string attn_mask_type, + size_t h, size_t d, 
float attn_scale, + float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, const at::Tensor cu_seqlens_q, const at::Tensor cu_seqlens_kv, const at::Tensor Q, diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index b30236acad..6a39c2cab1 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -400,6 +400,9 @@ def forward( checkpoint_core_attention: bool = False, inference_params: Optional[Any] = None, rotary_pos_emb: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[torch.Tensor] = None, + fast_zero_fill: bool = True, ) -> torch.Tensor: """ Transformer Layer: attention block and a feedforward network (MLP) @@ -442,6 +445,12 @@ def forward( rotary_pos_emb: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], default = `None` Embeddings for query and key tensors for applying rotary position embedding. By default no input embedding is applied. + core_attention_bias_type: str, default = `no_bias` + Bias type, {`no_bias`, `pre_scale_bias`, 'post_scale_bias`} + core_attention_bias: Optional[torch.Tensor], default = `None` + Bias tensor for Q * K.T + fast_zero_fill: bool, default = `True` + Whether to set output tensors to 0 or not before use. 
""" hidden_states = hidden_states.contiguous() @@ -470,6 +479,9 @@ def forward( is_first_microbatch=is_first_microbatch, checkpoint_core_attention=checkpoint_core_attention, rotary_pos_emb=rotary_pos_emb, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + fast_zero_fill=fast_zero_fill, ) if self.apply_residual_connection_post_layernorm and not self.output_layernorm: @@ -513,6 +525,9 @@ def forward( encoder_output=encoder_output, is_first_microbatch=is_first_microbatch, checkpoint_core_attention=checkpoint_core_attention, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + fast_zero_fill=fast_zero_fill, ) if self.apply_residual_connection_post_layernorm: attention_output, attention_bias, residual = inter_attention_outputs From ac919e4559f1d04e782da31268894272c8eb79d4 Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Tue, 20 Jun 2023 17:59:31 -0700 Subject: [PATCH 035/427] Fix BF16 ONNX export for successful ONNX Runtime Verification (#290) Signed-off-by: Asfiya Baig --- transformer_engine/pytorch/attention.py | 7 ++++++- transformer_engine/pytorch/te_onnx_extensions.py | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 492ebe5cb6..ab164cff79 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -180,14 +180,19 @@ def forward( key_layer = key_layer.reshape(output_size[3], output_size[0] * output_size[1], -1) # preallocting result tensor: [b * np, sq, sk] + # WAR to set dtype to FP32 as ONNX lacks BF16 support for ConstantOfShape operator + is_bf16 = query_layer.dtype == torch.bfloat16 matmul_result = torch.empty( output_size[0] * output_size[1], output_size[2], output_size[3], - dtype=query_layer.dtype, + dtype=torch.float32 if is_in_onnx_export_mode() and is_bf16 else 
query_layer.dtype, device=torch.cuda.current_device(), ) + if is_in_onnx_export_mode() and is_bf16: + matmul_result = matmul_result.bfloat16() + scale = self.norm_factor if apply_qk_layer_scaling: scale *= self.layer_number diff --git a/transformer_engine/pytorch/te_onnx_extensions.py b/transformer_engine/pytorch/te_onnx_extensions.py index f641926cc2..3f3e97f198 100755 --- a/transformer_engine/pytorch/te_onnx_extensions.py +++ b/transformer_engine/pytorch/te_onnx_extensions.py @@ -254,6 +254,7 @@ def onnx_te_gemm( """ONNX graph for te_gemm""" # pylint: disable=unused-argument is_fp16 = is_dtype_fp16(inputs) + is_bf16 = is_dtype_bf16(inputs) if input_type == int(tex.DType.kFloat8E4M3): inputs = dequantize(g, inputs, input_scale_inverse, input_fp8_tensor, out_type) @@ -277,6 +278,8 @@ def onnx_te_gemm( else: if is_fp16: output = g.op("Cast", output, to_i=_C_onnx.TensorProtoDataType.FLOAT16) + elif is_bf16: + output = g.op("Cast", output, to_i=_C_onnx.TensorProtoDataType.BFLOAT16) return output From 96ed6fc69d99a9cff49637dbc58c837c8d921ad7 Mon Sep 17 00:00:00 2001 From: Neta Zmora <96238833+nzmora-nvidia@users.noreply.github.com> Date: Fri, 23 Jun 2023 05:04:33 +0300 Subject: [PATCH 036/427] Fix layer_norm ONNX export (#293) * Fix ONNX export of layer_norm ONNX has a spec bug: ConstantOfShape supports all dtypes except for BF16. To WAR we use dtype FP32 and then cast to BF16. Will also issue a PR to the ONNX sig committee to change the spec in opset 20. 
Signed-off-by: Neta Zmora * fix lint Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Neta Zmora Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- .../pytorch/te_onnx_extensions.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/te_onnx_extensions.py b/transformer_engine/pytorch/te_onnx_extensions.py index 3f3e97f198..5990160294 100755 --- a/transformer_engine/pytorch/te_onnx_extensions.py +++ b/transformer_engine/pytorch/te_onnx_extensions.py @@ -304,6 +304,20 @@ def onnx_layernorm_fwd_fp8(g, inputs, weight, bias, eps, scale, amax, def onnx_layernorm_fwd(g, inputs, weight, bias, eps, zero_centered_gamma): """ONNX graph for layernorm_fwd""" # pylint: disable=unused-argument + + def ones_like(inp, dtype): + """Returns a tensor filled with the scalar value 1, with the same size as input and + with dtype data-type""" + shape = g.op("Shape", inp) + # WAR ONNX spec: ConstantOfShape accepts all data types except for BF16. To WAR + # create a ConstantOfShape with type FP32 and then add a Cast to BF16. 
+ is_bf16 = dtype == torch.bfloat16 + one = g.op("ConstantOfShape", shape, value_t=torch.tensor([1], + dtype=torch.float32 if is_bf16 else dtype)) + if is_bf16: + one = g.op("Cast", one, to_i=_C_onnx.TensorProtoDataType.BFLOAT16) + return one + normalized_shape = torch.onnx.symbolic_helper._get_tensor_sizes(inputs) if normalized_shape is None: ndim = torch.onnx.symbolic_helper._get_tensor_rank(inputs) @@ -314,8 +328,7 @@ def onnx_layernorm_fwd(g, inputs, weight, bias, eps, zero_centered_gamma): if zero_centered_gamma: inputs_dtype = inputs.type().dtype() - shape = g.op("Shape", weight) - one = g.op("ConstantOfShape", shape, value_t=torch.tensor([1], dtype=inputs_dtype)) + one = ones_like(weight, inputs_dtype) weight = g.op("Add", weight, one) axis = -len(normalized_shape) From 94beb13062f98e03ca71197aeab6821545c4e679 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Tue, 18 Jul 2023 22:47:31 +0800 Subject: [PATCH 037/427] [JAX] Fully remove attn_type and set self_attn_mask_type default to 'causal' (#324) * Fully remove attn_type and set self_attn_mask_type default to 'causal' Signed-off-by: Reese Wang * Fix tests with new arguments Signed-off-by: Reese Wang * Explicit self_attn_mask_type for examples Signed-off-by: Reese Wang * Update transformer_engine/jax/flax/transformer.py Co-authored-by: Kirthi Shankar Sivamani Signed-off-by: zlsh80826 * Update transformer_engine/jax/flax/transformer.py Co-authored-by: Kirthi Shankar Sivamani Signed-off-by: zlsh80826 --------- Signed-off-by: Reese Wang Signed-off-by: zlsh80826 Co-authored-by: Kirthi Shankar Sivamani --- .../encoder/test_model_parallel_encoder.py | 1 + examples/jax/encoder/test_multigpu_encoder.py | 1 + .../encoder/test_multiprocessing_encoder.py | 1 + .../jax/encoder/test_single_gpu_encoder.py | 1 + tests/jax/test_layer.py | 2 + tests/jax/test_praxis_layers.py | 20 ++++----- transformer_engine/jax/flax/transformer.py | 43 +++---------------- transformer_engine/jax/praxis/transformer.py | 6 +-- 8 files changed, 24 
insertions(+), 51 deletions(-) diff --git a/examples/jax/encoder/test_model_parallel_encoder.py b/examples/jax/encoder/test_model_parallel_encoder.py index 4a26244fff..75c41964c9 100644 --- a/examples/jax/encoder/test_model_parallel_encoder.py +++ b/examples/jax/encoder/test_model_parallel_encoder.py @@ -48,6 +48,7 @@ def __call__(self, x, mask, disable_dropout=False): attention_dropout=0.1, dropout_rng_name=DROPOUT_KEY, layer_type=te_flax.TransformerLayerType.ENCODER, + self_attn_mask_type='padding', enable_relative_embedding=False, dtype=jnp.bfloat16) x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) diff --git a/examples/jax/encoder/test_multigpu_encoder.py b/examples/jax/encoder/test_multigpu_encoder.py index ef3837c8d4..53be4b7134 100644 --- a/examples/jax/encoder/test_multigpu_encoder.py +++ b/examples/jax/encoder/test_multigpu_encoder.py @@ -45,6 +45,7 @@ def __call__(self, x, mask, disable_dropout=False): attention_dropout=0.1, dropout_rng_name=DROPOUT_KEY, layer_type=te_flax.TransformerLayerType.ENCODER, + self_attn_mask_type='padding', enable_relative_embedding=False, dtype=jnp.bfloat16) x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) diff --git a/examples/jax/encoder/test_multiprocessing_encoder.py b/examples/jax/encoder/test_multiprocessing_encoder.py index a21346458c..c1cf94332f 100644 --- a/examples/jax/encoder/test_multiprocessing_encoder.py +++ b/examples/jax/encoder/test_multiprocessing_encoder.py @@ -51,6 +51,7 @@ def __call__(self, x, mask, disable_dropout=False): attention_dropout=0.1, dropout_rng_name=DROPOUT_KEY, layer_type=te_flax.TransformerLayerType.ENCODER, + self_attn_mask_type='padding', enable_relative_embedding=False, dtype=jnp.bfloat16) x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) diff --git a/examples/jax/encoder/test_single_gpu_encoder.py b/examples/jax/encoder/test_single_gpu_encoder.py index 62798eed82..6e519d87cc 100644 --- 
a/examples/jax/encoder/test_single_gpu_encoder.py +++ b/examples/jax/encoder/test_single_gpu_encoder.py @@ -40,6 +40,7 @@ def __call__(self, x, mask, disable_dropout=False): attention_dropout=0.1, dropout_rng_name=DROPOUT_KEY, layer_type=te_flax.TransformerLayerType.ENCODER, + self_attn_mask_type='padding', enable_relative_embedding=False, dtype=jnp.bfloat16) x = te_Encoder()(x, attention_mask=mask, deterministic=disable_dropout) diff --git a/tests/jax/test_layer.py b/tests/jax/test_layer.py index 30143e5f75..ef1faebaf0 100644 --- a/tests/jax/test_layer.py +++ b/tests/jax/test_layer.py @@ -171,6 +171,7 @@ def forward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e-08): layer_cls = partial(TransformerLayer, hidden_dropout_dims=(sequence_dim,), layer_type=TransformerLayerType.ENCODER, + self_attn_mask_type='padding', dtype=dtype, **te_layer_attrs) @@ -215,6 +216,7 @@ def forward_backward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e- layer_cls = partial(TransformerLayer, hidden_dropout_dims=(sequence_dim,), layer_type=TransformerLayerType.ENCODER, + self_attn_mask_type='padding', dtype=dtype, **te_layer_attrs) ref_layer, ref_params, ref_others = generate_layer(ref_layer_cls, init_rng, inputs, diff --git a/tests/jax/test_praxis_layers.py b/tests/jax/test_praxis_layers.py index de44b3a163..7a329d39ac 100644 --- a/tests/jax/test_praxis_layers.py +++ b/tests/jax/test_praxis_layers.py @@ -659,38 +659,38 @@ def test_forward(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e-08): class MultiHeadAttnAttr: USE_BIAS = 'use_bias' LN_TYPE = 'layernorm_type' - ATTN_TYPE = 'attn_type' + ATTN_MASK_TYPE = 'attn_mask_type' ZERO_CEN = 'zero_centered_gamma' ATTRS = [{ USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: False, - ATTN_TYPE: 'padding' + ATTN_MASK_TYPE: 'padding' }, { USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: True, - ATTN_TYPE: 'padding' + ATTN_MASK_TYPE: 'padding' }, { USE_BIAS: True, LN_TYPE: 'rmsnorm', ZERO_CEN: False, - ATTN_TYPE: 'padding' + 
ATTN_MASK_TYPE: 'padding' }, { USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: False, - ATTN_TYPE: 'causal' + ATTN_MASK_TYPE: 'causal' }, { USE_BIAS: True, LN_TYPE: 'layernorm', ZERO_CEN: True, - ATTN_TYPE: 'causal' + ATTN_MASK_TYPE: 'causal' }, { USE_BIAS: True, LN_TYPE: 'rmsnorm', ZERO_CEN: False, - ATTN_TYPE: 'causal' + ATTN_MASK_TYPE: 'causal' }] @@ -714,7 +714,7 @@ def generate_praxis_p_and_flax_cls(self, dtype, attrs): bias_init = WeightInit.Constant(0.0) apply_residual_connection_post_layernorm = False output_layernorm = False - attn_type = attrs[MultiHeadAttnAttr.ATTN_TYPE] + attn_mask_type = attrs[MultiHeadAttnAttr.ATTN_MASK_TYPE] fuse_qkv: bool = True transpose_batch_sequence = True scale_attn_logits = False @@ -734,7 +734,7 @@ def generate_praxis_p_and_flax_cls(self, dtype, attrs): bias_init=bias_init, apply_residual_connection_post_layernorm=apply_residual_connection_post_layernorm, output_layernorm=output_layernorm, - attn_type=attn_type, + attn_mask_type=attn_mask_type, fuse_qkv=fuse_qkv, transpose_batch_sequence=transpose_batch_sequence, scale_attn_logits=scale_attn_logits, @@ -752,7 +752,7 @@ def generate_praxis_p_and_flax_cls(self, dtype, attrs): bias_init=TransformerEngineBaseLayer.generate_params_init("bias", bias_init), apply_residual_connection_post_layernorm=apply_residual_connection_post_layernorm, output_layernorm=output_layernorm, - attn_type=attn_type, + attn_mask_type=attn_mask_type, fuse_qkv=fuse_qkv, transpose_batch_sequence=transpose_batch_sequence, scale_attn_logits=scale_attn_logits, diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index 14ad7f02e8..a5cf05bb5e 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -202,10 +202,10 @@ class MultiHeadAttention(nn.Module): Multi-head Attention (MHA), including Query, Key, Value and Output projection. - .. warning:: + .. 
note:: - Argument :attr:`attn_type` is deprecated and superseded by :attr:`attn_mask_type`. - :attr:`attn_type` is ignored in version 0.10 and will be fully removed in version 0.11. + Argument :attr:`mask` will be ignored when + :attr:`attn_mask_type` is set to `"causal"`. Parameters ---------- @@ -244,11 +244,9 @@ class MultiHeadAttention(nn.Module): Indicate if apply residual connection with the output of layer normalization. output_layernorm : bool, default = False Indicate if apply a layer normalization at the end of MHA. - attn_type: Any, defult = None - *Deprecated*, will be ignored in v0.10 and be fully removed in v0.11. - Please use `attn_mask_type` to config the attention mask. attn_mask_type: {'causal', 'padding'}, default = 'causal' Type of attention mask passed into softmax operation. + Introduced in v0.10.0. Optimization parameters ----------------------- @@ -284,8 +282,6 @@ class MultiHeadAttention(nn.Module): bias_init: Initializer = nn.initializers.zeros apply_residual_connection_post_layernorm: bool = False output_layernorm: bool = False - # TODO(rewang): remove attn_type and the related doc after v0.11 - attn_type: Any = None attn_mask_type: str = 'causal' dtype: DType = jnp.float32 fuse_qkv: bool = True @@ -297,14 +293,6 @@ class MultiHeadAttention(nn.Module): def __post_init__(self): if self.kernel_init is None: self.kernel_init = nn.initializers.variance_scaling(1.0, 'fan_in', 'normal') - # TODO(rewang): remove attn_type after v0.11 - if self.attn_type is not None: - warnings.warn( - "The 'attn_type' argument in the 'MultiHeadAttention' is" - " deprecated in version 0.10 and will be removed in version 0.11." - " Passing value in attn_type will be ignored, please use `attn_mask_type`" - " to config the attention mask type.", - category=DeprecationWarning) super().__post_init__() @nn.compact @@ -803,13 +791,6 @@ class TransformerLayer(nn.Module): an attention block and a feedforward network (MLP). 
This standard layer is based on the paper “Attention Is All You Need”. - .. warning:: - - Argument :attr:`self_attn_mask_type` is introduced in version 0.10. - Starting from version 0.11, the default value will be `"causal"`. - However, to ensure compatibility with earlier versions, before 0.11, - the default value will be `"padding"` for the encoder and `"causal"` for the decoder. - .. note:: Argument :attr:`attention_mask` will be ignored when @@ -877,6 +858,7 @@ class TransformerLayer(nn.Module): Transformer in conjunction with the TransformerLayerType.ENCODER option. self_attn_mask_type: {'causal', 'padding'}, default = 'causal' Type of attention mask passed into softmax operation. + Introduced in v0.10.0. enable_relative_embedding: bool, default = True Whether to enable relative embedding as shifting of attention logits. relative_embedding: flax.linen.Module, default = None @@ -930,7 +912,7 @@ class TransformerLayer(nn.Module): output_layernorm: bool = False float32_attention_logits: bool = False layer_type: TransformerLayerType = TransformerLayerType.ENCODER - self_attn_mask_type: str = None # TODO(rewang): default to 'causal' after 0.11 + self_attn_mask_type: str = 'causal' enable_relative_embedding: bool = True relative_embedding: nn.Module = None dtype: DType = jnp.float32 @@ -946,19 +928,6 @@ def __post_init__(self): if self.mlp_kernel_init is None: self.mlp_kernel_init = nn.initializers.variance_scaling(1.0, 'fan_in', 'truncated_normal') - # TODO(rewang): default to 'causal' in 0.11 (also updated the doc after 0.11) - if self.self_attn_mask_type is None: - warnings.warn( - "The 'self_attn_mask_type' argument in the 'TransformerLayer' is" - " introduced in version 0.10. Starting from version 0.11, the default" - " value will be 'causal'. 
However, to ensure compatibility with earlier" - " versions, before 0.11, the default value will be 'padding' for the" - " encoder and 'causal' for the decoder.", - category=FutureWarning) - if self.layer_type == TransformerLayerType.ENCODER: - self.self_attn_mask_type = 'padding' - else: - self.self_attn_mask_type = 'causal' super().__post_init__() @nn.compact diff --git a/transformer_engine/jax/praxis/transformer.py b/transformer_engine/jax/praxis/transformer.py index 1260c266b5..9bf9628490 100644 --- a/transformer_engine/jax/praxis/transformer.py +++ b/transformer_engine/jax/praxis/transformer.py @@ -5,7 +5,7 @@ Praxis Modules related Transformer """ from functools import partial -from typing import Any, Optional, Sequence, Tuple +from typing import Optional, Sequence, Tuple from praxis import pax_fiddle from praxis.base_layer import WeightInit @@ -73,8 +73,6 @@ class MultiHeadAttention(TransformerEngineBaseLayer): bias_init: WeightInit = WeightInit.Constant(0.0) apply_residual_connection_post_layernorm: bool = False output_layernorm: bool = False - # TODO(rewang): remove attn_type and the related doc after v0.11 - attn_type: Any = None attn_mask_type: str = 'causal' fuse_qkv: bool = True transpose_batch_sequence: bool = True @@ -147,7 +145,7 @@ class TransformerLayer(TransformerEngineBaseLayer): output_layernorm: bool = False float32_attention_logits: bool = False layer_type: TransformerLayerType = TransformerLayerType.ENCODER - self_attn_mask_type: str = None # TODO(rewang): default to 'causal' after 0.11 + self_attn_mask_type: str = 'causal' enable_relative_embedding: bool = True relative_embedding: pax_fiddle.Config[RelativePositionBiases] = pax_fiddle.template_field(None) drop_path: float = 0.0 From 32ad922b143c4c6da4f0e1aaf65b12e0fe0de035 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 18 Jul 2023 21:27:26 -0400 Subject: [PATCH 038/427] FA does not support head_dim > 64 on Ada (#328) * FA does not support head_dim > 64 on Ada 
Signed-off-by: Kirthi Shankar Sivamani * Add cc8.7 to no FA list Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 44baa5cda5..9cf59e5b01 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -879,7 +879,7 @@ def forward( if (query_layer.dtype not in [torch.bfloat16, torch.float16] or key_layer.dtype not in [torch.bfloat16, torch.float16] or value_layer.dtype not in [torch.bfloat16, torch.float16] - or (self.device_compute_capability == 8.6 and key_layer.shape[-1] > 64) + or (self.device_compute_capability in (8.6, 8.7, 8.9) and key_layer.shape[-1] > 64) ): use_flash_attention = False From 33576bec9ed8534d97920010097e8db7687525ab Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 18 Jul 2023 21:30:23 -0400 Subject: [PATCH 039/427] FlashAttention 2.0 support (#329) * FA v2.0 support Signed-off-by: Kirthi Shankar Sivamani * fix typo Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- setup.py | 2 +- transformer_engine/pytorch/attention.py | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 98edddfc3e..81ba934cbd 100644 --- a/setup.py +++ b/setup.py @@ -290,7 +290,7 @@ def add_unique(l: List[str], vals: Union[str, List[str]]) -> None: # Framework-specific requirements if "pytorch" in frameworks(): - add_unique(install_reqs, ["torch", "flash-attn>=1.0.6, <=1.0.7"]) + add_unique(install_reqs, ["torch", "flash-attn>=1.0.6, <=2.0.0.post1"]) add_unique(test_reqs, ["numpy", "onnxruntime", "torchvision"]) if "jax" in frameworks(): if not found_pybind11(): diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 
9cf59e5b01..48600b17df 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -12,8 +12,6 @@ import torch -from flash_attn.flash_attn_interface import flash_attn_unpadded_func - import transformer_engine_extensions as tex from transformer_engine.pytorch.cpp_extensions.fused_attn import ( fused_attn_fwd_qkvpacked, @@ -47,6 +45,12 @@ _flash_attn_version = packaging.version.Version(version("flash-attn")) _flash_attn_version_required = packaging.version.Version("1.0.6") +_flash_attn_2_available = _flash_attn_version >= packaging.version.Version("2") + +if _flash_attn_2_available: + from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_forward_func # pylint: disable=no-name-in-module +else: + from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_forward_func # pylint: disable=no-name-in-module __all__ = ["DotProductAttention"] @@ -397,11 +401,14 @@ def forward( device=query_layer.device) with self.attention_dropout_ctx(): - output = flash_attn_unpadded_func( + fa_optional_forward_kwargs = {} + if not _flash_attn_2_available: + fa_optional_forward_kwargs["deterministic"] = self.deterministic + output = flash_attn_forward_func( query_layer, key_layer, value_layer, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, self.attention_dropout if self.training else 0.0, softmax_scale=1.0/self.norm_factor, causal=self.attn_causal_mask, - deterministic=self.deterministic, + **fa_optional_forward_kwargs ) # [(b sq), np, hn] -> [sq, b, (np hn)] @@ -700,11 +707,10 @@ class DotProductAttention(torch.nn.Module): .. warning:: - For the default attention mechanism, this module executes a non-deterministic version of - `flash-attn `_ whenever possible in order to - achieve optimal performance. To observe deterministic behavior, set the environment - variable :attr:`NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`. In order to disable - `flash-attn` entirely, set :attr:`NVTE_FLASH_ATTN=0`. 
+ FlashAttention uses a non-deterministic algorithm for optimal performance. To observe + deterministic behavior at the cost of performance, use FlashAttention version < `2.0.0` + and set the environment variable :attr:`NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`. In order + to disable`flash-attn` entirely, set :attr:`NVTE_FLASH_ATTN=0`. Parameters ---------- From 07774089b079016ce79c16935f9e1c04fc3c62e2 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 19 Jul 2023 21:42:38 -0400 Subject: [PATCH 040/427] Relax FA 2.0 checks for Ada (#331) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/attention.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 48600b17df..f1d86e224d 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -885,10 +885,15 @@ def forward( if (query_layer.dtype not in [torch.bfloat16, torch.float16] or key_layer.dtype not in [torch.bfloat16, torch.float16] or value_layer.dtype not in [torch.bfloat16, torch.float16] - or (self.device_compute_capability in (8.6, 8.7, 8.9) and key_layer.shape[-1] > 64) ): use_flash_attention = False + if key_layer.shape[-1] > 64: + if self.device_compute_capability in (8.6, 8.7): + use_flash_attention = False + elif not _flash_attn_2_available and self.device_compute_capability == 8.9: + use_flash_attention = False + if self.attn_mask_type == "padding" and attention_mask is not None: use_flash_attention = False use_fused_attention = False From 3f9db848564ec78d9c7b215a5bd81978b57b0ffe Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 25 Jul 2023 17:59:32 -0700 Subject: [PATCH 041/427] Make QK layer scaling opt-in (#339) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/attention.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/attention.py 
b/transformer_engine/pytorch/attention.py index f1d86e224d..e75b67784b 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -157,6 +157,10 @@ def __init__( # on average it should not be partition dependent. self.attention_dropout = torch.nn.Dropout(attention_dropout) + # An FP16 training trick required for certain GPT-like models. + self.apply_qk_layer_scaling = ( + bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and layer_number is not None) + def forward( self, query_layer: torch.Tensor, @@ -166,7 +170,7 @@ def forward( ) -> torch.Tensor: """core attention fprop""" batch_size, seqlen = query_layer.shape[1], query_layer.shape[0] - apply_qk_layer_scaling = self.layer_number is not None and key_layer.dtype == torch.float16 + apply_qk_layer_scaling = self.apply_qk_layer_scaling and key_layer.dtype == torch.float16 # [b, np, sq, sk] output_size = ( From 058f9126871477fe7fc5e950964a304f406dde16 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Thu, 27 Jul 2023 23:48:22 +0200 Subject: [PATCH 042/427] Exposing RMSNorm in pyTorch (#306) * Exposing RMSNorm in pyTorch extensions Signed-off-by: Przemek Tredak * First pass at the Python API Signed-off-by: Przemek Tredak * Small fixes Signed-off-by: Przemek Tredak * Added numerics tests and fixed issues Signed-off-by: Przemek Tredak * Lint fixes Signed-off-by: Przemek Tredak * Added RMSNorm to LayerNormMLP Signed-off-by: Przemek Tredak * Added ONNX export and tests for RMSNorm Signed-off-by: Przemek Tredak * Fix python lint Signed-off-by: Przemek Tredak * Fix BERT case Signed-off-by: Przemek Tredak * Added normalization option to the TransformerLayer Added tests Fixed test failures Signed-off-by: Przemek Tredak * Fix documentation Co-authored-by: Przemyslaw Tredak Signed-off-by: Kirthi Shankar Sivamani * Fix kwarg bug Signed-off-by: Kirthi Shankar Sivamani * Fix IMA and invalid type error Signed-off-by: Kirthi Shankar Sivamani * Increase RMSNorm threshold for bf16 
case Signed-off-by: Kirthi Shankar Sivamani * Fix ONNX tests Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Przemek Tredak Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- docs/api/c/index.rst | 1 + docs/api/c/rmsnorm.rst | 9 + docs/api/pytorch.rst | 2 + setup.py | 8 +- tests/pytorch/test_numerics.py | 108 +- tests/pytorch/test_onnx_export.py | 100 +- tests/pytorch/test_sanity.py | 52 +- transformer_engine/pytorch/__init__.py | 3 + transformer_engine/pytorch/attention.py | 3 + .../pytorch/cpp_extensions/normalization.py | 85 +- transformer_engine/pytorch/csrc/common.cu | 8 + transformer_engine/pytorch/csrc/common.h | 3 + transformer_engine/pytorch/csrc/extensions.cu | 2277 ----------------- transformer_engine/pytorch/csrc/extensions.h | 81 + .../pytorch/csrc/extensions/activation.cu | 267 ++ .../pytorch/csrc/extensions/attention.cu | 876 +++++++ .../pytorch/csrc/extensions/cast.cu | 75 + .../pytorch/csrc/extensions/gemm.cu | 75 + .../pytorch/csrc/extensions/misc.cu | 25 + .../pytorch/csrc/extensions/normalization.cu | 404 +++ .../pytorch/csrc/extensions/pybind.cpp | 158 ++ .../pytorch/csrc/extensions/softmax.cu | 211 ++ .../pytorch/csrc/extensions/transpose.cu | 321 +++ transformer_engine/pytorch/csrc/ts_fp8_op.cpp | 40 + transformer_engine/pytorch/module/__init__.py | 1 + transformer_engine/pytorch/module/_common.py | 95 + .../pytorch/module/layernorm_linear.py | 177 +- .../pytorch/module/layernorm_mlp.py | 122 +- transformer_engine/pytorch/module/rmsnorm.py | 168 ++ .../pytorch/te_onnx_extensions.py | 82 +- transformer_engine/pytorch/transformer.py | 16 +- 31 files changed, 3374 insertions(+), 2479 deletions(-) create mode 100644 docs/api/c/rmsnorm.rst delete mode 100644 transformer_engine/pytorch/csrc/extensions.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/activation.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/attention.cu create mode 100644 
transformer_engine/pytorch/csrc/extensions/cast.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/gemm.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/misc.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/normalization.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/pybind.cpp create mode 100644 transformer_engine/pytorch/csrc/extensions/softmax.cu create mode 100644 transformer_engine/pytorch/csrc/extensions/transpose.cu create mode 100644 transformer_engine/pytorch/module/_common.py create mode 100644 transformer_engine/pytorch/module/rmsnorm.py diff --git a/docs/api/c/index.rst b/docs/api/c/index.rst index f98a419088..faf6cd4575 100644 --- a/docs/api/c/index.rst +++ b/docs/api/c/index.rst @@ -19,6 +19,7 @@ directly from C/C++, without Python. gemm.h fused_attn.h layer_norm.h + rmsnorm.h softmax.h transformer_engine.h transpose.h diff --git a/docs/api/c/rmsnorm.rst b/docs/api/c/rmsnorm.rst new file mode 100644 index 0000000000..9b43f26e91 --- /dev/null +++ b/docs/api/c/rmsnorm.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +rmsnorm.h +============ + +.. doxygenfile:: rmsnorm.h diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst index e62984b3c8..22a571279b 100644 --- a/docs/api/pytorch.rst +++ b/docs/api/pytorch.rst @@ -11,6 +11,8 @@ pyTorch .. autoapiclass:: transformer_engine.pytorch.LayerNorm(hidden_size, eps=1e-5, **kwargs) +.. autoapiclass:: transformer_engine.pytorch.RMSNorm(hidden_size, eps=1e-5, **kwargs) + .. 
autoapiclass:: transformer_engine.pytorch.LayerNormLinear(in_features, out_features, eps=1e-5, bias=True, **kwargs) :members: forward diff --git a/setup.py b/setup.py index 81ba934cbd..ded19044fc 100644 --- a/setup.py +++ b/setup.py @@ -461,16 +461,20 @@ def setup_common_extension() -> CMakeExtension: cmake_flags=cmake_flags, ) +def _all_files_in_dir(path): + return list(path.iterdir()) + def setup_pytorch_extension() -> setuptools.Extension: """Setup CUDA extension for PyTorch support""" # Source files src_dir = root_path / "transformer_engine" / "pytorch" / "csrc" + extensions_dir = src_dir / "extensions" sources = [ - src_dir / "extensions.cu", src_dir / "common.cu", src_dir / "ts_fp8_op.cpp", - ] + ] + \ + _all_files_in_dir(extensions_dir) # Header files include_dirs = [ diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 15b820893a..2ed901cb20 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -21,7 +21,7 @@ attention_mask_func, ) from transformer_engine.pytorch import ( - DotProductAttention, Linear, LayerNormLinear, LayerNormMLP, TransformerLayer + DotProductAttention, Linear, LayerNormLinear, LayerNormMLP, TransformerLayer, RMSNorm ) from transformer_engine.pytorch.distributed import checkpoint as te_checkpoint @@ -59,6 +59,8 @@ def __init__(self, hidden_size, eps, num_attention_heads, embed, num_layers, seq all_activations = ["gelu", "relu", "reglu", "geglu", "swiglu"] +all_normalizations = ["LayerNorm", "RMSNorm"] + def get_causal_attn_mask(sq: int) -> torch.Tensor: return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool() @@ -74,7 +76,16 @@ def assert_allclose(l1: List[torch.Tensor], l2: List[torch.Tensor], atol: float) """Ensures two lists are equal.""" assert len(l1) == len(l2), "Unequal number of outputs." for t1, t2 in zip(l1, l2): - assert torch.allclose(t1, t2, atol=atol), "Outputs not close enough." 
+ result = torch.allclose(t1, t2, atol=atol) + if not result: + diff = torch.abs(t1 - t2).flatten() + m = torch.argmax(diff) + msg = (f"Outputs not close enough." + f"Location of the maximum difference: {m.item()} " + f"with {t1.flatten()[m].item()} vs {t2.flatten()[m].item()} " + f"(diff {diff[m].item()})." + ) + raise AssertionError(msg) def _set_cuda_rng_state(new_state, device=-1): @@ -310,11 +321,38 @@ def forward( return context_layer +# Adapted from https://github.com/bzhangGo/rmsnorm/blob/c6691f20ec0af4128c8159c903071f7575404295/rmsnorm_torch.py +class TorchRMSNorm(nn.Module): + def __init__(self, in_features, eps=1e-5): + super().__init__() + + self.eps = eps + self.in_features = in_features + + self.weight = nn.Parameter(torch.ones(in_features)) + self.register_parameter("weight", self.weight) + + def forward(self, x): + norm_x = x.norm(2, dim=-1, keepdim=True) + d_x = self.in_features + + rms_x = norm_x * d_x ** (-1. / 2) + x_normed = x / (rms_x + self.eps) + + return self.weight * x_normed class TorchLayerNormLinear(nn.Module): - def __init__(self, in_features: int, out_features: int, eps: float, bias: bool = True): + def __init__(self, in_features: int, out_features: int, + eps: float, bias: bool = True, + normalization: str = "LayerNorm"): super().__init__() - self.layernorm = nn.LayerNorm(in_features, eps=eps) + if normalization == "LayerNorm": + self.layernorm = nn.LayerNorm(in_features, eps=eps) + elif normalization == "RMSNorm": + self.layernorm = TorchRMSNorm(in_features, eps=eps) + else: + raise RuntimeError("Unsupported normalization") + self.linear = nn.Linear(in_features, out_features) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -355,9 +393,15 @@ def forward(self, x): class TorchLayerNormMLP(nn.Module): def __init__(self, hidden_size: int, ffn_hidden_size: int, - eps: float = 1e-5, activation = 'gelu'): + eps: float = 1e-5, activation = 'gelu', + normalization: str = "LayerNorm"): super().__init__() - self.ln = 
nn.LayerNorm(hidden_size, eps=eps) + if normalization == "LayerNorm": + self.ln = nn.LayerNorm(hidden_size, eps=eps) + elif normalization == "RMSNorm": + self.ln = TorchRMSNorm(hidden_size, eps=eps) + else: + raise RuntimeError("Unsupported normalization") if 'glu' in activation: fc1_output_features = 2 * ffn_hidden_size self.gelu = TorchGLU(activation) @@ -830,11 +874,48 @@ def test_linear_accuracy(dtype, bs, model): else: assert_allclose(te_outputs[0], torch_outputs[0], 5e-2) +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +def test_rmsnorm_accuracy(dtype, bs, model): + config = model_configs[model] + + te_rmsnorm = ( + RMSNorm( + config.hidden_size, + ) + .to(dtype=dtype) + .cuda() + .eval() + ) + + torch_rmsnorm = ( + TorchRMSNorm( + config.hidden_size, + ) + .to(dtype=dtype) + .cuda() + .eval() + ) + + # Share params + with torch.no_grad(): + torch_rmsnorm.weight = Parameter(te_rmsnorm.weight.clone()) + + te_outputs = _test_granular_accuracy(te_rmsnorm, bs, dtype, config) + torch_outputs = _test_granular_accuracy(torch_rmsnorm, bs, dtype, config) + + # Check output. 
+ if dtype == torch.float32: + assert_allclose(te_outputs[0], torch_outputs[0], 1e-7) + else: + assert_allclose(te_outputs[0], torch_outputs[0], 2e-2) @pytest.mark.parametrize("dtype", param_types) @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) -def test_layernorm_linear_accuracy(dtype, bs, model): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_layernorm_linear_accuracy(dtype, bs, model, normalization): config = model_configs[model] te_ln_linear = ( @@ -843,6 +924,7 @@ def test_layernorm_linear_accuracy(dtype, bs, model): 4 * config.hidden_size, config.eps, bias=True, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -855,6 +937,7 @@ def test_layernorm_linear_accuracy(dtype, bs, model): 4 * config.hidden_size, config.eps, bias=True, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -864,7 +947,8 @@ def test_layernorm_linear_accuracy(dtype, bs, model): # Share params with torch.no_grad(): torch_ln_linear.layernorm.weight = Parameter(te_ln_linear.layer_norm_weight.clone()) - torch_ln_linear.layernorm.bias = Parameter(te_ln_linear.layer_norm_bias.clone()) + if normalization != "RMSNorm": + torch_ln_linear.layernorm.bias = Parameter(te_ln_linear.layer_norm_bias.clone()) torch_ln_linear.linear.weight = Parameter(te_ln_linear.weight.clone()) torch_ln_linear.linear.bias = Parameter(te_ln_linear.bias.clone()) @@ -882,7 +966,8 @@ def test_layernorm_linear_accuracy(dtype, bs, model): @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("activation", all_activations) -def test_layernorm_mlp_accuracy(dtype, bs, model, activation): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_layernorm_mlp_accuracy(dtype, bs, model, activation, normalization): config = model_configs[model] te_ln_mlp = ( @@ -890,6 +975,7 @@ def test_layernorm_mlp_accuracy(dtype, bs, model, activation): 
config.hidden_size, 4 * config.hidden_size, activation=activation, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -901,6 +987,7 @@ def test_layernorm_mlp_accuracy(dtype, bs, model, activation): config.hidden_size, 4 * config.hidden_size, activation=activation, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -910,7 +997,8 @@ def test_layernorm_mlp_accuracy(dtype, bs, model, activation): # Share params with torch.no_grad(): torch_ln_mlp.ln.weight = Parameter(te_ln_mlp.layer_norm_weight.clone()) - torch_ln_mlp.ln.bias = Parameter(te_ln_mlp.layer_norm_bias.clone()) + if normalization != "RMSNorm": + torch_ln_mlp.ln.bias = Parameter(te_ln_mlp.layer_norm_bias.clone()) torch_ln_mlp.fc1.weight = Parameter(te_ln_mlp.fc1_weight.clone()) torch_ln_mlp.fc1.bias = Parameter(te_ln_mlp.fc1_bias.clone()) torch_ln_mlp.fc2.weight = Parameter(te_ln_mlp.fc2_weight.clone()) diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index cf158e9082..d4e834bdf2 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -71,6 +71,8 @@ supported_activations = ["gelu", "relu", "reglu", "geglu", "swiglu"] +all_normalizations = ["LayerNorm", "RMSNorm"] + @pytest.fixture() def seed_default_rng(): @@ -676,6 +678,90 @@ def forward(self, inp): validate_result( fname, inp, model, atol=atol, is_fp8=use_fp8, allow_cnt_errors=3, te_outputs=te_outputs) +@pytest.mark.parametrize("scale_factor", [448, 112]) +@pytest.mark.parametrize( + "use_fp8, precision, atol", [ + [False, torch.float32, 1e-7], + [False, torch.float16, 1e-7], + [False, torch.bfloat16, 1e-7], + [False, "fake-torch.bfloat16", 1e-7], + [True, torch.float32, 1e-7], + [True, torch.float16, 1e-7], + [True, torch.bfloat16, 1e-2], + [True, "fake-torch.bfloat16", 1e-2] +]) +def test_export_rmsnorm( + seed_default_rng, + use_fp8: bool, + scale_factor: float, + precision: torch.dtype, + atol: float +): + fake_bf16_io = precision == "fake-torch.bfloat16" + # reset 
precision to torch.bfloat16 after capturing fake BF16 mode + precision = torch.bfloat16 if precision == "fake-torch.bfloat16" else precision + + # Skip FP8 tests on non-hopper devices + if use_fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) + + # Set dimensions (these are arbitrary). + inp_shape = [64, 32] + + class Test_RMSnorm(nn.Module): + def __init__(self) -> None: + super().__init__() + eps = 1e-6 # An arbitrary small value + dtype = torch.float if fake_bf16_io else precision + self.ln = te.RMSNorm(inp_shape[1], eps, params_dtype=dtype).eval().cuda() + + def forward(self, inp): + ret = self.ln(inp) + return ret + + class TestFP8_RMSnorm(nn.Module): + def __init__(self) -> None: + super().__init__() + normalized_shape = torch.Size(inp.shape[1:]) + self.weight = torch.randn(*normalized_shape, device="cuda", + dtype=torch.float32 if fake_bf16_io else precision) + self.eps = 1e-6 # An arbitrary small value + + self.fp8_tensor = tex.FP8FwdTensors.GEMM1_INPUT + self.meta = create_meta(scale_factor) + self.fp8_type = tex.DType.kFloat8E4M3 + + def forward(self, inp): + ret = texcpp.rmsnorm_fwd_fp8_inf( + inp, + self.weight, + self.eps, + self.meta, + self.fp8_tensor, + self.fp8_type, + False) + + ret = cast_from_fp8( + ret, + self.meta, + self.fp8_tensor, + self.fp8_type, + as_te_type(precision)) + if fake_bf16_io: + ret = ret.type(torch.float32) + return ret + + inp = torch.randn(*inp_shape, device="cuda", dtype=torch.float32 if fake_bf16_io else precision) + model = TestFP8_RMSnorm() if use_fp8 else Test_RMSnorm() + high_prec_str = dtype2str(precision, fake_bf16_io=fake_bf16_io) + fp8_str = f"_fp8-{scale_factor}" if use_fp8 else "" + fname = f"te.layernorm{fp8_str}{high_prec_str}.onnx" + do_export(model, inp, fname, use_fp8=use_fp8) + te_outputs = te_infer(model, inp, is_fp8=use_fp8) + serialize_inputs_outputs(fname, inp, te_outputs) + if fake_bf16_io or precision != torch.bfloat16: + validate_result( + fname, inp, model, atol=atol, is_fp8=use_fp8, 
allow_cnt_errors=3, te_outputs=te_outputs) @skip_FP8 @pytest.mark.parametrize("softmax_fn", [ @@ -916,6 +1002,7 @@ def forward(self, inp): (torch.bfloat16, False), ]) @pytest.mark.parametrize("zero_centered_gamma", [False, True]) +@pytest.mark.parametrize("normalization", all_normalizations) def test_export_layernorm_linear( seed_default_rng, scale_factor: float, @@ -924,12 +1011,16 @@ def test_export_layernorm_linear( return_bias: bool, return_layernorm_output: bool, precision: torch.dtype, - zero_centered_gamma: bool + zero_centered_gamma: bool, + normalization: str, ): # Skip FP8 tests on non-hopper devices if use_fp8 and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + # Set dimensions (these are arbitrary). in_features = 64 out_features = 256 @@ -950,6 +1041,7 @@ def test_export_layernorm_linear( return_layernorm_output=return_layernorm_output, params_dtype=precision, zero_centered_gamma=zero_centered_gamma, + normalization=normalization, ).to(device='cuda') if use_fp8: set_layer_scale(model, scale_factor, num_gemms=1) @@ -980,6 +1072,7 @@ def test_export_layernorm_linear( ]) @pytest.mark.parametrize("zero_centered_gamma", [False, True]) @pytest.mark.parametrize("activation", supported_activations) +@pytest.mark.parametrize("normalization", all_normalizations) def test_export_layernorm_mlp( seed_default_rng, scale_factor: float, @@ -990,11 +1083,15 @@ def test_export_layernorm_mlp( precision: torch.dtype, zero_centered_gamma: bool, activation: str, + normalization: str, ): # Skip FP8 tests on non-hopper devices if use_fp8 and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + # Set dimensions (these are arbitrary). 
in_features = 64 out_features = 256 @@ -1016,6 +1113,7 @@ def test_export_layernorm_mlp( params_dtype=precision, zero_centered_gamma=zero_centered_gamma, activation=activation, + normalization=normalization, ).to(device='cuda') if use_fp8: set_layer_scale(model, scale_factor, num_gemms=2) diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index 101734b570..1643172c54 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -95,6 +95,7 @@ def __init__( all_boolean = [True, False] all_activations = ["gelu", "relu", "reglu", "geglu", "swiglu"] +all_normalizations = ["LayerNorm", "RMSNorm"] def _disable_wgrads(block): for p in block.parameters(): @@ -314,10 +315,16 @@ def _test_sanity_common(block, bs, dtype, config, fp8_recipe, skip_wgrad, skip_d @pytest.mark.parametrize("skip_wgrad", all_boolean) @pytest.mark.parametrize("zero_centered_gamma", all_boolean) @pytest.mark.parametrize("skip_dgrad", all_boolean) -def test_sanity_layernorm_linear(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma, skip_dgrad): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_sanity_layernorm_linear(dtype, bs, fp8_recipe, model, skip_wgrad, + zero_centered_gamma, skip_dgrad, + normalization): if fp8_recipe is not None and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + config = model_configs[model] sigma = 0.023 @@ -330,6 +337,7 @@ def test_sanity_layernorm_linear(dtype, bs, fp8_recipe, model, skip_wgrad, zero_ eps=config.eps, init_method=init_method, zero_centered_gamma=zero_centered_gamma, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -370,10 +378,16 @@ def test_sanity_linear(dtype, bs, fp8_recipe, model, skip_wgrad, skip_dgrad): @pytest.mark.parametrize("zero_centered_gamma", all_boolean) @pytest.mark.parametrize("skip_dgrad", all_boolean) 
@pytest.mark.parametrize("activation", all_activations) -def test_sanity_layernorm_mlp(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma, skip_dgrad, activation): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_sanity_layernorm_mlp(dtype, bs, fp8_recipe, model, skip_wgrad, + zero_centered_gamma, skip_dgrad, activation, + normalization): if fp8_recipe is not None and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + config = model_configs[model] sigma = 0.023 @@ -389,6 +403,7 @@ def test_sanity_layernorm_mlp(dtype, bs, fp8_recipe, model, skip_wgrad, zero_cen output_layer_init_method=output_layer_init_method, zero_centered_gamma=zero_centered_gamma, activation=activation, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -404,10 +419,16 @@ def test_sanity_layernorm_mlp(dtype, bs, fp8_recipe, model, skip_wgrad, zero_cen @pytest.mark.parametrize("zero_centered_gamma", all_boolean) @pytest.mark.parametrize("bias", all_boolean) @pytest.mark.parametrize("activation", all_activations) -def test_sanity_gpt(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma, bias, activation): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_sanity_gpt(dtype, bs, fp8_recipe, model, skip_wgrad, + zero_centered_gamma, bias, activation, + normalization): if fp8_recipe is not None and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + config = model_configs[model] sigma = 0.023 @@ -430,6 +451,7 @@ def test_sanity_gpt(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamm zero_centered_gamma=zero_centered_gamma, bias=bias, activation=activation, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -444,10 +466,15 @@ def test_sanity_gpt(dtype, bs, 
fp8_recipe, model, skip_wgrad, zero_centered_gamm @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("skip_wgrad", all_boolean) @pytest.mark.parametrize("zero_centered_gamma", all_boolean) -def test_sanity_bert(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_sanity_bert(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma, + normalization): if fp8_recipe is not None and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + config = model_configs[model] sigma = 0.023 @@ -468,6 +495,7 @@ def test_sanity_bert(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gam apply_residual_connection_post_layernorm=True, output_layernorm=True, zero_centered_gamma=zero_centered_gamma, + normalization=normalization, ) .to(dtype=dtype) .cuda() @@ -482,10 +510,15 @@ def test_sanity_bert(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gam @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("skip_wgrad", all_boolean) @pytest.mark.parametrize("zero_centered_gamma", all_boolean) -def test_sanity_T5(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_sanity_T5(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma, + normalization): if fp8_recipe is not None and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + config = model_configs[model] sigma = 0.023 @@ -507,6 +540,7 @@ def test_sanity_T5(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma output_layernorm=False, layer_type="decoder", zero_centered_gamma=zero_centered_gamma, + normalization=normalization, ) .to(dtype=dtype) 
.cuda() @@ -669,10 +703,15 @@ def test_sanity_gradient_accumulation_fusion(dtype, bs, fp8_recipe, model, skip_ @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("skip_wgrad", all_boolean) @pytest.mark.parametrize("zero_centered_gamma", all_boolean) -def test_gpt_cuda_graph(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma): +@pytest.mark.parametrize("normalization", all_normalizations) +def test_gpt_cuda_graph(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_gamma, + normalization): if fp8_recipe is not None and not fp8_available: pytest.skip(reason_for_no_fp8) + if normalization == "RMSNorm" and zero_centered_gamma: + pytest.skip("RMSNorm does not support zero_centered_gamma yet!") + config = model_configs[model] sigma = 0.023 @@ -694,6 +733,7 @@ def test_gpt_cuda_graph(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_ output_layernorm=False, zero_centered_gamma=zero_centered_gamma, fuse_qkv_params=True, + normalization=normalization, ) .to(dtype=dtype) .cuda() diff --git a/transformer_engine/pytorch/__init__.py b/transformer_engine/pytorch/__init__.py index e7654b895f..b67ecd05b9 100644 --- a/transformer_engine/pytorch/__init__.py +++ b/transformer_engine/pytorch/__init__.py @@ -7,6 +7,7 @@ from .module import Linear from .module import LayerNormMLP from .module import LayerNorm +from .module import RMSNorm from .attention import DotProductAttention from .transformer import TransformerLayer from .fp8 import fp8_autocast @@ -21,4 +22,6 @@ onnx_te_gemm, onnx_layernorm_fwd_fp8, onnx_layernorm_fwd, + onnx_rmsnorm_fwd, + onnx_rmsnorm_fwd_fp8 ) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index e75b67784b..dd3f561c95 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -990,6 +990,7 @@ def __init__( ub_split_rs: bool = False, ub_split_ag: bool = False, bias: bool = True, + normalization: str = "LayerNorm", ) -> 
None: super().__init__() self.layer_number = (layer_number,) @@ -1044,6 +1045,7 @@ def __init__( ub_bulk_wgrad=ub_bulk_wgrad, ub_bulk_dgrad=ub_bulk_dgrad, ub_split_ag=ub_split_ag, + normalization=normalization, **common_gemm_kwargs, ) else: @@ -1072,6 +1074,7 @@ def __init__( ub_bulk_wgrad=ub_bulk_wgrad, ub_bulk_dgrad=ub_bulk_dgrad, ub_split_ag=ub_split_ag, + normalization=normalization, **common_gemm_kwargs, ) else: diff --git a/transformer_engine/pytorch/cpp_extensions/normalization.py b/transformer_engine/pytorch/cpp_extensions/normalization.py index ddee0152dc..54c7a0789f 100644 --- a/transformer_engine/pytorch/cpp_extensions/normalization.py +++ b/transformer_engine/pytorch/cpp_extensions/normalization.py @@ -10,7 +10,10 @@ __all__ = ['layernorm_fwd_fp8', 'layernorm_fwd_fp8_inf', - 'layernorm_fwd_inf'] + 'layernorm_fwd_inf', + 'rmsnorm_fwd_fp8', + 'rmsnorm_fwd_fp8_inf', + 'rmsnorm_fwd_inf'] def layernorm_fwd_fp8( @@ -99,3 +102,83 @@ def layernorm_fwd_inf( eps, zero_centered_gamma, ) + +def rmsnorm_fwd_fp8( + inp: torch.Tensor, + weight: torch.Tensor, + eps: float, + fp8_meta_tensor: tex.FP8TensorMeta, + fp8_tensor: Union[tex.FP8FwdTensors, tex.FP8BwdTensors], + otype: tex.DType, + sm_margin: int, + zero_centered_gamma: bool, + rmsnorm_out: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """RMSNorm with FP8 output""" + if rmsnorm_out is not None: + return tex.rmsnorm_fwd_fp8_noalloc( + inp, + weight, + eps, + fp8_meta_tensor.scale[fp8_tensor], + rmsnorm_out, + fp8_meta_tensor.amax_history[0][fp8_tensor], + fp8_meta_tensor.scale_inv[fp8_tensor], + otype, + sm_margin, + zero_centered_gamma + ) + + return tex.rmsnorm_fwd_fp8( + inp, + weight, + eps, + fp8_meta_tensor.scale[fp8_tensor], + fp8_meta_tensor.amax_history[0][fp8_tensor], + fp8_meta_tensor.scale_inv[fp8_tensor], + otype, + sm_margin, + zero_centered_gamma + ) + + +def rmsnorm_fwd_fp8_inf( + inp: torch.Tensor, + weight: torch.Tensor, + eps: float, + fp8_meta_tensor: 
tex.FP8TensorMeta, + fp8_tensor: Union[tex.FP8FwdTensors, tex.FP8BwdTensors], + otype: tex.DType, + zero_centered_gamma, +) -> torch.Tensor: + """RMSNorm with FP8 output. + + This version of rmsnorm_fwd_fp8 is specialized for inference, and returns + only the normalized output. + """ + ret = torch.ops.tex_ts.rmsnorm_fwd_fp8_inf_ts( + inp, + weight, + eps, + fp8_meta_tensor.scale, + fp8_meta_tensor.amax_history, + fp8_meta_tensor.scale_inv, + fp8_tensor, + otype, + zero_centered_gamma) + return ret + + +def rmsnorm_fwd_inf( + inp: torch.Tensor, + weight: torch.Tensor, + eps: float, + zero_centered_gamma: bool, +) -> torch.Tensor: + """RMSNorm with FP8 output""" + return torch.ops.tex_ts.rmsnorm_fwd_inf_ts( + inp, + weight, + eps, + zero_centered_gamma, + ) diff --git a/transformer_engine/pytorch/csrc/common.cu b/transformer_engine/pytorch/csrc/common.cu index 1d20607940..3209dda004 100644 --- a/transformer_engine/pytorch/csrc/common.cu +++ b/transformer_engine/pytorch/csrc/common.cu @@ -137,3 +137,11 @@ at::Tensor allocateTorchTensor(int M, return at::empty({static_cast(M)}, at::CUDA(GetATenDType(dtype))); } + +void *getDataPtr(at::Tensor t) { + if (t.numel() > 0) { + return t.data_ptr(); + } else { + return nullptr; + } +} diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index 17d36b9911..7c17f1f34c 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -180,4 +181,6 @@ at::Tensor allocateTorchTensor(int M, transformer_engine::DType dtype ); +void *getDataPtr(at::Tensor t); + #endif // TRANSFORMER_ENGINE_PYTORCH_CSRC_COMMON_H_ diff --git a/transformer_engine/pytorch/csrc/extensions.cu b/transformer_engine/pytorch/csrc/extensions.cu deleted file mode 100644 index 69248d4aa9..0000000000 --- a/transformer_engine/pytorch/csrc/extensions.cu +++ /dev/null @@ -1,2277 +0,0 @@ 
-/************************************************************************* - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * See LICENSE for license information. - ************************************************************************/ - -#include "extensions.h" -#ifdef NVTE_WITH_USERBUFFERS -#include "comm_gemm_overlap.h" -#endif // NVTE_WITH_USERBUFFERS - -constexpr int block_size = 512; -constexpr int ctas_per_sm = 4; - -// get the fused attention backend -NVTE_Fused_Attn_Backend get_fused_attn_backend( - const transformer_engine::DType q_dtype, - const transformer_engine::DType kv_dtype, - NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, - float p_dropout, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim) { - NVTE_Fused_Attn_Backend fused_attention_backend = - nvte_get_fused_attn_backend( - static_cast(q_dtype), static_cast(kv_dtype), - qkv_layout, bias_type, attn_mask_type, - p_dropout, max_seqlen_q, max_seqlen_kv, head_dim); - return fused_attention_backend; -} - -// fast zero-fills of tensors -template -__global__ void __launch_bounds__(block_size) mha_fill_kernel(scalar_t* out_tensor, - const int32_t* const start_row, - const size_t num_rows) { - size_t row_stride = gridDim.y * blockDim.x; - size_t row_index = blockIdx.x + static_cast(start_row[0]); - size_t col_index = blockIdx.y * blockDim.x + threadIdx.x; - while (row_index < num_rows) { - out_tensor[row_index*row_stride + col_index] = 0; - row_index += gridDim.x; - } -} - -// fast zero-fills of tensors -void mha_fill(const at::Tensor &self, const at::Tensor &start_index) { - auto max_tokens = self.size(0); - auto self_2d = self.view({max_tokens, -1}); - auto fcd_size = self_2d.size(1); - TORCH_CHECK(self.is_contiguous(), "input not contiguous"); - TORCH_CHECK(fcd_size % block_size == 0, "input size not aligned to block size"); - const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - 
uint64_t num_blk_y = (uint64_t)(fcd_size / block_size); - uint64_t num_blk_x = (uint64_t)((num_mp * ctas_per_sm + num_blk_y - 1) / num_blk_y); - dim3 dim_grid(num_blk_x, num_blk_y); - dim3 dim_block(block_size); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, - self_2d.scalar_type(), "mha_fill", [&]() { - mha_fill_kernel<<>>( - self_2d.data_ptr(), - static_cast(start_index.data_ptr()), - max_tokens); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -// extract seed and offset from PhiloxCudaState -__global__ void unpack(at::PhiloxCudaState arg, int64_t* rng_state_ptr) { - if (arg.captured_) { - rng_state_ptr[0] = static_cast(*arg.seed_.ptr); - rng_state_ptr[1] = static_cast( - *(arg.offset_.ptr) + static_cast(arg.offset_intragraph_)); - } else { - rng_state_ptr[0] = static_cast(arg.seed_.val); - rng_state_ptr[1] = static_cast(arg.offset_.val); - } -} - -// extract PhiloxCudaState from CUDA random number generator -at::PhiloxCudaState init_philox_state( - at::CUDAGeneratorImpl* gen, - size_t elts_per_thread) { - at::PhiloxCudaState philox_args; - std::lock_guard lock(gen->mutex_); - philox_args = gen->philox_cuda_state(elts_per_thread); - return philox_args; -} - -// fused attention FWD with packed QKV -std::vector fused_attn_fwd_qkvpacked( - size_t b, size_t max_seqlen, size_t total_seqs, - size_t h, size_t d, - bool is_training, float attn_scale, float p_dropout, bool set_zero, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, - const at::Tensor cu_seqlens, - const at::Tensor QKV, - const transformer_engine::DType qkv_type, - const c10::optional descale_QKV, - const c10::optional scale_S, - const c10::optional scale_O, - c10::optional amax_S, - c10::optional amax_O, - const c10::optional Bias, - const c10::optional rng_gen, - size_t rng_elts_per_thread) { - using namespace transformer_engine; - - // create output tensor O - auto options = 
torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); - auto O = torch::empty({static_cast(total_seqs), - static_cast(h), static_cast(d)}, options); - if (set_zero && (h * d % block_size == 0)) { - mha_fill(O, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); - } else { - O.fill_(0); - } - - // construct NVTE tensors - TensorWrapper te_QKV, te_S, te_O, te_Bias, te_cu_seqlens; - if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { - // FP8 - if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) - || (!amax_S.has_value()) || (!amax_O.has_value())) { - std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; - NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); - } - te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, - qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - at::Tensor descale_S = torch::empty_like(scale_S.value()); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, amax_S.value().data_ptr(), - scale_S.value().data_ptr(), descale_S.data_ptr()); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, - qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); - } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { - // BF16 or FP16 - te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, nullptr, nullptr, nullptr); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, - qkv_type, nullptr, nullptr, nullptr); - } else { - NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); - } - if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { - auto bias_shape = Bias.value().sizes().vec(); - std::vector shape{bias_shape.begin(), bias_shape.end()}; - te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, - DType::kFloat32, nullptr, nullptr, nullptr); - } - te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, - DType::kInt32, nullptr, nullptr, nullptr); - - // extract random number generator seed and offset - auto gen = at::get_generator_or_default( - rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); - at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); - auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); - unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( - philox_args, static_cast(rng_state.data_ptr())); - auto te_rng_state = makeTransformerEngineTensor(rng_state); - - // create auxiliary output tensors - NVTETensorPack nvte_aux_tensor_pack; - nvte_tensor_pack_create(&nvte_aux_tensor_pack); - - // create workspace - TensorWrapper workspace; - - // populate tensors with appropriate shapes and dtypes - nvte_fused_attn_fwd_qkvpacked( - te_QKV.data(), - te_Bias.data(), - te_S.data(), - te_O.data(), - &nvte_aux_tensor_pack, - te_cu_seqlens.data(), - te_rng_state.data(), - max_seqlen, - is_training, attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // allocate memory for workspace and auxiliary output tensors - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor( - workspace_data.data_ptr(), - workspace.shape(), workspace.dtype()); - - // output_tensors = [O, nvte_aux_tensor_pack.tensors] - std::vector output_tensors; - output_tensors.push_back(O); - for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { - auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); - // allocate memory for 
nvte_aux_tensor_pack.tensors - auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); - output_tensors.push_back(output_tensor); - tensor->data.dptr = output_tensor.data_ptr(); - } - - // execute the kernel - nvte_fused_attn_fwd_qkvpacked( - te_QKV.data(), - te_Bias.data(), - te_S.data(), - te_O.data(), - &nvte_aux_tensor_pack, - te_cu_seqlens.data(), - te_rng_state.data(), - max_seqlen, - is_training, attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // destroy tensor wrappers, but not allocated memory - nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); - - // if training, [O, softmax-related tensors, rng_state]; if inference, [O] - return output_tensors; -} - -// fused attention BWD with packed QKV -std::vector fused_attn_bwd_qkvpacked( - size_t b, size_t max_seqlen, size_t total_seqs, - size_t h, size_t d, - float attn_scale, float p_dropout, bool set_zero, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, - const at::Tensor cu_seqlens, - const at::Tensor QKV, - const at::Tensor O, - const at::Tensor dO, - const transformer_engine::DType qkv_type, - const std::vector Aux_CTX_Tensors, - const c10::optional descale_QKV, - const c10::optional descale_S, - const c10::optional descale_O, - const c10::optional descale_dO, - const c10::optional scale_S, - const c10::optional scale_dP, - const c10::optional scale_dQKV, - c10::optional amax_dP, - c10::optional amax_dQKV) { - using namespace transformer_engine; - - // create output tensor dQKV - at::Tensor dQKV = torch::empty_like(QKV); - auto max_tokens = dQKV.size(0); - auto self_2d = dQKV.view({max_tokens, -1}); - auto fcd_size = self_2d.size(1); - if (set_zero && (fcd_size % block_size == 0)) { - mha_fill(dQKV, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); - } else { - dQKV.fill_(0); - } - auto options = 
torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); - at::Tensor dBias; - TensorWrapper te_dBias; - if (bias_type != NVTE_NO_BIAS) { - dBias = torch::zeros({1, static_cast(h), - static_cast(max_seqlen), - static_cast(max_seqlen)}, options); - te_dBias = makeTransformerEngineTensor(dBias); - } - - // construct NVTE tensors - TensorWrapper te_QKV, te_O, te_dO, te_S, te_dP, te_dQKV; - if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { - // FP8 - if ((!descale_QKV.has_value()) || (!descale_S.has_value()) - || (!descale_O.has_value()) || (!descale_dO.has_value()) - || (!scale_S.has_value()) || (!scale_dP.has_value()) - || (!scale_dQKV.has_value()) - || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { - std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; - err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); - NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); - } - te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, - qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, - qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); - te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs, h, d}, - qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, - nullptr, scale_S.value().data_ptr(), descale_S.value().data_ptr()); - at::Tensor descale_dP = torch::empty_like(scale_dP.value()); - te_dP = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), - descale_dP.data_ptr()); - te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), {total_seqs, 3, h, d}, - qkv_type, - amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); - } else if (qkv_type == DType::kBFloat16 || qkv_type == 
DType::kFloat16) { - // BF16 or FP16 - te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, nullptr, nullptr, nullptr); - te_dP = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, nullptr, nullptr, nullptr); - te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), {total_seqs, 3, h, d}, - qkv_type, nullptr, nullptr, nullptr); - } else { - NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); - } - - // convert auxiliary tensors from forward into NVTETensors - NVTETensorPack nvte_aux_tensor_pack; - nvte_tensor_pack_create(&nvte_aux_tensor_pack); - nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); - for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { - auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); - tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); - std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); - tensor->data.shape = std::vector(tmp.begin(), tmp.end()); - tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); - } - - // create cu_seqlens tensorwrappers - TensorWrapper te_cu_seqlens; - te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, - DType::kInt32, nullptr, nullptr, nullptr); - - // create workspace - TensorWrapper workspace; - - // populate tensors with appropriate shapes and dtypes - nvte_fused_attn_bwd_qkvpacked( - te_QKV.data(), - te_O.data(), - te_dO.data(), - te_S.data(), - te_dP.data(), - &nvte_aux_tensor_pack, - te_dQKV.data(), - te_dBias.data(), - te_cu_seqlens.data(), - max_seqlen, - attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - 
at::cuda::getCurrentCUDAStream()); - - // allocate memory for workspace - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor( - workspace_data.data_ptr(), - workspace.shape(), workspace.dtype()); - - // execute kernel - nvte_fused_attn_bwd_qkvpacked( - te_QKV.data(), - te_O.data(), - te_dO.data(), - te_S.data(), - te_dP.data(), - &nvte_aux_tensor_pack, - te_dQKV.data(), - te_dBias.data(), - te_cu_seqlens.data(), - max_seqlen, - attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // destroy tensor wrappers - nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); - - return {dQKV, dBias}; -} - -// fused attention FWD with packed KV -std::vector fused_attn_fwd_kvpacked( - size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, - size_t total_seqs_q, size_t total_seqs_kv, - size_t h, size_t d, - bool is_training, float attn_scale, float p_dropout, bool set_zero, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, - const at::Tensor cu_seqlens_q, - const at::Tensor cu_seqlens_kv, - const at::Tensor Q, - const at::Tensor KV, - const transformer_engine::DType qkv_type, - const c10::optional descale_QKV, - const c10::optional scale_S, - const c10::optional scale_O, - c10::optional amax_S, - c10::optional amax_O, - const c10::optional Bias, - const c10::optional rng_gen, - size_t rng_elts_per_thread) { - using namespace transformer_engine; - - // create output tensor O - auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); - auto O = torch::empty({static_cast(total_seqs_q), - static_cast(h), static_cast(d)}, options); - if (set_zero && (h * d % block_size == 0)) { - mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); - } else { - O.fill_(0); - } - - // construct NVTE tensors - TensorWrapper te_Q, te_KV, te_S, te_O, te_Bias, te_cu_seqlens_q, 
te_cu_seqlens_kv; - if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { - // FP8 - if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) - || (!amax_S.has_value()) || (!amax_O.has_value())) { - std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; - NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); - } - te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, - qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - at::Tensor descale_S = torch::empty_like(scale_S.value()); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, amax_S.value().data_ptr(), - scale_S.value().data_ptr(), descale_S.data_ptr()); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, - qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); - } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { - // BF16 or FP16 - te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, nullptr, nullptr, nullptr); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, nullptr); - } else { - NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); - } - if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { - auto bias_shape = Bias.value().sizes().vec(); - std::vector shape{bias_shape.begin(), bias_shape.end()}; - te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, - DType::kFloat32, nullptr, nullptr, nullptr); - } - te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), {b+1}, - DType::kInt32, nullptr, nullptr, nullptr); - te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, - DType::kInt32, nullptr, nullptr, nullptr); - - // extract rng seed and offset - auto gen = at::get_generator_or_default( - rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); - at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); - auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); - unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( - philox_args, static_cast(rng_state.data_ptr())); - auto te_rng_state = makeTransformerEngineTensor(rng_state); - - // create auxiliary output tensors - NVTETensorPack nvte_aux_tensor_pack; - nvte_tensor_pack_create(&nvte_aux_tensor_pack); - - // create workspace - TensorWrapper workspace; - - // populate tensors with appropriate shapes and dtypes - nvte_fused_attn_fwd_kvpacked( - te_Q.data(), - te_KV.data(), - te_Bias.data(), - te_S.data(), - te_O.data(), - &nvte_aux_tensor_pack, - te_cu_seqlens_q.data(), - te_cu_seqlens_kv.data(), - te_rng_state.data(), - max_seqlen_q, max_seqlen_kv, - is_training, attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // allocate memory for workspace and auxiliary output tensors - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor( - workspace_data.data_ptr(), - workspace.shape(), workspace.dtype()); - - // output_tensors = [O, nvte_aux_tensor_pack.tensors] - std::vector output_tensors; - output_tensors.push_back(O); - 
for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { - auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); - // allocate memory for nvte_aux_tensor_pack.tensors - auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); - output_tensors.push_back(output_tensor); - tensor->data.dptr = output_tensor.data_ptr(); - } - - // execute the kernel - nvte_fused_attn_fwd_kvpacked( - te_Q.data(), - te_KV.data(), - te_Bias.data(), - te_S.data(), - te_O.data(), - &nvte_aux_tensor_pack, - te_cu_seqlens_q.data(), - te_cu_seqlens_kv.data(), - te_rng_state.data(), - max_seqlen_q, max_seqlen_kv, - is_training, attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // destroy tensor wrappers, but not allocated memory - nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); - - // if training, [O, softmax-related tensors, rng_state]; if inference, [O] - return output_tensors; -} - -// fused attention BWD with packed KV -std::vector fused_attn_bwd_kvpacked( - size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, - size_t total_seqs_q, size_t total_seqs_kv, - size_t h, size_t d, - float attn_scale, float p_dropout, bool set_zero, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, - const at::Tensor cu_seqlens_q, - const at::Tensor cu_seqlens_kv, - const at::Tensor Q, - const at::Tensor KV, - const at::Tensor O, - const at::Tensor dO, - const transformer_engine::DType qkv_type, - const std::vector Aux_CTX_Tensors, - const c10::optional descale_QKV, - const c10::optional descale_S, - const c10::optional descale_O, - const c10::optional descale_dO, - const c10::optional scale_S, - const c10::optional scale_dP, - const c10::optional scale_dQKV, - c10::optional amax_dP, - c10::optional amax_dQKV) { - using namespace transformer_engine; - - // create output tensors dQ and dKV - at::Tensor dQ = torch::empty_like(Q); - at::Tensor dKV = 
torch::empty_like(KV); - auto max_tokens_q = dQ.size(0); - auto self_2d_q = dQ.view({max_tokens_q, -1}); - auto fcd_size_q = self_2d_q.size(1); - auto max_tokens_kv = dQ.size(0); - auto self_2d_kv = dQ.view({max_tokens_kv, -1}); - auto fcd_size_kv = self_2d_kv.size(1); - if (set_zero && (fcd_size_q % block_size == 0) && (fcd_size_kv % block_size == 0)) { - mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); - mha_fill(dKV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); - } else { - dQ.fill_(0); - dKV.fill_(0); - } - auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); - at::Tensor dBias; - TensorWrapper te_dBias; - if (bias_type != NVTE_NO_BIAS) { - dBias = torch::zeros({1, static_cast(h), - static_cast(max_seqlen_q), - static_cast(max_seqlen_kv)}, options); - te_dBias = makeTransformerEngineTensor(dBias); - } - - // construct NVTE tensors - TensorWrapper te_Q, te_KV, te_O, te_dO, te_S, te_dP, te_dQ, te_dKV; - if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { - // FP8 - if ((!descale_QKV.has_value()) || (!descale_S.has_value()) - || (!descale_O.has_value()) || (!descale_dO.has_value()) - || (!scale_S.has_value()) || (!scale_dP.has_value()) - || (!scale_dQKV.has_value()) - || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { - std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; - err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); - NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); - } - te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, - qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); - te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); - te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, - scale_S.value().data_ptr(), descale_S.value().data_ptr()); - at::Tensor descale_dP = torch::empty_like(scale_dP.value()); - te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, - amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), - descale_dP.data_ptr()); - te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), {total_seqs_q, h, d}, qkv_type, - amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); - te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), {total_seqs_kv, 2, h, d}, qkv_type, - amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); - } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { - // BF16 or FP16 - te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_S = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, nullptr, nullptr, nullptr); - te_dP = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, nullptr, nullptr, nullptr); - te_dQ = 
makeTransformerEngineTensor(dQ.data_ptr(), {total_seqs_q, h, d}, - qkv_type, nullptr, nullptr, nullptr); - te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), {total_seqs_kv, 2, h, d}, - qkv_type, nullptr, nullptr, nullptr); - } else { - NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); - } - - // create cu_seqlens tensorwrappers - TensorWrapper te_cu_seqlens_q, te_cu_seqlens_kv; - te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), {b+1}, - DType::kInt32, nullptr, nullptr, nullptr); - te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, - DType::kInt32, nullptr, nullptr, nullptr); - - // convert auxiliary tensors from forward to NVTETensors - NVTETensorPack nvte_aux_tensor_pack; - nvte_tensor_pack_create(&nvte_aux_tensor_pack); - nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); - for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { - auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); - tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); - std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); - tensor->data.shape = std::vector(tmp.begin(), tmp.end()); - tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); - } - - // create workspace - TensorWrapper workspace; - - // populate tensors with appropriate shapes and dtypes - nvte_fused_attn_bwd_kvpacked( - te_Q.data(), - te_KV.data(), - te_O.data(), - te_dO.data(), - te_S.data(), - te_dP.data(), - &nvte_aux_tensor_pack, - te_dQ.data(), - te_dKV.data(), - te_dBias.data(), - te_cu_seqlens_q.data(), - te_cu_seqlens_kv.data(), - max_seqlen_q, max_seqlen_kv, - attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // allocate memory for workspace - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor( - workspace_data.data_ptr(), - workspace.shape(), workspace.dtype()); - - 
// execute kernel - nvte_fused_attn_bwd_kvpacked( - te_Q.data(), - te_KV.data(), - te_O.data(), - te_dO.data(), - te_S.data(), - te_dP.data(), - &nvte_aux_tensor_pack, - te_dQ.data(), - te_dKV.data(), - te_dBias.data(), - te_cu_seqlens_q.data(), - te_cu_seqlens_kv.data(), - max_seqlen_q, max_seqlen_kv, - attn_scale, p_dropout, - qkv_layout, bias_type, attn_mask_type, - workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // destroy tensor wrappers - nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); - - return {dQ, dKV, dBias}; -} - -void te_gemm(at::Tensor A, - at::Tensor A_scale_inverse, - transformer_engine::DType A_type, - bool transa, - at::Tensor B, - at::Tensor B_scale_inverse, - transformer_engine::DType B_type, - bool transb, - at::Tensor D, - at::Tensor D_scale, - transformer_engine::DType D_type, - at::Tensor D_amax, - at::Tensor bias, - transformer_engine::DType bias_type, - at::Tensor pre_gelu_out, - bool grad, - at::Tensor workspace, - size_t workspaceSize, - bool accumulate, - bool use_split_accumulator, - int math_sm_count -) { - using namespace transformer_engine; - auto te_A = makeTransformerEngineTensor(A.data_ptr(), - {static_cast(A.size(0)), - static_cast(A.size(1))}, - A_type, nullptr, nullptr, - A_scale_inverse.data_ptr()); - auto te_B = makeTransformerEngineTensor(B.data_ptr(), - {static_cast(B.size(0)), - static_cast(B.size(1))}, - B_type, nullptr, nullptr, - B_scale_inverse.data_ptr()); - auto te_D = makeTransformerEngineTensor(D.data_ptr(), - {static_cast(D.size(0)), - static_cast(D.size(1))}, - D_type, D_amax.data_ptr(), - D_scale.data_ptr(), nullptr); - auto te_bias = makeTransformerEngineTensor(bias.data_ptr(), {static_cast(bias.size(0))}, - bias_type); - - const auto gelu_shape = pre_gelu_out.data_ptr() == nullptr - ? 
std::vector{static_cast(pre_gelu_out.size(0))} - : std::vector{static_cast(pre_gelu_out.size(0)), - static_cast(pre_gelu_out.size(1))}; - auto te_pre_gelu_out = makeTransformerEngineTensor(pre_gelu_out.data_ptr(), - gelu_shape, - GetTransformerEngineDType( - pre_gelu_out.scalar_type())); - auto te_workspace = makeTransformerEngineTensor(workspace.data_ptr(), - {workspaceSize}, - DType::kByte); - - nvte_cublas_gemm(te_A.data(), - te_B.data(), - te_D.data(), - te_bias.data(), - te_pre_gelu_out.data(), - transa, - transb, - grad, - te_workspace.data(), - accumulate, - use_split_accumulator, - math_sm_count, - at::cuda::getCurrentCUDAStream()); -} - - -void fused_cast_transpose(at::Tensor input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - at::Tensor input_cast, - at::Tensor input_transpose, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t M = static_cast(input.size(0)); - size_t N = static_cast(input.size(1)); - - auto input_cu = makeTransformerEngineTensor(input); - auto output_cast_cu = makeTransformerEngineTensor(input_cast.data_ptr(), {M, N}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - auto output_transpose_cu = makeTransformerEngineTensor(input_transpose.data_ptr(), {N, M}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_cast_transpose(input_cu.data(), output_cast_cu.data(), output_transpose_cu.data(), - at::cuda::getCurrentCUDAStream()); -} - - -std::vector fused_cast_transpose_bgrad(at::Tensor grad_output, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t M = static_cast(grad_output.size(0)); - size_t N = static_cast(grad_output.size(1)); - - DType grad_output_type = GetTransformerEngineDType(grad_output.scalar_type()); - auto grad_bias = allocateTorchTensor(grad_output.size(-1), grad_output_type); - auto grad_output_cast = - 
allocateTorchTensor(grad_output.size(0), - grad_output.size(1), - DType::kByte); - auto grad_output_transpose = - allocateTorchTensor(grad_output.size(1), - grad_output.size(0), - DType::kByte); - - auto input_cu = makeTransformerEngineTensor(grad_output); - auto cast_output_cu = makeTransformerEngineTensor(grad_output_cast.data_ptr(), {M, N}, - otype, amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - auto transposed_output_cu = makeTransformerEngineTensor(grad_output_transpose.data_ptr(), - {N, M}, otype, amax.data_ptr(), - scale.data_ptr(), scale_inv.data_ptr()); - auto dbias_cu = makeTransformerEngineTensor(grad_bias); - transformer_engine::TensorWrapper workspace; - - nvte_cast_transpose_dbias(input_cu.data(), cast_output_cu.data(), - transposed_output_cu.data(), dbias_cu.data(), - workspace.data(), at::cuda::getCurrentCUDAStream()); - - // Fill workspace - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - - nvte_cast_transpose_dbias(input_cu.data(), cast_output_cu.data(), - transposed_output_cu.data(), dbias_cu.data(), - workspace.data(), at::cuda::getCurrentCUDAStream()); - - return {grad_bias, grad_output_cast, grad_output_transpose}; -} - - -std::vector fused_fp8_transpose_bgrad(at::Tensor grad_output, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype, - transformer_engine::DType grad_bias_type -) { - using namespace transformer_engine; - - size_t M = static_cast(grad_output.size(0)); - size_t N = static_cast(grad_output.size(1)); - - auto grad_bias = allocateTorchTensor(grad_output.size(-1), grad_bias_type); - auto grad_output_transpose = - allocateTorchTensor(grad_output.size(1), - grad_output.size(0), - DType::kByte); - auto input_cu = makeTransformerEngineTensor(grad_output.data_ptr(), {M, N}, - otype, amax.data_ptr(), scale.data_ptr(), - 
scale_inv.data_ptr()); - auto transposed_output_cu = makeTransformerEngineTensor(grad_output_transpose.data_ptr(), - {N, M}, otype, amax.data_ptr(), - scale.data_ptr(), scale_inv.data_ptr()); - auto dbias_cu = makeTransformerEngineTensor(grad_bias); - transformer_engine::TensorWrapper workspace; - - nvte_fp8_transpose_dbias(input_cu.data(), transposed_output_cu.data(), dbias_cu.data(), - workspace.data(), at::cuda::getCurrentCUDAStream()); - - // Fill workspace - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - - nvte_fp8_transpose_dbias(input_cu.data(), transposed_output_cu.data(), dbias_cu.data(), - workspace.data(), at::cuda::getCurrentCUDAStream()); - - return {grad_bias, grad_output_transpose}; -} - - - -std::vector fused_cast_transpose_bgrad_dgelu(at::Tensor grad_output, - at::Tensor gelu_input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t M = static_cast(grad_output.size(0)); - size_t N = static_cast(grad_output.size(1)); - - DType grad_output_type = GetTransformerEngineDType(grad_output.scalar_type()); - auto grad_bias = allocateTorchTensor(grad_output.size(-1), grad_output_type); - auto dgelu = - allocateTorchTensor(grad_output.size(0), - grad_output.size(1), - DType::kByte); - auto dgelu_transpose = - allocateTorchTensor(grad_output.size(1), - grad_output.size(0), - DType::kByte); - - transformer_engine::TensorWrapper workspace; - auto gelu_input_cu = makeTransformerEngineTensor(gelu_input); - auto input_cu = makeTransformerEngineTensor(grad_output); - auto cast_output_cu = makeTransformerEngineTensor(dgelu.data_ptr(), {M, N}, - otype, amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - auto transposed_output_cu = makeTransformerEngineTensor(dgelu_transpose.data_ptr(), {N, M}, - otype, amax.data_ptr(), 
scale.data_ptr(), - scale_inv.data_ptr()); - auto dbias_cu = makeTransformerEngineTensor(grad_bias); - - nvte_cast_transpose_dbias_dgelu(input_cu.data(), gelu_input_cu.data(), - cast_output_cu.data(), transposed_output_cu.data(), - dbias_cu.data(), workspace.data(), - at::cuda::getCurrentCUDAStream()); - - // Fill workspace - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - - nvte_cast_transpose_dbias_dgelu(input_cu.data(), gelu_input_cu.data(), - cast_output_cu.data(), transposed_output_cu.data(), - dbias_cu.data(), workspace.data(), - at::cuda::getCurrentCUDAStream()); - - return {grad_bias, dgelu, dgelu_transpose}; -} - - -void fused_multi_cast_transpose(std::vector input_list, - std::vector scale_list, - std::vector cast_output_list, - std::vector transposed_output_list, - std::vector amax_list, - std::vector scale_inv_list, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - // Extract properties from PyTorch tensors - std::vector input_dptr_list, scale_dptr_list, - cast_output_dptr_list, transposed_output_dptr_list, - amax_dptr_list, scale_inv_dptr_list; - std::vector> input_shape_list, scale_shape_list, - cast_output_shape_list, transposed_output_shape_list, - amax_shape_list, scale_inv_shape_list; - std::vector input_type_list, scale_type_list, - cast_output_type_list, transposed_output_type_list, - amax_type_list, scale_inv_type_list; - auto extract_tensor_props_skip_dtype = [](at::Tensor& tensor, - std::vector& dptr_list, - std::vector>& shape_list) { - dptr_list.push_back(tensor.data_ptr()); - shape_list.push_back({}); - for (int d = 0; d < tensor.dim(); ++d) { - shape_list.back().push_back(tensor.size(d)); - } - }; - auto extract_tensor_props = [](at::Tensor& tensor, - std::vector& dptr_list, - std::vector>& shape_list, - std::vector& type_list) { - 
dptr_list.push_back(tensor.data_ptr()); - shape_list.push_back({}); - for (int d = 0; d < tensor.dim(); ++d) { - shape_list.back().push_back(tensor.size(d)); - } - type_list.push_back(GetTransformerEngineDType(tensor.scalar_type())); - }; - for (size_t tensor_id = 0; tensor_id < input_list.size(); ++tensor_id) { - extract_tensor_props(input_list[tensor_id], - input_dptr_list, - input_shape_list, - input_type_list); - extract_tensor_props(scale_list[tensor_id], - scale_dptr_list, - scale_shape_list, - scale_type_list); - extract_tensor_props_skip_dtype(cast_output_list[tensor_id], - cast_output_dptr_list, - cast_output_shape_list); - cast_output_type_list.push_back(otype); - extract_tensor_props_skip_dtype(transposed_output_list[tensor_id], - transposed_output_dptr_list, - transposed_output_shape_list); - transposed_output_type_list.push_back(otype); - extract_tensor_props(amax_list[tensor_id], - amax_dptr_list, - amax_shape_list, - amax_type_list); - extract_tensor_props(scale_inv_list[tensor_id], - scale_inv_dptr_list, - scale_inv_shape_list, - scale_inv_type_list); - } - - transformer_engine::TensorWrapper workspace; - - // Construct TE tensors - std::vector nvte_input_list, - nvte_cast_output_list, nvte_transposed_output_list; - std::vector tensor_wrappers; - auto make_tensor = [&tensor_wrappers](void* dptr, - const std::vector& shape, - transformer_engine::DType dtype, - void* amax_dptr, - void* scale_dptr, - void* scale_inv_dptr) - -> NVTETensor { - tensor_wrappers.emplace_back(makeTransformerEngineTensor(dptr, shape, dtype, amax_dptr, - scale_dptr, scale_inv_dptr)); - return tensor_wrappers.back().data(); - }; - for (size_t i = 0; i < input_dptr_list.size(); ++i) { - nvte_input_list.emplace_back(make_tensor(input_dptr_list[i], - input_shape_list[i], - input_type_list[i], - nullptr, - nullptr, - nullptr)); - nvte_cast_output_list.emplace_back(make_tensor(cast_output_dptr_list[i], - cast_output_shape_list[i], - cast_output_type_list[i], - amax_dptr_list[i], - 
scale_dptr_list[i], - scale_inv_dptr_list[i])); - nvte_transposed_output_list.emplace_back(make_tensor(transposed_output_dptr_list[i], - transposed_output_shape_list[i], - transposed_output_type_list[i], - amax_dptr_list[i], - scale_dptr_list[i], - scale_inv_dptr_list[i])); - } - - // Check tensor lists - NVTE_CHECK(nvte_cast_output_list.size() == nvte_input_list.size(), - "Number of input and C output tensors must match"); - NVTE_CHECK(nvte_transposed_output_list.size() == nvte_input_list.size(), - "Number of input and T output tensors must match"); - - // Launch TE kernel - nvte_multi_cast_transpose(nvte_input_list.size(), - nvte_input_list.data(), - nvte_cast_output_list.data(), - nvte_transposed_output_list.data(), - at::cuda::getCurrentCUDAStream()); -} - - -at::Tensor fp8_transpose(at::Tensor input, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t M = static_cast(input.size(0)); - size_t N = static_cast(input.size(1)); - - auto output = - allocateTorchTensor(input.size(1), - input.size(0), - DType::kByte); - - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, otype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {N, M}, otype); - - nvte_transpose(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - - -at::Tensor gelu(at::Tensor input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_gelu(input_cu.data(), output_cu.data(), 
at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor dgelu(at::Tensor grad, - at::Tensor input, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto gtype = GetTransformerEngineDType(grad.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N}, gtype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); - - nvte_dgelu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor relu(at::Tensor input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = static_cast(input.numel()) / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_relu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor drelu(at::Tensor grad, - at::Tensor input, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto gtype = GetTransformerEngineDType(grad.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, 
itype); - auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N}, gtype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); - - nvte_drelu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor geglu(at::Tensor input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N / 2, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N / 2}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_geglu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor dgeglu(at::Tensor grad, - at::Tensor input, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto gtype = GetTransformerEngineDType(grad.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N / 2}, gtype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); - - nvte_dgeglu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor reglu(at::Tensor input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - 
size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N / 2, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N / 2}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_reglu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor dreglu(at::Tensor grad, - at::Tensor input, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto gtype = GetTransformerEngineDType(grad.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N / 2}, gtype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); - - nvte_dreglu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor swiglu(at::Tensor input, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N / 2, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N / 2}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_swiglu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -at::Tensor 
dswiglu(at::Tensor grad, - at::Tensor input, - transformer_engine::DType otype -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(-1)); - size_t M = input.numel() / N; - - auto output = - allocateTorchTensor(M, - N, - otype); - - auto itype = GetTransformerEngineDType(input.scalar_type()); - auto gtype = GetTransformerEngineDType(grad.scalar_type()); - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); - auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N / 2}, gtype); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); - - nvte_dswiglu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); - - return output; -} - -std::vector layernorm_bwd(const at::Tensor &dz, - const at::Tensor &x, - const at::Tensor &mu, - const at::Tensor &rsigma, - const at::Tensor &gamma, - const int sm_margin, - const bool zero_centered_gamma -) { - auto dx = at::empty_like(x); - auto dgamma = at::empty_like(gamma); - auto dbeta = at::empty_like(gamma); - transformer_engine::TensorWrapper workspace, barrier, dgamma_part, dbeta_part; - - auto dz_cu = makeTransformerEngineTensor(dz); - auto x_cu = makeTransformerEngineTensor(x); - auto mu_cu = makeTransformerEngineTensor(mu); - auto rsigma_cu = makeTransformerEngineTensor(rsigma); - auto gamma_cu = makeTransformerEngineTensor(gamma); - auto dx_cu = makeTransformerEngineTensor(dx); - auto dgamma_cu = makeTransformerEngineTensor(dgamma); - auto dbeta_cu = makeTransformerEngineTensor(dbeta); - - // This call populates tensors with the required config. - const auto bwd_fun = zero_centered_gamma ? 
nvte_layernorm1p_bwd : nvte_layernorm_bwd; - bwd_fun(dz_cu.data(), x_cu.data(), mu_cu.data(), rsigma_cu.data(), gamma_cu.data(), - dx_cu.data(), dgamma_cu.data(), dbeta_cu.data(), dgamma_part.data(), - dbeta_part.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - // Alloc space for Tensors. - auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); - auto barrier_data = allocateSpace(barrier.shape(), barrier.dtype(), true); - auto dgamma_part_data = allocateSpace(dgamma_part.shape(), dgamma_part.dtype()); - auto dbeta_part_data = allocateSpace(dbeta_part.shape(), dbeta_part.dtype()); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), - barrier.shape(), - barrier.dtype()); - dgamma_part = makeTransformerEngineTensor(dgamma_part_data.data_ptr(), - dgamma_part.shape(), - dgamma_part.dtype()); - dbeta_part = makeTransformerEngineTensor(dbeta_part_data.data_ptr(), - dbeta_part.shape(), - dbeta_part.dtype()); - - // Actual call to bwd kernel. 
- bwd_fun(dz_cu.data(), x_cu.data(), mu_cu.data(), rsigma_cu.data(), gamma_cu.data(), - dx_cu.data(), dgamma_cu.data(), dbeta_cu.data(), dgamma_part.data(), - dbeta_part.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - return { dx, dgamma, dbeta }; -} - - -std::vector layernorm_fwd_fp8(const at::Tensor &input, - const at::Tensor &weight, - const at::Tensor &bias, - float eps, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype, - const int sm_margin, - const bool zero_centered_gamma -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); - - DType itype = GetTransformerEngineDType(input.scalar_type()); - - auto ln_out = at::empty_like(input, at::CUDA(GetATenDType(otype))); - auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto input_cu = makeTransformerEngineTensor(input); - auto gamma_cu = makeTransformerEngineTensor(weight); - auto beta_cu = makeTransformerEngineTensor(bias); - auto z_cu = makeTransformerEngineTensor(ln_out.data_ptr(), {N, H}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - auto mu_cu = makeTransformerEngineTensor(mu); - auto rsigma_cu = makeTransformerEngineTensor(rsigma); - transformer_engine::TensorWrapper workspace, barrier; - - // This call populates workspace and barrier tensors with the required config - const auto func = zero_centered_gamma ? 
nvte_layernorm1p_fwd : nvte_layernorm_fwd; - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - // Fill workspace and barrier - auto workspace_data = allocateSpace(workspace.shape(), - workspace.dtype()); - auto barrier_data = allocateSpace(barrier.shape(), - barrier.dtype(), - true); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), - barrier.shape(), - barrier.dtype()); - - // Actual call to fwd kernel - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - return {ln_out, mu, rsigma}; -} - - -std::vector layernorm_fwd_fp8_noalloc(const at::Tensor &input, - const at::Tensor &weight, - const at::Tensor &bias, - float eps, - at::Tensor scale, - at::Tensor ln_out, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype, - const int sm_margin, - const bool zero_centered_gamma -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); - - DType itype = GetTransformerEngineDType(input.scalar_type()); - - auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto input_cu = makeTransformerEngineTensor(input); - auto gamma_cu = makeTransformerEngineTensor(weight); - auto beta_cu = makeTransformerEngineTensor(bias); - auto z_cu = makeTransformerEngineTensor(ln_out.data_ptr(), {N, H}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - auto mu_cu = 
makeTransformerEngineTensor(mu); - auto rsigma_cu = makeTransformerEngineTensor(rsigma); - transformer_engine::TensorWrapper workspace, barrier; - - // This call populates workspace and barrier tensors with the required config - const auto func = zero_centered_gamma ? nvte_layernorm1p_fwd : nvte_layernorm_fwd; - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - // Fill workspace and barrier - auto workspace_data = allocateSpace(workspace.shape(), - workspace.dtype()); - auto barrier_data = allocateSpace(barrier.shape(), - barrier.dtype(), - true); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), - barrier.shape(), - barrier.dtype()); - - // Actual call to fwd kernel - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - return {ln_out, mu, rsigma}; -} - - -at::Tensor layernorm_fwd_fp8_inf(const at::Tensor &input, - const at::Tensor &weight, - const at::Tensor &bias, - float eps, - at::Tensor scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype, - const bool zero_centered_gamma -) { - // This is a specialized version of layernorm_fwd_fp8, optimized for inference, - // which only returns the normalized output. 
- std::vector out = layernorm_fwd_fp8( - input, weight, bias, eps, scale, amax, scale_inv, otype, 0, zero_centered_gamma); - return out[0]; -} - - -std::vector layernorm_fwd(const at::Tensor &input, - const at::Tensor &weight, - const at::Tensor &bias, - float eps, - const int sm_margin, - const bool zero_centered_gamma -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); - - DType itype = GetTransformerEngineDType(input.scalar_type()); - - auto ln_out = at::empty_like(input, at::CUDA(GetATenDType(itype))); - auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto input_cu = makeTransformerEngineTensor(input); - auto gamma_cu = makeTransformerEngineTensor(weight); - auto beta_cu = makeTransformerEngineTensor(bias); - auto z_cu = makeTransformerEngineTensor(ln_out); - auto mu_cu = makeTransformerEngineTensor(mu); - auto rsigma_cu = makeTransformerEngineTensor(rsigma); - transformer_engine::TensorWrapper workspace, barrier; - - // This call populates workspace and barrier tensors with the required config - const auto func = zero_centered_gamma ? 
nvte_layernorm1p_fwd : nvte_layernorm_fwd; - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - // Fill workspace and barrier - auto workspace_data = allocateSpace(workspace.shape(), - workspace.dtype()); - auto barrier_data = allocateSpace(barrier.shape(), - barrier.dtype(), - true); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), - barrier.shape(), - barrier.dtype()); - - // Actual call to fwd kernel - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - return {ln_out, mu, rsigma}; -} - - -std::vector layernorm_fwd_noalloc(const at::Tensor &input, - const at::Tensor &weight, - const at::Tensor &bias, - at::Tensor ln_out, - float eps, - const int sm_margin, - const bool zero_centered_gamma -) { - using namespace transformer_engine; - - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); - - DType itype = GetTransformerEngineDType(input.scalar_type()); - - auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); - auto input_cu = makeTransformerEngineTensor(input); - auto gamma_cu = makeTransformerEngineTensor(weight); - auto beta_cu = makeTransformerEngineTensor(bias); - auto z_cu = makeTransformerEngineTensor(ln_out); - auto mu_cu = makeTransformerEngineTensor(mu); - auto rsigma_cu = makeTransformerEngineTensor(rsigma); - transformer_engine::TensorWrapper workspace, barrier; - - // This call populates workspace and barrier tensors with 
the required config - const auto func = zero_centered_gamma ? nvte_layernorm1p_fwd : nvte_layernorm_fwd; - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - // Fill workspace and barrier - auto workspace_data = allocateSpace(workspace.shape(), - workspace.dtype()); - auto barrier_data = allocateSpace(barrier.shape(), - barrier.dtype(), - true); - workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), - workspace.shape(), - workspace.dtype()); - barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), - barrier.shape(), - barrier.dtype()); - - // Actual call to fwd kernel - func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), - mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), - at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, - workspace.data(), barrier.data()); - - return {ln_out, mu, rsigma}; -} - - -at::Tensor layernorm_fwd_inf(const at::Tensor &input, - const at::Tensor &weight, - const at::Tensor &bias, - float eps, - const bool zero_centered_gamma -) { - // This is a specialized version of layernorm_fwd, optimized for inference, - // which only returns the normalized output. 
- std::vector out = layernorm_fwd(input, weight, bias, eps, 0, zero_centered_gamma); - return out[0]; -} - - -at::Tensor cast_to_fp8(const at::Tensor &input, - const at::Tensor &scale, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - auto input_shape = input.sizes().vec(); - std::vector shape{input_shape.begin(), input_shape.end()}; - - auto output = at::empty_like(input, at::CUDA(GetATenDType(otype))); - - auto input_cu = makeTransformerEngineTensor(input); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), shape, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_fp8_quantize(input_cu.data(), output_cu.data(), - at::cuda::getCurrentCUDAStream()); - - return output; -} - - -void cast_to_fp8_noalloc(const at::Tensor &input, - const at::Tensor &scale, - at::Tensor output, - at::Tensor amax, - at::Tensor scale_inv, - transformer_engine::DType otype -) { - using namespace transformer_engine; - size_t N = static_cast(input.size(0)); - size_t H = static_cast(input.size(1)); - - auto input_cu = makeTransformerEngineTensor(input); - auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {N, H}, otype, - amax.data_ptr(), scale.data_ptr(), - scale_inv.data_ptr()); - - nvte_fp8_quantize(input_cu.data(), output_cu.data(), - at::cuda::getCurrentCUDAStream()); - - return; -} - - -at::Tensor cast_from_fp8(const at::Tensor &input, - const at::Tensor &scale_inv, - transformer_engine::DType itype, - transformer_engine::DType otype -) { - using namespace transformer_engine; - auto input_shape = input.sizes().vec(); - std::vector shape{input_shape.begin(), input_shape.end()}; - - auto output = at::empty_like(input, at::CUDA(GetATenDType(otype))); - - auto input_cu = makeTransformerEngineTensor(input.data_ptr(), shape, itype, - nullptr, nullptr, scale_inv.data_ptr()); - auto output_cu = makeTransformerEngineTensor(output); - - nvte_fp8_dequantize(input_cu.data(), 
output_cu.data(), - at::cuda::getCurrentCUDAStream()); - - return output; -} - - -at::Tensor scaled_softmax_forward(at::Tensor input, - float scale_factor -) { - using namespace transformer_engine; - AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); - AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || - (input.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - const int batches = input.size(0); - const int attn_heads = input.size(1); - const int query_seq_len = input.size(2); - const int key_seq_len = input.size(3); - - TORCH_CHECK(key_seq_len <= 4096); - TORCH_CHECK(query_seq_len > 1); - - // Output - auto act_options = input.options().requires_grad(false); - auto softmax_results = - torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); - - auto input_cu = makeTransformerEngineTensor(input); - auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); - - nvte_scaled_softmax_forward(input_cu.data(), softmax_results_cu.data(), scale_factor, - at::cuda::getCurrentCUDAStream()); - - return softmax_results; -} - - -at::Tensor scaled_softmax_backward(at::Tensor output_grad_, - at::Tensor softmax_results_, - float scale_factor -) { - using namespace transformer_engine; - - auto output_grads = output_grad_.contiguous(); - auto softmax_results = softmax_results_.contiguous(); - - AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); - AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); - - AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || - (output_grads.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || - (softmax_results.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - auto output_grads_cu = makeTransformerEngineTensor(output_grads); - auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); - - // 
Produce gradients in place. - nvte_scaled_softmax_backward( - output_grads_cu.data(), softmax_results_cu.data(), output_grads_cu.data(), - scale_factor, at::cuda::getCurrentCUDAStream()); - - return output_grads; -} - - -at::Tensor scaled_masked_softmax_forward(at::Tensor input, - at::Tensor mask, - float scale_factor -) { - using namespace transformer_engine; - - AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); - AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || - (input.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); - if (!input.is_contiguous()) - input = input.contiguous(); - if (!mask.is_contiguous()) - mask = mask.contiguous(); - - const int batches = input.size(0); - const int pad_batches = mask.size(0); - const int attn_heads = input.size(1); - const int query_seq_len = input.size(2); - const int key_seq_len = input.size(3); - TORCH_CHECK(key_seq_len <= 4096); - TORCH_CHECK(query_seq_len > 1); - TORCH_CHECK(pad_batches == 1 || pad_batches == batches); - TORCH_CHECK(mask.size(1) == 1); - TORCH_CHECK(mask.size(2) == query_seq_len); - TORCH_CHECK(mask.size(3) == key_seq_len); - - auto act_options = input.options().requires_grad(false); - auto softmax_results = - torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); - - - auto input_cu = makeTransformerEngineTensor(input); - auto mask_cu = makeTransformerEngineTensor(mask); - auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); - - nvte_scaled_masked_softmax_forward( - input_cu.data(), mask_cu.data(), softmax_results_cu.data(), - scale_factor, at::cuda::getCurrentCUDAStream()); - - return softmax_results; -} - - -at::Tensor scaled_masked_softmax_backward(at::Tensor output_grad_, - at::Tensor softmax_results_, - float scale_factor -) { - using namespace transformer_engine; - - auto output_grads = output_grad_.contiguous(); - auto softmax_results = 
softmax_results_.contiguous(); - - AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); - AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); - - AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || - (output_grads.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || - (softmax_results.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - auto output_grads_cu = makeTransformerEngineTensor(output_grads); - auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); - - // Produce gradients in place. - nvte_scaled_softmax_backward( - output_grads_cu.data(), softmax_results_cu.data(), output_grads_cu.data(), - scale_factor, at::cuda::getCurrentCUDAStream()); - - return output_grads; -} - - -at::Tensor scaled_upper_triang_masked_softmax_forward(at::Tensor input, - float scale_factor -) { - using namespace transformer_engine; - - AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); - AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || - (input.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - const int attn_batches = input.size(0); - const int seq_len = input.size(1); - TORCH_CHECK(seq_len <= 2048); - - // Output - auto act_options = input.options().requires_grad(false); - auto softmax_results = - torch::empty({attn_batches, seq_len, seq_len}, act_options); - - auto input_cu = makeTransformerEngineTensor(input); - auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); - - nvte_scaled_upper_triang_masked_softmax_forward(input_cu.data(), - softmax_results_cu.data(), - scale_factor, - at::cuda::getCurrentCUDAStream()); - - return softmax_results; -} - - -at::Tensor scaled_upper_triang_masked_softmax_backward(at::Tensor output_grads_, - at::Tensor softmax_results_, - float scale_factor -) { - using namespace transformer_engine; - - auto 
output_grads = output_grads_.contiguous(); - auto softmax_results = softmax_results_.contiguous(); - - AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); - AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); - - AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || - (output_grads.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || - (softmax_results.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - TORCH_CHECK(output_grads.size(1) == output_grads.size(2)); - - auto output_grads_cu = makeTransformerEngineTensor(output_grads); - auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); - - // Produce gradients in place. - nvte_scaled_upper_triang_masked_softmax_backward(output_grads_cu.data(), - softmax_results_cu.data(), - output_grads_cu.data(), - scale_factor, - at::cuda::getCurrentCUDAStream()); - - return output_grads; -} - - -size_t get_cublasLt_version() { - return cublasLtGetVersion(); -} - - -bool userbuf_comm_available() { // TODO(ksivamani) check on python side -#ifdef NVTE_WITH_USERBUFFERS - return true; -#else - return false; -#endif -} - -void placeholder() {} // TODO(ksivamani) clean this up - -namespace flash_attention { - -constexpr int warp_size = 32; -constexpr int type_size = 2; // FP16 or BF16 -constexpr int nvec = sizeof(uint64_t) / type_size; -constexpr int load_size = warp_size * nvec; -constexpr int block_size = 512; - -template -__launch_bounds__(block_size) -__global__ void prepare_kernel_fwd(const T *qkvi, - T *qkv, - const size_t B, - const size_t S, - const size_t Z, - const size_t W) { - const int warpid = (blockDim.x * blockIdx.x + threadIdx.x) / warp_size; - const int id_in_warp = threadIdx.x % warp_size; - const size_t offset_input = blockIdx.y * W + warpid * 3 * W * Z + id_in_warp * nvec; - const T *my_input = qkvi + offset_input; - - const size_t s 
= warpid / B; - if (s >= S) return; - - const size_t b = warpid % B; - - const size_t offset_output = blockIdx.y * B * S * Z * W + - (s + b * S) * W * Z + - id_in_warp * nvec; - - T *my_output = qkv + offset_output; - - for (int i = 0; i < Z; ++i) { - uint64_t *out = reinterpret_cast(my_output + i * load_size); - *out = *reinterpret_cast(my_input + i * load_size * 3); - } -} - -template -__launch_bounds__(block_size) -__global__ void prepare_kernel_bwd(const T *q, const T *k, const T *v, - T *qkv, const size_t B, const size_t S, - const size_t Z, const size_t W) { - const T *input = blockIdx.y == 0 ? q : (blockIdx.y == 1 ? k : v); - - const int warpid = (blockDim.x * blockIdx.x + threadIdx.x) / warp_size; - const int id_in_warp = threadIdx.x % warp_size; - const size_t offset_input = warpid * W * Z + id_in_warp * nvec; - const T *my_input = input + offset_input; - - const size_t b = warpid / S; - if (b >= B) return; - - const size_t s = warpid % S; - - const size_t offset_output = (b + s * B) * 3 * W * Z + - id_in_warp * nvec + blockIdx.y * W; - - T *my_output = qkv + offset_output; - - for (int i = 0; i < Z; ++i) { - uint64_t *out = reinterpret_cast(my_output + i * load_size * 3); - *out = *reinterpret_cast(my_input + i * load_size); - } -} - -} // namespace flash_attention - -at::Tensor fa_prepare_fwd(at::Tensor qkvi) { - NVTE_CHECK(qkvi.dim() == 4, "Expected 4-dim tensor."); - NVTE_CHECK(qkvi.scalar_type() == at::ScalarType::Half || - qkvi.scalar_type() == at::ScalarType::BFloat16); - NVTE_CHECK(qkvi.size(3) % flash_attention::load_size == 0); - NVTE_CHECK(qkvi.size(3) == flash_attention::load_size); - NVTE_CHECK(qkvi.stride(3) == 1, "Wrong stride."); - NVTE_CHECK(qkvi.stride(2) == 3 * qkvi.size(3), "Wrong stride."); - NVTE_CHECK(qkvi.stride(1) == 3 * qkvi.size(3) * qkvi.size(2), "Wrong stride."); - NVTE_CHECK(qkvi.stride(0) == 3 * qkvi.size(3) * qkvi.size(2) * qkvi.size(1), "Wrong stride."); - - // [s, b, n, h * 3] -> [3, b, s, n, h] - std::vector shape = {3, 
qkvi.size(1), qkvi.size(0), qkvi.size(2), qkvi.size(3)}; - at::Tensor qkv = at::empty(shape, at::CUDA(qkvi.scalar_type())); - - size_t warps = qkvi.size(0) * qkvi.size(1); - size_t warps_per_block = flash_attention::block_size / flash_attention::warp_size; - size_t blocks = (warps + warps_per_block - 1) / warps_per_block; - dim3 grid(blocks, 3); - int threads = flash_attention::block_size; - if (qkvi.scalar_type() == at::ScalarType::Half) { - using dtype = at::Half; - flash_attention::prepare_kernel_fwd<<>>( - qkvi.data_ptr(), - qkv.data_ptr(), - shape[1], - shape[2], - shape[3], - shape[4]); - } else { - using dtype = at::BFloat16; - flash_attention::prepare_kernel_fwd<<>>( - qkvi.data_ptr(), - qkv.data_ptr(), - shape[1], - shape[2], - shape[3], - shape[4]); - } - - return qkv; -} - -at::Tensor fa_prepare_bwd(at::Tensor q, at::Tensor k, at::Tensor v) { - NVTE_CHECK(q.is_contiguous()); - NVTE_CHECK(k.is_contiguous()); - NVTE_CHECK(v.is_contiguous()); - NVTE_CHECK(q.dim() == 4, "Expected 4-dim tensor."); - NVTE_CHECK(k.dim() == 4, "Expected 4-dim tensor."); - NVTE_CHECK(v.dim() == 4, "Expected 4-dim tensor."); - NVTE_CHECK(q.scalar_type() == at::ScalarType::Half || - q.scalar_type() == at::ScalarType::BFloat16); - NVTE_CHECK(k.scalar_type() == q.scalar_type()); - NVTE_CHECK(v.scalar_type() == q.scalar_type()); - NVTE_CHECK(q.size(3) % flash_attention::load_size == 0); - NVTE_CHECK(q.size(3) == flash_attention::load_size); - NVTE_CHECK(k.size(3) % flash_attention::load_size == 0); - NVTE_CHECK(k.size(3) == flash_attention::load_size); - NVTE_CHECK(v.size(3) % flash_attention::load_size == 0); - NVTE_CHECK(v.size(3) == flash_attention::load_size); - - // 3 x [s, b, n, h] -> [b, s, n, 3 * h] - - std::vector shape = {q.size(1), q.size(0), q.size(2), 3 * q.size(3)}; - at::Tensor qkv = at::empty(shape, at::CUDA(q.scalar_type())); - - size_t warps = q.size(0) * q.size(1); - size_t warps_per_block = flash_attention::block_size / flash_attention::warp_size; - size_t blocks = 
(warps + warps_per_block - 1) / warps_per_block; - dim3 grid(blocks, 3); - int threads = flash_attention::block_size; - if (q.scalar_type() == at::ScalarType::Half) { - using dtype = at::Half; - flash_attention::prepare_kernel_bwd<<>>( - q.data_ptr(), - k.data_ptr(), - v.data_ptr(), - qkv.data_ptr(), - q.size(0), - q.size(1), - q.size(2), - q.size(3)); - } else { - using dtype = at::BFloat16; - flash_attention::prepare_kernel_bwd<<>>( - q.data_ptr(), - k.data_ptr(), - v.data_ptr(), - qkv.data_ptr(), - q.size(0), - q.size(1), - q.size(2), - q.size(3)); - } - - return qkv; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // Softmax functions - m.def("scaled_softmax_forward", &scaled_softmax_forward, "Scaled Softmax FWD"); - m.def("scaled_softmax_backward", &scaled_softmax_backward, "Scaled Softmax BWD"); - m.def("scaled_masked_softmax_forward", &scaled_masked_softmax_forward, - "Scaled Masked Softmax FWD"); - m.def("scaled_masked_softmax_backward", &scaled_masked_softmax_backward, - "Scaled Masked Softmax BWD"); - m.def("scaled_upper_triang_masked_softmax_forward", - &scaled_upper_triang_masked_softmax_forward, - "Scaled Upper-Triangular Masked Softmax FWD"); - m.def("scaled_upper_triang_masked_softmax_backward", - &scaled_upper_triang_masked_softmax_backward, - "Scaled Upper-Triangular Masked Softmax BWD"); - - // Other granular functions - m.def("layernorm_fwd_fp8", &layernorm_fwd_fp8, "LN FWD FP8"); - m.def("layernorm_fwd_fp8_noalloc", &layernorm_fwd_fp8_noalloc, "LN FWD FP8"); - m.def("layernorm_bwd", &layernorm_bwd, "LN BWD"); - m.def("layernorm_fwd", &layernorm_fwd, "LN FWD"); - m.def("layernorm_fwd_noalloc", &layernorm_fwd_noalloc, "LN FWD"); - m.def("fused_cast_transpose", &fused_cast_transpose, "Fused Cast + Transpose"); - m.def("fused_cast_transpose_bgrad", &fused_cast_transpose_bgrad, - "Fused Cast + Transpose + BGRAD"); - m.def("fused_fp8_transpose_bgrad", &fused_fp8_transpose_bgrad, - "Fused FP8 Transpose + BGRAD"); - 
m.def("fused_cast_transpose_bgrad_dgelu", &fused_cast_transpose_bgrad_dgelu, - "Fused Cast + Transpose + BGRAD + DGELU"); - m.def("fused_multi_cast_transpose", &fused_multi_cast_transpose, - "Fused Multi-tensor Cast + Transpose"); - m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8"); - m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8"); - m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8"); - m.def("te_gemm", &te_gemm, "CublasLt GEMM"); - m.def("fused_attn_fwd_qkvpacked", &fused_attn_fwd_qkvpacked, - "Fused Attention FP8/BF16/FP16 FWD with packed QKV"); - m.def("fused_attn_bwd_qkvpacked", &fused_attn_bwd_qkvpacked, - "Fused Attention FP8/BF16/FP16 BWD with packed QKV"); - m.def("fused_attn_fwd_kvpacked", &fused_attn_fwd_kvpacked, - "Fused Attention FP8/BF16/FP16 FWD with packed KV"); - m.def("fused_attn_bwd_kvpacked", &fused_attn_bwd_kvpacked, - "Fused Attention FP8/BF16/FP16 BWD with packed KV"); - m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); - m.def("gelu", &gelu, "GeLU with FP8 output"); - m.def("relu", &relu, "ReLU with FP8 output"); - m.def("geglu", &geglu, "GeGLU with FP8 output"); - m.def("reglu", &reglu, "ReGLU with FP8 output"); - m.def("swiglu", &swiglu, "SwiGLU with FP8 output"); - m.def("dgelu", &dgelu, "Backward of GeLU"); - m.def("drelu", &drelu, "Backward of ReLU"); - m.def("dgeglu", &dgeglu, "Backward of GeGLU"); - m.def("dreglu", &dreglu, "Backward of ReGLU"); - m.def("dswiglu", &dswiglu, "Backward of SwiGLU"); - m.def("fa_prepare_fwd", &fa_prepare_fwd, "Prepare QKV for Flash Attention"); - m.def("fa_prepare_bwd", &fa_prepare_bwd, "Backward of QKV preparation for Flash Attention"); - m.def("get_fused_attn_backend", &get_fused_attn_backend, "Get Fused Attention backend"); - - // Misc - m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); - m.def("userbuf_comm_available", &userbuf_comm_available, "If userbuf backend is available"); - - // Data structures - py::class_(m, "FP8TensorMeta") -
.def(py::init<>()) - .def_readwrite("scale", &transformer_engine::FP8TensorMeta::scale) - .def_readwrite("scale_inv", &transformer_engine::FP8TensorMeta::scale_inv) - .def_readwrite("amax_history", &transformer_engine::FP8TensorMeta::amax_history); - -#ifdef NVTE_WITH_USERBUFFERS - py::enum_(m, "UbufOverlapAlgo") - .value("BULK_OVERLAP_AG", ubuf::UBOverlapAlgo::BULK_OVERLAP_AG) - .value("BULK_OVERLAP_RS", ubuf::UBOverlapAlgo::BULK_OVERLAP_RS) - .value("SPLIT_PIPELINED_RS", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_RS) - .value("SPLIT_PIPELINED_AG", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_AG); - - py::class_(m, "UbufCommOverlap") - .def(py::init()) - .def("bulk_overlap", &ubuf::UbufCommOverlap::bulk_overlap) - .def("split_overlap_rs", &ubuf::UbufCommOverlap::split_overlap_rs) - .def("copy_input_to_ubuf", &ubuf::UbufCommOverlap::copy_input_to_ubuf) - .def("get_ubuf_output", &ubuf::UbufCommOverlap::get_ubuf_output); - - py::class_(m, "UbufP2PCommOverlap") - .def(py::init()) - .def("split_overlap_ag", &ubuf::UbufP2PCommOverlap::split_overlap_ag) - .def("copy_input_to_ubuf", &ubuf::UbufP2PCommOverlap::copy_input_to_ubuf) - .def("get_ubuf_output", &ubuf::UbufP2PCommOverlap::get_ubuf_output); -#else // NVTE_WITH_USERBUFFERS - m.def("UbufOverlapAlgo", &placeholder, "Dummy function for python side annotations"); - m.def("UbufCommOverlap", &placeholder, "Dummy function for python side annotations"); - m.def("UbufP2PCommOverlap", &placeholder, "Dummy function for python side annotations"); -#endif // NVTE_WITH_USERBUFFERS - - py::enum_(m, "DType", py::module_local()) - .value("kByte", transformer_engine::DType::kByte) - .value("kInt32", transformer_engine::DType::kInt32) - .value("kFloat32", transformer_engine::DType::kFloat32) - .value("kFloat16", transformer_engine::DType::kFloat16) - .value("kBFloat16", transformer_engine::DType::kBFloat16) - .value("kFloat8E4M3", transformer_engine::DType::kFloat8E4M3) - .value("kFloat8E5M2", transformer_engine::DType::kFloat8E5M2); - - 
py::enum_(m, "FP8FwdTensors") - .value("GEMM1_INPUT", transformer_engine::FP8FwdTensors::GEMM1_INPUT) - .value("GEMM1_WEIGHT", transformer_engine::FP8FwdTensors::GEMM1_WEIGHT) - .value("GEMM1_OUTPUT", transformer_engine::FP8FwdTensors::GEMM1_OUTPUT) - .value("GEMM2_INPUT", transformer_engine::FP8FwdTensors::GEMM2_INPUT) - .value("GEMM2_WEIGHT", transformer_engine::FP8FwdTensors::GEMM2_WEIGHT) - .value("GEMM2_OUTPUT", transformer_engine::FP8FwdTensors::GEMM2_OUTPUT) - .value("GEMM3_INPUT", transformer_engine::FP8FwdTensors::GEMM3_INPUT) - .value("GEMM3_WEIGHT", transformer_engine::FP8FwdTensors::GEMM3_WEIGHT) - .value("GEMM3_OUTPUT", transformer_engine::FP8FwdTensors::GEMM3_OUTPUT); - - py::enum_(m, "FP8BwdTensors") - .value("GRAD_OUTPUT1", transformer_engine::FP8BwdTensors::GRAD_OUTPUT1) - .value("GRAD_INPUT1", transformer_engine::FP8BwdTensors::GRAD_INPUT1) - .value("GRAD_OUTPUT2", transformer_engine::FP8BwdTensors::GRAD_OUTPUT2) - .value("GRAD_INPUT2", transformer_engine::FP8BwdTensors::GRAD_INPUT2) - .value("GRAD_OUTPUT3", transformer_engine::FP8BwdTensors::GRAD_OUTPUT3) - .value("GRAD_INPUT3", transformer_engine::FP8BwdTensors::GRAD_INPUT3); - - py::enum_(m, "NVTE_Bias_Type") - .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) - .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) - .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); - - py::enum_(m, "NVTE_Mask_Type") - .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) - .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) - .value("NVTE_CAUSAL_MASK", NVTE_Mask_Type::NVTE_CAUSAL_MASK); - - py::enum_(m, "NVTE_QKV_Layout") - .value("NVTE_NOT_INTERLEAVED", NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) - .value("NVTE_QKV_INTERLEAVED", NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) - .value("NVTE_KV_INTERLEAVED", NVTE_QKV_Layout::NVTE_KV_INTERLEAVED); - - py::enum_(m, "NVTE_Fused_Attn_Backend") - .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) - 
.value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) - .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) - .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); -} diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 1467397c63..d06906b5a2 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -106,6 +106,10 @@ std::vector fused_attn_bwd_kvpacked( c10::optional amax_dP, c10::optional amax_dQKV); +at::Tensor fa_prepare_fwd(at::Tensor qkvi); + +at::Tensor fa_prepare_bwd(at::Tensor q, at::Tensor k, at::Tensor v); + void te_gemm(at::Tensor A, at::Tensor A_scale_inverse, transformer_engine::DType A_type, @@ -318,6 +322,77 @@ at::Tensor layernorm_fwd_inf(const at::Tensor &input, const bool zero_centered_gamma ); +/*************************************************************************************************** + * RMSNorm + **************************************************************************************************/ + +std::vector rmsnorm_bwd(const at::Tensor &dz, + const at::Tensor &x, + const at::Tensor &rsigma, + const at::Tensor &gamma, + const int sm_margin, + const bool zero_centered_gamma +); + + +std::vector rmsnorm_fwd_fp8(const at::Tensor &input, + const at::Tensor &weight, + float eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +); + +std::vector rmsnorm_fwd_fp8_noalloc(const at::Tensor &input, + const at::Tensor &weight, + float eps, + at::Tensor scale, + at::Tensor ln_out, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +); + +at::Tensor rmsnorm_fwd_fp8_inf(const at::Tensor &input, + const at::Tensor &weight, + float eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + 
transformer_engine::DType otype, + const bool zero_centered_gamma +); + +std::vector rmsnorm_fwd(const at::Tensor &input, + const at::Tensor &weight, + float eps, + const int sm_margin, + const bool zero_centered_gamma +); + +std::vector rmsnorm_fwd_noalloc(const at::Tensor &input, + const at::Tensor &weight, + at::Tensor ln_out, + float eps, + const int sm_margin, + const bool zero_centered_gamma +); + +at::Tensor rmsnorm_fwd_inf(const at::Tensor &input, + const at::Tensor &weight, + float eps, + const bool zero_centered_gamma +); + +/*************************************************************************************************** + * Cast + **************************************************************************************************/ + at::Tensor cast_to_fp8(const at::Tensor &input, const at::Tensor &scale, at::Tensor amax, @@ -374,3 +449,9 @@ at::Tensor scaled_upper_triang_masked_softmax_backward(at::Tensor output_grads_, at::Tensor softmax_results_, float scale_factor ); + +size_t get_cublasLt_version(); + +bool userbuf_comm_available(); + +void placeholder(); diff --git a/transformer_engine/pytorch/csrc/extensions/activation.cu b/transformer_engine/pytorch/csrc/extensions/activation.cu new file mode 100644 index 0000000000..05c61acc59 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/activation.cu @@ -0,0 +1,267 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include "extensions.h" + +at::Tensor gelu(at::Tensor input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_gelu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor dgelu(at::Tensor grad, + at::Tensor input, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto gtype = GetTransformerEngineDType(grad.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N}, gtype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); + + nvte_dgelu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor relu(at::Tensor input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = static_cast(input.numel()) / N; + + auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto input_cu = 
makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_relu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor drelu(at::Tensor grad, + at::Tensor input, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto gtype = GetTransformerEngineDType(grad.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N}, gtype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); + + nvte_drelu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor geglu(at::Tensor input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N / 2, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N / 2}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_geglu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor dgeglu(at::Tensor grad, + at::Tensor input, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + 
auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto gtype = GetTransformerEngineDType(grad.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N / 2}, gtype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); + + nvte_dgeglu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor reglu(at::Tensor input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N / 2, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N / 2}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_reglu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor dreglu(at::Tensor grad, + at::Tensor input, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto gtype = GetTransformerEngineDType(grad.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N / 2}, gtype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); + + nvte_dreglu(grad_cu.data(), input_cu.data(), output_cu.data(), 
at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor swiglu(at::Tensor input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N / 2, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N / 2}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_swiglu(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} + +at::Tensor dswiglu(at::Tensor grad, + at::Tensor input, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(-1)); + size_t M = input.numel() / N; + + auto output = + allocateTorchTensor(M, + N, + otype); + + auto itype = GetTransformerEngineDType(input.scalar_type()); + auto gtype = GetTransformerEngineDType(grad.scalar_type()); + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, itype); + auto grad_cu = makeTransformerEngineTensor(grad.data_ptr(), {M, N / 2}, gtype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {M, N}, otype); + + nvte_dswiglu(grad_cu.data(), input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cu b/transformer_engine/pytorch/csrc/extensions/attention.cu new file mode 100644 index 0000000000..4904fbade5 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/attention.cu @@ -0,0 +1,876 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +constexpr int block_size = 512; +constexpr int ctas_per_sm = 4; + +// get the fused attention backend +NVTE_Fused_Attn_Backend get_fused_attn_backend( + const transformer_engine::DType q_dtype, + const transformer_engine::DType kv_dtype, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + float p_dropout, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim) { + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + static_cast(q_dtype), static_cast(kv_dtype), + qkv_layout, bias_type, attn_mask_type, + p_dropout, max_seqlen_q, max_seqlen_kv, head_dim); + return fused_attention_backend; +} + +// fast zero-fills of tensors +template +__global__ void __launch_bounds__(block_size) mha_fill_kernel(scalar_t* out_tensor, + const int32_t* const start_row, + const size_t num_rows) { + size_t row_stride = gridDim.y * blockDim.x; + size_t row_index = blockIdx.x + static_cast(start_row[0]); + size_t col_index = blockIdx.y * blockDim.x + threadIdx.x; + while (row_index < num_rows) { + out_tensor[row_index*row_stride + col_index] = 0; + row_index += gridDim.x; + } +} + +// fast zero-fills of tensors +void mha_fill(const at::Tensor &self, const at::Tensor &start_index) { + auto max_tokens = self.size(0); + auto self_2d = self.view({max_tokens, -1}); + auto fcd_size = self_2d.size(1); + TORCH_CHECK(self.is_contiguous(), "input not contiguous"); + TORCH_CHECK(fcd_size % block_size == 0, "input size not aligned to block size"); + const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + uint64_t num_blk_y = (uint64_t)(fcd_size / block_size); + uint64_t num_blk_x = (uint64_t)((num_mp * ctas_per_sm + num_blk_y - 1) / num_blk_y); + dim3 dim_grid(num_blk_x, num_blk_y); + dim3 dim_block(block_size); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( 
+ at::ScalarType::Half, at::ScalarType::BFloat16, + self_2d.scalar_type(), "mha_fill", [&]() { + mha_fill_kernel<<>>( + self_2d.data_ptr(), + static_cast(start_index.data_ptr()), + max_tokens); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +// extract seed and offset from PhiloxCudaState +__global__ void unpack(at::PhiloxCudaState arg, int64_t* rng_state_ptr) { + if (arg.captured_) { + rng_state_ptr[0] = static_cast(*arg.seed_.ptr); + rng_state_ptr[1] = static_cast( + *(arg.offset_.ptr) + static_cast(arg.offset_intragraph_)); + } else { + rng_state_ptr[0] = static_cast(arg.seed_.val); + rng_state_ptr[1] = static_cast(arg.offset_.val); + } +} + +// extract PhiloxCudaState from CUDA random number generator +at::PhiloxCudaState init_philox_state( + at::CUDAGeneratorImpl* gen, + size_t elts_per_thread) { + at::PhiloxCudaState philox_args; + std::lock_guard lock(gen->mutex_); + philox_args = gen->philox_cuda_state(elts_per_thread); + return philox_args; +} + +// fused attention FWD with packed QKV +std::vector fused_attn_fwd_qkvpacked( + size_t b, size_t max_seqlen, size_t total_seqs, + size_t h, size_t d, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens, + const at::Tensor QKV, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen, + size_t rng_elts_per_thread) { + using namespace transformer_engine; + + // create output tensor O + auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto O = torch::empty({static_cast(total_seqs), + static_cast(h), static_cast(d)}, options); + if (set_zero && (h * d % block_size == 0)) { + mha_fill(O, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); + 
} else { + O.fill_(0); + } + + // construct NVTE tensors + TensorWrapper te_QKV, te_S, te_O, te_Bias, te_cu_seqlens; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + at::Tensor descale_S = torch::empty_like(scale_S.value()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_S.value().data_ptr(), + scale_S.value().data_ptr(), descale_S.data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); + } + if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { + auto bias_shape = Bias.value().sizes().vec(); + std::vector shape{bias_shape.begin(), bias_shape.end()}; + te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, + DType::kFloat32, nullptr, nullptr, nullptr); + } + te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // extract random number generator seed and offset + auto gen = at::get_generator_or_default( + rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); + at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( + philox_args, static_cast(rng_state.data_ptr())); + auto te_rng_state = makeTransformerEngineTensor(rng_state); + + // create auxiliary output tensors + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_fwd_qkvpacked( + te_QKV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens.data(), + te_rng_state.data(), + max_seqlen, + is_training, attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace and auxiliary output tensors + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // output_tensors = [O, nvte_aux_tensor_pack.tensors] + std::vector output_tensors; + output_tensors.push_back(O); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + // allocate memory for 
nvte_aux_tensor_pack.tensors + auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + output_tensors.push_back(output_tensor); + tensor->data.dptr = output_tensor.data_ptr(); + } + + // execute the kernel + nvte_fused_attn_fwd_qkvpacked( + te_QKV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens.data(), + te_rng_state.data(), + max_seqlen, + is_training, attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers, but not allocated memory + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + // if training, [O, softmax-related tensors, rng_state]; if inference, [O] + return output_tensors; +} + +// fused attention BWD with packed QKV +std::vector fused_attn_bwd_qkvpacked( + size_t b, size_t max_seqlen, size_t total_seqs, + size_t h, size_t d, + float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens, + const at::Tensor QKV, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV) { + using namespace transformer_engine; + + // create output tensor dQKV + at::Tensor dQKV = torch::empty_like(QKV); + auto max_tokens = dQKV.size(0); + auto self_2d = dQKV.view({max_tokens, -1}); + auto fcd_size = self_2d.size(1); + if (set_zero && (fcd_size % block_size == 0)) { + mha_fill(dQKV, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + dQKV.fill_(0); + } + auto options = 
torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + at::Tensor dBias; + TensorWrapper te_dBias; + if (bias_type != NVTE_NO_BIAS) { + dBias = torch::zeros({1, static_cast(h), + static_cast(max_seqlen), + static_cast(max_seqlen)}, options); + te_dBias = makeTransformerEngineTensor(dBias); + } + + // construct NVTE tensors + TensorWrapper te_QKV, te_O, te_dO, te_S, te_dP, te_dQKV; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!scale_S.has_value()) || (!scale_dP.has_value()) + || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; + err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, + nullptr, scale_S.value().data_ptr(), descale_S.value().data_ptr()); + at::Tensor descale_dP = torch::empty_like(scale_dP.value()); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), + descale_dP.data_ptr()); + te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == 
DType::kFloat16) { + // BF16 or FP16 + te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), {total_seqs, 3, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); + } + + // convert auxiliary tensors from forward into NVTETensors + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); + std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); + tensor->data.shape = std::vector(tmp.begin(), tmp.end()); + tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); + } + + // create cu_seqlens tensorwrappers + TensorWrapper te_cu_seqlens; + te_cu_seqlens = makeTransformerEngineTensor(cu_seqlens.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_bwd_qkvpacked( + te_QKV.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQKV.data(), + te_dBias.data(), + te_cu_seqlens.data(), + max_seqlen, + attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + 
at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // execute kernel + nvte_fused_attn_bwd_qkvpacked( + te_QKV.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQKV.data(), + te_dBias.data(), + te_cu_seqlens.data(), + max_seqlen, + attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + return {dQKV, dBias}; +} + +// fused attention FWD with packed KV +std::vector fused_attn_fwd_kvpacked( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t total_seqs_q, size_t total_seqs_kv, + size_t h, size_t d, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor KV, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen, + size_t rng_elts_per_thread) { + using namespace transformer_engine; + + // create output tensor O + auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto O = torch::empty({static_cast(total_seqs_q), + static_cast(h), static_cast(d)}, options); + if (set_zero && (h * d % block_size == 0)) { + mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + O.fill_(0); + } + + // construct NVTE tensors + TensorWrapper te_Q, te_KV, te_S, te_O, te_Bias, te_cu_seqlens_q, 
te_cu_seqlens_kv; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + at::Tensor descale_S = torch::empty_like(scale_S.value()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_S.value().data_ptr(), + scale_S.value().data_ptr(), descale_S.data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); + } + if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { + auto bias_shape = Bias.value().sizes().vec(); + std::vector shape{bias_shape.begin(), bias_shape.end()}; + te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), shape, + DType::kFloat32, nullptr, nullptr, nullptr); + } + te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // extract rng seed and offset + auto gen = at::get_generator_or_default( + rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); + at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( + philox_args, static_cast(rng_state.data_ptr())); + auto te_rng_state = makeTransformerEngineTensor(rng_state); + + // create auxiliary output tensors + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_fwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + te_rng_state.data(), + max_seqlen_q, max_seqlen_kv, + is_training, attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace and auxiliary output tensors + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // output_tensors = [O, nvte_aux_tensor_pack.tensors] + std::vector output_tensors; + output_tensors.push_back(O); + 
for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + // allocate memory for nvte_aux_tensor_pack.tensors + auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + output_tensors.push_back(output_tensor); + tensor->data.dptr = output_tensor.data_ptr(); + } + + // execute the kernel + nvte_fused_attn_fwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + te_rng_state.data(), + max_seqlen_q, max_seqlen_kv, + is_training, attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers, but not allocated memory + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + // if training, [O, softmax-related tensors, rng_state]; if inference, [O] + return output_tensors; +} + +// fused attention BWD with packed KV +std::vector fused_attn_bwd_kvpacked( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t total_seqs_q, size_t total_seqs_kv, + size_t h, size_t d, + float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor KV, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV) { + using namespace transformer_engine; + + // create output tensors dQ and dKV + at::Tensor dQ = torch::empty_like(Q); + at::Tensor dKV = 
torch::empty_like(KV); + auto max_tokens_q = dQ.size(0); + auto self_2d_q = dQ.view({max_tokens_q, -1}); + auto fcd_size_q = self_2d_q.size(1); + auto max_tokens_kv = dQ.size(0); + auto self_2d_kv = dQ.view({max_tokens_kv, -1}); + auto fcd_size_kv = self_2d_kv.size(1); + if (set_zero && (fcd_size_q % block_size == 0) && (fcd_size_kv % block_size == 0)) { + mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + mha_fill(dKV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + dQ.fill_(0); + dKV.fill_(0); + } + auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + at::Tensor dBias; + TensorWrapper te_dBias; + if (bias_type != NVTE_NO_BIAS) { + dBias = torch::zeros({1, static_cast(h), + static_cast(max_seqlen_q), + static_cast(max_seqlen_kv)}, options); + te_dBias = makeTransformerEngineTensor(dBias); + } + + // construct NVTE tensors + TensorWrapper te_Q, te_KV, te_O, te_dO, te_S, te_dP, te_dQ, te_dKV; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!scale_S.has_value()) || (!scale_dP.has_value()) + || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; + err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); + } + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, + scale_S.value().data_ptr(), descale_S.value().data_ptr()); + at::Tensor descale_dP = torch::empty_like(scale_dP.value()); + te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, + amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), + descale_dP.data_ptr()); + te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), {total_seqs_q, h, d}, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), {total_seqs_kv, 2, h, d}, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_Q = makeTransformerEngineTensor(Q.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_KV = makeTransformerEngineTensor(KV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dQ = 
makeTransformerEngineTensor(dQ.data_ptr(), {total_seqs_q, h, d}, + qkv_type, nullptr, nullptr, nullptr); + te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), {total_seqs_kv, 2, h, d}, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); + } + + // create cu_seqlens tensorwrappers + TensorWrapper te_cu_seqlens_q, te_cu_seqlens_kv; + te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), {b+1}, + DType::kInt32, nullptr, nullptr, nullptr); + + // convert auxiliary tensors from forward to NVTETensors + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); + std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); + tensor->data.shape = std::vector(tmp.begin(), tmp.end()); + tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); + } + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_bwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQ.data(), + te_dKV.data(), + te_dBias.data(), + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + max_seqlen_q, max_seqlen_kv, + attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + 
// execute kernel + nvte_fused_attn_bwd_kvpacked( + te_Q.data(), + te_KV.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQ.data(), + te_dKV.data(), + te_dBias.data(), + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + max_seqlen_q, max_seqlen_kv, + attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + return {dQ, dKV, dBias}; +} + +namespace flash_attention { + +constexpr int warp_size = 32; +constexpr int type_size = 2; // FP16 or BF16 +constexpr int nvec = sizeof(uint64_t) / type_size; +constexpr int load_size = warp_size * nvec; +constexpr int block_size = 512; + +template +__launch_bounds__(block_size) +__global__ void prepare_kernel_fwd(const T *qkvi, + T *qkv, + const size_t B, + const size_t S, + const size_t Z, + const size_t W) { + const int warpid = (blockDim.x * blockIdx.x + threadIdx.x) / warp_size; + const int id_in_warp = threadIdx.x % warp_size; + const size_t offset_input = blockIdx.y * W + warpid * 3 * W * Z + id_in_warp * nvec; + const T *my_input = qkvi + offset_input; + + const size_t s = warpid / B; + if (s >= S) return; + + const size_t b = warpid % B; + + const size_t offset_output = blockIdx.y * B * S * Z * W + + (s + b * S) * W * Z + + id_in_warp * nvec; + + T *my_output = qkv + offset_output; + + for (int i = 0; i < Z; ++i) { + uint64_t *out = reinterpret_cast(my_output + i * load_size); + *out = *reinterpret_cast(my_input + i * load_size * 3); + } +} + +template +__launch_bounds__(block_size) +__global__ void prepare_kernel_bwd(const T *q, const T *k, const T *v, + T *qkv, const size_t B, const size_t S, + const size_t Z, const size_t W) { + const T *input = blockIdx.y == 0 ? q : (blockIdx.y == 1 ? 
k : v); + + const int warpid = (blockDim.x * blockIdx.x + threadIdx.x) / warp_size; + const int id_in_warp = threadIdx.x % warp_size; + const size_t offset_input = warpid * W * Z + id_in_warp * nvec; + const T *my_input = input + offset_input; + + const size_t b = warpid / S; + if (b >= B) return; + + const size_t s = warpid % S; + + const size_t offset_output = (b + s * B) * 3 * W * Z + + id_in_warp * nvec + blockIdx.y * W; + + T *my_output = qkv + offset_output; + + for (int i = 0; i < Z; ++i) { + uint64_t *out = reinterpret_cast(my_output + i * load_size * 3); + *out = *reinterpret_cast(my_input + i * load_size); + } +} + +} // namespace flash_attention + +at::Tensor fa_prepare_fwd(at::Tensor qkvi) { + NVTE_CHECK(qkvi.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(qkvi.scalar_type() == at::ScalarType::Half || + qkvi.scalar_type() == at::ScalarType::BFloat16); + NVTE_CHECK(qkvi.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(qkvi.size(3) == flash_attention::load_size); + NVTE_CHECK(qkvi.stride(3) == 1, "Wrong stride."); + NVTE_CHECK(qkvi.stride(2) == 3 * qkvi.size(3), "Wrong stride."); + NVTE_CHECK(qkvi.stride(1) == 3 * qkvi.size(3) * qkvi.size(2), "Wrong stride."); + NVTE_CHECK(qkvi.stride(0) == 3 * qkvi.size(3) * qkvi.size(2) * qkvi.size(1), "Wrong stride."); + + // [s, b, n, h * 3] -> [3, b, s, n, h] + std::vector shape = {3, qkvi.size(1), qkvi.size(0), qkvi.size(2), qkvi.size(3)}; + at::Tensor qkv = at::empty(shape, at::CUDA(qkvi.scalar_type())); + + size_t warps = qkvi.size(0) * qkvi.size(1); + size_t warps_per_block = flash_attention::block_size / flash_attention::warp_size; + size_t blocks = (warps + warps_per_block - 1) / warps_per_block; + dim3 grid(blocks, 3); + int threads = flash_attention::block_size; + if (qkvi.scalar_type() == at::ScalarType::Half) { + using dtype = at::Half; + flash_attention::prepare_kernel_fwd<<>>( + qkvi.data_ptr(), + qkv.data_ptr(), + shape[1], + shape[2], + shape[3], + shape[4]); + } else { + using dtype = 
at::BFloat16; + flash_attention::prepare_kernel_fwd<<>>( + qkvi.data_ptr(), + qkv.data_ptr(), + shape[1], + shape[2], + shape[3], + shape[4]); + } + + return qkv; +} + +at::Tensor fa_prepare_bwd(at::Tensor q, at::Tensor k, at::Tensor v) { + NVTE_CHECK(q.is_contiguous()); + NVTE_CHECK(k.is_contiguous()); + NVTE_CHECK(v.is_contiguous()); + NVTE_CHECK(q.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(k.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(v.dim() == 4, "Expected 4-dim tensor."); + NVTE_CHECK(q.scalar_type() == at::ScalarType::Half || + q.scalar_type() == at::ScalarType::BFloat16); + NVTE_CHECK(k.scalar_type() == q.scalar_type()); + NVTE_CHECK(v.scalar_type() == q.scalar_type()); + NVTE_CHECK(q.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(q.size(3) == flash_attention::load_size); + NVTE_CHECK(k.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(k.size(3) == flash_attention::load_size); + NVTE_CHECK(v.size(3) % flash_attention::load_size == 0); + NVTE_CHECK(v.size(3) == flash_attention::load_size); + + // 3 x [s, b, n, h] -> [b, s, n, 3 * h] + + std::vector shape = {q.size(1), q.size(0), q.size(2), 3 * q.size(3)}; + at::Tensor qkv = at::empty(shape, at::CUDA(q.scalar_type())); + + size_t warps = q.size(0) * q.size(1); + size_t warps_per_block = flash_attention::block_size / flash_attention::warp_size; + size_t blocks = (warps + warps_per_block - 1) / warps_per_block; + dim3 grid(blocks, 3); + int threads = flash_attention::block_size; + if (q.scalar_type() == at::ScalarType::Half) { + using dtype = at::Half; + flash_attention::prepare_kernel_bwd<<>>( + q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + qkv.data_ptr(), + q.size(0), + q.size(1), + q.size(2), + q.size(3)); + } else { + using dtype = at::BFloat16; + flash_attention::prepare_kernel_bwd<<>>( + q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + qkv.data_ptr(), + q.size(0), + q.size(1), + q.size(2), + q.size(3)); + } + + return qkv; +} diff --git 
a/transformer_engine/pytorch/csrc/extensions/cast.cu b/transformer_engine/pytorch/csrc/extensions/cast.cu new file mode 100644 index 0000000000..0e886e4107 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/cast.cu @@ -0,0 +1,75 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +at::Tensor cast_to_fp8(const at::Tensor &input, + const at::Tensor &scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + auto input_shape = input.sizes().vec(); + std::vector shape{input_shape.begin(), input_shape.end()}; + + auto output = at::empty_like(input, at::CUDA(GetATenDType(otype))); + + auto input_cu = makeTransformerEngineTensor(input); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), shape, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_fp8_quantize(input_cu.data(), output_cu.data(), + at::cuda::getCurrentCUDAStream()); + + return output; +} + + +void cast_to_fp8_noalloc(const at::Tensor &input, + const at::Tensor &scale, + at::Tensor output, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + size_t N = static_cast(input.size(0)); + size_t H = static_cast(input.size(1)); + + auto input_cu = makeTransformerEngineTensor(input); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {N, H}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_fp8_quantize(input_cu.data(), output_cu.data(), + at::cuda::getCurrentCUDAStream()); + + return; +} + + +at::Tensor cast_from_fp8(const at::Tensor &input, + const at::Tensor &scale_inv, + transformer_engine::DType itype, + transformer_engine::DType 
otype +) { + using namespace transformer_engine; + auto input_shape = input.sizes().vec(); + std::vector shape{input_shape.begin(), input_shape.end()}; + + auto output = at::empty_like(input, at::CUDA(GetATenDType(otype))); + + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), shape, itype, + nullptr, nullptr, scale_inv.data_ptr()); + auto output_cu = makeTransformerEngineTensor(output); + + nvte_fp8_dequantize(input_cu.data(), output_cu.data(), + at::cuda::getCurrentCUDAStream()); + + return output; +} diff --git a/transformer_engine/pytorch/csrc/extensions/gemm.cu b/transformer_engine/pytorch/csrc/extensions/gemm.cu new file mode 100644 index 0000000000..1a7630edce --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/gemm.cu @@ -0,0 +1,75 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +void te_gemm(at::Tensor A, + at::Tensor A_scale_inverse, + transformer_engine::DType A_type, + bool transa, + at::Tensor B, + at::Tensor B_scale_inverse, + transformer_engine::DType B_type, + bool transb, + at::Tensor D, + at::Tensor D_scale, + transformer_engine::DType D_type, + at::Tensor D_amax, + at::Tensor bias, + transformer_engine::DType bias_type, + at::Tensor pre_gelu_out, + bool grad, + at::Tensor workspace, + size_t workspaceSize, + bool accumulate, + bool use_split_accumulator, + int math_sm_count +) { + using namespace transformer_engine; + auto te_A = makeTransformerEngineTensor(A.data_ptr(), + {static_cast(A.size(0)), + static_cast(A.size(1))}, + A_type, nullptr, nullptr, + A_scale_inverse.data_ptr()); + auto te_B = makeTransformerEngineTensor(B.data_ptr(), + {static_cast(B.size(0)), + static_cast(B.size(1))}, + B_type, nullptr, nullptr, + B_scale_inverse.data_ptr()); + auto te_D = 
makeTransformerEngineTensor(D.data_ptr(), + {static_cast(D.size(0)), + static_cast(D.size(1))}, + D_type, D_amax.data_ptr(), + D_scale.data_ptr(), nullptr); + auto te_bias = makeTransformerEngineTensor(bias.data_ptr(), {static_cast(bias.size(0))}, + bias_type); + + const auto gelu_shape = pre_gelu_out.data_ptr() == nullptr + ? std::vector{static_cast(pre_gelu_out.size(0))} + : std::vector{static_cast(pre_gelu_out.size(0)), + static_cast(pre_gelu_out.size(1))}; + auto te_pre_gelu_out = makeTransformerEngineTensor(pre_gelu_out.data_ptr(), + gelu_shape, + GetTransformerEngineDType( + pre_gelu_out.scalar_type())); + auto te_workspace = makeTransformerEngineTensor(workspace.data_ptr(), + {workspaceSize}, + DType::kByte); + + nvte_cublas_gemm(te_A.data(), + te_B.data(), + te_D.data(), + te_bias.data(), + te_pre_gelu_out.data(), + transa, + transb, + grad, + te_workspace.data(), + accumulate, + use_split_accumulator, + math_sm_count, + at::cuda::getCurrentCUDAStream()); +} diff --git a/transformer_engine/pytorch/csrc/extensions/misc.cu b/transformer_engine/pytorch/csrc/extensions/misc.cu new file mode 100644 index 0000000000..e6275d1159 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/misc.cu @@ -0,0 +1,25 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include "extensions.h" +#ifdef NVTE_WITH_USERBUFFERS +#include "comm_gemm_overlap.h" +#endif // NVTE_WITH_USERBUFFERS + +size_t get_cublasLt_version() { + return cublasLtGetVersion(); +} + + +bool userbuf_comm_available() { // TODO(ksivamani) check on python side +#ifdef NVTE_WITH_USERBUFFERS + return true; +#else + return false; +#endif +} + +void placeholder() {} // TODO(ksivamani) clean this up diff --git a/transformer_engine/pytorch/csrc/extensions/normalization.cu b/transformer_engine/pytorch/csrc/extensions/normalization.cu new file mode 100644 index 0000000000..6c723cd37f --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/normalization.cu @@ -0,0 +1,404 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +std::vector layernorm_bwd(const at::Tensor &dz, + const at::Tensor &x, + const at::Tensor &mu, + const at::Tensor &rsigma, + const at::Tensor &gamma, + const int sm_margin, + const bool zero_centered_gamma +) { + auto dx = at::empty_like(x); + auto dgamma = at::empty_like(gamma); + auto dbeta = at::empty_like(gamma); + transformer_engine::TensorWrapper workspace, barrier, dgamma_part, dbeta_part; + + auto dz_cu = makeTransformerEngineTensor(dz); + auto x_cu = makeTransformerEngineTensor(x); + auto mu_cu = makeTransformerEngineTensor(mu); + auto rsigma_cu = makeTransformerEngineTensor(rsigma); + auto gamma_cu = makeTransformerEngineTensor(gamma); + auto dx_cu = makeTransformerEngineTensor(dx); + auto dgamma_cu = makeTransformerEngineTensor(dgamma); + auto dbeta_cu = makeTransformerEngineTensor(dbeta); + + // This call populates tensors with the required config. + const auto bwd_fun = zero_centered_gamma ? 
nvte_layernorm1p_bwd : nvte_layernorm_bwd; + bwd_fun(dz_cu.data(), x_cu.data(), mu_cu.data(), rsigma_cu.data(), gamma_cu.data(), + dx_cu.data(), dgamma_cu.data(), dbeta_cu.data(), dgamma_part.data(), + dbeta_part.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + // Alloc space for Tensors. + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + auto barrier_data = allocateSpace(barrier.shape(), barrier.dtype(), true); + auto dgamma_part_data = allocateSpace(dgamma_part.shape(), dgamma_part.dtype()); + auto dbeta_part_data = allocateSpace(dbeta_part.shape(), dbeta_part.dtype()); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), + barrier.shape(), + barrier.dtype()); + dgamma_part = makeTransformerEngineTensor(dgamma_part_data.data_ptr(), + dgamma_part.shape(), + dgamma_part.dtype()); + dbeta_part = makeTransformerEngineTensor(dbeta_part_data.data_ptr(), + dbeta_part.shape(), + dbeta_part.dtype()); + + // Actual call to bwd kernel. 
+ bwd_fun(dz_cu.data(), x_cu.data(), mu_cu.data(), rsigma_cu.data(), gamma_cu.data(), + dx_cu.data(), dgamma_cu.data(), dbeta_cu.data(), dgamma_part.data(), + dbeta_part.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + return { dx, dgamma, dbeta }; +} + + +std::vector layernorm_fwd_fp8(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + auto ln_out = at::empty_like(input, at::CUDA(GetATenDType(otype))); + return layernorm_fwd_fp8_noalloc(input, weight, bias, eps, + scale, ln_out, amax, scale_inv, + otype, sm_margin, zero_centered_gamma); +} + + +std::vector layernorm_fwd_fp8_noalloc(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + at::Tensor scale, + at::Tensor ln_out, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + size_t N = static_cast(input.size(0)); + size_t H = static_cast(input.size(1)); + + DType itype = GetTransformerEngineDType(input.scalar_type()); + + auto mu = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto input_cu = makeTransformerEngineTensor(input); + auto gamma_cu = makeTransformerEngineTensor(weight); + auto beta_cu = makeTransformerEngineTensor(bias); + auto z_cu = makeTransformerEngineTensor(ln_out.data_ptr(), {N, H}, otype, + getDataPtr(amax), getDataPtr(scale), + getDataPtr(scale_inv)); + auto mu_cu = makeTransformerEngineTensor(mu); + auto rsigma_cu = makeTransformerEngineTensor(rsigma); + transformer_engine::TensorWrapper workspace, barrier; + + 
// This call populates workspace and barrier tensors with the required config + const auto func = zero_centered_gamma ? nvte_layernorm1p_fwd : nvte_layernorm_fwd; + func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), + mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + // Fill workspace and barrier + auto workspace_data = allocateSpace(workspace.shape(), + workspace.dtype()); + auto barrier_data = allocateSpace(barrier.shape(), + barrier.dtype(), + true); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), + barrier.shape(), + barrier.dtype()); + + // Actual call to fwd kernel + func(input_cu.data(), gamma_cu.data(), beta_cu.data(), eps, z_cu.data(), + mu_cu.data(), rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + return {ln_out, mu, rsigma}; +} + + +at::Tensor layernorm_fwd_fp8_inf(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const bool zero_centered_gamma +) { + // This is a specialized version of layernorm_fwd_fp8, optimized for inference, + // which only returns the normalized output. 
+ std::vector out = layernorm_fwd_fp8( + input, weight, bias, eps, scale, amax, scale_inv, otype, 0, zero_centered_gamma); + return out[0]; +} + + +std::vector layernorm_fwd(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + DType itype = GetTransformerEngineDType(input.scalar_type()); + auto ln_out = at::empty_like(input, at::CUDA(GetATenDType(itype))); + + return layernorm_fwd_noalloc(input, weight, bias, ln_out, eps, + sm_margin, zero_centered_gamma); +} + + +std::vector layernorm_fwd_noalloc(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + at::Tensor ln_out, + float eps, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + DType itype = GetTransformerEngineDType(input.scalar_type()); + + return layernorm_fwd_fp8_noalloc(input, weight, bias, eps, at::Tensor(), + ln_out, at::Tensor(), at::Tensor(), + itype, sm_margin, zero_centered_gamma); +} + + +at::Tensor layernorm_fwd_inf(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + float eps, + const bool zero_centered_gamma +) { + // This is a specialized version of layernorm_fwd, optimized for inference, + // which only returns the normalized output. 
+ std::vector out = layernorm_fwd(input, weight, bias, eps, 0, zero_centered_gamma); + return out[0]; +} + +std::vector rmsnorm_bwd(const at::Tensor &dz, + const at::Tensor &x, + const at::Tensor &rsigma, + const at::Tensor &gamma, + const int sm_margin, + const bool zero_centered_gamma +) { + NVTE_CHECK(zero_centered_gamma == false, + "Zero-centered gamma is not supported yet for RMSNorm."); + auto dx = at::empty_like(x); + auto dgamma = at::empty_like(gamma); + transformer_engine::TensorWrapper workspace, barrier, dgamma_part; + + auto dz_cu = makeTransformerEngineTensor(dz); + auto x_cu = makeTransformerEngineTensor(x); + auto rsigma_cu = makeTransformerEngineTensor(rsigma); + auto gamma_cu = makeTransformerEngineTensor(gamma); + auto dx_cu = makeTransformerEngineTensor(dx); + auto dgamma_cu = makeTransformerEngineTensor(dgamma); + + // This call populates tensors with the required config. + const auto bwd_fun = nvte_rmsnorm_bwd; + bwd_fun(dz_cu.data(), x_cu.data(), rsigma_cu.data(), gamma_cu.data(), + dx_cu.data(), dgamma_cu.data(), dgamma_part.data(), + at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + // Alloc space for Tensors. + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + auto barrier_data = allocateSpace(barrier.shape(), barrier.dtype(), true); + auto dgamma_part_data = allocateSpace(dgamma_part.shape(), dgamma_part.dtype()); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), + barrier.shape(), + barrier.dtype()); + dgamma_part = makeTransformerEngineTensor(dgamma_part_data.data_ptr(), + dgamma_part.shape(), + dgamma_part.dtype()); + + // Actual call to bwd kernel. 
+ bwd_fun(dz_cu.data(), x_cu.data(), rsigma_cu.data(), gamma_cu.data(), + dx_cu.data(), dgamma_cu.data(), dgamma_part.data(), + at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + return { dx, dgamma }; +} + + +std::vector rmsnorm_fwd_fp8(const at::Tensor &input, + const at::Tensor &weight, + float eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + auto ln_out = at::empty_like(input, at::CUDA(GetATenDType(otype))); + return rmsnorm_fwd_fp8_noalloc(input, weight, eps, + scale, ln_out, amax, scale_inv, + otype, sm_margin, zero_centered_gamma); +} + + +std::vector rmsnorm_fwd_fp8_noalloc(const at::Tensor &input, + const at::Tensor &weight, + float eps, + at::Tensor scale, + at::Tensor ln_out, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + NVTE_CHECK(zero_centered_gamma == false, + "Zero-centered gamma is not supported yet for RMSNorm."); + + size_t N = static_cast(input.size(0)); + size_t H = static_cast(input.size(1)); + + DType itype = GetTransformerEngineDType(input.scalar_type()); + + auto rsigma = at::empty({static_cast(N)}, at::CUDA(at::kFloat)); + auto input_cu = makeTransformerEngineTensor(input); + auto gamma_cu = makeTransformerEngineTensor(weight); + auto z_cu = makeTransformerEngineTensor(ln_out.data_ptr(), {N, H}, otype, + getDataPtr(amax), getDataPtr(scale), + getDataPtr(scale_inv)); + auto rsigma_cu = makeTransformerEngineTensor(rsigma); + transformer_engine::TensorWrapper workspace, barrier; + + // This call populates workspace and barrier tensors with the required config + const auto func = nvte_rmsnorm_fwd; + func(input_cu.data(), gamma_cu.data(), eps, z_cu.data(), + 
rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + // Fill workspace and barrier + auto workspace_data = allocateSpace(workspace.shape(), + workspace.dtype()); + auto barrier_data = allocateSpace(barrier.shape(), + barrier.dtype(), + true); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + barrier = makeTransformerEngineTensor(barrier_data.data_ptr(), + barrier.shape(), + barrier.dtype()); + + // Actual call to fwd kernel + func(input_cu.data(), gamma_cu.data(), eps, z_cu.data(), + rsigma_cu.data(), at::cuda::getCurrentCUDAStream(), + at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin, + workspace.data(), barrier.data()); + + return {ln_out, rsigma}; +} + + +at::Tensor rmsnorm_fwd_fp8_inf(const at::Tensor &input, + const at::Tensor &weight, + float eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + const bool zero_centered_gamma +) { + // This is a specialized version of rmsnorm_fwd_fp8, optimized for inference, + // which only returns the normalized output. 
+ std::vector out = rmsnorm_fwd_fp8( + input, weight, eps, scale, amax, scale_inv, otype, 0, zero_centered_gamma); + return out[0]; +} + + +std::vector rmsnorm_fwd(const at::Tensor &input, + const at::Tensor &weight, + float eps, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + DType itype = GetTransformerEngineDType(input.scalar_type()); + auto ln_out = at::empty_like(input, at::CUDA(GetATenDType(itype))); + + return rmsnorm_fwd_noalloc(input, weight, ln_out, eps, + sm_margin, zero_centered_gamma); +} + + +std::vector rmsnorm_fwd_noalloc(const at::Tensor &input, + const at::Tensor &weight, + at::Tensor ln_out, + float eps, + const int sm_margin, + const bool zero_centered_gamma +) { + using namespace transformer_engine; + + DType itype = GetTransformerEngineDType(input.scalar_type()); + + return rmsnorm_fwd_fp8_noalloc(input, weight, eps, at::Tensor(), + ln_out, at::Tensor(), at::Tensor(), + itype, sm_margin, zero_centered_gamma); +} + + +at::Tensor rmsnorm_fwd_inf(const at::Tensor &input, + const at::Tensor &weight, + float eps, + const bool zero_centered_gamma +) { + // This is a specialized version of rmsnorm_fwd, optimized for inference, + // which only returns the normalized output. + std::vector out = rmsnorm_fwd(input, weight, eps, 0, zero_centered_gamma); + return out[0]; +} diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp new file mode 100644 index 0000000000..6dc48a4b5c --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -0,0 +1,158 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include "../extensions.h" +#ifdef NVTE_WITH_USERBUFFERS +#include "comm_gemm_overlap.h" +#endif // NVTE_WITH_USERBUFFERS + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // Softmax functions + m.def("scaled_softmax_forward", &scaled_softmax_forward, "Scaled Softmax FWD"); + m.def("scaled_softmax_backward", &scaled_softmax_backward, "Scaled Softmax BWD"); + m.def("scaled_masked_softmax_forward", &scaled_masked_softmax_forward, + "Scaled Masked Softmax FWD"); + m.def("scaled_masked_softmax_backward", &scaled_masked_softmax_backward, + "Scaled Masked Softmax BWD"); + m.def("scaled_upper_triang_masked_softmax_forward", + &scaled_upper_triang_masked_softmax_forward, + "Scaled Upper-Triangular Masked Softmax FWD"); + m.def("scaled_upper_triang_masked_softmax_backward", + &scaled_upper_triang_masked_softmax_backward, + "Scaled Upper-Triangular Masked Softmax BWD"); + + // Other granular functions + m.def("layernorm_fwd_fp8", &layernorm_fwd_fp8, "LN FWD FP8"); + m.def("layernorm_fwd_fp8_noalloc", &layernorm_fwd_fp8_noalloc, "LN FWD FP8"); + m.def("layernorm_bwd", &layernorm_bwd, "LN BWD"); + m.def("layernorm_fwd", &layernorm_fwd, "LN FWD"); + m.def("layernorm_fwd_noalloc", &layernorm_fwd_noalloc, "LN FWD"); + m.def("rmsnorm_fwd_fp8", &rmsnorm_fwd_fp8, "RMSNorm FWD FP8"); + m.def("rmsnorm_fwd_fp8_noalloc", &rmsnorm_fwd_fp8_noalloc, "RMSNorm FWD FP8"); + m.def("rmsnorm_bwd", &rmsnorm_bwd, "RMSNorm BWD"); + m.def("rmsnorm_fwd", &rmsnorm_fwd, "RMSNorm FWD"); + m.def("rmsnorm_fwd_noalloc", &rmsnorm_fwd_noalloc, "RMSNorm FWD"); + m.def("fused_cast_transpose", &fused_cast_transpose, "Fused Cast + Transpose"); + m.def("fused_cast_transpose_bgrad", &fused_cast_transpose_bgrad, + "Fused Cast + Transpose + BGRAD"); + m.def("fused_fp8_transpose_bgrad", &fused_fp8_transpose_bgrad, + "Fused FP8 Transpose + BGRAD"); + m.def("fused_cast_transpose_bgrad_dgelu", &fused_cast_transpose_bgrad_dgelu, + "Fused Cast + Transpose + BGRAD + 
DGELU"); + m.def("fused_multi_cast_transpose", &fused_multi_cast_transpose, + "Fused Multi-tensor Cast + Transpose"); + m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8"); + m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8"); + m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8"); + m.def("te_gemm", &te_gemm, "CublasLt GEMM"); + m.def("fused_attn_fwd_qkvpacked", &fused_attn_fwd_qkvpacked, + "Fused Attention FP8/BF16/FP16 FWD with packed QKV"); + m.def("fused_attn_bwd_qkvpacked", &fused_attn_bwd_qkvpacked, + "Fused Attention FP8/BF16/FP16 BWD with packed QKV"); + m.def("fused_attn_fwd_kvpacked", &fused_attn_fwd_kvpacked, + "Fused Attention FP8/BF16/FP16 FWD with packed KV"); + m.def("fused_attn_bwd_kvpacked", &fused_attn_bwd_kvpacked, + "Fused Attention FP8/BF16/FP16 BWD with packed KV"); + m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); + m.def("gelu", &gelu, "GeLU with FP8 output"); + m.def("relu", &relu, "ReLU with FP8 output"); + m.def("geglu", &geglu, "GeGLU with FP8 output"); + m.def("reglu", &reglu, "ReGLU with FP8 output"); + m.def("swiglu", &swiglu, "SwiGLU with FP8 output"); + m.def("dgelu", &dgelu, "Backward of GeLU"); + m.def("drelu", &drelu, "Backward of ReLU"); + m.def("dgeglu", &dgeglu, "Backward of GeGLU"); + m.def("dreglu", &dreglu, "Backward of ReGLU"); + m.def("dswiglu", &dswiglu, "Backward of SwiGLU"); + m.def("fa_prepare_fwd", &fa_prepare_fwd, "Prepare QKV for Flash Attention"); + m.def("fa_prepare_bwd", &fa_prepare_bwd, "Backward of QKV preparation for Flash Attention"); + m.def("get_fused_attn_backend", &get_fused_attn_backend, "Get Fused Attention backend"); + + // Misc + m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); + m.def("userbuf_comm_available", &userbuf_comm_available, "If userbuf backend is available"); + + // Data structures + py::class_<transformer_engine::FP8TensorMeta>(m, "FP8TensorMeta") + .def(py::init<>()) + .def_readwrite("scale", &transformer_engine::FP8TensorMeta::scale) + 
.def_readwrite("scale_inv", &transformer_engine::FP8TensorMeta::scale_inv) + .def_readwrite("amax_history", &transformer_engine::FP8TensorMeta::amax_history); + +#ifdef NVTE_WITH_USERBUFFERS + py::enum_<ubuf::UBOverlapAlgo>(m, "UbufOverlapAlgo") + .value("BULK_OVERLAP_AG", ubuf::UBOverlapAlgo::BULK_OVERLAP_AG) + .value("BULK_OVERLAP_RS", ubuf::UBOverlapAlgo::BULK_OVERLAP_RS) + .value("SPLIT_PIPELINED_RS", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_RS) + .value("SPLIT_PIPELINED_AG", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_AG); + + py::class_<ubuf::UbufCommOverlap>(m, "UbufCommOverlap") + .def(py::init()) + .def("bulk_overlap", &ubuf::UbufCommOverlap::bulk_overlap) + .def("split_overlap_rs", &ubuf::UbufCommOverlap::split_overlap_rs) + .def("copy_input_to_ubuf", &ubuf::UbufCommOverlap::copy_input_to_ubuf) + .def("get_ubuf_output", &ubuf::UbufCommOverlap::get_ubuf_output); + + py::class_<ubuf::UbufP2PCommOverlap>(m, "UbufP2PCommOverlap") + .def(py::init()) + .def("split_overlap_ag", &ubuf::UbufP2PCommOverlap::split_overlap_ag) + .def("copy_input_to_ubuf", &ubuf::UbufP2PCommOverlap::copy_input_to_ubuf) + .def("get_ubuf_output", &ubuf::UbufP2PCommOverlap::get_ubuf_output); +#else // NVTE_WITH_USERBUFFERS + m.def("UbufOverlapAlgo", &placeholder, "Dummy function for python side annotations"); + m.def("UbufCommOverlap", &placeholder, "Dummy function for python side annotations"); + m.def("UbufP2PCommOverlap", &placeholder, "Dummy function for python side annotations"); +#endif // NVTE_WITH_USERBUFFERS + + py::enum_<transformer_engine::DType>(m, "DType", py::module_local()) + .value("kByte", transformer_engine::DType::kByte) + .value("kInt32", transformer_engine::DType::kInt32) + .value("kFloat32", transformer_engine::DType::kFloat32) + .value("kFloat16", transformer_engine::DType::kFloat16) + .value("kBFloat16", transformer_engine::DType::kBFloat16) + .value("kFloat8E4M3", transformer_engine::DType::kFloat8E4M3) + .value("kFloat8E5M2", transformer_engine::DType::kFloat8E5M2); + + py::enum_<transformer_engine::FP8FwdTensors>(m, "FP8FwdTensors") + .value("GEMM1_INPUT", 
transformer_engine::FP8FwdTensors::GEMM1_INPUT) + .value("GEMM1_WEIGHT", transformer_engine::FP8FwdTensors::GEMM1_WEIGHT) + .value("GEMM1_OUTPUT", transformer_engine::FP8FwdTensors::GEMM1_OUTPUT) + .value("GEMM2_INPUT", transformer_engine::FP8FwdTensors::GEMM2_INPUT) + .value("GEMM2_WEIGHT", transformer_engine::FP8FwdTensors::GEMM2_WEIGHT) + .value("GEMM2_OUTPUT", transformer_engine::FP8FwdTensors::GEMM2_OUTPUT) + .value("GEMM3_INPUT", transformer_engine::FP8FwdTensors::GEMM3_INPUT) + .value("GEMM3_WEIGHT", transformer_engine::FP8FwdTensors::GEMM3_WEIGHT) + .value("GEMM3_OUTPUT", transformer_engine::FP8FwdTensors::GEMM3_OUTPUT); + + py::enum_<transformer_engine::FP8BwdTensors>(m, "FP8BwdTensors") + .value("GRAD_OUTPUT1", transformer_engine::FP8BwdTensors::GRAD_OUTPUT1) + .value("GRAD_INPUT1", transformer_engine::FP8BwdTensors::GRAD_INPUT1) + .value("GRAD_OUTPUT2", transformer_engine::FP8BwdTensors::GRAD_OUTPUT2) + .value("GRAD_INPUT2", transformer_engine::FP8BwdTensors::GRAD_INPUT2) + .value("GRAD_OUTPUT3", transformer_engine::FP8BwdTensors::GRAD_OUTPUT3) + .value("GRAD_INPUT3", transformer_engine::FP8BwdTensors::GRAD_INPUT3); + + py::enum_<NVTE_Bias_Type>(m, "NVTE_Bias_Type") + .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) + .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) + .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + + py::enum_<NVTE_Mask_Type>(m, "NVTE_Mask_Type") + .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) + .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) + .value("NVTE_CAUSAL_MASK", NVTE_Mask_Type::NVTE_CAUSAL_MASK); + + py::enum_<NVTE_QKV_Layout>(m, "NVTE_QKV_Layout") + .value("NVTE_NOT_INTERLEAVED", NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) + .value("NVTE_QKV_INTERLEAVED", NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) + .value("NVTE_KV_INTERLEAVED", NVTE_QKV_Layout::NVTE_KV_INTERLEAVED); + + py::enum_<NVTE_Fused_Attn_Backend>(m, "NVTE_Fused_Attn_Backend") + .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) + .value("NVTE_F16_arbitrary_seqlen", 
NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) + .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) + .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); +} diff --git a/transformer_engine/pytorch/csrc/extensions/softmax.cu b/transformer_engine/pytorch/csrc/extensions/softmax.cu new file mode 100644 index 0000000000..6bfbb7bb96 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/softmax.cu @@ -0,0 +1,211 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +at::Tensor scaled_softmax_forward(at::Tensor input, + float scale_factor +) { + using namespace transformer_engine; + AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + const int batches = input.size(0); + const int attn_heads = input.size(1); + const int query_seq_len = input.size(2); + const int key_seq_len = input.size(3); + + TORCH_CHECK(key_seq_len <= 4096); + TORCH_CHECK(query_seq_len > 1); + + // Output + auto act_options = input.options().requires_grad(false); + auto softmax_results = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + + auto input_cu = makeTransformerEngineTensor(input); + auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); + + nvte_scaled_softmax_forward(input_cu.data(), softmax_results_cu.data(), scale_factor, + at::cuda::getCurrentCUDAStream()); + + return softmax_results; +} + + +at::Tensor scaled_softmax_backward(at::Tensor output_grad_, + at::Tensor softmax_results_, + float scale_factor +) { + using namespace transformer_engine; + + auto output_grads = output_grad_.contiguous(); + auto 
softmax_results = softmax_results_.contiguous(); + + AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); + AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + auto output_grads_cu = makeTransformerEngineTensor(output_grads); + auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); + + // Produce gradients in place. + nvte_scaled_softmax_backward( + output_grads_cu.data(), softmax_results_cu.data(), output_grads_cu.data(), + scale_factor, at::cuda::getCurrentCUDAStream()); + + return output_grads; +} + + +at::Tensor scaled_masked_softmax_forward(at::Tensor input, + at::Tensor mask, + float scale_factor +) { + using namespace transformer_engine; + + AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); + if (!input.is_contiguous()) + input = input.contiguous(); + if (!mask.is_contiguous()) + mask = mask.contiguous(); + + const int batches = input.size(0); + const int pad_batches = mask.size(0); + const int attn_heads = input.size(1); + const int query_seq_len = input.size(2); + const int key_seq_len = input.size(3); + TORCH_CHECK(key_seq_len <= 4096); + TORCH_CHECK(query_seq_len > 1); + TORCH_CHECK(pad_batches == 1 || pad_batches == batches); + TORCH_CHECK(mask.size(1) == 1); + TORCH_CHECK(mask.size(2) == query_seq_len); + TORCH_CHECK(mask.size(3) == key_seq_len); + + auto act_options = input.options().requires_grad(false); + auto softmax_results = + torch::empty({batches, attn_heads, 
query_seq_len, key_seq_len}, act_options); + + + auto input_cu = makeTransformerEngineTensor(input); + auto mask_cu = makeTransformerEngineTensor(mask); + auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); + + nvte_scaled_masked_softmax_forward( + input_cu.data(), mask_cu.data(), softmax_results_cu.data(), + scale_factor, at::cuda::getCurrentCUDAStream()); + + return softmax_results; +} + + +at::Tensor scaled_masked_softmax_backward(at::Tensor output_grad_, + at::Tensor softmax_results_, + float scale_factor +) { + using namespace transformer_engine; + + auto output_grads = output_grad_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); + AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + auto output_grads_cu = makeTransformerEngineTensor(output_grads); + auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); + + // Produce gradients in place. 
+ nvte_scaled_softmax_backward( + output_grads_cu.data(), softmax_results_cu.data(), output_grads_cu.data(), + scale_factor, at::cuda::getCurrentCUDAStream()); + + return output_grads; +} + + +at::Tensor scaled_upper_triang_masked_softmax_forward(at::Tensor input, + float scale_factor +) { + using namespace transformer_engine; + + AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + const int attn_batches = input.size(0); + const int seq_len = input.size(1); + TORCH_CHECK(seq_len <= 2048); + + // Output + auto act_options = input.options().requires_grad(false); + auto softmax_results = + torch::empty({attn_batches, seq_len, seq_len}, act_options); + + auto input_cu = makeTransformerEngineTensor(input); + auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); + + nvte_scaled_upper_triang_masked_softmax_forward(input_cu.data(), + softmax_results_cu.data(), + scale_factor, + at::cuda::getCurrentCUDAStream()); + + return softmax_results; +} + + +at::Tensor scaled_upper_triang_masked_softmax_backward(at::Tensor output_grads_, + at::Tensor softmax_results_, + float scale_factor +) { + using namespace transformer_engine; + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + TORCH_CHECK(output_grads.size(1) == output_grads.size(2)); + + auto output_grads_cu = 
makeTransformerEngineTensor(output_grads); + auto softmax_results_cu = makeTransformerEngineTensor(softmax_results); + + // Produce gradients in place. + nvte_scaled_upper_triang_masked_softmax_backward(output_grads_cu.data(), + softmax_results_cu.data(), + output_grads_cu.data(), + scale_factor, + at::cuda::getCurrentCUDAStream()); + + return output_grads; +} diff --git a/transformer_engine/pytorch/csrc/extensions/transpose.cu b/transformer_engine/pytorch/csrc/extensions/transpose.cu new file mode 100644 index 0000000000..c58d474fb2 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/transpose.cu @@ -0,0 +1,321 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +void fused_cast_transpose(at::Tensor input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + at::Tensor input_cast, + at::Tensor input_transpose, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t M = static_cast(input.size(0)); + size_t N = static_cast(input.size(1)); + + auto input_cu = makeTransformerEngineTensor(input); + auto output_cast_cu = makeTransformerEngineTensor(input_cast.data_ptr(), {M, N}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + auto output_transpose_cu = makeTransformerEngineTensor(input_transpose.data_ptr(), {N, M}, otype, + amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + + nvte_cast_transpose(input_cu.data(), output_cast_cu.data(), output_transpose_cu.data(), + at::cuda::getCurrentCUDAStream()); +} + + +std::vector fused_cast_transpose_bgrad(at::Tensor grad_output, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t M = 
static_cast(grad_output.size(0)); + size_t N = static_cast(grad_output.size(1)); + + DType grad_output_type = GetTransformerEngineDType(grad_output.scalar_type()); + auto grad_bias = allocateTorchTensor(grad_output.size(-1), grad_output_type); + auto grad_output_cast = + allocateTorchTensor(grad_output.size(0), + grad_output.size(1), + DType::kByte); + auto grad_output_transpose = + allocateTorchTensor(grad_output.size(1), + grad_output.size(0), + DType::kByte); + + auto input_cu = makeTransformerEngineTensor(grad_output); + auto cast_output_cu = makeTransformerEngineTensor(grad_output_cast.data_ptr(), {M, N}, + otype, amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + auto transposed_output_cu = makeTransformerEngineTensor(grad_output_transpose.data_ptr(), + {N, M}, otype, amax.data_ptr(), + scale.data_ptr(), scale_inv.data_ptr()); + auto dbias_cu = makeTransformerEngineTensor(grad_bias); + transformer_engine::TensorWrapper workspace; + + nvte_cast_transpose_dbias(input_cu.data(), cast_output_cu.data(), + transposed_output_cu.data(), dbias_cu.data(), + workspace.data(), at::cuda::getCurrentCUDAStream()); + + // Fill workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + + nvte_cast_transpose_dbias(input_cu.data(), cast_output_cu.data(), + transposed_output_cu.data(), dbias_cu.data(), + workspace.data(), at::cuda::getCurrentCUDAStream()); + + return {grad_bias, grad_output_cast, grad_output_transpose}; +} + + +std::vector fused_fp8_transpose_bgrad(at::Tensor grad_output, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype, + transformer_engine::DType grad_bias_type +) { + using namespace transformer_engine; + + size_t M = static_cast(grad_output.size(0)); + size_t N = static_cast(grad_output.size(1)); + + auto grad_bias = allocateTorchTensor(grad_output.size(-1), 
grad_bias_type); + auto grad_output_transpose = + allocateTorchTensor(grad_output.size(1), + grad_output.size(0), + DType::kByte); + auto input_cu = makeTransformerEngineTensor(grad_output.data_ptr(), {M, N}, + otype, amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + auto transposed_output_cu = makeTransformerEngineTensor(grad_output_transpose.data_ptr(), + {N, M}, otype, amax.data_ptr(), + scale.data_ptr(), scale_inv.data_ptr()); + auto dbias_cu = makeTransformerEngineTensor(grad_bias); + transformer_engine::TensorWrapper workspace; + + nvte_fp8_transpose_dbias(input_cu.data(), transposed_output_cu.data(), dbias_cu.data(), + workspace.data(), at::cuda::getCurrentCUDAStream()); + + // Fill workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + + nvte_fp8_transpose_dbias(input_cu.data(), transposed_output_cu.data(), dbias_cu.data(), + workspace.data(), at::cuda::getCurrentCUDAStream()); + + return {grad_bias, grad_output_transpose}; +} + + + +std::vector fused_cast_transpose_bgrad_dgelu(at::Tensor grad_output, + at::Tensor gelu_input, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t M = static_cast(grad_output.size(0)); + size_t N = static_cast(grad_output.size(1)); + + DType grad_output_type = GetTransformerEngineDType(grad_output.scalar_type()); + auto grad_bias = allocateTorchTensor(grad_output.size(-1), grad_output_type); + auto dgelu = + allocateTorchTensor(grad_output.size(0), + grad_output.size(1), + DType::kByte); + auto dgelu_transpose = + allocateTorchTensor(grad_output.size(1), + grad_output.size(0), + DType::kByte); + + transformer_engine::TensorWrapper workspace; + auto gelu_input_cu = makeTransformerEngineTensor(gelu_input); + auto input_cu = makeTransformerEngineTensor(grad_output); + auto 
cast_output_cu = makeTransformerEngineTensor(dgelu.data_ptr(), {M, N}, + otype, amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + auto transposed_output_cu = makeTransformerEngineTensor(dgelu_transpose.data_ptr(), {N, M}, + otype, amax.data_ptr(), scale.data_ptr(), + scale_inv.data_ptr()); + auto dbias_cu = makeTransformerEngineTensor(grad_bias); + + nvte_cast_transpose_dbias_dgelu(input_cu.data(), gelu_input_cu.data(), + cast_output_cu.data(), transposed_output_cu.data(), + dbias_cu.data(), workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // Fill workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor(workspace_data.data_ptr(), + workspace.shape(), + workspace.dtype()); + + nvte_cast_transpose_dbias_dgelu(input_cu.data(), gelu_input_cu.data(), + cast_output_cu.data(), transposed_output_cu.data(), + dbias_cu.data(), workspace.data(), + at::cuda::getCurrentCUDAStream()); + + return {grad_bias, dgelu, dgelu_transpose}; +} + + +void fused_multi_cast_transpose(std::vector input_list, + std::vector scale_list, + std::vector cast_output_list, + std::vector transposed_output_list, + std::vector amax_list, + std::vector scale_inv_list, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + // Extract properties from PyTorch tensors + std::vector input_dptr_list, scale_dptr_list, + cast_output_dptr_list, transposed_output_dptr_list, + amax_dptr_list, scale_inv_dptr_list; + std::vector> input_shape_list, scale_shape_list, + cast_output_shape_list, transposed_output_shape_list, + amax_shape_list, scale_inv_shape_list; + std::vector input_type_list, scale_type_list, + cast_output_type_list, transposed_output_type_list, + amax_type_list, scale_inv_type_list; + auto extract_tensor_props_skip_dtype = [](at::Tensor& tensor, + std::vector& dptr_list, + std::vector>& shape_list) { + dptr_list.push_back(tensor.data_ptr()); + shape_list.push_back({}); + for (int d = 
0; d < tensor.dim(); ++d) { + shape_list.back().push_back(tensor.size(d)); + } + }; + auto extract_tensor_props = [](at::Tensor& tensor, + std::vector& dptr_list, + std::vector>& shape_list, + std::vector& type_list) { + dptr_list.push_back(tensor.data_ptr()); + shape_list.push_back({}); + for (int d = 0; d < tensor.dim(); ++d) { + shape_list.back().push_back(tensor.size(d)); + } + type_list.push_back(GetTransformerEngineDType(tensor.scalar_type())); + }; + for (size_t tensor_id = 0; tensor_id < input_list.size(); ++tensor_id) { + extract_tensor_props(input_list[tensor_id], + input_dptr_list, + input_shape_list, + input_type_list); + extract_tensor_props(scale_list[tensor_id], + scale_dptr_list, + scale_shape_list, + scale_type_list); + extract_tensor_props_skip_dtype(cast_output_list[tensor_id], + cast_output_dptr_list, + cast_output_shape_list); + cast_output_type_list.push_back(otype); + extract_tensor_props_skip_dtype(transposed_output_list[tensor_id], + transposed_output_dptr_list, + transposed_output_shape_list); + transposed_output_type_list.push_back(otype); + extract_tensor_props(amax_list[tensor_id], + amax_dptr_list, + amax_shape_list, + amax_type_list); + extract_tensor_props(scale_inv_list[tensor_id], + scale_inv_dptr_list, + scale_inv_shape_list, + scale_inv_type_list); + } + + transformer_engine::TensorWrapper workspace; + + // Construct TE tensors + std::vector nvte_input_list, + nvte_cast_output_list, nvte_transposed_output_list; + std::vector tensor_wrappers; + auto make_tensor = [&tensor_wrappers](void* dptr, + const std::vector& shape, + transformer_engine::DType dtype, + void* amax_dptr, + void* scale_dptr, + void* scale_inv_dptr) + -> NVTETensor { + tensor_wrappers.emplace_back(makeTransformerEngineTensor(dptr, shape, dtype, amax_dptr, + scale_dptr, scale_inv_dptr)); + return tensor_wrappers.back().data(); + }; + for (size_t i = 0; i < input_dptr_list.size(); ++i) { + nvte_input_list.emplace_back(make_tensor(input_dptr_list[i], + 
input_shape_list[i], + input_type_list[i], + nullptr, + nullptr, + nullptr)); + nvte_cast_output_list.emplace_back(make_tensor(cast_output_dptr_list[i], + cast_output_shape_list[i], + cast_output_type_list[i], + amax_dptr_list[i], + scale_dptr_list[i], + scale_inv_dptr_list[i])); + nvte_transposed_output_list.emplace_back(make_tensor(transposed_output_dptr_list[i], + transposed_output_shape_list[i], + transposed_output_type_list[i], + amax_dptr_list[i], + scale_dptr_list[i], + scale_inv_dptr_list[i])); + } + + // Check tensor lists + NVTE_CHECK(nvte_cast_output_list.size() == nvte_input_list.size(), + "Number of input and C output tensors must match"); + NVTE_CHECK(nvte_transposed_output_list.size() == nvte_input_list.size(), + "Number of input and T output tensors must match"); + + // Launch TE kernel + nvte_multi_cast_transpose(nvte_input_list.size(), + nvte_input_list.data(), + nvte_cast_output_list.data(), + nvte_transposed_output_list.data(), + at::cuda::getCurrentCUDAStream()); +} + + +at::Tensor fp8_transpose(at::Tensor input, + transformer_engine::DType otype +) { + using namespace transformer_engine; + + size_t M = static_cast(input.size(0)); + size_t N = static_cast(input.size(1)); + + auto output = + allocateTorchTensor(input.size(1), + input.size(0), + DType::kByte); + + auto input_cu = makeTransformerEngineTensor(input.data_ptr(), {M, N}, otype); + auto output_cu = makeTransformerEngineTensor(output.data_ptr(), {N, M}, otype); + + nvte_transpose(input_cu.data(), output_cu.data(), at::cuda::getCurrentCUDAStream()); + + return output; +} diff --git a/transformer_engine/pytorch/csrc/ts_fp8_op.cpp b/transformer_engine/pytorch/csrc/ts_fp8_op.cpp index b0424d6f4b..6f38253052 100755 --- a/transformer_engine/pytorch/csrc/ts_fp8_op.cpp +++ b/transformer_engine/pytorch/csrc/ts_fp8_op.cpp @@ -328,6 +328,44 @@ at::Tensor layernorm_fwd_inf_ts(const at::Tensor &input, return output; } +at::Tensor rmsnorm_fwd_fp8_inf_ts(const at::Tensor &input, + const at::Tensor 
&weight, + double eps, + at::Tensor scale, + at::Tensor amax, + at::Tensor scale_inv, + int64_t fp8_tensor, + int64_t otype, + const bool zero_centered_gamma) { + transformer_engine::DType otype_arg = reverse_map_dtype(otype); + float eps_float = static_cast(eps); + + at::Tensor output = rmsnorm_fwd_fp8_inf(input, + weight, + eps_float, + scale, + amax, + scale_inv, + otype_arg, + zero_centered_gamma); + + return output; +} + +at::Tensor rmsnorm_fwd_inf_ts(const at::Tensor &input, + const at::Tensor &weight, + double eps, + const bool zero_centered_gamma) { + float eps_float = static_cast(eps); + + at::Tensor output = rmsnorm_fwd_inf(input, + weight, + eps_float, + zero_centered_gamma); + + return output; +} + TORCH_LIBRARY(tex_ts, m) { m.def("cast_to_fp8_ts", &cast_to_fp8_ts); m.def("cast_from_fp8_ts", &cast_from_fp8_ts); @@ -339,4 +377,6 @@ TORCH_LIBRARY(tex_ts, m) { m.def("te_gemm_ts", &te_gemm_ts); m.def("layernorm_fwd_fp8_inf_ts", &layernorm_fwd_fp8_inf_ts); m.def("layernorm_fwd_inf_ts", &layernorm_fwd_inf_ts); + m.def("rmsnorm_fwd_fp8_inf_ts", &rmsnorm_fwd_fp8_inf_ts); + m.def("rmsnorm_fwd_inf_ts", &rmsnorm_fwd_inf_ts); } diff --git a/transformer_engine/pytorch/module/__init__.py b/transformer_engine/pytorch/module/__init__.py index fef96e7738..51463eb12d 100644 --- a/transformer_engine/pytorch/module/__init__.py +++ b/transformer_engine/pytorch/module/__init__.py @@ -7,3 +7,4 @@ from .linear import Linear from .layernorm_mlp import LayerNormMLP from .layernorm import LayerNorm +from .rmsnorm import RMSNorm diff --git a/transformer_engine/pytorch/module/_common.py b/transformer_engine/pytorch/module/_common.py new file mode 100644 index 0000000000..4b8d4de643 --- /dev/null +++ b/transformer_engine/pytorch/module/_common.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +"""Internal function used by multiple modules.""" + +from typing import Union, Dict, Any + +import torch + +from .. import cpp_extensions as tex +from ..fp8 import get_fp8_te_dtype + +def _get_normalization_func(normalization: str, + fp8_output: bool, + is_grad_enabled: bool, + forward: bool): + fwd_normalization_funcs = { + ('LayerNorm', True, True): tex.layernorm_fwd_fp8, + ('LayerNorm', True, False): tex.layernorm_fwd_fp8_inf, + ('LayerNorm', False, True): tex.layernorm_fwd_noalloc, + ('LayerNorm', False, False): tex.layernorm_fwd_inf, + ('RMSNorm', True, True): tex.rmsnorm_fwd_fp8, + ('RMSNorm', True, False): tex.rmsnorm_fwd_fp8_inf, + ('RMSNorm', False, True): tex.rmsnorm_fwd_noalloc, + ('RMSNorm', False, False): tex.rmsnorm_fwd_inf, + } + bwd_normalization_funcs = { + 'LayerNorm': tex.layernorm_bwd, + 'RMSNorm': tex.rmsnorm_bwd, + } + + if forward: + return fwd_normalization_funcs[(normalization, fp8_output, is_grad_enabled)] + assert not fp8_output, "FP8 output is not supported in backward normalization!" + assert is_grad_enabled, "Gradient has to be enabled to call backward normalization!" 
+ return bwd_normalization_funcs[normalization] + +def _apply_normalization(inputmat:torch.Tensor, + ln_out: torch.Tensor, + ln_weight: torch.Tensor, + ln_bias: Union[torch.Tensor, None], + eps: float, + fp8_out: bool, + fp8_meta: Dict[str, Any], + normalization: str, + fwd_ln_sm_margin: int, + zero_centered_gamma: bool, + is_grad_enabled: bool): + normalization_func = _get_normalization_func(normalization, + fp8_out, + is_grad_enabled, + True) + + inputs = (inputmat, ln_weight) if ln_bias is None else (inputmat, ln_weight, ln_bias) + if fp8_out: + fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + + if is_grad_enabled: + output_key = "ln_out" if normalization == "LayerNorm" else "rmsnorm_out" + output_kwarg = {output_key: ln_out} + output = normalization_func( + *inputs, + eps, + fp8_meta["scaling_fwd"], + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + fwd_ln_sm_margin, + zero_centered_gamma, + **output_kwarg, + ) + else: + return normalization_func( + *inputs, + eps, + fp8_meta["scaling_fwd"], + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + zero_centered_gamma, + ), None, None + else: + if is_grad_enabled: + output = normalization_func( + *inputs, ln_out, eps, + fwd_ln_sm_margin, zero_centered_gamma + ) + else: + return normalization_func( + *inputs, eps, zero_centered_gamma + ), None, None + if normalization == "RMSNorm": + output = (ln_out, None, output[1]) + elif normalization == "LayerNorm": + output = (ln_out, output[1], output[2]) + return output diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index c18da5ed85..698d88a284 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -12,7 +12,7 @@ from torch.nn.parameter import Parameter from torch.nn import init -import transformer_engine_extensions as tex +from .. 
import cpp_extensions as tex from .base import ( get_workspace, @@ -38,22 +38,13 @@ reduce_scatter_along_first_dim, gather_along_first_dim, ) -from ..cpp_extensions import ( - fp8_gemm, - gemm, - fp8_cast_transpose_fused, - layernorm_fwd_fp8, - layernorm_fwd_fp8_inf, - layernorm_fwd_inf, - cast_to_fp8, - cast_from_fp8, -) from ..constants import GemmParallelModes, dist_group_type, TE_DType from ..jit import no_torch_dynamo +from ._common import _apply_normalization -__all__ = ["LayerNormLinear"] +__all__ = ["LayerNormLinear"] class _LayerNormLinear(torch.autograd.Function): """LayerNormLinear semi-top level module @@ -65,7 +56,7 @@ def forward( ctx, inp: torch.Tensor, ln_weight: torch.Tensor, - ln_bias: torch.Tensor, + ln_bias: Union[torch.Tensor, None], weight: torch.Tensor, weight_fp8: Union[torch.Tensor, None], weight_t_fp8: Union[torch.Tensor, None], @@ -91,6 +82,7 @@ def forward( ub_bulk_wgrad: bool, ub_bulk_dgrad: bool, ub_split_ag: bool, + normalization: str, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible in_features = ln_weight.numel() @@ -105,10 +97,9 @@ def forward( # Cast for native AMP inputmat = cast_if_needed(inputmat, activation_dtype) ln_weight = cast_if_needed(ln_weight, activation_dtype) - ln_bias = cast_if_needed(ln_bias, activation_dtype) - # If residual connection is after LN, we need `ln_out` - # tensor in higher precision, this comes at the cost - # of an extra fp8 cast. 
+ if ln_bias is not None: + ln_bias = cast_if_needed(ln_bias, activation_dtype) + if ub_split_ag: tp_world_size = get_distributed_world_size(tp_group) if tp_world_size == 1 or (not is_grad_enabled) or return_layernorm_output: @@ -118,69 +109,35 @@ def forward( dim_size[0] = dim_size[0] * tp_world_size ub_obj_lnout = get_ub("qkv_fprop") ln_out = ub_obj_lnout.get_ubuf_output(0) - if fp8: - fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) - - if not return_layernorm_output: - if is_grad_enabled: - if not ub_split_ag: - ln_out = torch.empty_like(inputmat, dtype=torch.uint8) - _, mu, rsigma = layernorm_fwd_fp8( - inputmat, - ln_weight, - ln_bias, - eps, - fp8_meta["scaling_fwd"], - tex.FP8FwdTensors.GEMM1_INPUT, - fp8_dtype_forward, - fwd_ln_sm_margin, - zero_centered_gamma, - ln_out = ln_out - ) - else: - mu = rsigma = None - ln_out = layernorm_fwd_fp8_inf( - inputmat, - ln_weight, - ln_bias, - eps, - fp8_meta["scaling_fwd"], - tex.FP8FwdTensors.GEMM1_INPUT, - fp8_dtype_forward, - zero_centered_gamma, - ) - else: - if is_grad_enabled: - ln_out_return, mu, rsigma = tex.layernorm_fwd( - inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma - ) - else: - ln_out_return, mu, rsigma = layernorm_fwd_inf( - inputmat, ln_weight, ln_bias, eps, zero_centered_gamma - ), None, None - - ln_out = cast_to_fp8( - ln_out_return, + else: + ln_out_dtype = torch.uint8 if fp8 else inputmat.dtype + ln_out = torch.empty_like(inputmat, dtype=ln_out_dtype) + + fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + + ln_out, mu, rsigma = _apply_normalization(inputmat, + ln_out, + ln_weight, + ln_bias, + eps, + fp8 and not return_layernorm_output, + fp8_meta, + normalization, + fwd_ln_sm_margin, + zero_centered_gamma, + is_grad_enabled) + # If residual connection is after LN, we need `ln_out_return` + # tensor in higher precision, this comes at the cost + # of an extra fp8 cast. 
+ if return_layernorm_output: + ln_out_return = ln_out + if fp8: + ln_out = tex.cast_to_fp8( + ln_out, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, ) - else: - if is_grad_enabled: - if ub_split_ag: - _, mu, rsigma = tex.layernorm_fwd_noalloc( - inputmat, ln_weight, ln_bias, ln_out, eps, - fwd_ln_sm_margin, zero_centered_gamma - ) - else: - ln_out, mu, rsigma = tex.layernorm_fwd( - inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma - ) - else: - ln_out, mu, rsigma = layernorm_fwd_inf( - inputmat, ln_weight, ln_bias, eps, zero_centered_gamma - ), None, None - ln_out_return = ln_out # Column Parallel Linear if ub_split_ag: ln_out_total = ub_obj_lnout.get_ubuf_output(1) @@ -200,7 +157,7 @@ def forward( if update_fp8_weights: if is_grad_enabled: - fp8_cast_transpose_fused( + tex.fp8_cast_transpose_fused( weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, @@ -210,13 +167,13 @@ def forward( ) else: weight_t_fp8 = None - weight_fp8 = cast_to_fp8( + weight_fp8 = tex.cast_to_fp8( weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward) - out = fp8_gemm( + out = tex.fp8_gemm( weight_fp8, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, @@ -247,7 +204,7 @@ def forward( fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_WEIGHT] = \ torch.amax(weight).float() - out, _, _ = gemm( + out, _, _ = tex.gemm( weight, ln_out_total, activation_dtype, @@ -289,6 +246,7 @@ def forward( ctx.ub_bulk_wgrad = ub_bulk_wgrad ctx.ub_bulk_dgrad = ub_bulk_dgrad ctx.requires_dgrad = inp.requires_grad + ctx.normalization = normalization # Row Parallel Linear if parallel_mode == "row" and sequence_parallel: @@ -379,7 +337,7 @@ def backward( ) # DGRAD: Evaluated unconditionally to feed into Linear backward - _ = fp8_gemm( + _ = tex.fp8_gemm( weight_t_fp8, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, @@ -397,7 +355,7 @@ def backward( ) else: # DGRAD: Evaluated unconditionally 
to feed into Linear backward - _, _, _ = gemm( + _, _, _ = tex.gemm( weight, grad_output, ctx.activation_dtype, @@ -427,7 +385,7 @@ def backward( # WGRAD if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: ln_out_total_t = tex.fp8_transpose(ln_out_total, fp8_dtype_forward) - wgrad = fp8_gemm( + wgrad = tex.fp8_gemm( ln_out_total_t, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_INPUT, @@ -446,14 +404,14 @@ def backward( ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None ) else: - ln_out_total_c = cast_from_fp8( + ln_out_total_c = tex.cast_from_fp8( ln_out_total, ctx.fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, TE_DType[ctx.activation_dtype], ) - wgrad, _, _ = gemm( + wgrad, _, _ = tex.gemm( ln_out_total_c, grad_output, ctx.activation_dtype, @@ -468,7 +426,7 @@ def backward( ) else: # WGRAD - wgrad, grad_bias, _ = gemm( + wgrad, grad_bias, _ = tex.gemm( ln_out_total, grad_output, ctx.activation_dtype, @@ -496,10 +454,18 @@ def backward( if ctx.return_layernorm_output: d_ln_out = d_ln_out + grad_outputs[1].view_as(d_ln_out) - dxmat, dgamma, dbeta = tex.layernorm_bwd( - d_ln_out, inputmat, mu, rsigma, ln_weight, - ctx.bwd_ln_sm_margin, ctx.zero_centered_gamma - ) + if ctx.normalization == "LayerNorm": + dxmat, dgamma, dbeta = tex.layernorm_bwd( + d_ln_out, inputmat, mu, rsigma, ln_weight, + ctx.bwd_ln_sm_margin, ctx.zero_centered_gamma + ) + elif ctx.normalization == "RMSNorm": + dxmat, dgamma = tex.rmsnorm_bwd( + d_ln_out, inputmat, rsigma, ln_weight, + ctx.bwd_ln_sm_margin, ctx.zero_centered_gamma + ) + dbeta = None + if not ctx.use_bias: grad_bias = None @@ -533,6 +499,7 @@ def backward( None, None, None, + None, ) @@ -555,6 +522,8 @@ class LayerNormLinear(TransformerEngineBaseModule): a value added to the denominator of layer normalization for numerical stability. bias : bool, default = `True` if set to `False`, the layer will not learn an additive bias. 
+ normalization : { 'LayerNorm', 'RMSNorm' }, default = 'LayerNorm' + type of normalization applied. init_method : Callable, default = `None` used for initializing weights in the following way: `init_method(weight)`. When set to `None`, defaults to `torch.nn.init.normal_(mean=0.0, std=0.023)`. @@ -624,6 +593,7 @@ def __init__( get_rng_state_tracker: Optional[Callable] = None, init_method: Optional[Callable] = None, bias: bool = True, + normalization: str = 'LayerNorm', return_bias: bool = False, params_dtype: Optional[torch.dtype] = None, parallel_mode: Optional[str] = None, @@ -649,9 +619,11 @@ def __init__( self.in_features = in_features self.out_features = out_features self.fuse_wgrad_accumulation = fuse_wgrad_accumulation + self.normalization = normalization + assert normalization in ['LayerNorm', 'RMSNorm'], "Unsupported normalization type!" self.use_bias = bias self.return_bias = return_bias - self.apply_bias = bias and not return_bias + self.apply_bias = self.use_bias and not return_bias self.return_layernorm_output = return_layernorm_output self.parameters_split = parameters_split self.zero_centered_gamma = zero_centered_gamma @@ -696,15 +668,18 @@ def __init__( dtype=params_dtype, ) ) - self.layer_norm_bias = Parameter( - torch.empty( - in_features, - device=torch.cuda.current_device(), - dtype=params_dtype, - ) - ) setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) - setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + if self.normalization != "RMSNorm": + self.layer_norm_bias = Parameter( + torch.empty( + in_features, + device=torch.cuda.current_device(), + dtype=params_dtype, + ) + ) + setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + else: + self.layer_norm_bias = None self.reset_layer_norm_parameters() self.weight_tensor = torch.empty( @@ -796,7 +771,8 @@ def reset_layer_norm_parameters(self) -> None: init.ones_(self.layer_norm_weight) else: 
init.zeros_(self.layer_norm_weight) - init.zeros_(self.layer_norm_bias) + if self.layer_norm_bias is not None: + init.zeros_(self.layer_norm_bias) def get_fp8_weights_scratchpad( self, @@ -915,6 +891,7 @@ def forward( self.ub_bulk_wgrad, self.ub_bulk_dgrad, self.ub_split_ag, + self.normalization, ) out = fwd_fn(*args) diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index bce92cabd7..d2d866667b 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -46,6 +46,8 @@ from ..constants import dist_group_type, TE_DType from ..jit import no_torch_dynamo +from ._common import _apply_normalization + __all__ = ["LayerNormMLP"] @@ -107,6 +109,7 @@ def forward( ub_split_rs: bool, ub_split_ag: bool, activation: str, + normalization: str, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible in_features = ln_weight.numel() @@ -124,7 +127,8 @@ def forward( # Cast for native AMP inputmat = cast_if_needed(inputmat, activation_dtype) ln_weight = cast_if_needed(ln_weight, activation_dtype) - ln_bias = cast_if_needed(ln_bias, activation_dtype) + if ln_bias is not None: + ln_bias = cast_if_needed(ln_bias, activation_dtype) if ub_split_ag: tp_world_size = get_distributed_world_size(tp_group) @@ -133,70 +137,39 @@ def forward( if ub_split_ag: ub_obj_lnout = get_ub("fc1_fprop") ln_out = ub_obj_lnout.get_ubuf_output(0) + else: + ln_out_dtype = torch.uint8 if (fp8 and not return_layernorm_output) else inputmat.dtype + ln_out = torch.empty_like(inputmat, dtype=ln_out_dtype) if ub_split_rs: tp_world_size = get_distributed_world_size(tp_group) if tp_world_size == 1: ub_split_rs = False + fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + + ln_out, mu, rsigma = _apply_normalization(inputmat, + ln_out, + ln_weight, + ln_bias, + eps, + fp8 and not return_layernorm_output, + fp8_meta, + 
normalization, + fwd_ln_sm_margin, + zero_centered_gamma, + is_grad_enabled) # If residual connection is after LN, we need `ln_out` # tensor in higher precision, this comes at the cost # of an extra fp8 cast. - if fp8: - fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) - if not return_layernorm_output: - if is_grad_enabled: - if not ub_split_ag: - ln_out = torch.empty_like(inputmat, dtype=torch.uint8) - _, mu, rsigma = tex.layernorm_fwd_fp8( - inputmat, - ln_weight, - ln_bias, - eps, - fp8_meta["scaling_fwd"], - tex.FP8FwdTensors.GEMM1_INPUT, - fp8_dtype_forward, - fwd_ln_sm_margin, - zero_centered_gamma, - ln_out = ln_out, - ) - else: - ln_out = tex.layernorm_fwd_fp8_inf( - inputmat, - ln_weight, - ln_bias, - eps, - fp8_meta["scaling_fwd"], - tex.FP8FwdTensors.GEMM1_INPUT, - fp8_dtype_forward, - zero_centered_gamma, - ) - else: - ln_out_return, mu, rsigma = tex.layernorm_fwd( - inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma - ) + if return_layernorm_output: + ln_out_return = ln_out + if fp8: ln_out = tex.cast_to_fp8( - ln_out_return, + ln_out, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, ) - else: - if is_grad_enabled: - if ub_split_ag: - _, mu, rsigma = tex.layernorm_fwd_noalloc( - inputmat, ln_weight, ln_bias, ln_out, eps, - fwd_ln_sm_margin, zero_centered_gamma - ) - else: - ln_out, mu, rsigma = tex.layernorm_fwd( - inputmat, ln_weight, ln_bias, eps, fwd_ln_sm_margin, zero_centered_gamma - ) - else: - ln_out, mu, rsigma = tex.layernorm_fwd_inf( - inputmat, ln_weight, ln_bias, eps, zero_centered_gamma - ), None, None - - ln_out_return = ln_out # Column Parallel Linear if ub_split_ag: ln_out_total = ub_obj_lnout.get_ubuf_output(1) @@ -422,6 +395,7 @@ def forward( ctx.ub_bulk_dgrad = ub_bulk_dgrad ctx.ub_split_ag = ub_split_ag ctx.requires_dgrad = inp.requires_grad + ctx.normalization = normalization # Row Parallel Linear if ub_split_rs: @@ -804,10 +778,17 @@ def backward( if 
ctx.return_layernorm_output: d_ln_out = d_ln_out + grad_outputs[1].view_as(d_ln_out) - dxmat, dgamma, dbeta = tex.layernorm_bwd( - d_ln_out, inputmat, mu, rsigma, ln_weight, - ctx.bwd_ln_sm_margin, ctx.zero_centered_gamma - ) + if ctx.normalization == "LayerNorm": + dxmat, dgamma, dbeta = tex.layernorm_bwd( + d_ln_out, inputmat, mu, rsigma, ln_weight, + ctx.bwd_ln_sm_margin, ctx.zero_centered_gamma + ) + elif ctx.normalization == "RMSNorm": + dxmat, dgamma = tex.rmsnorm_bwd( + d_ln_out, inputmat, rsigma, ln_weight, + ctx.bwd_ln_sm_margin, ctx.zero_centered_gamma + ) + dbeta = None return ( dxmat.view(ctx.inp_shape) if ctx.requires_dgrad else None, @@ -846,6 +827,7 @@ def backward( None, None, None, + None, ) @@ -864,6 +846,8 @@ class LayerNormMLP(TransformerEngineBaseModule): a value added to the denominator of layer normalization for numerical stability. bias : bool, default = `True` if set to `False`, the FC1 and FC2 layers will not learn an additive bias. + normalization : { 'LayerNorm', 'RMSNorm' }, default = 'LayerNorm' + type of normalization applied. activation : str, default = 'gelu' activation function used. Options: 'gelu', 'geglu', 'relu', 'reglu', 'squared_relu', 'swiglu'. @@ -942,6 +926,7 @@ def __init__( tp_size: int = 1, init_method: Optional[Callable] = None, bias: bool = True, + normalization: str = 'LayerNorm', activation : str = "gelu", output_layer_init_method: Optional[Callable] = None, fuse_wgrad_accumulation: bool = False, @@ -960,6 +945,8 @@ def __init__( params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype self.fuse_wgrad_accumulation = fuse_wgrad_accumulation + self.normalization = normalization + assert normalization in ['LayerNorm', 'RMSNorm'], "Unsupported normalization type!" 
self.use_bias = bias self.activation = activation self.return_bias = return_bias @@ -1005,15 +992,18 @@ def __init__( dtype=params_dtype, ) ) - self.layer_norm_bias = Parameter( - torch.empty( - hidden_size, - device=torch.cuda.current_device(), - dtype=params_dtype, - ) - ) setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) - setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + if self.normalization != "RMSNorm": + self.layer_norm_bias = Parameter( + torch.empty( + hidden_size, + device=torch.cuda.current_device(), + dtype=params_dtype, + ) + ) + setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + else: + self.layer_norm_bias = None self.reset_layer_norm_parameters() if self.activation in ['reglu', 'geglu', 'swiglu']: @@ -1114,7 +1104,8 @@ def reset_layer_norm_parameters(self) -> None: init.ones_(self.layer_norm_weight) else: init.zeros_(self.layer_norm_weight) - init.zeros_(self.layer_norm_bias) + if self.layer_norm_bias is not None: + init.zeros_(self.layer_norm_bias) def get_fp8_weights_scratchpad( self, @@ -1217,6 +1208,7 @@ def forward( self.ub_split_rs, self.ub_split_ag, self.activation, + self.normalization, ) out = fwd_fn(*args) diff --git a/transformer_engine/pytorch/module/rmsnorm.py b/transformer_engine/pytorch/module/rmsnorm.py new file mode 100644 index 0000000000..dc7db1a221 --- /dev/null +++ b/transformer_engine/pytorch/module/rmsnorm.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +"""RMSNorm API""" +import os +from typing import Union, Tuple, Optional + +import torch +from torch.nn.parameter import Parameter +from torch.nn import init + +from .. 
import cpp_extensions as tex +from ..jit import no_torch_dynamo + + +__all__ = ["RMSNorm"] + + +class _RMSNorm(torch.autograd.Function): + """functional RMSNorm""" + + @staticmethod + def forward( + ctx, + inp: torch.Tensor, + rmsnorm_weight: torch.Tensor, + eps: float, + fwd_rmsnorm_sm_margin: int, + bwd_rmsnorm_sm_margin: int, + zero_centered_gamma: bool, + is_grad_enabled: bool, + ) -> torch.Tensor: + # Make sure input dimensions are compatible + in_features = rmsnorm_weight.numel() + assert inp.is_cuda, "TransformerEngine needs CUDA." + assert inp.shape[-1] == in_features, "RMSNorm not possible" + inputmat = inp.view((-1, in_features)) + + if is_grad_enabled: + rmsnorm_out, rsigma = tex.rmsnorm_fwd(inputmat, rmsnorm_weight, + eps, fwd_rmsnorm_sm_margin, + zero_centered_gamma) + ctx.save_for_backward(inputmat, rmsnorm_weight, rsigma) + ctx.inp_shape = inp.shape + ctx.bwd_rmsnorm_sm_margin = bwd_rmsnorm_sm_margin + ctx.zero_centered_gamma = zero_centered_gamma + else: + rmsnorm_out = tex.rmsnorm_fwd_inf(inputmat, rmsnorm_weight, + eps, + zero_centered_gamma) + return rmsnorm_out.view_as(inp) + + @staticmethod + def backward( + ctx, grad_output: torch.Tensor + ) -> Tuple[Union[torch.Tensor, None], ...]: + inputmat, rmsnorm_weight, rsigma = ctx.saved_tensors + grad_output = grad_output.contiguous() + d_rmsnorm_out = grad_output.view(inputmat.shape) + dxmat, dgamma = tex.rmsnorm_bwd( + d_rmsnorm_out, inputmat, rsigma, rmsnorm_weight, + ctx.bwd_rmsnorm_sm_margin, ctx.zero_centered_gamma + ) + return ( + dxmat.view(ctx.inp_shape), + dgamma, + None, + None, + None, + None, + None, + ) + + +class RMSNorm(torch.nn.Module): + r""" + Applies Root Mean Square Layer Normalization over a mini-batch of inputs as described in + the paper `Root Mean Square Layer Normalization `__ + + .. math:: + y = \frac{x}{RMS(x) + \varepsilon} * \gamma + + where + + .. 
math:: + RMS(x) = \sqrt{\frac{1}{n}\sum_{i=0}^nx_i^2} + + :math:`\gamma` is a learnable affine transform parameter of size :attr:`hidden_size` + + Parameters + ---------- + hidden_size : int + size of each input sample. + eps : float, default = 1e-5 + a value added to the denominator of layer normalization for numerical stability. + sequence_parallel : bool, default = `False` + if set to `True`, uses sequence parallelism. + params_dtype : torch.dtype, default = `torch.get_default_dtype()` + it controls the type used to allocate the initial parameters. Useful when + the model is trained with lower precision and the original FP32 parameters + would not fit in GPU memory. + zero_centered_gamma : bool, default = 'False' + if set to 'True', gamma parameter in RMSNorm is initialized to 0 and + the RMSNorm formula changes to + + .. math:: + y = \frac{x}{RMS(x) + \varepsilon} * (1 + \gamma) + """ + + def __init__( + self, + hidden_size: int, + eps: float = 1e-5, + sequence_parallel: bool = False, + params_dtype: Optional[torch.dtype] = None, + zero_centered_gamma: bool = False, + ) -> None: + super().__init__() + params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype + self.eps = eps + self.zero_centered_gamma = zero_centered_gamma + self.weight = Parameter( + torch.empty( + hidden_size, + device=torch.cuda.current_device(), + dtype=params_dtype, + ) + ) + setattr(self.weight, "sequence_parallel", sequence_parallel) + self.reset_rms_norm_parameters() + + # These many SMs are subtracted from the total SM count when calling forward + # and backward RMSNorm C APIs. These envvars can be used to prevent the LN + # kernels from using all SMs in the device. This is useful for cases such as + # communication overlap with RMSNorm. 
+ self.fwd_rmsnorm_sm_margin = int(os.getenv("NVTE_FWD_LAYERNORM_SM_MARGIN", "0")) + self.bwd_rmsnorm_sm_margin = int(os.getenv("NVTE_BWD_LAYERNORM_SM_MARGIN", "0")) + + def reset_rms_norm_parameters(self) -> None: + """Init RMSNorm params""" + if not self.zero_centered_gamma: + init.ones_(self.weight) + else: + init.zeros_(self.weight) + + + @no_torch_dynamo + def forward(self, inp: torch.Tensor) -> torch.Tensor: + """RMSNorm FWD""" + if torch.is_grad_enabled(): + fwd_fn = _RMSNorm.apply + args = [] + else: + fwd_fn = _RMSNorm.forward + args = [None] + + args += ( + inp, + self.weight, + self.eps, + self.fwd_rmsnorm_sm_margin, + self.bwd_rmsnorm_sm_margin, + self.zero_centered_gamma, + torch.is_grad_enabled() + ) + + return fwd_fn(*args) diff --git a/transformer_engine/pytorch/te_onnx_extensions.py b/transformer_engine/pytorch/te_onnx_extensions.py index 5990160294..7227205099 100755 --- a/transformer_engine/pytorch/te_onnx_extensions.py +++ b/transformer_engine/pytorch/te_onnx_extensions.py @@ -283,6 +283,20 @@ def onnx_te_gemm( return output +def _ones_like(g, inp, dtype): + """Returns a tensor filled with the scalar value 1, with the same size as input and + with dtype data-type""" + shape = g.op("Shape", inp) + # WAR ONNX spec: ConstantOfShape accepts all data types except for BF16. To WAR + # create a ConstantOfShape with type FP32 and then add a Cast to BF16. 
+ is_bf16 = dtype == torch.bfloat16 + one = g.op("ConstantOfShape", shape, value_t=torch.tensor([1], + dtype=torch.float32 if is_bf16 else dtype)) + if is_bf16: + one = g.op("Cast", one, to_i=_C_onnx.TensorProtoDataType.BFLOAT16) + return one + + @symbolic_helper.parse_args("v", "v", "v", "f", "v", "v", "fs", "i", "i", "b") def onnx_layernorm_fwd_fp8(g, inputs, weight, bias, eps, scale, amax, scale_inv, fp8_tensor, otype, zero_centered_gamma): @@ -305,19 +319,6 @@ def onnx_layernorm_fwd(g, inputs, weight, bias, eps, zero_centered_gamma): """ONNX graph for layernorm_fwd""" # pylint: disable=unused-argument - def ones_like(inp, dtype): - """Returns a tensor filled with the scalar value 1, with the same size as input and - with dtype data-type""" - shape = g.op("Shape", inp) - # WAR ONNX spec: ConstantOfShape accepts all data types except for BF16. To WAR - # create a ConstantOfShape with type FP32 and then add a Cast to BF16. - is_bf16 = dtype == torch.bfloat16 - one = g.op("ConstantOfShape", shape, value_t=torch.tensor([1], - dtype=torch.float32 if is_bf16 else dtype)) - if is_bf16: - one = g.op("Cast", one, to_i=_C_onnx.TensorProtoDataType.BFLOAT16) - return one - normalized_shape = torch.onnx.symbolic_helper._get_tensor_sizes(inputs) if normalized_shape is None: ndim = torch.onnx.symbolic_helper._get_tensor_rank(inputs) @@ -328,7 +329,7 @@ def ones_like(inp, dtype): if zero_centered_gamma: inputs_dtype = inputs.type().dtype() - one = ones_like(weight, inputs_dtype) + one = _ones_like(g, weight, inputs_dtype) weight = g.op("Add", weight, one) axis = -len(normalized_shape) @@ -344,6 +345,57 @@ def ones_like(inp, dtype): ) return ln +@symbolic_helper.parse_args("v", "v", "f", "v", "v", "fs", "i", "i", "b") +def onnx_rmsnorm_fwd_fp8(g, inputs, weight, eps, scale, amax, + scale_inv, fp8_tensor, otype, zero_centered_gamma): + """ONNX graph for rmsnorm_fwd_fp8""" + # pylint: disable=unused-argument + inp_dtype = get_TensorProtoDataType(inputs) + + if inp_dtype != 
get_TensorProtoDataType(weight): + weight = g.op("Cast", weight, to_i=inp_dtype) + + ln = onnx_rmsnorm_fwd(g, inputs, weight, eps, zero_centered_gamma) + fp8_ln = quantize(g, ln, scale_inv, fp8_tensor) + return fp8_ln + + +@symbolic_helper.parse_args("v", "v", "f", "b") +def onnx_rmsnorm_fwd(g, inputs, weight, eps, zero_centered_gamma): + """ONNX graph for rmsnorm_fwd""" + # pylint: disable=unused-argument + + normalized_shape = torch.onnx.symbolic_helper._get_tensor_sizes(inputs) + if normalized_shape is None: + ndim = torch.onnx.symbolic_helper._get_tensor_rank(inputs) + assert ndim is not None + normalized_shape = list(range(0, ndim)) + # Normalization axis = 0, so normalized_shape uses all dims except dim = 0 + normalized_shape = normalized_shape[1:] + + if zero_centered_gamma: + inputs_dtype = inputs.type().dtype() + one = _ones_like(g, weight, inputs_dtype) + weight = g.op("Add", weight, one) + + axis = -len(normalized_shape) + + inputs_float = g.op("Cast", inputs, to_i=_C_onnx.TensorProtoDataType.FLOAT) + + norm = g.op("ReduceL2", inputs_float, axes_i=[axis]) + shape = g.op("Shape", inputs_float, start_i=-1) + shape_f = g.op("Cast", shape, to_i=_C_onnx.TensorProtoDataType.FLOAT) + n_reciprocal = g.op("Reciprocal", shape_f) + sqrt_n_reciprocal = g.op("Sqrt", n_reciprocal) + rms = g.op("Mul", norm, sqrt_n_reciprocal) + eps_tensor = g.op("ConstantOfShape", shape, value_t=torch.tensor([eps], dtype=torch.float32)) + rms_eps = g.op("Add", rms, eps_tensor) + normalized_input = g.op("Div", inputs_float, rms_eps) + result = g.op("Mul", weight, normalized_input) + result = g.op("Cast", result, to_i=get_TensorProtoDataType(inputs)) + + + return result register_custom_op_symbolic('tex_ts::cast_to_fp8_ts', onnx_cast_to_fp8, VER) register_custom_op_symbolic('tex_ts::cast_from_fp8_ts', onnx_cast_from_fp8, VER) @@ -355,3 +407,5 @@ def ones_like(inp, dtype): register_custom_op_symbolic('tex_ts::te_gemm_ts', onnx_te_gemm, VER) 
register_custom_op_symbolic('tex_ts::layernorm_fwd_fp8_inf_ts', onnx_layernorm_fwd_fp8, VER) register_custom_op_symbolic('tex_ts::layernorm_fwd_inf_ts', onnx_layernorm_fwd, VER) +register_custom_op_symbolic('tex_ts::rmsnorm_fwd_fp8_inf_ts', onnx_rmsnorm_fwd_fp8, VER) +register_custom_op_symbolic('tex_ts::rmsnorm_fwd_inf_ts', onnx_rmsnorm_fwd, VER) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 55c547b7ec..7f1b9a7246 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -11,7 +11,7 @@ import torch import transformer_engine_extensions as tex -from transformer_engine.pytorch.module import LayerNormMLP, LayerNorm +from transformer_engine.pytorch.module import LayerNormMLP, LayerNorm, RMSNorm from transformer_engine.pytorch.attention import MultiHeadAttention from transformer_engine.pytorch.jit import ( set_jit_fusion_options, @@ -128,6 +128,8 @@ class TransformerLayer(torch.nn.Module): .. math:: y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * (1 + \gamma) + \beta + normalization : { 'LayerNorm', 'RMSNorm' }, default = 'LayerNorm' + type of normalization applied. qkv_weight_interleaved : bool, default = `True` if set to `False`, the QKV weight is interpreted as a concatenation of query, key, and value weights along the `0th` dimension. 
The default @@ -220,7 +222,8 @@ def __init__( qkv_weight_interleaved: bool = True, ub_tp_comm_overlap: bool = False, bias: bool = True, - activation: str = 'gelu' + activation: str = 'gelu', + normalization: str = "LayerNorm", ) -> None: super().__init__() @@ -312,6 +315,7 @@ def __init__( input_layernorm=not output_layernorm, attention_type="self", bias=bias, + normalization=normalization, ) if layer_type == "decoder": @@ -322,6 +326,7 @@ def __init__( input_layernorm=True, attention_type="cross", bias=bias, + normalization=normalization, ) # LayerNorm -> activation(Linear + Bias) -> Linear @@ -353,6 +358,7 @@ def __init__( ub_split_rs=ub_split_rs, ub_split_ag=ub_split_ag, activation=activation, + normalization=normalization, ) self.hidden_dropout = hidden_dropout @@ -376,8 +382,12 @@ def __init__( hidden_size, seq_length, micro_batch_size ) + norm_module = { + "LayerNorm": LayerNorm, + "RMSNorm": RMSNorm, + } if self.output_layernorm: - self.layernorm = LayerNorm( + self.layernorm = norm_module[normalization]( hidden_size, eps=layernorm_epsilon, sequence_parallel=self.sequence_parallel, From 5ed7e82c55a5adb03388c0854a36a449a21cad3b Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Fri, 28 Jul 2023 17:50:11 -0700 Subject: [PATCH 043/427] Add support for multi-query and grouped-query attention (#338) * add support for multi-query/grouped-query attention Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix lint Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * revert to flash-attn 1.0.6 and build 2.0.0.post1 manually in CI Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add keyword name for DPA input Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix fused attn tests Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix skipif for pytest Signed-off-by: Charlene Yang 
<8636796+cyanguwa@users.noreply.github.com> * Update transformer_engine/pytorch/attention.py Signed-off-by: Kirthi Shankar Sivamani * Update tests/pytorch/test_fused_attn.py Signed-off-by: Kirthi Shankar Sivamani * Fix TP and SP case Signed-off-by: Kirthi Shankar Sivamani * add skipifs for pytest Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove higher limit for flash-attn version Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- qa/L0_unittest/test.sh | 1 + setup.py | 2 +- tests/pytorch/test_fused_attn.py | 114 ++++++++++++++++++++++ tests/pytorch/test_numerics.py | 2 +- transformer_engine/pytorch/attention.py | 83 +++++++++++++--- transformer_engine/pytorch/transformer.py | 10 ++ 6 files changed, 195 insertions(+), 17 deletions(-) diff --git a/qa/L0_unittest/test.sh b/qa/L0_unittest/test.sh index d061b62453..f02ea1c6e8 100644 --- a/qa/L0_unittest/test.sh +++ b/qa/L0_unittest/test.sh @@ -11,3 +11,4 @@ pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py pytest -v -s $TE_PATH/tests/pytorch/test_jit.py +pytest -v -s $TE_PATH/tests/pytorch/test_fused_attn.py diff --git a/setup.py b/setup.py index ded19044fc..e42b6e01d0 100644 --- a/setup.py +++ b/setup.py @@ -290,7 +290,7 @@ def add_unique(l: List[str], vals: Union[str, List[str]]) -> None: # Framework-specific requirements if "pytorch" in frameworks(): - add_unique(install_reqs, ["torch", "flash-attn>=1.0.6, <=2.0.0.post1"]) + add_unique(install_reqs, ["torch", "flash-attn>=1.0.6"]) add_unique(test_reqs, ["numpy", "onnxruntime", "torchvision"]) if "jax" in frameworks(): if not found_pybind11(): diff --git 
a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index 1aa100672c..99a82eb6e1 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -8,11 +8,19 @@ from transformer_engine.pytorch.utils import ( init_method_normal, scaled_init_method_normal, + get_device_compute_capability, ) +from transformer_engine.pytorch.fp8 import is_fp8_available from transformer_engine.pytorch import TransformerLayer from transformer_engine.pytorch.attention import DotProductAttention import os +from pkg_resources import packaging +from importlib.metadata import version +fp8_available, reason_for_no_fp8 = is_fp8_available() +_flash_attn_version = packaging.version.Version(version("flash-attn")) +_flash_attn_2_available = _flash_attn_version >= packaging.version.Version("2") + class ModelConfig: def __init__( self, num_layers, hidden_size, num_attention_heads, head_dim, seq_len, @@ -45,6 +53,8 @@ def __init__( batch_sizes = [1, 2, 32] +@pytest.mark.skipif( + get_device_compute_capability() < 8.0, reason="Compute capability 8.0+ is required.") @pytest.mark.parametrize("dtype", param_types) @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @@ -113,6 +123,8 @@ def _run_dot_product_attention(dtype, bs, config, backend): return op, inp.grad +@pytest.mark.skipif( + get_device_compute_capability() < 8.0, reason="Compute capability 8.0+ is required.") @pytest.mark.parametrize("dtype", param_types) @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @@ -208,12 +220,114 @@ def _run_transformer_layer(dtype, bs, config, backend): return op, inp.grad +@pytest.mark.skipif(not _flash_attn_2_available, reason="FA2.0 is not available") +@pytest.mark.skipif( + get_device_compute_capability() < 8.0, reason="Compute capability 8.0+ is required.") +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) 
+@pytest.mark.parametrize("model", model_configs.keys()) +def test_transformer_layer_gqa(dtype, bs, model): + """Test TransformerLayer module when its DotProductAttention is enabled with + FlashAttention, FusedAttention, or UnfusedDotProductAttention backend""" + + config = model_configs[model] + def find_factors(x): + f = [] + for i in range(1, x + 1): + if x % i == 0: + f.append(i) + return f + + num_querys_per_gqa_group = find_factors(config.num_attention_heads) + + for num_q_per_gqa_group in num_querys_per_gqa_group: + flash_attn_fwd, flash_attn_bwd = _run_transformer_layer_gqa( + dtype, bs, config, "FlashAttention", num_q_per_gqa_group) + unfused_attn_fwd, unfused_attn_bwd = _run_transformer_layer_gqa( + dtype, bs, config, "UnfusedDotProductAttention", num_q_per_gqa_group) + + atol, rtol = 5e-1, 5e-1 + assert torch.allclose(flash_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) + assert torch.allclose(flash_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + +def _run_transformer_layer_gqa(dtype, bs, config, backend, num_querys_per_gqa_group): + + torch.manual_seed(1234) + torch.cuda.manual_seed(1234) + os.environ["NVTE_FLASH_ATTN"] = "0" + if backend == "FlashAttention": + os.environ["NVTE_FLASH_ATTN"] = "1" + + inp = 0.1 * torch.randn( + config.seq_len, bs, config.num_attention_heads * config.head_dim, + dtype = dtype).cuda() + inp.requires_grad=True + seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens.fill_(config.seq_len) + cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + op_grad = 0.001 * torch.randint(0, 200, ( + config.seq_len, bs, config.num_attention_heads * config.head_dim + ), dtype = dtype).cuda() + + sigma = 0.02 + init_method = init_method_normal(sigma) + output_layer_init_method = scaled_init_method_normal(sigma, config.num_layers) + + layer_number = 1 + drop_path_rate = 0.0 + drop_path_rates = [ + rate.item() for rate in torch.linspace(0, 
drop_path_rate, config.num_layers)] + + block = ( + TransformerLayer( + config.hidden_size, + 4 * config.hidden_size, + config.num_attention_heads, + num_gqa_groups = config.num_attention_heads / num_querys_per_gqa_group, + layernorm_epsilon = 1e-5, + hidden_dropout = 0.0, + attention_dropout = config.dropout_p, + init_method = init_method, + output_layer_init_method = output_layer_init_method, + layer_number = layer_number, + kv_channels = config.head_dim, + self_attn_mask_type = config.attn_mask_type, + tp_group = None, + tp_size = 1, + params_dtype = dtype, + get_rng_state_tracker = None, + fuse_wgrad_accumulation = False, + seq_length = config.seq_len, + micro_batch_size = bs, + sequence_parallel = False, + apply_residual_connection_post_layernorm = False, + output_layernorm = False, + layer_type = "encoder", + drop_path_rate = drop_path_rates[layer_number - 1], + set_parallel_mode = True, + fuse_qkv_params = True, + zero_centered_gamma = False, + qkv_weight_interleaved = False, + ub_tp_comm_overlap = False, + bias = True, + ) + .to(dtype = dtype) + .cuda() + ) + + op = block(inp) + op.backward(op_grad) + + return op, inp.grad + model_configs_fp8 = { "test1": ModelConfig(1, 1024, 16, 64, 512, 0.0, "no_mask"), } batch_sizes_fp8 = [1, 4] param_types_fp8 = [torch.float16] +@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) @pytest.mark.parametrize("dtype", param_types_fp8) @pytest.mark.parametrize("bs", batch_sizes_fp8) @pytest.mark.parametrize("model", model_configs_fp8.keys()) diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 2ed901cb20..143fc9a74d 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -805,7 +805,7 @@ def test_dpa_accuracy(dtype, bs, model): DotProductAttention( config.num_attention_heads, config.embed, - 0.1, # dropout + attention_dropout=0.1, # dropout ) .to(dtype=dtype) .cuda() diff --git a/transformer_engine/pytorch/attention.py 
b/transformer_engine/pytorch/attention.py index dd3f561c95..8966f261ed 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -180,6 +180,15 @@ def forward( key_layer.size(0), ) + assert key_layer.shape == value_layer.shape, "Keys and values must have the same shape!" + if key_layer.shape[2] != query_layer.shape[2]: + assert (query_layer.shape[2]%key_layer.shape[2]==0 + ),"The number of attention heads must be divisible by the number of GQA groups!" + key_layer = key_layer.repeat_interleave( + int(query_layer.shape[2]/key_layer.shape[2]), dim = 2) + value_layer = value_layer.repeat_interleave( + int(query_layer.shape[2]/value_layer.shape[2]), dim = 2) + # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.reshape( output_size[2], output_size[0] * output_size[1], -1 @@ -722,6 +731,14 @@ class DotProductAttention(torch.nn.Module): number of attention heads in the transformer layer. kv_channels : int number of key-value channels. + num_gqa_groups : Optional[int] = None + number of GQA groups in the transformer layer. + Grouped Query Attention is described in + `this paper `_. + This only affects the keys and values, not the queries. + GQA-1 is equivalent to Multi-Query Attention + (`MQA `_), while GQA-H + is equivalent to MHA, i.e. `num_gqa_groups = num_attention_heads`. attention_dropout: float, default = 0.0 dropout probability for the dropout op during multi-head attention. 
attn_mask_type: {'causal', 'padding'}, default = `causal` @@ -744,6 +761,7 @@ def __init__( self, num_attention_heads: int, kv_channels: int, + num_gqa_groups: Optional[int] = None, attention_dropout: float = 0.0, attn_mask_type: str = "causal", sequence_parallel: bool = False, @@ -758,12 +776,16 @@ def __init__( self.tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group) self.tp_group = tp_group self.get_rng_state_tracker = get_rng_state_tracker + self.num_attention_heads = num_attention_heads - projection_size = kv_channels * num_attention_heads - self.hidden_size_per_partition = divide(projection_size, self.tp_size) - self.hidden_size_per_attention_head = divide( - projection_size, num_attention_heads + self.hidden_size_per_attention_head = kv_channels + self.num_gqa_groups = ( + num_attention_heads if num_gqa_groups is None else num_gqa_groups ) + self.num_gqa_groups_per_partition = int(self.num_gqa_groups // tp_size) + + assert (num_attention_heads % self.num_gqa_groups == 0 + ), "The number of attention heads must be divisible by the number of GQA groups!" if sequence_parallel or get_rng_state_tracker is None: attention_dropout_ctx = nullcontext @@ -883,6 +905,10 @@ def forward( Whether to use the fast path to set output tensors to 0 or not. """ + assert (key_layer.shape[-2] == self.num_gqa_groups_per_partition + and value_layer.shape[-2] == self.num_gqa_groups_per_partition + ), f"Keys and values must have {self.num_gqa_groups} heads!" 
+ use_flash_attention = self.use_flash_attention use_fused_attention = self.use_fused_attention @@ -898,6 +924,9 @@ def forward( elif not _flash_attn_2_available and self.device_compute_capability == 8.9: use_flash_attention = False + if not _flash_attn_2_available and self.num_gqa_groups != self.num_attention_heads: + use_flash_attention = False + if self.attn_mask_type == "padding" and attention_mask is not None: use_flash_attention = False use_fused_attention = False @@ -919,7 +948,9 @@ def forward( # DPA does not support FP8; for FP8, use cpp_extensions modules directly is_backend_avail = (fused_attention_backend in [FusedAttnBackend["F16_max512_seqlen"], FusedAttnBackend["F16_arbitrary_seqlen"]]) - use_fused_attention = use_fused_attention and is_backend_avail + use_fused_attention = (use_fused_attention + and is_backend_avail + and self.num_gqa_groups == self.num_attention_heads) if use_flash_attention: if checkpoint_core_attention: @@ -974,6 +1005,7 @@ def __init__( attn_mask_type: str = "causal", tp_group: Optional[dist_group_type] = None, tp_size: int = 1, + num_gqa_groups: Optional[int] = None, fuse_wgrad_accumulation: bool = False, get_rng_state_tracker: Optional[Callable] = None, sequence_parallel: bool = False, @@ -1002,6 +1034,7 @@ def __init__( self.params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype self.init_method = init_method self.attn_mask_type = attn_mask_type + self.num_attention_heads = num_attention_heads if not fuse_qkv_params: qkv_weight_interleaved = False @@ -1017,6 +1050,15 @@ def __init__( self.hidden_size_per_attention_head = kv_channels self.num_attention_heads_per_partition = divide(num_attention_heads, tp_size) + self.num_gqa_groups = ( + num_attention_heads if num_gqa_groups is None else num_gqa_groups + ) + assert (num_attention_heads % self.num_gqa_groups == 0 + ), "The number of GQA groups must be divisible by the number of attention heads!" 
+ assert (num_attention_heads % tp_size == 0 + ), "The number of GQA groups must be divisible by tensor parallel size!" + self.num_gqa_groups_per_partition = int(self.num_gqa_groups // tp_size) + self.hidden_size_kv = int(hidden_size * self.num_gqa_groups // num_attention_heads) common_gemm_kwargs = { "fuse_wgrad_accumulation": fuse_wgrad_accumulation, @@ -1029,7 +1071,7 @@ def __init__( qkv_parallel_mode = "column" if set_parallel_mode else None - if self.attention_type == "self": + if self.attention_type == "self" and self.num_gqa_groups == self.num_attention_heads: if self.input_layernorm: self.layernorm_qkv = LayerNormLinear( hidden_size, @@ -1059,7 +1101,9 @@ def __init__( parameters_split=("query_", "key_", "value_") if not fuse_qkv_params else None, **common_gemm_kwargs, ) - else: + elif ((self.attention_type == "cross") + or (self.attention_type == "self" + and self.num_gqa_groups != self.num_attention_heads)): if self.input_layernorm: self.layernorm_query = LayerNormLinear( hidden_size, @@ -1089,7 +1133,7 @@ def __init__( ) self.key_value = Linear( hidden_size, - 2 * hidden_size, + 2 * self.hidden_size_kv, init_method=init_method, bias=bias, return_bias=False, @@ -1102,7 +1146,8 @@ def __init__( self.core_attention = DotProductAttention( num_attention_heads, kv_channels, - attention_dropout, + num_gqa_groups=self.num_gqa_groups, + attention_dropout=attention_dropout, tp_size=tp_size, get_rng_state_tracker=get_rng_state_tracker, attn_mask_type=attn_mask_type, @@ -1131,7 +1176,7 @@ def _allocate_memory( return torch.empty( inference_max_sequence_len, batch_size, - self.num_attention_heads_per_partition, + self.num_gqa_groups_per_partition, self.hidden_size_per_attention_head, dtype=dtype, device=torch.cuda.current_device(), @@ -1192,7 +1237,7 @@ def forward( # Query, Key, and Value # ===================== - if self.attention_type == "self": + if self.attention_type == "self" and self.num_gqa_groups == self.num_attention_heads: # Attention heads [sq, b, h] 
--> [sq, b, (np * 3 * hn)] if self.input_layernorm: layernorm_qkv_outputs = self.layernorm_qkv( @@ -1235,17 +1280,25 @@ def forward( query_layer, key_layer, value_layer = split_tensor_along_dim( mixed_x_layer, split_dim, 3 ) - else: + elif ((self.attention_type == "cross") + or (self.attention_type == "self" + and self.num_gqa_groups != self.num_attention_heads)): + + if self.attention_type == "cross": + input_tensor = encoder_output + else: + input_tensor = hidden_states + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer = self.key_value( - encoder_output, + input_tensor, is_first_microbatch=is_first_microbatch, ) if self.qkv_weight_interleaved: # [sq, b, (np * 2 * hn)] --> [sq, b, np, 2 * hn] new_tensor_shape = mixed_kv_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, + self.num_gqa_groups_per_partition, 2 * self.hidden_size_per_attention_head, ) # split along last dimension @@ -1253,7 +1306,7 @@ def forward( else: # [sq, b, (np * 2 * hn)] --> [sq, b, 2 * np, hn] new_tensor_shape = mixed_kv_layer.size()[:-1] + ( - 2 * self.num_attention_heads_per_partition, + 2 * self.num_gqa_groups_per_partition, self.hidden_size_per_attention_head, ) # split along second last dimension diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 7f1b9a7246..572b905dd8 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -86,6 +86,14 @@ class TransformerLayer(torch.nn.Module): intermediate size to which input samples are projected. num_attention_heads : int number of attention heads in the transformer layer. + num_gqa_groups : int, default = `None` + number of GQA groups in the transformer layer. + Grouped Query Attention is described in + `this paper `_. + This only affects the keys and values, not the querys. + GQA-1 is equivalent to Multi-Query Attention + (`MQA `_), while GQA-H + is equivalent to MHA, i.e. 
`num_gqa_groups = num_attention_heads`. layernorm_epsilon : float, default = 1e-5 a value added to the denominator of layer normalization for numerical stability. @@ -194,6 +202,7 @@ def __init__( hidden_size: int, ffn_hidden_size: int, num_attention_heads: int, + num_gqa_groups: Optional[int] = None, layernorm_epsilon: float = 1e-5, hidden_dropout: float = 0.1, attention_dropout: float = 0.1, @@ -293,6 +302,7 @@ def __init__( "layer_number": layer_number, "tp_group": tp_group, "tp_size": self.tp_size, + "num_gqa_groups": num_gqa_groups, "fuse_wgrad_accumulation": fuse_wgrad_accumulation, "get_rng_state_tracker": get_rng_state_tracker, "sequence_parallel": self.sequence_parallel, From 9347b10ad9bb1faa289d92920fc0d889efeec177 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Mon, 31 Jul 2023 15:12:41 -0700 Subject: [PATCH 044/427] Add compilation OOM note for FA 2.0 (#346) Add compilation warning for FA 2.0 Signed-off-by: Kirthi Shankar Sivamani --- README.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.rst b/README.rst index d892eae244..5920e36e5c 100644 --- a/README.rst +++ b/README.rst @@ -191,6 +191,14 @@ From source `See the installation guide `_. +Compiling with Flash Attention 2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +TransformerEngine release v0.11.0 adds support for Flash Attention 2.0 for improved performance. It is a known issue that Flash Attention 2.0 compilation is +resource intensive and requires a large amount of RAM (see `bug `_), which may lead to out of memory +errors during the installation of TransformerEngine. To circumvent the issue, please try setting **MAX_JOBS=1** in the environment. If the errors persist, then +proceed to install a supported version of Flash Attention 1 (v1.0.6 to v1.0.9). 
+ Model Support ---------- From 3f01b4f812e0e501257278ec269499ea02b2d4f3 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Wed, 19 Jul 2023 21:40:44 -0700 Subject: [PATCH 045/427] Replace deprecated sharding API in JAX test (#332) Replace deprecated sharding API Signed-off-by: Tim Moon Co-authored-by: Kirthi Shankar Sivamani --- tests/jax/test_sharding.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/jax/test_sharding.py b/tests/jax/test_sharding.py index 217af3f816..ea216ac514 100644 --- a/tests/jax/test_sharding.py +++ b/tests/jax/test_sharding.py @@ -5,7 +5,6 @@ import jax import numpy as np import pytest -from jax.experimental import maps from utils import is_devices_enough from transformer_engine.jax.flax import extend_logical_axis_rules @@ -79,7 +78,7 @@ def test_infer_major_sharding_type( sharding_type): devices = np.asarray(jax.devices()[:DEVICE_COUNT]).reshape(*mesh_shape) with global_shard_guard(_get_sharding_resource(mesh_names, sharding_type)): - with maps.Mesh(devices, mesh_names): + with jax.sharding.Mesh(devices, mesh_names): assert infer_major_sharding_type() is sharding_type.value[0] @pytest.mark.parametrize('mesh_shape,mesh_names,sharding_type', MESH_CONFIG) @@ -150,7 +149,7 @@ def get_ref_sm(): devices = np.asarray(jax.devices()[:DEVICE_COUNT]).reshape(*mesh_shape) with global_shard_guard(_get_sharding_resource(mesh_names, sharding_type)): - with maps.Mesh(devices, mesh_names): + with jax.sharding.Mesh(devices, mesh_names): test_sm = get_fp8_meta_sharding_meta( sharding_type, num_of_fp8_meta, @@ -240,7 +239,7 @@ def get_ref_sm(): devices = np.asarray(jax.devices()[:DEVICE_COUNT]).reshape(*mesh_shape) with global_shard_guard(_get_sharding_resource(mesh_names, sharding_type)): - with maps.Mesh(devices, mesh_names): + with jax.sharding.Mesh(devices, mesh_names): test_sm = get_dot_sharding_meta( sharding_type, a_shape, @@ -319,7 +318,7 @@ def get_ref_sm(): devices = 
np.asarray(jax.devices()[:DEVICE_COUNT]).reshape(*mesh_shape) with global_shard_guard(_get_sharding_resource(mesh_names, sharding_type)): - with maps.Mesh(devices, mesh_names): + with jax.sharding.Mesh(devices, mesh_names): ref_sm, need_assert = get_ref_sm() try: test_sm = get_elementwise_sharding_meta( From 9799608b50c30989cdc75468dd76b4bebed8738e Mon Sep 17 00:00:00 2001 From: Shijie Date: Fri, 18 Aug 2023 07:07:10 +0800 Subject: [PATCH 046/427] [Paddle] Add nn layer (#361) * Add nn.layer: softmax, attention, transformer Signed-off-by: Shijie Wang * code refactor Signed-off-by: Shijie Wang * code refactor Signed-off-by: Shijie Wang * update docs and set dropout=0.1 Signed-off-by: Shijie Wang * Update transformer_engine/paddle/layer/attention.py Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Shijie Wang Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- tests/paddle/test_layers.py | 490 +++++++++++++++ tests/paddle/test_operators.py | 8 +- transformer_engine/paddle/__init__.py | 3 +- transformer_engine/paddle/constants.py | 6 + transformer_engine/paddle/cpp_extensions.py | 8 +- transformer_engine/paddle/layer/__init__.py | 3 + transformer_engine/paddle/layer/attention.py | 568 ++++++++++++++++++ transformer_engine/paddle/layer/layernorm.py | 2 +- .../paddle/layer/layernorm_linear.py | 3 +- .../paddle/layer/layernorm_mlp.py | 2 +- transformer_engine/paddle/layer/softmax.py | 237 ++++++++ .../paddle/layer/transformer.py | 260 ++++++++ transformer_engine/paddle/utils.py | 34 ++ 13 files changed, 1610 insertions(+), 14 deletions(-) create mode 100644 transformer_engine/paddle/layer/attention.py create mode 100644 transformer_engine/paddle/layer/softmax.py create mode 100644 transformer_engine/paddle/layer/transformer.py diff --git a/tests/paddle/test_layers.py b/tests/paddle/test_layers.py index 3bd3a562db..171b9233e7 100644 --- a/tests/paddle/test_layers.py +++ b/tests/paddle/test_layers.py @@ -3,6 +3,7 @@ # See 
LICENSE for license information. """Test TE Paddle Layer-level APIs""" +import math import os import pytest from utils import assert_allclose @@ -605,3 +606,492 @@ def test_layernorm_mlp_fp8(bs, hidden_size, ffn_hidden_size, has_bias, no_dbias, if do_calibration: assert paddle.count_nonzero(layer_te.fp8_meta["scaling_fwd"].amax_history).item() > 0 + + +@pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), + reason="cuDNN fMHA requires Ampere+ GPU") +@pytest.mark.parametrize('bs', [1, 2, 8]) +@pytest.mark.parametrize('hidden_size, num_heads', [[1024, 16], [768, 12]]) +@pytest.mark.parametrize('q_seqlen, kv_seqlen', [[128, 128], [512, 512]]) +@pytest.mark.parametrize('attn_type', ['self', 'cross']) +@pytest.mark.parametrize('mask_type', ['causal', 'padding']) +@pytest.mark.parametrize('math_dtype', ['bfloat16', 'float16']) +def test_dot_product_attention(bs, hidden_size, num_heads, q_seqlen, kv_seqlen, attn_type, + mask_type, math_dtype): + """ + Test DotProductAttention Layer + """ + paddle.set_default_dtype(math_dtype) + rtol = 1e-4 + atol = 2e-2 + + head_size = hidden_size // num_heads + self_attn_qkv_input = paddle.normal(mean=0.0, + std=0.02, + shape=(bs, q_seqlen, 3, num_heads, + head_size)).astype(math_dtype) + cross_attn_q_input = paddle.normal(mean=0.0, + std=0.02, + shape=(bs, q_seqlen, num_heads, + head_size)).astype(math_dtype) + cross_attn_kv_input = paddle.normal(mean=0.0, + std=0.02, + shape=(bs, kv_seqlen, 2, num_heads, + head_size)).astype(math_dtype) + + q_actual_seqlen = paddle.randint(low=20, high=q_seqlen, shape=(bs,), dtype='int32') + kv_actual_seqlen = paddle.randint(low=20, high=kv_seqlen, shape=(bs,), + dtype='int32') if attn_type == 'cross' else q_actual_seqlen + attn_mask = paddle.ones(shape=(bs, 1, q_seqlen, kv_seqlen), dtype='bool') + + grad_out = paddle.normal(mean=0.0, std=0.02, + shape=(bs, q_seqlen, num_heads, head_size)).astype('float32') + for i in range(0, bs): + grad_out[i, q_actual_seqlen[i]:, :, :] = 0 + 
grad_out = grad_out.astype(math_dtype) + + for i in range(0, bs): + attn_mask[i, 0, 0:q_actual_seqlen[i], 0:kv_actual_seqlen[i]] = False + + norm_factor = math.sqrt(hidden_size // num_heads) + layer_te = te.DotProductAttention(norm_factor, + attention_dropout=0.0, + attn_mask_type=mask_type, + attention_type=attn_type, + backend='transformer_engine') + layer_pd = te.DotProductAttention(norm_factor, + attention_dropout=0.0, + attn_mask_type=mask_type, + attention_type=attn_type, + backend='paddle') + + def calc_attn_output_and_grad(layer, q, kv, mask, dout): + _q = paddle.to_tensor(q, stop_gradient=False) + _kv = paddle.to_tensor(kv, stop_gradient=False) if kv is not None else None + + out = layer(_q, _kv, mask) + out.backward(dout) + return out, _q.grad, _kv.grad if _kv is not None else None + + if attn_type == 'self': + out, qkv_grad, _ = calc_attn_output_and_grad(layer_te, self_attn_qkv_input, None, attn_mask, + grad_out) + out_ref, qkv_grad_ref, _ = calc_attn_output_and_grad(layer_pd, self_attn_qkv_input, None, + attn_mask, grad_out) + valid_out_ref = paddle.full_like(out_ref, 0) + for i in range(0, bs): + valid_out_ref[i, 0:q_actual_seqlen[i], :, :] = out_ref[i, 0:q_actual_seqlen[i], :, :] + + q_grad = qkv_grad[:, :, 0] + k_grad = qkv_grad[:, :, 1] + v_grad = qkv_grad[:, :, 2] + q_grad_ref = qkv_grad_ref[:, :, 0] + k_grad_ref = qkv_grad_ref[:, :, 1] + v_grad_ref = qkv_grad_ref[:, :, 2] + + else: + out, q_grad, kv_grad = calc_attn_output_and_grad(layer_te, cross_attn_q_input, + cross_attn_kv_input, attn_mask, grad_out) + out_ref, q_grad_ref, kv_grad_ref = calc_attn_output_and_grad(layer_pd, cross_attn_q_input, + cross_attn_kv_input, attn_mask, + grad_out) + + valid_out_ref = paddle.full_like(out_ref, 0) + for i in range(0, bs): + valid_out_ref[i, 0:q_actual_seqlen[i], :, :] = out_ref[i, 0:q_actual_seqlen[i], :, :] + + k_grad = kv_grad[:, :, 0] + v_grad = kv_grad[:, :, 1] + k_grad_ref = kv_grad_ref[:, :, 0] + v_grad_ref = kv_grad_ref[:, :, 1] + + valid_q_grad_ref 
= paddle.full_like(q_grad_ref, 0) + valid_k_grad_ref = paddle.full_like(k_grad_ref, 0) + valid_v_grad_ref = paddle.full_like(v_grad_ref, 0) + for i in range(0, bs): + valid_q_grad_ref[i, 0:q_actual_seqlen[i], :, :] = q_grad_ref[i, 0:q_actual_seqlen[i], :, :] + valid_k_grad_ref[i, 0:kv_actual_seqlen[i], :, :] = k_grad_ref[i, + 0:kv_actual_seqlen[i], :, :] + valid_v_grad_ref[i, 0:kv_actual_seqlen[i], :, :] = v_grad_ref[i, + 0:kv_actual_seqlen[i], :, :] + + assert_allclose(out, valid_out_ref, rtol=rtol, atol=atol) + assert_allclose(q_grad, valid_q_grad_ref, rtol=rtol, atol=atol) + assert_allclose(k_grad, valid_k_grad_ref, rtol=rtol, atol=atol) + assert_allclose(v_grad, valid_v_grad_ref, rtol=rtol, atol=atol) + + +@pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), + reason="cuDNN fMHA requires Ampere+ GPU") +@pytest.mark.parametrize('bs', [1, 2, 8]) +@pytest.mark.parametrize('hidden_size, num_heads, ffn_hidden_size', [[1024, 16, 4096]]) +@pytest.mark.parametrize('q_seqlen, kv_seqlen', [[128, 128], [512, 512]]) +@pytest.mark.parametrize('has_bias, no_dbias', [[False, True], [True, True], [True, False]]) +@pytest.mark.parametrize('no_wgrad', [True, False]) +@pytest.mark.parametrize('mask_type', ['causal', 'padding']) +@pytest.mark.parametrize('math_dtype', ['bfloat16', 'float16']) +@pytest.mark.parametrize('output_layernorm', [True, False]) +@pytest.mark.parametrize('return_layernorm_output', [True, False]) +def test_transformer_encoder_layer(bs, hidden_size, num_heads, ffn_hidden_size, has_bias, no_dbias, + no_wgrad, q_seqlen, kv_seqlen, mask_type, math_dtype, + output_layernorm, return_layernorm_output): + """ + Test Transformer Encoder Layer + """ + paddle.set_default_dtype(math_dtype) + rtol = 5e-2 + atol = 5e-2 + eps = 1e-3 + + encoder_input = paddle.uniform(shape=(bs, q_seqlen, hidden_size), dtype=math_dtype) + + q_actual_seqlen = paddle.ones(shape=(bs,), dtype='int32') * q_seqlen + kv_actual_seqlen = q_actual_seqlen + attn_mask = 
paddle.ones(shape=(bs, 1, q_seqlen, kv_seqlen), dtype='bool') + + grad_out = paddle.normal(mean=0.0, std=0.02, + shape=(bs, q_seqlen, hidden_size)).astype('float32') + for i in range(0, bs): + grad_out[i, q_actual_seqlen[i]:, :] = 0 + grad_out = grad_out.astype(math_dtype) + + for i in range(0, bs): + attn_mask[i, 0, 0:q_actual_seqlen[i], 0:kv_actual_seqlen[i]] = False + + layer_te = te.TransformerLayer(hidden_size, + ffn_hidden_size, + num_heads, + layernorm_epsilon=eps, + hidden_dropout=0.0, + attention_dropout=0.0, + weight_attr=None, + bias_attr=None if has_bias else False, + self_attn_mask_type=mask_type, + apply_residual_connection_post_layernorm=return_layernorm_output, + output_layernorm=output_layernorm, + layer_type='encoder', + backend='transformer_engine') + layer_pd = te.TransformerLayer(hidden_size, + ffn_hidden_size, + num_heads, + layernorm_epsilon=eps, + hidden_dropout=0.0, + attention_dropout=0.0, + weight_attr=None, + bias_attr=None if has_bias else False, + self_attn_mask_type=mask_type, + apply_residual_connection_post_layernorm=return_layernorm_output, + output_layernorm=output_layernorm, + layer_type='encoder', + backend='paddle') + + # MultiHeadAttention params + if output_layernorm: + layer_pd.self_attention.qkv.weight.copy_(layer_te.self_attention.qkv.weight.T, True) + layer_pd.self_attention.qkv.weight.stop_gradient = no_wgrad + layer_te.self_attention.qkv.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.self_attention.qkv.bias.copy_(layer_te.self_attention.qkv.bias, True) + layer_pd.self_attention.qkv.bias.stop_gradient = no_dbias + layer_te.self_attention.qkv.bias.stop_gradient = no_dbias + else: + layer_pd.self_attention.layernorm_qkv.ln_weight.copy_( + layer_te.self_attention.layernorm_qkv.ln_weight, True) + layer_pd.self_attention.layernorm_qkv.ln_bias.copy_( + layer_te.self_attention.layernorm_qkv.ln_bias, True) + layer_pd.self_attention.layernorm_qkv.weight.copy_( + layer_te.self_attention.layernorm_qkv.weight.T, True) + 
layer_pd.self_attention.layernorm_qkv.ln_weight.stop_gradient = no_wgrad + layer_pd.self_attention.layernorm_qkv.ln_bias.stop_gradient = no_dbias + layer_pd.self_attention.layernorm_qkv.weight.stop_gradient = no_wgrad + layer_te.self_attention.layernorm_qkv.ln_weight.stop_gradient = no_wgrad + layer_te.self_attention.layernorm_qkv.ln_bias.stop_gradient = no_dbias + layer_te.self_attention.layernorm_qkv.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.self_attention.layernorm_qkv.bias.copy_( + layer_te.self_attention.layernorm_qkv.bias, True) + layer_pd.self_attention.layernorm_qkv.bias.stop_gradient = no_dbias + layer_te.self_attention.layernorm_qkv.bias.stop_gradient = no_dbias + + layer_pd.self_attention.proj.weight.copy_(layer_te.self_attention.proj.weight.T, True) + layer_pd.self_attention.proj.weight.stop_gradient = no_wgrad + layer_te.self_attention.proj.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.self_attention.proj.bias.copy_(layer_te.self_attention.proj.bias, True) + layer_pd.self_attention.proj.bias.stop_gradient = no_dbias + layer_te.self_attention.proj.bias.stop_gradient = no_dbias + + # LayerNorm MLP params + layer_pd.layernorm_mlp.ln_weight.copy_(layer_te.layernorm_mlp.ln_weight, True) + layer_pd.layernorm_mlp.ln_bias.copy_(layer_te.layernorm_mlp.ln_bias, True) + layer_pd.layernorm_mlp.fc1_weight.copy_(layer_te.layernorm_mlp.fc1_weight.T, True) + layer_pd.layernorm_mlp.fc2_weight.copy_(layer_te.layernorm_mlp.fc2_weight.T, True) + layer_pd.layernorm_mlp.ln_weight.stop_gradient = no_wgrad + layer_pd.layernorm_mlp.ln_bias.stop_gradient = no_dbias + layer_pd.layernorm_mlp.fc1_weight.stop_gradient = no_wgrad + layer_pd.layernorm_mlp.fc2_weight.stop_gradient = no_wgrad + layer_te.layernorm_mlp.ln_weight.stop_gradient = no_wgrad + layer_te.layernorm_mlp.ln_bias.stop_gradient = no_dbias + layer_te.layernorm_mlp.fc1_weight.stop_gradient = no_wgrad + layer_te.layernorm_mlp.fc2_weight.stop_gradient = no_wgrad + if has_bias: + 
layer_pd.layernorm_mlp.fc1_bias.copy_(layer_te.layernorm_mlp.fc1_bias, True) + layer_pd.layernorm_mlp.fc2_bias.copy_(layer_te.layernorm_mlp.fc2_bias, True) + layer_pd.layernorm_mlp.fc1_bias.stop_gradient = no_dbias + layer_pd.layernorm_mlp.fc2_bias.stop_gradient = no_dbias + layer_te.layernorm_mlp.fc1_bias.stop_gradient = no_dbias + layer_te.layernorm_mlp.fc2_bias.stop_gradient = no_dbias + + if output_layernorm: + layer_pd.layernorm.weight.copy_(layer_te.layernorm.weight, True) + layer_pd.layernorm.bias.copy_(layer_te.layernorm.bias, True) + layer_pd.layernorm.weight.stop_gradient = no_wgrad + layer_pd.layernorm.bias.stop_gradient = no_dbias + layer_te.layernorm.weight.stop_gradient = no_wgrad + layer_te.layernorm.bias.stop_gradient = no_dbias + + def calc_transformer_output_and_grad(layer, encoder_input, mask, dout): + _encoder_input = paddle.to_tensor(encoder_input, stop_gradient=False) + out = layer(_encoder_input, mask) + out.backward(dout) + return out, _encoder_input.grad + + out_ref, grad_input_ref = calc_transformer_output_and_grad(layer_pd, encoder_input, attn_mask, + grad_out) + out, grad_input = calc_transformer_output_and_grad(layer_te, encoder_input, attn_mask, grad_out) + + assert_allclose(out, out_ref, rtol=rtol, atol=atol) + assert_allclose(grad_input, grad_input_ref, rtol=rtol, atol=atol) + if not no_wgrad: + if output_layernorm: + assert_allclose(layer_te.self_attention.qkv.weight.grad, + layer_pd.self_attention.qkv.weight.grad.T, + rtol=rtol, + atol=atol) + else: + assert_allclose(layer_te.self_attention.layernorm_qkv.weight.grad, + layer_pd.self_attention.layernorm_qkv.weight.grad.T, + rtol=rtol, + atol=atol) + if not no_dbias: + if output_layernorm: + assert_allclose(layer_te.self_attention.qkv.bias.grad, + layer_pd.self_attention.qkv.bias.grad, + rtol=0.01, + atol=0.5) + else: + assert_allclose(layer_te.self_attention.layernorm_qkv.bias.grad, + layer_pd.self_attention.layernorm_qkv.bias.grad, + rtol=0.01, + atol=0.5) + + 
+@pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), + reason="cuDNN fMHA requires Ampere+ GPU") +@pytest.mark.parametrize('bs', [1, 2, 8]) +@pytest.mark.parametrize('hidden_size, num_heads, ffn_hidden_size', [[1024, 16, 4096]]) +@pytest.mark.parametrize('q_seqlen, kv_seqlen', [[128, 128], [512, 512]]) +@pytest.mark.parametrize('has_bias, no_dbias', [[False, True], [True, True], [True, False]]) +@pytest.mark.parametrize('no_wgrad', [True, False]) +@pytest.mark.parametrize('mask_type', ['causal', 'padding']) +@pytest.mark.parametrize('math_dtype', ['bfloat16', 'float16']) +@pytest.mark.parametrize('output_layernorm', [True, False]) +@pytest.mark.parametrize('return_layernorm_output', [True, False]) +def test_transformer_decoder_layer(bs, hidden_size, num_heads, ffn_hidden_size, has_bias, no_dbias, + no_wgrad, q_seqlen, kv_seqlen, mask_type, math_dtype, + output_layernorm, return_layernorm_output): + """ + Test Transformer Decoder Layer + """ + paddle.set_default_dtype(math_dtype) + rtol = 5e-2 + atol = 5e-2 + eps = 1e-3 + + encoder_input = paddle.uniform(shape=(bs, q_seqlen, hidden_size), dtype=math_dtype) + encoder_output = paddle.uniform(shape=(bs, kv_seqlen, hidden_size), dtype=math_dtype) + + q_actual_seqlen = paddle.ones(shape=(bs,), dtype='int32') * q_seqlen + kv_actual_seqlen = q_actual_seqlen + attn_mask = paddle.ones(shape=(bs, 1, q_seqlen, kv_seqlen), dtype='bool') + + grad_out = paddle.normal(mean=0.0, std=0.2, shape=(bs, q_seqlen, hidden_size)).astype('float32') + for i in range(0, bs): + grad_out[i, q_actual_seqlen[i]:, :] = 0 + grad_out = grad_out.astype(math_dtype) + + for i in range(0, bs): + attn_mask[i, 0, 0:q_actual_seqlen[i], 0:kv_actual_seqlen[i]] = False + + layer_te = te.TransformerLayer(hidden_size, + ffn_hidden_size, + num_heads, + layernorm_epsilon=eps, + hidden_dropout=0.0, + attention_dropout=0.0, + weight_attr=None, + bias_attr=None if has_bias else False, + self_attn_mask_type=mask_type, + 
apply_residual_connection_post_layernorm=return_layernorm_output, + output_layernorm=output_layernorm, + layer_type='decoder', + backend='transformer_engine') + layer_pd = te.TransformerLayer(hidden_size, + ffn_hidden_size, + num_heads, + layernorm_epsilon=eps, + hidden_dropout=0.0, + attention_dropout=0.0, + weight_attr=None, + bias_attr=None if has_bias else False, + self_attn_mask_type=mask_type, + apply_residual_connection_post_layernorm=return_layernorm_output, + output_layernorm=output_layernorm, + layer_type='decoder', + backend='paddle') + + # MultiHeadAttention params - self attn + if output_layernorm: + layer_pd.self_attention.qkv.weight.copy_(layer_te.self_attention.qkv.weight.T, True) + layer_pd.self_attention.qkv.weight.stop_gradient = no_wgrad + layer_te.self_attention.qkv.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.self_attention.qkv.bias.copy_(layer_te.self_attention.qkv.bias, True) + layer_pd.self_attention.qkv.bias.stop_gradient = no_dbias + layer_te.self_attention.qkv.bias.stop_gradient = no_dbias + else: + layer_pd.self_attention.layernorm_qkv.ln_weight.copy_( + layer_te.self_attention.layernorm_qkv.ln_weight, True) + layer_pd.self_attention.layernorm_qkv.ln_bias.copy_( + layer_te.self_attention.layernorm_qkv.ln_bias, True) + layer_pd.self_attention.layernorm_qkv.weight.copy_( + layer_te.self_attention.layernorm_qkv.weight.T, True) + layer_pd.self_attention.layernorm_qkv.ln_weight.stop_gradient = no_wgrad + layer_pd.self_attention.layernorm_qkv.ln_bias.stop_gradient = no_dbias + layer_pd.self_attention.layernorm_qkv.weight.stop_gradient = no_wgrad + layer_te.self_attention.layernorm_qkv.ln_weight.stop_gradient = no_wgrad + layer_te.self_attention.layernorm_qkv.ln_bias.stop_gradient = no_dbias + layer_te.self_attention.layernorm_qkv.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.self_attention.layernorm_qkv.bias.copy_( + layer_te.self_attention.layernorm_qkv.bias, True) + 
layer_pd.self_attention.layernorm_qkv.bias.stop_gradient = no_dbias + layer_te.self_attention.layernorm_qkv.bias.stop_gradient = no_dbias + + layer_pd.self_attention.proj.weight.copy_(layer_te.self_attention.proj.weight.T, True) + layer_pd.self_attention.proj.weight.stop_gradient = no_wgrad + layer_te.self_attention.proj.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.self_attention.proj.bias.copy_(layer_te.self_attention.proj.bias, True) + layer_pd.self_attention.proj.bias.stop_gradient = no_dbias + layer_te.self_attention.proj.bias.stop_gradient = no_dbias + + # MultiHeadAttention params - cross attn + layer_pd.inter_attention.layernorm_query.ln_weight.copy_( + layer_te.inter_attention.layernorm_query.ln_weight, True) + layer_pd.inter_attention.layernorm_query.ln_bias.copy_( + layer_te.inter_attention.layernorm_query.ln_bias, True) + layer_pd.inter_attention.layernorm_query.weight.copy_( + layer_te.inter_attention.layernorm_query.weight.T, True) + layer_pd.inter_attention.layernorm_query.ln_weight.stop_gradient = no_wgrad + layer_pd.inter_attention.layernorm_query.ln_bias.stop_gradient = no_dbias + layer_pd.inter_attention.layernorm_query.weight.stop_gradient = no_wgrad + layer_te.inter_attention.layernorm_query.ln_weight.stop_gradient = no_wgrad + layer_te.inter_attention.layernorm_query.ln_bias.stop_gradient = no_dbias + layer_te.inter_attention.layernorm_query.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.inter_attention.layernorm_query.bias.copy_( + layer_te.inter_attention.layernorm_query.bias, True) + layer_pd.inter_attention.layernorm_query.bias.stop_gradient = no_dbias + layer_te.inter_attention.layernorm_query.bias.stop_gradient = no_dbias + + layer_pd.inter_attention.key_value.weight.copy_(layer_te.inter_attention.key_value.weight.T, + True) + layer_pd.inter_attention.key_value.weight.stop_gradient = no_wgrad + layer_te.inter_attention.key_value.weight.stop_gradient = no_wgrad + 
layer_pd.inter_attention.proj.weight.copy_(layer_te.inter_attention.proj.weight.T, True) + layer_pd.inter_attention.proj.weight.stop_gradient = no_wgrad + layer_te.inter_attention.proj.weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.inter_attention.key_value.bias.copy_(layer_te.inter_attention.key_value.bias, True) + layer_pd.inter_attention.key_value.bias.stop_gradient = no_dbias + layer_te.inter_attention.key_value.bias.stop_gradient = no_dbias + layer_pd.inter_attention.proj.bias.copy_(layer_te.inter_attention.proj.bias, True) + layer_pd.inter_attention.proj.bias.stop_gradient = no_dbias + layer_te.inter_attention.proj.bias.stop_gradient = no_dbias + + # LayerNorm MLP params + layer_pd.layernorm_mlp.ln_weight.copy_(layer_te.layernorm_mlp.ln_weight, True) + layer_pd.layernorm_mlp.ln_bias.copy_(layer_te.layernorm_mlp.ln_bias, True) + layer_pd.layernorm_mlp.fc1_weight.copy_(layer_te.layernorm_mlp.fc1_weight.T, True) + layer_pd.layernorm_mlp.fc2_weight.copy_(layer_te.layernorm_mlp.fc2_weight.T, True) + layer_pd.layernorm_mlp.ln_weight.stop_gradient = no_wgrad + layer_pd.layernorm_mlp.ln_bias.stop_gradient = no_dbias + layer_pd.layernorm_mlp.fc1_weight.stop_gradient = no_wgrad + layer_pd.layernorm_mlp.fc2_weight.stop_gradient = no_wgrad + layer_te.layernorm_mlp.ln_weight.stop_gradient = no_wgrad + layer_te.layernorm_mlp.ln_bias.stop_gradient = no_dbias + layer_te.layernorm_mlp.fc1_weight.stop_gradient = no_wgrad + layer_te.layernorm_mlp.fc2_weight.stop_gradient = no_wgrad + if has_bias: + layer_pd.layernorm_mlp.fc1_bias.copy_(layer_te.layernorm_mlp.fc1_bias, True) + layer_pd.layernorm_mlp.fc2_bias.copy_(layer_te.layernorm_mlp.fc2_bias, True) + layer_pd.layernorm_mlp.fc1_bias.stop_gradient = no_dbias + layer_pd.layernorm_mlp.fc2_bias.stop_gradient = no_dbias + layer_te.layernorm_mlp.fc1_bias.stop_gradient = no_dbias + layer_te.layernorm_mlp.fc2_bias.stop_gradient = no_dbias + + if output_layernorm: + layer_pd.layernorm.weight.copy_(layer_te.layernorm.weight, 
True) + layer_pd.layernorm.bias.copy_(layer_te.layernorm.bias, True) + layer_pd.layernorm.weight.stop_gradient = no_wgrad + layer_pd.layernorm.bias.stop_gradient = no_dbias + layer_te.layernorm.weight.stop_gradient = no_wgrad + layer_te.layernorm.bias.stop_gradient = no_dbias + + def calc_transformer_output_and_grad(layer, encoder_input, mask, encoder_output, + enc_dec_attn_mask, dout): + _encoder_input = paddle.to_tensor(encoder_input, stop_gradient=False) + _encoder_output = paddle.to_tensor(encoder_output, stop_gradient=False) + out = layer(_encoder_input, mask, _encoder_output, enc_dec_attn_mask) + out.backward(dout) + return out, _encoder_input.grad, _encoder_output.grad + + out_ref, grad_encoder_input_ref, grad_encoder_output_ref = calc_transformer_output_and_grad( + layer_pd, encoder_input, attn_mask, encoder_output, attn_mask, grad_out) + out, grad_encoder_input, grad_encoder_output = calc_transformer_output_and_grad( + layer_te, encoder_input, attn_mask, encoder_output, attn_mask, grad_out) + + assert_allclose(out, out_ref, rtol=rtol, atol=atol) + assert_allclose(grad_encoder_input, grad_encoder_input_ref, rtol=rtol, atol=atol) + assert_allclose(grad_encoder_output, grad_encoder_output_ref, rtol=rtol, atol=atol) + if not no_wgrad: + if output_layernorm: + assert_allclose(layer_te.self_attention.qkv.weight.grad, + layer_pd.self_attention.qkv.weight.grad.T, + rtol=rtol, + atol=atol) + else: + assert_allclose(layer_te.self_attention.layernorm_qkv.weight.grad, + layer_pd.self_attention.layernorm_qkv.weight.grad.T, + rtol=rtol, + atol=0.1) + assert_allclose(layer_te.inter_attention.layernorm_query.weight.grad, + layer_pd.inter_attention.layernorm_query.weight.grad.T, + rtol=rtol, + atol=atol) + if not no_dbias: + if output_layernorm: + assert_allclose(layer_te.self_attention.qkv.bias.grad, + layer_pd.self_attention.qkv.bias.grad, + rtol=0.01, + atol=0.5) + else: + assert_allclose(layer_te.self_attention.layernorm_qkv.bias.grad, + 
layer_pd.self_attention.layernorm_qkv.bias.grad, + rtol=0.01, + atol=0.5) + assert_allclose(layer_te.inter_attention.layernorm_query.bias.grad, + layer_pd.inter_attention.layernorm_query.bias.grad, + rtol=rtol, + atol=atol) diff --git a/tests/paddle/test_operators.py b/tests/paddle/test_operators.py index c2769ee2bc..662978086a 100644 --- a/tests/paddle/test_operators.py +++ b/tests/paddle/test_operators.py @@ -46,7 +46,7 @@ from transformer_engine.common.recipe import DelayedScaling np.random.seed(10) -paddle.seed(10) +paddle.seed(11) GEMM_CASES = [(256, 256, 512), (32, 32, 32), (16384, 1024, 2816), (16384, 2816, 1024), (16384, 1024, 1024)] is_fp8_supported, reason = is_fp8_available() @@ -400,7 +400,7 @@ def test_layernorm_fwd(self): y_ref, mu_ref, rsigma_ref = self.calc_fwd_ref(x, eps, gamma, beta) - assert_allclose(y, y_ref, rtol=1e-5, atol=1e-5) + assert_allclose(y, y_ref, rtol=1e-4, atol=1e-4) assert_allclose(mu, mu_ref, rtol=1e-3, atol=1e-3) assert_allclose(rsigma, rsigma_ref, rtol=5e-2, atol=5e-2) @@ -725,10 +725,8 @@ def _get_fused_attention_out(self): q_grad = dq k_grad = dkv[:, :, 0, :, :] v_grad = dkv[:, :, 1, :, :] - fwd_out = paddle.reshape( - out, shape=[self.batch_size, self.q_seqlen, self.num_heads, self.head_size]) - return fwd_out, q_grad, k_grad, v_grad + return out, q_grad, k_grad, v_grad @pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), reason="cuDNN fMHA requires Ampere+ GPU") diff --git a/transformer_engine/paddle/__init__.py b/transformer_engine/paddle/__init__.py index 798ebb0527..6184c566d1 100644 --- a/transformer_engine/paddle/__init__.py +++ b/transformer_engine/paddle/__init__.py @@ -3,5 +3,6 @@ # See LICENSE for license information. 
"""Transformer Engine bindings for Paddle""" -from .layer import Linear, LayerNorm, LayerNormLinear, LayerNormMLP from .fp8 import fp8_autocast +from .layer import (Linear, LayerNorm, LayerNormLinear, LayerNormMLP, FusedScaleMaskSoftmax, + DotProductAttention, MultiHeadAttention, TransformerLayer) diff --git a/transformer_engine/paddle/constants.py b/transformer_engine/paddle/constants.py index 0ae9e28b43..eac161ec60 100644 --- a/transformer_engine/paddle/constants.py +++ b/transformer_engine/paddle/constants.py @@ -40,3 +40,9 @@ class FP8BwdTensors(Enum): paddle.float16: tex.DType.kFloat16, paddle.bfloat16: tex.DType.kBFloat16, } + +AttnMaskTypes = ("causal", "padding", "no_mask") + +AttnTypes = ("self", "cross") + +LayerTypes = ("encoder", "decoder") diff --git a/transformer_engine/paddle/cpp_extensions.py b/transformer_engine/paddle/cpp_extensions.py index b16c1c81e6..97a141973b 100644 --- a/transformer_engine/paddle/cpp_extensions.py +++ b/transformer_engine/paddle/cpp_extensions.py @@ -435,9 +435,9 @@ def fused_attn_fwd_qkvpacked( assert (Bias.dtype == qkv.dtype), "bias tensor must be in the same dtype as qkv." if set_zero: - out = paddle.full(shape=[total_seqs, h, d], fill_value=0, dtype=qkv.dtype) + out = paddle.full(shape=[b, max_seqlen, h, d], fill_value=0, dtype=qkv.dtype) else: - out = paddle.empty(shape=[total_seqs, h, d], dtype=qkv.dtype) + out = paddle.empty(shape=[b, max_seqlen, h, d], dtype=qkv.dtype) if is_training: softmax_aux = paddle.empty(shape=[b, h, max_seqlen, max_seqlen], dtype=qkv.dtype) @@ -574,9 +574,9 @@ def fused_attn_fwd_kvpacked( assert (Bias.dtype == q.dtype), "bias tensor must be in the same dtype as q and kv." 
if set_zero: - out = paddle.full(shape=[total_seqs_q, h, d], fill_value=0, dtype=q.dtype) + out = paddle.full(shape=[b, max_seqlen_q, h, d], fill_value=0, dtype=q.dtype) else: - out = paddle.empty(shape=[total_seqs_q, h, d], dtype=q.dtype) + out = paddle.empty(shape=[b, max_seqlen_q, h, d], dtype=q.dtype) if is_training: softmax_aux = paddle.empty(shape=[b, h, max_seqlen_q, max_seqlen_kv], dtype=q.dtype) diff --git a/transformer_engine/paddle/layer/__init__.py b/transformer_engine/paddle/layer/__init__.py index bf5efd2753..b4d6ec9fef 100644 --- a/transformer_engine/paddle/layer/__init__.py +++ b/transformer_engine/paddle/layer/__init__.py @@ -3,7 +3,10 @@ # See LICENSE for license information. """Layer level Paddle APIs""" +from .attention import DotProductAttention, MultiHeadAttention from .layernorm import LayerNorm from .layernorm_linear import LayerNormLinear from .layernorm_mlp import LayerNormMLP from .linear import Linear +from .softmax import FusedScaleMaskSoftmax +from .transformer import TransformerLayer diff --git a/transformer_engine/paddle/layer/attention.py b/transformer_engine/paddle/layer/attention.py new file mode 100644 index 0000000000..a5aac3566f --- /dev/null +++ b/transformer_engine/paddle/layer/attention.py @@ -0,0 +1,568 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""Attntion API""" + +import math +import warnings +from typing import Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F + +from transformer_engine.paddle.constants import ( + AttnTypes, + TE_DType, +) +from transformer_engine.paddle.cpp_extensions import ( + fused_attn_fwd_qkvpacked, + fused_attn_bwd_qkvpacked, + fused_attn_fwd_kvpacked, + fused_attn_bwd_kvpacked, +) +from transformer_engine.paddle.utils import (attention_mask_func, mask_to_cu_seqlens) +from .base import TransformerEngineBaseLayer +from .layernorm_linear import LayerNormLinear +from .linear import Linear +from .softmax import FusedScaleMaskSoftmax + + +class FusedAttnFuncPackedQKV(paddle.autograd.PyLayer): + """Function for FusedAttention with packed QKV input""" + + @staticmethod + def forward(ctx, qkv, cu_seqlens, attn_bias, rng_state, max_seqlen, attn_scale, qkv_dtype, + dropout_p, set_zero, qkv_layout, attn_bias_type, attn_mask_type, is_training): + """Forward function for FusedAttention with packed QKV input""" + out, aux_ctx_tensors = fused_attn_fwd_qkvpacked( + qkv, + cu_seqlens, + rng_state, + is_training, + max_seqlen, + qkv_dtype, + attn_bias, + attn_scale, + dropout_p, + set_zero, + qkv_layout, + attn_bias_type, + attn_mask_type, + ) + + ctx.save_for_backward(qkv, out, cu_seqlens, rng_state, aux_ctx_tensors) + ctx.max_seqlen = max_seqlen + ctx.qkv_dtype = qkv_dtype + ctx.attn_scale = attn_scale + ctx.dropout_p = dropout_p + ctx.set_zero = set_zero + ctx.qkv_layout = qkv_layout + ctx.attn_bias_type = attn_bias_type + ctx.attn_mask_type = attn_mask_type + + return out + + @staticmethod + def backward(ctx, d_out): + """Backward function for FusedAttention with packed QKV input""" + qkv, out, cu_seqlens, rng_state, aux_ctx_tensors = ctx.saved_tensor() + dqkv, *rest = fused_attn_bwd_qkvpacked(qkv, cu_seqlens, rng_state, out, d_out, + aux_ctx_tensors, ctx.max_seqlen, ctx.qkv_dtype, + ctx.attn_scale, ctx.dropout_p, ctx.set_zero, + ctx.qkv_layout, ctx.attn_bias_type, + 
ctx.attn_mask_type) + + # if no_bias, return dqkv + if ctx.attn_bias_type == "no_bias": + return (dqkv, None, None) + # else, return (dqkv, dbias) + return (dqkv, None, rest[0], None) + + +class FusedAttnFuncPackedKV(paddle.autograd.PyLayer): + """Function for FusedAttention with packed KV input""" + + @staticmethod + def forward(ctx, q, kv, cu_seqlens_q, cu_seqlens_kv, attn_bias, rng_state, max_seqlen_q, + max_seqlen_kv, attn_scale, qkv_dtype, dropout_p, set_zero, qkv_layout, + attn_bias_type, attn_mask_type, is_training): + """Forward function for FusedAttention with packed KV input""" + out, aux_ctx_tensors = fused_attn_fwd_kvpacked(q, kv, cu_seqlens_q, cu_seqlens_kv, + rng_state, is_training, max_seqlen_q, + max_seqlen_kv, qkv_dtype, attn_bias, + attn_scale, dropout_p, set_zero, qkv_layout, + attn_bias_type, attn_mask_type) + + ctx.save_for_backward(q, kv, out, cu_seqlens_q, cu_seqlens_kv, rng_state, aux_ctx_tensors) + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_kv = max_seqlen_kv + ctx.qkv_dtype = qkv_dtype + ctx.attn_scale = attn_scale + ctx.dropout_p = dropout_p + ctx.set_zero = set_zero + ctx.qkv_layout = qkv_layout + ctx.attn_bias_type = attn_bias_type + ctx.attn_mask_type = attn_mask_type + + return out + + @staticmethod + def backward(ctx, d_out): + """Backward function for FusedAttention with packed KV input""" + q, kv, out, cu_seqlens_q, cu_seqlens_kv, rng_state, aux_ctx_tensors = ctx.saved_tensor() + dq, dkv, *rest = fused_attn_bwd_kvpacked(q, kv, cu_seqlens_q, cu_seqlens_kv, rng_state, out, + d_out, aux_ctx_tensors, ctx.max_seqlen_q, + ctx.max_seqlen_kv, ctx.qkv_dtype, ctx.attn_scale, + ctx.dropout_p, ctx.set_zero, ctx.qkv_layout, + ctx.attn_bias_type, ctx.attn_mask_type) + + # if no_bias, return dq, dkv + if ctx.attn_bias_type == "no_bias": + return (dq, dkv, None, None, None) + # else, return (dq, dkv, dbias) + return (dq, dkv, None, None, rest[0], None) + + +class DotProductAttention(paddle.nn.Layer): + """Dot Product Attention Layer + Allows 
the model to jointly attend to information from different + representation subspaces as described in the paper: + `Attention Is All You Need `_. + + .. note:: + + Argument :attr:`attention_mask` will be ignored in the `forward` call when + :attr:`attn_mask_type` is set to `"causal"`. + + Parameters + ---------- + norm_factor : float + normalization factor for the attention scores. + attention_dropout: float, default = 0.1 + dropout probability for the dropout op during multi-head attention. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. + attention_type: {'self', 'cross'}, default = `self` + type of attention operation. + backend: {'transformer_engine', 'paddle'}, default = `transformer_engine` + backend to use for attention operation. + + """ + + def __init__(self, + norm_factor: float, + attention_dropout: float = 0.1, + attn_mask_type: str = "causal", + attention_type: str = "self", + backend: str = 'transformer_engine') -> None: + super().__init__() + + self.norm_factor = norm_factor + self.attn_mask_type = attn_mask_type + self.attention_dropout = attention_dropout + self.attention_type = attention_type + self.backend = backend + self.rng_state = paddle.zeros((2,), dtype='int64') + self.rng_state.persistable = True + if self.backend != 'transformer_engine': + self.scale_mask_softmax = FusedScaleMaskSoftmax(attn_mask_type, + attention_mask_func, + backend=self.backend) + + def forward( + self, + query_layer: paddle.Tensor, + key_value_layer: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[paddle.Tensor] = None, + set_zero: bool = True, + ) -> paddle.Tensor: + """ + Dot Product Attention Layer. + + .. note:: + + Argument :attr:`attention_mask` will be ignored when :attr:`attn_mask_type` + is set to `"causal"`. + + .. 
note:: + + For self attention, :attr:`query_layer` is the `[query, key, value]` tensor + stacked along the 2nd dimension, which must be of shape (:attr:`batch_size`, + :attr:`seq_length`, 3, :attr:`num_attention_heads`, :attr:`size_per_head`). + And :attr:`key_value_layer` is `None`. + For cross attention, :attr:`query_layer` is the `[query]` tensor, which must + be of shape (:attr:`batch_size`, :attr:`seq_length`, :attr:`num_attention_heads`, + :attr:`size_per_head`). And :attr:`key_value_layer` is the `[key, value]` tensor, + which must be of shape (:attr:`batch_size`, :attr:`seq_length`, 2, + :attr:`num_attention_heads`, :attr:`size_per_head`). + + + + Parameters + ---------- + query_layer : paddle.Tensor + Query tensor. + key_value_layer : paddle.Tensor + Key tensor. + attention_mask : Optional[paddle.Tensor], default = `None` + Boolean tensor used to mask out softmax input when not using attention. + core_attention_bias_type: str, default = `no_bias` + only support no_bias type currently, {`no_bias`} + core_attention_bias: Optional[paddle.Tensor], default = `None` + Bias tensor for Q * K.T + set_zero: bool, defautl = `True` + Whether to use the fast path to set output tensors to 0 or not. + """ + + if self.backend == 'transformer_engine': + return self._te_forward(query_layer, key_value_layer, attention_mask, + core_attention_bias_type, core_attention_bias, set_zero) + if self.backend == 'paddle': + if core_attention_bias_type != "no_bias": + warnings.warn("Paddle backend dot product attention does not support bias yet. 
" + "Bias will be ignored.") + return self._pd_forward(query_layer, key_value_layer, attention_mask) + raise AttributeError(f"Backend {self.backend} is not supported.") + + def _te_forward( + self, + query_layer: paddle.Tensor, + key_value_layer: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[paddle.Tensor] = None, + set_zero: bool = True, + ) -> paddle.Tensor: + + gen_state = paddle.get_rng_state()[0].__getstate__() + self.rng_state[0], self.rng_state[1] = gen_state[1], gen_state[2] # [seed, offset] + if self.attention_type == "self": + # self attention - q: [b, s, 3, h, d] kv: None + assert (len(query_layer.shape) == 5 and query_layer.shape[2] == 3 + and key_value_layer is None + ), "query shape must be [b, s, 3, h, d] for dot product self attention" + max_seqlen = query_layer.shape[1] + cu_seqlens, _ = mask_to_cu_seqlens(attention_mask) + qkv_dtype = TE_DType[query_layer.dtype] + qkv_layout = "qkv_interleaved" + + output = FusedAttnFuncPackedQKV.apply( + query_layer, + cu_seqlens, + core_attention_bias, + self.rng_state, + max_seqlen, + 1.0 / self.norm_factor, + qkv_dtype, + self.attention_dropout if self.training else 0.0, + set_zero, + qkv_layout, + core_attention_bias_type, + self.attn_mask_type, + self.training, + ) + elif self.attention_type == "cross": + # cross attention - q: [b, s_q, h, d] kv: [b, s_kv, 2, h, d] + assert ( + len(query_layer.shape) == 4 and len(key_value_layer.shape) == 5 + and key_value_layer.shape[2] == 2 + ), "query shape must be [b, s, h, d] and key shape must be [b, s, 2, h, d]" \ + "for dot product cross attention" + max_seqlen_q = query_layer.shape[1] + max_seqlen_kv = key_value_layer.shape[1] + cu_seqlens_q, cu_seqlens_kv = mask_to_cu_seqlens(attention_mask, need_kv=True) + qkv_dtype = TE_DType[query_layer.dtype] + qkv_layout = "kv_interleaved" + output = FusedAttnFuncPackedKV.apply( + query_layer, + key_value_layer, + 
cu_seqlens_q, + cu_seqlens_kv, + core_attention_bias, + self.rng_state, + max_seqlen_q, + max_seqlen_kv, + 1.0 / self.norm_factor, + qkv_dtype, + self.attention_dropout if self.training else 0.0, + set_zero, + qkv_layout, + core_attention_bias_type, + self.attn_mask_type, + self.training, + ) + else: + raise ValueError("attention_type must be one of ['self', 'cross']") + return output + + def _pd_forward( + self, + query_layer: paddle.Tensor, + key_value_layer: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + if self.attention_type == "self": + # self attention - q: [b, s, 3, h, d] k: None + assert (len(query_layer.shape) == 5 and query_layer.shape[2] == 3 + and key_value_layer is None + ), "query shape must be [b, s, 3, h, d] for dot product self attention" + q = query_layer[:, :, 0] + k = query_layer[:, :, 1] + v = query_layer[:, :, 2] + elif self.attention_type == "cross": + # cross attention - q: [b, s, h, d] kv: [b, s, 2, h, d] + assert ( + len(query_layer.shape) == 4 and len(key_value_layer.shape) == 5 + and key_value_layer.shape[2] == 2 + ), f"query shape must be [b, s, h, d] and key_value shape must be [b, s, 2, h, d]" \ + f"for dot product cross attention. 
The actual shape is q: {query_layer.shape}" \ + f"kv: {key_value_layer.shape}" + q = query_layer + k = key_value_layer[:, :, 0] + v = key_value_layer[:, :, 1] + + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) + k = paddle.transpose(x=k, perm=[0, 2, 1, 3]) + v = paddle.transpose(x=v, perm=[0, 2, 1, 3]) + + product = paddle.matmul(x=q * (1.0 / self.norm_factor), y=k, transpose_y=True) + attention_probs = self.scale_mask_softmax(product, attention_mask, scale=None) + + if self.attention_dropout > 0: + attention_probs = F.dropout( + attention_probs, + self.attention_dropout, + training=self.training, + ) + + out = paddle.matmul(attention_probs, v) + out = paddle.transpose(out, perm=[0, 2, 1, 3]) # [b, s, h, d] + # out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + return out + + +class MultiHeadAttention(TransformerEngineBaseLayer): + """Attention w/ QKV and Proj Gemms + + Parameters + ---------- + hidden_size: int + hidden size of the model. + num_attention_heads: int + number of attention heads. + attention_dropout: float, default = 0.1 + dropout probability for the dropout op during multi-head attention. + layernorm_epsilon: float, default = 1e-5 + epsilon to use in the layer norm operations. + weight_attr: Union[paddle.ParamAttr, None], default = `None` + paddle.ParamAttr object for the weight parameter. + bias_attr: Union[paddle.ParamAttr, None, bool], default = `None` + paddle.ParamAttr object for the bias parameter. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. + params_dtype: Optional[paddle.dtype], default = `None` + data type for the weights and biases. + return_layernorm_output: bool, default = `False` + whether to return the output of the layernorm operation. + input_layernorm: bool, default = `False` + whether to apply layernorm to the input. + attention_type: {'self', 'cross'}, default = `self` + type of attention operation. 
+ zero_centered_gamma: bool, default = `False` + whether to zero initialize the gamma of the layernorm operation. + backend: {'transformer_engine', 'paddle'}, default = `transformer_engine` + backend to use for attention operation. + """ + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + attention_dropout: float = 0.1, + layernorm_epsilon: float = 1e-5, + weight_attr: Union[paddle.ParamAttr, None] = None, + bias_attr: Union[paddle.ParamAttr, None, bool] = None, + attn_mask_type: str = "causal", + params_dtype: Optional[paddle.dtype] = None, + return_layernorm_output: bool = False, + input_layernorm: bool = False, + attention_type: str = "self", + zero_centered_gamma: bool = False, + backend: str = 'transformer_engine', + ) -> None: + super().__init__() + self.input_layernorm = input_layernorm + self.attention_type = attention_type + self.return_layernorm_output = return_layernorm_output + self.params_dtype = paddle.get_default_dtype() if params_dtype is None else params_dtype + self.weight_attr = weight_attr + self.bias_attr = bias_attr + self.attn_mask_type = attn_mask_type + + assert attention_type in AttnTypes, f"attention_type {attention_type} not supported" + + self.hidden_size_per_attention_head = hidden_size // num_attention_heads + self.num_attention_heads = num_attention_heads + norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.backend = backend + + if self.attention_type == "self": + if self.input_layernorm: + self.layernorm_qkv = LayerNormLinear( + hidden_size, + 3 * hidden_size, + eps=layernorm_epsilon, + weight_attr=self.weight_attr, + bias_attr=self.bias_attr, + return_layernorm_output=return_layernorm_output, + zero_centered_gamma=zero_centered_gamma, + backend=self.backend, + ) + else: + self.qkv = Linear( + hidden_size, + 3 * hidden_size, + self.weight_attr, + self.bias_attr, + backend=self.backend, + ) + + else: # cross attention + if self.input_layernorm: + self.layernorm_query = LayerNormLinear( + 
hidden_size, + hidden_size, + eps=layernorm_epsilon, + weight_attr=self.weight_attr, + bias_attr=self.bias_attr, + return_layernorm_output=return_layernorm_output, + zero_centered_gamma=zero_centered_gamma, + backend=self.backend, + ) + else: + self.query_layer = Linear( + hidden_size, + hidden_size, + self.weight_attr, + self.bias_attr, + backend=self.backend, + ) + self.key_value = Linear( + hidden_size, + 2 * hidden_size, + self.weight_attr, + self.bias_attr, + backend=self.backend, + ) + + # Attention. + self.core_attention = DotProductAttention( + norm_factor, + attention_dropout, + attn_mask_type=attn_mask_type, + attention_type=self.attention_type, + backend=self.backend, + ) + + # Linear + self.proj = Linear( + hidden_size, + hidden_size, + self.weight_attr, + self.bias_attr, + backend=self.backend, + ) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + encoder_output: Optional[paddle.Tensor] = None, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[paddle.Tensor] = None, + set_zero: bool = True, + ) -> Tuple[Union[paddle.Tensor, None], ...]: + """ + MultiHeadAttention Layer. + + + Parameters + ---------- + hidden_states : paddle.Tensor + Input tensor. + attention_mask : Optional[paddle.Tensor], default = `None` + Boolean tensor used to mask out softmax input when not using attention. + encoder_output : Optional[paddle.Tensor], default = `None` + Output of the encoder layer. + core_attention_bias_type: str, default = `no_bias` + only support no_bias type currently, {`no_bias`} + core_attention_bias: Optional[paddle.Tensor], default = `None` + Bias tensor for Q * K.T + set_zero: bool, defautl = `True` + Whether to use the fast path to set output tensors to 0 or not. 
+ + """ + + # hidden_states: [b, s_q, hidden_size] + if self.attn_mask_type != "causal" and attention_mask is not None: + assert (attention_mask.dtype == paddle.bool), "Attention mask must be a boolean tensor" + + if self.attention_type == "self": + if self.input_layernorm: + layernorm_qkv_outputs = self.layernorm_qkv(hidden_states) + if self.return_layernorm_output: + mixed_qkv_layer, layernorm_output = layernorm_qkv_outputs + else: + mixed_qkv_layer = layernorm_qkv_outputs + else: + mixed_qkv_layer = self.qkv(hidden_states) + + # [b, s_q, 3 * hidden_size] --> [b, s_q, 3, num_heads, head_size] + mixed_qkv_layer = mixed_qkv_layer.reshape( + shape=[0, 0, 3, self.num_attention_heads, self.hidden_size_per_attention_head]) + + context_layer = self.core_attention( + query_layer=mixed_qkv_layer, + key_value_layer=None, + attention_mask=attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + set_zero=set_zero, + ) + + else: # cross attention + mixed_kv_layer = self.key_value(encoder_output) + # [b, s_kv, 2 * hidden_size] --> [b, s_kv, 2, num_heads, head_size] + mixed_kv_layer = mixed_kv_layer.reshape( + shape=[0, 0, 2, self.num_attention_heads, self.hidden_size_per_attention_head]) + + if self.input_layernorm: + layernorm_query_outputs = self.layernorm_query(hidden_states) + if self.return_layernorm_output: + query_layer, layernorm_output = layernorm_query_outputs + else: + query_layer = layernorm_query_outputs + else: + query_layer = self.query_layer(hidden_states) + + query_layer = query_layer.reshape( + shape=[0, 0, self.num_attention_heads, self.hidden_size_per_attention_head]) + context_layer = self.core_attention( + query_layer=query_layer, + key_value_layer=mixed_kv_layer, + attention_mask=attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + set_zero=set_zero, + ) + + context_layer = paddle.reshape(context_layer, + [0, 0, context_layer.shape[2] * 
context_layer.shape[3]]) + # Output. [b, s, hidden] + attention_output = self.proj(context_layer) + + if self.input_layernorm and self.return_layernorm_output: + return attention_output, layernorm_output + return attention_output diff --git a/transformer_engine/paddle/layer/layernorm.py b/transformer_engine/paddle/layer/layernorm.py index a706c85c88..3f0b8c4a50 100644 --- a/transformer_engine/paddle/layer/layernorm.py +++ b/transformer_engine/paddle/layer/layernorm.py @@ -126,7 +126,7 @@ def _pd_forward( "Paddle backend does not support LayerNorm with zero-centered scale.") return F.layer_norm(x=inp, - normalized_shape=inp.shape[1:], + normalized_shape=inp.shape[-1], weight=self.weight, bias=self.bias, epsilon=self.eps) diff --git a/transformer_engine/paddle/layer/layernorm_linear.py b/transformer_engine/paddle/layer/layernorm_linear.py index 88736ba75f..608f02a6ff 100644 --- a/transformer_engine/paddle/layer/layernorm_linear.py +++ b/transformer_engine/paddle/layer/layernorm_linear.py @@ -402,7 +402,6 @@ def _te_forward( if self.return_layernorm_output: out, ln_out = out return out, ln_out - return out def _pd_forward( @@ -415,7 +414,7 @@ def _pd_forward( "Paddle backend does not support LayerNorm with zero-centered scale.") ln_out = F.layer_norm(x=inp, - normalized_shape=inp.shape[1:], + normalized_shape=inp.shape[-1], weight=self.ln_weight, bias=self.ln_bias, epsilon=self.eps) diff --git a/transformer_engine/paddle/layer/layernorm_mlp.py b/transformer_engine/paddle/layer/layernorm_mlp.py index 7bf3cc6fab..6d725114b0 100644 --- a/transformer_engine/paddle/layer/layernorm_mlp.py +++ b/transformer_engine/paddle/layer/layernorm_mlp.py @@ -624,7 +624,7 @@ def _pd_forward( "Paddle backend does not support LayerNorm with zero-centered scale.") ln_out = F.layer_norm(x=inp, - normalized_shape=inp.shape[1:], + normalized_shape=inp.shape[-1], weight=self.ln_weight, bias=self.ln_bias, epsilon=self.eps) diff --git a/transformer_engine/paddle/layer/softmax.py 
b/transformer_engine/paddle/layer/softmax.py new file mode 100644 index 0000000000..33b0293e0a --- /dev/null +++ b/transformer_engine/paddle/layer/softmax.py @@ -0,0 +1,237 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Fused scaled masked softmax functions""" + +import os +import warnings +from typing import Callable, Tuple, Union, Optional + +import paddle + +from transformer_engine.paddle.cpp_extensions import ( + scaled_upper_triang_masked_softmax_forward, + scaled_upper_triang_masked_softmax_backward, + scaled_masked_softmax_forward, + scaled_masked_softmax_backward, + scaled_softmax_forward, + scaled_softmax_backward, +) + +THREADS_PER_WARP = 32 +THREADS_PER_BLOCK = 128 + +_default_causal_mask = {} + + +def _get_default_causal_mask(seqlen: int) -> paddle.Tensor: + """Return the causal upper triangular mask for softmax input""" + if seqlen not in _default_causal_mask: + _default_causal_mask[seqlen] = paddle.triu(paddle.ones((seqlen, seqlen)), + diagonal=1).cast('bool') + return _default_causal_mask[seqlen] + + +class ScaledUpperTriangMaskedSoftmax(paddle.autograd.PyLayer): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. 
+ """ + + @staticmethod + def forward(ctx, inputs: paddle.Tensor, scale: float) -> paddle.Tensor: + """ScaledUpperTriangMaskedSoftmax fwd""" + scale_t = paddle.Tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0]) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]: + """ScaledUpperTriangMaskedSoftmax bwd""" + softmax_results, scale_t = ctx.saved_tensor() + input_grads = scaled_upper_triang_masked_softmax_backward(output_grads, softmax_results, + scale_t[0]) + + return input_grads, None + + +class ScaledMaskedSoftmax(paddle.autograd.PyLayer): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs: paddle.Tensor, mask: paddle.Tensor, scale: float) -> paddle.Tensor: + """ScaledMaskedSoftmax fwd""" + scale_t = paddle.Tensor([scale]) + + softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]: + """ScaledMaskedSoftmax bwd""" + softmax_results, scale_t = ctx.saved_tensor() + + input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +class ScaledSoftmax(paddle.autograd.PyLayer): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. 
+ """ + + @staticmethod + def forward(ctx, inputs: paddle.Tensor, scale: float) -> paddle.Tensor: + """ScaledSoftmax fwd""" + scale_t = paddle.Tensor([scale]) + + softmax_results = scaled_softmax_forward(inputs, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]: + """ScaledSoftmax bwd""" + softmax_results, scale_t = ctx.saved_tensor() + + input_grads = scaled_softmax_backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(paddle.nn.Layer): + """ + fused operation: scaling + mask + softmax + + Arguments: + attn_mask_type: attention mask type (pad or causal) + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + """ + + def __init__( + self, + attn_mask_type: str, + mask_func: Callable, + softmax_in_fp32: bool = True, + backend: str = 'transformer_engine', + ) -> None: + super().__init__() + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = bool(int(os.getenv("NVTE_MASKED_SOFTMAX_FUSION", "1"))) + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.backend = backend + + def forward( + self, + inp: paddle.Tensor, + mask: paddle.Tensor, + scale: Optional[float] = None, + ) -> paddle.Tensor: + """FusedScaleMaskSoftmax fprop""" + # [batch_size, num_heads, s_q, s_kv] + assert inp.dim() == 4 + self.input_is_fp16 = inp.dtype == paddle.float16 + self.input_is_bf16 = inp.dtype == paddle.bfloat16 + self.input_in_16bit_float = self.input_is_fp16 or self.input_is_bf16 + + assert (scale is None or self.softmax_in_fp32), "softmax should be in fp32 when scaled" + + if self.backend == 'transformer_engine' and not self.is_kernel_available(*inp.shape): + warnings.warn( + "fused kernel is not available for this input shape, fall back to paddle backend") + self.backend = 'paddle' 
+ + if self.backend == 'transformer_engine': + return self._te_forward(inp, mask, scale) + if self.backend == 'paddle': + return self._pd_forward(inp, mask, scale) + raise AttributeError(f"Backend {self.backend} is not supported.") + + def is_kernel_available(self, b: int, h: int, s_q: int, s_kv: int) -> bool: + """Check FusedScaleMaskSoftmax kernel availability based on size""" + attn_batches = b * h + + if (self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_16bit_float # input must be fp16 + and 16 < s_kv <= 4096 # s_kv must be 16 ~ 2048 + and s_q % 4 == 0 # s_q must be a multiple of 4 + and attn_batches % 4 == 0 # b * h must be a multiple of 4 + ): + if 0 <= s_kv <= 4096: + batch_per_block = self.get_batch_per_block(int(s_kv)) + + if self.attn_mask_type == "causal": + if attn_batches % batch_per_block == 0: + return True + else: + if s_q % batch_per_block == 0: + return True + return False + + def _te_forward(self, + inp: paddle.Tensor, + mask: paddle.Tensor, + scale: Optional[float] = None) -> paddle.Tensor: + """Fused masked softmax kernel""" + b, h, s_q, s_kv = inp.size() + scale = 1.0 if scale is None else scale + + if self.attn_mask_type == "causal": + assert s_q == s_kv, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, s_q, s_kv) + inp = inp.reshape((-1, s_q, s_kv)) + probs = ScaledUpperTriangMaskedSoftmax.apply(inp, scale) + return probs.reshape((b, h, s_q, s_kv)) + # input is 4D tensor (b, h, s_q, s_kv) + if mask is not None: + return ScaledMaskedSoftmax.apply(inp, mask, scale) + return ScaledSoftmax.apply(inp, scale) + + def _pd_forward(self, + inp: paddle.Tensor, + mask: paddle.Tensor, + scale: Optional[float] = None) -> paddle.Tensor: + """Call Paddle OP""" + if self.input_in_16bit_float and self.softmax_in_fp32: + inp = paddle.cast(inp, 'float32') + + if scale is not None: + inp = inp * scale + + if self.attn_mask_type == "causal": + mask = _get_default_causal_mask(inp.shape[2]) + + mask_output = 
self.mask_func(inp, mask) if mask is not None else inp + probs = paddle.nn.functional.softmax(mask_output, axis=-1) + + if self.input_in_16bit_float and self.softmax_in_fp32: + if self.input_is_fp16: + probs = paddle.cast(probs, 'float16') + else: + probs = paddle.cast(probs, 'bfloat16') + + return probs + + @staticmethod + def get_batch_per_block(key_seq_len: int) -> int: + """Softmax utility""" + pow2 = 1 << (key_seq_len - 1).bit_length() + warp_size = pow2 if pow2 < THREADS_PER_WARP else THREADS_PER_WARP + batches_per_warp = 2 if pow2 <= 128 else 1 + warps_per_block = THREADS_PER_BLOCK // warp_size + batches_per_block = warps_per_block * batches_per_warp + return batches_per_block diff --git a/transformer_engine/paddle/layer/transformer.py b/transformer_engine/paddle/layer/transformer.py new file mode 100644 index 0000000000..6e6afd4ca2 --- /dev/null +++ b/transformer_engine/paddle/layer/transformer.py @@ -0,0 +1,260 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Transformer""" + +from typing import Optional, Union + +import paddle + +from transformer_engine.paddle.constants import ( + AttnMaskTypes, + LayerTypes, +) +from transformer_engine.paddle.layer import (LayerNormMLP, LayerNorm, MultiHeadAttention) +from .base import TransformerEngineBaseLayer + + +class TransformerLayer(TransformerEngineBaseLayer): + r""" + TransformerLayer is made up of an attention block and a feedforward network (MLP). + This standard layer is based on the paper "Attention Is All You Need". + + Parameters + ---------- + hidden_size : int + size of each input sample. + ffn_hidden_size : int + intermediate size to which input samples are projected. + num_attention_heads : int + number of attention heads in the transformer layer. + layernorm_epsilon : float, default = 1e-5 + a value added to the denominator of layer normalization + for numerical stability. 
+ hidden_dropout: float, default = 0.1 + dropout probability for the dropout op after FC2 layer. + attention_dropout: float, default = 0.1 + dropout probability for the dropout op during multi-head attention. + self_attn_mask_type: {'causal', 'padding'}, default = `causal` + type of attention mask passed into softmax operation. + apply_residual_connection_post_layernorm : bool, default = `False` + if set to `True`, residual connections are taken + from the output of layer norm (default is taken + from input of layer norm) + output_layernorm: bool, default = `False` + if set to `True`, layer normalization is applied on the output side, + after the final dropout-add. default behavior is to apply layer + normalization on the input side, before the QKV transformation. + layer_type: {'encoder', 'decoder'}, default = `encoder` + if set to `decoder`, an additional cross-attn block is added after self-attn. + This can be used for structures like `T5` Transformer in conjunction with the + `encoder` option. + zero_centered_gamma : bool, default = 'False' + if set to 'True', gamma parameter in LayerNorm is initialized to 0 and + the LayerNorm formula changes to + + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * + (1 + \gamma) + \beta + activation : str, default = 'gelu' + Type of activation used in MLP block. + Options are: 'gelu', 'relu', 'reglu', 'geglu' and 'swiglu'. + + params_dtype : paddle.dtype, default = `paddle.get_default_dtype()` + it controls the type used to allocate the initial parameters. Useful when + the model is trained with lower precision and the original FP32 parameters + would not fit in GPU memory. 
+ """ + + def __init__(self, + hidden_size: int, + ffn_hidden_size: int, + num_attention_heads: int, + layernorm_epsilon: float = 1e-5, + hidden_dropout: float = 0.1, + attention_dropout: float = 0.1, + weight_attr: Union[paddle.ParamAttr, None] = None, + bias_attr: Union[paddle.ParamAttr, None, bool] = None, + self_attn_mask_type: str = "causal", + params_dtype: Optional[paddle.dtype] = None, + apply_residual_connection_post_layernorm: bool = False, + output_layernorm: bool = False, + layer_type: str = "encoder", + zero_centered_gamma: bool = False, + activation: str = 'gelu', + backend: str = 'transformer_engine') -> None: + super().__init__() + + params_dtype = paddle.get_default_dtype() if params_dtype is None else params_dtype + self.output_layernorm = output_layernorm + self.layer_type = layer_type + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.self_attn_mask_type = self_attn_mask_type + + assert (self_attn_mask_type + in AttnMaskTypes), f"self_attn_mask_type {self_attn_mask_type} not supported" + assert layer_type in LayerTypes, f"layer_type {layer_type} not supported" + + attention_args = ( + hidden_size, + num_attention_heads, + attention_dropout, + layernorm_epsilon, + weight_attr, + bias_attr, + ) + common_attention_kwargs = { + "params_dtype": params_dtype, + "return_layernorm_output": apply_residual_connection_post_layernorm, + "zero_centered_gamma": zero_centered_gamma, + "backend": backend, + } + + self.self_attention = MultiHeadAttention( + *attention_args, + **common_attention_kwargs, + attn_mask_type=self_attn_mask_type, + input_layernorm=not output_layernorm, + attention_type="self", + ) + + if layer_type == "decoder": + self.inter_attention = MultiHeadAttention( + *attention_args, + **common_attention_kwargs, + attn_mask_type="padding", + input_layernorm=True, + attention_type="cross", + ) + + self.layernorm_mlp = LayerNormMLP( + hidden_size, + ffn_hidden_size, + eps=layernorm_epsilon, + 
weight_attr=weight_attr, + bias_attr=bias_attr, + activation=activation, + return_layernorm_output=apply_residual_connection_post_layernorm, + zero_centered_gamma=zero_centered_gamma, + backend=backend, + ) + + self.hidden_dropout = hidden_dropout + + if self.output_layernorm: + self.layernorm = LayerNorm( + hidden_size, + layernorm_epsilon, + weight_attr, + bias_attr, + zero_centered_gamma=zero_centered_gamma, + backend=backend, + ) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + encoder_output: Optional[paddle.Tensor] = None, + enc_dec_attn_mask: Optional[paddle.Tensor] = None, + core_attention_bias_type: str = "no_bias", + core_attention_bias: Optional[paddle.Tensor] = None, + set_zero: bool = True, + ) -> paddle.Tensor: + """ + Transformer Layer: attention block and a feedforward network (MLP) + + .. note:: + + Argument :attr:`attention_mask` will be ignored when :attr:`self_attn_mask_type` + is set to `"causal"`. + + Parameters + ---------- + hidden_states : paddle.Tensor + Input tensor. + attention_mask : Optional[paddle.Tensor], default = `None` + Boolean tensor used to mask out self-attention softmax input. + encoder_output : Optional[paddle.Tensor], default = `None` + Output of the encoder block to be fed into the decoder block if using + `layer_type="decoder"`. + enc_dec_attn_mask : Optional[paddle.Tensor], default = `None` + Boolean tensor used to mask out inter-attention softmax input if using + `layer_type="decoder"`. + core_attention_bias_type: str, default = `no_bias` + core_attention_bias: Optional[paddle.Tensor], default = `None` + Bias tensor for Q * K.T + set_zero: bool, default = `True` + Whether to set output tensors to 0 or not before use. 
+ """ + + if self.self_attn_mask_type != "causal" and attention_mask is not None: + assert (attention_mask.dtype == paddle.bool), "Attention mask must be a boolean tensor" + + assert core_attention_bias_type in ['no_bias'], f"Only no_bias is supported currently, " \ + f"but receive core_attention_bias_type = {core_attention_bias_type}" + + # Self attention. + self_attention_outputs = self.self_attention( + hidden_states, + attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + set_zero=set_zero, + ) + + if self.apply_residual_connection_post_layernorm and not self.output_layernorm: + attention_output, residual = self_attention_outputs + else: + attention_output = self_attention_outputs + residual = hidden_states + + # dropoout add. + out = paddle.nn.functional.dropout( + attention_output, + p=self.hidden_dropout, + training=True, + ) + bda_output = residual + out + + # Cross attention. + if self.layer_type == "decoder": + inter_attention_outputs = self.inter_attention( + bda_output, + enc_dec_attn_mask, + encoder_output=encoder_output, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + set_zero=set_zero, + ) + if self.apply_residual_connection_post_layernorm: + attention_output, residual = inter_attention_outputs + else: + attention_output = inter_attention_outputs + residual = bda_output + + out = paddle.nn.functional.dropout( + attention_output, + p=self.hidden_dropout, + training=True, + ) + bda_output = residual + out + + # MLP. + mlp_outputs = self.layernorm_mlp(bda_output) + if self.apply_residual_connection_post_layernorm: + mlp_output, residual = mlp_outputs + else: + mlp_output = mlp_outputs + residual = bda_output + + # dropoout add. + out = paddle.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=True) + output = residual + out + + # For BERT like architectures. 
+ if self.output_layernorm: + output = self.layernorm(output) + + # output: [b, s, hidden] + return output diff --git a/transformer_engine/paddle/utils.py b/transformer_engine/paddle/utils.py index 8bc1152a6f..9ade785d6e 100644 --- a/transformer_engine/paddle/utils.py +++ b/transformer_engine/paddle/utils.py @@ -52,3 +52,37 @@ def get_paddle_act_func(activation): if activation not in funcs: raise "Activation type " + activation + " is not supported." return funcs[activation] + + +def attention_mask_func(attention_scores: paddle.Tensor, + attention_mask: paddle.Tensor) -> paddle.Tensor: + """Get attention mask""" + + def _masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + attention_scores = _masked_fill(attention_scores, attention_mask, -10000.0) + return attention_scores + + +def mask_to_cu_seqlens(mask: paddle.Tensor, need_kv: bool = False) -> paddle.Tensor: + """Convert mask to cu_seqlens""" + assert 'bool' in str(mask.dtype), "mask must be bool dtype" + assert len(mask.shape) == 4 and mask.shape[1] == 1, "mask must be [b, 1, s_q, s_kv]" + q_actual_seqlens = paddle.sum(mask[:, :, :, 0] == False, axis=(-1, -2), dtype='int32') # pylint: disable=singleton-comparison + q_cu_seqlens = paddle.cumsum(q_actual_seqlens) + q_cu_seqlens = paddle.concat([paddle.zeros([1], dtype=paddle.int32), q_cu_seqlens], axis=0) + if not need_kv: + return q_cu_seqlens, None + kv_actual_seqlens = paddle.sum(mask[:, :, 0, :] == False, axis=(-1, -2), dtype='int32') # pylint: disable=singleton-comparison + kv_cu_seqlens = paddle.cumsum(kv_actual_seqlens) + kv_cu_seqlens = paddle.concat([paddle.zeros([1], dtype=paddle.int32), kv_cu_seqlens], axis=0) + return q_cu_seqlens, kv_cu_seqlens + + +def divide(numerator: int, denominator: int) -> int: + """Ensure that numerator is divisible by the denominator and return + the division value.""" + assert (numerator % denominator == 0), f"{numerator} is not divisible by {denominator}" + return 
numerator // denominator From d661d06c38ddaa6859b161fde5f00491e7184b04 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Fri, 18 Aug 2023 00:48:04 -0700 Subject: [PATCH 047/427] fix for amax_and_scale_update when reduce_amax=False (#386) Signed-off-by: Sudhakar Singh --- transformer_engine/pytorch/module/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 56ee70d8c9..0352a7ba2b 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -78,7 +78,7 @@ def _prepare_backward( # Update amax and scale; Skip all setup for global amax reduction if not fp8_meta["recipe"].reduce_amax: - FP8GlobalStateManager.amax_and_scale_update(fp8_meta, False) + amax_and_scale_update(fp8_meta, False) else: # From previous iteration FP8GlobalStateManager.copy_amax_from_global_buffer(fp8_meta, forward=False) From 8cdd80df74f7bcfff7db041b306f378205782845 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Sat, 19 Aug 2023 01:04:24 -0700 Subject: [PATCH 048/427] PyTorch MultiheadAttention API (#387) * PyTorch MultiheadAttention API Signed-off-by: Kirthi Shankar Sivamani * Fix ONNX export tests Signed-off-by: Kirthi Shankar Sivamani * Expose MultiheadAttention for import Signed-off-by: Kirthi Shankar Sivamani * Expand mask type and add no mask numerical test Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- docs/api/pytorch.rst | 3 + tests/pytorch/test_numerics.py | 87 ++++++++- tests/pytorch/test_onnx_export.py | 3 +- transformer_engine/pytorch/__init__.py | 1 + transformer_engine/pytorch/attention.py | 213 ++++++++++++++++++++-- transformer_engine/pytorch/transformer.py | 8 +- 6 files changed, 288 insertions(+), 27 deletions(-) diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst index 22a571279b..af71e1a2a7 100644 --- a/docs/api/pytorch.rst +++ b/docs/api/pytorch.rst @@ -22,6 
+22,9 @@ pyTorch .. autoapiclass:: transformer_engine.pytorch.DotProductAttention(num_attention_heads, kv_channels, **kwargs) :members: forward +.. autoapiclass:: transformer_engine.pytorch.MultiheadAttention(hidden_size, num_attention_heads, **kwargs) + :members: forward + .. autoapiclass:: transformer_engine.pytorch.TransformerLayer(hidden_size, ffn_hidden_size, num_attention_heads, **kwargs) :members: forward diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 6260c291c4..f8eda48cc3 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -21,7 +21,8 @@ attention_mask_func, ) from transformer_engine.pytorch import ( - DotProductAttention, Linear, LayerNormLinear, LayerNormMLP, TransformerLayer, RMSNorm + DotProductAttention, LayerNormLinear, LayerNormMLP, Linear, + MultiheadAttention, RMSNorm, TransformerLayer ) from transformer_engine.pytorch.distributed import checkpoint as te_checkpoint @@ -60,6 +61,9 @@ def __init__(self, hidden_size, eps, num_attention_heads, embed, num_layers, seq all_normalizations = ["LayerNorm", "RMSNorm"] +mask_types = ["causal", "no_mask"] + + def get_causal_attn_mask(sq: int) -> torch.Tensor: return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool() @@ -320,6 +324,7 @@ def forward( return context_layer + # Adapted from https://github.com/bzhangGo/rmsnorm/blob/c6691f20ec0af4128c8159c903071f7575404295/rmsnorm_torch.py class TorchRMSNorm(nn.Module): def __init__(self, in_features, eps=1e-5): @@ -341,6 +346,7 @@ def forward(self, x): return (self.weight.float() * x_normed).to(x.dtype) + class TorchLayerNormLinear(nn.Module): def __init__(self, in_features: int, out_features: int, eps: float, bias: bool = True, @@ -371,7 +377,11 @@ def __init__(self, hidden_size: int, num_attention_heads: int): ) def forward(self, x, attn_mask=None): - return self.mhsa(x, x, x, attn_mask=attn_mask, need_weights=False) + output = self.mhsa(x, x, x, attn_mask=attn_mask, 
need_weights=False) + if isinstance(output, tuple): + output = output[0] + return output + _supported_act = {'geglu' : nn.GELU(approximate="tanh"), 'gelu' : nn.GELU(approximate="tanh"), @@ -379,6 +389,7 @@ def forward(self, x, attn_mask=None): 'relu' : nn.ReLU(), 'swiglu' : nn.SiLU()} + class TorchGLU(nn.Module): def __init__(self, activation: str): super().__init__() @@ -391,6 +402,7 @@ def forward(self, x): a = self.act(a) return a * b + class TorchLayerNormMLP(nn.Module): def __init__(self, hidden_size: int, ffn_hidden_size: int, eps: float = 1e-5, activation = 'gelu', @@ -431,7 +443,7 @@ def forward( attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: a = self.ln(x) - b, _ = self.causal_attn(a, attn_mask) + b = self.causal_attn(a, attn_mask) x = x + self.resid_attn_dropout(b) n = self.ln_mlp(x) x = x + self.resid_mlp_dropout(n) @@ -754,6 +766,75 @@ def test_gpt_accuracy(dtype, bs, model): assert_allclose(te_outputs[0], torch_outputs[0], 5e-2) +def _test_mha_accuracy(block, bs, dtype, config, mask_type): + reset_rng_states() + + inp_hidden_states = torch.randn( + config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True + ).cuda() + inp_hidden_states.retain_grad() + inp_attn_mask = get_causal_attn_mask(config.seq_len) if mask_type == "causal" else None + + out = block(inp_hidden_states, inp_attn_mask) + loss = out.sum() + loss.backward() + + torch.cuda.synchronize() + outputs = [out, inp_hidden_states.grad] + for p in block.parameters(): + if p.requires_grad: + outputs.append(p.grad) + return outputs + + +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +@pytest.mark.parametrize("mask_type", mask_types) +def test_mha_accuracy(dtype, bs, model, mask_type): + config = model_configs[model] + + te_mha = ( + MultiheadAttention( + config.hidden_size, + config.num_attention_heads, + fuse_qkv_params=True, + qkv_weight_interleaved=False, + 
input_layernorm=False, + attn_mask_type=mask_type, + ) + .to(dtype=dtype) + .cuda() + .eval() + ) + + torch_mha = ( + TorchMHA( + config.hidden_size, + config.num_attention_heads, + ) + .to(dtype=dtype) + .cuda() + .eval() + ) + + # Share params + with torch.no_grad(): + torch_mha.mhsa.in_proj_weight = Parameter(te_mha.qkv.weight.clone()) + torch_mha.mhsa.in_proj_bias = Parameter(te_mha.qkv.bias.clone()) + torch_mha.mhsa.out_proj.weight = Parameter(te_mha.proj.weight.clone()) + torch_mha.mhsa.out_proj.bias = Parameter(te_mha.proj.bias.clone()) + + te_outputs = _test_mha_accuracy(te_mha, bs, dtype, config, mask_type) + torch_outputs = _test_mha_accuracy(torch_mha, bs, dtype, config, mask_type) + + # Check output. + if dtype == torch.float32: + assert_allclose(te_outputs[0], torch_outputs[0], 5e-3) + else: + assert_allclose(te_outputs[0], torch_outputs[0], 5e-2) + + def _test_granular_accuracy(block, bs, dtype, config): reset_rng_states() diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 65b2f39684..1e1fafcac5 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -1267,7 +1267,7 @@ def test_export_multihead_attention( input_ln_str = "_input-ln" if input_layernorm else "" fname = f"te.multihead_attention{fp8_str}{attn_mask_str}{attn_type_str}{input_ln_str}{fuse_qkv_str}{dtype_str}.onnx" - model = te.attention.MultiHeadAttention( + model = te.MultiheadAttention( *attention_args, attn_mask_type=attn_mask_type, params_dtype=precision, @@ -1275,6 +1275,7 @@ def test_export_multihead_attention( input_layernorm=input_layernorm, attention_type=attention_type, fuse_qkv_params=fuse_qkv_params, + return_bias=True, ).to(device='cuda') inp_context = (hidden_states_context, attention_mask, encoder_output) diff --git a/transformer_engine/pytorch/__init__.py b/transformer_engine/pytorch/__init__.py index b67ecd05b9..92a07e1242 100644 --- a/transformer_engine/pytorch/__init__.py +++ 
b/transformer_engine/pytorch/__init__.py @@ -9,6 +9,7 @@ from .module import LayerNorm from .module import RMSNorm from .attention import DotProductAttention +from .attention import MultiheadAttention from .transformer import TransformerLayer from .fp8 import fp8_autocast from .export import onnx_export diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 79f4b71c4e..6842a9bc60 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -30,6 +30,7 @@ attention_mask_func, split_tensor_along_dim, get_device_compute_capability, + get_default_init_method, ) from transformer_engine.pytorch.constants import ( AttnMaskTypes, @@ -56,7 +57,7 @@ from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_forward_func # pylint: disable=no-name-in-module,ungrouped-imports -__all__ = ["DotProductAttention"] +__all__ = ["DotProductAttention", "MultiheadAttention"] def _rotate_half(x: torch.Tensor) -> torch.Tensor: @@ -1181,20 +1182,132 @@ def forward( ) -class MultiHeadAttention(torch.nn.Module): - """Parallel attention w/o QKV and Proj Gemms - BMM1 -> softmax + dropout -> BMM2 +class MultiheadAttention(torch.nn.Module): + r""" + Multi-head Attention (MHA), including Query, + Key, Value and Output projection. + + .. note:: + + Argument :attr:`attention_mask` will be ignored in the `forward` call when + :attr:`self_attn_mask_type` is set to `"causal"`. + + Parameters + ---------- + hidden_size : int + size of each input sample. + num_attention_heads : int + number of attention heads in the transformer layer. + kv_channels: int, default = `None` + number of key-value channels. defaults to + :attr:`hidden_size` / :attr:`num_attention_heads` if `None`. + attention_dropout: float, default = 0.1 + dropout probability for the dropout op during multi-head attention. 
+ layernorm_epsilon : float, default = 1e-5 + a value added to the denominator of layer normalization + for numerical stability. + init_method : Callable, default = `None` + used for initializing weights of QKV and FC1 weights in the following way: + `init_method(weight)`. When set to `None`, defaults to + `torch.nn.init.normal_(mean=0.0, std=0.023)`. + output_layer_init_method : Callable, default = `None` + used for initializing weights of PROJ and FC2 in the following way: + `output_layer_init_method(weight)`. When set to `None`, defaults to + `torch.nn.init.normal_(mean=0.0, std=0.023)`. + layer_number: int, default = `None` + layer number of the current `TransformerLayer` when multiple such modules are + concatenated to form a transformer block. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. + num_gqa_groups : int, default = `None` + number of GQA groups in the transformer layer. + Grouped Query Attention is described in + `this paper `_. + This only affects the keys and values, not the querys. + GQA-1 is equivalent to Multi-Query Attention + (`MQA `_), while GQA-H + is equivalent to MHA, i.e. `num_gqa_groups = num_attention_heads`. + return_layernorm_output : bool, default = `False` + if set to `True`, output of layernorm is returned from the forward + together with the output of the linear transformation. + Example use case: residual connection for transformer module is + taken post layernorm. + input_layernorm: bool, default = `True` + if set to `False`, layer normalization to the input is not applied. + attention_type: { 'self', 'cross' }, default = 'self' + type of attention applied. + zero_centered_gamma : bool, default = 'False' + if set to 'True', gamma parameter in LayerNorm is initialized to 0 and + the LayerNorm formula changes to + + .. 
math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * + (1 + \gamma) + \beta + normalization : { 'LayerNorm', 'RMSNorm' }, default = 'LayerNorm' + type of normalization applied. + qkv_weight_interleaved : bool, default = `True` + if set to `False`, the QKV weight is interpreted as a concatenation of + query, key, and value weights along the `0th` dimension. The default + interpretation is that the individual `q`, `k`, and `v` weights for each + attention head are interleaved. This parameter is set to `False` when + using :attr:`fuse_qkv_params=False`. + bias : bool, default = `True` + if set to `False`, the transformer layer will not learn any additive biases. + device : Union[torch.device, str], default = "cuda" + The device on which the parameters of the model will allocated. It is the user's + responsibility to ensure all parameters are moved to the GPU before running the + forward pass. + + Parallelism parameters + ---------------------- + set_parallel_mode : bool, default = `False` + if set to `True`, QKV and FC1 layers are used as Column Parallel + whereas PROJ and FC2 is used as Row Parallel as described + `here `_. + sequence_parallel : bool, default = `False` + if set to `True`, uses sequence parallelism. + tp_group : ProcessGroup, default = `None` + tensor parallel process group. + tp_size : int, default = 1 + used as TP (tensor parallel) world size when TP groups are not formed during + initialization. In this case, users must call the + `set_tensor_parallel_group(tp_group)` method on the initialized module before the + forward pass to supply the tensor parallel group needed for tensor and sequence + parallel collectives. + + Optimization parameters + ----------------------- + fuse_wgrad_accumulation : bool, default = 'False' + if set to `True`, enables fusing of creation and accumulation of + the weight gradient. 
When enabled, it is assumed that the weights + have an additional `main_grad` attribute (used instead of the + regular `grad`) which is a pre-allocated buffer of the correct + size to accumulate gradients in. + params_dtype : torch.dtype, default = `torch.get_default_dtype()` + it controls the type used to allocate the initial parameters. Useful when + the model is trained with lower precision and the original FP32 parameters + would not fit in GPU memory. + return_bias : bool, default = `False` + when set to `True`, this module will not apply the additive bias itself, but + instead return the bias value during the forward pass together with the + output of the linear transformation :math:`y = xA^T`. This is useful when + the bias addition can be fused to subsequent operations. + fuse_qkv_params: bool, default = 'False' + if set to `True`, `TransformerLayer` module exposes a single fused + parameter for query-key-value. This enables optimizations such as QKV + fusion without concatentations/splits and also enables the argument + `fuse_wgrad_accumulation`. 
""" def __init__( self, hidden_size: int, num_attention_heads: int, - kv_channels: int, - attention_dropout: float, - layernorm_epsilon: float, - init_method: Callable, - output_layer_init_method: Callable, + kv_channels: Optional[int] = None, + attention_dropout: float = 0.1, + layernorm_epsilon: float = 1e-5, + init_method: Optional[Callable] = None, + output_layer_init_method: Optional[Callable] = None, layer_number: Optional[int] = None, attn_mask_type: str = "causal", tp_group: Optional[dist_group_type] = None, @@ -1204,6 +1317,7 @@ def __init__( get_rng_state_tracker: Optional[Callable] = None, sequence_parallel: bool = False, params_dtype: Optional[torch.dtype] = None, + return_bias: bool = False, return_layernorm_output: bool = False, input_layernorm: bool = False, attention_type: str = "self", @@ -1227,9 +1341,16 @@ def __init__( self.tp_group = tp_group self.return_layernorm_output = return_layernorm_output self.params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype - self.init_method = init_method self.attn_mask_type = attn_mask_type self.num_attention_heads = num_attention_heads + self.return_bias = return_bias + + kv_channels = kv_channels if kv_channels else (hidden_size // num_attention_heads) + + if init_method is None: + init_method = get_default_init_method() + if output_layer_init_method is None: + output_layer_init_method = get_default_init_method() if not fuse_qkv_params: qkv_weight_interleaved = False @@ -1358,7 +1479,7 @@ def __init__( hidden_size, init_method=output_layer_init_method, bias=bias, - return_bias=True, + return_bias=return_bias, parallel_mode="row" if set_parallel_mode else None, ub_split_rs=ub_split_rs, ub_split_ag=ub_split_ag, @@ -1395,10 +1516,54 @@ def forward( core_attention_bias: Optional[torch.Tensor] = None, fast_zero_fill: bool = True, ) -> Tuple[Union[torch.Tensor, None], ...]: - """MultiHeadAttention FWD""" + """ + Forward propagation for MultiheadAttention layer. + + .. 
note:: + + Argument :attr:`attention_mask` will be ignored when :attr:`self_attn_mask_type` + is set to `"causal"`. + + Parameters + ---------- + hidden_states : torch.Tensor + Input tensor. + attention_mask : Optional[torch.Tensor], default = `None` + Boolean tensor used to mask out self-attention softmax input. + encoder_output : Optional[torch.Tensor], default = `None` + Output of the encoder block to be fed into the decoder block if using + `layer_type="decoder"`. + is_first_microbatch : {True, False, None}, default = None + During training using either gradient accumulation or + pipeline parallelism a minibatch of data is further split + into microbatches. Between the microbatches of the same minibatch + the model weights are not updated. Setting this parameter indicates + whether the current microbatch is the first in a minibatch or not. + When set, this parameter enables additional optimizations: + + * during FP8 training, it allows caching of the FP8 versions of + the weights + * it also allows skipping gradient accumulation during the + first microbatch (since it is the first gradient being + produced) + checkpoint_core_attention: bool, default = `False` + If true, forward activations for core attention are recomputed + during the backward pass in order to save memory that would + otherwise be occupied to store the forward activations until + backprop. + rotary_pos_emb: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], default = `None` + Embeddings for query and key tensors for applying rotary position + embedding. By default no input embedding is applied. + core_attention_bias_type: str, default = `no_bias` + Bias type, {`no_bias`, `pre_scale_bias`, 'post_scale_bias`} + core_attention_bias: Optional[torch.Tensor], default = `None` + Bias tensor for Q * K.T + fast_zero_fill: bool, default = `True` + Whether to set output tensors to 0 or not before use. 
+ """ # hidden_states: [sq, b, h] - if self.attn_mask_type != "causal" and attention_mask is not None: + if self.attn_mask_type == "padding" and attention_mask is not None: assert ( attention_mask.dtype == torch.bool ), "Attention mask must be a boolean tensor" @@ -1604,20 +1769,28 @@ def forward( key_layer, value_layer, attention_mask, - checkpoint_core_attention = checkpoint_core_attention, - core_attention_bias_type = core_attention_bias_type, - core_attention_bias = core_attention_bias, - fast_zero_fill = fast_zero_fill, + checkpoint_core_attention=checkpoint_core_attention, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + fast_zero_fill=fast_zero_fill, ) # ================= # Output. [sq, b, h] # ================= - attention_output, attention_bias = self.proj( + projection_output = self.proj( context_layer, is_first_microbatch=is_first_microbatch ) + if self.return_bias: + attention_output, attention_bias = projection_output + else: + attention_output, attention_bias = projection_output, None + + outputs = (attention_output,) + if self.return_bias: + outputs += (attention_bias,) if self.input_layernorm and self.return_layernorm_output: - return attention_output, attention_bias, layernorm_output - return attention_output, attention_bias + outputs += (layernorm_output,) + return outputs if len(outputs) > 1 else outputs[0] diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index f27784d135..de93cd652f 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -12,7 +12,7 @@ import transformer_engine_extensions as tex from transformer_engine.pytorch.module import LayerNormMLP, LayerNorm, RMSNorm -from transformer_engine.pytorch.attention import MultiHeadAttention +from transformer_engine.pytorch.attention import MultiheadAttention from transformer_engine.pytorch.jit import ( set_jit_fusion_options, 
warmup_jit_bias_dropout_add_all_dtypes, @@ -323,25 +323,27 @@ def __init__( "ub_split_rs" : ub_split_rs, } - self.self_attention = MultiHeadAttention( + self.self_attention = MultiheadAttention( *attention_args, **common_attention_kwargs, attn_mask_type=self_attn_mask_type, input_layernorm=not output_layernorm, attention_type="self", bias=bias, + return_bias=True, normalization=normalization, device=device, ) if layer_type == "decoder": - self.inter_attention = MultiHeadAttention( + self.inter_attention = MultiheadAttention( *attention_args, **common_attention_kwargs, attn_mask_type="padding", input_layernorm=True, attention_type="cross", bias=bias, + return_bias=True, normalization=normalization, device=device, ) From 5b16352a5eb6bcb6e506fef5c0d8319a1c73400a Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:35:26 -0700 Subject: [PATCH 049/427] Fix rng_state issue and minor compiler warning (#395) fix rng_state issue and minor compiler warning Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- tests/pytorch/test_fused_attn.py | 6 ++---- .../common/transpose/transpose_fusion.cu | 2 -- .../pytorch/csrc/extensions/attention.cu | 16 ++++++++++++++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index f516b70b0e..3c8a10e9e9 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -181,9 +181,6 @@ def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): seqlens.fill_(config.seq_len) cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) - op_grad = torch.randn( - config.seq_len, bs, config.num_attention_heads * config.head_dim, - dtype = dtype).cuda() sigma = 0.02 init_method = init_method_normal(sigma) @@ -241,7 +238,8 @@ def _run_transformer_layer(dtype, bs, config, backend, 
ckpt_attn, bias_type): checkpoint_core_attention = ckpt_attn, core_attention_bias_type = bias_type, core_attention_bias = bias) - op.backward(op_grad) + loss = op.sum() + loss.backward() return op, inp.grad diff --git a/transformer_engine/common/transpose/transpose_fusion.cu b/transformer_engine/common/transpose/transpose_fusion.cu index ba89c4abd2..8561a6881b 100644 --- a/transformer_engine/common/transpose/transpose_fusion.cu +++ b/transformer_engine/common/transpose/transpose_fusion.cu @@ -293,8 +293,6 @@ transpose_dbias_kernel_notaligned(const Param param, } } OVec out_trans[nvec_in]; // NOLINT(*) - const bool valid_store = my_place < tile_length && - warp_id_in_tile * n_iterations + i < tile_height; transpose_regs_partial_dbias( in[current_in ^ 1], out_trans, diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cu b/transformer_engine/pytorch/csrc/extensions/attention.cu index 4904fbade5..423b16013f 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cu +++ b/transformer_engine/pytorch/csrc/extensions/attention.cu @@ -194,7 +194,13 @@ std::vector fused_attn_fwd_qkvpacked( for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); // allocate memory for nvte_aux_tensor_pack.tensors - auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + at::Tensor output_tensor; + if (nvte_aux_tensor_pack.size >= 2) { + output_tensor = (i < nvte_aux_tensor_pack.size-1) + ? 
allocateSpace(tensor->data.shape, tensor->data.dtype, false) : rng_state; + } else { + output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + } output_tensors.push_back(output_tensor); tensor->data.dptr = output_tensor.data_ptr(); } @@ -497,7 +503,13 @@ std::vector fused_attn_fwd_kvpacked( for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); // allocate memory for nvte_aux_tensor_pack.tensors - auto output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + at::Tensor output_tensor; + if (nvte_aux_tensor_pack.size >= 2) { + output_tensor = (i < nvte_aux_tensor_pack.size-1) + ? allocateSpace(tensor->data.shape, tensor->data.dtype, false) : rng_state; + } else { + output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + } output_tensors.push_back(output_tensor); tensor->data.dptr = output_tensor.data_ptr(); } From e6db29d15bdfeaefea091372a1b43a8a59d0f51d Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 25 Aug 2023 19:21:17 -0700 Subject: [PATCH 050/427] [PyTorch] move mask types to fprop (#402) * API change and some test fixes Signed-off-by: Kirthi Shankar Sivamani * more test fixes Signed-off-by: Kirthi Shankar Sivamani * ONNX fixes Signed-off-by: Kirthi Shankar Sivamani * fix Signed-off-by: Kirthi Shankar Sivamani * Fixed fused attention tests Signed-off-by: Kirthi Shankar Sivamani * rm duplicate test Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- tests/pytorch/test_fused_attn.py | 252 +++++++++++----------- tests/pytorch/test_numerics.py | 24 ++- tests/pytorch/test_onnx_export.py | 29 +-- tests/pytorch/test_sanity.py | 10 +- transformer_engine/pytorch/attention.py | 145 ++++++++----- transformer_engine/pytorch/softmax.py | 5 +- transformer_engine/pytorch/transformer.py | 48 +++-- 7 files changed, 287 insertions(+), 226 deletions(-) diff --git 
a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index 3c8a10e9e9..32442e40fb 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -77,10 +77,10 @@ def test_dot_product_attention(dtype, bs, model, ckpt_attn, bias_type): atol, rtol = (2.5e-2, 2.5e-2) if dtype == torch.bfloat16 else (5e-3, 5e-3) if bias_type == "no_bias": - assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type): @@ -94,18 +94,18 @@ def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type) inp = torch.randn( config.seq_len, bs, 3, config.num_attention_heads, config.head_dim, - dtype = dtype).cuda() + dtype=dtype).cuda() inp.requires_grad=True - seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens = torch.empty(bs, dtype=torch.int32).cuda() seqlens.fill_(config.seq_len) - cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) - cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + cu_seqlens = torch.zeros(bs + 1, device=inp.device, dtype=torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) op_grad = torch.randn( config.seq_len, bs, config.num_attention_heads * config.head_dim, dtype = dtype).cuda() if bias_type != "no_bias": bias = torch.randn(1, config.num_attention_heads, config.seq_len, config.seq_len, - dtype = dtype).cuda() + 
dtype=dtype).cuda() else: bias = None @@ -113,24 +113,23 @@ def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type) DotProductAttention( config.num_attention_heads, config.head_dim, - attention_dropout = config.dropout_p, - attn_mask_type = config.attn_mask_type, - sequence_parallel = False, - tp_size = 1, - get_rng_state_tracker = get_dummy_cuda_rng_tracker, - tp_group = None, - layer_number = 1, - attention_type = "self" - ).to(dtype = dtype).cuda() + attention_dropout=config.dropout_p, + sequence_parallel=False, + tp_size=1, + get_rng_state_tracker=get_dummy_cuda_rng_tracker, + tp_group=None, + layer_number=1, + attention_type="self" + ).to(dtype=dtype).cuda() ) q = inp[:, :,0,:,:] k = inp[:, :,1,:,:] v = inp[:, :,2,:,:] - op = block(q, k, v, - checkpoint_core_attention = ckpt_attn, - core_attention_bias_type = bias_type, - core_attention_bias = bias) + op = block(q, k, v, attn_mask_type=config.attn_mask_type, + checkpoint_core_attention=ckpt_attn, + core_attention_bias_type=bias_type, + core_attention_bias=bias) op.backward(op_grad) return op, inp.grad @@ -158,10 +157,10 @@ def test_transformer_layer(dtype, bs, model, ckpt_attn, bias_type): atol, rtol = (5e-1, 5e-2) if bias_type == "no_bias": - assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): @@ -175,12 +174,12 @@ def 
_run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): inp = torch.randn( config.seq_len, bs, config.num_attention_heads * config.head_dim, - dtype = dtype).cuda() + dtype=dtype).cuda() inp.requires_grad=True - seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens = torch.empty(bs, dtype=torch.int32).cuda() seqlens.fill_(config.seq_len) - cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) - cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + cu_seqlens = torch.zeros(bs + 1, device=inp.device, dtype=torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) sigma = 0.02 init_method = init_method_normal(sigma) @@ -192,7 +191,7 @@ def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): rate.item() for rate in torch.linspace(0, drop_path_rate, config.num_layers)] if bias_type != "no_bias": bias = torch.randn(1, config.num_attention_heads, config.seq_len, config.seq_len, - dtype = dtype).cuda() + dtype=dtype).cuda() else: bias = None @@ -201,43 +200,42 @@ def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): config.hidden_size, 4 * config.hidden_size, config.num_attention_heads, - layernorm_epsilon = 1e-5, - hidden_dropout = 0.0, - attention_dropout = config.dropout_p, - init_method = init_method, - output_layer_init_method = output_layer_init_method, - layer_number = layer_number, - kv_channels = config.head_dim, - self_attn_mask_type = config.attn_mask_type, - tp_group = None, - tp_size = 1, - params_dtype = dtype, - get_rng_state_tracker = None, - fuse_wgrad_accumulation = False, - seq_length = config.seq_len, - micro_batch_size = bs, - sequence_parallel = False, - apply_residual_connection_post_layernorm = False, - output_layernorm = False, - layer_type = "encoder", - drop_path_rate = drop_path_rates[layer_number - 1], - set_parallel_mode = True, - fuse_qkv_params = True, - zero_centered_gamma = False, - qkv_weight_interleaved = False, - ub_tp_comm_overlap = False, - 
bias = True, + layernorm_epsilon=1e-5, + hidden_dropout=0.0, + attention_dropout=config.dropout_p, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + layer_number=layer_number, + kv_channels=config.head_dim, + tp_group=None, + tp_size=1, + params_dtype=dtype, + get_rng_state_tracker=None, + fuse_wgrad_accumulation=False, + seq_length=config.seq_len, + micro_batch_size=bs, + sequence_parallel=False, + apply_residual_connection_post_layernorm=False, + output_layernorm=False, + layer_type="encoder", + drop_path_rate=drop_path_rates[layer_number - 1], + set_parallel_mode=True, + fuse_qkv_params=True, + zero_centered_gamma=False, + qkv_weight_interleaved=False, + ub_tp_comm_overlap=False, + bias=True, ) - .to(dtype = dtype) + .to(dtype=dtype) .cuda() ) num_iters = 10 for i in range(num_iters): - op = block(inp, - checkpoint_core_attention = ckpt_attn, - core_attention_bias_type = bias_type, - core_attention_bias = bias) + op = block(inp, self_attn_mask_type=config.attn_mask_type, + checkpoint_core_attention=ckpt_attn, + core_attention_bias_type=bias_type, + core_attention_bias=bias) loss = op.sum() loss.backward() @@ -270,8 +268,8 @@ def find_factors(x): dtype, bs, config, "UnfusedDotProductAttention", num_q_per_gqa_group) atol, rtol = 5e-1, 5e-2 - assert torch.allclose(flash_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) - assert torch.allclose(flash_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + assert torch.allclose(flash_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + assert torch.allclose(flash_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_transformer_layer_gqa(dtype, bs, config, backend, num_querys_per_gqa_group): @@ -282,15 +280,15 @@ def _run_transformer_layer_gqa(dtype, bs, config, backend, num_querys_per_gqa_gr inp = torch.randn( config.seq_len, bs, config.num_attention_heads * config.head_dim, - dtype = dtype).cuda() + dtype=dtype).cuda() inp.requires_grad=True - seqlens = torch.empty(bs, dtype = 
torch.int32).cuda() + seqlens = torch.empty(bs, dtype=torch.int32).cuda() seqlens.fill_(config.seq_len) - cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) - cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + cu_seqlens = torch.zeros(bs + 1, device=inp.device, dtype=torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) op_grad = torch.randn( config.seq_len, bs, config.num_attention_heads * config.head_dim, - dtype = dtype).cuda() + dtype=dtype).cuda() sigma = 0.02 init_method = init_method_normal(sigma) @@ -306,39 +304,38 @@ def _run_transformer_layer_gqa(dtype, bs, config, backend, num_querys_per_gqa_gr config.hidden_size, 4 * config.hidden_size, config.num_attention_heads, - num_gqa_groups = config.num_attention_heads / num_querys_per_gqa_group, - layernorm_epsilon = 1e-5, - hidden_dropout = 0.0, - attention_dropout = config.dropout_p, - init_method = init_method, - output_layer_init_method = output_layer_init_method, - layer_number = layer_number, - kv_channels = config.head_dim, - self_attn_mask_type = config.attn_mask_type, - tp_group = None, - tp_size = 1, - params_dtype = dtype, - get_rng_state_tracker = None, - fuse_wgrad_accumulation = False, - seq_length = config.seq_len, - micro_batch_size = bs, - sequence_parallel = False, - apply_residual_connection_post_layernorm = False, - output_layernorm = False, - layer_type = "encoder", - drop_path_rate = drop_path_rates[layer_number - 1], - set_parallel_mode = True, - fuse_qkv_params = True, - zero_centered_gamma = False, - qkv_weight_interleaved = False, - ub_tp_comm_overlap = False, - bias = True, + num_gqa_groups=config.num_attention_heads / num_querys_per_gqa_group, + layernorm_epsilon=1e-5, + hidden_dropout=0.0, + attention_dropout=config.dropout_p, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + layer_number=layer_number, + kv_channels=config.head_dim, + tp_group=None, + tp_size= 1, + params_dtype=dtype, + get_rng_state_tracker=None, + 
fuse_wgrad_accumulation=False, + seq_length=config.seq_len, + micro_batch_size=bs, + sequence_parallel=False, + apply_residual_connection_post_layernorm=False, + output_layernorm=False, + layer_type="encoder", + drop_path_rate=drop_path_rates[layer_number - 1], + set_parallel_mode=True, + fuse_qkv_params=True, + zero_centered_gamma=False, + qkv_weight_interleaved=False, + ub_tp_comm_overlap=False, + bias=True, ) - .to(dtype = dtype) + .to(dtype=dtype) .cuda() ) - op = block(inp) + op = block(inp, self_attn_mask_type=config.attn_mask_type) op.backward(op_grad) return op, inp.grad @@ -365,8 +362,8 @@ def test_dpa_fp8(dtype, bs, model): dtype, bs, config, "UnfusedDotProductAttention") atol, rtol = (2.5e-2, 2.5e-2) - assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) - assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol = atol, rtol = rtol) + assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_dpa_fp8(dtype, bs, config, backend): @@ -376,15 +373,15 @@ def _run_dpa_fp8(dtype, bs, config, backend): inp = 0.01 * torch.randn( bs * config.seq_len, config.num_attention_heads * config.head_dim, - dtype = dtype).cuda() + dtype=dtype).cuda() inp.requires_grad=True - seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens = torch.empty(bs, dtype=torch.int32).cuda() seqlens.fill_(config.seq_len) - cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) - cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + cu_seqlens = torch.zeros(bs + 1, device=inp.device, dtype=torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) op_grad = 0.01 * torch.randn( bs * config.seq_len, config.num_attention_heads * config.head_dim, - dtype = dtype).cuda() + dtype=dtype).cuda() torch.save(op_grad, 'op_grad.pt') fp8_recipe = recipe.DelayedScaling( @@ -395,7 +392,7 @@ def _run_dpa_fp8(dtype, bs, config, backend): 
amax_compute_algo="most_recent", ) - dpa = DPA_FP8(config).to(dtype = torch.float16).cuda() + dpa = DPA_FP8(config).to(dtype=torch.float16).cuda() with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe): op = dpa(inp, cu_seqlens, config.seq_len) op.backward(op_grad) @@ -416,31 +413,30 @@ def _run_dpa_fp8_ref(dtype, bs, config, backend): inp = torch.load('qkv.pt').cuda() inp.requires_grad=True - seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens = torch.empty(bs, dtype=torch.int32).cuda() seqlens.fill_(config.seq_len) - cu_seqlens = torch.zeros(bs + 1, device = inp.device, dtype = torch.int32) - cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + cu_seqlens = torch.zeros(bs + 1, device=inp.device, dtype=torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) op_grad = torch.load('op_grad.pt').cuda().view(bs, config.seq_len, -1).transpose(0,1) block = ( DotProductAttention( config.num_attention_heads, config.head_dim, - attention_dropout = config.dropout_p, - attn_mask_type = config.attn_mask_type, - sequence_parallel = False, - tp_size = 1, - get_rng_state_tracker = None, - tp_group = None, - layer_number = 1, - attention_type = "self" - ).to(dtype = dtype).cuda() + attention_dropout=config.dropout_p, + sequence_parallel=False, + tp_size=1, + get_rng_state_tracker=None, + tp_group=None, + layer_number=1, + attention_type="self" + ).to(dtype=dtype).cuda() ) q = inp[:, :,0,:,:] k = inp[:, :,1,:,:] v = inp[:, :,2,:,:] - op = block(q, k, v) + op = block(q, k, v, attn_mask_type=config.attn_mask_type) op.backward(op_grad) torch.save(op,'ctx_ref.pt') torch.save(inp.grad,'dqkv_ref.pt') @@ -533,8 +529,8 @@ def forward( workspace, bias=qkv_bias, use_bias=True, - out_index = META_QKV, - fp8_meta_tensor = fp8_meta["scaling_fwd"], + out_index=META_QKV, + fp8_meta_tensor=fp8_meta["scaling_fwd"], use_split_accumulator=_2X_ACC_FPROP, D_dtype=fp8_dtype_forward, ) @@ -558,13 +554,13 @@ def forward( fp8_meta["scaling_fwd"].scale[META_O], 
fp8_meta["scaling_fwd"].amax_history[0][META_S], fp8_meta["scaling_fwd"].amax_history[0][META_O], - attn_scale = None, - dropout = p_dropout, - fast_zero_fill = fast_zero_fill, - qkv_layout = "qkv_interleaved", - attn_bias_type = "no_bias", - attn_mask_type = "padding", - rng_gen = None, + attn_scale=None, + dropout=p_dropout, + fast_zero_fill=fast_zero_fill, + qkv_layout="qkv_interleaved", + attn_bias_type="no_bias", + attn_mask_type="padding", + rng_gen=None, ) M, ZInv, philox_unpacked = aux_ctx_tensors diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index f8eda48cc3..bf9f7502fd 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -376,8 +376,8 @@ def __init__(self, hidden_size: int, num_attention_heads: int): batch_first=False, ) - def forward(self, x, attn_mask=None): - output = self.mhsa(x, x, x, attn_mask=attn_mask, need_weights=False) + def forward(self, x, attention_mask=None): + output = self.mhsa(x, x, x, attn_mask=attention_mask, need_weights=False) if isinstance(output, tuple): output = output[0] return output @@ -461,7 +461,7 @@ def _test_e2e_selective_recompute(block, bs, dtype, config, recompute=False): te_out = block( te_inp_hidden_states, - te_inp_attn_mask, + attention_mask=te_inp_attn_mask, checkpoint_core_attention=recompute, ) loss = te_out.sum() @@ -526,13 +526,13 @@ def _test_e2e_full_recompute(block, bs, dtype, config, recompute=False): get_dummy_cuda_rng_tracker, None, # tp_group te_inp_hidden_states, - te_inp_attn_mask, + attention_mask=te_inp_attn_mask, checkpoint_core_attention=False, ) else: te_out = block( te_inp_hidden_states, - te_inp_attn_mask, + attention_mask=te_inp_attn_mask, checkpoint_core_attention=False, ) loss = te_out.sum() @@ -766,7 +766,7 @@ def test_gpt_accuracy(dtype, bs, model): assert_allclose(te_outputs[0], torch_outputs[0], 5e-2) -def _test_mha_accuracy(block, bs, dtype, config, mask_type): +def _test_mha_accuracy(block, bs, dtype, config, mask_type, 
te=True): reset_rng_states() inp_hidden_states = torch.randn( @@ -775,7 +775,12 @@ def _test_mha_accuracy(block, bs, dtype, config, mask_type): inp_hidden_states.retain_grad() inp_attn_mask = get_causal_attn_mask(config.seq_len) if mask_type == "causal" else None - out = block(inp_hidden_states, inp_attn_mask) + forward_kwargs = {} + if te: + forward_kwargs["attn_mask_type"] = mask_type + forward_kwargs["attention_mask"] = inp_attn_mask + + out = block(inp_hidden_states, **forward_kwargs) loss = out.sum() loss.backward() @@ -801,7 +806,6 @@ def test_mha_accuracy(dtype, bs, model, mask_type): fuse_qkv_params=True, qkv_weight_interleaved=False, input_layernorm=False, - attn_mask_type=mask_type, ) .to(dtype=dtype) .cuda() @@ -825,8 +829,8 @@ def test_mha_accuracy(dtype, bs, model, mask_type): torch_mha.mhsa.out_proj.weight = Parameter(te_mha.proj.weight.clone()) torch_mha.mhsa.out_proj.bias = Parameter(te_mha.proj.bias.clone()) - te_outputs = _test_mha_accuracy(te_mha, bs, dtype, config, mask_type) - torch_outputs = _test_mha_accuracy(torch_mha, bs, dtype, config, mask_type) + te_outputs = _test_mha_accuracy(te_mha, bs, dtype, config, mask_type, te=True) + torch_outputs = _test_mha_accuracy(torch_mha, bs, dtype, config, mask_type, te=False) # Check output. 
if dtype == torch.float32: diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 1e1fafcac5..14640febde 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -783,7 +783,6 @@ def __init__(self, softmax_fn, fake_bf16_io, mask_inp=False): self.fake_bf16_io = fake_bf16_io if self.softmax_fn == te.softmax.FusedScaleMaskSoftmax: self.fused_scaled_softmax = te.softmax.FusedScaleMaskSoftmax( - attn_mask_type="causal", mask_func=te.utils.attention_mask_func, softmax_in_fp32=True, ) @@ -793,7 +792,7 @@ def forward(self, inp, mask): inp = inp.type(torch.bfloat16) if self.fused_scaled_softmax: - ret = self.fused_scaled_softmax(inp, mask, self.scale) + ret = self.fused_scaled_softmax(inp, mask, "causal", self.scale) else: if self.mask_inp: ret = self.softmax_fn.apply(inp, mask, self.scale) @@ -867,7 +866,6 @@ def __init__(self, use_default_te_mask_fn: bool, fake_bf16_io: bool): # even when is_in_onnx_export_mode()==False. os.environ["NVTE_MASKED_SOFTMAX_FUSION"] = "0" self.fused_scaled_softmax = te.softmax.FusedScaleMaskSoftmax( - attn_mask_type="causal", mask_func=te.utils.attention_mask_func, softmax_in_fp32=True, ) @@ -875,7 +873,7 @@ def __init__(self, use_default_te_mask_fn: bool, fake_bf16_io: bool): def forward(self, inp, mask): if self.fake_bf16_io: inp = inp.type(torch.bfloat16) - ret = self.fused_scaled_softmax(inp, mask, self.scale) + ret = self.fused_scaled_softmax(inp, mask, "causal", scale=self.scale) if self.fake_bf16_io: ret = ret.type(torch.float) return ret @@ -1161,13 +1159,13 @@ def test_export_core_attention( query_layer = torch.randn(qkv_size, dtype=precision, device="cuda") key_layer = torch.randn(qkv_size, dtype=precision, device="cuda") value_layer = torch.randn(qkv_size, dtype=precision, device="cuda") - input_names = ["query", "key", "value", "attention_mask"] + input_names = ["query", "key", "value", "attention_mask", "attn_mask_type"] attention_mask = None if use_mask: # 
Generate a random mask with 50% probability for 0 or 1. probs = 0.5 * torch.ones(qkv_size[1], qkv_size[2], qkv_size[0], qkv_size[0], device="cuda", dtype=precision) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) - inp = (query_layer, key_layer, value_layer, attention_mask) + inp = (query_layer, key_layer, value_layer, attention_mask, attn_mask_type) mask_str = get_attn_mask_str(use_mask, attn_mask_type) high_prec_str = dtype2str(precision) @@ -1177,7 +1175,6 @@ def test_export_core_attention( num_attention_heads=num_attention_heads, kv_channels=kv_channels, attention_dropout=0.5, - attn_mask_type=attn_mask_type, ).to(device='cuda') do_export(model, inp, @@ -1193,9 +1190,8 @@ def test_export_core_attention( test_configs_multihead_attention = [ #"use_mask, attn_mask_type" - (False, "causal"), # calls ScaledUpperTriangMaskedSoftmax + (False, "no_mask"), # calls ScaledUpperTriangMaskedSoftmax (True, "padding"), # calls ScaledMaskedSoftmax - (False, "padding"), # calls ScaledSoftmax ] test_configs_attention_type = [ #"input_layernorm, attention_type, fuse_qkv_params" @@ -1269,7 +1265,6 @@ def test_export_multihead_attention( model = te.MultiheadAttention( *attention_args, - attn_mask_type=attn_mask_type, params_dtype=precision, return_layernorm_output=return_layernorm_output, input_layernorm=input_layernorm, @@ -1278,8 +1273,8 @@ def test_export_multihead_attention( return_bias=True, ).to(device='cuda') - inp_context = (hidden_states_context, attention_mask, encoder_output) - input_names = ["hidden_states", "attention_mask", "encoder_output"] + inp_context = (hidden_states_context, attention_mask, encoder_output, attn_mask_type) + input_names = ["hidden_states", "attention_mask", "encoder_output", "attn_mask_type"] output_names=["attention_output", "attention_bias"] do_export(model, inp_context, fname, use_fp8, input_names=input_names, output_names=output_names, dynamic_axes={"hidden_states": {0: "seq", 1:"bs"}, @@ -1347,13 +1342,13 @@ def 
test_export_transformer_layer( num_attention_heads = 4 input_tensor = torch.rand(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda") - input_names = ["input", "attention_mask"] + input_names = ["input", "attention_mask", "self_attn_mask_type"] attention_mask = None if use_mask and attn_mask_type != "causal": # Generate a random mask with 50% probability for 0 or 1. probs = 0.5 * torch.ones(batch_size, 1, sequence_length, sequence_length, device="cuda", dtype=precision) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) - inp = (input_tensor, attention_mask) + inp = (input_tensor, attention_mask, attn_mask_type) fp8_str = "_fp8" if use_fp8 else "" fuse_qkv_params_str = "_fused-qkv" if fuse_qkv_params else "" @@ -1365,7 +1360,6 @@ def test_export_transformer_layer( hidden_size, ffn_hidden_size, num_attention_heads, - self_attn_mask_type=attn_mask_type, output_layernorm=output_layernorm, params_dtype=precision, fuse_qkv_params=fuse_qkv_params, @@ -1547,17 +1541,16 @@ def test_export_gpt_generation( hidden_size, ffn_hidden_size, num_attention_heads, - self_attn_mask_type=attn_mask_type, output_layernorm=output_layernorm, params_dtype=precision, fuse_qkv_params=fuse_qkv_params, zero_centered_gamma=zero_centered_gamma).to(device='cuda') # "Context phase": use full input sequence length - input_names = ["input"] + input_names = ["input", "attention_mask", "self_attn_mask_type"] output_names = ["output"] input_tensor = torch.rand(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda") - inp = (input_tensor,) + inp = (input_tensor, None, attn_mask_type) do_export(model, inp, fname, use_fp8, input_names=input_names, output_names=output_names, dynamic_axes={"input": {0: "seq", 1:"bs"}, diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index 2605c563d6..21497b417f 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -176,7 +176,7 @@ def _test_sanity_e2e_amp(block, bs, 
dtype, config, fp8_recipe, skip_wgrad): use_fp8 = fp8_recipe is not None with torch.autocast(device_type="cuda", enabled=True, dtype=dtype): with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe): - te_out = block(te_inp_hidden_states, te_inp_attn_mask) + te_out = block(te_inp_hidden_states, attention_mask=te_inp_attn_mask) loss = te_out.sum() loss.backward() @@ -217,7 +217,7 @@ def _test_sanity_e2e_gradient_accumulation_fusion(block, bs, dtype, config, fp8_ use_fp8 = fp8_recipe is not None with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe): - te_out = block(te_inp_hidden_states, te_inp_attn_mask) + te_out = block(te_inp_hidden_states, attention_mask=te_inp_attn_mask) loss = te_out.sum() loss.backward() torch.cuda.synchronize() @@ -253,7 +253,7 @@ def _test_sanity_e2e(block, bs, dtype, config, fp8_recipe, skip_wgrad): use_fp8 = fp8_recipe is not None with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe): - te_out = block(te_inp_hidden_states, te_inp_attn_mask) + te_out = block(te_inp_hidden_states, attention_mask=te_inp_attn_mask) loss = te_out.sum() loss.backward() torch.cuda.synchronize() @@ -282,7 +282,9 @@ def _test_sanity_e2e_T5(block, bs, dtype, config, fp8_recipe, skip_wgrad): use_fp8 = fp8_recipe is not None with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe): te_out = block( - te_inp_hidden_states, te_inp_attn_mask, encoder_output=te_inp_hidden_states + te_inp_hidden_states, + attention_mask=te_inp_attn_mask, + encoder_output=te_inp_hidden_states ) loss = te_out.sum() loss.backward() diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 6842a9bc60..a30f20d3a8 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -196,23 +196,15 @@ def __init__( norm_factor: float, attention_dropout: float = 0.0, attention_dropout_ctx: Optional[Callable] = nullcontext, - attn_mask_type: str = "causal", layer_number: Optional[int] = None, ) -> None: 
super().__init__() - assert ( - attn_mask_type in AttnMaskTypes - ), f"attn_mask_type {attn_mask_type} not supported" - self.norm_factor = norm_factor self.attention_dropout_ctx = attention_dropout_ctx self.layer_number = layer_number - self.scale_mask_softmax = FusedScaleMaskSoftmax( - attn_mask_type, - attention_mask_func, - ) + self.scale_mask_softmax = FusedScaleMaskSoftmax(attention_mask_func) # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but @@ -228,11 +220,17 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, + attn_mask_type: str = "causal", attention_mask: Optional[torch.Tensor] = None, core_attention_bias_type: str = "no_bias", core_attention_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: """core attention fprop""" + + assert ( + attn_mask_type in AttnMaskTypes + ), f"attn_mask_type {attn_mask_type} not supported" + batch_size, seqlen = query_layer.shape[1], query_layer.shape[0] apply_qk_layer_scaling = self.apply_qk_layer_scaling and key_layer.dtype == torch.float16 @@ -321,7 +319,8 @@ def forward( # attention scores and attention mask [b, np, sq, sk] softmax_scale = self.layer_number if apply_qk_layer_scaling else None - attention_probs = self.scale_mask_softmax(attention_scores, attention_mask, softmax_scale) + attention_probs = self.scale_mask_softmax( + attention_scores, attention_mask, attn_mask_type, softmax_scale) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
@@ -464,7 +463,6 @@ def __init__( norm_factor: float, attention_dropout: float = 0.0, attention_dropout_ctx: Optional[Callable] = nullcontext, - attn_mask_type: str = "causal", deterministic: bool = False, ) -> None: super().__init__() @@ -473,7 +471,6 @@ def __init__( _flash_attn_version >= _flash_attn_version_required ), f"FlashAttention minimum version {_flash_attn_version_required} is required." - self.attn_causal_mask = attn_mask_type == "causal" self.norm_factor = norm_factor self.attention_dropout_ctx = attention_dropout_ctx self.attention_dropout = attention_dropout @@ -484,6 +481,7 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, + attn_mask_type: str = "causal", ) -> torch.Tensor: """flash-attn fprop""" @@ -531,7 +529,7 @@ def forward( output = flash_attn_forward_func( query_layer, key_layer, value_layer, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, self.attention_dropout if self.training else 0.0, - softmax_scale=1.0/self.norm_factor, causal=self.attn_causal_mask, + softmax_scale=1.0/self.norm_factor, causal=attn_mask_type=="causal", **fa_optional_forward_kwargs ) @@ -703,7 +701,6 @@ def __init__( norm_factor: float, attention_dropout: float = 0.0, attention_dropout_ctx: Optional[Callable] = nullcontext, - attn_mask_type: str = "causal", attention_type: str = "self", ) -> None: super().__init__() @@ -711,7 +708,6 @@ def __init__( self.norm_factor = norm_factor self.attention_dropout = attention_dropout self.attention_dropout_ctx = attention_dropout_ctx - self.attn_mask_type = attn_mask_type self.attention_type = attention_type self.use_FAv2_bwd = (os.getenv("NVTE_FUSED_ATTN_USE_FAv2_BWD", "1") == "1" and _flash_attn_2_available @@ -722,6 +718,7 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, + attn_mask_type: str = "causal", fused_attention_backend: tex.NVTE_Fused_Attn_Backend = tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend, core_attention_bias_type: str = 
"no_bias", @@ -797,7 +794,7 @@ def forward( fast_zero_fill, qkv_layout, core_attention_bias_type, - self.attn_mask_type, + attn_mask_type, None, # rng_gen fused_attention_backend, use_FAv2_bwd @@ -858,7 +855,7 @@ def forward( fast_zero_fill, qkv_layout, core_attention_bias_type, - self.attn_mask_type, + attn_mask_type, None, # rng_gen fused_attention_backend, use_FAv2_bwd @@ -886,6 +883,11 @@ class DotProductAttention(torch.nn.Module): and set the environment variable :attr:`NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`. In order to disable`flash-attn` entirely, set :attr:`NVTE_FLASH_ATTN=0`. + .. warning:: + + Argument :attr:`attn_mask_type` has been moved to the `forward` method and + is deprecated. It will be fully removed in future releases. + Parameters ---------- num_attention_heads : int @@ -902,8 +904,6 @@ class DotProductAttention(torch.nn.Module): is equivalent to MHA, i.e. `num_gqa_groups = num_attention_heads`. attention_dropout: float, default = 0.0 dropout probability for the dropout op during multi-head attention. - attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` - type of attention mask passed into softmax operation. layer_number: int, default = `None` layer number of the current `DotProductAttention` when multiple such modules are concatenated, for instance in consecutive transformer blocks. @@ -924,7 +924,7 @@ def __init__( kv_channels: int, num_gqa_groups: Optional[int] = None, attention_dropout: float = 0.0, - attn_mask_type: str = "causal", + attn_mask_type: Optional[str] = None, sequence_parallel: bool = False, tp_size: int = 1, get_rng_state_tracker: Optional[Callable] = None, @@ -934,6 +934,14 @@ def __init__( ) -> None: super().__init__() + if attn_mask_type is not None: + warnings.warn( + "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" + "is deprecated. 
It will be fully removed in future releases.", + category=DeprecationWarning, + ) + + self.attn_mask_type = attn_mask_type self.tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group) self.tp_group = tp_group self.get_rng_state_tracker = get_rng_state_tracker @@ -978,10 +986,8 @@ def __init__( attn_kwargs = { "attention_dropout": attention_dropout, "attention_dropout_ctx": attention_dropout_ctx, - "attn_mask_type": attn_mask_type, } self.attention_type = attention_type - self.attn_mask_type = attn_mask_type self.attention_dropout = attention_dropout if self.use_flash_attention: @@ -1025,6 +1031,7 @@ def forward( key_layer: torch.Tensor, value_layer: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, + attn_mask_type: str = "causal", checkpoint_core_attention: bool = False, core_attention_bias_type: str = "no_bias", core_attention_bias: Optional[torch.Tensor] = None, @@ -1067,6 +1074,8 @@ def forward( Value tensor. attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out softmax input when not using flash-attn. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. checkpoint_core_attention : bool, default = `False` If true, forward activations for attention are recomputed during the backward pass in order to save memory that would @@ -1080,6 +1089,15 @@ def forward( Whether to use the fast path to set output tensors to 0 or not. """ + if self.attn_mask_type is not None: + warnings.warn( + "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" + "is deprecated. It will be fully removed in future releases.", + category=DeprecationWarning, + ) + # Keep previous functionality for current users. 
+ attn_mask_type = self.attn_mask_type + assert (key_layer.shape[-2] == self.num_gqa_groups_per_partition and value_layer.shape[-2] == self.num_gqa_groups_per_partition ), f"Keys and values must have {self.num_gqa_groups} heads!" @@ -1102,7 +1120,7 @@ def forward( if not _flash_attn_2_available and self.num_gqa_groups != self.num_attention_heads: use_flash_attention = False - if self.attn_mask_type == "padding" and attention_mask is not None: + if attn_mask_type == "padding" and attention_mask is not None: use_flash_attention = False use_fused_attention = False @@ -1121,7 +1139,7 @@ def forward( TE_DType[key_layer.dtype], QKVLayout[qkv_layout], AttnBiasType[core_attention_bias_type], - AttnMaskType[self.attn_mask_type], + AttnMaskType[attn_mask_type], self.attention_dropout, query_layer.shape[0], key_layer.shape[0], query_layer.shape[-1]) @@ -1144,8 +1162,10 @@ def forward( return self._checkpointed_attention_forward(self.flash_attention, query_layer, key_layer, - value_layer) - return self.flash_attention(query_layer, key_layer, value_layer) + value_layer, + attn_mask_type=attn_mask_type) + return self.flash_attention( + query_layer, key_layer, value_layer, attn_mask_type=attn_mask_type) if use_fused_attention: if checkpoint_core_attention: @@ -1153,15 +1173,17 @@ def forward( query_layer, key_layer, value_layer, - fused_attention_backend = fused_attention_backend, - core_attention_bias_type = core_attention_bias_type, - core_attention_bias = core_attention_bias, - fast_zero_fill = fast_zero_fill) + attn_mask_type=attn_mask_type, + fused_attention_backend=fused_attention_backend, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + fast_zero_fill=fast_zero_fill) return self.fused_attention(query_layer, key_layer, value_layer, - fused_attention_backend = fused_attention_backend, - core_attention_bias_type = core_attention_bias_type, - core_attention_bias = core_attention_bias, - fast_zero_fill = fast_zero_fill) + 
attn_mask_type=attn_mask_type, + fused_attention_backend=fused_attention_backend, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + fast_zero_fill=fast_zero_fill) if checkpoint_core_attention: return self._checkpointed_attention_forward( @@ -1169,16 +1191,18 @@ def forward( query_layer, key_layer, value_layer, - attention_mask = attention_mask, - core_attention_bias_type = core_attention_bias_type, - core_attention_bias = core_attention_bias, + attn_mask_type=attn_mask_type, + attention_mask=attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, ) return self.unfused_attention(query_layer, key_layer, value_layer, - attention_mask = attention_mask, - core_attention_bias_type = core_attention_bias_type, - core_attention_bias = core_attention_bias, + attn_mask_type=attn_mask_type, + attention_mask=attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, ) @@ -1190,7 +1214,12 @@ class MultiheadAttention(torch.nn.Module): .. note:: Argument :attr:`attention_mask` will be ignored in the `forward` call when - :attr:`self_attn_mask_type` is set to `"causal"`. + :attr:`attn_mask_type` is set to `"causal"`. + + .. warning:: + + Argument :attr:`attn_mask_type` has been moved to the `forward` method and + is deprecated. It will be fully removed in future releases. Parameters ---------- @@ -1217,8 +1246,6 @@ class MultiheadAttention(torch.nn.Module): layer_number: int, default = `None` layer number of the current `TransformerLayer` when multiple such modules are concatenated to form a transformer block. - attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` - type of attention mask passed into softmax operation. num_gqa_groups : int, default = `None` number of GQA groups in the transformer layer. 
Grouped Query Attention is described in @@ -1309,7 +1336,7 @@ def __init__( init_method: Optional[Callable] = None, output_layer_init_method: Optional[Callable] = None, layer_number: Optional[int] = None, - attn_mask_type: str = "causal", + attn_mask_type: Optional[str] = None, tp_group: Optional[dist_group_type] = None, tp_size: int = 1, num_gqa_groups: Optional[int] = None, @@ -1334,6 +1361,15 @@ def __init__( device: Union[torch.device, str] = "cuda", ) -> None: super().__init__() + + if attn_mask_type is not None: + warnings.warn( + "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" + "is deprecated. It will be fully removed in future releases.", + category=DeprecationWarning, + ) + + self.attn_mask_type = attn_mask_type self.layer_number = layer_number self.input_layernorm = input_layernorm self.attention_type = attention_type @@ -1341,7 +1377,6 @@ def __init__( self.tp_group = tp_group self.return_layernorm_output = return_layernorm_output self.params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype - self.attn_mask_type = attn_mask_type self.num_attention_heads = num_attention_heads self.return_bias = return_bias @@ -1467,7 +1502,6 @@ def __init__( attention_dropout=attention_dropout, tp_size=tp_size, get_rng_state_tracker=get_rng_state_tracker, - attn_mask_type=attn_mask_type, sequence_parallel=sequence_parallel, tp_group=tp_group, layer_number=self.layer_number, @@ -1508,6 +1542,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, encoder_output: Optional[torch.Tensor] = None, + attn_mask_type: str = "causal", is_first_microbatch: Optional[bool] = None, checkpoint_core_attention: bool = False, inference_params: Optional[Any] = None, @@ -1521,7 +1556,7 @@ def forward( .. note:: - Argument :attr:`attention_mask` will be ignored when :attr:`self_attn_mask_type` + Argument :attr:`attention_mask` will be ignored when :attr:`attn_mask_type` is set to `"causal"`. 
Parameters @@ -1530,6 +1565,8 @@ def forward( Input tensor. attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out self-attention softmax input. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. encoder_output : Optional[torch.Tensor], default = `None` Output of the encoder block to be fed into the decoder block if using `layer_type="decoder"`. @@ -1563,7 +1600,16 @@ def forward( """ # hidden_states: [sq, b, h] - if self.attn_mask_type == "padding" and attention_mask is not None: + if self.attn_mask_type is not None: + warnings.warn( + "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" + "is deprecated. It will be fully removed in future releases.", + category=DeprecationWarning, + ) + # Keep previous functionality for current users. + attn_mask_type = self.attn_mask_type + + if attn_mask_type == "padding" and attention_mask is not None: assert ( attention_mask.dtype == torch.bool ), "Attention mask must be a boolean tensor" @@ -1768,7 +1814,8 @@ def forward( query_layer, key_layer, value_layer, - attention_mask, + attention_mask=attention_mask, + attn_mask_type=attn_mask_type, checkpoint_core_attention=checkpoint_core_attention, core_attention_bias_type=core_attention_bias_type, core_attention_bias=core_attention_bias, diff --git a/transformer_engine/pytorch/softmax.py b/transformer_engine/pytorch/softmax.py index b4166309d7..036ea98369 100644 --- a/transformer_engine/pytorch/softmax.py +++ b/transformer_engine/pytorch/softmax.py @@ -215,19 +215,16 @@ class FusedScaleMaskSoftmax(nn.Module): fused operation: scaling + mask + softmax Arguments: - attn_mask_type: attention mask type (pad or causal) mask_func: mask function to be applied. softmax_in_fp32: if true, softmax in performed at fp32 precision. 
""" def __init__( self, - attn_mask_type: str, mask_func: Callable, softmax_in_fp32: bool = True, ) -> None: super().__init__() - self.attn_mask_type = attn_mask_type self.scaled_masked_softmax_fusion = bool( int(os.getenv("NVTE_MASKED_SOFTMAX_FUSION", "1")) ) @@ -249,6 +246,7 @@ def forward( self, inp: torch.Tensor, mask: torch.Tensor, + attn_mask_type: str, scale: Optional[float] = None, ) -> torch.Tensor: """FusedScaleMaskSoftmax fprop""" @@ -257,6 +255,7 @@ def forward( self.input_in_fp16 = inp.dtype == torch.float16 self.input_in_bf16 = inp.dtype == torch.bfloat16 self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type assert ( scale is None or self.softmax_in_fp32 diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index de93cd652f..6b45a10fb3 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -73,10 +73,10 @@ class TransformerLayer(torch.nn.Module): Arguments :attr:`attention_softmax_in_fp32` and :attr:`apply_query_key_layer_scaling` are deprecated and will be fully removed in future releases. - .. note:: + .. warning:: - Argument :attr:`attention_mask` will be ignored in the `forward` call when - :attr:`self_attn_mask_type` is set to `"causal"`. + Argument :attr:`self_attn_mask_type` has been moved to the `forward` method and + is deprecated. It will be fully removed in future releases. Parameters ---------- @@ -127,8 +127,6 @@ class TransformerLayer(torch.nn.Module): kv_channels: int, default = `None` number of key-value channels. defaults to :attr:`hidden_size` / :attr:`num_attention_heads` if `None`. - self_attn_mask_type: {'causal', 'padding'}, default = `causal` - type of attention mask passed into softmax operation. 
zero_centered_gamma : bool, default = 'False' if set to 'True', gamma parameter in LayerNorm is initialized to 0 and the LayerNorm formula changes to @@ -214,7 +212,7 @@ def __init__( output_layer_init_method: Optional[Callable] = None, layer_number: Optional[int] = None, kv_channels: Optional[int] = None, - self_attn_mask_type: str = "causal", + self_attn_mask_type: Optional[str] = None, tp_group: Optional[dist_group_type] = None, tp_size: int = 1, params_dtype: Optional[torch.dtype] = None, @@ -241,6 +239,13 @@ def __init__( ) -> None: super().__init__() + if self_attn_mask_type is not None: + warnings.warn( + "Argument :attr:`self_attn_mask_type` has been moved to the `forward` method and" + "is deprecated. It will be fully removed in future releases.", + category=DeprecationWarning, + ) + warnings.warn( "Arguments `attention_softmax_in_fp32` and `apply_query_key_layer_scaling`" "are deprecated and will be fully removed in future releases.", @@ -252,6 +257,7 @@ def __init__( tex.userbuf_comm_available() ), "Userbuffer communication backend not available." 
+ self.self_attn_mask_type = self_attn_mask_type params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype ub_tp_comm_overlap = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_OVERLAP", "1"))) ub_bulk_wgrad = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_BULK_WGRAD", "1"))) @@ -265,10 +271,7 @@ def __init__( self.apply_residual_connection_post_layernorm = ( apply_residual_connection_post_layernorm ) - self.self_attn_mask_type = self_attn_mask_type - assert ( - self_attn_mask_type in AttnMaskTypes - ), f"self_attn_mask_type {self_attn_mask_type} not supported" + assert layer_type in LayerTypes, f"layer_type {layer_type} not supported" if not fuse_qkv_params: @@ -326,7 +329,6 @@ def __init__( self.self_attention = MultiheadAttention( *attention_args, **common_attention_kwargs, - attn_mask_type=self_attn_mask_type, input_layernorm=not output_layernorm, attention_type="self", bias=bias, @@ -429,6 +431,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, + self_attn_mask_type: str = "causal", encoder_output: Optional[torch.Tensor] = None, enc_dec_attn_mask: Optional[torch.Tensor] = None, is_first_microbatch: Optional[bool] = None, @@ -453,6 +456,8 @@ def forward( Input tensor. attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out self-attention softmax input. + self_attn_mask_type: {'causal', 'padding'}, default = `causal` + type of attention mask passed into softmax operation. encoder_output : Optional[torch.Tensor], default = `None` Output of the encoder block to be fed into the decoder block if using `layer_type="decoder"`. @@ -488,6 +493,19 @@ def forward( Whether to set output tensors to 0 or not before use. """ + if self.self_attn_mask_type is not None: + warnings.warn( + "Argument :attr:`self_attn_mask_type` has been moved to the `forward` method and" + "is deprecated. 
It will be fully removed in future releases.", + category=DeprecationWarning, + ) + # Keep previous functionality for current users. + self_attn_mask_type = self.self_attn_mask_type + + assert ( + self_attn_mask_type in AttnMaskTypes + ), f"self_attn_mask_type {self_attn_mask_type} not supported" + hidden_states = hidden_states.contiguous() if self.sequence_parallel and self.seq_length is not None: @@ -495,7 +513,7 @@ def forward( hidden_states.shape[0] == self.seq_length // self.tp_size ), "Sequence dimension must be split across TP group when using sequence parallel." - if self.self_attn_mask_type != "causal" and attention_mask is not None: + if self_attn_mask_type != "causal" and attention_mask is not None: assert ( attention_mask.dtype == torch.bool ), "Attention mask must be a boolean tensor" @@ -509,7 +527,8 @@ def forward( # Self attention. self_attention_outputs = self.self_attention( hidden_states, - attention_mask, + attention_mask=attention_mask, + attn_mask_type=self_attn_mask_type, inference_params=inference_params, is_first_microbatch=is_first_microbatch, checkpoint_core_attention=checkpoint_core_attention, @@ -556,7 +575,8 @@ def forward( if self.layer_type == "decoder": inter_attention_outputs = self.inter_attention( bda_output, - enc_dec_attn_mask, + attention_mask=enc_dec_attn_mask, + attn_mask_type=self_attn_mask_type, encoder_output=encoder_output, is_first_microbatch=is_first_microbatch, checkpoint_core_attention=checkpoint_core_attention, From 0170797ce9fc2a6114f4e72383ad58e1fa321dfd Mon Sep 17 00:00:00 2001 From: Tian Zheng Date: Sun, 27 Aug 2023 02:08:10 +0800 Subject: [PATCH 051/427] [Paddle] Add parallel support (#357) * [Paddle] Add TP, DP, PP, FSDP Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Minor fix Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Fix CI failure Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Remove set_nccl_overlap_warning_if_tp Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Improve variable naming Signed-off-by: Tian 
Zheng (Engrg-Hardware 1) * Refactor FP8 Buffer Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Stylic changes Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Fix FP32 parallel training Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Fix numel performance issue Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Squashed commit of the following: commit 79e2e5fd774e67dcdda9aae01a9f31a6479c5d70 Author: Tian Zheng (Engrg-Hardware 1) Date: Sun Aug 20 14:39:16 2023 +0000 Add TP test Signed-off-by: Tian Zheng (Engrg-Hardware 1) commit 1d40ad60540490f97ed82ba877cc6eda8902cbf6 Author: Tian Zheng (Engrg-Hardware 1) Date: Sun Aug 20 14:22:25 2023 +0000 Fix tp_size when disabled Signed-off-by: Tian Zheng (Engrg-Hardware 1) commit 6632f735a0c8251862355fc74622af59fae3a509 Author: Tian Zheng (Engrg-Hardware 1) Date: Sun Aug 20 05:52:18 2023 +0000 Add TP for attention and transformer layer Signed-off-by: Tian Zheng (Engrg-Hardware 1) Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Add shape check Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Add FSDP check for stage 1,2,3 Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Review changes Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Fix group_sharding test Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Support NVTE_FUSE_ATTN Signed-off-by: Tian Zheng (Engrg-Hardware 1) * Fix CI errors Signed-off-by: Tian Zheng (Engrg-Hardware 1) --------- Signed-off-by: Tian Zheng (Engrg-Hardware 1) Co-authored-by: Kirthi Shankar Sivamani --- .../paddle/mnist/test_single_gpu_mnist.py | 8 +- tests/paddle/dist_launcher.py | 140 ++++++++++ tests/paddle/parallel_tests/amax_reduction.py | 87 ++++++ tests/paddle/parallel_tests/group_sharding.py | 187 +++++++++++++ .../parallel_tests/layernorm_linear_tp.py | 119 ++++++++ .../paddle/parallel_tests/layernorm_mlp_tp.py | 125 +++++++++ tests/paddle/parallel_tests/linear_pp.py | 192 +++++++++++++ tests/paddle/parallel_tests/linear_tp.py | 180 ++++++++++++ tests/paddle/parallel_tests/transformer_tp.py | 151 ++++++++++ 
tests/paddle/test_layers.py | 10 +- tests/paddle/test_operators.py | 8 +- tests/paddle/test_parallel.py | 89 ++++++ tests/paddle/utils.py | 18 ++ transformer_engine/paddle/constants.py | 4 + transformer_engine/paddle/distributed.py | 100 +++++++ transformer_engine/paddle/fp8.py | 92 +++++-- transformer_engine/paddle/fp8_buffer.py | 257 ++++++++++++++++++ transformer_engine/paddle/layer/attention.py | 106 +++++--- transformer_engine/paddle/layer/base.py | 78 +++++- transformer_engine/paddle/layer/layernorm.py | 2 +- .../paddle/layer/layernorm_linear.py | 109 ++++++-- .../paddle/layer/layernorm_mlp.py | 153 +++++++++-- transformer_engine/paddle/layer/linear.py | 145 ++++++++-- .../paddle/layer/transformer.py | 28 +- 24 files changed, 2248 insertions(+), 140 deletions(-) create mode 100644 tests/paddle/dist_launcher.py create mode 100644 tests/paddle/parallel_tests/amax_reduction.py create mode 100644 tests/paddle/parallel_tests/group_sharding.py create mode 100644 tests/paddle/parallel_tests/layernorm_linear_tp.py create mode 100644 tests/paddle/parallel_tests/layernorm_mlp_tp.py create mode 100644 tests/paddle/parallel_tests/linear_pp.py create mode 100644 tests/paddle/parallel_tests/linear_tp.py create mode 100644 tests/paddle/parallel_tests/transformer_tp.py create mode 100644 tests/paddle/test_parallel.py create mode 100644 transformer_engine/paddle/distributed.py create mode 100644 transformer_engine/paddle/fp8_buffer.py diff --git a/examples/paddle/mnist/test_single_gpu_mnist.py b/examples/paddle/mnist/test_single_gpu_mnist.py index dabeb55656..cffd036d95 100644 --- a/examples/paddle/mnist/test_single_gpu_mnist.py +++ b/examples/paddle/mnist/test_single_gpu_mnist.py @@ -57,11 +57,13 @@ def forward(self, x): def train(args, model, train_loader, optimizer, epoch, use_fp8): """Training function.""" model.train() + losses = [] for batch_id, (data, labels) in enumerate(train_loader): with paddle.amp.auto_cast(dtype='bfloat16', level='O2'): # pylint: 
disable=not-context-manager with te.fp8_autocast(enabled=use_fp8): outputs = model(data) loss = F.cross_entropy(outputs, labels) + losses.append(loss.item()) loss.backward() optimizer.step() @@ -74,7 +76,9 @@ def train(args, model, train_loader, optimizer, epoch, use_fp8): f"Loss: {loss.item():.6f}") if args.dry_run: return loss.item() - return loss.item() + avg_loss = sum(losses) / len(losses) + print(f"Train Epoch: {epoch}, Average Loss: {avg_loss}") + return avg_loss def evaluate(model, test_loader, epoch, use_fp8): @@ -226,7 +230,7 @@ def setUpClass(cls): @staticmethod def verify(actual): """Check If loss and accuracy match target""" - desired_traing_loss = 0.5 + desired_traing_loss = 0.1 desired_test_accuracy = 0.98 assert actual[0] < desired_traing_loss assert actual[1] > desired_test_accuracy diff --git a/tests/paddle/dist_launcher.py b/tests/paddle/dist_launcher.py new file mode 100644 index 0000000000..e59b686435 --- /dev/null +++ b/tests/paddle/dist_launcher.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""Helper functions to launch distributed tests""" + +import copy +import os +from pathlib import Path +import subprocess +import time +import unittest + +from paddle import fluid +from paddle.distributed.utils.launch_utils import ( + TrainerProc, + find_free_ports, + get_cluster, + watch_local_trainers, +) + +__all__ = ['TestDistributed'] + + +def get_cluster_from_args(selected_gpus): + """Get node information from selected GPUs""" + cluster_node_ips = '127.0.0.1' + node_ip = '127.0.0.1' + + node_ips = [x.strip() for x in cluster_node_ips.split(',')] + + node_ips.index(node_ip) + + free_ports = None + + free_ports = find_free_ports(len(selected_gpus)) + if free_ports is not None: + free_ports = list(free_ports) + + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append([f"{ip}:{port}" for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) + + +def get_gpus(selected_gpus): + """Get selected GPU string""" + selected_gpus = [x.strip() for x in selected_gpus.split(',')] + return selected_gpus + + +def start_local_trainers( + cluster, + pod, + training_script, + training_script_args, + allocator_strategy="auto_growth", +): + """Launch trainers""" + current_env = copy.copy(os.environ.copy()) + # paddle broadcast ncclUniqueId use socket, and + # proxy maybe make trainers unreachable, so delete them. + # if we set them to "", grpc will log error message "bad uri" + # so just delete them. 
+ current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + procs = [] + for t in pod.trainers: + proc_env = { + "FLAGS_selected_gpus": ",".join([str(g) for g in t.gpus]), + "PADDLE_TRAINER_ID": f"{t.rank}", + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", + "PADDLE_TRAINERS_NUM": f"{cluster.trainers_nranks()}", + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "PYTHONPATH": str(Path(__file__).resolve().parent), + } + + proc_env["FLAGS_allocator_strategy"] = allocator_strategy + if allocator_strategy == "auto_growth": + proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1" + + current_env.update(proc_env) + + print(f"trainer proc env:{current_env}") + + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + cmd = "python -m coverage run --branch -p " + training_script + else: + cmd = "python -u " + training_script + + print(f"start trainer proc:{cmd} env:{proc_env}") + + fn = None + + proc = subprocess.Popen(cmd.split(" ") + training_script_args, env=current_env) # pylint: disable=consider-using-with + + tp = TrainerProc() + tp.proc = proc + tp.rank = t.rank + tp.log_fn = fn + tp.cmd = cmd + + procs.append(tp) + + return procs + + +class TestDistributed(unittest.TestCase): + """Base class for distributed test""" + + @staticmethod + def run_2gpu( + target_file_name, + allocator_strategy="auto_growth", + ): + """Run target file in subprocesses""" + if (not fluid.core.is_compiled_with_cuda() or fluid.core.get_cuda_device_count() == 0): + return + + selected_gpus = get_gpus('0,1') + cluster = None + pod = None + + cluster, pod = get_cluster_from_args(selected_gpus) + + procs = start_local_trainers( + cluster, + pod, + allocator_strategy=allocator_strategy, + training_script=target_file_name, + training_script_args=[], + ) + + while True: + alive = watch_local_trainers(procs, cluster.trainers_endpoints()) + + if not alive: + print(f"Local procs complete, POD info:{pod}") + break + time.sleep(3) diff --git 
a/tests/paddle/parallel_tests/amax_reduction.py b/tests/paddle/parallel_tests/amax_reduction.py new file mode 100644 index 0000000000..931af07657 --- /dev/null +++ b/tests/paddle/parallel_tests/amax_reduction.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Unittest for Linear layer in tensor parallel""" + +import unittest + +import paddle +from paddle.distributed import fleet + +from utils import assert_allclose, set_random_seed +import transformer_engine.paddle as te + + +def assert_allclose_across_ranks(tensor, group=None): + """Assert tensor is identical in all ranks""" + gathered_list = [] + paddle.distributed.all_gather(gathered_list, tensor, group=group) + assert len(gathered_list) > 1 + for gathered_tensor in gathered_list: + assert_allclose(tensor, gathered_tensor) + + +class TestAmaxReduction(unittest.TestCase): + """Tests Amax reduction""" + + def setUp(self): + self.data_parallel_size = 2 + self.init_dist_env() + self.global_dtype = 'bfloat16' + paddle.set_default_dtype(self.global_dtype) + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": 1, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + + def test_amax_reduction(self): + """Tests column parallel linear""" + set_random_seed(1024) + layer1 = te.Linear(16, 16) + layer2 = te.Linear(16, 16) + model = paddle.nn.Sequential(layer1, layer2) + model = fleet.distributed_model(model) + + rank_id = paddle.distributed.get_rank() + set_random_seed(rank_id) + + optimizer = paddle.optimizer.SGD(learning_rate=10.0, parameters=model.parameters()) + optimizer = fleet.distributed_optimizer(optimizer) + + def train_one_step(layer, inp, optimizer): + inp = paddle.to_tensor(inp) + inp.stop_gradient = False + out = layer(inp) + loss = out.mean() + 
loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss + + for _ in range(5): + inp = paddle.uniform([16, 16], self.global_dtype) + with te.fp8_autocast(enabled=True): + train_one_step(model, inp, optimizer) + + assert_allclose_across_ranks(layer1.fp8_meta["scaling_fwd"].amax_history[-1]) + assert_allclose_across_ranks(layer1.fp8_meta["scaling_fwd"].scale) + assert_allclose_across_ranks(layer1.fp8_meta["scaling_fwd"].scale_inv) + assert_allclose_across_ranks(layer2.fp8_meta["scaling_fwd"].amax_history[-1]) + assert_allclose_across_ranks(layer2.fp8_meta["scaling_fwd"].scale) + assert_allclose_across_ranks(layer2.fp8_meta["scaling_fwd"].scale_inv) + assert_allclose_across_ranks(layer1.fp8_meta["scaling_bwd"].amax_history[-1]) + assert_allclose_across_ranks(layer1.fp8_meta["scaling_bwd"].scale) + assert_allclose_across_ranks(layer1.fp8_meta["scaling_bwd"].scale_inv) + assert_allclose_across_ranks(layer2.fp8_meta["scaling_bwd"].amax_history[-1]) + assert_allclose_across_ranks(layer2.fp8_meta["scaling_bwd"].scale) + assert_allclose_across_ranks(layer2.fp8_meta["scaling_bwd"].scale_inv) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/parallel_tests/group_sharding.py b/tests/paddle/parallel_tests/group_sharding.py new file mode 100644 index 0000000000..b8e4fd885d --- /dev/null +++ b/tests/paddle/parallel_tests/group_sharding.py @@ -0,0 +1,187 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""Unittest for group sharding""" + +import unittest + +import paddle +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer import ( + DygraphShardingOptimizer,) + +from utils import assert_allclose, set_random_seed +import transformer_engine.paddle as te + + +class TestGroupSharding(unittest.TestCase): + """Tests group sharding""" + + def setUp(self): + self.set_attr() + self.init_dist_env() + paddle.set_default_dtype(self.global_dtype) + + def set_attr(self): + """Set test configs""" + self.sharding_degree = 2 + self.global_dtype = 'float32' + self.rtol = 1e-5 + self.atol = 1e-5 + self.batch_size = 16 + self.in_channels = 16 + self.out_channels = 32 + self.fp8 = False + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": self.sharding_degree, + } + self.strategy = strategy + fleet.init(is_collective=True, strategy=strategy) + + def _get_model_and_optimizer(self, model, stage): + if stage == 1: + optimizer = DygraphShardingOptimizer( + hcg=fleet.get_hybrid_communicate_group(), + user_defined_strategy=self.strategy, + params=model.parameters(), + inner_optimizer_class=paddle.optimizer.AdamW, + learning_rate=0.01, + ) + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + elif stage in [2, 3]: + optimizer = paddle.optimizer.AdamW(learning_rate=0.01, parameters=model.parameters()) + group = fleet.get_hybrid_communicate_group().get_sharding_parallel_group() + + class ShardingLevel: # pylint: disable=too-few-public-methods, + """Paddle sharding options""" + kStage1 = 'os' + kStage2 = 'os_g' + kStage3 = 'p_g_os' + + level = ShardingLevel.kStage3 if stage == 3 else ShardingLevel.kStage2 + model, optimizer, _ = paddle.distributed.sharding.group_sharded_parallel( + model=model, + optimizer=optimizer, + level=level, + 
group=group, + segment_size=256, + ) + else: + raise ValueError(f"Stage {stage} not supported") + return model, optimizer + + def test_group_sharding_stage1(self): + """Tests group sharding training""" + set_random_seed(1024) + model_te = te.Linear(self.in_channels, self.out_channels) + model_pd = paddle.nn.Linear(self.in_channels, self.out_channels) + model_pd.weight.copy_(model_te.weight.T, True) + model_pd.bias.copy_(model_te.bias, True) + + model_te, optimizer_te = self._get_model_and_optimizer(model_te, stage=1) + model_pd, optimizer_pd = self._get_model_and_optimizer(model_pd, stage=1) + + rank_id = paddle.distributed.get_rank() + paddle.seed(rank_id) + + def train_one_step(model, inp, optimizer): + out = model(inp) + loss = out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.in_channels], self.global_dtype) + with te.fp8_autocast(enabled=False): + loss_te = train_one_step(model_te, inp, optimizer_te) + loss_pd = train_one_step(model_pd, inp, optimizer_pd) + assert_allclose(loss_te, loss_pd, rtol=self.rtol, atol=self.atol) + + assert len(optimizer_te.state_dict()) == 4, \ + "Expect each rank to hold 4 optimizer state entries." 
+ + def test_group_sharding_stage2(self): + """Tests group sharding training""" + set_random_seed(1024) + model_te = te.Linear(self.in_channels, self.out_channels) + model_pd = paddle.nn.Linear(self.in_channels, self.out_channels) + model_pd.weight.copy_(model_te.weight.T, True) + model_pd.bias.copy_(model_te.bias, True) + + model_te, optimizer_te = self._get_model_and_optimizer(model_te, stage=2) + model_pd, optimizer_pd = self._get_model_and_optimizer(model_pd, stage=2) + + rank_id = paddle.distributed.get_rank() + paddle.seed(rank_id) + + def train_one_step(model, inp, optimizer): + out = model(inp) + loss = out.mean() + loss.backward() + # Check gradients are split to different trainers + if rank_id == 0: + assert model.bias.grad is None and model.weight.grad is not None + elif rank_id == 1: + assert model.weight.grad is None and model.bias.grad is not None + optimizer.step() + optimizer.clear_grad() + return loss + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.in_channels], self.global_dtype) + with te.fp8_autocast(enabled=False): + loss_te = train_one_step(model_te, inp, optimizer_te) + loss_pd = train_one_step(model_pd, inp, optimizer_pd) + assert_allclose(loss_te, loss_pd, rtol=self.rtol, atol=self.atol) + + assert len(optimizer_te.state_dict()) == 4, \ + "Expect each rank to hold 4 optimizer state entries." 
+ + def test_group_sharding_stage3(self): + """Tests group sharding training""" + set_random_seed(1024) + model_te = te.Linear(self.in_channels, self.out_channels) + model_pd = paddle.nn.Linear(self.in_channels, self.out_channels) + model_pd.weight.copy_(model_te.weight.T, True) + model_pd.bias.copy_(model_te.bias, True) + + model_te, optimizer_te = self._get_model_and_optimizer(model_te, stage=3) + model_pd, optimizer_pd = self._get_model_and_optimizer(model_pd, stage=3) + + rank_id = paddle.distributed.get_rank() + paddle.seed(rank_id) + + def train_one_step(model, inp, optimizer): + out = model(inp) + loss = out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.in_channels], self.global_dtype) + with te.fp8_autocast(enabled=False): + loss_te = train_one_step(model_te, inp, optimizer_te) + loss_pd = train_one_step(model_pd, inp, optimizer_pd) + assert_allclose(loss_te, loss_pd, rtol=self.rtol, atol=self.atol) + + for name, value in optimizer_te.state_dict().items(): + if name.endswith('w_0_moment1_0'): + assert value.numel() == \ + self.in_channels * self.out_channels // self.sharding_degree, \ + "Expect optimizer state to be sharded across trainers." + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/parallel_tests/layernorm_linear_tp.py b/tests/paddle/parallel_tests/layernorm_linear_tp.py new file mode 100644 index 0000000000..1034fb26fc --- /dev/null +++ b/tests/paddle/parallel_tests/layernorm_linear_tp.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""Unittest for LayerNormLinear layer in tensor parallel""" + +import unittest + +import paddle +from paddle.distributed import fleet +from paddle.distributed.fleet.layers.mpu import mp_ops + +from utils import assert_allclose, assert_shape, set_random_seed +import transformer_engine.paddle as te + + +class TestLayerNormLinearTp(unittest.TestCase): + """Tests LayerNormLinear layer with column/row parallelism in BF16""" + + def setUp(self): + self.set_attr() + self.init_dist_env() + paddle.set_default_dtype(self.global_dtype) + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + self.hcg = fleet.get_hybrid_communicate_group() + self.tp_group = self.hcg.get_model_parallel_group() + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.in_features = 32 + self.out_features = 64 + self.global_dtype = 'bfloat16' + self.rtol = 1e-3 + self.atol = 1e-3 + self.eps = 1e-3 + self.fp8 = False + + def test_column_parallel_layer(self): + """Tests column parallel LayerNormLinear""" + set_random_seed(1024) + layer_te = te.LayerNormLinear( + self.in_features, + self.out_features, + eps=self.eps, + parallel_mode='column', + ) + layer_pd = te.LayerNormLinear( + self.in_features, + self.out_features, + eps=self.eps, + backend='paddle', + ) + # Get total weight + total_weight = [] + partial_weight = layer_te.weight.clone().detach() + paddle.distributed.all_gather(total_weight, partial_weight, group=self.tp_group) + total_weight = paddle.concat(total_weight, axis=0) + layer_pd.weight.copy_(total_weight.T, True) + + assert_shape(layer_te.weight, + [self.out_features // self.model_parallel_size, self.in_features]) + assert_shape(layer_te.bias, [self.out_features // self.model_parallel_size]) + + optimizer_te = 
paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_te.parameters()) + optimizer_pd = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_pd.parameters()) + + layer_te = fleet.distributed_model(layer_te) + optimizer_te = fleet.distributed_optimizer(optimizer_te) + + def train_one_step(layer, inp, optimizer, gather=False): + inp = paddle.to_tensor(inp) + inp.stop_gradient = False + out = layer(inp) + if gather: + total_out = mp_ops._c_concat(out, group=self.tp_group) + else: + total_out = out + loss = total_out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss, inp.grad + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.in_features], self.global_dtype) + with te.fp8_autocast(enabled=self.fp8): + loss_tp, grad_input = train_one_step(layer_te, inp, optimizer_te, gather=True) + loss_ref, grad_input_ref = train_one_step(layer_pd, inp, optimizer_pd) + assert_allclose(loss_tp, loss_ref, rtol=self.rtol, atol=self.atol) + assert_allclose(grad_input, grad_input_ref, rtol=self.rtol, atol=self.atol) + + +class TestLayerNormLinearTpFp8(TestLayerNormLinearTp): + """Tests LayernormLinear layer with column/row parallelism in FP8""" + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.in_features = 32 + self.out_features = 64 + self.global_dtype = 'bfloat16' + self.rtol = 1e-2 + self.atol = 1e-2 + self.eps = 1e-3 + self.fp8 = True + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/parallel_tests/layernorm_mlp_tp.py b/tests/paddle/parallel_tests/layernorm_mlp_tp.py new file mode 100644 index 0000000000..f579f5f371 --- /dev/null +++ b/tests/paddle/parallel_tests/layernorm_mlp_tp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""Unittest for LayerNormMLP layer in tensor parallel""" + +import unittest + +import paddle +from paddle.distributed import fleet + +from utils import assert_allclose, assert_shape, set_random_seed +import transformer_engine.paddle as te + + +class TestLayerNormMLPTp(unittest.TestCase): + """Tests LayerNormMLP layer with model parallel in BF16""" + + def setUp(self): + self.set_attr() + self.init_dist_env() + paddle.set_default_dtype(self.global_dtype) + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + self.hcg = fleet.get_hybrid_communicate_group() + self.tp_group = self.hcg.get_model_parallel_group() + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.hidden_size = 32 + self.ffn_hidden_size = 64 + self.global_dtype = 'bfloat16' + self.rtol = 1e-3 + self.atol = 1e-3 + self.eps = 1e-3 + self.fp8 = False + + def test_parallel_layer(self): + """Tests parallel LayerNormMLP""" + set_random_seed(1024) + layer_te = te.LayerNormMLP( + hidden_size=self.hidden_size, + ffn_hidden_size=self.ffn_hidden_size, + eps=self.eps, + set_parallel_mode=True, + ) + layer_pd = te.LayerNormMLP( + hidden_size=self.hidden_size, + ffn_hidden_size=self.ffn_hidden_size, + eps=self.eps, + set_parallel_mode=False, + backend='paddle', + ) + + def _get_total_weight(local_weight, tp_group, axis): + total_weight = [] + partial_weight = local_weight.clone().detach() + paddle.distributed.all_gather(total_weight, partial_weight, group=tp_group) + total_weight = paddle.concat(total_weight, axis=axis) + return total_weight + + # Get total weight + total_fc1_weight = _get_total_weight(layer_te.fc1_weight, tp_group=self.tp_group, axis=0) + total_fc2_weight = _get_total_weight(layer_te.fc2_weight, tp_group=self.tp_group, axis=1) + 
layer_pd.fc1_weight.copy_(total_fc1_weight.T, True) + layer_pd.fc2_weight.copy_(total_fc2_weight.T, True) + + assert_shape(layer_te.fc1_weight, + [self.ffn_hidden_size // self.model_parallel_size, self.hidden_size]) + assert_shape(layer_te.fc1_bias, [self.ffn_hidden_size // self.model_parallel_size]) + assert_shape(layer_te.fc2_weight, + [self.hidden_size, self.ffn_hidden_size // self.model_parallel_size]) + assert_shape(layer_te.fc2_bias, [self.hidden_size]) + + optimizer_te = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_te.parameters()) + optimizer_pd = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_pd.parameters()) + + layer_te = fleet.distributed_model(layer_te) + optimizer_te = fleet.distributed_optimizer(optimizer_te) + + def train_one_step(layer, inp, optimizer): + inp = paddle.to_tensor(inp) + inp.stop_gradient = False + out = layer(inp) + loss = out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss, inp.grad + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.hidden_size], self.global_dtype) + with te.fp8_autocast(enabled=self.fp8): + loss_tp, grad_input = train_one_step(layer_te, inp, optimizer_te) + loss_ref, grad_input_ref = train_one_step(layer_pd, inp, optimizer_pd) + assert_allclose(loss_tp, loss_ref, rtol=self.rtol, atol=self.atol) + assert_allclose(grad_input, grad_input_ref, rtol=self.rtol, atol=self.atol) + + +class TestLayerNormMLPTpFp8(TestLayerNormMLPTp): + """Tests LayerNormMLP layer with tensor parallelism in FP8""" + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.hidden_size = 32 + self.ffn_hidden_size = 64 + self.global_dtype = 'bfloat16' + self.rtol = 1e-2 + self.atol = 1e-2 + self.eps = 1e-3 + self.fp8 = True + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/parallel_tests/linear_pp.py b/tests/paddle/parallel_tests/linear_pp.py new file mode 100644 index 0000000000..994e15ba7d --- /dev/null +++ 
b/tests/paddle/parallel_tests/linear_pp.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Unittest for Linear layer in pipeline parallel""" + +import unittest + +import numpy as np + +import paddle +from paddle.distributed import fleet + +from paddle.distributed.fleet.meta_parallel import ( + LayerDesc, + PipelineLayer, +) + +from utils import assert_allclose, set_random_seed +import transformer_engine.paddle as te + + +class TEPipelineModel(PipelineLayer): + """Model for pipeline parallel test""" + + def __init__(self, + in_features, + hidden_features, + weight_attrs, + use_te=True, + use_fp8=False, + **kwargs): + self.in_features = in_features + self.hidden_features = hidden_features + self.fp8 = use_fp8 + hcg = fleet.get_hybrid_communicate_group() + self.dp_group = hcg.get_data_parallel_group() + + Linear = te.Linear if use_te else paddle.nn.Linear + model_desc = [ + LayerDesc(Linear, self.in_features, self.hidden_features, weight_attr=weight_attrs[0]), + LayerDesc(Linear, self.hidden_features, self.in_features, weight_attr=weight_attrs[1]), + ] + super().__init__(layers=model_desc, loss_fn=paddle.nn.CrossEntropyLoss(), **kwargs) + + def forward(self, *args, **kwargs): + with te.fp8_autocast(enabled=self.fp8, fp8_group=self.dp_group): + return super().forward(*args, **kwargs) + + +class StandaloneModel(paddle.nn.Layer): + """Model for pipeline parallel test""" + + def __init__(self, in_features, hidden_features, weight_attrs): + super().__init__() + self.in_features = in_features + self.hidden_features = hidden_features + Linear = paddle.nn.Linear + self.layer = paddle.nn.Sequential( + Linear(self.in_features, self.hidden_features, weight_attr=weight_attrs[0]), + Linear(self.hidden_features, self.in_features, weight_attr=weight_attrs[1]), + ) + self.loss = paddle.nn.CrossEntropyLoss() + + def forward(self, inp): + out = self.layer(inp[0]) + loss = self.loss(out, 
inp[1]) + return loss + + +class TestLinearPipelineParallel(unittest.TestCase): + """Tests Linear layer with pipeline parallel""" + + def setUp(self): + self.set_attr() + self.init_dist_env() + paddle.set_default_dtype(self.global_dtype) + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 1, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": self.batch_size // self.micro_batch_size, + "micro_batch_size": self.micro_batch_size, + } + fleet.init(is_collective=True, strategy=strategy) + self.rank = fleet.worker_index() + self.hcg = fleet.get_hybrid_communicate_group() + + def set_attr(self): + """Set test configs""" + self.batch_size = 32 + self.micro_batch_size = 16 + self.in_features = 32 + self.hidden_features = 64 + self.global_dtype = 'float32' + self.rtol = 1e-5 + self.atol = 1e-5 + self.iter = 10 + self.fp8 = False + + def test_pipeline_train(self): + """Test pipeline parallel training""" + set_random_seed(1024) + + weight1_np = np.random.normal(size=[self.in_features, self.hidden_features]) + weight2_np = np.random.normal(size=[self.hidden_features, self.in_features]) + weight_attrs = [ + paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(weight1_np)), + paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(weight2_np)), + ] + weight_attrs_transposed = [ + paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(weight1_np.T)), + paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(weight2_np.T)), + ] + + pipe_model = TEPipelineModel( + self.in_features, + self.hidden_features, + weight_attrs_transposed, + use_te=True, + use_fp8=self.fp8, + seg_method="layer:Linear", + num_stages=self.pipeline_parallel_size, + ) + + # Check if model is split across ranks as expected + for name, sublayer in pipe_model.named_sublayers(): + if name in 
('_loss_fn', 'shared_layers'): + continue + if self.rank == 0: + assert tuple(sublayer.weight.shape) == weight1_np.T.shape, \ + f"Shape does not match, expect: {weight1_np.T.shape}, " \ + f"actual: {tuple(sublayer.weight.shape)}" + elif self.rank == 1: + assert tuple(sublayer.weight.shape) == weight2_np.T.shape, \ + f"Shape does not match, expect: {weight2_np.T.shape}, " \ + f"actual: {tuple(sublayer.weight.shape)}" + + standalone_model = StandaloneModel( + self.in_features, + self.hidden_features, + weight_attrs, + ) + + optimizer_te = paddle.optimizer.SGD(learning_rate=0.1, parameters=pipe_model.parameters()) + optimizer_pd = paddle.optimizer.SGD(learning_rate=0.1, + parameters=standalone_model.parameters()) + + pipe_model = fleet.distributed_model(pipe_model) + optimizer_te = fleet.distributed_optimizer(optimizer_te) + + def train_one_step(layer, inp, optimizer): + loss = layer(inp) + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss + + for i in range(self.iter): + inp = paddle.to_tensor(np.random.normal(size=[self.batch_size, self.in_features]), + dtype=self.global_dtype) + label = paddle.to_tensor(np.random.randint(self.in_features, size=[self.batch_size, 1])) + loss_te = pipe_model.train_batch([inp, label], optimizer_te) + loss_pd = train_one_step(standalone_model, [inp, label], optimizer_pd) + print(f"Iter: {i}, loss_te: {loss_te.item()}, loss_pd: {loss_pd.item()}") + assert_allclose(loss_te, loss_pd, rtol=self.rtol, atol=self.atol) + + +class TestLinearPipelineParallelFP8(TestLinearPipelineParallel): + """Tests Linear layer with column/row parallelism in FP8""" + + def set_attr(self): + """Set test configs""" + self.batch_size = 32 + self.micro_batch_size = 16 + self.in_features = 32 + self.hidden_features = 64 + self.global_dtype = 'float32' + self.rtol = 5e-2 + self.atol = 5e-2 + self.iter = 10 + self.fp8 = True + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/parallel_tests/linear_tp.py 
b/tests/paddle/parallel_tests/linear_tp.py new file mode 100644 index 0000000000..fe0aeddccd --- /dev/null +++ b/tests/paddle/parallel_tests/linear_tp.py @@ -0,0 +1,180 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Unittest for Linear layer in tensor parallel""" + +import unittest + +import paddle +from paddle.distributed import fleet +from paddle.distributed.fleet.layers.mpu import mp_ops + +from utils import assert_allclose, assert_shape, set_random_seed +import transformer_engine.paddle as te + + +class TestLinearTp(unittest.TestCase): + """Tests Linear layer with column/row parallelism in BF16""" + + def setUp(self): + self.set_attr() + self.init_dist_env() + paddle.set_default_dtype(self.global_dtype) + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + self.rank = fleet.worker_index() + self.hcg = fleet.get_hybrid_communicate_group() + self.tp_group = self.hcg.get_model_parallel_group() + self.world_size = self.hcg.get_model_parallel_world_size() + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.in_features = 32 + self.out_features = 64 + self.global_dtype = 'bfloat16' + self.rtol = 1e-3 + self.atol = 1e-3 + self.fp8 = False + + def test_column_parallel_layer(self): + """Tests column parallel linear""" + set_random_seed(1024) + layer_te = te.Linear( + self.in_features, + self.out_features, + parallel_mode='column', + ) + layer_pd = te.Linear( + self.in_features, + self.out_features, + backend='paddle', + ) + # Get total weight + total_weight = [] + partial_weight = layer_te.weight.clone().detach() + paddle.distributed.all_gather(total_weight, partial_weight, group=self.tp_group) + total_weight = 
paddle.concat(total_weight, axis=0) + layer_pd.weight.copy_(total_weight.T, True) + + assert_shape(layer_te.weight, + [self.out_features // self.model_parallel_size, self.in_features]) + assert_shape(layer_te.bias, [self.out_features // self.model_parallel_size]) + + optimizer_te = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_te.parameters()) + optimizer_pd = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_pd.parameters()) + + layer_te = fleet.distributed_model(layer_te) + optimizer_te = fleet.distributed_optimizer(optimizer_te) + + def train_one_step(layer, inp, optimizer, gather=False): + inp = paddle.to_tensor(inp) + inp.stop_gradient = False + out = layer(inp) + if gather: + total_out = mp_ops._c_concat(out, group=self.tp_group) + else: + total_out = out + loss = total_out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss, inp.grad + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.in_features], self.global_dtype) + with te.fp8_autocast(enabled=self.fp8): + loss_tp, grad_input = train_one_step(layer_te, inp, optimizer_te, gather=True) + loss_ref, grad_input_ref = train_one_step(layer_pd, inp, optimizer_pd) + assert_allclose(loss_tp, loss_ref, rtol=self.rtol, atol=self.atol) + assert_allclose(grad_input, grad_input_ref, rtol=self.rtol, atol=self.atol) + + def test_row_parallel_layer(self): + """Tests row parallel linear""" + set_random_seed(1024) + layer_te = te.Linear( + self.in_features, + self.out_features, + parallel_mode='row', + ) + layer_pd = te.Linear( + self.in_features, + self.out_features, + backend='paddle', + ) + # Get total weight + total_weight = [] + partial_weight = layer_te.weight.clone().detach() + paddle.distributed.all_gather(total_weight, partial_weight, group=self.tp_group) + total_weight = paddle.concat(total_weight, axis=1) + layer_pd.weight.copy_(total_weight.T, True) + + assert_shape(layer_te.weight, + [self.out_features, self.in_features // 
self.model_parallel_size]) + assert_shape(layer_te.bias, [self.out_features]) + + optimizer_te = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_te.parameters()) + optimizer_pd = paddle.optimizer.SGD(learning_rate=0.001, parameters=layer_pd.parameters()) + + # Note(tizheng): For this test, we cannot wrap model with fleet.distributed_model, + # because it will broadcast inputs across mp group. However, RPL expects splitted + # inputs, which is different on each rank. + + def train_one_step(layer, inp, optimizer, split=False): + inp = paddle.to_tensor(inp, stop_gradient=True) + if split: + # TODO(tizheng): Why not working? + # issue: https://github.com/PaddlePaddle/Paddle/issues/55565 + # input_parallel = mp_ops._c_split(inp, group=layer.tp_group) + split_size = inp.shape[1] // self.world_size + input_parallel = inp[:, split_size * self.rank:split_size * (self.rank + 1)] + else: + input_parallel = inp + input_parallel.stop_gradient = False + out = layer(input_parallel) + loss = out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + if split: + grad_input = [] + paddle.distributed.all_gather(grad_input, input_parallel.grad, group=self.tp_group) + grad_input = paddle.concat(grad_input, axis=1) + else: + grad_input = input_parallel.grad + return loss, grad_input + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.in_features], self.global_dtype) + with te.fp8_autocast(enabled=self.fp8): + loss_tp, grad_input = train_one_step(layer_te, inp, optimizer_te, split=True) + loss_ref, grad_input_ref = train_one_step(layer_pd, inp, optimizer_pd) + assert_allclose(loss_tp, loss_ref, rtol=self.rtol, atol=self.atol) + assert_allclose(grad_input, grad_input_ref, rtol=self.rtol, atol=self.atol) + + +class TestLinearTpFP8(TestLinearTp): + """Tests Linear layer with column/row parallelism in FP8""" + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.in_features = 32 + self.out_features = 64 + self.global_dtype = 
'bfloat16' + self.rtol = 1e-2 + self.atol = 1e-2 + self.fp8 = True + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/parallel_tests/transformer_tp.py b/tests/paddle/parallel_tests/transformer_tp.py new file mode 100644 index 0000000000..69fef08d56 --- /dev/null +++ b/tests/paddle/parallel_tests/transformer_tp.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Unittest for Transformer layer in tensor parallel""" + +import unittest + +import paddle +from paddle.distributed import fleet + +from utils import assert_allclose, set_random_seed +import transformer_engine.paddle as te + + +class TestTransformerTp(unittest.TestCase): + """Tests Transformer layer with model parallel in BF16""" + + def setUp(self): + self.set_attr() + self.init_dist_env() + paddle.set_default_dtype(self.global_dtype) + + def init_dist_env(self): + """Init Paddle Fleet environment""" + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + self.hcg = fleet.get_hybrid_communicate_group() + self.tp_group = self.hcg.get_model_parallel_group() + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.hidden_size = 1024 + self.num_heads = 16 + self.ffn_hidden_size = 4096 + self.q_seqlen = 128 + self.kv_seqlen = 128 + self.mask_type = 'padding' + self.layer_type = 'encoder' + self.global_dtype = 'bfloat16' + self.rtol = 5e-2 + self.atol = 5e-2 + self.eps = 1e-3 + self.fp8 = False + + def test_parallel_layer(self): + """Tests parallel Transformer""" + set_random_seed(1024) + common_args = [ + self.hidden_size, + self.ffn_hidden_size, + self.num_heads, + ] + common_kwargs = { + 'layernorm_epsilon': self.eps, + 'hidden_dropout': 0.0, + 'attention_dropout': 0.0, + 
'self_attn_mask_type': self.mask_type, + 'layer_type': self.layer_type, + } + layer_tp = te.TransformerLayer(*common_args, **common_kwargs, set_parallel_mode=True) + layer_single = te.TransformerLayer(*common_args, **common_kwargs, set_parallel_mode=False) + + def _get_total_weight(local_weight, tp_group, axis): + total_weight = [] + partial_weight = local_weight.clone().detach() + paddle.distributed.all_gather(total_weight, partial_weight, group=tp_group) + total_weight = paddle.concat(total_weight, axis=axis) + return total_weight + + def _get_weight(obj, weight_names): + for name in weight_names: + obj = getattr(obj, name) + return obj + + def copy_weight(layer_src, layer_dst, partition_mode, weight_names): + weight_src = _get_weight(layer_src, weight_names) + weight_dst = _get_weight(layer_dst, weight_names) + if partition_mode is None: + total_weight = weight_src + elif partition_mode == 'column': + total_weight = _get_total_weight(weight_src, tp_group=self.tp_group, axis=0) + elif partition_mode == 'row': + total_weight = _get_total_weight(weight_src, tp_group=self.tp_group, axis=1) + else: + raise ValueError(f"Partition Mode {partition_mode} is not supported.") + assert weight_dst.shape == total_weight.shape, \ + f"Shapes of src:{total_weight.shape} and dst:{weight_dst.shape} do not match." 
+ weight_dst.copy_(total_weight, True) + + copy_weight(layer_tp, layer_single, None, ['self_attention', 'layernorm_qkv', 'ln_weight']) + copy_weight(layer_tp, layer_single, 'column', ['self_attention', 'layernorm_qkv', 'weight']) + copy_weight(layer_tp, layer_single, 'row', ['self_attention', 'proj', 'weight']) + copy_weight(layer_tp, layer_single, None, ['layernorm_mlp', 'ln_weight']) + copy_weight(layer_tp, layer_single, 'column', ['layernorm_mlp', 'fc1_weight']) + copy_weight(layer_tp, layer_single, 'row', ['layernorm_mlp', 'fc2_weight']) + + optimizer_tp = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer_tp.parameters()) + optimizer_single = paddle.optimizer.SGD(learning_rate=0.1, + parameters=layer_single.parameters()) + + layer_tp = fleet.distributed_model(layer_tp) + optimizer_tp = fleet.distributed_optimizer(optimizer_tp) + + def train_one_step(layer, inp_list, optimizer, fp8_enabled): + with te.fp8_autocast(enabled=fp8_enabled): + out = layer(*inp_list) + loss = out.mean() + loss.backward() + optimizer.step() + optimizer.clear_grad() + return loss + + for _ in range(5): + inp = paddle.uniform([self.batch_size, self.q_seqlen, self.hidden_size], + self.global_dtype) + mask = paddle.zeros(shape=(self.batch_size, 1, self.q_seqlen, self.kv_seqlen), + dtype='bool') + loss_tp = train_one_step(layer_tp, [inp, mask], optimizer_tp, self.fp8) + loss_single = train_one_step(layer_single, [inp, mask], optimizer_single, self.fp8) + assert_allclose(loss_tp, loss_single, rtol=self.rtol, atol=self.atol) + + +class TestTransformerTpFp8(TestTransformerTp): + """Tests Transformer layer with tensor parallelism in FP8""" + + def set_attr(self): + """Set test configs""" + self.batch_size = 16 + self.hidden_size = 1024 + self.num_heads = 16 + self.ffn_hidden_size = 4096 + self.q_seqlen = 128 + self.kv_seqlen = 128 + self.mask_type = 'padding' + self.layer_type = 'encoder' + self.global_dtype = 'bfloat16' + self.rtol = 5e-2 + self.atol = 5e-2 + self.eps = 1e-3 + self.fp8 = 
True + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/test_layers.py b/tests/paddle/test_layers.py index 171b9233e7..bb93458230 100644 --- a/tests/paddle/test_layers.py +++ b/tests/paddle/test_layers.py @@ -98,8 +98,8 @@ def test_linear_bf16(bs, in_features, out_features, has_bias, no_dbias, no_dgrad """ Test BF16 Linear """ - rtol = 1e-2 - atol = 1e-2 + rtol = 5e-2 + atol = 5e-2 input_tensor = paddle.uniform(shape=(bs, in_features), dtype=activation_dtype) input_tensor.stop_gradient = no_dgrad @@ -258,8 +258,8 @@ def test_layernorm_linear_bf16(bs, in_features, out_features, has_bias, no_dbias Test BF16 LayerNormLinear Layer """ paddle.set_default_dtype(activation_dtype) - rtol = 1e-2 - atol = 1e-2 + rtol = 5e-2 + atol = 5e-2 input_tensor = paddle.uniform(shape=(bs, in_features), dtype=activation_dtype) input_tensor.stop_gradient = no_dgrad @@ -905,7 +905,7 @@ def test_transformer_decoder_layer(bs, hidden_size, num_heads, ffn_hidden_size, """ paddle.set_default_dtype(math_dtype) rtol = 5e-2 - atol = 5e-2 + atol = 6e-2 eps = 1e-3 encoder_input = paddle.uniform(shape=(bs, q_seqlen, hidden_size), dtype=math_dtype) diff --git a/tests/paddle/test_operators.py b/tests/paddle/test_operators.py index 662978086a..241f96214b 100644 --- a/tests/paddle/test_operators.py +++ b/tests/paddle/test_operators.py @@ -728,8 +728,8 @@ def _get_fused_attention_out(self): return out, q_grad, k_grad, v_grad - @pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), - reason="cuDNN fMHA requires Ampere+ GPU") + @pytest.mark.skipif(paddle.device.cuda.get_device_capability() not in ((8, 0), (9, 0)), + reason="cuDNN fMHA requires Ampere and Hopper GPU") @pytest.mark.parametrize('b, s, h, d', SELF_ATTN_CASES) @pytest.mark.parametrize('dtype', ['float16', 'bfloat16']) @pytest.mark.parametrize('is_causal_masking', [True, False]) @@ -745,8 +745,8 @@ def test_self_attn_forward_backward(self, b, s, h, d, dtype, is_causal_masking): 
assert_allclose(k_grad_ref, k_grad, rtol=1e-3, atol=1e-2) assert_allclose(v_grad_ref, v_grad, rtol=1e-3, atol=1e-2) - @pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), - reason="cuDNN fMHA requires Ampere+ GPU") + @pytest.mark.skipif(paddle.device.cuda.get_device_capability() not in ((8, 0), (9, 0)), + reason="cuDNN fMHA requires Ampere and Hopper GPU") @pytest.mark.parametrize('b, s_q, s_kv, h, d', CROSS_ATTN_CASES) @pytest.mark.parametrize('dtype', ['float16', 'bfloat16']) def test_cross_attn_forward_backward(self, b, s_q, s_kv, h, d, dtype): diff --git a/tests/paddle/test_parallel.py b/tests/paddle/test_parallel.py new file mode 100644 index 0000000000..d6e02747d1 --- /dev/null +++ b/tests/paddle/test_parallel.py @@ -0,0 +1,89 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Test TE Paddle Parallel""" + +from pathlib import Path +import unittest + +from dist_launcher import TestDistributed +from utils import is_devices_enough + +from transformer_engine.paddle.fp8 import is_fp8_available + +test_root = Path(__file__).resolve().parent +gpu_has_fp8, reason = is_fp8_available() + + +class TestParallelLinear(TestDistributed): + """Test Linear in Parallel mode""" + + @unittest.skipIf(not is_devices_enough(2), "TestParallelLinear needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_linear_tp(self): + """Tests linear with tensor parallel in BF16""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'linear_tp.py')) + + +class TestParallelLayerNormLinear(TestDistributed): + """Test LayerNormLinear in Parallel mode""" + + @unittest.skipIf(not is_devices_enough(2), "TestParallelLayerNormLinear needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_layernorm_linear_tp(self): + """Tests layernorm_linear with tensor parallel in BF16""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'layernorm_linear_tp.py')) + + +class 
TestParallelLayerNormMLP(TestDistributed): + """Test LayerNormMLP in Parallel mode""" + + @unittest.skipIf(not is_devices_enough(2), "TestParallelLayerNormMLP needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_layernorm_mlp_tp(self): + """Tests layernorm_mlp with tensor parallel in BF16""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'layernorm_mlp_tp.py')) + + +class TestAmaxReduction(TestDistributed): + """Test amax reduction in dp mode""" + + @unittest.skipIf(not is_devices_enough(2), "TestAmaxReduction needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_amax_reduction(self): + """Tests amax reduction""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'amax_reduction.py')) + + +class TestPipelineParallel(TestDistributed): + """Test pipeline parallel""" + + @unittest.skipIf(not is_devices_enough(2), "TestPipelineParallel needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_pipeline_parallel(self): + """Tests pipeline parallel""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'linear_pp.py')) + + +class TestGroupSharding(TestDistributed): + """Test group sharding""" + + @unittest.skipIf(not is_devices_enough(2), "TestGroupSharding needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_group_sharding(self): + """Tests group sharding""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'group_sharding.py')) + + +class TestParallelTransformerLayer(TestDistributed): + """Test Transformer Layer in Parallel mode""" + + @unittest.skipIf(not is_devices_enough(2), "TestParallelTransformerLayer needs 2 GPUs") + @unittest.skipIf(not gpu_has_fp8, reason) + def test_transformer_tp(self): + """Tests Transformer Layer with tensor parallel in BF16""" + self.run_2gpu(str(test_root / 'parallel_tests' / 'transformer_tp.py')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/paddle/utils.py b/tests/paddle/utils.py index 432b39c2e0..5960cccd3d 100644 --- a/tests/paddle/utils.py +++ 
b/tests/paddle/utils.py @@ -34,3 +34,21 @@ def assert_allclose(actual, if isinstance(desired, paddle.Tensor): desired = paddle.cast(desired, 'float32').numpy() np.testing.assert_allclose(actual, desired, rtol, atol, equal_nan, err_msg, verbose) + + +def assert_shape(inp, expected_shape): + """Assert the shape of input tensor equals to expected shape""" + assert inp.shape == expected_shape, f"Expected tensor shape: {expected_shape} != " \ + f"actual tensor shape: {inp.shape}" + + +def is_devices_enough(required): + """If the number of device is enough""" + return paddle.device.cuda.device_count() >= required + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + np.random.seed(seed) + paddle.seed(seed) + paddle.distributed.fleet.meta_parallel.model_parallel_random_seed(seed) diff --git a/transformer_engine/paddle/constants.py b/transformer_engine/paddle/constants.py index eac161ec60..cfecd39564 100644 --- a/transformer_engine/paddle/constants.py +++ b/transformer_engine/paddle/constants.py @@ -46,3 +46,7 @@ class FP8BwdTensors(Enum): AttnTypes = ("self", "cross") LayerTypes = ("encoder", "decoder") + +GemmParallelModes = ("row", "column", None) + +dist_group_type = paddle.distributed.collective.Group diff --git a/transformer_engine/paddle/distributed.py b/transformer_engine/paddle/distributed.py new file mode 100644 index 0000000000..5bf51c9274 --- /dev/null +++ b/transformer_engine/paddle/distributed.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+"""Methods needed for distributed training.""" + +from contextlib import contextmanager +from typing import Optional, Union, Tuple + +import paddle + +import paddle.distributed.fleet.base.topology as tp +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.layers.mpu import mp_ops + +from .constants import dist_group_type + +_weight_split_axis = { + 'transformer_engine': { + 'row': 1, + 'column': 0 + }, + 'paddle': { + 'row': 0, + 'column': 1 + } +} + + +def get_tp_group_and_world_size(tp_group: Union[dist_group_type, None], + enable_tp: bool = True) -> Tuple[Union[dist_group_type, None], int]: + """Get TP group and world size using Fleet API""" + if not (paddle.distributed.is_initialized() and enable_tp): + return None, 1 + model_parallel_group = (tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group() + if tp_group is None else tp_group) + world_size = (tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size() + if tp_group is None else tp_group.nranks) + return model_parallel_group, world_size + + +@contextmanager +def track_rng_state(enable: bool) -> None: + """ + Applies get_rng_state_tracker().rng_state() to the context. + If not enabled, it does nothing. 
+ """ + if enable: + with get_rng_state_tracker().rng_state(): + yield + else: + yield + + +def set_tensor_dist_attr(tensor: paddle.Tensor, is_parallel: bool, axis: int) -> None: + """Set distributed attributes for the input tensor""" + tensor.is_distributed = is_parallel + if is_parallel: + tensor.split_axis = axis + + +def set_weight_tensor_dist_attr(tensor: paddle.Tensor, is_parallel: bool, + parallel_mode: Optional[str], backend: str) -> None: + """Set distributed attributes for the weight tensor""" + if not is_parallel or parallel_mode is None: + return + set_tensor_dist_attr(tensor, is_parallel, axis=_weight_split_axis[backend][parallel_mode]) + + +def allreduce( + input_: paddle.Tensor, + tp_group: Optional[dist_group_type] = None, +) -> paddle.Tensor: + """All-reduce the input tensor across model parallel group.""" + + # Bypass the function if we are using only 1 GPU. + if tp_group is None or tp_group.nranks == 1: + return input_ + + # All-reduce. + output = mp_ops._mp_allreduce( + input_, + group=tp_group, + use_calc_stream=True, + use_model_parallel=True, + ) + + return output + + +def identity( + input_: paddle.Tensor, + tp_group: Optional[dist_group_type] = None, +) -> paddle.Tensor: + """ + Identity when forward. + Allreduce across model parallel group when backward. + """ + output = mp_ops._c_identity(input_, group=tp_group) + + return output diff --git a/transformer_engine/paddle/fp8.py b/transformer_engine/paddle/fp8.py index bcd7ae2b22..576b8d859c 100644 --- a/transformer_engine/paddle/fp8.py +++ b/transformer_engine/paddle/fp8.py @@ -3,9 +3,8 @@ # See LICENSE for license information. 
"""FP8 utilities for TransformerEngine""" -import copy from contextlib import contextmanager -from typing import Tuple, Optional, Dict, Any +from typing import Tuple, Optional, Dict, Any, Union import numpy as np @@ -13,6 +12,9 @@ import transformer_engine_paddle as tex from transformer_engine.common.recipe import DelayedScaling, Format +from .constants import dist_group_type +from .fp8_buffer import FP8MetaFwdBuffer, FP8MetaBwdBuffer + # FP8 support _is_fp8_available = None _reason_for_no_fp8 = "" @@ -50,21 +52,27 @@ class FP8State: """Stores FP8 state""" def __init__(self): - self.fp8_enabled = False - self.fp8_calibration = False - self.fp8_recipe = None + self._fp8_enabled = False + self._fp8_calibration = False + self._fp8_recipe = None + self._fp8_distributed_group = None + self._is_first_fp8_module = False + self._fp8_autocast_counter = 0 + self._fp8_autocast_depth = 0 + self._fp8_fwd_buffer = FP8MetaFwdBuffer() + self._fp8_bwd_buffer = FP8MetaBwdBuffer() def is_fp8_enabled(self) -> bool: """Is FP8 enabled""" - return self.fp8_enabled + return self._fp8_enabled def is_fp8_calibration(self) -> bool: """Is FP8 calibration""" - return self.fp8_calibration + return self._fp8_calibration def get_fp8_recipe(self) -> DelayedScaling: """Return the fp8 recipe""" - return self.fp8_recipe + return self._fp8_recipe @staticmethod def get_default_fp8_recipe() -> DelayedScaling: @@ -73,6 +81,63 @@ def get_default_fp8_recipe() -> DelayedScaling: """ return DelayedScaling() + def get_autocast_id(self) -> int: + """Returns the number of times of entering the `fp8_autocast` context. + as a unique ID for different training steps.""" + return self._fp8_autocast_counter + + def is_first_fp8_module(self): + """Returns `True` only the first time when called multiple + times from within the same `fp8_autocast` context. 
+ """ + tmp = self._is_first_fp8_module + self._is_first_fp8_module = False + return tmp + + def get_fp8_group(self) -> Union[dist_group_type, None]: + """Return the fp8 group for scale/amax comm""" + return self._fp8_distributed_group + + def get_fp8_fwd_buffer(self) -> FP8MetaFwdBuffer: + """Returns global fp8 forward buffer.""" + return self._fp8_fwd_buffer + + def get_fp8_bwd_buffer(self) -> FP8MetaBwdBuffer: + """Returns global fp8 backward buffer.""" + return self._fp8_bwd_buffer + + def enter( + self, + enabled: bool, + calibrating: bool, + fp8_recipe: Optional[DelayedScaling], + fp8_group: Optional[dist_group_type], + ) -> None: + """Called when entering 'fp8_autocast'""" + self.saved_states = (self._fp8_enabled, self._fp8_calibration, self._fp8_recipe, + self._fp8_distributed_group, self._is_first_fp8_module) + + self._fp8_enabled = enabled + self._fp8_calibration = calibrating + self._fp8_recipe = self.get_default_fp8_recipe() if fp8_recipe is None else fp8_recipe + self._fp8_distributed_group = fp8_group + + if self._fp8_autocast_depth == 0: + self._is_first_fp8_module = True + self._fp8_autocast_counter += 1 + self._fp8_autocast_depth += 1 + + def exit(self): + """Called when exiting 'fp8_autocast'""" + # Restore saved states + (self._fp8_enabled, self._fp8_calibration, self._fp8_recipe, self._fp8_distributed_group, + self._is_first_fp8_module) = self.saved_states + + self._fp8_autocast_depth -= 1 + + if self._fp8_autocast_depth == 0: + self._fp8_fwd_buffer.finalize() + _global_fp8_state = FP8State() @@ -87,25 +152,20 @@ def fp8_autocast( enabled: bool = False, calibrating: bool = False, fp8_recipe: Optional[DelayedScaling] = None, + fp8_group: Optional[dist_group_type] = None, ) -> None: """ Context manager for FP8 usage. 
""" - - global _global_fp8_state - saved_fp8_state = copy.deepcopy(_global_fp8_state) try: - _global_fp8_state.fp8_enabled = enabled - _global_fp8_state.fp8_calibration = calibrating - _global_fp8_state.fp8_recipe = FP8State.get_default_fp8_recipe( - ) if fp8_recipe is None else fp8_recipe + _global_fp8_state.enter(enabled, calibrating, fp8_recipe, fp8_group) if enabled: fp8_available, reason_for_no_fp8 = is_fp8_available() assert fp8_available, reason_for_no_fp8 yield finally: - _global_fp8_state = saved_fp8_state + _global_fp8_state.exit() def get_fp8_te_dtype(fp8_recipe: DelayedScaling, fprop_tensor: bool = True) -> tex.DType: diff --git a/transformer_engine/paddle/fp8_buffer.py b/transformer_engine/paddle/fp8_buffer.py new file mode 100644 index 0000000000..76b0c9db59 --- /dev/null +++ b/transformer_engine/paddle/fp8_buffer.py @@ -0,0 +1,257 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""FP8 meta buffer for FP8 amax reduction""" + +from abc import ABC, abstractmethod +from functools import partial +import os +from typing import Dict, Any, List, Union + +import numpy as np +import paddle + +from .constants import dist_group_type + + +class FP8MetaBufferBase(ABC): + """ + A global buffer that holds FP8 meta for reduction across trainers. 
+ """ + + def __init__(self): + self._data = {} + self._buffer_delete_key = None + self._amax_reduce_wait_func = None + self._dp_amax_reduce_interval = None + self._dp_amax_reduce_idx = 0 + + @staticmethod + @abstractmethod + def _get_meta_tensor_key(): + """Returns scaling key in `fp8_meta`.""" + + @staticmethod + @abstractmethod + def _get_buffer_position_key(): + """Returns module position key in `fp8_meta`.""" + + @staticmethod + @abstractmethod + def _get_autocast_key(): + """Returns autocast id key in `fp8_meta`.""" + + def _get_amax_buffer_key(self, fp8_meta: Dict[str, Any]) -> str: + """Return a key in `_data` for the AMAX storage.""" + return f"AMAX_{fp8_meta[self._get_autocast_key()]}" + + def _execute_deletion(self) -> None: + """Delete the key from global amax buffer.""" + if (self._buffer_delete_key is not None and self._buffer_delete_key in self._data): + del self._data[self._buffer_delete_key] + + def _wait_handle_and_split( + self, + contiguous_amax: paddle.Tensor, + chunk_sizes: List[int], + amax_buffer_key: str, + wait_handle: Union[bool, None], + ) -> None: + """Wait for amax reduction to finish and then copy reduced amax to buffer""" + if wait_handle is not None: + wait_handle.wait() + self._data[amax_buffer_key] = list(contiguous_amax.split(chunk_sizes)) + + def _global_amax_reduction( + self, + fp8_meta: Dict[str, Any], + tp_group: dist_group_type, + tp_size: int, + ) -> None: + """Concatenate, reduce, and split amaxes in the global buffer.""" + + def _reduce_tensor_across_group_op_max(tensor, group, sync_op): + if paddle.distributed.is_initialized(): + wait_handle = paddle.distributed.all_reduce( + tensor, + op=paddle.distributed.ReduceOp.MAX, + group=group, + sync_op=sync_op, + ) + return wait_handle + return None + + amax_buffer_key = self._get_amax_buffer_key(fp8_meta) + # Key already deleted. + if amax_buffer_key not in self._data: + return None + + # Reduce AMAX in DP-domain at an interval. 
+ if self._dp_amax_reduce_interval is None: + self._dp_amax_reduce_interval = int(os.getenv("NVTE_DP_AMAX_REDUCE_INTERVAL", "1")) + + tp_amax_reduce = False + if self._dp_amax_reduce_idx == 0: + reduce_group = fp8_meta["fp8_group"] + else: + tp_amax_reduce = True + self._dp_amax_reduce_idx = (self._dp_amax_reduce_idx + 1) % self._dp_amax_reduce_interval + + if tp_amax_reduce: + if tp_size > 1: + reduce_group = tp_group + else: + return None + + chunk_sizes = [x.shape[0] for x in self._data[amax_buffer_key]] + contiguous_amax = paddle.concat(self._data[amax_buffer_key]) + + wait_handle = _reduce_tensor_across_group_op_max( + contiguous_amax, + reduce_group, + not fp8_meta["async_amax_reduction"], + ) + + return partial( + self._wait_handle_and_split, + contiguous_amax, + chunk_sizes, + amax_buffer_key, + wait_handle, + ) + + def add_amax(self, fp8_meta: Dict[str, Any]) -> None: + """Append `amax_history` to global buffer.""" + buffer_key = self._get_amax_buffer_key(fp8_meta) + fp8_meta_tensor_key = self._get_meta_tensor_key() + buffer_position_key = self._get_buffer_position_key() + + if buffer_key not in self._data: + self._data[buffer_key] = [fp8_meta[fp8_meta_tensor_key].amax_history[0]] + else: + self._data[buffer_key].append(fp8_meta[fp8_meta_tensor_key].amax_history[0]) + + if buffer_position_key not in fp8_meta: + fp8_meta[buffer_position_key] = len(self._data[buffer_key]) - 1 + + # Catch incorrect fp8_autocast usage. + assert fp8_meta[buffer_position_key] == len(self._data[buffer_key]) - 1, \ + "Same module is being invoked more than once inside an `fp8_autocast` " \ + "region when using FP8 with amax reduction. This behavior is currently " \ + "unsupported. For more details and correct usage, please see " \ + "https://github.com/NVIDIA/TransformerEngine/pull/93." 
+ + def copy_amax_from_buffer(self, fp8_meta: Dict[str, Any]) -> None: + """Populate current amax with the correct location from buffer.""" + fp8_meta_tensor_key = self._get_meta_tensor_key() + buffer_position_key = self._get_buffer_position_key() + if buffer_position_key not in fp8_meta: + return + + amax_buffer_key = self._get_amax_buffer_key(fp8_meta) + assert amax_buffer_key in self._data, "TE internal error." + + fp8_meta[fp8_meta_tensor_key].amax_history[0] = self._data[amax_buffer_key][ + fp8_meta[buffer_position_key]] + + def set_for_deletion(self, fp8_meta: Dict[str, Any]) -> None: + """Delete this amax key from global buffer during autocast end.""" + if self._get_autocast_key() not in fp8_meta: + return + self._buffer_delete_key = self._get_amax_buffer_key(fp8_meta) + + def get_amax_reduce_handle(self) -> Union[bool, None]: + """Return AMAX reduction wait handle.""" + return self._amax_reduce_handle + + def wait(self) -> None: + """Wait for reduced amax to be available in buffer.""" + if self._amax_reduce_wait_func is not None: + self._amax_reduce_wait_func() # pylint: disable=not-callable + self._amax_reduce_wait_func = None + + def to_numpy(self) -> Dict[str, List[np.array]]: + """Convert to numpy arrays""" + out = {} + for k, v in self._data.items(): + out[k] = [tensor.numpy() for tensor in v] + return out + + def from_numpy(self, buffer: Dict[str, np.array]) -> None: + """Set buffer values from numpy arrays""" + for k, v in buffer.items(): + self._data[k] = [paddle.to_tensor(arr) for arr in v] + + +class FP8MetaFwdBuffer(FP8MetaBufferBase): + """FP8Meta Buffer for forward""" + + @staticmethod + def _get_meta_tensor_key() -> str: + """Returns scaling key in `fp8_meta`.""" + return "scaling_fwd" + + @staticmethod + def _get_buffer_position_key() -> str: + """Returns module position key in `fp8_meta`.""" + return "global_fp8_buffer_pos_fwd" + + @staticmethod + def _get_autocast_key() -> str: + """Returns module position key in `fp8_meta`.""" + return 
"autocast_id_fwd" + + def set_for_amax_reduction( + self, + fp8_meta: Dict[str, Any], + tp_group: dist_group_type, + tp_size: int, + ) -> None: + """Sets up the function to call during autocast exit.""" + self._amax_global_reduce_func = partial( + self._global_amax_reduction, + fp8_meta, + tp_group, + tp_size, + ) + + def finalize(self) -> None: + """ + Called at FP8 autocast end. + Performs AMAX reduction and delete unused buffer entries. + """ + if hasattr(self, '_amax_global_reduce_func') and callable(self._amax_global_reduce_func): + self._amax_reduce_wait_func = self._amax_global_reduce_func() + self._execute_deletion() + + +class FP8MetaBwdBuffer(FP8MetaBufferBase): + """FP8Meta Buffer for backward""" + + @staticmethod + def _get_meta_tensor_key() -> str: + """Returns scaling key in `fp8_meta`.""" + return "scaling_bwd" + + @staticmethod + def _get_buffer_position_key() -> str: + """Returns module position key in `fp8_meta`.""" + return "global_fp8_buffer_pos_bwd" + + @staticmethod + def _get_autocast_key() -> str: + """Returns module position key in `fp8_meta`.""" + return "autocast_id_bwd" + + def finalize( + self, + fp8_meta: Dict[str, Any], + tp_group: dist_group_type, + tp_size: int, + ) -> None: + """ + Called at FP8 autocast end in backward. + Performs AMAX reduction and delete unused buffer entries. 
+ """ + self._amax_reduce_wait_func = self._global_amax_reduction(fp8_meta, tp_group, tp_size) + self._execute_deletion() diff --git a/transformer_engine/paddle/layer/attention.py b/transformer_engine/paddle/layer/attention.py index a5aac3566f..565321baad 100644 --- a/transformer_engine/paddle/layer/attention.py +++ b/transformer_engine/paddle/layer/attention.py @@ -4,27 +4,25 @@ """Attntion API""" import math +import os import warnings from typing import Optional, Tuple, Union import paddle import paddle.nn.functional as F -from transformer_engine.paddle.constants import ( - AttnTypes, - TE_DType, -) -from transformer_engine.paddle.cpp_extensions import ( +from .layernorm_linear import LayerNormLinear +from .linear import Linear +from .softmax import FusedScaleMaskSoftmax +from ..constants import AttnTypes, TE_DType, dist_group_type +from ..cpp_extensions import ( fused_attn_fwd_qkvpacked, fused_attn_bwd_qkvpacked, fused_attn_fwd_kvpacked, fused_attn_bwd_kvpacked, ) -from transformer_engine.paddle.utils import (attention_mask_func, mask_to_cu_seqlens) -from .base import TransformerEngineBaseLayer -from .layernorm_linear import LayerNormLinear -from .linear import Linear -from .softmax import FusedScaleMaskSoftmax +from ..distributed import get_tp_group_and_world_size, track_rng_state +from ..utils import attention_mask_func, divide, mask_to_cu_seqlens class FusedAttnFuncPackedQKV(paddle.autograd.PyLayer): @@ -161,9 +159,20 @@ def __init__(self, self.attn_mask_type = attn_mask_type self.attention_dropout = attention_dropout self.attention_type = attention_type - self.backend = backend self.rng_state = paddle.zeros((2,), dtype='int64') self.rng_state.persistable = True + + self.backend = backend + + arch = paddle.device.cuda.get_device_capability() + self.is_fused_attn_supported = arch in ((8, 0), (9, 0)) + self.enable_fused_attn = int(os.getenv("NVTE_FUSED_ATTN", + "0")) and self.is_fused_attn_supported + + if not self.enable_fused_attn and backend == 
'transformer_engine': + # FMHA is not enabled, falling back to Paddle backend + self.backend = 'paddle' + if self.backend != 'transformer_engine': self.scale_mask_softmax = FusedScaleMaskSoftmax(attn_mask_type, attention_mask_func, @@ -343,7 +352,7 @@ def _pd_forward( return out -class MultiHeadAttention(TransformerEngineBaseLayer): +class MultiHeadAttention(paddle.nn.Layer): """Attention w/ QKV and Proj Gemms Parameters @@ -390,6 +399,8 @@ def __init__( input_layernorm: bool = False, attention_type: str = "self", zero_centered_gamma: bool = False, + set_parallel_mode: bool = False, + tp_group: Optional[dist_group_type] = None, backend: str = 'transformer_engine', ) -> None: super().__init__() @@ -403,11 +414,19 @@ def __init__( assert attention_type in AttnTypes, f"attention_type {attention_type} not supported" + self.tp_group, self.tp_size = get_tp_group_and_world_size(tp_group, + enable_tp=set_parallel_mode) + self.tensor_parallel = self.tp_size > 1 + self.hidden_size_per_attention_head = hidden_size // num_attention_heads self.num_attention_heads = num_attention_heads norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.set_parallel_mode = set_parallel_mode self.backend = backend + self.num_attention_heads_per_partition = divide(self.num_attention_heads, self.tp_size) + qkv_parallel_mode = "column" if set_parallel_mode else None + if self.attention_type == "self": if self.input_layernorm: self.layernorm_qkv = LayerNormLinear( @@ -418,6 +437,8 @@ def __init__( bias_attr=self.bias_attr, return_layernorm_output=return_layernorm_output, zero_centered_gamma=zero_centered_gamma, + parallel_mode=qkv_parallel_mode, + tp_group=self.tp_group, backend=self.backend, ) else: @@ -426,6 +447,8 @@ def __init__( 3 * hidden_size, self.weight_attr, self.bias_attr, + parallel_mode=qkv_parallel_mode, + tp_group=self.tp_group, backend=self.backend, ) @@ -439,6 +462,8 @@ def __init__( bias_attr=self.bias_attr, return_layernorm_output=return_layernorm_output, 
zero_centered_gamma=zero_centered_gamma, + parallel_mode=qkv_parallel_mode, + tp_group=self.tp_group, backend=self.backend, ) else: @@ -447,6 +472,8 @@ def __init__( hidden_size, self.weight_attr, self.bias_attr, + parallel_mode=qkv_parallel_mode, + tp_group=self.tp_group, backend=self.backend, ) self.key_value = Linear( @@ -454,6 +481,8 @@ def __init__( 2 * hidden_size, self.weight_attr, self.bias_attr, + parallel_mode=qkv_parallel_mode, + tp_group=self.tp_group, backend=self.backend, ) @@ -472,6 +501,8 @@ def __init__( hidden_size, self.weight_attr, self.bias_attr, + parallel_mode="row" if set_parallel_mode else None, + tp_group=self.tp_group, backend=self.backend, ) @@ -520,23 +551,26 @@ def forward( mixed_qkv_layer = self.qkv(hidden_states) # [b, s_q, 3 * hidden_size] --> [b, s_q, 3, num_heads, head_size] - mixed_qkv_layer = mixed_qkv_layer.reshape( - shape=[0, 0, 3, self.num_attention_heads, self.hidden_size_per_attention_head]) - - context_layer = self.core_attention( - query_layer=mixed_qkv_layer, - key_value_layer=None, - attention_mask=attention_mask, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - set_zero=set_zero, - ) + mixed_qkv_layer = mixed_qkv_layer.reshape(shape=[ + 0, 0, 3, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head + ]) + + with track_rng_state(enable=self.tensor_parallel): + context_layer = self.core_attention( + query_layer=mixed_qkv_layer, + key_value_layer=None, + attention_mask=attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + set_zero=set_zero, + ) else: # cross attention mixed_kv_layer = self.key_value(encoder_output) # [b, s_kv, 2 * hidden_size] --> [b, s_kv, 2, num_heads, head_size] - mixed_kv_layer = mixed_kv_layer.reshape( - shape=[0, 0, 2, self.num_attention_heads, self.hidden_size_per_attention_head]) + mixed_kv_layer = mixed_kv_layer.reshape(shape=[ + 0, 0, 2, 
self.num_attention_heads_per_partition, self.hidden_size_per_attention_head + ]) if self.input_layernorm: layernorm_query_outputs = self.layernorm_query(hidden_states) @@ -547,16 +581,18 @@ def forward( else: query_layer = self.query_layer(hidden_states) - query_layer = query_layer.reshape( - shape=[0, 0, self.num_attention_heads, self.hidden_size_per_attention_head]) - context_layer = self.core_attention( - query_layer=query_layer, - key_value_layer=mixed_kv_layer, - attention_mask=attention_mask, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - set_zero=set_zero, - ) + query_layer = query_layer.reshape(shape=[ + 0, 0, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head + ]) + with track_rng_state(enable=self.tensor_parallel): + context_layer = self.core_attention( + query_layer=query_layer, + key_value_layer=mixed_kv_layer, + attention_mask=attention_mask, + core_attention_bias_type=core_attention_bias_type, + core_attention_bias=core_attention_bias, + set_zero=set_zero, + ) context_layer = paddle.reshape(context_layer, [0, 0, context_layer.shape[2] * context_layer.shape[3]]) diff --git a/transformer_engine/paddle/layer/base.py b/transformer_engine/paddle/layer/base.py index 5e16fda098..0f5a1af65c 100644 --- a/transformer_engine/paddle/layer/base.py +++ b/transformer_engine/paddle/layer/base.py @@ -5,6 +5,7 @@ from abc import ABC, abstractmethod from contextlib import contextmanager +import os import pickle from typing import Generator, Dict, Tuple, Union, Any @@ -14,7 +15,7 @@ from paddle.fluid import core from paddle.fluid.framework import _dygraph_tracer -from ..constants import FP8BwdTensors +from ..constants import FP8BwdTensors, dist_group_type from ..cpp_extensions import cast_transpose, cast_transpose_bgrad, cast_to_fp8 from ..fp8 import ( FP8State, @@ -24,7 +25,6 @@ get_fp8_te_dtype, ) from ..profile import nvtx_range -from ..utils import get_bias_dtype, cast_if_needed _2X_ACC_FPROP 
= False _2X_ACC_DGRAD = True @@ -61,9 +61,15 @@ def __init__(self) -> None: self.fp8_calibration = False self.fp8_meta = {} self.fp8_meta["fp8_checkpoint"] = False + self.fp8_meta["fp8_group"] = None self.fp8_meta["recipe"] = FP8State.get_default_fp8_recipe() self.fp8_meta["scaling_fwd"] = FP8TensorMeta(is_forward=True) self.fp8_meta["scaling_bwd"] = FP8TensorMeta(is_forward=False) + self.tp_group = None + self.tp_size = 1 + self.fp8_meta["autocast_id_fwd_stack"] = [] + self.fp8_meta["async_amax_reduction"] = bool( + int(os.getenv("NVTE_ASYNC_AMAX_REDUCTION", "0"))) def set_activation_dtype(self, inp: paddle.Tensor) -> None: """Get activation data type for AMP.""" @@ -102,18 +108,20 @@ def set_activation_dtype(self, inp: paddle.Tensor) -> None: # assume FP8 execution. def fp8_init(self, num_gemms: int = 1) -> None: """Initialize fp8 related metadata and tensors during fprop.""" - state = get_global_fp8_state() - self.fp8_enabled = state.is_fp8_enabled() - self.fp8_calibration = state.is_fp8_calibration() + global_fp8_state = get_global_fp8_state() + self.fp8_enabled = global_fp8_state.is_fp8_enabled() + self.fp8_calibration = global_fp8_state.is_fp8_calibration() self.fp8_meta["fp8_checkpoint"] = self.fp8_enabled or self.fp8_calibration if self.fp8_enabled or self.fp8_calibration: # FP8 init has already been run and recipe is the same, don't do anything. 
- if self.fp8_initialized and state.get_fp8_recipe() == self.fp8_meta["recipe"]: + if self.fp8_initialized and global_fp8_state.get_fp8_recipe( + ) == self.fp8_meta["recipe"]: return # Set FP8, recipe, and other FP8 metadata - self.fp8_meta["recipe"] = state.get_fp8_recipe() + self.fp8_meta["recipe"] = global_fp8_state.get_fp8_recipe() + self.fp8_meta["fp8_group"] = global_fp8_state.get_fp8_group() # Set FP8_MAX per tensor according to recipe self.fp8_meta["fp8_max_fwd"] = self.fp8_meta["recipe"].fp8_format.value.max_fwd @@ -136,6 +144,8 @@ def _get_fp8_state(self) -> paddle.Tensor: state = {} state["scaling_fwd"] = self.fp8_meta["scaling_fwd"].to_numpy() state["scaling_bwd"] = self.fp8_meta["scaling_bwd"].to_numpy() + state["global_fp8_fwd_buffer"] = get_global_fp8_state().get_fp8_fwd_buffer().to_numpy() + state["global_fp8_bwd_buffer"] = get_global_fp8_state().get_fp8_bwd_buffer().to_numpy() # Store other pickelable values. extra = {} for k, v in self.fp8_meta.items(): @@ -179,6 +189,12 @@ def _set_fp8_state(self, state: paddle.Tensor) -> None: self.fp8_meta["scaling_fwd"].from_numpy(state["scaling_fwd"]) self.fp8_meta["scaling_bwd"].from_numpy(state["scaling_bwd"]) + # Restore global FP8 buffer states. + global_fp8_fwd_buffer = get_global_fp8_state().get_fp8_fwd_buffer() + global_fp8_bwd_buffer = get_global_fp8_state().get_fp8_bwd_buffer() + global_fp8_fwd_buffer.from_numpy(state["global_fp8_fwd_buffer"]) + global_fp8_bwd_buffer.from_numpy(state["global_fp8_bwd_buffer"]) + # Load extra items. 
self.fp8_meta.update(state["extra_fp8_variables"]) self.fp8_meta["recipe"].amax_history_len = self.fp8_meta["scaling_fwd"].amax_history.shape[ @@ -210,9 +226,22 @@ def prepare_forward( # Previous iteration was grad_enabled if self.fp8_meta.get("update_amax_and_scale_fwd", False): - amax_and_scale_update(self.fp8_meta, True) + global_fp8_fwd_buffer = get_global_fp8_state().get_fp8_fwd_buffer() + global_fp8_fwd_buffer.wait() + if self.fp8_meta["recipe"].reduce_amax: + global_fp8_fwd_buffer.copy_amax_from_buffer(self.fp8_meta) + amax_and_scale_update(self.fp8_meta, True) + global_fp8_fwd_buffer.set_for_deletion(self.fp8_meta) + else: + amax_and_scale_update(self.fp8_meta, True) if self.fp8_enabled and self.training: + # Setup for amax reduction + if self.fp8_meta["recipe"].reduce_amax: + global_fp8_state = get_global_fp8_state() + self.fp8_meta["first_module"] = global_fp8_state.is_first_fp8_module() + self.fp8_meta["autocast_id_fwd"] = global_fp8_state.get_autocast_id() + self.fp8_meta["autocast_id_fwd_stack"].append(self.fp8_meta["autocast_id_fwd"]) self.fp8_meta["update_amax_and_scale_fwd"] = True else: self.fp8_meta["update_amax_and_scale_fwd"] = False @@ -220,18 +249,47 @@ def prepare_forward( with nvtx_range(self.__class__.__name__ + " forward"): yield inp + if self.fp8_enabled and self.training and self.fp8_meta["recipe"].reduce_amax: + global_fp8_state = get_global_fp8_state() + global_fp8_fwd_buffer = global_fp8_state.get_fp8_fwd_buffer() + global_fp8_fwd_buffer.add_amax(self.fp8_meta) + global_fp8_fwd_buffer.set_for_amax_reduction( + self.fp8_meta, + self.tp_group, + self.tp_size, + ) + @staticmethod @contextmanager def prepare_backward(fp8_enabled: bool, fp8_meta: Dict[str, Any], + tp_group: dist_group_type, + tp_size: int, name: str = "") -> Generator[None, None, None]: """Checks and prep for BWD.""" if fp8_enabled: - amax_and_scale_update(fp8_meta, False) + global_fp8_state = get_global_fp8_state() + global_fp8_bwd_buffer = 
global_fp8_state.get_fp8_bwd_buffer() + global_fp8_bwd_buffer.wait() + + if fp8_meta["recipe"].reduce_amax: + global_fp8_bwd_buffer.copy_amax_from_buffer(fp8_meta) + amax_and_scale_update(fp8_meta, False) + global_fp8_bwd_buffer.set_for_deletion(fp8_meta) + + # Get new backward key. + fp8_meta["autocast_id_bwd"] = fp8_meta["autocast_id_fwd_stack"].pop(0) + else: + amax_and_scale_update(fp8_meta, False) with nvtx_range(name + " backward"): yield + if fp8_enabled and fp8_meta["recipe"].reduce_amax: + global_fp8_bwd_buffer.add_amax(fp8_meta) + if fp8_meta["first_module"]: + global_fp8_bwd_buffer.finalize(fp8_meta, tp_group, tp_size) + @staticmethod def grad_output_preprocess( ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]: @@ -258,8 +316,6 @@ def grad_output_preprocess( FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, ) - bias_dtype = get_bias_dtype(ctx.activation_dtype) - bgrad = cast_if_needed(bgrad, bias_dtype) else: if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: grad_output_c, grad_output_t = cast_transpose( diff --git a/transformer_engine/paddle/layer/layernorm.py b/transformer_engine/paddle/layer/layernorm.py index 3f0b8c4a50..89c03ee25c 100644 --- a/transformer_engine/paddle/layer/layernorm.py +++ b/transformer_engine/paddle/layer/layernorm.py @@ -31,7 +31,7 @@ def forward( zero_centered_gamma: bool, ) -> paddle.Tensor: # Make sure input dimensions are compatible - in_features = ln_weight.numel() + in_features = ln_weight.shape[0] assert inp.shape[-1] == in_features, "LayerNorm not possible" inputmat = inp.reshape((-1, in_features)) diff --git a/transformer_engine/paddle/layer/layernorm_linear.py b/transformer_engine/paddle/layer/layernorm_linear.py index 608f02a6ff..285cf4609a 100644 --- a/transformer_engine/paddle/layer/layernorm_linear.py +++ b/transformer_engine/paddle/layer/layernorm_linear.py @@ -4,7 +4,7 @@ """LayerNormLinear API""" import os -from typing import Union, Tuple, Dict, Any +from typing import Union, 
Tuple, Dict, Any, Optional import paddle import paddle.nn.functional as F @@ -21,9 +21,22 @@ from .base import TransformerEngineBaseLayer from .linear import _linear_fwd, _linear_bwd -from ..constants import TE_DType, FP8FwdTensors, FP8BwdTensors +from ..constants import TE_DType, FP8FwdTensors, FP8BwdTensors, GemmParallelModes, dist_group_type +from ..distributed import ( + allreduce, + get_tp_group_and_world_size, + identity, + track_rng_state, + set_tensor_dist_attr, + set_weight_tensor_dist_attr, +) from ..fp8 import get_fp8_te_dtype -from ..utils import cast_if_needed, cast_if_needed_inplace, assert_dim_for_fp8_forward_exec +from ..utils import ( + assert_dim_for_fp8_forward_exec, + cast_if_needed, + cast_if_needed_inplace, + divide, +) __all__ = ["LayerNormLinear", "_layernorm_fwd_fp8_cast", "_layernorm_bwd"] @@ -128,9 +141,13 @@ def forward( fwd_ln_sm_margin: int, bwd_ln_sm_margin: int, zero_centered_gamma: bool, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], + tp_size: int, ) -> Union[Tuple[paddle.Tensor, ...], paddle.Tensor]: # Make sure input dimensions are compatible - in_features = ln_weight.numel() + in_features = ln_weight.shape[0] assert inp.shape[-1] == in_features, "GEMM not possible" inputmat = inp.reshape((-1, in_features)) if fp8_enabled: @@ -169,6 +186,9 @@ def forward( fp8_calibration, fp8_meta, activation_dtype, + parallel_mode, + tensor_parallel, + tp_group, is_grad_enabled, ) @@ -192,6 +212,10 @@ def forward( ctx.return_layernorm_output = return_layernorm_output ctx.bwd_ln_sm_margin = bwd_ln_sm_margin ctx.zero_centered_gamma = zero_centered_gamma + ctx.parallel_mode = parallel_mode + ctx.tensor_parallel = tensor_parallel + ctx.tp_group = tp_group + ctx.tp_size = tp_size ctx.requires_dgrad = not inp.stop_gradient ctx.requires_bgrad = use_bias and not bias.stop_gradient ctx.requires_ln_bgrad = not ln_bias.stop_gradient @@ -208,6 +232,8 @@ def backward( ...]) -> Tuple[Union[paddle.Tensor, 
None], ...]: with TransformerEngineBaseLayer.prepare_backward(ctx.fp8_enabled, ctx.fp8_meta, + ctx.tp_group, + ctx.tp_size, name="_LayerNormLinear"): ( inputmat, @@ -262,6 +288,9 @@ def backward( ctx.fp8_meta, True, # Always compute dgrad to feed into LayerNorm bwd ctx.activation_dtype, + ctx.parallel_mode, + ctx.tensor_parallel, + ctx.tp_group, ) if not ctx.fp8_enabled: @@ -307,6 +336,8 @@ def __init__( bias_attr: Union[paddle.ParamAttr, None, bool] = None, return_layernorm_output: bool = False, zero_centered_gamma: bool = False, + parallel_mode: Optional[str] = None, + tp_group: Union[dist_group_type, None] = None, backend: str = 'transformer_engine', ) -> None: super().__init__() @@ -322,9 +353,23 @@ def __init__( self._bias_attr = bias_attr self._dtype = self._helper.get_default_dtype() + # Set parallel configs + self.tp_group, self.tp_size = get_tp_group_and_world_size(tp_group, + enable_tp=parallel_mode + is not None) + self.tensor_parallel = self.tp_size > 1 + self.parallel_mode = parallel_mode + assert (self.parallel_mode + in GemmParallelModes), f"parallel_mode {parallel_mode} not supported" + + if self.parallel_mode == "column": + self.out_features = divide(self.out_features, self.tp_size) + elif self.parallel_mode == "row": + self.in_features = divide(self.in_features, self.tp_size) + # LayerNorm weights self.ln_weight = self.create_parameter( - shape=[in_features], + shape=[self.in_features], attr=paddle.ParamAttr(initializer=Constant( value=0.0 if self.zero_centered_gamma else 1.0)), dtype=self._dtype, @@ -332,34 +377,48 @@ def __init__( ) self.ln_bias = self.create_parameter( - shape=[in_features], + shape=[self.in_features], attr=paddle.ParamAttr(initializer=Constant(value=0.0)), dtype=self._dtype, is_bias=True, ) - # Linear weights - self.weight = self.create_parameter( - shape=[out_features, in_features] - if self.backend == 'transformer_engine' else [in_features, out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) + 
# Initialize Linear weight parameter + with track_rng_state(enable=self.tensor_parallel): + # TE linear weight is in column major + self.weight = self.create_parameter( + shape=[self.out_features, self.in_features] + if self.backend == 'transformer_engine' else [self.in_features, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + set_weight_tensor_dist_attr(self.weight, self.tensor_parallel, self.parallel_mode, + self.backend) + # Initialize Linear bias parameter self.has_bias = self._bias_attr is not False use_default_bias = self._bias_attr is None or self._bias_attr is True if self.has_bias: self.bias = self.create_parameter( - shape=[out_features], + shape=[self.out_features], attr=self._bias_attr if not use_default_bias else paddle.ParamAttr( initializer=Constant(value=0.0)), dtype=self._dtype, is_bias=True, ) + if parallel_mode == "column": + set_tensor_dist_attr(self.bias, self.tensor_parallel, axis=0) else: self.bias = None + # For RPL, bias has to be added after TP collectives + # So it cannot be fused with the GEMM + if self.parallel_mode == "row" and self.tensor_parallel and self.has_bias: + self.gemm_bias_fused_add = False + else: + self.gemm_bias_fused_add = True + # These many SMs are subtracted from the total SM count when calling forward # and backward LayerNorm C APIs. These envvars can be used to prevent the LN # kernels from using all SMs in the device. 
This is useful for cases such as @@ -385,8 +444,8 @@ def _te_forward( self.ln_weight, self.ln_bias, self.weight, - self.bias, - self.has_bias, + self.bias if self.gemm_bias_fused_add else None, + self.has_bias and self.gemm_bias_fused_add, self.eps, self.fp8_enabled, self.fp8_calibration, @@ -397,10 +456,19 @@ def _te_forward( self.fwd_ln_sm_margin, self.bwd_ln_sm_margin, self.zero_centered_gamma, + self.parallel_mode, + self.tensor_parallel, + self.tp_group, + self.tp_size, ) if self.return_layernorm_output: out, ln_out = out + + if not self.gemm_bias_fused_add: + out = out + cast_if_needed_inplace(self.bias, self.activation_dtype) + + if self.return_layernorm_output: return out, ln_out return out @@ -418,7 +486,12 @@ def _pd_forward( weight=self.ln_weight, bias=self.ln_bias, epsilon=self.eps) - out = F.linear(ln_out, self.weight, self.bias) + if self.parallel_mode == 'column' and self.tensor_parallel: + ln_out = identity(ln_out, self.tp_group) + out = F.linear(ln_out, self.weight, self.bias if self.gemm_bias_fused_add else None) + if self.parallel_mode == 'row' and self.tensor_parallel: + out = allreduce(out, self.tp_group) + out = out + self.bias if self.bias is not None else out if self.return_layernorm_output: return out, ln_out return out diff --git a/transformer_engine/paddle/layer/layernorm_mlp.py b/transformer_engine/paddle/layer/layernorm_mlp.py index 6d725114b0..9b89d05d47 100644 --- a/transformer_engine/paddle/layer/layernorm_mlp.py +++ b/transformer_engine/paddle/layer/layernorm_mlp.py @@ -4,25 +4,38 @@ """LayerNormMLP API""" import os -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional import paddle import paddle.nn.functional as F from paddle.nn.initializer import Constant +from .base import TransformerEngineBaseLayer +from .layernorm_linear import _layernorm_fwd_fp8_cast, _layernorm_bwd +from .linear import _linear_fwd_fp8, _linear_fwd_non_fp8, _linear_bwd_fp8, _linear_bwd_non_fp8 +from ..constants 
import TE_DType, FP8FwdTensors, FP8BwdTensors, dist_group_type from ..cpp_extensions import ( cast_from_fp8, dgelu_cast_transpose_bgrad_fp8, gelu_fp8, transpose, ) - -from .base import TransformerEngineBaseLayer -from .layernorm_linear import _layernorm_fwd_fp8_cast, _layernorm_bwd -from .linear import _linear_fwd_fp8, _linear_fwd_non_fp8, _linear_bwd_fp8, _linear_bwd_non_fp8 -from ..constants import TE_DType, FP8FwdTensors, FP8BwdTensors +from ..distributed import ( + allreduce, + get_tp_group_and_world_size, + identity, + track_rng_state, + set_tensor_dist_attr, + set_weight_tensor_dist_attr, +) from ..fp8 import get_fp8_te_dtype -from ..utils import cast_if_needed, assert_dim_for_fp8_forward_exec, get_paddle_act_func +from ..utils import ( + assert_dim_for_fp8_forward_exec, + cast_if_needed, + cast_if_needed_inplace, + divide, + get_paddle_act_func, +) __all__ = ["LayerNormMLP"] @@ -43,7 +56,11 @@ def _mlp_forward( fp8_calibration: bool, fp8_meta: Dict[str, Any], activation_dtype: paddle.dtype, + activation: str, is_grad_enabled: bool, + set_parallel_mode: bool, + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], ): if fp8_enabled: fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) @@ -56,6 +73,9 @@ def _mlp_forward( use_fc1_bias, fp8_meta, activation_dtype, + 'column' if set_parallel_mode else None, + tensor_parallel, + tp_group, is_grad_enabled, ) @@ -75,6 +95,9 @@ def _mlp_forward( use_fc2_bias, fp8_meta, activation_dtype, + 'row' if set_parallel_mode else None, + tensor_parallel, + tp_group, is_grad_enabled, ) else: @@ -88,7 +111,10 @@ def _mlp_forward( fp8_calibration, fp8_meta, activation_dtype, - activation='gelu', + 'column' if set_parallel_mode else None, + tensor_parallel, + tp_group, + activation=activation, ) fc2_out = _linear_fwd_non_fp8( @@ -101,6 +127,9 @@ def _mlp_forward( fp8_calibration, fp8_meta, activation_dtype, + 'row' if set_parallel_mode else None, + tensor_parallel, + tp_group, ) return ( fc1_out, 
@@ -136,6 +165,9 @@ def _mlp_backward( requires_dgrad: bool, activation_dtype: paddle.dtype, activation: str, + set_parallel_mode: bool, + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], ): ( fc1_dgrad, @@ -179,6 +211,9 @@ def _mlp_backward( True, requires_fc2_wgrad, activation_dtype, + 'row' if set_parallel_mode else None, + tensor_parallel, + tp_group, ) # GELU Bwd @@ -193,7 +228,7 @@ def _mlp_backward( if requires_fc1_bgrad: fc1_bgrad = fc1_bgrad_ - # FC2 Bwd + # FC1 Bwd requires_fc1_wgrad = not fc1_weight.stop_gradient dgelu_no_fp8, fc1_input_no_fp8, fc1_input_t = None, None, None if requires_fc1_wgrad: @@ -231,6 +266,9 @@ def _mlp_backward( requires_dgrad, requires_fc1_wgrad, activation_dtype, + 'column' if set_parallel_mode else None, + tensor_parallel, + tp_group, ) else: dgelu, fc2_wgrad, fc2_bgrad = _linear_bwd_non_fp8( @@ -240,6 +278,9 @@ def _mlp_backward( requires_fc2_bgrad, True, activation_dtype, + 'row' if set_parallel_mode else None, + tensor_parallel, + tp_group, gelu_input=fc1_out, activation=activation, ) @@ -250,6 +291,9 @@ def _mlp_backward( requires_fc1_bgrad, requires_dgrad, activation_dtype, + 'column' if set_parallel_mode else None, + tensor_parallel, + tp_group, ) return ( fc1_dgrad, @@ -286,9 +330,13 @@ def forward( bwd_ln_sm_margin: int, zero_centered_gamma: bool, activation: str, + set_parallel_mode: bool, + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], + tp_size: int, ) -> Union[Tuple[paddle.Tensor, ...], paddle.Tensor]: # Make sure input dimensions are compatible - in_features = ln_weight.numel() + in_features = ln_weight.shape[0] assert inp.shape[-1] == in_features, "GEMM not possible" inputmat = inp.reshape((-1, in_features)) if fp8_enabled: @@ -341,7 +389,11 @@ def forward( fp8_calibration, fp8_meta, activation_dtype, + activation, is_grad_enabled, + set_parallel_mode, + tensor_parallel, + tp_group, ) if is_grad_enabled: @@ -369,6 +421,10 @@ def forward( ctx.return_layernorm_output = 
return_layernorm_output ctx.bwd_ln_sm_margin = bwd_ln_sm_margin ctx.zero_centered_gamma = zero_centered_gamma + ctx.set_parallel_mode = set_parallel_mode + ctx.tensor_parallel = tensor_parallel + ctx.tp_group = tp_group + ctx.tp_size = tp_size ctx.requires_dgrad = not inp.stop_gradient ctx.requires_fc1_bgrad = use_fc1_bias and not fc1_bias.stop_gradient ctx.requires_fc2_bgrad = use_fc2_bias and not fc2_bias.stop_gradient @@ -387,6 +443,8 @@ def backward( ...]) -> Tuple[Union[paddle.Tensor, None], ...]: with TransformerEngineBaseLayer.prepare_backward(ctx.fp8_enabled, ctx.fp8_meta, + ctx.tp_group, + ctx.tp_size, name="_LayerNormMLP"): ( inputmat, @@ -442,6 +500,9 @@ def backward( True, ctx.activation_dtype, ctx.activation, + ctx.set_parallel_mode, + ctx.tensor_parallel, + ctx.tp_group, ) if not ctx.fp8_enabled: # fc2_bias is fused with gemm for non-FP8 path @@ -491,6 +552,8 @@ def __init__( activation: str = "gelu", return_layernorm_output: bool = False, zero_centered_gamma: bool = False, + set_parallel_mode: bool = False, + tp_group: Optional[dist_group_type] = None, backend: str = 'transformer_engine', ) -> None: super().__init__() @@ -507,6 +570,17 @@ def __init__( self._bias_attr = bias_attr self._dtype = self._helper.get_default_dtype() + # Set parallel configs + self.tp_group, self.tp_size = get_tp_group_and_world_size(tp_group, + enable_tp=set_parallel_mode) + self.tensor_parallel = self.tp_size > 1 + self.set_parallel_mode = set_parallel_mode + + if self.set_parallel_mode: + self.size_per_partition = divide(self.ffn_hidden_size, self.tp_size) + else: + self.size_per_partition = self.ffn_hidden_size + # LayerNorm weights self.ln_weight = self.create_parameter( shape=[self.hidden_size], @@ -524,36 +598,47 @@ def __init__( ) # FC1 weights - self.fc1_weight = self.create_parameter( - shape=[self.ffn_hidden_size, self.hidden_size] - if self.backend == 'transformer_engine' else [self.hidden_size, self.ffn_hidden_size], - attr=self._weight_attr, - 
dtype=self._dtype, - is_bias=False, - ) + with track_rng_state(enable=self.tensor_parallel): + self.fc1_weight = self.create_parameter( + shape=[self.size_per_partition, self.hidden_size] if self.backend + == 'transformer_engine' else [self.hidden_size, self.size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + set_weight_tensor_dist_attr(self.fc1_weight, + self.tensor_parallel, + parallel_mode='column', + backend=self.backend) self.has_bias = self._bias_attr is not False - if self._bias_attr is None or self._bias_attr is True: + use_default_bias = self._bias_attr is None or self._bias_attr is True + if use_default_bias: self._bias_attr = paddle.ParamAttr(initializer=Constant(value=0.0)) if self.has_bias: self.fc1_bias = self.create_parameter( - shape=[self.ffn_hidden_size], + shape=[self.size_per_partition], attr=self._bias_attr, dtype=self._dtype, is_bias=True, ) + set_tensor_dist_attr(self.fc1_bias, self.tensor_parallel, axis=0) else: self.fc1_bias = None # FC2 weights self.fc2_weight = self.create_parameter( - shape=[self.hidden_size, self.ffn_hidden_size] - if self.backend == 'transformer_engine' else [self.ffn_hidden_size, self.hidden_size], + shape=[self.hidden_size, self.size_per_partition] if self.backend + == 'transformer_engine' else [self.size_per_partition, self.hidden_size], attr=self._weight_attr, dtype=self._dtype, is_bias=False, ) + set_weight_tensor_dist_attr(self.fc2_weight, + self.tensor_parallel, + parallel_mode='row', + backend=self.backend) if self.has_bias: self.fc2_bias = self.create_parameter( @@ -565,6 +650,13 @@ def __init__( else: self.fc2_bias = None + # For RPL, bias has to be added after TP collectives + # So it cannot be fused with the GEMM + if self.set_parallel_mode and self.tensor_parallel and self.has_bias: + self.gemm_bias_fused_add = False + else: + self.gemm_bias_fused_add = True + # These many SMs are subtracted from the total SM count when calling forward # and backward LayerNorm C APIs. 
These envvars can be used to prevent the LN # kernels from using all SMs in the device. This is useful for cases such as @@ -606,12 +698,20 @@ def _te_forward( self.bwd_ln_sm_margin, self.zero_centered_gamma, self.activation, + self.set_parallel_mode, + self.tensor_parallel, + self.tp_group, + self.tp_size, ) if self.return_layernorm_output: out, ln_out = out - return out, ln_out + if not self.gemm_bias_fused_add: + out = out + cast_if_needed_inplace(self.fc2_bias, self.activation_dtype) + + if self.return_layernorm_output: + return out, ln_out return out def _pd_forward( @@ -628,11 +728,16 @@ def _pd_forward( weight=self.ln_weight, bias=self.ln_bias, epsilon=self.eps) + if self.set_parallel_mode and self.tensor_parallel: + ln_out = identity(ln_out, self.tp_group) fc1_out = F.linear(ln_out, self.fc1_weight, self.fc1_bias) act_func = get_paddle_act_func(self.activation) act_out = act_func(fc1_out) - out = F.linear(act_out, self.fc2_weight, self.fc2_bias) - + out = F.linear(act_out, self.fc2_weight, + self.fc2_bias if self.gemm_bias_fused_add else None) + if self.set_parallel_mode and self.tensor_parallel: + out = allreduce(out, self.tp_group) + out = out + self.fc2_bias if self.fc2_bias is not None else out if self.return_layernorm_output: return out, ln_out return out diff --git a/transformer_engine/paddle/layer/linear.py b/transformer_engine/paddle/layer/linear.py index dc9863e062..ff164067a7 100644 --- a/transformer_engine/paddle/layer/linear.py +++ b/transformer_engine/paddle/layer/linear.py @@ -3,7 +3,7 @@ # See LICENSE for license information. 
"""Linear API""" -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional import paddle import paddle.nn.functional as F @@ -17,13 +17,22 @@ _2X_ACC_WGRAD, ) -from ..fp8 import get_fp8_te_dtype -from ..constants import FP8FwdTensors, FP8BwdTensors +from ..constants import FP8FwdTensors, FP8BwdTensors, GemmParallelModes, dist_group_type from ..cpp_extensions import gemm, fp8_gemm, cast_to_fp8, cast_transpose +from ..distributed import ( + allreduce, + get_tp_group_and_world_size, + identity, + track_rng_state, + set_tensor_dist_attr, + set_weight_tensor_dist_attr, +) +from ..fp8 import get_fp8_te_dtype from ..utils import ( + assert_dim_for_fp8_forward_exec, cast_if_needed, cast_if_needed_inplace, - assert_dim_for_fp8_forward_exec, + divide, get_bias_dtype, ) @@ -39,12 +48,15 @@ def _linear_fwd_fp8( use_bias: bool, fp8_meta: Dict[str, Any], activation_dtype: paddle.dtype, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], is_grad_enabled: bool, ): """FP8 path of Linear Fwd""" fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) bias_dtype = get_bias_dtype(activation_dtype) - bias = cast_if_needed_inplace(bias, bias_dtype) + bias = cast_if_needed(bias, bias_dtype) if is_grad_enabled: weight_fp8, weight_t_fp8 = cast_transpose( @@ -78,6 +90,10 @@ def _linear_fwd_fp8( use_split_accumulator=_2X_ACC_FPROP, ) + # Row Parallel Linear + if parallel_mode == "row" and tensor_parallel: + out = allreduce(out, tp_group) + return out, weight_t_fp8 @@ -91,6 +107,9 @@ def _linear_fwd_non_fp8( fp8_calibration: bool, fp8_meta: Dict[str, Any], activation_dtype: paddle.dtype, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], activation: str = "", ): """Non-FP8 path of Linear Fwd""" @@ -123,6 +142,9 @@ def _linear_fwd_non_fp8( return out, gelu_out out, _, _ = outputs + # Row Parallel Linear + if parallel_mode == "row" and 
tensor_parallel: + out = allreduce(out, tp_group) return out @@ -137,6 +159,9 @@ def _linear_fwd( fp8_calibration: bool, fp8_meta: Dict[str, Any], activation_dtype: paddle.dtype, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], is_grad_enabled: bool, ): if fp8_enabled: @@ -149,6 +174,9 @@ def _linear_fwd( use_bias, fp8_meta, activation_dtype, + parallel_mode, + tensor_parallel, + tp_group, is_grad_enabled, ) else: @@ -162,6 +190,9 @@ def _linear_fwd( fp8_calibration, fp8_meta, activation_dtype, + parallel_mode, + tensor_parallel, + tp_group, ) return ( out, @@ -184,6 +215,9 @@ def _linear_bwd_fp8( requires_dgrad: bool, requires_wgrad: bool, activation_dtype: paddle.dtype, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], ): dgrad, wgrad = None, None fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) @@ -202,6 +236,9 @@ def _linear_bwd_fp8( get_workspace(), use_split_accumulator=_2X_ACC_DGRAD, ) + if parallel_mode == "column" and tensor_parallel: + dgrad = allreduce(dgrad, tp_group) + if requires_wgrad: if not fp8_meta["recipe"].override_linear_precision.wgrad: wgrad = fp8_gemm( @@ -236,6 +273,9 @@ def _linear_bwd_non_fp8( requires_bgrad: bool, requires_dgrad: bool, activation_dtype: paddle.dtype, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], gelu_input: Union[paddle.Tensor, None] = None, activation: str = "", ): @@ -255,6 +295,9 @@ def _linear_bwd_non_fp8( gelu_input=gelu_input, grad=True, ) + if parallel_mode == "column" and tensor_parallel: + dgrad = allreduce(dgrad, tp_group) + if requires_wgrad: wgrad, bgrad, _ = gemm( inputmat, @@ -288,6 +331,9 @@ def _linear_bwd( fp8_meta: Dict[str, Any], requires_dgrad: bool, activation_dtype: paddle.dtype, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], ): dgrad, wgrad, bgrad = None, 
None, None requires_wgrad = not weight.stop_gradient @@ -307,6 +353,9 @@ def _linear_bwd( requires_dgrad, requires_wgrad, activation_dtype, + parallel_mode, + tensor_parallel, + tp_group, ) else: dgrad, wgrad, bgrad = _linear_bwd_non_fp8( @@ -316,6 +365,9 @@ def _linear_bwd( requires_bgrad, requires_dgrad, activation_dtype, + parallel_mode, + tensor_parallel, + tp_group, ) return dgrad, wgrad, bgrad @@ -335,6 +387,10 @@ def forward( fp8_meta: Dict[str, Any], activation_dtype: paddle.dtype, is_grad_enabled: bool, + parallel_mode: Union[str, None], + tensor_parallel: bool, + tp_group: Union[dist_group_type, None], + tp_size: int, ) -> paddle.Tensor: # Make sure input dimensions are compatible in_features = weight.shape[-1] @@ -385,6 +441,9 @@ def forward( fp8_calibration, fp8_meta, activation_dtype, + parallel_mode, + tensor_parallel, + tp_group, is_grad_enabled, ) @@ -402,6 +461,10 @@ def forward( ctx.fp8_meta = fp8_meta ctx.use_bias = use_bias ctx.inp_shape = inp.shape + ctx.parallel_mode = parallel_mode + ctx.tensor_parallel = tensor_parallel + ctx.tp_group = tp_group + ctx.tp_size = tp_size ctx.requires_dgrad = not inp.stop_gradient ctx.requires_bgrad = use_bias and not bias.stop_gradient @@ -411,6 +474,8 @@ def forward( def backward(ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]: with TransformerEngineBaseLayer.prepare_backward(ctx.fp8_enabled, ctx.fp8_meta, + ctx.tp_group, + ctx.tp_size, name="_Linear"): ( inputmat, @@ -444,6 +509,9 @@ def backward(ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None ctx.fp8_meta, ctx.requires_dgrad, ctx.activation_dtype, + ctx.parallel_mode, + ctx.tensor_parallel, + ctx.tp_group, ) if not ctx.fp8_enabled: @@ -474,6 +542,8 @@ def __init__( out_features: int, weight_attr: Union[paddle.ParamAttr, None] = None, bias_attr: Union[paddle.ParamAttr, None, bool] = None, + parallel_mode: Optional[str] = None, + tp_group: Union[dist_group_type, None] = None, backend: str = 'transformer_engine', 
) -> None: super().__init__() @@ -484,28 +554,56 @@ def __init__( self._bias_attr = bias_attr self._dtype = self._helper.get_default_dtype() - # TE linear weight is in column major - self.weight = self.create_parameter( - shape=[out_features, in_features] - if self.backend == 'transformer_engine' else [in_features, out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) + # Set parallel configs + self.tp_group, self.tp_size = get_tp_group_and_world_size(tp_group, + enable_tp=parallel_mode + is not None) + self.tensor_parallel = self.tp_size > 1 + self.parallel_mode = parallel_mode + assert (self.parallel_mode + in GemmParallelModes), f"parallel_mode {parallel_mode} not supported" + + if self.parallel_mode == "column": + self.out_features = divide(self.out_features, self.tp_size) + elif self.parallel_mode == "row": + self.in_features = divide(self.in_features, self.tp_size) + + # Initialize weight parameter + with track_rng_state(enable=self.tensor_parallel): + # TE linear weight is in column major + self.weight = self.create_parameter( + shape=[self.out_features, self.in_features] + if self.backend == 'transformer_engine' else [self.in_features, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + set_weight_tensor_dist_attr(self.weight, self.tensor_parallel, self.parallel_mode, + self.backend) + # Initialize bias parameter self.has_bias = self._bias_attr is not False use_default_bias = self._bias_attr is None or self._bias_attr is True if self.has_bias: self.bias = self.create_parameter( - shape=[out_features], + shape=[self.out_features], attr=self._bias_attr if not use_default_bias else paddle.ParamAttr( initializer=Constant(value=0.0)), dtype=self._dtype, is_bias=True, ) + if parallel_mode == "column": + set_tensor_dist_attr(self.bias, self.tensor_parallel, axis=0) else: self.bias = None + # For RPL, bias has to be added after TP collectives + # So it cannot be fused with the GEMM + if 
self.parallel_mode == "row" and self.tensor_parallel and self.has_bias: + self.gemm_bias_fused_add = False + else: + self.gemm_bias_fused_add = True + def _te_forward( self, inp: paddle.Tensor, @@ -521,15 +619,22 @@ def _te_forward( out = _Linear.apply( self.weight, inp, - self.bias, - self.has_bias, + self.bias if self.gemm_bias_fused_add else None, + self.has_bias and self.gemm_bias_fused_add, self.fp8_enabled, self.fp8_calibration, self.fp8_meta, self.activation_dtype, paddle.is_grad_enabled(), + self.parallel_mode, + self.tensor_parallel, + self.tp_group, + self.tp_size, ) + if not self.gemm_bias_fused_add: + out = out + cast_if_needed_inplace(self.bias, self.activation_dtype) + return out def _pd_forward( @@ -537,7 +642,13 @@ def _pd_forward( inp: paddle.Tensor, ) -> paddle.Tensor: """Calls Paddle OP""" - return F.linear(inp, self.weight, self.bias) + if self.parallel_mode == 'column' and self.tensor_parallel: + inp = identity(inp, self.tp_group) + out = F.linear(inp, self.weight, self.bias if self.gemm_bias_fused_add else None) + if self.parallel_mode == 'row' and self.tensor_parallel: + out = allreduce(out, self.tp_group) + out = out + self.bias if self.bias is not None else out + return out def forward(self, *args, **kwargs): """forward""" diff --git a/transformer_engine/paddle/layer/transformer.py b/transformer_engine/paddle/layer/transformer.py index 6e6afd4ca2..a95b9fcfe1 100644 --- a/transformer_engine/paddle/layer/transformer.py +++ b/transformer_engine/paddle/layer/transformer.py @@ -7,15 +7,11 @@ import paddle -from transformer_engine.paddle.constants import ( - AttnMaskTypes, - LayerTypes, -) -from transformer_engine.paddle.layer import (LayerNormMLP, LayerNorm, MultiHeadAttention) -from .base import TransformerEngineBaseLayer +from . 
import LayerNormMLP, LayerNorm, MultiHeadAttention +from ..constants import AttnMaskTypes, LayerTypes, dist_group_type -class TransformerLayer(TransformerEngineBaseLayer): +class TransformerLayer(paddle.nn.Layer): r""" TransformerLayer is made up of an attention block and a feedforward network (MLP). This standard layer is based on the paper "Attention Is All You Need". @@ -64,6 +60,16 @@ class TransformerLayer(TransformerEngineBaseLayer): it controls the type used to allocate the initial parameters. Useful when the model is trained with lower precision and the original FP32 parameters would not fit in GPU memory. + + Parallelism parameters + ---------------------- + set_parallel_mode : bool, default = `False` + if set to `True`, QKV and FC1 layers are used as Column Parallel + whereas PROJ and FC2 is used as Row Parallel as described + `here `_. + tp_group : ProcessGroup, default = `None` + tensor parallel process group. + """ def __init__(self, @@ -82,6 +88,8 @@ def __init__(self, layer_type: str = "encoder", zero_centered_gamma: bool = False, activation: str = 'gelu', + set_parallel_mode: bool = False, + tp_group: Optional[dist_group_type] = None, backend: str = 'transformer_engine') -> None: super().__init__() @@ -90,6 +98,8 @@ def __init__(self, self.layer_type = layer_type self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm self.self_attn_mask_type = self_attn_mask_type + self.set_parallel_mode = set_parallel_mode + self.tp_group = tp_group assert (self_attn_mask_type in AttnMaskTypes), f"self_attn_mask_type {self_attn_mask_type} not supported" @@ -107,6 +117,8 @@ def __init__(self, "params_dtype": params_dtype, "return_layernorm_output": apply_residual_connection_post_layernorm, "zero_centered_gamma": zero_centered_gamma, + "set_parallel_mode": set_parallel_mode, + "tp_group": tp_group, "backend": backend, } @@ -136,6 +148,8 @@ def __init__(self, activation=activation, 
return_layernorm_output=apply_residual_connection_post_layernorm, zero_centered_gamma=zero_centered_gamma, + set_parallel_mode=set_parallel_mode, + tp_group=tp_group, backend=backend, ) From 112f67f6bbb93d2d3e42fb75c16801815f187e95 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Thu, 21 Sep 2023 00:55:08 +0200 Subject: [PATCH 052/427] [pyTorch] Enable the model to change precision between iterations (#414) * Enable the model to be change precision between iterations Signed-off-by: Przemek Tredak * Add test Signed-off-by: Przemek Tredak * Fix for the test Signed-off-by: Przemek Tredak --------- Signed-off-by: Przemek Tredak Co-authored-by: Kirthi Shankar Sivamani --- tests/pytorch/test_sanity.py | 13 +++++++++++++ transformer_engine/pytorch/module/base.py | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index 21497b417f..65af2f9713 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -788,3 +788,16 @@ def test_gpt_cuda_graph(dtype, bs, fp8_recipe, model, skip_wgrad, zero_centered_ ) _test_sanity_e2e_cuda_graph(block, bs, dtype, config, fp8_recipe, skip_wgrad) + +def test_model_multiple_cast(): + a = torch.zeros((16,16)).cuda() + m = Linear(16,32) + + y = m(a) + assert y.dtype == torch.float32 + + m.half() + a = a.half() + + y2 = m(a) + assert y2.dtype == torch.float16 diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 0352a7ba2b..82d39eeaf0 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -445,8 +445,7 @@ def set_activation_dtype(self, inp: torch.Tensor) -> None: return # All checks after this have already been performed once, thus skip - # We assume that user doesn't change input types across iterations - if hasattr(self, "activation_dtype"): + if hasattr(self, "activation_dtype") and self.activation_dtype == inp.dtype: return dtype = 
inp.dtype From 291cb4fcbe97d8711c3bd4b78afb02d8cb440a34 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 22 Sep 2023 10:05:29 +0800 Subject: [PATCH 053/427] [Paddle] Eliminate amax update bubbles by using custom_ops (#436) * Eliminate amax_and_scale_update bubbles Signed-off-by: rewang * Add CUDA check Signed-off-by: rewang --------- Signed-off-by: rewang --- tests/paddle/test_operators.py | 38 ++++++++- transformer_engine/paddle/csrc/custom_ops.cu | 81 +++++++++++++++----- transformer_engine/paddle/fp8.py | 32 ++------ transformer_engine/paddle/fp8_buffer.py | 7 +- 4 files changed, 108 insertions(+), 50 deletions(-) diff --git a/tests/paddle/test_operators.py b/tests/paddle/test_operators.py index c4211a7218..7a2472e4bc 100644 --- a/tests/paddle/test_operators.py +++ b/tests/paddle/test_operators.py @@ -865,13 +865,17 @@ def test_scaled_upper_triang_masked_softmax_fwd_bwd(dtype): assert_allclose(dx_ref, dx, rtol=1e-4, atol=5e-3) -def test_update_scale(): +def test_amax_and_scale_update(): """Test update_scale""" num_gemm = 6 + history_len = 1024 recipe = DelayedScaling() fp8_max = recipe.fp8_format.value.max_fwd - amax_tensor = paddle.rand(shape=[num_gemm], dtype='float32') * fp8_max + amax_history_tensor = paddle.rand(shape=[history_len, num_gemm], dtype='float32') + rolled_history_ref = paddle.roll(amax_history_tensor, -1, axis=0) + rolled_history_ref[0] = 0.0 + amax_tensor = paddle.max(amax_history_tensor, axis=0) scale_tensor = paddle.ones(shape=[num_gemm], dtype='float32') def calc_ref(amax, scale, fp8_max, margin=0): @@ -884,6 +888,32 @@ def calc_ref(amax, scale, fp8_max, margin=0): return sf scale_ref = calc_ref(amax_tensor, scale_tensor, fp8_max, 0.) - scale_actual = tex.update_scale(amax_tensor, scale_tensor, fp8_max, 0.) + scale_inv_ref = 1. 
/ scale_ref - assert_allclose(scale_ref, scale_actual, rtol=1e-5, atol=1e-5) + # Placeholder + scale_actual = paddle.zeros_like(scale_tensor) + scale_inv_actual = paddle.zeros_like(scale_tensor) + + tex.amax_and_scale_update_inplace(_amax_history=amax_history_tensor, + _scale=scale_actual, + _scale_inv=scale_inv_actual, + fp8_max=fp8_max, + margin=0., + amax_compute="max") + + assert_allclose(scale_actual, scale_ref, rtol=1e-7, atol=1e-7) + assert_allclose(scale_inv_actual, scale_inv_ref, rtol=1e-7, atol=1e-7) + assert_allclose(amax_history_tensor, rolled_history_ref, rtol=1e-7, atol=1e-7) + + +def test_update_latest_history(): + """Test update_latest_history""" + num_gemm = 6 + history_len = 1024 + + amax_history_tensor = paddle.rand(shape=[history_len, num_gemm], dtype='float32') + amax = paddle.rand(shape=[num_gemm], dtype='float32') + + tex.update_latest_amax_history_inplace(_history=amax_history_tensor, amax=amax) + + assert_allclose(amax_history_tensor[0], amax, rtol=1e-7, atol=1e-7) diff --git a/transformer_engine/paddle/csrc/custom_ops.cu b/transformer_engine/paddle/csrc/custom_ops.cu index 76f8987306..44e0202e53 100644 --- a/transformer_engine/paddle/csrc/custom_ops.cu +++ b/transformer_engine/paddle/csrc/custom_ops.cu @@ -1019,28 +1019,62 @@ void te_scaled_upper_triang_masked_softmax_backward(paddle::Tensor &output_grads softmax_results.stream()); } -__global__ void UpdateScalesKernel(const float *amax, const float *scale, float margin, - float fp8_max, size_t size, float *scale_out) { +__global__ void UpdateFP8MetaKernel(const float *amax, const float *rolled_amax_history, + float *amax_history, float *scale, float *scale_inv, + float margin, float fp8_max, size_t history_numel, + size_t amax_numel) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + if (idx >= history_numel) { + return; + } + + amax_history[idx] = rolled_amax_history[idx]; + + if (idx < amax_numel) { float exp = floor(log2(fp8_max / amax[idx])) - margin; float sf = 
round(powf(2.0f, abs(exp))); - sf = ((amax[idx] > 0.0f) && isfinite(amax[idx])) ? sf : scale[idx]; - scale_out[idx] = exp < 0.0f ? 1 / sf : sf; + float scale_reg = scale[idx]; + sf = ((amax[idx] > 0.0f) && isfinite(amax[idx])) ? sf : scale_reg; + scale_reg = exp < 0.0f ? 1 / sf : sf; + scale[idx] = scale_reg; + scale_inv[idx] = 1.0f / scale_reg; + amax_history[idx] = 0.0f; } } -std::vector update_scale(const paddle::Tensor &amax, const paddle::Tensor &scale, - float fp8_max, float margin) { - const size_t block_size = 512; - size_t size = static_cast(amax.numel()); - size_t num_blocks = (size + block_size - 1) / block_size; - auto scale_out = paddle::empty_like(scale, scale.dtype(), scale.place()); - UpdateScalesKernel<<>>( - amax.data(), scale.data(), margin, fp8_max, size, scale_out.data()); +void amax_and_scale_update_inplace(paddle::Tensor &amax_history, // NOLINT + paddle::Tensor &scale, // NOLINT + paddle::Tensor &scale_inv, // NOLINT + float fp8_max, float margin, const std::string &amax_compute) { + NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent"); + + paddle::Tensor amax; + + if (amax_compute == "max") { + amax = amax_history.max({0}); + } else { + amax = amax_history.slice(0, 1); + } + + const auto rolled_amax_history = amax_history.roll({-1}, {0}); + + auto size = amax_history.numel(); + constexpr int BLOCK_SIZE = 256; + size_t num_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE; + UpdateFP8MetaKernel<<>>( + amax.data(), rolled_amax_history.data(), amax_history.data(), + scale.data(), scale_inv.data(), margin, fp8_max, amax_history.numel(), + amax.numel()); + NVTE_CHECK_CUDA(cudaGetLastError()); +} - return {scale_out}; +void update_latest_amax_history_inplace(paddle::Tensor &history, // NOLINT + const paddle::Tensor &amax) { + // Copy amax to history[0] + NVTE_CHECK_CUDA(cudaMemcpyAsync(history.data(), amax.data(), + amax.numel() * SizeOf(amax.dtype()), cudaMemcpyDeviceToDevice, + amax.stream())); } } // namespace paddle_ext @@ -1242,8 
+1276,17 @@ PD_BUILD_OP(te_scaled_upper_triang_masked_softmax_backward) .SetKernelFn( PD_KERNEL(transformer_engine::paddle_ext::te_scaled_upper_triang_masked_softmax_backward)); -PD_BUILD_OP(update_scale) - .Inputs({"Amax", "Scale"}) - .Outputs({"ScaleOut"}) - .Attrs({"fp8_max: float", "margin: float"}) - .SetKernelFn(PD_KERNEL(transformer_engine::paddle_ext::update_scale)); +PD_BUILD_OP(amax_and_scale_update_inplace) + .Inputs({"_amax_history", "_scale", "_scale_inv"}) + .Outputs({"amax_history", "scale", "scale_inv"}) + .SetInplaceMap({{"_amax_history", "amax_history"}, + {"_scale", "scale"}, + {"_scale_inv", "scale_inv"}}) + .Attrs({"fp8_max: float", "margin: float", "amax_compute: std::string"}) + .SetKernelFn(PD_KERNEL(transformer_engine::paddle_ext::amax_and_scale_update_inplace)); + +PD_BUILD_OP(update_latest_amax_history_inplace) + .Inputs({"_history", "amax"}) + .Outputs({"history"}) + .SetInplaceMap({{"_history", "history"}}) + .SetKernelFn(PD_KERNEL(transformer_engine::paddle_ext::update_latest_amax_history_inplace)); diff --git a/transformer_engine/paddle/fp8.py b/transformer_engine/paddle/fp8.py index e56f1de767..abf347042a 100644 --- a/transformer_engine/paddle/fp8.py +++ b/transformer_engine/paddle/fp8.py @@ -197,30 +197,12 @@ def amax_and_scale_update( fp8_max_key = "fp8_max_fwd" if fwd_update else "fp8_max_bwd" if not callable(amax_compute) and sf_compute is None: - # Obtain amax from history - amax_history = fp8_meta[fp8_meta_tensor_key].amax_history - if amax_compute == "max": - amax = paddle.max(amax_history, axis=0) - else: # amax_compute_algo == "most_recent" - amax = amax_history[0] - - # Update amax history and set next amax to zero - if amax_history.shape[0] > 1: - amax_history = paddle.roll(amax_history, -1, 0) - amax_history[0] = 0.0 - fp8_meta[fp8_meta_tensor_key].amax_history = amax_history - - # Update scaling factor - fp8_meta[fp8_meta_tensor_key].scale = tex.update_scale( - amax=amax, - scale=fp8_meta[fp8_meta_tensor_key].scale, - 
fp8_max=fp8_meta[fp8_max_key], - margin=float(fp8_meta["recipe"].margin)) - - # Update scale_inv - fp8_meta[fp8_meta_tensor_key].scale_inv = \ - 1.0 / fp8_meta[fp8_meta_tensor_key].scale - + tex.amax_and_scale_update_inplace(_amax_history=fp8_meta[fp8_meta_tensor_key].amax_history, + _scale=fp8_meta[fp8_meta_tensor_key].scale, + _scale_inv=fp8_meta[fp8_meta_tensor_key].scale_inv, + fp8_max=fp8_meta[fp8_max_key], + margin=float(fp8_meta["recipe"].margin), + amax_compute=amax_compute) else: raise ValueError("We only support the fp8 recipe with 'max' or 'most_recent' " "amax_compute_algo and default scaling_factor_compute_algo at this " @@ -247,7 +229,7 @@ def prepare(self, num_gemms: bool, amax_history_len: int) -> None: curr_len = self.amax_history.shape[0] num_fp8_tensors = self.amax_history.shape[1] if amax_history_len < curr_len: - self.amax_history = (self.amax_history[:amax_history_len]) + self.amax_history = self.amax_history[:amax_history_len] elif amax_history_len > curr_len: extra_rows = amax_history_len - curr_len self.amax_history = paddle.concat([ diff --git a/transformer_engine/paddle/fp8_buffer.py b/transformer_engine/paddle/fp8_buffer.py index b6f082d69d..93090195a1 100644 --- a/transformer_engine/paddle/fp8_buffer.py +++ b/transformer_engine/paddle/fp8_buffer.py @@ -11,6 +11,7 @@ import numpy as np import paddle +import transformer_engine_paddle as tex from .constants import dist_group_type, RecomputeFunctionNames @@ -152,8 +153,10 @@ def copy_amax_from_buffer(self, fp8_meta: Dict[str, Any]) -> None: amax_buffer_key = self._get_amax_buffer_key(fp8_meta) assert amax_buffer_key in self._data, "TE internal error." 
- fp8_meta[fp8_meta_tensor_key].amax_history[0] = self._data[amax_buffer_key][ - fp8_meta[buffer_position_key]] + # Copy amax to amax_history[0] + tex.update_latest_amax_history_inplace( + _history=fp8_meta[fp8_meta_tensor_key].amax_history, + amax=self._data[amax_buffer_key][fp8_meta[buffer_position_key]]) def set_for_deletion(self, fp8_meta: Dict[str, Any]) -> None: """Delete this amax key from global buffer during autocast end.""" From a6e1b10f05718c0853792532e9fa556c60a411f3 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 22 Sep 2023 23:42:31 -0700 Subject: [PATCH 054/427] Change scaling factor from E8M0 to E8M23 (#427) * Change scaling factor from E8M0 to E8M23 Signed-off-by: Kirthi Shankar Sivamani * fix formula Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- tests/paddle/test_operators.py | 4 +--- transformer_engine/common/recipe.py | 3 +-- transformer_engine/jax/fp8.py | 10 +++------- transformer_engine/paddle/csrc/custom_ops.cu | 7 ++----- transformer_engine/pytorch/fp8.py | 5 +---- transformer_engine/tensorflow/fp8.py | 4 +--- 6 files changed, 9 insertions(+), 24 deletions(-) diff --git a/tests/paddle/test_operators.py b/tests/paddle/test_operators.py index 7a2472e4bc..fbdd95de95 100644 --- a/tests/paddle/test_operators.py +++ b/tests/paddle/test_operators.py @@ -880,11 +880,9 @@ def test_amax_and_scale_update(): def calc_ref(amax, scale, fp8_max, margin=0): """Calculate reference scale""" - exp = paddle.floor(paddle.log2(fp8_max / amax)) - margin - sf = paddle.round(2**paddle.abs(exp)) + sf = (fp8_max / amax) / (2 ** margin) sf = paddle.where(amax > 0.0, sf, scale) sf = paddle.where(paddle.isfinite(amax), sf, scale) - sf = paddle.where(exp < 0, 1 / sf, sf) return sf scale_ref = calc_ref(amax_tensor, scale_tensor, fp8_max, 0.) 
diff --git a/transformer_engine/common/recipe.py b/transformer_engine/common/recipe.py index 3bb5320475..c5d2ee4972 100644 --- a/transformer_engine/common/recipe.py +++ b/transformer_engine/common/recipe.py @@ -115,8 +115,7 @@ def scaling_factor_compute(amax: Tensor, .. code-block:: python FP8_MAX = maximum_representable_value(fp8_format) - exp = get_exponent(FP8_MAX / amax) - margin - new_scaling_factor = 2.0 ^ exp + new_scaling_factor = (FP8_MAX / amax) / (2 ^ margin) * The scaling factor should always be a power of 2 to not introduce numerical error during the conversion from FP8 to higher precision format. diff --git a/transformer_engine/jax/fp8.py b/transformer_engine/jax/fp8.py index f5015a315f..83aad88c07 100644 --- a/transformer_engine/jax/fp8.py +++ b/transformer_engine/jax/fp8.py @@ -310,11 +310,9 @@ def _update_fp8_metas_impl(fp8_metas: Collection) -> Collection: amax = fp8_meta_arrays[fp8_amax_idx][..., 0:1] scale = fp8_meta_arrays[fp8_scale_idx] - exp = jnp.floor(jnp.log2(fp8_max / amax)) - FP8Helper.MARGIN - sf = jnp.round(jnp.power(2, jnp.abs(exp))) + sf = (fp8_max / amax) / (2 ** FP8Helper.MARGIN) sf = jnp.where(amax > 0.0, sf, scale) sf = jnp.where(jnp.isfinite(amax), sf, scale) - scale = jnp.where(exp < 0, 1 / sf, sf) fp8_meta_arrays[fp8_scale_idx] = scale fp8_meta_arrays[fp8_scale_inv_idx] = 1 / scale @@ -426,11 +424,9 @@ def update_fp8_metas(state: Collection) -> Collection: .. 
code-block:: python - exp = floor(log2(fp8_max / amax)) - margin - sf = round(power(2, abs(exp))) + sf = (fp8_max / amax) / (2 ^ margin) sf = sf if amax > 0.0, else original_scale - sf = sf if isfinite(amax), else original_scale) - updated_scale = 1/sf if exp < 0, else sf + updated_scale = sf if isfinite(amax), else original_scale) updated_scale_inv = 1/updated_scale Collection = [dict, flax.core.frozen_dict.FrozenDict] diff --git a/transformer_engine/paddle/csrc/custom_ops.cu b/transformer_engine/paddle/csrc/custom_ops.cu index 44e0202e53..d08080b168 100644 --- a/transformer_engine/paddle/csrc/custom_ops.cu +++ b/transformer_engine/paddle/csrc/custom_ops.cu @@ -1032,11 +1032,8 @@ __global__ void UpdateFP8MetaKernel(const float *amax, const float *rolled_amax_ amax_history[idx] = rolled_amax_history[idx]; if (idx < amax_numel) { - float exp = floor(log2(fp8_max / amax[idx])) - margin; - float sf = round(powf(2.0f, abs(exp))); - float scale_reg = scale[idx]; - sf = ((amax[idx] > 0.0f) && isfinite(amax[idx])) ? sf : scale_reg; - scale_reg = exp < 0.0f ? 1 / sf : sf; + float sf = (fp8_max / amax[idx]) / powf(2.0f, margin); + float scale_reg = ((amax[idx] > 0.0f) && isfinite(amax[idx])) ? 
sf : scale[idx]; scale[idx] = scale_reg; scale_inv[idx] = 1.0f / scale_reg; amax_history[idx] = 0.0f; diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index 5e9f6634f9..51cd565f5b 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -538,12 +538,9 @@ def _default_sf_compute( margin: int, ) -> torch.Tensor: """Default function to convert amax to scaling factor.""" - exp = torch.floor(torch.log2(fp8_max / amax)) - margin - sf = torch.round(torch.pow(2, torch.abs(exp))) + sf = (fp8_max / amax) / (2 ** margin) sf = torch.where(amax > 0.0, sf, scale) sf = torch.where(torch.isfinite(amax), sf, scale) - sf = torch.where(exp < 0, 1 / sf, sf) - return sf diff --git a/transformer_engine/tensorflow/fp8.py b/transformer_engine/tensorflow/fp8.py index d04471ff12..b6dfb69308 100644 --- a/transformer_engine/tensorflow/fp8.py +++ b/transformer_engine/tensorflow/fp8.py @@ -157,11 +157,9 @@ def get_fp8_recipe(): def _default_sf_compute(amax, scale, fp8_max, margin): """Default function to convert amax to scaling factor.""" - exp = tf.math.floor(tf.experimental.numpy.log2(fp8_max / amax)) - margin - sf = tf.math.round(tf.math.pow(2.0, tf.math.abs(exp))) + sf = (fp8_max / amax) / (2 ** margin) sf = tf.where(amax > 0.0, sf, scale) sf = tf.where(tf.math.is_finite(amax), sf, scale) - sf = tf.where(exp < 0, 1.0 / sf, sf) return sf From a7b22b754cd49ccf556240d725a9bdb2ae68caff Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 22 Sep 2023 23:42:44 -0700 Subject: [PATCH 055/427] [PyTorch] Fix ONNX exports (#437) * Fix ONNX exports Signed-off-by: Kirthi Shankar Sivamani * docs Signed-off-by: Kirthi Shankar Sivamani * review Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- tests/pytorch/test_onnx_export.py | 173 ++-------------------- transformer_engine/pytorch/attention.py | 65 +++----- transformer_engine/pytorch/transformer.py | 34 ++--- 3 files changed, 48 
insertions(+), 224 deletions(-) diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 14640febde..533e0cff6a 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -763,156 +763,6 @@ def forward(self, inp): validate_result( fname, inp, model, atol=atol, is_fp8=use_fp8, allow_cnt_errors=3, te_outputs=te_outputs) -@skip_FP8 -@pytest.mark.parametrize("softmax_fn", [ - softmax_defs.ScaledUpperTriangMaskedSoftmax, - softmax_defs.ScaledMaskedSoftmax, - softmax_defs.ScaledSoftmax, - te.softmax.FusedScaleMaskSoftmax, -]) -# Softmax kernel only supports FP16 or BF16! -@pytest.mark.parametrize("precision", [torch.float16, torch.bfloat16, "fake-torch.bfloat16"]) -def test_export_softmax(seed_default_rng, set_max_seq_len, softmax_fn, precision): - class Test_Softmax(nn.Module): - def __init__(self, softmax_fn, fake_bf16_io, mask_inp=False): - super().__init__() - self.softmax_fn = softmax_fn - self.scale = 8 # arbitrary value - self.mask_inp = mask_inp - self.fused_scaled_softmax = None - self.fake_bf16_io = fake_bf16_io - if self.softmax_fn == te.softmax.FusedScaleMaskSoftmax: - self.fused_scaled_softmax = te.softmax.FusedScaleMaskSoftmax( - mask_func=te.utils.attention_mask_func, - softmax_in_fp32=True, - ) - - def forward(self, inp, mask): - if self.fake_bf16_io: - inp = inp.type(torch.bfloat16) - - if self.fused_scaled_softmax: - ret = self.fused_scaled_softmax(inp, mask, "causal", self.scale) - else: - if self.mask_inp: - ret = self.softmax_fn.apply(inp, mask, self.scale) - else: - ret = self.softmax_fn.apply(inp, self.scale) - if self.fake_bf16_io: - ret = ret.type(torch.float32) - return ret - - fake_bf16_io = precision == "fake-torch.bfloat16" - precision = torch.bfloat16 if fake_bf16_io else precision - - # Set dimensions (these are arbitrary). 
- batch_size, n_heads, seq_len_q, seq_len_k = 64, 96, 32, 32 - mask = None - input_names = ["input", "mask"] - inp_shape = [batch_size, n_heads, seq_len_q, seq_len_k] - if softmax_fn == softmax_defs.ScaledUpperTriangMaskedSoftmax: - inp_shape = [batch_size, seq_len_q, seq_len_k] - kernel_str = "ScaledUpperTriangMaskedSoftmax" - model = Test_Softmax(softmax_fn, fake_bf16_io) - elif softmax_fn == softmax_defs.ScaledMaskedSoftmax: - # Generate a random mask with 50% probability for 0 or 1. - probs = 0.5 * torch.ones(1, 1, seq_len_q, seq_len_k, device="cuda", dtype=precision) - mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) - kernel_str = "ScaledMaskedSoftmax" - model = Test_Softmax(softmax_fn, fake_bf16_io, mask_inp=True) - elif softmax_fn == softmax_defs.ScaledSoftmax: - kernel_str = "ScaledSoftmax" - model = Test_Softmax(softmax_fn, fake_bf16_io) - elif softmax_fn == te.softmax.FusedScaleMaskSoftmax: - kernel_str = "TorchSoftmax" - model = Test_Softmax(softmax_fn, fake_bf16_io) - - input_tensor = torch.randn(*inp_shape, device="cuda", dtype=torch.float32 if fake_bf16_io else precision) - high_prec_str = dtype2str(precision, fake_bf16_io=fake_bf16_io) - fname = f"{kernel_str}{high_prec_str}.onnx" - inp = (input_tensor, mask) - dynamic_axes = {} - if mask is not None: - dynamic_axes = {"mask": {2:"seq_len_q", 3:"seq_len_k"}} - do_export(model, inp, fname, input_names=input_names, dynamic_axes=dynamic_axes) - te_outputs = te_infer(model, inp, is_fp8=False) - serialize_inputs_outputs(fname, inp, te_outputs, input_names=input_names) - if fake_bf16_io or precision != torch.bfloat16: - atol = 5e-2 if fake_bf16_io else 1e-3 - validate_result(fname, inp, model, atol=atol, input_names=input_names, te_outputs=te_outputs) - - -# Test dynamically generated softmax mask. -# Softmax kernel only supports FP16 or BF16! 
-@skip_FP8 -@pytest.mark.parametrize("precision", [torch.float16, torch.bfloat16, "fake-torch.bfloat16"]) -def test_softmax_mask_fn(seed_default_rng, precision): - fake_bf16_io = precision == "fake-torch.bfloat16" - # reset precision to torch.bfloat16 after capturing fake BF16 mode - precision = torch.bfloat16 if fake_bf16_io else precision - - class Test_Softmax(nn.Module): - def __init__(self, use_default_te_mask_fn: bool, fake_bf16_io: bool): - super().__init__() - self.scale = 1 # arbitrary value - self.fake_bf16_io = fake_bf16_io - - if use_default_te_mask_fn: - os.environ["NVTE_ONNX_KVCACHE_MAX_SEQ_LEN"] = "0" - else: - os.environ["NVTE_ONNX_KVCACHE_MAX_SEQ_LEN"] = f"{seq_len_q}" - - # Use NVTE_MASKED_SOFTMAX_FUSION to force TE to use forward_torch_softmax - # even when is_in_onnx_export_mode()==False. - os.environ["NVTE_MASKED_SOFTMAX_FUSION"] = "0" - self.fused_scaled_softmax = te.softmax.FusedScaleMaskSoftmax( - mask_func=te.utils.attention_mask_func, - softmax_in_fp32=True, - ) - - def forward(self, inp, mask): - if self.fake_bf16_io: - inp = inp.type(torch.bfloat16) - ret = self.fused_scaled_softmax(inp, mask, "causal", scale=self.scale) - if self.fake_bf16_io: - ret = ret.type(torch.float) - return ret - - # Set dimensions (these are arbitrary). - mask = None - batch_size, n_heads, seq_len_q, seq_len_k = 64, 96, 32, 32 - assert seq_len_q == seq_len_k # This is a causal (TRILU) mask - inp_shape = [batch_size, n_heads, seq_len_q, seq_len_k] - input_tensor = torch.randn( - *inp_shape, device="cuda", dtype=torch.float if fake_bf16_io else precision) - inp = (input_tensor, mask) - high_prec_str = dtype2str(precision, fake_bf16_io=fake_bf16_io) - - # Compare the outputs of TE when using the default softmax mask - # to the TE outputs produced when using the ONNX-compatible causal mask. - # This verifies that _get_onnx_export_causal_mask generates a correct mask. 
- model = Test_Softmax(use_default_te_mask_fn=True, fake_bf16_io=fake_bf16_io) - te_outputs_default_mask = te_infer(model, inp, is_fp8=True) - with te.onnx_export(True): - # ONNX export mode forces use of the ONNX-compatible causal mask. - model_onnx_mask = Test_Softmax(use_default_te_mask_fn=False, fake_bf16_io=fake_bf16_io) - te_outputs_onnx_mask = te_infer(model_onnx_mask, inp, is_fp8=True) - compare_outputs(te_outputs_default_mask, te_outputs_onnx_mask, - atol=0, rtol=0, max_errors_printed=10, allow_cnt_errors=0, fname="softmax masking") - - # Compare the outputs of TE when using the default softmax mask - # to the ORT ONNX outputs produced when using the ONNX-compatible causal mask. - input_names = ["input", "mask"] - kernel_str = "FusedScaleMaskSoftmax" - fname = f"{kernel_str}{high_prec_str}.onnx" - do_export(model, inp, fname, input_names=input_names) - serialize_inputs_outputs(fname, inp, te_outputs=te_outputs_default_mask, input_names=input_names) - if fake_bf16_io or precision != torch.bfloat16: - atol = 1e-2 if fake_bf16_io else 1e-3 - validate_result( - fname, inp, model_onnx_mask, atol=atol, - input_names=input_names, te_outputs=te_outputs_default_mask) - @pytest.mark.parametrize("scale_factor", [1]) @pytest.mark.parametrize("use_fp8", [False, True]) @@ -1159,13 +1009,13 @@ def test_export_core_attention( query_layer = torch.randn(qkv_size, dtype=precision, device="cuda") key_layer = torch.randn(qkv_size, dtype=precision, device="cuda") value_layer = torch.randn(qkv_size, dtype=precision, device="cuda") - input_names = ["query", "key", "value", "attention_mask", "attn_mask_type"] + input_names = ["query", "key", "value", "attention_mask"] attention_mask = None if use_mask: # Generate a random mask with 50% probability for 0 or 1. 
probs = 0.5 * torch.ones(qkv_size[1], qkv_size[2], qkv_size[0], qkv_size[0], device="cuda", dtype=precision) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) - inp = (query_layer, key_layer, value_layer, attention_mask, attn_mask_type) + inp = (query_layer, key_layer, value_layer, attention_mask) mask_str = get_attn_mask_str(use_mask, attn_mask_type) high_prec_str = dtype2str(precision) @@ -1175,6 +1025,7 @@ def test_export_core_attention( num_attention_heads=num_attention_heads, kv_channels=kv_channels, attention_dropout=0.5, + attn_mask_type=attn_mask_type, ).to(device='cuda') do_export(model, inp, @@ -1190,8 +1041,9 @@ def test_export_core_attention( test_configs_multihead_attention = [ #"use_mask, attn_mask_type" - (False, "no_mask"), # calls ScaledUpperTriangMaskedSoftmax + (False, "causal"), # calls ScaledUpperTriangMaskedSoftmax (True, "padding"), # calls ScaledMaskedSoftmax + (False, "padding"), # calls ScaledSoftmax ] test_configs_attention_type = [ #"input_layernorm, attention_type, fuse_qkv_params" @@ -1265,6 +1117,7 @@ def test_export_multihead_attention( model = te.MultiheadAttention( *attention_args, + attn_mask_type=attn_mask_type, params_dtype=precision, return_layernorm_output=return_layernorm_output, input_layernorm=input_layernorm, @@ -1273,8 +1126,8 @@ def test_export_multihead_attention( return_bias=True, ).to(device='cuda') - inp_context = (hidden_states_context, attention_mask, encoder_output, attn_mask_type) - input_names = ["hidden_states", "attention_mask", "encoder_output", "attn_mask_type"] + inp_context = (hidden_states_context, attention_mask, encoder_output) + input_names = ["hidden_states", "attention_mask", "encoder_output"] output_names=["attention_output", "attention_bias"] do_export(model, inp_context, fname, use_fp8, input_names=input_names, output_names=output_names, dynamic_axes={"hidden_states": {0: "seq", 1:"bs"}, @@ -1342,13 +1195,13 @@ def test_export_transformer_layer( num_attention_heads = 4 
input_tensor = torch.rand(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda") - input_names = ["input", "attention_mask", "self_attn_mask_type"] + input_names = ["input", "attention_mask"] attention_mask = None if use_mask and attn_mask_type != "causal": # Generate a random mask with 50% probability for 0 or 1. probs = 0.5 * torch.ones(batch_size, 1, sequence_length, sequence_length, device="cuda", dtype=precision) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) - inp = (input_tensor, attention_mask, attn_mask_type) + inp = (input_tensor, attention_mask) fp8_str = "_fp8" if use_fp8 else "" fuse_qkv_params_str = "_fused-qkv" if fuse_qkv_params else "" @@ -1360,6 +1213,7 @@ def test_export_transformer_layer( hidden_size, ffn_hidden_size, num_attention_heads, + self_attn_mask_type=attn_mask_type, output_layernorm=output_layernorm, params_dtype=precision, fuse_qkv_params=fuse_qkv_params, @@ -1541,16 +1395,17 @@ def test_export_gpt_generation( hidden_size, ffn_hidden_size, num_attention_heads, + self_attn_mask_type=attn_mask_type, output_layernorm=output_layernorm, params_dtype=precision, fuse_qkv_params=fuse_qkv_params, zero_centered_gamma=zero_centered_gamma).to(device='cuda') # "Context phase": use full input sequence length - input_names = ["input", "attention_mask", "self_attn_mask_type"] + input_names = ["input"] output_names = ["output"] input_tensor = torch.rand(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda") - inp = (input_tensor, None, attn_mask_type) + inp = (input_tensor,) do_export(model, inp, fname, use_fp8, input_names=input_names, output_names=output_names, dynamic_axes={"input": {0: "seq", 1:"bs"}, diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index b8f9befb1f..f9aa63ce8a 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -186,6 +186,7 @@ def backward(ctx, tensors = 
split_tensor_along_dim(grad_outputs[0], ctx.dim, 2) return tensors[0], tensors[1], None + class UnfusedDotProductAttention(torch.nn.Module): """Parallel attention w/o QKV and Proj Gemms BMM1 -> softmax + dropout -> BMM2 @@ -883,11 +884,6 @@ class DotProductAttention(torch.nn.Module): and set the environment variable :attr:`NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`. In order to disable`flash-attn` entirely, set :attr:`NVTE_FLASH_ATTN=0`. - .. warning:: - - Argument :attr:`attn_mask_type` has been moved to the `forward` method and - is deprecated. It will be fully removed in future releases. - Parameters ---------- num_attention_heads : int @@ -907,6 +903,12 @@ class DotProductAttention(torch.nn.Module): layer_number: int, default = `None` layer number of the current `DotProductAttention` when multiple such modules are concatenated, for instance in consecutive transformer blocks. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. Overridden by + :attr:`attn_mask_type` in the `forward` method. The forward + arg is useful for dynamically changing mask types, e.g. a different + mask for training and inference. The init arg is useful for cases + involving compilation/tracing, e.g. ONNX export. Parallelism parameters ---------------------- @@ -924,7 +926,7 @@ def __init__( kv_channels: int, num_gqa_groups: Optional[int] = None, attention_dropout: float = 0.0, - attn_mask_type: Optional[str] = None, + attn_mask_type: str = "causal", sequence_parallel: bool = False, tp_size: int = 1, get_rng_state_tracker: Optional[Callable] = None, @@ -934,13 +936,6 @@ def __init__( ) -> None: super().__init__() - if attn_mask_type is not None: - warnings.warn( - "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" - "is deprecated. 
It will be fully removed in future releases.", - category=DeprecationWarning, - ) - self.attn_mask_type = attn_mask_type self.tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group) self.tp_group = tp_group @@ -1031,7 +1026,7 @@ def forward( key_layer: torch.Tensor, value_layer: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - attn_mask_type: str = "causal", + attn_mask_type: Optional[str] = None, checkpoint_core_attention: bool = False, core_attention_bias_type: str = "no_bias", core_attention_bias: Optional[torch.Tensor] = None, @@ -1087,7 +1082,7 @@ def forward( Value tensor. attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out softmax input when not using flash-attn. - attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `None` type of attention mask passed into softmax operation. checkpoint_core_attention : bool, default = `False` If true, forward activations for attention are recomputed @@ -1102,13 +1097,7 @@ def forward( Whether to use the fast path to set output tensors to 0 or not. """ - if self.attn_mask_type is not None: - warnings.warn( - "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" - "is deprecated. It will be fully removed in future releases.", - category=DeprecationWarning, - ) - # Keep previous functionality for current users. + if attn_mask_type is None: attn_mask_type = self.attn_mask_type assert (key_layer.shape[-2] == self.num_gqa_groups_per_partition @@ -1229,11 +1218,6 @@ class MultiheadAttention(torch.nn.Module): Argument :attr:`attention_mask` will be ignored in the `forward` call when :attr:`attn_mask_type` is set to `"causal"`. - .. warning:: - - Argument :attr:`attn_mask_type` has been moved to the `forward` method and - is deprecated. It will be fully removed in future releases. 
- Parameters ---------- hidden_size : int @@ -1259,6 +1243,12 @@ class MultiheadAttention(torch.nn.Module): layer_number: int, default = `None` layer number of the current `TransformerLayer` when multiple such modules are concatenated to form a transformer block. + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. Overridden by + :attr:`attn_mask_type` in the `forward` method. The forward + arg is useful for dynamically changing mask types, e.g. a different + mask for training and inference. The init arg is useful for cases + involving compilation/tracing, e.g. ONNX export. num_gqa_groups : int, default = `None` number of GQA groups in the transformer layer. Grouped Query Attention is described in @@ -1349,7 +1339,7 @@ def __init__( init_method: Optional[Callable] = None, output_layer_init_method: Optional[Callable] = None, layer_number: Optional[int] = None, - attn_mask_type: Optional[str] = None, + attn_mask_type: str = "causal", tp_group: Optional[dist_group_type] = None, tp_size: int = 1, num_gqa_groups: Optional[int] = None, @@ -1375,13 +1365,6 @@ def __init__( ) -> None: super().__init__() - if attn_mask_type is not None: - warnings.warn( - "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" - "is deprecated. It will be fully removed in future releases.", - category=DeprecationWarning, - ) - self.attn_mask_type = attn_mask_type self.layer_number = layer_number self.input_layernorm = input_layernorm @@ -1555,7 +1538,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, encoder_output: Optional[torch.Tensor] = None, - attn_mask_type: str = "causal", + attn_mask_type: Optional[str] = None, is_first_microbatch: Optional[bool] = None, checkpoint_core_attention: bool = False, inference_params: Optional[Any] = None, @@ -1578,7 +1561,7 @@ def forward( Input tensor. 
attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out self-attention softmax input. - attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `None` type of attention mask passed into softmax operation. encoder_output : Optional[torch.Tensor], default = `None` Output of the encoder block to be fed into the decoder block if using @@ -1613,13 +1596,7 @@ def forward( """ # hidden_states: [sq, b, h] - if self.attn_mask_type is not None: - warnings.warn( - "Argument :attr:`attn_mask_type` has been moved to the `forward` method and" - "is deprecated. It will be fully removed in future releases.", - category=DeprecationWarning, - ) - # Keep previous functionality for current users. + if attn_mask_type is None: attn_mask_type = self.attn_mask_type if attn_mask_type == "padding" and attention_mask is not None: diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 6b45a10fb3..d4046ec7da 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -73,10 +73,9 @@ class TransformerLayer(torch.nn.Module): Arguments :attr:`attention_softmax_in_fp32` and :attr:`apply_query_key_layer_scaling` are deprecated and will be fully removed in future releases. - .. warning:: - - Argument :attr:`self_attn_mask_type` has been moved to the `forward` method and - is deprecated. It will be fully removed in future releases. + .. note:: + Argument :attr:`attention_mask` will be ignored in the `forward` call when + :attr:`self_attn_mask_type` is set to `"causal"`. Parameters ---------- @@ -127,6 +126,12 @@ class TransformerLayer(torch.nn.Module): kv_channels: int, default = `None` number of key-value channels. defaults to :attr:`hidden_size` / :attr:`num_attention_heads` if `None`. 
+ self_attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` + type of attention mask passed into softmax operation. Overridden by + :attr:`self_attn_mask_type` in the `forward` method. The forward + arg is useful for dynamically changing mask types, e.g. a different + mask for training and inference. The init arg is useful for cases + involving compilation/tracing, e.g. ONNX export. zero_centered_gamma : bool, default = 'False' if set to 'True', gamma parameter in LayerNorm is initialized to 0 and the LayerNorm formula changes to @@ -212,7 +217,7 @@ def __init__( output_layer_init_method: Optional[Callable] = None, layer_number: Optional[int] = None, kv_channels: Optional[int] = None, - self_attn_mask_type: Optional[str] = None, + self_attn_mask_type: str = "causal", tp_group: Optional[dist_group_type] = None, tp_size: int = 1, params_dtype: Optional[torch.dtype] = None, @@ -239,13 +244,6 @@ def __init__( ) -> None: super().__init__() - if self_attn_mask_type is not None: - warnings.warn( - "Argument :attr:`self_attn_mask_type` has been moved to the `forward` method and" - "is deprecated. It will be fully removed in future releases.", - category=DeprecationWarning, - ) - warnings.warn( "Arguments `attention_softmax_in_fp32` and `apply_query_key_layer_scaling`" "are deprecated and will be fully removed in future releases.", @@ -431,7 +429,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - self_attn_mask_type: str = "causal", + self_attn_mask_type: Optional[str] = None, encoder_output: Optional[torch.Tensor] = None, enc_dec_attn_mask: Optional[torch.Tensor] = None, is_first_microbatch: Optional[bool] = None, @@ -456,7 +454,7 @@ def forward( Input tensor. attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out self-attention softmax input. 
- self_attn_mask_type: {'causal', 'padding'}, default = `causal` + self_attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `None` type of attention mask passed into softmax operation. encoder_output : Optional[torch.Tensor], default = `None` Output of the encoder block to be fed into the decoder block if using @@ -493,13 +491,7 @@ def forward( Whether to set output tensors to 0 or not before use. """ - if self.self_attn_mask_type is not None: - warnings.warn( - "Argument :attr:`self_attn_mask_type` has been moved to the `forward` method and" - "is deprecated. It will be fully removed in future releases.", - category=DeprecationWarning, - ) - # Keep previous functionality for current users. + if self_attn_mask_type is None: self_attn_mask_type = self.self_attn_mask_type assert ( From a402c4d2cb11d5860385f0bb8edc7597b442d3e6 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Fri, 22 Sep 2023 23:44:03 -0700 Subject: [PATCH 056/427] Fix layernorm in GQA (#434) * [PyTorch] Implement GQA based on fused q, k, v projection. Additionally fixes #392 Signed-off-by: Markus Schnoes * [PyTorch] Extend parameters_split option in Linear and LayerNormLinear to support splitting with different sizes as required by unfused GQA. 
Signed-off-by: Markus Schnoes * fix parameters split Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix noop cat to bypass torch.cat and support uneven split Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix unit tests Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix torch.split args Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix cuda graph due to noop_cat Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix lint Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove the use of enumerate when possible Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix strides in SplitAlongDim Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Markus Schnoes Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Co-authored-by: Markus Schnoes --- tests/pytorch/test_fused_attn.py | 13 +- transformer_engine/pytorch/attention.py | 146 +++++++++++------- transformer_engine/pytorch/module/base.py | 37 +++-- .../pytorch/module/layernorm_linear.py | 55 ++++--- transformer_engine/pytorch/module/linear.py | 55 ++++--- 5 files changed, 194 insertions(+), 112 deletions(-) diff --git a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index 32442e40fb..1a1515d843 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -141,7 +141,8 @@ def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type) @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("ckpt_attn", [False]) @pytest.mark.parametrize("bias_type", ["no_bias", "post_scale_bias"]) -def test_transformer_layer(dtype, bs, model, ckpt_attn, bias_type): +@pytest.mark.parametrize("fused_qkv_params", [True, False]) +def test_transformer_layer(dtype, bs, model, ckpt_attn, bias_type, 
fused_qkv_params): """Test TransformerLayer module when its DotProductAttention is enabled with FlashAttention, FusedAttention, or UnfusedDotProductAttention backend""" @@ -149,11 +150,11 @@ def test_transformer_layer(dtype, bs, model, ckpt_attn, bias_type): if bias_type == "no_bias": flash_attn_fwd, flash_attn_bwd = _run_transformer_layer( - dtype, bs, config, "FlashAttention", ckpt_attn, bias_type) + dtype, bs, config, "FlashAttention", ckpt_attn, bias_type, fused_qkv_params) fused_attn_fwd, fused_attn_bwd = _run_transformer_layer( - dtype, bs, config, "FusedAttention", ckpt_attn, bias_type) + dtype, bs, config, "FusedAttention", ckpt_attn, bias_type, fused_qkv_params) unfused_attn_fwd, unfused_attn_bwd = _run_transformer_layer( - dtype, bs, config, "UnfusedDotProductAttention", ckpt_attn, bias_type) + dtype, bs, config, "UnfusedDotProductAttention", ckpt_attn, bias_type, fused_qkv_params) atol, rtol = (5e-1, 5e-2) if bias_type == "no_bias": @@ -162,7 +163,7 @@ def test_transformer_layer(dtype, bs, model, ckpt_attn, bias_type): assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) -def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): +def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type, fused_qkv_params): reset_rng_states() os.environ["NVTE_FLASH_ATTN"] = "0" @@ -220,7 +221,7 @@ def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type): layer_type="encoder", drop_path_rate=drop_path_rates[layer_number - 1], set_parallel_mode=True, - fuse_qkv_params=True, + fuse_qkv_params=fused_qkv_params, zero_centered_gamma=False, qkv_weight_interleaved=False, ub_tp_comm_overlap=False, diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index f9aa63ce8a..bcf5584f3d 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ 
-8,8 +8,9 @@ import math from importlib.metadata import version from contextlib import nullcontext -from typing import Any, Callable, Optional, Tuple, Union, Dict +from typing import Any, Callable, Optional, Tuple, Union, Dict, List from pkg_resources import packaging +import numpy as np import torch @@ -84,48 +85,61 @@ def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: return torch.cat((t, t_pass), dim=-1) -class _SplitLastDim(torch.autograd.Function): +class _SplitAlongDim(torch.autograd.Function): """""" @staticmethod def forward(ctx, mixed_x_layer: torch.Tensor, - num_parts: int + split_dim: int, + split_size_or_sections: Union[int, List[int], Tuple[int]], ) -> Tuple[torch.Tensor, ...]: - return split_tensor_along_dim(mixed_x_layer, -1, num_parts) + ctx.split_dim = split_dim + ctx.split_size_or_sections = split_size_or_sections + return torch.split(mixed_x_layer, split_size_or_sections, dim = split_dim) @staticmethod def backward(ctx, *grad_outputs): assert len(grad_outputs) > 0, "No gradients received for backprop!" + if isinstance(ctx.split_size_or_sections, (list, tuple)): + split_sizes = ctx.split_size_or_sections + assert (len(grad_outputs) == len(split_sizes) + ), "Unequal number of gradients vs split sections for backprop!" 
+ if isinstance(ctx.split_size_or_sections, int): + split_sizes = [ctx.split_size_or_sections] * len(grad_outputs) + dims = len(grad_outputs[0].shape) + split_dim = (ctx.split_dim + dims) % dims + noop_ok = True strides = grad_outputs[0].stride() data_ptr = grad_outputs[0].storage().data_ptr() - shape = grad_outputs[0].shape - last_dim_size = grad_outputs[0].shape[-1] + shape = list(grad_outputs[0].shape) for i, tensor in enumerate(grad_outputs): + shape_i = shape + shape_i[split_dim] = split_sizes[i] + offset_size = sum(split_sizes[:i]) * np.prod(shape[split_dim+1:]) if (tensor.stride() != strides or - tensor.shape != shape or + list(tensor.shape) != shape_i or tensor.storage().data_ptr() != data_ptr or - tensor.storage_offset() != i * last_dim_size): + tensor.storage_offset() != offset_size): noop_ok = False break if noop_ok: - ret = torch.Tensor().to(grad_outputs[0].dtype) ret = torch.Tensor().to(device=grad_outputs[0].device, dtype=grad_outputs[0].dtype) new_shape = list(shape) - new_shape[-1] = new_shape[-1] * len(grad_outputs) - ret.set_(grad_outputs[0].storage(), + new_shape[split_dim] = sum(split_sizes) + ret.set_(grad_outputs[0].untyped_storage(), grad_outputs[0].storage_offset(), new_shape, - grad_outputs[0].stride() + strides ) - return ret, None + return ret, None, None - return torch.cat(grad_outputs, dim = -1), None + return torch.cat(grad_outputs, dim = split_dim), None, None class _CombineQKV(torch.autograd.Function): """""" @@ -1401,8 +1415,8 @@ def __init__( num_attention_heads if num_gqa_groups is None else num_gqa_groups ) assert (num_attention_heads % self.num_gqa_groups == 0 - ), "The number of GQA groups must be divisible by the number of attention heads!" - assert (num_attention_heads % tp_size == 0 + ), "The number of attention heads must be divisible by the number of GQA groups!" + assert (self.num_gqa_groups % tp_size == 0 ), "The number of GQA groups must be divisible by tensor parallel size!" 
self.num_gqa_groups_per_partition = int(self.num_gqa_groups // tp_size) self.hidden_size_kv = int(hidden_size * self.num_gqa_groups // num_attention_heads) @@ -1419,18 +1433,21 @@ def __init__( qkv_parallel_mode = "column" if set_parallel_mode else None - if self.attention_type == "self" and self.num_gqa_groups == self.num_attention_heads: + if self.attention_type == "self": + parameters_split = {"query_": hidden_size, + "key_": self.hidden_size_kv, + "value_": self.hidden_size_kv} if not fuse_qkv_params else None if self.input_layernorm: self.layernorm_qkv = LayerNormLinear( hidden_size, - 3 * hidden_size, + hidden_size + 2 * self.hidden_size_kv, eps=layernorm_epsilon, init_method=init_method, bias=bias, return_bias=False, parallel_mode=qkv_parallel_mode, return_layernorm_output=return_layernorm_output, - parameters_split=("query_", "key_", "value_") if not fuse_qkv_params else None, + parameters_split=parameters_split, zero_centered_gamma=zero_centered_gamma, ub_bulk_wgrad=ub_bulk_wgrad, ub_bulk_dgrad=ub_bulk_dgrad, @@ -1441,17 +1458,15 @@ def __init__( else: self.qkv = Linear( hidden_size, - 3 * hidden_size, + hidden_size + 2 * self.hidden_size_kv, init_method=init_method, bias=bias, return_bias=False, parallel_mode=qkv_parallel_mode, - parameters_split=("query_", "key_", "value_") if not fuse_qkv_params else None, + parameters_split=parameters_split, **common_gemm_kwargs, ) - elif ((self.attention_type == "cross") - or (self.attention_type == "self" - and self.num_gqa_groups != self.num_attention_heads)): + elif self.attention_type == "cross": if self.input_layernorm: self.layernorm_query = LayerNormLinear( hidden_size, @@ -1461,6 +1476,7 @@ def __init__( bias=bias, return_bias=False, parallel_mode=qkv_parallel_mode, + parameters_split=("query_",) if not fuse_qkv_params else None, return_layernorm_output=return_layernorm_output, zero_centered_gamma=zero_centered_gamma, ub_bulk_wgrad=ub_bulk_wgrad, @@ -1636,8 +1652,8 @@ def forward( # Query, Key, and Value # 
===================== - if self.attention_type == "self" and self.num_gqa_groups == self.num_attention_heads: - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + if self.attention_type == "self": + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn] if self.input_layernorm: layernorm_qkv_outputs = self.layernorm_qkv( hidden_states, @@ -1653,49 +1669,59 @@ def forward( is_first_microbatch=is_first_microbatch, ) + num_queries_per_key_value = (self.num_attention_heads_per_partition // + self.num_gqa_groups_per_partition) if self.qkv_weight_interleaved: - # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - # split along last dimension - split_dim = -1 - else: - # [sq, b, (np * 3 * hn)] --> [sq, b, 3 * np, hn] + # [sq, b, ng * (np/ng + 2) * hn] --> [sq, b, ng, (np/ng + 2), hn] new_tensor_shape = mixed_x_layer.size()[:-1] + ( - 3 * self.num_attention_heads_per_partition, + self.num_gqa_groups_per_partition, + (num_queries_per_key_value + 2), self.hidden_size_per_attention_head, ) # split along second last dimension split_dim = -2 + else: + # [sq, b, ng * (np/ng + 2) * hn] --> [sq, b, (np/ng + 2), ng, hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + (num_queries_per_key_value + 2), + self.num_gqa_groups_per_partition, + self.hidden_size_per_attention_head + ) + # split along third last dimension + split_dim = -3 mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - # mixed_x_layer --> 3 [sq, b, np, hn] - if split_dim == -1 and not is_in_onnx_export_mode(): - query_layer, key_layer, value_layer = _SplitLastDim.apply(mixed_x_layer, 3) - else: - query_layer, key_layer, value_layer = split_tensor_along_dim( - mixed_x_layer, split_dim, 3 + # qkv_weight_interleaved: + # [sq, b, ng, (np/ng + 2), hn] + # --> [sq, b, ng, np/ng, hn], [sq, b, ng, 1, hn], [sq, b, ng, 1, hn] + # not qkv_weight_interleaved: + # [sq, b, 
(np/ng + 2), ng, hn] + # --> [sq, b, np/ng, ng, hn], [sq, b, 1, ng, hn], [sq, b, 1, ng, hn] + if not is_in_onnx_export_mode(): + query_layer, key_layer, value_layer = _SplitAlongDim.apply( + mixed_x_layer, split_dim, (num_queries_per_key_value, 1, 1) ) - elif ((self.attention_type == "cross") - or (self.attention_type == "self" - and self.num_gqa_groups != self.num_attention_heads)): - - if self.attention_type == "cross": - input_tensor = encoder_output else: - input_tensor = hidden_states - - # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + query_layer, key_layer, value_layer = torch.split( + mixed_x_layer, (num_queries_per_key_value, 1, 1), dim = split_dim, + ) + + # query: -> [sq, b, np, hn] + # key, value: -> [sq, b, ng, hn] + query_layer, key_layer, value_layer = (x.reshape(x.size(0), x.size(1), -1, + self.hidden_size_per_attention_head) + for x in (query_layer, key_layer, value_layer)) + + elif self.attention_type == "cross": + # Attention heads [sk, b, h] --> [sk, b, (ng * 2 * hn)] mixed_kv_layer = self.key_value( - input_tensor, + encoder_output, is_first_microbatch=is_first_microbatch, ) if self.qkv_weight_interleaved: - # [sq, b, (np * 2 * hn)] --> [sq, b, np, 2 * hn] + # [sq, b, (ng * 2 * hn)] --> [sq, b, ng, 2 * hn] new_tensor_shape = mixed_kv_layer.size()[:-1] + ( self.num_gqa_groups_per_partition, 2 * self.hidden_size_per_attention_head, @@ -1703,7 +1729,7 @@ def forward( # split along last dimension split_dim = -1 else: - # [sq, b, (np * 2 * hn)] --> [sq, b, 2 * np, hn] + # [sq, b, (ng * 2 * hn)] --> [sq, b, 2 * ng, hn] new_tensor_shape = mixed_kv_layer.size()[:-1] + ( 2 * self.num_gqa_groups_per_partition, self.hidden_size_per_attention_head, @@ -1713,11 +1739,15 @@ def forward( mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) - # mixed_kv_layer --> 2 [sk, b, np, hn] - if split_dim == -1 and not is_in_onnx_export_mode(): - key_layer, value_layer = _SplitLastDim.apply(mixed_kv_layer, 2) + # mixed_kv_layer --> 2 [sk, b, ng, hn] + if 
not is_in_onnx_export_mode(): + key_layer, value_layer = _SplitAlongDim.apply( + mixed_kv_layer, split_dim, mixed_kv_layer.shape[split_dim] // 2, + ) else: - key_layer, value_layer = split_tensor_along_dim(mixed_kv_layer, split_dim, 2) + key_layer, value_layer = torch.split( + mixed_kv_layer, mixed_kv_layer.shape[split_dim] // 2, dim = split_dim, + ) # Attention head [sq, b, h] --> [sq, b, hp] if self.input_layernorm: diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 82d39eeaf0..50d7b9f2fb 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -212,8 +212,9 @@ def forward(ctx, *params_split: Tuple[torch.Tensor, ...], ) -> torch.Tensor: assert not full_param_buffer.requires_grad, "Buffers should not require gradient" + sum_params_shape = sum(p.shape[0] for p in params_split) assert ( - full_param_buffer.shape[0] % len(params_split) == 0 + full_param_buffer.shape[0] == sum_params_shape ), "Dimensions not compatible for concatenation" param_temp = full_param_buffer.new() @@ -223,18 +224,19 @@ def forward(ctx, full_param_buffer.stride()) param_temp.requires_grad = True - ctx.save_for_backward(full_param_buffer, *params_split) + ctx.save_for_backward(*params_split) return param_temp @staticmethod def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], ...]: - full_param_buffer, *params_split = ctx.saved_tensors - - split_size = full_param_buffer.shape[0] // len(params_split) + params_split = ctx.saved_tensors grads = [] - + slice_begin = 0 for i, _ in enumerate(params_split): - grads.append(grad_output[i * split_size : (i+1) * split_size]) + slice_size = params_split[i].shape[0] + slice_end = slice_begin + slice_size + grads.append(grad_output[slice_begin:slice_end]) + slice_begin = slice_end return None, *grads @@ -753,7 +755,11 @@ def grad_output_preprocess( return grad_output_mat, grad_output_c, grad_output_t, grad_bias - def 
noop_cat(self, buffer_name: str, pnames: List[str]) -> torch.Tensor: + def noop_cat(self, + buffer_name: str, + pnames: List[str], + parameters_split: Dict[str, int] + ) -> torch.Tensor: """No-op replacement of `torch.cat`. The buffer and split parameters must occupy the same memory region. If this is not the case, then the split parameters are concatenated and the buffer is overwritten. The parameters' memory is then @@ -762,17 +768,24 @@ def noop_cat(self, buffer_name: str, pnames: List[str]) -> torch.Tensor: assert hasattr(self, buffer_name), f"No buffer named {buffer_name}" full_param_buffer = getattr(self, buffer_name) - split_size = full_param_buffer.shape[0] // len(pnames) params = [getattr(self, name) for name in pnames] + slice_begin = 0 for i, p in enumerate(params): - if p.data.data_ptr() != full_param_buffer[i*split_size : (i+1)*split_size].data_ptr(): + slice_size = parameters_split[pnames[i].split('_')[0]+'_'] + slice_end = slice_begin + slice_size + if p.data.data_ptr() != full_param_buffer[slice_begin:slice_end].data_ptr(): with torch.no_grad(): setattr(self, buffer_name, torch.cat(params)) - for j, pname in enumerate(pnames): + slice_begin_j = 0 + for pname in pnames: + slice_size_j = parameters_split[pname.split('_')[0]+'_'] + slice_end_j = slice_begin_j + slice_size_j full_param_buffer = getattr(self, buffer_name) setattr(self, pname, - Parameter(full_param_buffer[j*split_size : (j+1)*split_size])) + Parameter(full_param_buffer[slice_begin_j:slice_end_j])) + slice_begin_j = slice_end_j break + slice_begin = slice_end return _NoopCat.apply(getattr(self, buffer_name), *[getattr(self, name) for name in pnames]) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 9115971524..761b0abf6b 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -536,11 +536,14 @@ class 
LayerNormLinear(TransformerEngineBaseModule): together with the output of the linear transformation. Example use case: residual connection for transformer module is taken post layernorm. - parameters_split : Tuple[str, ...], default = None - if a tuple of strings is provided, the weight and bias parameters of the - module are exposed as `N` separate `torch.nn.parameter.Parameter`s each, - split along the first dimension, where `N` is the length of the argument - and the strings contained are the names of the split parameters. + parameters_split : Optional[Union[Tuple[str, ...], Dict[str, int]]], default = None + if a tuple of strings or a dict of strings to integers is provided, + the weight and bias parameters of the module are exposed as `N` separate + `torch.nn.parameter.Parameter`s each, split along the first dimension, + where `N` is the length of the argument and the strings contained are the + names of the split parameters. In the case of a tuple, each parameter + has the same shape. In the case of a dict, the values give the + `out_features` for each projection. 
zero_centered_gamma : bool, default = 'False' if set to 'True', gamma parameter in LayerNorm is initialized to 0 and the LayerNorm formula changes to @@ -607,7 +610,7 @@ def __init__( parallel_mode: Optional[str] = None, return_layernorm_output: bool = False, skip_weight_param_allocation: bool = False, - parameters_split: Optional[Tuple[str, ...]] = None, + parameters_split: Optional[Union[Tuple[str, ...], Dict[str, int]]] = None, zero_centered_gamma: bool = False, ub_bulk_wgrad: bool = False, ub_bulk_dgrad: bool = False, @@ -707,23 +710,35 @@ def __init__( self.bias_tensor.zero_() if parameters_split is None: - parameters_split = ("",) - - assert ( - self.out_features % len(parameters_split) == 0 - ), f"Weight and bias params cannot be split into {len(parameters_split)} parts" - - split_size = self.out_features // len(parameters_split) + parameters_split = {"": self.out_features} + elif isinstance(parameters_split, tuple): + assert ( + self.out_features % len(parameters_split) == 0 + ), f"Weight and bias params cannot be split into {len(parameters_split)} parts" + split_size = self.out_features // len(parameters_split) + parameters_split = {key: split_size for key in parameters_split} + elif isinstance(parameters_split, dict): + overall_split_size = sum(parameters_split.values()) + assert( + self.out_features == overall_split_size + ), f"Overall sum of parameters_split (={overall_split_size}) does not match "\ + f"to out features (={self.out_features})" + else: + assert False, "Type of 'parameters_split' is not None, tuple or dict" + self.updated_parameters_split = parameters_split self.weight_names = [] self.bias_names = [] - for i, pname in enumerate(parameters_split): + slice_begin = 0 + for pname, slice_size in parameters_split.items(): wname = pname + "weight" bname = pname + "bias" + slice_end = slice_begin + slice_size + self.register_parameter( - wname, Parameter(self.weight_tensor[i * split_size : (i+1) * split_size]) + wname, 
Parameter(self.weight_tensor[slice_begin:slice_end]) ) set_tensor_model_parallel_attributes( @@ -735,7 +750,7 @@ def __init__( if self.use_bias: self.register_parameter( - bname, Parameter(self.bias_tensor[i * split_size : (i+1) * split_size]) + bname, Parameter(self.bias_tensor[slice_begin:slice_end]) ) else: setattr(self, bname, torch.Tensor().to(dtype=params_dtype, device=device)) @@ -746,6 +761,8 @@ def __init__( self.weight_names.append(wname) self.bias_names.append(bname) + slice_begin = slice_end + self.fp8_weight_shapes.append(torch.Size((self.out_features, self.in_features))) @@ -841,12 +858,14 @@ def forward( bias_tensor = ( self.bias if self.parameters_split is None else self.bias_tensor if not torch.is_grad_enabled() - else self.noop_cat("bias_tensor", self.bias_names) + else self.noop_cat("bias_tensor", self.bias_names, + self.updated_parameters_split) ) weight_tensor = ( self.weight if self.parameters_split is None else self.weight_tensor if not torch.is_grad_enabled() - else self.noop_cat("weight_tensor", self.weight_names) + else self.noop_cat("weight_tensor", self.weight_names, + self.updated_parameters_split) ) # Fetch the fp8 weights placeholders (for linear/gemm) diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index c54a7aed73..45a163966b 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -461,11 +461,14 @@ class Linear(TransformerEngineBaseModule): init_method : Callable, default = `None` used for initializing weights in the following way: `init_method(weight)`. When set to `None`, defaults to `torch.nn.init.normal_(mean=0.0, std=0.023)`. 
- parameters_split : Tuple[str, ...], default = None - if a tuple of strings is provided, the weight and bias parameters of the - module are exposed as `N` separate `torch.nn.parameter.Parameter`s each, - split along the first dimension, where `N` is the length of the argument - and the strings contained are the names of the split parameters. + parameters_split : Optional[Union[Tuple[str, ...], Dict[str, int]]], default = None + if a tuple of strings or a dict of strings to integers is provided, + the weight and bias parameters of the module are exposed as `N` separate + `torch.nn.parameter.Parameter`s each, split along the first dimension, + where `N` is the length of the argument and the strings contained are the + names of the split parameters. In the case of a tuple, each parameter + has the same shape. In the case of a dict, the values give the + `out_features` for each projection. device : Union[torch.device, str], default = "cuda" The device on which the parameters of the model will allocated. 
It is the user's responsibility to ensure all parameters are moved to the GPU before running the @@ -522,7 +525,7 @@ def __init__( params_dtype: Optional[torch.dtype] = None, parallel_mode: Optional[str] = None, skip_weight_param_allocation: bool = False, - parameters_split: Optional[Tuple[str, ...]] = None, + parameters_split: Optional[Union[Tuple[str, ...], Dict[str, int]]] = None, ub_split_rs: bool = False, ub_split_ag: bool = False, device: Union[torch.device, str] = "cuda", @@ -598,23 +601,35 @@ def __init__( self.bias_tensor.zero_() if parameters_split is None: - parameters_split = ("",) - - assert ( - self.out_features % len(parameters_split) == 0 - ), f"Weight and bias params cannot be split into {len(parameters_split)} parts" - - split_size = self.out_features // len(parameters_split) + parameters_split = {"": self.out_features} + elif isinstance(parameters_split, tuple): + assert ( + self.out_features % len(parameters_split) == 0 + ), f"Weight and bias params cannot be split into {len(parameters_split)} parts" + split_size = self.out_features // len(parameters_split) + parameters_split = {key: split_size for key in parameters_split} + elif isinstance(parameters_split, dict): + overall_split_size = sum(parameters_split.values()) + assert( + self.out_features == overall_split_size + ), f"Overall sum of parameters_split (={overall_split_size}) does not match "\ + f"to out features (={self.out_features})" + else: + assert False, "Type of 'parameters_split' is not None, tuple or dict" + self.updated_parameters_split = parameters_split self.weight_names = [] self.bias_names = [] - for i, pname in enumerate(parameters_split): + slice_begin = 0 + for pname, slice_size in parameters_split.items(): wname = pname + "weight" bname = pname + "bias" + slice_end = slice_begin + slice_size + self.register_parameter( - wname, Parameter(self.weight_tensor[i * split_size : (i+1) * split_size]) + wname, Parameter(self.weight_tensor[slice_begin:slice_end]) ) 
set_tensor_model_parallel_attributes( @@ -626,7 +641,7 @@ def __init__( if self.use_bias: self.register_parameter( - bname, Parameter(self.bias_tensor[i * split_size : (i+1) * split_size]) + bname, Parameter(self.bias_tensor[slice_begin:slice_end]) ) else: setattr(self, bname, torch.Tensor().to(dtype=params_dtype, device=device)) @@ -637,6 +652,8 @@ def __init__( self.weight_names.append(wname) self.bias_names.append(bname) + slice_begin = slice_end + self.fp8_weight_shapes.append(torch.Size((self.out_features, self.in_features))) # For RPL, bias has to be added after TP collectives @@ -715,12 +732,14 @@ def forward( bias_tensor = ( self.bias if self.parameters_split is None else self.bias_tensor if not torch.is_grad_enabled() - else self.noop_cat("bias_tensor", self.bias_names) + else self.noop_cat("bias_tensor", self.bias_names, + self.updated_parameters_split) ) weight_tensor = ( self.weight if self.parameters_split is None else self.weight_tensor if not torch.is_grad_enabled() - else self.noop_cat("weight_tensor", self.weight_names) + else self.noop_cat("weight_tensor", self.weight_names, + self.updated_parameters_split) ) # Fetch the fp8 weights placeholders (for linear/gemm) From 2f57bffa6321b385a6e4a679b8973c3c7676183e Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Sun, 24 Sep 2023 23:00:37 -0700 Subject: [PATCH 057/427] [C/Pytorch] Expand layout support for fused attention (#403) * add flexible layout support Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add support for flexible qkv layout Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add more changes Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fixes for compiling Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove redudant file Signed-off-by: Charlene Yang 
<8636796+cyanguwa@users.noreply.github.com> * fix options device error Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix typos Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * more changes; WIP Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * more changes; WIP Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fixes and tests Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fixes and wrong results Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * sb3hd/bs3hd working on top of 3xsbhd/bshd/thd Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix dQ, dK, dV Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add nvtx Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove qkvso_strides on torch side; cover it in generateQKVStrides Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * all 15 layouts pass Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add workspace optimization Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * minor fixes and test Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * removed most debug info/clean up Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add note to deprecate some qkv layouts Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix code for unit tests in test_fused_attn.py Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * further remove debug info Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove a couple more comments Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix numerics tests Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fixes for lint 
Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix fp8 tests Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix onnx for core attn; not fixed Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove nvtx and add env var for workspace opt Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove testing for env var Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * replace zeros/zeros_like with empty/empty_like Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix nvtx marker name for _q_k_v API Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove sm80 when compiling for h100 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add mapping from qkv layout to layout group and qkv format Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * clean up enums mapping and remove trailing spaces Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * simplify workspace opt control logic; only need env var Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix fp8 test, and minor modifications for other tests Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * avoid overwriting model configs in unit test Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * random fixes/improvements: get_qkv_format/etc, default values, docstrings, comments Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix minor issues: invalid syntax Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * change workspace opt logic back to FORCE_WORKSPACE_OPT Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix FP8 tests and generateStrides function Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix 
get_backend logic for max512/arbitrary Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix unit tests; need cleanup Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * clean up unit tests for layouts, and fix minor lint issue Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * minor tweaks for CI testing: onnx string issue and test fused attn first Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove one unsupported layout from max512 and add a check to qkvpacked API Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix te layer test; reduce test time Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * revert compiler option changes; add back sm80 for even h100 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove some unit tests or make them optional to reduce CI time Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove more unit tests temporarily Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove _q_k_v in naming and add NVTE_ERROR for FP8 Aux_CTX_Tensors size checks Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add more deprecation notes Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove temp tests from last commit Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * replace with te::getenv Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove prints from last commit Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove redundant contiguous() Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove thd->bs3hd user warning to avoid GPU sync Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * adjust fused attn bs in tests Signed-off-by: Charlene 
Yang <8636796+cyanguwa@users.noreply.github.com> * temporary fix for onnx issue; more fixes in PR 437 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove unused variables Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: Charlene Yang Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- qa/L0_unittest/test.sh | 2 +- tests/pytorch/test_fused_attn.py | 202 ++++- tests/pytorch/test_numerics.py | 2 +- tests/pytorch/test_onnx_export.py | 2 + .../common/fused_attn/fused_attn.cpp | 273 ++++++- .../fused_attn_f16_arbitrary_seqlen.cu | 178 ++++- .../fused_attn_f16_arbitrary_seqlen.h | 24 + .../fused_attn_f16_max512_seqlen.cu | 139 +++- .../fused_attn/fused_attn_f16_max512_seqlen.h | 23 + .../common/fused_attn/fused_attn_fp8.cu | 220 ++++- .../common/fused_attn/fused_attn_fp8.h | 39 + transformer_engine/common/fused_attn/utils.cu | 262 +++++- .../include/transformer_engine/fused_attn.h | 215 ++++- transformer_engine/pytorch/attention.py | 753 +++++++++++------- transformer_engine/pytorch/constants.py | 5 + .../pytorch/cpp_extensions/fused_attn.py | 419 +++++++++- transformer_engine/pytorch/csrc/extensions.h | 46 ++ .../pytorch/csrc/extensions/attention.cu | 438 ++++++++++ .../pytorch/csrc/extensions/pybind.cpp | 21 +- transformer_engine/pytorch/transformer.py | 3 +- 20 files changed, 2832 insertions(+), 434 deletions(-) diff --git a/qa/L0_unittest/test.sh b/qa/L0_unittest/test.sh index f02ea1c6e8..268a534a82 100644 --- a/qa/L0_unittest/test.sh +++ b/qa/L0_unittest/test.sh @@ -9,6 +9,6 @@ set -e pip install pytest==6.2.5 onnxruntime==1.13.1 pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py -NVTE_TORCH_COMPILE=0 pytest -v -s 
$TE_PATH/tests/pytorch/test_onnx_export.py pytest -v -s $TE_PATH/tests/pytorch/test_jit.py pytest -v -s $TE_PATH/tests/pytorch/test_fused_attn.py +NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py diff --git a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index 1a1515d843..1b43fa36eb 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -39,20 +39,23 @@ def __init__( model_configs = { "test1": ModelConfig(1, 1024, 16, 64, 128, 0.0, "causal"), - "test2": ModelConfig(1, 1024, 16, 64, 512, 0.0, "causal"), - "test3": ModelConfig(1, 1024, 16, 64, 2048, 0.0, "causal"), - "test4": ModelConfig(1, 2048, 16, 128, 128, 0.0, "causal"), - "test5": ModelConfig(1, 2048, 16, 128, 512, 0.0, "causal"), - "test6": ModelConfig(1, 2048, 16, 128, 2048, 0.0, "causal"), - "test7": ModelConfig(1, 1024, 16, 64, 128, 0.0, "no_mask"), - "test8": ModelConfig(1, 1024, 16, 64, 512, 0.0, "no_mask"), + "test2": ModelConfig(1, 1024, 16, 64, 2048, 0.0, "causal"), + "test3": ModelConfig(1, 2048, 16, 128, 128, 0.0, "causal"), + "test4": ModelConfig(1, 3072, 24, 128, 2048, 0.0, "causal"), + "test5": ModelConfig(1, 1024, 16, 64, 128, 0.0, "no_mask"), } +if os.getenv('NVTE_ADDITIONAL_TESTS', '0') == '1': + model_configs["test6"] = ModelConfig(1, 1024, 16, 64, 512, 0.0, "causal") + model_configs["test7"] = ModelConfig(1, 2048, 16, 128, 512, 0.0, "causal") + model_configs["test8"] = ModelConfig(1, 2048, 16, 128, 2048, 0.0, "causal") + model_configs["test9"] = ModelConfig(1, 1024, 16, 64, 512, 0.0, "no_mask") + param_types = [torch.float16] if torch.cuda.is_bf16_supported(): param_types.append(torch.bfloat16) -batch_sizes = [1, 2, 32] +batch_sizes = [1, 2] # add more if needed, e.g. 
32 @pytest.mark.skipif( get_device_compute_capability() < 8.0, reason="Compute capability 8.0+ is required.") @@ -77,10 +80,10 @@ def test_dot_product_attention(dtype, bs, model, ckpt_attn, bias_type): atol, rtol = (2.5e-2, 2.5e-2) if dtype == torch.bfloat16 else (5e-3, 5e-3) if bias_type == "no_bias": - assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_bwd, flash_attn_bwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type): @@ -126,7 +129,11 @@ def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type) q = inp[:, :,0,:,:] k = inp[:, :,1,:,:] v = inp[:, :,2,:,:] - op = block(q, k, v, attn_mask_type=config.attn_mask_type, + op = block(q, k, v, + qkv_format='sbhd', + cu_seqlens_q = cu_seqlens, + cu_seqlens_kv = cu_seqlens, + attn_mask_type=config.attn_mask_type, checkpoint_core_attention=ckpt_attn, core_attention_bias_type=bias_type, core_attention_bias=bias) @@ -134,6 +141,130 @@ def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type) return op, inp.grad +qkv_layouts = [ + 'sb3hd', 'sbh3d', 'sbhd_sb2hd', 'sbhd_sbh2d', 'sbhd_sbhd_sbhd', + 'bs3hd', 'bsh3d', 'bshd_bs2hd', 'bshd_bsh2d', 'bshd_bshd_bshd', + # will add tests for thd layouts later when the support is available in fused attention + #'t3hd', 'th3d', 'thd_t2hd', 'thd_th2d', 'thd_thd_thd', + ] + +@pytest.mark.skipif( + get_device_compute_capability() < 8.0, 
reason="Compute capability 8.0+ is required.") +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +@pytest.mark.parametrize("workspace_opt", [True, False]) +@pytest.mark.parametrize("qkv_layout", qkv_layouts) +def test_dpa_qkv_layout(dtype, bs, model, workspace_opt, qkv_layout): + """Test DotProductAttention module with different QKV layouts""" + + config = model_configs[model] + + flash_attn_fwd, flash_attn_bwd = _run_dpa_qkv_layout( + dtype, bs, config, "FlashAttention", qkv_layout, workspace_opt) + fused_attn_fwd, fused_attn_bwd = _run_dpa_qkv_layout( + dtype, bs, config, "FusedAttention", qkv_layout, workspace_opt) + unfused_attn_fwd, unfused_attn_bwd = _run_dpa_qkv_layout( + dtype, bs, config, "UnfusedDotProductAttention", qkv_layout, workspace_opt) + + atol, rtol = (5e-2, 5e-2) if dtype == torch.bfloat16 else (2.5e-3, 2.5e-3) + torch.testing.assert_close(flash_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) + torch.testing.assert_close(fused_attn_fwd, unfused_attn_fwd, atol = atol, rtol = rtol) + torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, atol = atol, rtol = rtol) + for i in range(len(flash_attn_bwd)): + torch.testing.assert_close(flash_attn_bwd[i], unfused_attn_bwd[i], atol = atol, rtol = rtol) + torch.testing.assert_close(fused_attn_bwd[i], flash_attn_bwd[i], atol = atol, rtol = rtol) + torch.testing.assert_close(fused_attn_bwd[i], unfused_attn_bwd[i], atol = atol, rtol = rtol) + +def _run_dpa_qkv_layout(dtype, bs, config, backend, qkv_layout, workspace_opt): + + torch.manual_seed(1234) + torch.cuda.manual_seed(1234) + os.environ["NVTE_FLASH_ATTN"] = "0" + os.environ["NVTE_FUSED_ATTN"] = "0" + if backend == "FlashAttention": + os.environ["NVTE_FLASH_ATTN"] = "1" + if backend == "FusedAttention": + os.environ["NVTE_FUSED_ATTN"] = "1" + os.environ["NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT"] = "1" if workspace_opt else "0" + + + dim_to_num = 
{'b': bs, + 's': config.seq_len, + 'h': config.num_attention_heads, + 'd': config.head_dim, + 't': bs * config.seq_len, + '3': 3, + '2': 2} + + inp = [] + for i,layout in enumerate(qkv_layout.split('_')): + tensor_shape = [dim_to_num[j] for j in layout] + tensor = 0.1 * torch.randn(tensor_shape, dtype = dtype).cuda() + tensor_count = 1 + split_dim = 0 + for dim,l in enumerate(layout): + if l.isdigit(): + tensor_count = int(l) + split_dim = dim + break + tensors = torch.split(tensor, 1, dim = split_dim) if split_dim != 0 else [tensor] + for j in range(tensor_count): + if split_dim != 0: + inp.append(tensors[j].squeeze(split_dim)) + else: + inp.append(tensors[j]) + for i in range(3): + inp[i].requires_grad=True + + seqlens = torch.empty(bs, dtype = torch.int32).cuda() + seqlens.fill_(config.seq_len) + cu_seqlens = torch.zeros(bs + 1, device = inp[0].device, dtype = torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlens, dim = 0) + qkv_format = ''.join([i for i in qkv_layout.split('_')[0] if i.isalpha()]) + qkv_format_no_thd = qkv_format if qkv_format != 'thd' else 'bshd' + op_grad_shape = [dim_to_num[i] for i in qkv_format_no_thd] + op_grad_shape_new = [*op_grad_shape[:-2], op_grad_shape[-2] * op_grad_shape[-1]] + op_grad = 0.001 * torch.randint(0, 200, op_grad_shape_new, dtype = dtype).cuda() + + block = ( + DotProductAttention( + config.num_attention_heads, + config.head_dim, + attention_dropout = config.dropout_p, + attn_mask_type = config.attn_mask_type, + sequence_parallel = False, + tp_size = 1, + get_rng_state_tracker = None, + tp_group = None, + layer_number = 1, + attention_type = "self" + ).to(dtype = dtype).cuda() + ) + + if qkv_format != 'thd': + op = block(inp[0], inp[1], inp[2], qkv_format=qkv_format) + else: + cu_seqlens_q = torch.arange( + 0, + (bs + 1) * config.seq_len, + step=config.seq_len, + dtype=torch.int32, + device=inp[0].device) + cu_seqlens_kv = torch.arange( + 0, + (bs + 1) * config.seq_len, + step=config.seq_len, + dtype=torch.int32, + 
device=inp[1].device) + op = block(inp[0], inp[1], inp[2], + qkv_format=qkv_format, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv) + op.backward(op_grad) + + return op, (inp[0].grad, inp[1].grad, inp[2].grad) + @pytest.mark.skipif( get_device_compute_capability() < 8.0, reason="Compute capability 8.0+ is required.") @pytest.mark.parametrize("dtype", param_types) @@ -158,10 +289,10 @@ def test_transformer_layer(dtype, bs, model, ckpt_attn, bias_type, fused_qkv_par atol, rtol = (5e-1, 5e-2) if bias_type == "no_bias": - assert torch.allclose(fused_attn_fwd, flash_attn_fwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_bwd, flash_attn_bwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_bwd, flash_attn_bwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type, fused_qkv_params): @@ -231,7 +362,7 @@ def _run_transformer_layer(dtype, bs, config, backend, ckpt_attn, bias_type, fus .cuda() ) - num_iters = 10 + num_iters = 5 for i in range(num_iters): op = block(inp, self_attn_mask_type=config.attn_mask_type, checkpoint_core_attention=ckpt_attn, @@ -269,8 +400,8 @@ def find_factors(x): dtype, bs, config, "UnfusedDotProductAttention", num_q_per_gqa_group) atol, rtol = 5e-1, 5e-2 - assert torch.allclose(flash_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) - assert torch.allclose(flash_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) + torch.testing.assert_close(flash_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + torch.testing.assert_close(flash_attn_bwd, unfused_attn_bwd, 
atol=atol, rtol=rtol) def _run_transformer_layer_gqa(dtype, bs, config, backend, num_querys_per_gqa_group): @@ -363,8 +494,8 @@ def test_dpa_fp8(dtype, bs, model): dtype, bs, config, "UnfusedDotProductAttention") atol, rtol = (2.5e-2, 2.5e-2) - assert torch.allclose(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) - assert torch.allclose(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_fwd, unfused_attn_fwd, atol=atol, rtol=rtol) + torch.testing.assert_close(fused_attn_bwd, unfused_attn_bwd, atol=atol, rtol=rtol) def _run_dpa_fp8(dtype, bs, config, backend): @@ -427,7 +558,7 @@ def _run_dpa_fp8_ref(dtype, bs, config, backend): attention_dropout=config.dropout_p, sequence_parallel=False, tp_size=1, - get_rng_state_tracker=None, + get_rng_state_tracker=get_dummy_cuda_rng_tracker, tp_group=None, layer_number=1, attention_type="self" @@ -439,8 +570,6 @@ def _run_dpa_fp8_ref(dtype, bs, config, backend): v = inp[:, :,2,:,:] op = block(q, k, v, attn_mask_type=config.attn_mask_type) op.backward(op_grad) - torch.save(op,'ctx_ref.pt') - torch.save(inp.grad,'dqkv_ref.pt') return op, inp.grad @@ -455,6 +584,8 @@ def _run_dpa_fp8_ref(dtype, bs, config, backend): from transformer_engine.pytorch.cpp_extensions.fused_attn import ( fused_attn_fwd_qkvpacked, fused_attn_bwd_qkvpacked, + fused_attn_fwd, + fused_attn_bwd, FusedAttnBackend) _CUBLASLT_WORKSPACE_SIZE_BYTES = 33_554_432 # 32MiB @@ -542,11 +673,15 @@ def forward( torch.save(qkv_out_fp16, 'qkv.pt') # FMHA - context_, aux_ctx_tensors, *rest = fused_attn_fwd_qkvpacked( + context_, aux_ctx_tensors, *rest = fused_attn_fwd( is_training, max_s, + max_s, cu_seqlens, - qkv_out, + cu_seqlens, + qkv_out[:,0,:,:], + qkv_out[:,1,:,:], + qkv_out[:,2,:,:], fp8_dtype_forward, FusedAttnBackend["FP8"], None, @@ -558,7 +693,7 @@ def forward( attn_scale=None, dropout=p_dropout, fast_zero_fill=fast_zero_fill, - qkv_layout="qkv_interleaved", + qkv_layout="t3hd", attn_bias_type="no_bias", 
attn_mask_type="padding", rng_gen=None, @@ -617,10 +752,14 @@ def backward( grad_output, ctx.fp8_meta["scaling_bwd"], META_DO, fp8_dtype_backward ) - dqkv, *rest = fused_attn_bwd_qkvpacked( + dq, dk, dv, *rest = fused_attn_bwd( ctx.max_s, + ctx.max_s, + ctx.cu_seqlens, ctx.cu_seqlens, - qkv_out, + qkv_out[:,0,:,:], + qkv_out[:,1,:,:], + qkv_out[:,2,:,:], context, proj_dgrad.view_as(context), fp8_dtype_forward, @@ -638,10 +777,11 @@ def backward( None, ctx.p_dropout, ctx.fast_zero_fill, - "qkv_interleaved", + "t3hd", "no_bias", "padding", ) + dqkv = torch.cat([dq.unsqueeze(1), dk.unsqueeze(1), dv.unsqueeze(1)], dim=1) dqkv_grad_output_c = dqkv.view(-1, 3*ctx.hidden_size) dqkv_grad_output_c_fp16 = ext.cast_from_fp8(dqkv_grad_output_c, diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index bf9f7502fd..eeb14ba444 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -871,7 +871,7 @@ def _test_dpa_accuracy(block, bs, dtype, config): key.retain_grad() value.retain_grad() - out = block(query, key, value, mask) + out = block(query, key, value, attention_mask=mask) loss = out.sum() loss.backward() diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 533e0cff6a..727ccce3dd 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -1005,6 +1005,7 @@ def test_export_core_attention( # Set dimensions (these are arbitrary). 
seq_len, batch_size, num_attention_heads, kv_channels = (64, 4, 1, 64) qkv_size = (seq_len, batch_size, num_attention_heads, kv_channels) + qkv_format = "sbhd" query_layer = torch.randn(qkv_size, dtype=precision, device="cuda") key_layer = torch.randn(qkv_size, dtype=precision, device="cuda") @@ -1025,6 +1026,7 @@ def test_export_core_attention( num_attention_heads=num_attention_heads, kv_channels=kv_channels, attention_dropout=0.5, + qkv_format=qkv_format, attn_mask_type=attn_mask_type, ).to(device='cuda') do_export(model, diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index a651ea005f..f724d1d051 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -12,6 +12,66 @@ #include "fused_attn_fp8.h" #include "../util/cuda_runtime.h" +// map NVTE_QKV_Layout to NVTE_QKV_Layout_Group +NVTE_QKV_Layout_Group nvte_get_qkv_layout_group(NVTE_QKV_Layout qkv_layout) { + switch (qkv_layout) { + case NVTE_QKV_Layout::NVTE_SB3HD: + case NVTE_QKV_Layout::NVTE_BS3HD: + case NVTE_QKV_Layout::NVTE_T3HD: + case NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED: + return NVTE_QKV_Layout_Group::NVTE_3HD; + case NVTE_QKV_Layout::NVTE_SBH3D: + case NVTE_QKV_Layout::NVTE_BSH3D: + case NVTE_QKV_Layout::NVTE_TH3D: + return NVTE_QKV_Layout_Group::NVTE_H3D; + case NVTE_QKV_Layout::NVTE_SBHD_SB2HD: + case NVTE_QKV_Layout::NVTE_BSHD_BS2HD: + case NVTE_QKV_Layout::NVTE_THD_T2HD: + case NVTE_QKV_Layout::NVTE_KV_INTERLEAVED: + return NVTE_QKV_Layout_Group::NVTE_HD_2HD; + case NVTE_QKV_Layout::NVTE_SBHD_SBH2D: + case NVTE_QKV_Layout::NVTE_BSHD_BSH2D: + case NVTE_QKV_Layout::NVTE_THD_TH2D: + return NVTE_QKV_Layout_Group::NVTE_HD_H2D; + case NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD: + case NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD: + case NVTE_QKV_Layout::NVTE_THD_THD_THD: + case NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED: + return NVTE_QKV_Layout_Group::NVTE_HD_HD_HD; + default: + 
NVTE_ERROR("qkv_layout not supported!"); + } +} + +// map NVTE_QKV_Layout to NVTE_QKV_Format +NVTE_QKV_Format nvte_get_qkv_format(NVTE_QKV_Layout qkv_layout) { + switch (qkv_layout) { + case NVTE_QKV_Layout::NVTE_SB3HD: + case NVTE_QKV_Layout::NVTE_SBH3D: + case NVTE_QKV_Layout::NVTE_SBHD_SB2HD: + case NVTE_QKV_Layout::NVTE_SBHD_SBH2D: + case NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD: + return NVTE_QKV_Format::NVTE_SBHD; + case NVTE_QKV_Layout::NVTE_BS3HD: + case NVTE_QKV_Layout::NVTE_BSH3D: + case NVTE_QKV_Layout::NVTE_BSHD_BS2HD: + case NVTE_QKV_Layout::NVTE_BSHD_BSH2D: + case NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD: + return NVTE_QKV_Format::NVTE_BSHD; + case NVTE_QKV_Layout::NVTE_T3HD: + case NVTE_QKV_Layout::NVTE_TH3D: + case NVTE_QKV_Layout::NVTE_THD_T2HD: + case NVTE_QKV_Layout::NVTE_THD_TH2D: + case NVTE_QKV_Layout::NVTE_THD_THD_THD: + case NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED: + case NVTE_QKV_Layout::NVTE_KV_INTERLEAVED: + case NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED: + return NVTE_QKV_Format::NVTE_THD; + default: + NVTE_ERROR("qkv_layout not supported!"); + } +} + // select a backend for fused attention NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( NVTEDType q_dtype, @@ -26,6 +86,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( const int device_id = cuda::current_device(); const int sm_arch_ = cuda::sm_arch(device_id); NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if ((q_dtype == NVTEDType::kNVTEFloat8E4M3) || (q_dtype == NVTEDType::kNVTEFloat8E5M2) && (sm_arch_ >= 90) && (max_seqlen_q == max_seqlen_kv) @@ -33,7 +94,8 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( && (head_dim == 64) && (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) && (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) - && (qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED)) { + && ((qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) + || (qkv_layout == 
NVTE_QKV_Layout::NVTE_T3HD))) { #if (CUDNN_VERSION >= 8900) backend = NVTE_Fused_Attn_Backend::NVTE_FP8; #else @@ -52,7 +114,12 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( || (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) || (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)) && ((qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) - || (qkv_layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED))) { + || (qkv_layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) + || (qkv_layout == NVTE_QKV_Layout::NVTE_SB3HD) + || (qkv_layout == NVTE_QKV_Layout::NVTE_SBHD_SB2HD) + || (qkv_layout == NVTE_QKV_Layout::NVTE_BS3HD) + || (qkv_layout == NVTE_QKV_Layout::NVTE_BSHD_BS2HD) + || (qkv_layout == NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD))) { flag_m512 = true; } if ( @@ -65,7 +132,9 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( && ((head_dim == 64) || (head_dim == 128)) && (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) && (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) - && (qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED)) { + && ((qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD) + || (qkv_format == NVTE_QKV_Format::NVTE_BSHD))) { flag_arb = true; } if (((max_seqlen_q > 512) || (max_seqlen_kv > 512)) @@ -438,3 +507,201 @@ void nvte_fused_attn_bwd_kvpacked( NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. 
\n"); } } +// NVTE fused attention FWD with separate Q, K and V +void nvte_fused_attn_fwd( + const NVTETensor Q, + const NVTETensor K, + const NVTETensor V, + const NVTETensor Bias, + NVTETensor S, + NVTETensor O, + NVTETensorPack* Aux_CTX_Tensors, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + const NVTETensor rng_state, + size_t max_seqlen_q, size_t max_seqlen_kv, + bool is_training, float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream) { + NVTE_API_CALL(nvte_flash_attn_fwd); + using namespace transformer_engine; + const Tensor *input_cu_seqlens_q = reinterpret_cast(cu_seqlens_q); + const Tensor *input_cu_seqlens_kv = reinterpret_cast(cu_seqlens_kv); + const Tensor *input_rng_state = reinterpret_cast(rng_state); + const Tensor *input_Q = reinterpret_cast(Q); + const Tensor *input_K = reinterpret_cast(K); + const Tensor *input_V = reinterpret_cast(V); + const Tensor *input_Bias = reinterpret_cast(Bias); + Tensor *input_output_S = reinterpret_cast(S); + Tensor *output_O = reinterpret_cast(O); + Tensor *wkspace = reinterpret_cast(workspace); + + auto ndim = input_Q->data.shape.size(); + size_t b = input_cu_seqlens_q->data.shape[0] - 1; + size_t h = input_Q->data.shape[ndim - 2]; + size_t d = input_Q->data.shape[ndim - 1]; + + auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); + const NVTEDType Q_type = static_cast(input_Q->data.dtype); + const NVTEDType KV_type = static_cast(input_K->data.dtype); + + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + Q_type, KV_type, + qkv_layout, bias_type, attn_mask_type, + dropout, max_seqlen_q, max_seqlen_kv, d); + + if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { +#if (CUDNN_VERSION >= 8901) + fused_attn_max_512_fwd( + b, max_seqlen_q, max_seqlen_kv, h, d, + is_training, attn_scale, dropout, qkv_layout, 
bias_type, attn_mask_type, + input_Q, input_K, input_V, input_Bias, output_O, + Aux_CTX_Tensors, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { +#if (CUDNN_VERSION >= 8900) + fused_attn_arbitrary_seqlen_fwd( + b, max_seqlen_q, max_seqlen_kv, h, d, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_K, input_V, input_Bias, output_O, + Aux_CTX_Tensors, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR( + "cuDNN 8.9.0 is required for BF16/FP16 fused attention with arbitrary sequence length. \n"); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { +#if (CUDNN_VERSION >= 8900) + fused_attn_fp8_fwd( + b, max_seqlen_q, max_seqlen_kv, h, d, + is_training, attn_scale, dropout, qkv_layout, + input_Q, input_K, input_V, input_output_S, output_O, + Aux_CTX_Tensors, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); +#endif + } else { + NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. 
\n"); + } +} +// NVTE fused attention BWD with separate Q, K and V +void nvte_fused_attn_bwd( + const NVTETensor Q, + const NVTETensor K, + const NVTETensor V, + const NVTETensor O, + const NVTETensor dO, + const NVTETensor S, + NVTETensor dP, + const NVTETensorPack* Aux_CTX_Tensors, + NVTETensor dQ, + NVTETensor dK, + NVTETensor dV, + NVTETensor dBias, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + size_t max_seqlen_q, size_t max_seqlen_kv, + float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream) { + NVTE_API_CALL(nvte_flash_attn_bwd); + using namespace transformer_engine; + const Tensor *input_cu_seqlens_q = reinterpret_cast(cu_seqlens_q); + const Tensor *input_cu_seqlens_kv = reinterpret_cast(cu_seqlens_kv); + const Tensor *input_Q = reinterpret_cast(Q); + const Tensor *input_K = reinterpret_cast(K); + const Tensor *input_V = reinterpret_cast(V); + const Tensor *input_O = reinterpret_cast(O); + const Tensor *input_dO = reinterpret_cast(dO); + const Tensor *input_S = reinterpret_cast(S); + Tensor *input_output_dP = reinterpret_cast(dP); + Tensor *output_dQ = reinterpret_cast(dQ); + Tensor *output_dK = reinterpret_cast(dK); + Tensor *output_dV = reinterpret_cast(dV); + Tensor *output_dBias = reinterpret_cast(dBias); + Tensor *wkspace = reinterpret_cast(workspace); + + auto ndim = input_Q->data.shape.size(); + size_t b = input_cu_seqlens_q->data.shape[0] - 1; + size_t h = input_Q->data.shape[ndim - 2]; + size_t d = input_Q->data.shape[ndim - 1]; + + auto handle = cudnnExecutionPlanManager::Instance().GetCudnnHandle(); + const NVTEDType Q_type = static_cast(input_Q->data.dtype); + const NVTEDType KV_type = static_cast(input_K->data.dtype); + + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend( + Q_type, KV_type, + qkv_layout, bias_type, attn_mask_type, + dropout, max_seqlen_q, max_seqlen_kv, d); + + if 
(fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { +#if (CUDNN_VERSION >= 8901) + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + fused_attn_max_512_bwd( + b, max_seqlen_q, max_seqlen_kv, h, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_K, input_V, input_dO, + output_S, + output_dQ, output_dK, output_dV, output_dBias, + input_cu_seqlens_q, input_cu_seqlens_kv, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { +#if (CUDNN_VERSION >= 8900) + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + fused_attn_arbitrary_seqlen_bwd( + b, max_seqlen_q, max_seqlen_kv, h, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_K, input_V, input_O, input_dO, + output_S, + output_dQ, output_dK, output_dV, output_dBias, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, wkspace, stream, handle); +#else + const char *err_msg = + "cuDNN 8.9.0 is required for BF16/FP16 fused attention " + "with arbitrary sequence length. 
\n"; + NVTE_ERROR(err_msg); +#endif + } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { +#if (CUDNN_VERSION >= 8900) + const Tensor *input_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + const Tensor *input_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + fused_attn_fp8_bwd( + b, max_seqlen_q, max_seqlen_kv, h, d, + attn_scale, dropout, qkv_layout, + input_Q, input_K, input_V, input_O, input_dO, + input_M, input_ZInv, + input_S, input_output_dP, + output_dQ, output_dK, output_dV, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); +#endif + } else { + NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); + } +} diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 8bed01732e..e2da13729b 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -15,6 +15,7 @@ #include "../common.h" #include "utils.h" #include "../util/cuda_runtime.h" +#include "../util/system.h" #if (CUDNN_VERSION >= 8900) #define Q_ID 1 @@ -1059,6 +1060,7 @@ void fused_attn_arbitrary_seqlen_bwd_impl( auto matmul_3_Desc = cudnn_frontend::MatMulDescBuilder() .setComputeType(CUDNN_DATA_FLOAT) .build(); + if (!use_workspace_opt) { auto matmul_op3 = cudnn_frontend::OperationBuilder( CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) @@ -1221,9 +1223,6 @@ void fused_attn_arbitrary_seqlen_fwd_qkvpacked( Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, - "qkv_layout must be NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED."); - // 
QKV shape is [b, s, 3, h, d] void *devPtrQKV = input_QKV->data.dptr; const auto stride = 2 * num_head * head_dim; @@ -1295,9 +1294,6 @@ void fused_attn_arbitrary_seqlen_bwd_qkvpacked(size_t batch, size_t max_seqlen, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, - "qkv_layout must be NVTE_QKV_INTERLEAVED."); - // QKV shape is [b, s, 3, h, d] void *devPtrQKV = input_QKV->data.dptr; @@ -1337,21 +1333,16 @@ void fused_attn_arbitrary_seqlen_bwd_qkvpacked(size_t batch, size_t max_seqlen, (batch * num_head * max_seqlen_div_up_q * max_seqlen_div_up_kv * 2 + 1048576 - 1) / 1048576; // default upper limit for dp workspace 256MB size_t max_allowed_dp_workspace = 256; - const char* env_workspace_limit_char = std::getenv("NVTE_FUSED_ATTN_DP_WORKSPACE_LIMIT"); - if (env_workspace_limit_char != nullptr) { - try { - std::string env_dp_workspace_limit(env_workspace_limit_char); - int dp_workspace_limit = std::stoi(env_dp_workspace_limit); - if (dp_workspace_limit > max_allowed_dp_workspace) { - max_allowed_dp_workspace = dp_workspace_limit; - } - } catch (...) { - NVTE_ERROR( - "Invalid argument for NVTE_FUSED_ATTN_DP_WORKSPACE_LIMIT (integer; in MBytes)! 
\n"); - } - } if (required_dp_workspace <= max_allowed_dp_workspace) { - use_workspace_opt = true; + use_workspace_opt = true; + } + use_workspace_opt = transformer_engine::getenv( + "NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT", use_workspace_opt); + // will not be needed in cuDNN 8.9.6 + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + if ((layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) + || (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D)) { + use_workspace_opt = false; } } #endif @@ -1378,5 +1369,152 @@ void fused_attn_arbitrary_seqlen_bwd_qkvpacked(size_t batch, size_t max_seqlen, NVTE_ERROR("Unexpected workspace_size."); } } + +void fused_attn_arbitrary_seqlen_fwd( + size_t batch, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t num_head, size_t head_dim, bool is_training, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, const Tensor *input_Bias, Tensor *output_O, + NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { + using namespace transformer_engine; + + const DType QKV_type = input_Q->data.dtype; + void *devPtrQ = input_Q->data.dptr; + void *devPtrK = input_K->data.dptr; + void *devPtrV = input_V->data.dptr; + void *devPtrO = output_O->data.dptr; + void *devPtrS = nullptr; + + if (Aux_CTX_Tensors->size == 0) { + Aux_CTX_Tensors->size = 2; + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + output_S->data.dptr = nullptr; + output_S->data.shape = {batch, num_head, max_seqlen_q, 1}; + output_S->data.dtype = DType::kFloat32; + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + output_rng_state->data.dptr = nullptr; + output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; + } else if 
(Aux_CTX_Tensors->size == 2) { + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + devPtrS = output_S->data.dptr; + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + output_rng_state->data.dptr = rng_state->data.dptr; + } else { + NVTE_ERROR("Unexpected Aux_CTX_Tensors->size."); + } + + void* devPtrDropoutSeed = rng_state->data.dptr; + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + size_t workspace_size = 0; + + fused_attn_arbitrary_seqlen_fwd_impl(batch, num_head, max_seqlen_q, max_seqlen_kv, head_dim, + is_training, attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, devPtrS, devPtrO, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = {workspace_size}; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = {1}; + workspace->data.dtype = DType::kByte; + return; + } else { + NVTE_ERROR("Unexpected workspace_size."); + } +} + +void fused_attn_arbitrary_seqlen_bwd(size_t batch, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t num_head, size_t head_dim, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, const Tensor *input_O, + const Tensor *input_dO, Tensor *output_S, + Tensor *output_dQ, Tensor *output_dK, Tensor *output_dV, + Tensor *output_dBias, const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, Tensor *workspace, + cudaStream_t stream, cudnnHandle_t handle) { + using namespace transformer_engine; + + const auto QKV_type = input_Q->data.dtype; + void *devPtrQ = input_Q->data.dptr; + void *devPtrK = input_K->data.dptr; + void *devPtrV = 
input_V->data.dptr; + void* devPtrO = input_O->data.dptr; + void *devPtrdO = input_dO->data.dptr; + + void *devPtrdQ = output_dQ->data.dptr; + void *devPtrdK = output_dK->data.dptr; + void *devPtrdV = output_dV->data.dptr; + void *devPtrSoftmaxStats = nullptr; + devPtrSoftmaxStats = output_S->data.dptr; + + void* devPtrDropoutSeed = rng_state->data.dptr; + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + size_t workspace_size = 0; + + bool use_workspace_opt = false; +#if (CUDNN_VERSION >= 8905) + const int device_id = cuda::current_device(); + const int sm_arch_ = cuda::sm_arch(device_id); + if (sm_arch_ >= 90) { + // quick estimate of dp workspace size + size_t max_seqlen_div_up_q = ((max_seqlen_q + 64 - 1) / 64) * 64; + size_t max_seqlen_div_up_kv = ((max_seqlen_kv + 64 - 1) / 64) * 64; + size_t required_dp_workspace = + (batch * num_head * max_seqlen_div_up_q * max_seqlen_div_up_kv * 2 + 1048576 - 1) / 1048576; + // default upper limit for dp workspace 256MB + size_t max_allowed_dp_workspace = 256; + if (required_dp_workspace <= max_allowed_dp_workspace) { + use_workspace_opt = true; + } + use_workspace_opt = transformer_engine::getenv( + "NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT", use_workspace_opt); + // will not be needed in cuDNN 8.9.6 + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + if ((layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) + || (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D)) { + use_workspace_opt = false; + } + } +#endif + + fused_attn_arbitrary_seqlen_bwd_impl(batch, num_head, max_seqlen_q, max_seqlen_kv, head_dim, + attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, devPtrO, devPtrSoftmaxStats, + devPtrdQ, devPtrdK, devPtrdV, devPtrdO, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), workspace->data.dptr, + &workspace_size, stream, handle, use_workspace_opt); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) 
{ + workspace->data.shape = {workspace_size}; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = {1}; + workspace->data.dtype = DType::kByte; + return; + } else { + NVTE_ERROR("Unexpected workspace_size."); + } +} } // namespace transformer_engine #endif // CUDNN_VERSION >= 8900 diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 68ebe0c7c0..202e06987d 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -38,6 +38,30 @@ void fused_attn_arbitrary_seqlen_bwd_qkvpacked(size_t batch, size_t max_seqlen, const Tensor *cu_seqlens, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); +void fused_attn_arbitrary_seqlen_fwd(size_t batch, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t num_head, size_t head_size, bool is_training, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, const Tensor *input_Bias, + Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors, + const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +void fused_attn_arbitrary_seqlen_bwd(size_t batch, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t num_head, size_t head_dim, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, const Tensor *input_O, + const Tensor *input_dO, Tensor *output_S, + Tensor *output_dQ, Tensor *output_dK, + Tensor *output_dV, Tensor *output_dBias, + const Tensor *cu_seqlens_q, const Tensor 
*cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + #endif // CUDNN_VERSION >= 8900 } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu index 00fb3e66c2..663ff37187 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.cu @@ -1250,9 +1250,6 @@ void fused_attn_max_512_fwd_qkvpacked( Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, - "qkv_layout must be NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED."); - // QKV shape is [b, s, 3, h, d] void *devPtrQKV = input_QKV->data.dptr; const auto stride = 2 * num_head * head_dim; @@ -1323,8 +1320,6 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED, - "qkv_layout must be NVTE_QKV_Layout::NVTE_KV_INTERLEAVED."); NVTE_CHECK(bias_type == NVTE_Bias_Type::NVTE_NO_BIAS || bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS, "NVTE_PRE_SCALE_BIAS is not implemented in fused_attn_max_512."); @@ -1391,6 +1386,76 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k NVTE_ERROR("Unexpected workspace_size."); } } +void fused_attn_max_512_fwd(size_t batch, size_t q_max_seqlen, size_t kv_max_seqlen, + size_t num_head, size_t head_dim, bool is_training, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, + const Tensor *input_Bias, Tensor *output_O, + NVTETensorPack *Aux_CTX_Tensors, const 
Tensor *q_cu_seqlens, + const Tensor *kv_cu_seqlens, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { + using namespace transformer_engine; + + void *devPtrQ = input_Q->data.dptr; + void *devPtrK = input_K->data.dptr; + void *devPtrV = input_V->data.dptr; + + void *devPtrBias = input_Bias->data.dptr; + + void *devPtrO = output_O->data.dptr; + + void *devPtrS = nullptr; + + const DType q_type = input_Q->data.dtype; + const DType kv_type = input_K->data.dtype; + NVTE_CHECK(q_type == kv_type, "data type of Q must be equal to data type of KV."); + + if (Aux_CTX_Tensors->size == 0) { + Aux_CTX_Tensors->size = 1; + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + output_S->data.dptr = nullptr; + output_S->data.shape = {batch, num_head, q_max_seqlen, kv_max_seqlen}; + output_S->data.dtype = q_type; + } else if (Aux_CTX_Tensors->size == 1) { + Tensor *output_S = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + devPtrS = output_S->data.dptr; + } else { + NVTE_ERROR("Unexpected Aux_CTX_Tensors->size."); + } + + void *devQCuSeqlen = q_cu_seqlens->data.dptr; + void *devKVCuSeqlen = kv_cu_seqlens->data.dptr; + + const DType rng_state_type = rng_state->data.dtype; + NVTE_CHECK(rng_state_type == DType::kInt64); + void *devPtrDropoutSeed = rng_state->data.dptr; + void *devPtrDropoutOffset = + static_cast(static_cast(rng_state->data.dptr) + 1); + + size_t workspace_size = 0; + + fused_attn_max_512_fwd_impl( + batch, num_head, q_max_seqlen, kv_max_seqlen, head_dim, is_training, attn_scale, p_dropout, + qkv_layout, bias_type, mask_type, devPtrQ, devPtrK, devPtrV, devPtrS, devPtrO, devPtrBias, + devQCuSeqlen, devKVCuSeqlen, devPtrDropoutSeed, devPtrDropoutOffset, workspace->data.dptr, + &workspace_size, get_cudnn_dtype(q_type), stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = {workspace_size}; + workspace->data.dtype = DType::kByte; + return; + } + } else if 
(workspace_size == 0) { + workspace->data.shape = {1}; + workspace->data.dtype = DType::kByte; + return; + } else { + NVTE_ERROR("Unexpected workspace_size."); + } +} void fused_attn_max_512_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t num_head, size_t head_dim, float attn_scale, float p_dropout, @@ -1402,9 +1467,6 @@ void fused_attn_max_512_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t nu cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, - "qkv_layout must be NVTE_QKV_INTERLEAVED."); - // QKV shape is [b, s, 3, h, d] void *devPtrQKV = input_QKV->data.dptr; @@ -1465,9 +1527,6 @@ void fused_attn_max_512_bwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - NVTE_CHECK(qkv_layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED, - "qkv_layout must be NVTE_KV_INTERLEAVED."); - // Q shape is [b, s, h, d] // KV shape is [b, s, 2, h, d] auto stride = 2 * num_head * head_dim; @@ -1518,5 +1577,63 @@ void fused_attn_max_512_bwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k NVTE_ERROR("Unexpected workspace_size."); } } +void fused_attn_max_512_bwd(size_t batch, size_t q_max_seqlen, size_t kv_max_seqlen, + size_t num_head, size_t head_dim, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, + const Tensor *input_dO, Tensor *output_S, + Tensor *output_dQ, Tensor *output_dK, Tensor *output_dV, + Tensor *output_dBias, + const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) { + using namespace transformer_engine; + + void *devPtrQ = input_Q->data.dptr; + void *devPtrK = input_K->data.dptr; + void *devPtrV = input_V->data.dptr; + + void *devPtrdO = 
input_dO->data.dptr; + + void *devPtrdQ = output_dQ->data.dptr; + void *devPtrdK = output_dK->data.dptr; + void *devPtrdV = output_dV->data.dptr; + + void *devPtrdBias = output_dBias->data.dptr; + + void *devPtrS = output_S->data.dptr; + + // devPtrdS reuses the memory of devPtrS + void *devPtrdS = devPtrS; + + void *devPtrQCuSeqlens = q_cu_seqlens->data.dptr; + void *devPtrKVCuSeqlens = kv_cu_seqlens->data.dptr; + + const auto q_type = input_Q->data.dtype; + const auto kv_type = input_K->data.dtype; + NVTE_CHECK(q_type == kv_type, "data type of Q must be equal to data type of KV."); + size_t workspace_size = 0; + + fused_attn_max_512_bwd_impl( + batch, num_head, q_max_seqlen, kv_max_seqlen, head_dim, attn_scale, p_dropout, qkv_layout, + mask_type, bias_type, devPtrQ, devPtrK, devPtrV, devPtrS, devPtrdQ, devPtrdK, devPtrdV, + devPtrdO, devPtrdS, devPtrdBias, devPtrQCuSeqlens, devPtrKVCuSeqlens, workspace->data.dptr, + &workspace_size, get_cudnn_dtype(q_type), stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = {workspace_size}; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = {1}; + workspace->data.dtype = DType::kByte; + return; + } else { + NVTE_ERROR("Unexpected workspace_size."); + } +} } // namespace transformer_engine #endif // CUDNN_VERSION >= 8901 diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h index 75545d0b40..e2106347ff 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_max512_seqlen.h @@ -38,6 +38,17 @@ void fused_attn_max_512_fwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k const Tensor *kv_cu_seqlens, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); +void fused_attn_max_512_fwd(size_t batch, 
size_t q_max_seqlen, size_t kv_max_seqlen, + size_t num_head, size_t head_dim, bool is_training, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, + const Tensor *input_Bias, Tensor *output_O, + NVTETensorPack *Aux_CTX_Tensors, const Tensor *q_cu_seqlens, + const Tensor *kv_cu_seqlens, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + void fused_attn_max_512_bwd_qkvpacked(size_t batch, size_t max_seqlen, size_t num_head, size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, @@ -56,6 +67,18 @@ void fused_attn_max_512_bwd_kvpacked(size_t batch, size_t q_max_seqlen, size_t k Tensor *output_dQ, Tensor *output_dKV, Tensor *output_dBias, const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +void fused_attn_max_512_bwd(size_t batch, size_t q_max_seqlen, size_t kv_max_seqlen, + size_t num_head, size_t head_dim, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, const Tensor *input_K, + const Tensor *input_V, + const Tensor *input_dO, Tensor *output_S, + Tensor *output_dQ, Tensor *output_dK, Tensor *output_dV, + Tensor *output_dBias, + const Tensor *q_cu_seqlens, const Tensor *kv_cu_seqlens, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); #endif // CUDNN_VERSION >= 8901 } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index c4bdecac8f..120406202e 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -173,6 +173,7 @@ static cudnn_frontend::Tensor createScale( static 
cudnn_frontend::Tensor createScaleWithOffset( const cudnn_frontend::Tensor& prevBlockOutputTensor, const std::string& scale_tensor_name, + NVTE_QKV_Layout layout, cudnnDataType_t tensorType, bool isOutputVirtual, bool isScaleByValue, @@ -192,7 +193,7 @@ static cudnn_frontend::Tensor createScaleWithOffset( generateMatrixStrides(output_dim[0], output_dim[1], output_dim[2], 0 /*s_kv = 0 for placeholder*/, output_dim[3], output_stride, - NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED, NVTE_QKV_Matrix::NVTE_Q_Matrix); + layout, NVTE_QKV_Matrix::NVTE_Q_Matrix); } else { // Otherwise output dim and stride should be the same as prev block dim and stride for (int i = 0; i < 4; i++) { @@ -1163,6 +1164,7 @@ void fused_attn_fp8_fwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in auto OTensor = createScaleWithOffset( OTensor_before_quan_O_tensor, // input tensor "scaleO", // scale tensor + layout, // qkv layout tensorType, // output tensor type false, // output not virtual false, // scale is by value @@ -1515,6 +1517,7 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in auto dVTensor = createScaleWithOffset( dVTensor_before_quan_dV, // input tensor "scaledV", // scale tensor + layout, // qkv layout CUDNN_DATA_FP8_E5M2, // output tensor type false, // output not virtual false, // scale is by value @@ -1631,7 +1634,7 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in // (dS * K) * descale dS auto After_dS_K_before_dequan_K = createScale( - After_dS_K, // input tensor + After_dS_K, // input tensor descaledSTensor, // scale tensor CUDNN_DATA_FLOAT, // output tensor type true, // output is virtual @@ -1641,7 +1644,7 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in // (dS * K) * descale dS * descale K auto After_dS_K_before_quan_dQ = createScale( - After_dS_K_before_dequan_K, // input tensor + After_dS_K_before_dequan_K, // input tensor descaleKTensor, // scale tensor CUDNN_DATA_FLOAT, 
// output tensor type true, // output is virtual @@ -1651,8 +1654,9 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in // (dS * K) * descale dS * descale K * scale dQ auto dQ = createScaleWithOffset( - After_dS_K_before_quan_dQ, // input tensor + After_dS_K_before_quan_dQ, // input tensor "scaledQ", // scale tensor + layout, // qkv layout CUDNN_DATA_FP8_E5M2, // output tensor type false, // output not virtual false, // scale is by value @@ -1671,7 +1675,7 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in // (dS.T * Q) * descale dS auto After_dSTranspose_Q_before_dequan_Q = createScale( - After_dSTranspose_Q, // input tensor + After_dSTranspose_Q, // input tensor descaledSTensor, // scale tensor CUDNN_DATA_FLOAT, // output tensor type true, // output is virtual @@ -1681,7 +1685,7 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in // (dS.T * Q) * descale dS * descale Q auto After_dSTranspose_Q_before_quan_dK = createScale( - After_dSTranspose_Q_before_dequan_Q, // input tensor + After_dSTranspose_Q_before_dequan_Q, // input tensor descaleQTensor, // scale tensor CUDNN_DATA_FLOAT, // output tensor type true, // output is virtual @@ -1691,8 +1695,9 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t s_q, int64_t s_kv, int64_t h, in // (dS.T * Q) * descale dS * descale Q * scale dK auto dK = createScaleWithOffset( - After_dSTranspose_Q_before_quan_dK, // input tensor + After_dSTranspose_Q_before_quan_dK, // input tensor "scaledK", // scale tensor + layout, // qkv layout CUDNN_DATA_FP8_E5M2, // output tensor type false, // output not virtual false, // scale is by value @@ -1911,6 +1916,8 @@ void fused_attn_fp8_fwd_qkvpacked( devPtrM = output_M->data.dptr; devPtrZInv = output_ZInv->data.dptr; output_rng_state->data.dptr = rng_state->data.dptr; + } else { + NVTE_ERROR("Unexpected Aux_CTX_Tensors->size."); } void* devPtrAmaxS = input_output_S->amax.dptr; @@ -2048,5 +2055,204 
@@ void fused_attn_fp8_bwd_qkvpacked( return; } } +// fused attention FWD FP8 with separate Q, K, V +void fused_attn_fp8_fwd( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t h, size_t d, + bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_Q, + const Tensor *input_K, + const Tensor *input_V, + Tensor *input_output_S, + Tensor *output_O, + NVTETensorPack* Aux_CTX_Tensors, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + void* devPtrQ = input_Q->data.dptr; + void* devPtrK = input_K->data.dptr; + void* devPtrV = input_V->data.dptr; + void* devPtrDescaleQ = input_Q->scale_inv.dptr; + void* devPtrDescaleK = input_Q->scale_inv.dptr; + void* devPtrDescaleV = input_Q->scale_inv.dptr; + + void* devPtrO = output_O->data.dptr; + void* devPtrAmaxO = output_O->amax.dptr; + void* devPtrScaleO = output_O->scale.dptr; + + void* devPtrM = nullptr; + void* devPtrZInv = nullptr; + if (Aux_CTX_Tensors->size == 0) { + if (is_training) { + Aux_CTX_Tensors->size = 3; + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + output_M->data.dptr = nullptr; + output_M->data.shape = {b, h, max_seqlen_q, 1}; + output_M->data.dtype = DType::kFloat32; + output_ZInv->data.dptr = nullptr; + output_ZInv->data.shape = {b, h, max_seqlen_q, 1}; + output_ZInv->data.dtype = DType::kFloat32; + output_rng_state->data.dptr = nullptr; + output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; + } + } else if (Aux_CTX_Tensors->size == 3) { + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = 
reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + devPtrM = output_M->data.dptr; + devPtrZInv = output_ZInv->data.dptr; + output_rng_state->data.dptr = rng_state->data.dptr; + } else { + NVTE_ERROR("Unexpected Aux_CTX_Tensors->size."); + } + + void* devPtrAmaxS = input_output_S->amax.dptr; + void* devPtrScaleS = input_output_S->scale.dptr; + void* devPtrDescaleS = input_output_S->scale_inv.dptr; + + void* devPtrcuSeqlensQ = reinterpret_cast( + reinterpret_cast(cu_seqlens_q->data.dptr)); + void* devPtrcuSeqlensKV = reinterpret_cast( + reinterpret_cast(cu_seqlens_kv->data.dptr)); + void* devPtrDropoutSeed = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr)); + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + const DType QKV_type = input_Q->data.dtype; + size_t workspace_size = 0; + + fused_attn::fused_attn_fp8_fwd_impl( + b, max_seqlen_q, max_seqlen_kv, h, d, + is_training, attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleS, devPtrScaleS, devPtrScaleO, + devPtrAmaxO, devPtrAmaxS, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = { workspace_size }; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = { 1 }; + workspace->data.dtype = DType::kByte; + return; + } +} +// fused attention BWD FP8 with separate Q, K, V +void fused_attn_fp8_bwd( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t h, size_t d, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_Q, + const Tensor *input_K, + const Tensor *input_V, + const Tensor *input_O, + const Tensor *input_dO, + const Tensor *input_M, + 
const Tensor *input_ZInv, + const Tensor *input_S, + Tensor *input_output_dP, + const Tensor *output_dQ, + const Tensor *output_dK, + const Tensor *output_dV, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + void* devPtrQ = input_Q->data.dptr; + void* devPtrK = input_K->data.dptr; + void* devPtrV = input_V->data.dptr; + void* devPtrDescaleQ = input_Q->scale_inv.dptr; + void* devPtrDescaleK = input_Q->scale_inv.dptr; + void* devPtrDescaleV = input_Q->scale_inv.dptr; + + void* devPtrO = input_O->data.dptr; + void* devPtrDescaleO = input_O->scale_inv.dptr; + void* devPtrdO = input_dO->data.dptr; + void* devPtrDescaledO = input_dO->scale_inv.dptr; + + void* devPtrM = input_M->data.dptr; + void* devPtrZInv = input_ZInv->data.dptr; + + void* devPtrScaleS = input_S->scale.dptr; + void* devPtrDescaleS = input_S->scale_inv.dptr; + void* devPtrAmaxdS = input_output_dP->amax.dptr; + void* devPtrScaledS = input_output_dP->scale.dptr; + void* devPtrDescaledS = input_output_dP->scale_inv.dptr; + + void* devPtrdQ = output_dQ->data.dptr; + void* devPtrdK = output_dK->data.dptr; + void* devPtrdV = output_dV->data.dptr; + void* devPtrAmaxdQ = output_dQ->amax.dptr; + void* devPtrAmaxdK = output_dQ->amax.dptr; + void* devPtrAmaxdV = output_dQ->amax.dptr; + void* devPtrScaledQ = output_dQ->scale.dptr; + void* devPtrScaledK = output_dQ->scale.dptr; + void* devPtrScaledV = output_dQ->scale.dptr; + + void* devPtrcuSeqlensQ = reinterpret_cast( + reinterpret_cast(cu_seqlens_q->data.dptr)); + void* devPtrcuSeqlensKV = reinterpret_cast( + reinterpret_cast(cu_seqlens_kv->data.dptr)); + void* devPtrDropoutSeed = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr)); + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + const DType QKV_type = input_Q->data.dtype; + size_t workspace_size = 0; + + 
fused_attn::fused_attn_fp8_bwd_impl( + b, max_seqlen_q, max_seqlen_kv, h, d, + attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, devPtrdO, + devPtrdQ, devPtrdK, devPtrdV, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleO, devPtrDescaledO, + devPtrDescaleS, devPtrDescaledS, + devPtrScaleS, devPtrScaledS, + devPtrScaledQ, devPtrScaledK, devPtrScaledV, + devPtrAmaxdS, + devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = { workspace_size }; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = { 1 }; + workspace->data.dtype = DType::kByte; + return; + } +} #endif // end of CUDNN>=8900 } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index 111dfddd10..d78f0f97ca 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -46,5 +46,44 @@ void fused_attn_fp8_bwd_qkvpacked( Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +// fused attention FWD FP8 with separate Q, K, V +void fused_attn_fp8_fwd( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t h, size_t d, + bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, + Tensor *input_output_S, + Tensor *output_O, + NVTETensorPack* Aux_CTX_Tensors, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle); + +// fused attention BWD FP8 with separate Q, K, 
V +void fused_attn_fp8_bwd( + size_t b, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t h, size_t d, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, + const Tensor *input_O, + const Tensor *input_dO, + const Tensor *input_M, + const Tensor *input_ZInv, + const Tensor *input_S, + Tensor *input_output_dP, + const Tensor *output_dQ, + const Tensor *output_dK, + const Tensor *output_dV, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle); #endif // end of CUDNN>=8900 } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/utils.cu b/transformer_engine/common/fused_attn/utils.cu index ebba6efa21..fc4be20cf6 100644 --- a/transformer_engine/common/fused_attn/utils.cu +++ b/transformer_engine/common/fused_attn/utils.cu @@ -30,6 +30,7 @@ void generateMatrixStrides( constexpr int seqlen_q_dim_idx = 2; constexpr int seqlen_kv_dim_idx = 3; + // to be deprecated in the future switch (matrix) { case NVTE_QKV_Matrix::NVTE_Q_Matrix: if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { @@ -37,7 +38,8 @@ void generateMatrixStrides( strideA[seqlen_dim_idx] = 3 * h * d; strideA[head_dim_idx] = d; strideA[batch_dim_idx] = s_q * 3 * h * d; - } else { + } else if ((layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) + || (layout == NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED)) { strideA[hidden_dim_idx] = 1; strideA[seqlen_dim_idx] = h * d; strideA[head_dim_idx] = d; @@ -55,7 +57,7 @@ void generateMatrixStrides( strideA[hidden_dim_idx] = 1; strideA[head_dim_idx] = d; strideA[batch_dim_idx] = s_kv * 2 * h * d; - } else { + } else if (layout == NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) { strideA[seqlen_dim_idx] = h * d; strideA[hidden_dim_idx] = 1; strideA[head_dim_idx] = d; @@ -73,7 +75,7 @@ void generateMatrixStrides( strideA[hidden_transpose_dim_idx] = 1; strideA[head_dim_idx] = 
d; strideA[batch_dim_idx] = s_kv * 2 * h * d; - } else { + } else if (layout == NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) { strideA[seqlen_transpose_dim_idx] = h * d; strideA[hidden_transpose_dim_idx] = 1; strideA[head_dim_idx] = d; @@ -91,7 +93,7 @@ void generateMatrixStrides( strideA[seqlen_dim_idx] = 2* h * d; strideA[head_dim_idx] = d; strideA[batch_dim_idx] = s_kv * 2 * h * d; - } else { + } else if (layout == NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) { strideA[hidden_dim_idx] = 1; strideA[seqlen_dim_idx] = h * d; strideA[head_dim_idx] = d; @@ -100,21 +102,21 @@ void generateMatrixStrides( break; case NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose: if (layout == NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) { - strideA[hidden_transpose_dim_idx] = 1; - strideA[seqlen_transpose_dim_idx] = 3 * h * d; - strideA[head_dim_idx] = d; - strideA[batch_dim_idx] = s_kv * 3 * h * d; - } else if (layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) { - strideA[hidden_transpose_dim_idx] = 1; - strideA[seqlen_transpose_dim_idx] = 2* h * d; - strideA[head_dim_idx] = d; - strideA[batch_dim_idx] = s_kv * 2 * h * d; - } else { - strideA[hidden_transpose_dim_idx] = 1; - strideA[seqlen_transpose_dim_idx] = h * d; - strideA[head_dim_idx] = d; - strideA[batch_dim_idx] = s_kv * h * d; - } + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = 2* h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else if (layout == NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } break; case NVTE_QKV_Matrix::NVTE_S_Matrix: strideA[seqlen_kv_dim_idx] = 1; @@ -129,6 +131,228 @@ void 
generateMatrixStrides( strideA[batch_dim_idx] = s_q * h * d; break; } + + // new way of getting strides + switch (layout) { + case NVTE_QKV_Layout::NVTE_SB3HD: + if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = b * 3 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_transpose_dim_idx] = b * 3 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix) { + strideA[batch_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = b * h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_SBH3D: + if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = 3 * d; + strideA[seqlen_dim_idx] = b * 3 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = 3 * d; + strideA[seqlen_transpose_dim_idx] = b * 3 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix) { + strideA[batch_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = b * h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_SBHD_SB2HD: + if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = 2 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = 
b * 2 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = 2 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_transpose_dim_idx] = b * 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix)) { + strideA[batch_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = b * h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_SBHD_SBH2D: + if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = 2 * h * d; + strideA[head_dim_idx] = 2 * d; + strideA[seqlen_dim_idx] = b * 2 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = 2 * h * d; + strideA[head_dim_idx] = 2 * d; + strideA[seqlen_transpose_dim_idx] = b * 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix)) { + strideA[batch_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = b * h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD: + if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix)) { + strideA[batch_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = b * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = h * d; + strideA[head_dim_idx] = d; + 
strideA[seqlen_transpose_dim_idx] = b * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_BS3HD: + case NVTE_QKV_Layout::NVTE_T3HD: + if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = s_q * 3 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = s_q * 3 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix) { + strideA[batch_dim_idx] = s_q * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_BSH3D: + case NVTE_QKV_Layout::NVTE_TH3D: + if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = s_q * 3 * h * d; + strideA[head_dim_idx] = 3 * d; + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = s_q * 3 * h * d; + strideA[head_dim_idx] = 3 * d; + strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix) { + strideA[batch_dim_idx] = s_q * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_BSHD_BS2HD: + case NVTE_QKV_Layout::NVTE_THD_T2HD: + if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == 
NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = s_kv * 2 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = 2 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = s_kv * 2 * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_transpose_dim_idx] = 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix)) { + strideA[batch_dim_idx] = s_q * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_BSHD_BSH2D: + case NVTE_QKV_Layout::NVTE_THD_TH2D: + if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = s_kv * 2 * h * d; + strideA[head_dim_idx] = 2 * d; + strideA[seqlen_dim_idx] = 2 * h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = s_kv * 2 * h * d; + strideA[head_dim_idx] = 2 * d; + strideA[seqlen_transpose_dim_idx] = 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix)) { + strideA[batch_dim_idx] = s_q * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + } + break; + case NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD: + case NVTE_QKV_Layout::NVTE_THD_THD_THD: + if ((matrix == NVTE_QKV_Matrix::NVTE_Q_Matrix) + || (matrix == NVTE_QKV_Matrix::NVTE_O_Matrix)) { + strideA[batch_dim_idx] = s_q * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix) + || (matrix == 
NVTE_QKV_Matrix::NVTE_V_Matrix)) { + strideA[batch_dim_idx] = s_kv * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + } else if ((matrix == NVTE_QKV_Matrix::NVTE_K_Matrix_Transpose) + || (matrix == NVTE_QKV_Matrix::NVTE_V_Matrix_Transpose)) { + strideA[batch_dim_idx] = s_kv * h * d; + strideA[head_dim_idx] = d; + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[hidden_transpose_dim_idx] = 1; + } + break; + } + + if (matrix == NVTE_QKV_Matrix::NVTE_S_Matrix) { + strideA[seqlen_kv_dim_idx] = 1; + strideA[seqlen_q_dim_idx] = s_kv; + strideA[head_dim_idx] = s_q * s_kv; + strideA[batch_dim_idx] = h * s_q * s_kv; + } } bool allowAllConfig(cudnnBackendDescriptor_t engine_config) { diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index b71573ec1b..6de3c63512 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -18,7 +18,17 @@ extern "C" { #endif /*! \enum NVTE_QKV_Layout - * \brief QKV matrix layouts + * \brief Memory layouts of QKV tensors + * `S`, `B`, `H`, `D`, and `T` stand for sequence length, batch size, the number of heads, + head size, and the total number of sequences in a batch, i.e. `t = sum(s_i) for i = 0...b-1`. + `SBHD` and `BSHD`-based layouts are used when sequences in a batch are of equal length + or padded to the same length, and `THD`-based layouts are used when sequences have + different lengths in a batch. + * \note {`NVTE_QKV_INTERLEAVED`, `NVTE_KV_INTERLEAVED` and `NVTE_NOT_INTERLEAVED` + will be deprecated in the next release. Please use their equivalent enums instead, i.e. 
`NVTE_T3HD`, + `NVTE_THD_T2HD` and `NVTE_THD_THD_THD` when sequences are of variable lengths, and `NVTE_BS3HD`, + `NVTE_BSHD_BS2HD` and `NVTE_BSHD_BSHD_BSHD` when sequences are of equal length or padded + to equal length.} */ enum NVTE_QKV_Layout { /*! Separate Q, K, V tensors. @@ -67,7 +77,51 @@ enum NVTE_QKV_Layout { | num_heads * head_dim \endverbatim */ - NVTE_KV_INTERLEAVED = 2 + NVTE_KV_INTERLEAVED = 2, + + NVTE_SB3HD = 3, + NVTE_SBH3D = 4, + NVTE_SBHD_SB2HD = 5, + NVTE_SBHD_SBH2D = 6, + NVTE_SBHD_SBHD_SBHD = 7, + NVTE_BS3HD = 8, + NVTE_BSH3D = 9, + NVTE_BSHD_BS2HD = 10, + NVTE_BSHD_BSH2D = 11, + NVTE_BSHD_BSHD_BSHD = 12, + NVTE_T3HD = 13, + NVTE_TH3D = 14, + NVTE_THD_T2HD = 15, + NVTE_THD_TH2D = 16, + NVTE_THD_THD_THD = 17, +}; + +/*! \enum NVTE_QKV_Layout_Group + * \brief Grouping of QKV layouts + */ +enum NVTE_QKV_Layout_Group { + /*! 3HD QKV layouts, e.g. BS3HD */ + NVTE_3HD = 0, + /*! H3D QKV layouts, e.g. BSH3D */ + NVTE_H3D = 1, + /*! HD_2HD QKV layouts, e.g. BSHD_BS2HD */ + NVTE_HD_2HD = 2, + /*! HD_H2D QKV layouts, e.g. BSHD_BSH2D */ + NVTE_HD_H2D = 3, + /*! HD_HD_HD QKV layouts, e.g. BSHD_BSHD_BSHD */ + NVTE_HD_HD_HD = 4, +}; + +/*! \enum NVTE_QKV_Format + * \brief Dimension formats for QKV tensors + */ +enum NVTE_QKV_Format { + /*! SBHD QKV format */ + NVTE_SBHD = 0, + /*! BSHD QKV format */ + NVTE_BSHD = 1, + /*! THD QKV format */ + NVTE_THD = 2, }; /*! \enum NVTE_Bias_Type @@ -94,6 +148,9 @@ enum NVTE_Mask_Type { NVTE_CAUSAL_MASK = 2, }; +/*! \enum NVTE_Fused_Attn_Backend + * \brief Fused attention backends + */ enum NVTE_Fused_Attn_Backend { /*! No supported backend */ NVTE_No_Backend = -1, @@ -105,8 +162,24 @@ enum NVTE_Fused_Attn_Backend { NVTE_FP8 = 2, }; +/*! \brief Get layout group for a given QKV layout + * + * \param[in] qkv_layout QKV layout, e.g. sbh3d. + * + * \return qkv layout group, e.g. h3d. + */ +NVTE_QKV_Layout_Group nvte_get_qkv_layout_group(NVTE_QKV_Layout qkv_layout); + +/*! 
\brief Get QKV format for a given QKV layout + * + * \param[in] qkv_layout QKV layout, e.g. sbh3d. + * + * \return qkv format, e.g. sbhd. + */ +NVTE_QKV_Format nvte_get_qkv_format(NVTE_QKV_Layout qkv_layout); + /*! \brief Get fused attention backend based on input parameters. - * + * * \param[in] q_dtype The data type of Tensor Q. * \param[in] kv_dtype The data type of Tensors K, V. * \param[in] qkv_layout The layout of Tensors Q, K, V. @@ -152,7 +225,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. * \param[in] rng_state Seed and offset of CUDA random number generator. * \param[in] max_seqlen Max sequence length used for computing, - * it may be >= max(cu_seqlens). + * it may be >= max(seqlen_i) for i=0,...batch_size-1. * \param[in] is_training Whether this is in training mode or inference. * \param[in] attn_scale Scaling factor for Q * K.T. * \param[in] dropout Dropout probability. @@ -199,7 +272,7 @@ void nvte_fused_attn_fwd_qkvpacked( * \param[out] dBias The gradient of the Bias tensor. * \param[in] cu_seqlens Accumulative sequence lengths, [batch_size + 1]. * \param[in] max_seqlen Max sequence length used for computing, - * it may be >= max(cu_seqlens). + * it may be >= max(seqlen_i) for i=0,...batch_size-1. * \param[in] attn_scale Scaling factor for Q * K.T. * \param[in] dropout Dropout probability. * \param[in] qkv_layout QKV tensor's layout. @@ -249,10 +322,10 @@ void nvte_fused_attn_bwd_qkvpacked( * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen_q Max sequence length used for computing for Q. - * it may be >= max(cu_seqlens_q). - * \param[in] max_seqlen_kv Max sequence length used for computing for KV. - * it may be >= max(cu_seqlens_kv). 
+ * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. + * \param[in] max_seqlen_kv Max sequence length used for computing for KV. + * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. * \param[in] is_training Whether this is in training mode or inference. * \param[in] attn_scale Scaling factor for Q * K.T. * \param[in] dropout Dropout probability. @@ -300,10 +373,10 @@ void nvte_fused_attn_fwd_kvpacked( * \param[out] dBias The gradient of the Bias tensor. * \param[in] cu_seqlens_q Accumulative sequence lengths for Q, [batch_size + 1]. * \param[in] cu_seqlens_kv Accumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] max_seqlen_q Max sequence length used for computing for Q. - * it may be >= max(cu_seqlens_q). - * \param[in] max_seqlen_kv Max sequence length used for computing for KV. - * it may be >= max(cu_seqlens_kv). + * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. + * \param[in] max_seqlen_kv Max sequence length used for computing for KV. + * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. * \param[in] attn_scale Scaling factor for Q * K.T. * \param[in] dropout Dropout probability. * \param[in] qkv_layout QKV tensor's layout. @@ -332,6 +405,122 @@ void nvte_fused_attn_bwd_kvpacked( NVTETensor workspace, cudaStream_t stream); +/*! \brief Compute dot product attention with separate Q, K and V. 
+ * + * Computes: + * - P = Q * Transpose(K) + Bias + * - S = ScaleMaskSoftmax(P) + * - D = Dropout(S) + * - O = D * Transpose(V) + * + * Support Matrix: + \verbatim + | backend | precision | qkv format | bias | mask | dropout | sequence length | head_dim | + | 0 | FP16/BF16 | SBHD, BSHD | NO/POST_SCALE_BIAS | PADDING/CAUSAL_MASK | Yes | <= 512 | 64 | + | 1 | FP16/BF16 | SBHD, BSHD | NO/POST_SCALE_BIAS | CAUSAL_MASK | Yes | > 512 | 64, 128 | + | 2 | FP8 | THD | NO_BIAS | PADDING_MASK | Yes | <= 512 | 64 | + \endverbatim + * + * \param[in] Q The Q tensor. + * \param[in] K The K tensor. + * \param[in] V The V tensor. + * \param[in] Bias The Bias tensor. + * \param[in,out] S The S tensor. + * \param[out] O The output O tensor. + * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, + * e.g. M, ZInv, rng_state. + * \param[in] cu_seqlens_q Cumulative sequence lengths for Q, [batch_size + 1]. + * \param[in] cu_seqlens_kv Cumulative sequence lengths for K and V, [batch_size + 1]. + * \param[in] rng_state Seed and offset of CUDA random number generator. + * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. + * \param[in] max_seqlen_kv Max sequence length used for computing for K and V. + * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. + * \param[in] is_training Whether this is in training mode or inference. + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensors' layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. 
+ */ +void nvte_fused_attn_fwd( + const NVTETensor Q, + const NVTETensor K, + const NVTETensor V, + const NVTETensor Bias, + NVTETensor S, + NVTETensor O, + NVTETensorPack* Aux_CTX_Tensors, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + const NVTETensor rng_state, + size_t max_seqlen_q, size_t max_seqlen_kv, + bool is_training, float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream); + +/*! \brief Compute the backward of the dot product attention with separate Q, K and V. + * + * Support Matrix: + \verbatim + | backend | precision | qkv format | bias | mask | dropout | sequence length | head_dim | + | 0 | FP16/BF16 | SBHD, BSHD | NO/POST_SCALE_BIAS | PADDING/CAUSAL_MASK | Yes | <= 512 | 64 | + | 1 | FP16/BF16 | SBHD, BSHD | NO/POST_SCALE_BIAS | CAUSAL_MASK | Yes | > 512 | 64, 128 | + | 2 | FP8 | THD | NO_BIAS | PADDING_MASK | Yes | <= 512 | 64 | + \endverbatim + * + * \param[in] Q The Q tensor. + * \param[in] K The K tensor. + * \param[in] V The V tensor. + * \param[in] O The O tensor from forward. + * \param[in] dO The gradient of the O tensor. + * \param[in] S The S tensor. + * \param[in,out] dP The gradient of the P tensor. + * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, + * e.g. M, ZInv, rng_state. + * \param[out] dQ The gradient of the Q tensor. + * \param[out] dK The gradient of the K tensor. + * \param[out] dV The gradient of the V tensor. + * \param[out] dBias The gradient of the Bias tensor. + * \param[in] cu_seqlens_q Cumulative sequence lengths for Q, [batch_size + 1]. + * \param[in] cu_seqlens_kv Cumulative sequence lengths for K and V, [batch_size + 1]. + * \param[in] max_seqlen_q Max sequence length used for computing for Q. + * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. + * \param[in] max_seqlen_kv Max sequence length used for computing for K and V. 
+ * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. + * \param[in] attn_scale Scaling factor for Q * K.T. + * \param[in] dropout Dropout probability. + * \param[in] qkv_layout QKV tensors' layout. + * \param[in] bias_type Bias type. + * \param[in] attn_mask_type Attention mask type. + * \param[in] workspace Workspace tensor. + * \param[in] stream CUDA stream used for this operation. + */ +void nvte_fused_attn_bwd( + const NVTETensor Q, + const NVTETensor K, + const NVTETensor V, + const NVTETensor O, + const NVTETensor dO, + const NVTETensor S, + NVTETensor dP, + const NVTETensorPack* Aux_CTX_Tensors, + NVTETensor dQ, + NVTETensor dK, + NVTETensor dV, + NVTETensor dBias, + const NVTETensor cu_seqlens_q, + const NVTETensor cu_seqlens_kv, + size_t max_seqlen_q, size_t max_seqlen_kv, + float attn_scale, float dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + NVTETensor workspace, + cudaStream_t stream); #ifdef __cplusplus } // extern "C" #endif diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index bcf5584f3d..625cd8644e 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -20,6 +20,8 @@ fused_attn_bwd_qkvpacked, fused_attn_fwd_kvpacked, fused_attn_bwd_kvpacked, + fused_attn_fwd, + fused_attn_bwd, QKVLayout, AttnBiasType, AttnMaskType, @@ -37,6 +39,7 @@ AttnMaskTypes, AttnTypes, AttnBiasTypes, + QKVLayouts, dist_group_type, TE_DType, ) @@ -141,64 +144,6 @@ def backward(ctx, return torch.cat(grad_outputs, dim = split_dim), None, None -class _CombineQKV(torch.autograd.Function): - """""" - - @staticmethod - def forward(ctx, - query_layer: torch.Tensor, - key_layer: torch.Tensor, # pylint: disable=unused-argument - value_layer: torch.Tensor, # pylint: disable=unused-argument - dim: int, - ) -> torch.Tensor: - - mixed_layer = torch.Tensor().to(device=query_layer.device, - dtype=query_layer.dtype) - new_shape = 
list(query_layer.shape) - new_shape[dim] = new_shape[dim] * 3 - mixed_layer.set_(query_layer.untyped_storage(), - query_layer.storage_offset(), - new_shape, - query_layer.stride()) - ctx.dim = dim - return mixed_layer - - @staticmethod - def backward(ctx, - *grad_outputs, - ) -> Tuple[torch.Tensor, ...]: - assert len(grad_outputs) > 0, "No gradients received for backprop!" - tensors = split_tensor_along_dim(grad_outputs[0], ctx.dim, 3) - return tensors[0], tensors[1], tensors[2], None - -class _CombineKV(torch.autograd.Function): - """""" - - @staticmethod - def forward(ctx, - key_layer: torch.Tensor, - value_layer: torch.Tensor, # pylint: disable=unused-argument - dim: int, - ) -> torch.Tensor: - - mixed_layer = torch.Tensor().to(device=key_layer.device, - dtype=key_layer.dtype) - new_shape = list(key_layer.shape) - new_shape[dim] = new_shape[dim] * 2 - mixed_layer.set_(key_layer.untyped_storage(), - key_layer.storage_offset(), - new_shape, - key_layer.stride()) - ctx.dim = dim - return mixed_layer - - @staticmethod - def backward(ctx, - *grad_outputs, - ) -> Tuple[torch.Tensor, ...]: - assert len(grad_outputs) > 0, "No gradients received for backprop!" - tensors = split_tensor_along_dim(grad_outputs[0], ctx.dim, 2) - return tensors[0], tensors[1], None class UnfusedDotProductAttention(torch.nn.Module): @@ -235,6 +180,9 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, + qkv_layout: str = "sbh3d", + cu_seqlens_q: Optional[torch.Tensor] = None, # pylint: disable=unused-argument + cu_seqlens_kv: Optional[torch.Tensor] = None, # pylint: disable=unused-argument attn_mask_type: str = "causal", attention_mask: Optional[torch.Tensor] = None, core_attention_bias_type: str = "no_bias", @@ -242,6 +190,15 @@ def forward( ) -> torch.Tensor: """core attention fprop""" + assert (qkv_layout in QKVLayouts + ), f"UnfusedDotProductAttention does not support qkv_layout = {qkv_layout}!" 
+ qkv_format = ''.join([i for i in qkv_layout.split('_')[0] if i.isalpha()]) + assert (qkv_format != 'thd' + ), """UnfusedDotProductAttention does not support variable sequence lengths!""" + if qkv_format == 'bshd': + # convert to sbhd and use sbhd implementation for now + query_layer, key_layer, value_layer = [x.transpose(0, 1) + for x in [query_layer, key_layer, value_layer]] assert ( attn_mask_type in AttnMaskTypes ), f"attn_mask_type {attn_mask_type} not supported" @@ -257,7 +214,6 @@ def forward( key_layer.size(0), ) - assert key_layer.shape == value_layer.shape, "Keys and values must have the same shape!" if key_layer.shape[2] != query_layer.shape[2]: assert (query_layer.shape[2]%key_layer.shape[2]==0 ),"The number of attention heads must be divisible by the number of GQA groups!" @@ -367,11 +323,19 @@ def forward( # change view [b, np, sq, hn] context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + if qkv_format == 'sbhd': + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + context_layer = context_layer.view(seqlen, batch_size, -1) - # [sq, b, np, hn] --> [sq, b, hp] - context_layer = context_layer.view(seqlen, batch_size, -1) + if qkv_format == 'bshd': + # [b, np, sq, hn] --> [b, sq, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + + # [b, sq, np, hn] --> [b, sq, hp] + context_layer = context_layer.view(batch_size, seqlen, -1) return context_layer @@ -406,66 +370,100 @@ def backward(ctx, dq, dk, dv = split_tensor_along_dim(dqkv, -1, 3) return dq, dk, dv +def _get_qkv_layout( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + qkv_format: str = 'sbhd', + ) -> str: + """Get qkv layout. 
-def _check_qkv_layout(q, k, v): - data_ptr = q.untyped_storage().data_ptr() - check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [q, k, v]) - if not check_ptrs: - return False - - stride = q.stride() - check_strides = all(stride == x.stride() for x in [q, k, v]) - if not check_strides: - return False - - shape = q.shape - check_shapes = all(shape == x.shape for x in [q, k, v]) - if not check_shapes: - return False - - last_dim_size = shape[-1] - check_offsets = all(i * last_dim_size == x.storage_offset() - for i, x in enumerate([q, k, v])) - if check_offsets: - return "sbh3d" - - last_dims_size = shape[-1] * shape[-2] - check_offsets = all(i * last_dims_size == x.storage_offset() - for i, x in enumerate([q, k, v])) - if check_offsets: - return "sb3hd" + Parameters + ---------- + q: torch.Tensor + Query tensor. + k: torch.Tensor + Key tensor. + v: torch.Tensor + Value tensor. + qkv_format: str, default = `sbhd` + Dimension format for `q`, `k` and `v`, {`sbhd`, `bshd`, `thd`}. `s` stands for + the sequence length dimension, `b` batch size, `h` the number of attention heads, + `d` head size, and `t` the total number of sequences in a batch, i.e. + `t = sum(s_i) for i = 0...b-1`. + + Returns + ---------- + qkv_layout: str + Memory layout of `q`, `k` and `v`. Each `qkv_format` can be mapped to one of five + memory layouts. For example, `sb3hd` means `q`, `k`, `v` are created as one chunk + of memory and that they are interleaved in the `2`nd dimension. `sbhd_sbh2d` means + `q` and `kv` are created in two chunks and that `q` itself is contiguous and `k`, `v` + are interleaved with each other in the `3`rd dimension, `k = kv[:,:,:,0,:]` and + `v = kv[:,:,:,1,:]`. 
+ Mapping: + `sbhd`: {`sb3hd`, `sbh3d`, `sbhd_sb2hd`, `sbhd_sbh2d`, `sbhd_sbhd_sbhd`} + `bshd`: {`bs3hd`, `bsh3d`, `bshd_bs2hd`, `bshd_bsh2d`, `bshd_bshd_bshd`} + `thd` : {`t3hd`, `th3d`, `thd_t2hd`, `thd_th2d`, `thd_thd_thd`} + """ - return "other" + check_last_dim_contiguous = all(x.stride(-1) == 1 for x in [q, k, v]) + assert check_last_dim_contiguous, "q, k and v must have stride 1 in their last dimension!" -def _check_kv_layout(k, v): + data_ptr = q.untyped_storage().data_ptr() + check_ptrs_qkv = all(x.untyped_storage().data_ptr() == data_ptr for x in [q, k, v]) data_ptr = k.untyped_storage().data_ptr() - check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [k, v]) - if not check_ptrs: - return False + check_ptrs_kv = all(x.untyped_storage().data_ptr() == data_ptr for x in [k, v]) + stride = q.stride() + check_strides_qkv = all(stride == x.stride() for x in [q, k, v]) stride = k.stride() - check_strides = all(stride == x.stride() for x in [k, v]) - if not check_strides: - return False + check_strides_kv = all(stride == x.stride() for x in [k, v]) + shape = q.shape + check_shapes_qkv = all(shape == x.shape for x in [q, k, v]) shape = k.shape - check_shapes = all(shape == x.shape for x in [k, v]) - if not check_shapes: - return False + check_shapes_kv = all(shape == x.shape for x in [k, v]) - last_dim_size = shape[-1] - check_offsets = all(i * last_dim_size == x.storage_offset() + last_dim_size = q.shape[-1] + check_last_dim_offsets_qkv = all(i * last_dim_size == x.storage_offset() + for i, x in enumerate([q, k, v])) + last_dim_size = k.shape[-1] + check_last_dim_offsets_kv = all(i * last_dim_size == x.storage_offset() for i, x in enumerate([k, v])) - if check_offsets: - return "sbh2d" - last_dims_size = shape[-1] * shape[-2] - check_offsets = all(i * last_dims_size == x.storage_offset() + last_two_dims_size = q.shape[-1] * q.shape[-2] + check_last_two_dims_offsets_qkv = all(i * last_two_dims_size == x.storage_offset() + for i, x in enumerate([q, 
k, v])) + last_two_dims_size = k.shape[-1] * k.shape[-2] + check_last_two_dims_offsets_kv = all(i * last_two_dims_size == x.storage_offset() for i, x in enumerate([k, v])) - if check_offsets: - return "sb2hd" - return "other" + qkv_layout = None + if (check_ptrs_qkv and check_strides_qkv and check_shapes_qkv + and check_last_two_dims_offsets_qkv + and not check_last_dim_offsets_qkv): + # sb3hd, bs3hd, t3hd + qkv_layout = qkv_format[:-2] + '3' + qkv_format[-2:] + elif check_ptrs_qkv and check_strides_qkv and check_shapes_qkv and check_last_dim_offsets_qkv: + # sbh3d, bsh3d, th3d + qkv_layout = qkv_format[:-1] + '3' + qkv_format[-1:] + elif (check_ptrs_kv and check_strides_kv and check_shapes_kv + and check_last_two_dims_offsets_kv + and not check_last_dim_offsets_kv): + # sbhd_sb2hd, bshd_bs2hd, thd_t2hd + qkv_layout = qkv_format + '_' + qkv_format[:-2] + '2' + qkv_format[-2:] + elif (check_ptrs_kv and check_strides_kv and check_shapes_kv + and check_last_dim_offsets_kv): + # sbhd_sbh2d, bshd_bsh2d, thd_th2d + qkv_layout = qkv_format + '_' + qkv_format[:-1] + '2' + qkv_format[-1:] + elif check_strides_kv and check_shapes_kv: + # sbhd_sbhd_sbhd, bshd_bshd_bshd, thd_thd_thd + qkv_layout = '_'.join(list([qkv_format])*3) + else: + raise Exception("The provided qkv memory layout is not supported!") + + return qkv_layout class FlashAttention(torch.nn.Module): @@ -496,6 +494,9 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, + qkv_layout: str = "sbh3d", + cu_seqlens_q: Optional[torch.Tensor] = None, + cu_seqlens_kv: Optional[torch.Tensor] = None, attn_mask_type: str = "causal", ) -> torch.Tensor: """flash-attn fprop""" @@ -504,52 +505,87 @@ def forward( query_layer.dtype in [torch.float16, torch.bfloat16] and key_layer.dtype in [torch.float16, torch.bfloat16] and value_layer.dtype in [torch.float16, torch.bfloat16] - ), 'FlashAttention currently only supports FP16 and BF16.' 
+ ), "FlashAttention currently only supports FP16 and BF16." assert ( query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda - ), 'FlashAttention currently only supports CUDA tensors.' - - # For now just 128, will make it more general in the future - - if (query_layer.shape[-1] == 128 and - query_layer.shape[0] * query_layer.shape[1] >= 512 and - _check_qkv_layout(query_layer, key_layer, value_layer) == "sbh3d"): - query_layer, key_layer, value_layer = _PrepareQKVForFA.apply(query_layer, - key_layer, - value_layer) - else: - query_layer, key_layer, value_layer = [x.transpose(0,1).contiguous() - for x in (query_layer, key_layer, value_layer)] - - batch_size, seqlen = query_layer.shape[0], query_layer.shape[1] - - # [b, sq, np, hn] + ), "FlashAttention currently only supports CUDA tensors." + assert ( + qkv_layout in QKVLayouts + ), f"FlashAttention does not support qkv_layout = {qkv_layout}!" + + qkv_format = ''.join([i for i in qkv_layout.split('_')[0] if i.isalpha()]) + + if qkv_format == 'sbhd': + # For now just 128, will make it more general in the future + if (query_layer.shape[-1] == 128 and + query_layer.shape[0] * query_layer.shape[1] >= 512 and + qkv_layout == "sbh3d"): + query_layer, key_layer, value_layer = _PrepareQKVForFA.apply(query_layer, + key_layer, + value_layer) + else: + query_layer, key_layer, value_layer = [x.transpose(0,1).contiguous() + for x in (query_layer, key_layer, value_layer)] + + if qkv_format == 'bshd': + query_layer, key_layer, value_layer = [x.contiguous() + for x in (query_layer, key_layer, value_layer)] + + if qkv_format in ['sbhd', 'bshd']: + batch_size, max_seqlen_q, max_seqlen_kv = ( + query_layer.shape[0], query_layer.shape[1], key_layer.shape[1]) + if cu_seqlens_q is None: + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, + dtype=torch.int32, + device=query_layer.device) + if cu_seqlens_kv is None: + cu_seqlens_kv = torch.arange( + 0, + (batch_size + 1) * max_seqlen_kv, + 
step=max_seqlen_kv, + dtype=torch.int32, + device=key_layer.device) + + if qkv_format == 'thd': + assert (_flash_attn_2_available + ), "flash-attn v2 is required for variable sequence length support!" + assert (cu_seqlens_q is not None and cu_seqlens_kv is not None + ), "cu_seqlens_q and cu_seqlens_kv can not be None when qkv_format = thd!" + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] + max_seqlen_q = seqlens_q.max().item() + max_seqlen_kv = seqlens_kv.max().item() + + # [b * s, h, d] query_layer, key_layer, value_layer = [ x.view(x.shape[0] * x.shape[1], *x.shape[2:]) for x in [query_layer, key_layer, value_layer] ] - max_seqlen = seqlen - cu_seqlens = torch.arange( - 0, - (batch_size + 1) * seqlen, - step=seqlen, - dtype=torch.int32, - device=query_layer.device) - with self.attention_dropout_ctx(): fa_optional_forward_kwargs = {} if not _flash_attn_2_available: fa_optional_forward_kwargs["deterministic"] = self.deterministic output = flash_attn_forward_func( - query_layer, key_layer, value_layer, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, + query_layer, key_layer, value_layer, + cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, self.attention_dropout if self.training else 0.0, - softmax_scale=1.0/self.norm_factor, causal=attn_mask_type=="causal", + softmax_scale=1.0/self.norm_factor, + causal=attn_mask_type=="causal", **fa_optional_forward_kwargs ) - # [(b sq), np, hn] -> [sq, b, (np hn)] - return output.view(batch_size, seqlen, -1).transpose(0, 1).contiguous() + if qkv_format == 'sbhd': + # (bs)hd -> bs(hd) -> sb(hd) + output = output.view(batch_size, max_seqlen_q, -1).transpose(0, 1).contiguous() + if qkv_format == 'bshd': + # (bs)hd -> bs(hd) + output = output.view(batch_size, max_seqlen_q, -1).contiguous() + + return output class FusedAttnFunc_qkvpacked(torch.autograd.Function): @@ -685,6 +721,77 @@ def backward(ctx, d_out): None, None, None, None, None, None, None, None, None, None, 
None, None) +class FusedAttnFunc(torch.autograd.Function): + """Function for FusedAttention with separate Q, K, V tensors""" + + @staticmethod + def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, k, v, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, + qkv_layout, attn_bias_type, attn_mask_type, + rng_gen, fused_attention_backend, use_FAv2_bwd): + out, aux_ctx_tensors = fused_attn_fwd( + is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, k, v, qkv_dtype, fused_attention_backend, attn_bias, + None, None, None, None, None, + attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen) + + ctx.save_for_backward(q, k, v, out, cu_seqlens_q, cu_seqlens_kv) + ctx.aux_ctx_tensors = aux_ctx_tensors + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_kv = max_seqlen_kv + ctx.qkv_dtype = qkv_dtype + ctx.attn_scale = attn_scale + ctx.dropout_p = dropout_p + ctx.fast_zero_fill = fast_zero_fill + ctx.qkv_layout = qkv_layout + ctx.attn_bias_type = attn_bias_type + ctx.attn_mask_type = attn_mask_type + ctx.fused_attention_backend = fused_attention_backend + ctx.use_FAv2_bwd = use_FAv2_bwd + + return out + + @staticmethod + def backward(ctx, d_out): + q, k, v, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors + if ctx.use_FAv2_bwd: + softmax_lse, rng_state = ctx.aux_ctx_tensors + dq = torch.empty_like(q) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + d_out, q, k, v, out = [maybe_contiguous(x) + for x in (d_out, q, k, v, out)] + flash_attn_cuda_bwd( + d_out, q, k, v, out, softmax_lse, dq, dk, dv, + cu_seqlens_q, cu_seqlens_kv, ctx.max_seqlen_q, ctx.max_seqlen_kv, + ctx.dropout_p, ctx.attn_scale, False, + ctx.attn_mask_type == "causal", None, rng_state + ) + dq = dq[..., :d_out.shape[-1]] + dk = dk[..., :d_out.shape[-1]] + dv = dv[..., :d_out.shape[-1]] + else: + dq, dk, dv, *rest = 
fused_attn_bwd( + ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, k, v, out, d_out, + ctx.qkv_dtype, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + None, None, None, None, None, None, None, None, None, + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + + # if no_bias, return dqkv + if ctx.attn_bias_type == "no_bias": + return (None, None, None, None, None, dq, dk, dv, None, None, None, + None, None, None, None, None, None, + None, None, None, None, None, None) + # else, return (dqkv, dbias) + return (None, None, None, None, None, dq, dk, dv, None, rest[0], None, + None, None, None, None, None, None, + None, None, None, None, None, None) + class FusedAttention(torch.nn.Module): """Dot product attention, with multiple backends: @@ -695,20 +802,23 @@ class FusedAttention(torch.nn.Module): Support matrix: - | backend | 1 | 2 | - | flash based | no | yes | - | cuDNN based | yes | yes | - | qkv dtype | fp16/bf16 | fp16/bf16 | - | attn_type | self/cross | self | - | qkv_layout | | | - | - qkv | qkv_interleaved | qkv_interleaved | - | - (q,kv) | kv_interleaved | | - | mask_type | causal/no_mask | causal | - | bias_type | no_bias/post_scale_bias | no_bias | - | dropout | yes | yes | - | max_seqlen | <=512 | any | - | head_dim | 64 | 64,128 | - | output dtype | fp16/bf16 | fp16/bf16 | + | backend | 1 | 2 | + | flash based | no | yes | + | cuDNN based | yes | yes | + | qkv dtype | fp16/bf16 | fp16/bf16 | + | attn_type | self/cross | self | + | qkv_layout | | | + | - qkv | qkv_interleaved | qkv_interleaved | + | - (q,kv) | kv_interleaved | | + | - (q,k,v) | sb3hd, bs3hd | sb3hd, bs3hd | + | | sbhd_sb2hd, bshd_bs2hd | sbhd_sb2hd, bshd_bs2hd | + | | bshd_bshd_bshd | sbhd_sbhd_sbhd, bshd_bshd_bshd | + | mask_type | causal/no_mask | causal | + | bias_type | no_bias/post_scale_bias | no_bias | + | dropout | yes | yes | + | max_seqlen | <=512 | any | + | head_dim | 64 | 64,128 | + | output 
dtype | fp16/bf16 | fp16/bf16 | """ def __init__( @@ -733,6 +843,9 @@ def forward( query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, + qkv_layout: str = "sbh3d", + cu_seqlens_q: Optional[torch.Tensor] = None, + cu_seqlens_kv: Optional[torch.Tensor] = None, attn_mask_type: str = "causal", fused_attention_backend: tex.NVTE_Fused_Attn_Backend = tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend, @@ -743,8 +856,8 @@ def forward( """fused attention fprop""" assert (fused_attention_backend - != tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend - ), 'No fused attention backend supports this input combination!' + != tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend + ), 'No fused attention backend supports this input combination!' assert ( (query_layer.dtype in [torch.float16, torch.bfloat16]) and (key_layer.dtype in [torch.float16, torch.bfloat16]) @@ -753,132 +866,66 @@ def forward( assert ( query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda ), 'FusedAttention only supports CUDA tensors.' + assert ( + qkv_layout in QKVLayouts + ), f"FusedAttention does not support qkv_layout = {qkv_layout}!" 
+ + qkv_format = ''.join([i for i in qkv_layout.split('_')[0] if i.isalpha()]) + if qkv_format in ['sbhd', 'bshd']: + if qkv_format == 'sbhd': + batch_size, max_seqlen_q, max_seqlen_kv = ( + query_layer.shape[1], query_layer.shape[0], key_layer.shape[0]) + if qkv_format == 'bshd': + batch_size, max_seqlen_q, max_seqlen_kv = ( + query_layer.shape[0], query_layer.shape[1], key_layer.shape[1]) + if cu_seqlens_q is None: + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, + dtype=torch.int32, + device=query_layer.device) + if cu_seqlens_kv is None: + cu_seqlens_kv = torch.arange( + 0, + (batch_size + 1) * max_seqlen_kv, + step=max_seqlen_kv, + dtype=torch.int32, + device=key_layer.device) + if qkv_format == 'thd': + assert (cu_seqlens_q is not None and cu_seqlens_kv is not None + ), "cu_seqlens_q and cu_seqlens_kv can not be None when qkv_format = thd!" + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] + max_seqlen_q = seqlens_q.max().item() + max_seqlen_kv = seqlens_kv.max().item() qkv_dtype = TE_DType[query_layer.dtype] - seqlen_q, batch_size = query_layer.shape[0], query_layer.shape[1] - seqlen_kv = key_layer.shape[0] - max_seqlen_q = seqlen_q - max_seqlen_kv = seqlen_kv - if self.attention_type == "self": - qkv_layout = _check_qkv_layout(query_layer, key_layer, value_layer) - if qkv_layout == "sbh3d": - mixed_layer = _CombineQKV.apply(query_layer, key_layer, value_layer, 3) - # [s, b, h, 3, d] - mixed_layer = mixed_layer.view( - *mixed_layer.shape[0:3], 3, query_layer.shape[-1]) - # [b, s, 3, h, d] - mixed_layer = mixed_layer.transpose(2, 3).transpose(0, 1).contiguous() - elif qkv_layout == "sb3hd": - mixed_layer = _CombineQKV.apply(query_layer, key_layer, value_layer, 2) - # [s, b, 3, h, d] - mixed_layer = mixed_layer.view( - *mixed_layer.shape[0:2], 3, *query_layer.shape[2:]) - # [b, s, 3, h, d] - mixed_layer = mixed_layer.transpose(0, 1).contiguous() - else: - raise 
Exception("FusedAttention only supports qkv layout sbh3d or sb3hd!") - - # [total_seqs, 3, h, d] - mixed_layer = mixed_layer.view( - mixed_layer.shape[0] * mixed_layer.shape[1], *mixed_layer.shape[2:]) - - qkv_layout = "qkv_interleaved" - max_seqlen = seqlen_q - cu_seqlens = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=query_layer.device) - use_FAv2_bwd = (self.use_FAv2_bwd - and (fused_attention_backend - == tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen) - and core_attention_bias_type == "no_bias") - - with self.attention_dropout_ctx(): - output = FusedAttnFunc_qkvpacked.apply( - self.training, - max_seqlen, - cu_seqlens, - mixed_layer, - qkv_dtype, - core_attention_bias, - 1.0/self.norm_factor, - self.attention_dropout if self.training else 0.0, - fast_zero_fill, - qkv_layout, - core_attention_bias_type, - attn_mask_type, - None, # rng_gen - fused_attention_backend, - use_FAv2_bwd - ) - output = output.view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous() - - if self.attention_type == "cross": - kv_layout = _check_kv_layout(key_layer, value_layer) - if kv_layout == "sbh2d": - key_value = _CombineKV.apply(key_layer, value_layer, 3) - # [s, b, h, 2, d] - key_value = key_value.view( - *key_value.shape[0:3], 2, key_layer.shape[-1]) - # [b, s, 2, h, d] - key_value = key_value.transpose(2, 3).transpose(0, 1).contiguous() - elif qkv_layout == "sb2hd": - key_value = _CombineKV.apply(key_layer, value_layer, 2) - # [s, b, 2, h, d] - key_value = key_value.view( - *key_value.shape[0:2], 2, *key_layer.shape[2:]) - # [b, s, 2, h, d] - key_value = key_value.transpose(0, 1).contiguous() - else: - raise Exception("FusedAttention only supports kv layout sbh2d or sb2hd!") - - # [total_seqs, h, d] - query_layer = query_layer.transpose(0, 1).contiguous() - query_layer = query_layer.view( - query_layer.shape[0] * query_layer.shape[1], *query_layer.shape[2:]) - # [total_seqs, 2, h, d] - key_value = 
key_value.view([key_value.shape[0] * key_value.shape[1]] - + key_value.shape[2:]) - - qkv_layout = "kv_interleaved" - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=query_layer.device) - cu_seqlens_kv = torch.arange( - 0, - (batch_size + 1) * seqlen_kv, - step=seqlen_kv, - dtype=torch.int32, - device=key_layer.device) - - with self.attention_dropout_ctx(): - outputs = FusedAttnFunc_kvpacked.apply( - self.training, - max_seqlen_q, max_seqlen_kv, - cu_seqlens_q, cu_seqlens_kv, - query_layer, key_value, - qkv_dtype, - core_attention_bias, - 1.0/self.norm_factor, - self.attention_dropout if self.training else 0.0, - fast_zero_fill, - qkv_layout, - core_attention_bias_type, - attn_mask_type, - None, # rng_gen - fused_attention_backend, - use_FAv2_bwd - ) + use_FAv2_bwd = (self.use_FAv2_bwd + and (fused_attention_backend + == tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen)) + with self.attention_dropout_ctx(): + output = FusedAttnFunc.apply( + self.training, + max_seqlen_q, max_seqlen_kv, + cu_seqlens_q, cu_seqlens_kv, + query_layer, key_layer, value_layer, + qkv_dtype, + core_attention_bias, + 1.0/self.norm_factor, + self.attention_dropout if self.training else 0.0, + fast_zero_fill, + qkv_layout, + core_attention_bias_type, + attn_mask_type, + None, # rng_gen + fused_attention_backend, + use_FAv2_bwd, + ) - output = (outputs[0].view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous(), - outputs[1].view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous()) - return output + # ...hd -> ...(hd) + return output.view(*output.shape[:-2], -1) class DotProductAttention(torch.nn.Module): @@ -917,6 +964,16 @@ class DotProductAttention(torch.nn.Module): layer_number: int, default = `None` layer number of the current `DotProductAttention` when multiple such modules are concatenated, for instance in consecutive transformer blocks. 
+ qkv_format: str, default = `sbhd` + dimension format for `query_layer`, `key_layer` and `value_layer`, + {`sbhd`, `bshd`, `thd`}. `s` stands for the sequence length, `b` batch size, + `h` the number of heads, `d` head size, and `t` the total number of sequences + in a batch, with `t = sum(s_i), for i = 0...b-1`. `sbhd` and `bshd` formats + are used for when sequences in a batch are of equal length or padded to + equal length, and the `thd` format is used for when sequences in a batch + have different lengths. Please note that these formats do not reflect how + tensors `query_layer`, `key_layer`, `value_layer` are laid out in memory. + For that, please use `_get_qkv_layout` to gain the layout information. attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal` type of attention mask passed into softmax operation. Overridden by :attr:`attn_mask_type` in the `forward` method. The forward @@ -940,6 +997,7 @@ def __init__( kv_channels: int, num_gqa_groups: Optional[int] = None, attention_dropout: float = 0.0, + qkv_format: str = "sbhd", attn_mask_type: str = "causal", sequence_parallel: bool = False, tp_size: int = 1, @@ -950,6 +1008,7 @@ def __init__( ) -> None: super().__init__() + self.qkv_format = qkv_format self.attn_mask_type = attn_mask_type self.tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group) self.tp_group = tp_group @@ -1040,6 +1099,9 @@ def forward( key_layer: torch.Tensor, value_layer: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, + qkv_format: Optional[str] = None, + cu_seqlens_q: Optional[torch.Tensor] = None, + cu_seqlens_kv: Optional[torch.Tensor] = None, attn_mask_type: Optional[str] = None, checkpoint_core_attention: bool = False, core_attention_bias_type: str = "no_bias", @@ -1082,9 +1144,11 @@ def forward( If FusedAttention is being used, users can also choose to switch to flash-attn's implementation for backward by setting :attr:`NVTE_FUSED_ATTN_USE_FAv2_BWD=1` (default: 0), because of the 
performance differences between various versions of - flash-attn and FusedAttention. Further, :attr:`NVTE_FUSED_ATTN_DP_WORKSPACE_LIMIT` - can be used to enable the workspace related optimizations in FusedAttention - (default: 256MB; raise the limit to enable these performance optimizations). + flash-attn and FusedAttention. Further, :attr:`NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT` + can be used to enable (:attr:`1`) or disable (:attr:`0`) the workspace related + optimizations in FusedAttention. When unset, TransformerEngine determines the code path + based on its internal logic. These optimizations trade memory for performance + and should be used with care. Parameters ---------- @@ -1094,6 +1158,14 @@ def forward( Key tensor. value_layer : torch.Tensor Value tensor. + qkv_format: str, default = `None` + If provided, overrides :attr:`qkv_format` from initialization. + cu_seqlens_q: Optional[torch.Tensor], default = `None` + Cumulative sum of sequence lengths in a batch for `query_layer`, + with shape [batch_size + 1] and dtype torch.int32. + cu_seqlens_kv: Optional[torch.Tensor], default = `None` + Cumulative sum of sequence lengths in a batch for `key_layer` and `value_layer`, + with shape [batch_size + 1] and dtype torch.int32. attention_mask : Optional[torch.Tensor], default = `None` Boolean tensor used to mask out softmax input when not using flash-attn. attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `None` @@ -1111,12 +1183,57 @@ def forward( Whether to use the fast path to set output tensors to 0 or not. """ + assert (key_layer.shape == value_layer.shape + ), "Keys and values must have the same shape!" + if attn_mask_type is None: attn_mask_type = self.attn_mask_type + if qkv_format is None: + qkv_format = self.qkv_format assert (key_layer.shape[-2] == self.num_gqa_groups_per_partition - and value_layer.shape[-2] == self.num_gqa_groups_per_partition - ), f"Keys and values must have {self.num_gqa_groups} heads!" 
+ and value_layer.shape[-2] == self.num_gqa_groups_per_partition + ), f"Keys and values must have num_gqa_group = {self.num_gqa_groups} heads!" + assert (qkv_format in ['sbhd', 'bshd', 'thd'] + ), "DotProductAttention only supports qkv_format = {'sbhd', 'bshd', 'thd'}!" + + if qkv_format == 'thd': + assert (all(len(x.shape) == 3 for x in (query_layer, key_layer, value_layer)) + ), "Queries, keys and values must be 3D tensors when qkv_format = thd!" + assert (cu_seqlens_q is not None and cu_seqlens_kv is not None + ), "cu_seqlens_q and cu_seqlens_kv can not be None when qkv_format = thd!" + assert (cu_seqlens_q.shape == cu_seqlens_kv.shape + and len(cu_seqlens_q.shape) == 1 + and len(cu_seqlens_kv.shape) == 1 + ), "cu_seqlens_q and cu_seqlens_q must both have shape [batch_size + 1]!" + assert (cu_seqlens_q.dtype == torch.int32 + and cu_seqlens_kv.dtype == torch.int32 + ), "cu_seqlens_q and cu_seqlens_q must both be in dtype torch.int32!" + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] + max_seqlen_q = seqlens_q.max().item() + max_seqlen_kv = seqlens_kv.max().item() + + if qkv_format in ['sbhd', 'bshd']: + assert (all(len(x.shape) == 4 for x in (query_layer, key_layer, value_layer)) + ), f"Queries, keys and values must be 4D tensors when qkv_format = {qkv_format}!" 
+ if qkv_format == 'sbhd': + max_seqlen_q, max_seqlen_kv = (query_layer.shape[0], key_layer.shape[0]) + if qkv_format == 'bshd': + max_seqlen_q, max_seqlen_kv = (query_layer.shape[1], key_layer.shape[1]) + if cu_seqlens_q is not None: + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] + assert (all(seqlens_q <= max_seqlen_q) + ), """Sequence lengths indicated by cu_seqlens_q must be no greater than + the sequence dimention in 'query_layer'!""" + if cu_seqlens_kv is not None: + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] + assert (all(seqlens_kv <= max_seqlen_kv) + ), """Sequence lengths indicated by cu_seqlens_kv must be no greater than + the sequence dimention in 'key_layer' and 'value_layer'!""" + + qkv_layout = _get_qkv_layout(query_layer, key_layer, value_layer, + qkv_format = qkv_format) use_flash_attention = self.use_flash_attention use_fused_attention = self.use_fused_attention @@ -1147,8 +1264,6 @@ def forward( use_flash_attention = False use_fused_attention = False - qkv_layout = "qkv_interleaved" if self.attention_type == "self" else "kv_interleaved" - if use_fused_attention: fused_attention_backend = tex.get_fused_attn_backend( TE_DType[query_layer.dtype], @@ -1157,7 +1272,7 @@ def forward( AttnBiasType[core_attention_bias_type], AttnMaskType[attn_mask_type], self.attention_dropout, - query_layer.shape[0], key_layer.shape[0], + max_seqlen_q, max_seqlen_kv, query_layer.shape[-1]) # DPA does not support FP8; for FP8, use cpp_extensions modules directly is_backend_avail = (fused_attention_backend in @@ -1179,9 +1294,16 @@ def forward( query_layer, key_layer, value_layer, - attn_mask_type=attn_mask_type) - return self.flash_attention( - query_layer, key_layer, value_layer, attn_mask_type=attn_mask_type) + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type) + return self.flash_attention(query_layer, key_layer, value_layer, + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, 
+ cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type) + if use_fused_attention: if checkpoint_core_attention: @@ -1189,17 +1311,23 @@ def forward( query_layer, key_layer, value_layer, - attn_mask_type=attn_mask_type, - fused_attention_backend=fused_attention_backend, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - fast_zero_fill=fast_zero_fill) + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type, + fused_attention_backend = fused_attention_backend, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias, + fast_zero_fill = fast_zero_fill) return self.fused_attention(query_layer, key_layer, value_layer, - attn_mask_type=attn_mask_type, - fused_attention_backend=fused_attention_backend, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - fast_zero_fill=fast_zero_fill) + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type, + fused_attention_backend = fused_attention_backend, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias, + fast_zero_fill = fast_zero_fill) if checkpoint_core_attention: return self._checkpointed_attention_forward( @@ -1207,19 +1335,23 @@ def forward( query_layer, key_layer, value_layer, - attn_mask_type=attn_mask_type, - attention_mask=attention_mask, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - ) + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type, + attention_mask = attention_mask, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias) return self.unfused_attention(query_layer, key_layer, value_layer, - attn_mask_type=attn_mask_type, - 
attention_mask=attention_mask, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - ) + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type, + attention_mask = attention_mask, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias) class MultiheadAttention(torch.nn.Module): @@ -1834,6 +1966,9 @@ def forward( query_layer, key_layer, value_layer, + qkv_format='sbhd', + cu_seqlens_q=None, + cu_seqlens_kv=None, attention_mask=attention_mask, attn_mask_type=attn_mask_type, checkpoint_core_attention=checkpoint_core_attention, diff --git a/transformer_engine/pytorch/constants.py b/transformer_engine/pytorch/constants.py index ee43fa10d9..0504cde47c 100644 --- a/transformer_engine/pytorch/constants.py +++ b/transformer_engine/pytorch/constants.py @@ -28,6 +28,11 @@ AttnBiasTypes = ("pre_scale_bias", "post_scale_bias", "no_bias") +QKVLayouts = ( + "sb3hd", "sbh3d", "sbhd_sb2hd", "sbhd_sbh2d", "sbhd_sbhd_sbhd", + "bs3hd", "bsh3d", "bshd_bs2hd", "bshd_bsh2d", "bshd_bshd_bshd", + "t3hd", "th3d", "thd_t2hd", "thd_th2d", "thd_thd_thd") + LayerTypes = ("encoder", "decoder") GemmParallelModes = ("row", "column", None) diff --git a/transformer_engine/pytorch/cpp_extensions/fused_attn.py b/transformer_engine/pytorch/cpp_extensions/fused_attn.py index dd6fb3e2f8..77b5302d6c 100644 --- a/transformer_engine/pytorch/cpp_extensions/fused_attn.py +++ b/transformer_engine/pytorch/cpp_extensions/fused_attn.py @@ -18,7 +18,9 @@ __all__ = ['fused_attn_fwd_qkvpacked', 'fused_attn_bwd_qkvpacked', 'fused_attn_fwd_kvpacked', - 'fused_attn_bwd_kvpacked'] + 'fused_attn_bwd_kvpacked', + 'fused_attn_fwd', + 'fused_attn_bwd'] TORCH_DType = { @@ -34,6 +36,21 @@ "not_interleaved": NVTE_QKV_Layout.NVTE_NOT_INTERLEAVED, "qkv_interleaved": NVTE_QKV_Layout.NVTE_QKV_INTERLEAVED, "kv_interleaved": NVTE_QKV_Layout.NVTE_KV_INTERLEAVED, + "sb3hd": 
NVTE_QKV_Layout.NVTE_SB3HD, + "sbh3d": NVTE_QKV_Layout.NVTE_SBH3D, + "sbhd_sb2hd": NVTE_QKV_Layout.NVTE_SBHD_SB2HD, + "sbhd_sbh2d": NVTE_QKV_Layout.NVTE_SBHD_SBH2D, + "sbhd_sbhd_sbhd": NVTE_QKV_Layout.NVTE_SBHD_SBHD_SBHD, + "bs3hd": NVTE_QKV_Layout.NVTE_BS3HD, + "bsh3d": NVTE_QKV_Layout.NVTE_BSH3D, + "bshd_bs2hd": NVTE_QKV_Layout.NVTE_BSHD_BS2HD, + "bshd_bsh2d": NVTE_QKV_Layout.NVTE_BSHD_BSH2D, + "bshd_bshd_bshd": NVTE_QKV_Layout.NVTE_BSHD_BSHD_BSHD, + "t3hd": NVTE_QKV_Layout.NVTE_T3HD, + "th3d": NVTE_QKV_Layout.NVTE_TH3D, + "thd_t2hd": NVTE_QKV_Layout.NVTE_THD_T2HD, + "thd_th2d": NVTE_QKV_Layout.NVTE_THD_TH2D, + "thd_thd_thd": NVTE_QKV_Layout.NVTE_THD_THD_THD, } AttnBiasType = { @@ -166,9 +183,10 @@ def fused_attn_fwd_qkvpacked( if True, runs training and produces auxiliary tensors aux_ctx_tensors for the backward; if False, runs inference and doesn't produce aux_ctx_tensors max_seqlen: int - max sequence length for QKV, used for padding; may be larger than max(cu_seqlens) + max sequence length for QKV, used for padding; may be larger than max(seqlens), + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] cu_seqlens: torch.Tensor - accumulative sequence lengths for QKV; shape [batch_size + 1] + cumulative sequence lengths for QKV; shape [batch_size + 1] qkv: torch.Tensor input tensor QKV; shape [total_seqs, 3, num_heads, head_dim], where total_seqs = cu_seqlens[-1] @@ -336,9 +354,10 @@ def fused_attn_bwd_qkvpacked( Parameters ---------- max_seqlen: int - max sequence length for QKV, used for padding; may be larger than max(cu_seqlens_q) + max sequence length for QKV, used for padding; may be larger than max(seqlens) + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] cu_seqlens: torch.Tensor - accumulative sequence lengths for QKV; shape [batch_size + 1] + cumulative sequence lengths for QKV; shape [batch_size + 1] qkv: torch.Tensor input tensor QKV; shape [total_seqs, 3, num_heads, head_dim], where total_seqs = cu_seqlens[-1] @@ -482,7 +501,7 @@ def fused_attn_fwd_kvpacked( 
attn_scale: float = None, dropout: float = 0.0, fast_zero_fill: bool = True, - qkv_layout: str = "qkv_interleaved", + qkv_layout: str = "kv_interleaved", attn_bias_type: str = "no_bias", attn_mask_type: str = "padding", rng_gen: torch.Generator = None, @@ -495,13 +514,15 @@ def fused_attn_fwd_kvpacked( if True, runs training and produces auxiliary tensors aux_ctx_tensors for the backward; if False, runs inference and doesn't produce aux_ctx_tensors max_seqlen_q: int - max sequence length for Q, used for padding; may be larger than max(cu_seqlens_q) + max sequence length for Q, used for padding; may be larger than max(seqlens_q), + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] max_seqlen_kv: int - max sequence length for KV, used for padding; may be larger than max(cu_seqlens_kv) + max sequence length for KV, used for padding; may be larger than max(seqlens_kv), + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] cu_seqlens_q: torch.Tensor - accumulative sequence lengths for Q; shape [batch_size + 1] + cumulative sequence lengths for Q; shape [batch_size + 1] cu_seqlens_kv: torch.Tensor - accumulative sequence lengths for KV; shape [batch_size + 1] + cumulative sequence lengths for KV; shape [batch_size + 1] q: torch.Tensor input tensor Q; shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] @@ -535,7 +556,7 @@ def fused_attn_fwd_kvpacked( fast_zero_fill: bool, default = True if True, initializes the output tensor O to zero using the fast filling method; if False, uses PyTorch's .fill_() method - qkv_layout: str, default = "qkv_interleaved" + qkv_layout: str, default = "kv_interleaved" layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} attn_bias_type: str, default = "no_bias" type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} @@ -659,7 +680,7 @@ def fused_attn_bwd_kvpacked( attn_scale: float = None, dropout: float = 0.0, fast_zero_fill: bool = True, - qkv_layout: str = "qkv_interleaved", + 
qkv_layout: str = "kv_interleaved", attn_bias_type: str = "no_bias", attn_mask_type: str = "padding", ) -> Tuple[Union[torch.Tensor, None], ...]: @@ -668,13 +689,15 @@ def fused_attn_bwd_kvpacked( Parameters ---------- max_seqlen_q: int - max sequence length for Q, used for padding; may be larger than max(cu_seqlens_q) + max sequence length for Q, used for padding; may be larger than max(seqlens_q), + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] max_seqlen_kv: int - max sequence length for KV, used for padding; may be larger than max(cu_seqlens_kv) + max sequence length for KV, used for padding; may be larger than max(seqlens_kv), + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] cu_seqlens_q: torch.Tensor - accumulative sequence lengths for Q; shape [batch_size + 1] + cumulative sequence lengths for Q; shape [batch_size + 1] cu_seqlens_kv: torch.Tensor - accumulative sequence lengths for KV; shape [batch_size + 1] + cumulative sequence lengths for KV; shape [batch_size + 1] q: torch.Tensor input tensor Q; shape [total_seqs_q, num_heads, head_dim], where total_seqs_q = cu_seqlens_q[-1] @@ -723,7 +746,7 @@ def fused_attn_bwd_kvpacked( fast_zero_fill: bool, default = True if True, initializes the output tensor O to zero using the fast filling method; if False, uses PyTorch's .fill_() method - qkv_layout: str, default = "qkv_interleaved" + qkv_layout: str, default = "kv_interleaved" layout of QKV; {"qkv_interleaved", "kv_interleaved", "not_interleaved"} attn_bias_type: str, default = "no_bias" type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} @@ -812,3 +835,365 @@ def fused_attn_bwd_kvpacked( return output_tensors # otherwise return (d_q, d_kv), d_bias return output_tensors[:2], output_tensors[2] + +def fused_attn_fwd( + is_training: bool, + max_seqlen_q: int, + max_seqlen_kv: int, + cu_seqlens_q: torch.Tensor, + cu_seqlens_kv: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + qkv_dtype: tex.DType, + 
fused_attention_backend: tex.NVTE_Fused_Attn_Backend, + attn_bias: torch.Tensor = None, + d_scale_qkv: torch.Tensor = None, + q_scale_s: torch.Tensor = None, + q_scale_o: torch.Tensor = None, + amax_s: torch.Tensor = None, + amax_o: torch.Tensor = None, + attn_scale: float = None, + dropout: float = 0.0, + fast_zero_fill: bool = True, + qkv_layout: str = "sbh3d", + attn_bias_type: str = "no_bias", + attn_mask_type: str = "padding", + rng_gen: torch.Generator = None, +) -> Tuple[Union[torch.Tensor, None], ...]: + """Fused Attention FWD for separate QKV input. + + Parameters + ---------- + is_training: bool + if True, runs training and produces auxiliary tensors aux_ctx_tensors + for the backward; if False, runs inference and doesn't produce aux_ctx_tensors + max_seqlen_q: int + max sequence length for Q, used for padding; + may be larger than max(seqlens_q), + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] + max_seqlen_kv: int + max sequence length for K and V, used for padding; + may be larger than max(seqlens_kv), + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] + cu_seqlens_q: torch.Tensor + cumulative sequence lengths for Q; shape [batch_size + 1] + cu_seqlens_kv: torch.Tensor + cumulative sequence lengths for K and V; shape [batch_size + 1] + q: torch.Tensor + input tensor Q; + shape [total_seqs_q, num_heads, head_dim], + where total_seqs_q = cu_seqlens_q[-1], + or [batch_size, seqlen_q, num_heads, head_dim], + or [seqlen_q, batch_size, num_heads, head_dim] + k: torch.Tensor + input tensor K; + shape [total_seqs_kv, num_heads, head_dim], + where total_seqs_kv = cu_seqlens_kv[-1], + or [batch_size, seqlen_kv, num_heads, head_dim], + or [seqlen_kv, batch_size, num_heads, head_dim] + v: torch.Tensor + input tensor V; + shape [total_seqs_kv, num_heads, head_dim], + where total_seqs_kv = cu_seqlens_kv[-1], + or [batch_size, seqlen_kv, num_heads, head_dim], + or [seqlen_kv, batch_size, num_heads, head_dim] + qkv_dtype: tex.DType + data type of Q, K and V; in 
tex.DType, not torch.dtype + fused_attention_backend: tex.NVTE_Fused_Attn_Backend + please see FusedAttention module for details on supported backends. + attn_bias: torch.Tensor, default = None + input tensor Bias when attn_bias_type is "pre_scale_bias" or "post_scale_bias"; + shape [1, num_heads, max_seqlen_q, max_seqlen_kv], same data type as q, k and v + d_scale_qkv: torch.Tensor, default = None + input tensor for the dequantization of Q, K and V in FP8 computations + q_scale_s: torch.Tensor, default = None + input tensor for the quantization of S in FP8 computations, S = Softmax(Q * K.T) + q_scale_o: torch.Tensor, default = None + input tensor for the quantization of O in FP8 computations + amax_s: torch.Tensor, default = None + output tensor, amax of S, used by the next iteration in FP8 computations + amax_o: torch.Tensor, default = None + output tensor, amax of O, used by the next iteration in FP8 computations + attn_scale: float, default = None + if not None, use attn_scale as the attention scale for Q*K.T BMM; + if None, use 1.0/sqrt(head_dim) as the default + dropout: float, default = 0.0 + dropout probability, 0.0 means no dropout, 1.0 means no output; + dropout must be 0.0 if is_training is False + fast_zero_fill: bool, default = True + if True, initializes the output tensor O to zero using the fast filling method; + if False, uses PyTorch's .fill_() method + qkv_layout: str, default = "sbh3d" + layout of Q, K and V; + {"sb3hd", "sbh3d", "sbhd_sb2hd", "sbhd_sbh2d", "sbhd_sbhd_sbhd", + "bs3hd", "bsh3d", "bshd_bs2hd", "bshd_bsh2d", "bshd_bshd_bshd", + "t3hd", "th3d", "thd_t2hd", "thd_th2d", "thd_thd_thd"} + attn_bias_type: str, default = "no_bias" + type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} + attn_mask_type: str, default = "padding" + type of the attention mask; {"padding", "causal", "no_mask"} + rng_gen: torch.Generator, default = None + random number generator; + if None, uses the default CUDA generator from PyTorch; otherwise, 
uses rng_gen + + Returns + ---------- + o: torch.Tensor + output tensor O, of the attention calculation; same data type as Q, K and V; + same shape as Q + aux_ctx_tensors: List[torch.Tensor] + auxiliary output tensors used for the backward; + if is_training is True, aux_ctx_tensors = [softmax-related tensors, rng_state] + if is_training is False, aux_ctx_tensors is empty + + softmax-related tensors: + 1. if fused_attention_backend == FusedAttnBackend["F16_max512_seqlen"] + softmax: torch.Tensor + Softmax(Q*K.T) + shape [batch_size, num_heads, max_seqlen_q, max_seqlen_kv], dtype float32 + 2. if fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"] + softmaxStats: torch.Tensor + log(sum(e^(x - max(x)))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen_q, 1], dtype float32 + 3. if fused_attention_backend == FusedAttnBackend["FP8"] + M: torch.Tensor + max(Q*K.T) + shape [batch_size, num_heads, max_seqlen_q, 1], dtype float32 + ZInv: torch.Tensor + 1/sum(e^(x - max(x))), where x=Q*K.T + shape [batch_size, num_heads, max_seqlen_q, 1], dtype float32 + rng_state: torch.Tensor, optional, if backend is not F16_max512_seqlen + state of the random number generator; + [seed, offset], dtype uint64 + """ + + check_cu_seqlens(cu_seqlens_q) + check_cu_seqlens(cu_seqlens_kv) + assert (cu_seqlens_q.numel() == cu_seqlens_kv.numel() + ), "cu_seqlens_q and cu_seqlens_kv must have the same length." + h = q.shape[-2] + d = q.shape[-1] + + if attn_scale is None: + attn_scale = 1.0 / math.sqrt(d) + + if attn_bias_type != "no_bias": + assert (attn_bias is not None + ), "attn_bias tensor cannot be None when attn_bias_type is not no_bias." + assert (attn_bias.shape == torch.Size([1, h, max_seqlen_q, max_seqlen_kv]) + ), "attn_bias tensor must be in [1, h, max_seqlen_q, max_seqlen_kv] shape." + assert (attn_bias.dtype == q.dtype + ), "attn_bias tensor must be in the same dtype as q and kv."
+ + assert (fused_attention_backend != FusedAttnBackend["No_Backend"] + ), "Fused attention does not support this input combination." + + # BF16/FP16 fused attention API from fmha_v1 apex + if fused_attention_backend == FusedAttnBackend["F16_max512_seqlen"]: + rng_elts_per_thread = (max_seqlen_q * max_seqlen_kv + + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA + + # BF16/FP16 fused attention API from fmha_v2 + if fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"]: + rng_elts_per_thread = BACKEND_F16arb_ELTS_PER_THREADS + + # FP8 fused attention API from fmha_v2 + if fused_attention_backend == FusedAttnBackend["FP8"]: + rng_elts_per_thread = (max_seqlen_q * max_seqlen_q + + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA + + # execute kernel + output_tensors = tex.fused_attn_fwd( + max_seqlen_q, max_seqlen_kv, is_training, attn_scale, dropout, fast_zero_fill, + QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], + cu_seqlens_q, cu_seqlens_kv, q, k, v, qkv_dtype, + d_scale_qkv, q_scale_s, q_scale_o, amax_s, amax_o, + attn_bias, rng_gen, rng_elts_per_thread, + ) + + # out, aux_ctx_tensors + return output_tensors[0], output_tensors[1:] + + +def fused_attn_bwd( + max_seqlen_q: int, + max_seqlen_kv: int, + cu_seqlens_q: torch.Tensor, + cu_seqlens_kv: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + o: torch.Tensor, + d_o: torch.Tensor, + qkv_dtype: tex.DType, + aux_ctx_tensors: List[torch.Tensor], + fused_attention_backend: tex.NVTE_Fused_Attn_Backend, + d_scale_qkv: torch.Tensor = None, + d_scale_s: torch.Tensor = None, + d_scale_o: torch.Tensor = None, + d_scale_do: torch.Tensor = None, + q_scale_s: torch.Tensor = None, + q_scale_dp: torch.Tensor = None, + q_scale_dqkv: torch.Tensor = None, + amax_dp: torch.Tensor = None, + amax_dqkv: torch.Tensor = None, + attn_scale: float = None, + dropout: float = 0.0, + fast_zero_fill: bool = True, 
+ qkv_layout: str = "sbh3d", + attn_bias_type: str = "no_bias", + attn_mask_type: str = "padding", +) -> Tuple[Union[torch.Tensor, None], ...]: + """Fused Attention BWD for separate QKV input. + + Parameters + ---------- + max_seqlen_q: int + max sequence length for Q, used for padding; may be larger than max(seqlens_q), + seqlens_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1] + max_seqlen_kv: int + max sequence length for K and V, used for padding; + may be larger than max(seqlens_kv), + seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1] + cu_seqlens_q: torch.Tensor + cumulative sequence lengths for Q; shape [batch_size + 1] + cu_seqlens_kv: torch.Tensor + cumulative sequence lengths for K and V; shape [batch_size + 1] + q: torch.Tensor + input tensor Q; + shape [total_seqs_q, num_heads, head_dim], + where total_seqs_q = cu_seqlens_q[-1], + or [batch_size, seqlen_q, num_heads, head_dim], + or [seqlen_q, batch_size, num_heads, head_dim] + k: torch.Tensor + input tensor K; + shape [total_seqs_kv, num_heads, head_dim], + where total_seqs_kv = cu_seqlens_kv[-1], + or [batch_size, seqlen_kv, num_heads, head_dim], + or [seqlen_kv, batch_size, num_heads, head_dim] + v: torch.Tensor + input tensor V; + shape [total_seqs_kv, num_heads, head_dim], + where total_seqs_kv = cu_seqlens_kv[-1], + or [batch_size, seqlen_kv, num_heads, head_dim], + or [seqlen_kv, batch_size, num_heads, head_dim] + o: torch.Tensor + input tensor O (output of forward); same data type as Q, K and V; + same shape as Q + d_o: torch.Tensor + input tensor dO (gradient of O); same data type as Q, K and V; + same shape as Q + qkv_dtype: tex.DType + data type of Q, K and V; in tex.DType, not torch.dtype + aux_ctx_tensors: List[torch.Tensor] + auxiliary output tensors of the forward pass when its is_training is True, + e.g. aux_ctx_tensors = [M, ZInv, rng_state] + fused_attention_backend: tex.NVTE_Fused_Attn_Backend + please see FusedAttention module for details on supported backends.
+ d_scale_qkv: torch.Tensor, default = None + input tensor for the dequantization of Q, K and V in FP8 computations + d_scale_s: torch.Tensor, default = None + input tensor for the dequantization of S in FP8 computations, S = Softmax(Q * K.T) + d_scale_o: torch.Tensor, default = None + input tensor for the dequantization of O in FP8 computations + d_scale_do: torch.Tensor, default = None + input tensor for the dequantization of dO in FP8 computations + q_scale_s: torch.Tensor, default = None + input tensor for the quantization of S in FP8 computations + q_scale_dp: torch.Tensor, default = None + input tensor for the quantization of dP in FP8 computations, P = Q * K.T + q_scale_dqkv: torch.Tensor, default = None + input tensor for the quantization of dQ, dK and dV in FP8 computations + amax_dp: torch.Tensor, default = None + output tensor, amax of dP, used by the next iteration in FP8 computations, + P = Q * K.T + amax_dqkv: torch.Tensor, default = None + output tensor, amax of dQ, dK and dV, used by the next iteration in FP8 computations + attn_scale: float, default = None + if not None, use attn_scale as the attention scale for Q*K.T BMM; + if None, use 1.0/sqrt(head_dim) as the default + dropout: float, default = 0.0 + dropout probability, 0.0 means no dropout, 1.0 means no output; + dropout must be 0.0 if is_training is False + fast_zero_fill: bool, default = True + if True, initializes the output tensor O to zero using the fast filling method; + if False, uses PyTorch's .fill_() method + qkv_layout: str, default = "sbh3d" + layout of Q, K and V; + {"sb3hd", "sbh3d", "sbhd_sb2hd", "sbhd_sbh2d", "sbhd_sbhd_sbhd", + "bs3hd", "bsh3d", "bshd_bs2hd", "bshd_bsh2d", "bshd_bshd_bshd", + "t3hd", "th3d", "thd_t2hd", "thd_th2d", "thd_thd_thd"} + attn_bias_type: str, default = "no_bias" + type of the bias; {"no_bias", "pre_scale_bias", "post_scale_bias"} + attn_mask_type: str, default = "padding" + type of the attention mask; {"padding", "causal", "no_mask"} + + Returns + 
---------- + d_q: torch.Tensor + gradient tensor of Q; same data type and shape as Q + d_k: torch.Tensor + gradient tensor of K; same data type and shape as K + d_v: torch.Tensor + gradient tensor of V; same data type and shape as V + d_bias: torch.Tensor, optional + gradient tensor of Bias when attn_bias_type is "pre_scale_bias" + or "post_scale_bias"; same data type and shape as Bias + """ + + check_cu_seqlens(cu_seqlens_q) + check_cu_seqlens(cu_seqlens_kv) + assert (cu_seqlens_q.numel() == cu_seqlens_kv.numel() + ), "cu_seqlens_q and cu_seqlens_kv must have the same length." + b = cu_seqlens_q.numel() - 1 + h = q.shape[-2] + d = q.shape[-1] + + if attn_scale is None: + attn_scale = 1.0 / math.sqrt(d) + + assert (fused_attention_backend != FusedAttnBackend["No_Backend"] + ), "Fused attention does not support this input combination." + + if fused_attention_backend != FusedAttnBackend["F16_max512_seqlen"]: + assert (len(aux_ctx_tensors) >= 1 + ), "aux_ctx_tensors must contain rng_state as its last element." + rng_state = aux_ctx_tensors[-1] + check_rng_state(rng_state) + + if fused_attention_backend == FusedAttnBackend["FP8"]: + assert (d_scale_qkv is not None), "d_scale_qkv is required for FP8 fused attention." + assert (d_scale_s is not None), "d_scale_s is required for FP8 fused attention." + assert (d_scale_o is not None), "d_scale_o is required for FP8 fused attention." + assert (d_scale_do is not None), "d_scale_do is required for FP8 fused attention." + assert (q_scale_s is not None), "q_scale_s is required for FP8 fused attention." + assert (q_scale_dp is not None), "q_scale_dp is required for FP8 fused attention." + assert (q_scale_dqkv is not None), "q_scale_dqkv is required for FP8 fused attention." + assert (amax_dp is not None), "amax_dp is required for FP8 fused attention." + assert (amax_dqkv is not None), "amax_dqkv is required for FP8 fused attention." 
+ assert (len(aux_ctx_tensors) == 3 + ), "aux_ctx_tensors is required to be [M, ZInv, rng_state] for FP8 fused attention." + check_scalar(d_scale_qkv) + check_scalar(d_scale_s) + check_scalar(d_scale_o) + check_scalar(d_scale_do) + check_scalar(q_scale_s) + check_scalar(q_scale_dp) + check_scalar(q_scale_dqkv) + check_scalar(amax_dp) + check_scalar(amax_dqkv) + m, z_inv = aux_ctx_tensors[:2] + check_stats(m, b, h, max_seqlen_q) + check_stats(z_inv, b, h, max_seqlen_q) + + # execute kernel + output_tensors = tex.fused_attn_bwd( + max_seqlen_q, max_seqlen_kv, attn_scale, dropout, fast_zero_fill, + QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], + cu_seqlens_q, cu_seqlens_kv, q, k, v, o, d_o, qkv_dtype, aux_ctx_tensors, + d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, + q_scale_s, q_scale_dp, q_scale_dqkv, amax_dp, amax_dqkv, + ) + + return tuple(output_tensors) diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index d06906b5a2..274a523ec0 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -106,6 +106,52 @@ std::vector fused_attn_bwd_kvpacked( c10::optional amax_dP, c10::optional amax_dQKV); +std::vector fused_attn_fwd( + size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, + float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor K, + const at::Tensor V, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen, + size_t rng_elts_per_thread); + +std::vector fused_attn_bwd( + size_t max_seqlen_q, size_t max_seqlen_kv, + float attn_scale, 
float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor K, + const at::Tensor V, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV); + at::Tensor fa_prepare_fwd(at::Tensor qkvi); at::Tensor fa_prepare_bwd(at::Tensor q, at::Tensor k, at::Tensor v); diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cu b/transformer_engine/pytorch/csrc/extensions/attention.cu index d2b91cc194..4f2d958f13 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cu +++ b/transformer_engine/pytorch/csrc/extensions/attention.cu @@ -717,6 +717,444 @@ std::vector fused_attn_bwd_kvpacked( return {dQ, dKV, dBias}; } +// fused attention FWD with separate Q, K and V tensors +std::vector fused_attn_fwd( + size_t max_seqlen_q, size_t max_seqlen_kv, + bool is_training, float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor K, + const at::Tensor V, + const transformer_engine::DType qkv_type, + const c10::optional descale_QKV, + const c10::optional scale_S, + const c10::optional scale_O, + c10::optional amax_S, + c10::optional amax_O, + const c10::optional Bias, + const c10::optional rng_gen, + size_t rng_elts_per_thread) { + using namespace transformer_engine; + + auto q_sizes = Q.sizes().vec(); + std::vector q_shape{q_sizes.begin(), q_sizes.end()}; + auto k_sizes 
= K.sizes().vec(); + std::vector k_shape{k_sizes.begin(), k_sizes.end()}; + auto v_sizes = V.sizes().vec(); + std::vector v_shape{v_sizes.begin(), v_sizes.end()}; + + // create output tensor O + auto O = torch::empty_like(Q); + + // construct NVTE tensors + TensorWrapper te_Q, te_K, te_V, te_S, te_O, te_Bias; + TensorWrapper te_cu_seqlens_q, te_cu_seqlens_kv; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + auto h = Q.size(-2); + auto d = Q.size(-1); + if (set_zero && ((h * d) % block_size == 0)) { + mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + O.fill_(0); + } + if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_K = makeTransformerEngineTensor(K.data_ptr(), k_shape, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_V = makeTransformerEngineTensor(V.data_ptr(), v_shape, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + at::Tensor descale_S = torch::empty_like(scale_S.value()); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, amax_S.value().data_ptr(), + scale_S.value().data_ptr(), descale_S.data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, + qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, nullptr); + te_K = makeTransformerEngineTensor(K.data_ptr(), k_shape, + qkv_type, nullptr, nullptr, nullptr); + te_V = 
makeTransformerEngineTensor(V.data_ptr(), v_shape, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); + } + if ((bias_type != NVTE_NO_BIAS) && (Bias.has_value())) { + auto bias_sizes = Bias.value().sizes().vec(); + std::vector bias_shape{bias_sizes.begin(), bias_sizes.end()}; + te_Bias = makeTransformerEngineTensor(Bias.value().data_ptr(), bias_shape, + DType::kFloat32, nullptr, nullptr, nullptr); + } + auto cu_seqlens_q_sizes = cu_seqlens_q.sizes().vec(); + std::vector cu_seqlens_q_shape{cu_seqlens_q_sizes.begin(), cu_seqlens_q_sizes.end()}; + auto cu_seqlens_kv_sizes = cu_seqlens_kv.sizes().vec(); + std::vector cu_seqlens_kv_shape{cu_seqlens_kv_sizes.begin(), cu_seqlens_kv_sizes.end()}; + te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), cu_seqlens_q_shape, + DType::kInt32, nullptr, nullptr, nullptr); + te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), cu_seqlens_kv_shape, + DType::kInt32, nullptr, nullptr, nullptr); + + // extract rng seed and offset + auto gen = at::get_generator_or_default( + rng_gen, at::cuda::detail::getDefaultCUDAGenerator()); + at::PhiloxCudaState philox_args = init_philox_state(gen, rng_elts_per_thread); + auto options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options); + unpack<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( + philox_args, static_cast(rng_state.data_ptr())); + auto te_rng_state = makeTransformerEngineTensor(rng_state); + + // create auxiliary output tensors + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes 
and dtypes + nvte_fused_attn_fwd( + te_Q.data(), + te_K.data(), + te_V.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + te_rng_state.data(), + max_seqlen_q, max_seqlen_kv, + is_training, attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace and auxiliary output tensors + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), + workspace.shape(), workspace.dtype()); + + // output_tensors = [O, nvte_aux_tensor_pack.tensors] + std::vector output_tensors; + output_tensors.push_back(O); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + // allocate memory for nvte_aux_tensor_pack.tensors + at::Tensor output_tensor; + if (nvte_aux_tensor_pack.size >= 2) { + output_tensor = (i < nvte_aux_tensor_pack.size-1) + ? 
allocateSpace(tensor->data.shape, tensor->data.dtype, false) : rng_state; + } else { + output_tensor = allocateSpace(tensor->data.shape, tensor->data.dtype, false); + } + output_tensors.push_back(output_tensor); + tensor->data.dptr = output_tensor.data_ptr(); + } + + // execute the kernel + nvte_fused_attn_fwd( + te_Q.data(), + te_K.data(), + te_V.data(), + te_Bias.data(), + te_S.data(), + te_O.data(), + &nvte_aux_tensor_pack, + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + te_rng_state.data(), + max_seqlen_q, max_seqlen_kv, + is_training, attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers, but not allocated memory + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + // if training, [O, softmax-related tensors, rng_state]; if inference, [O] + return output_tensors; +} + +// fused attention BWD with separate Q, K and V +std::vector fused_attn_bwd( + size_t max_seqlen_q, size_t max_seqlen_kv, + float attn_scale, float p_dropout, bool set_zero, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, + const at::Tensor cu_seqlens_q, + const at::Tensor cu_seqlens_kv, + const at::Tensor Q, + const at::Tensor K, + const at::Tensor V, + const at::Tensor O, + const at::Tensor dO, + const transformer_engine::DType qkv_type, + const std::vector Aux_CTX_Tensors, + const c10::optional descale_QKV, + const c10::optional descale_S, + const c10::optional descale_O, + const c10::optional descale_dO, + const c10::optional scale_S, + const c10::optional scale_dP, + const c10::optional scale_dQKV, + c10::optional amax_dP, + c10::optional amax_dQKV) { + using namespace transformer_engine; + + auto q_sizes = Q.sizes().vec(); + std::vector q_shape{q_sizes.begin(), q_sizes.end()}; + auto k_sizes = K.sizes().vec(); + std::vector k_shape{k_sizes.begin(), k_sizes.end()}; + auto v_sizes = V.sizes().vec(); + std::vector v_shape{v_sizes.begin(), 
v_sizes.end()}; + auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + + at::Tensor dQ; + at::Tensor dK; + at::Tensor dV; + at::Tensor dQKV, dKV; + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + std::vector tmp_shape; + switch (layout_group) { + case NVTE_QKV_Layout_Group::NVTE_3HD: + tmp_shape = std::vector{q_sizes.begin(), q_sizes.end()}; + tmp_shape.insert(tmp_shape.begin() + tmp_shape.size() - 2, int64_t(3)); + dQKV = torch::empty(c10::IntArrayRef(tmp_shape), options); + dQ = dQKV.index({"...", torch::indexing::Slice(0, 1, 1), + torch::indexing::Slice(0, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 3); + dK = dQKV.index({"...", torch::indexing::Slice(1, 2, 1), + torch::indexing::Slice(0, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 3); + dV = dQKV.index({"...", torch::indexing::Slice(2, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 3); + break; + case NVTE_QKV_Layout_Group::NVTE_H3D: + tmp_shape = std::vector{q_sizes.begin(), q_sizes.end()}; + tmp_shape.insert(tmp_shape.begin() + tmp_shape.size() - 1, int64_t(3)); + dQKV = torch::empty(c10::IntArrayRef(tmp_shape), options); + dQ = dQKV.index({"...", torch::indexing::Slice(0, 1, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); + dK = dQKV.index({"...", torch::indexing::Slice(1, 2, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); + dV = dQKV.index({"...", torch::indexing::Slice(2, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); + break; + case NVTE_QKV_Layout_Group::NVTE_HD_2HD: + dQ = torch::empty_like(Q); + tmp_shape = std::vector{k_sizes.begin(), 
k_sizes.end()}; + tmp_shape.insert(tmp_shape.begin() + tmp_shape.size() - 2, int64_t(2)); + dKV = torch::empty(c10::IntArrayRef(tmp_shape), options); + dK = dKV.index({"...", torch::indexing::Slice(0, 1, 1), + torch::indexing::Slice(0, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 3); + dV = dKV.index({"...", torch::indexing::Slice(1, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 3); + break; + case NVTE_QKV_Layout_Group::NVTE_HD_H2D: + dQ = torch::empty_like(Q); + tmp_shape = std::vector{k_sizes.begin(), k_sizes.end()}; + tmp_shape.insert(tmp_shape.begin() + tmp_shape.size() - 1, int64_t(2)); + dKV = torch::empty(c10::IntArrayRef(tmp_shape), options); + dK = dKV.index({"...", torch::indexing::Slice(0, 1, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); + dV = dKV.index({"...", torch::indexing::Slice(1, torch::indexing::None, 1), + torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); + break; + case NVTE_QKV_Layout_Group::NVTE_HD_HD_HD: + dQ = torch::empty_like(Q); + dK = torch::empty_like(K); + dV = torch::empty_like(V); + break; + default: + NVTE_ERROR("QKV layout not supported!"); + } + + at::Tensor dBias; + TensorWrapper te_dBias; + if (bias_type != NVTE_NO_BIAS) { + dBias = torch::empty({1, static_cast(Q.size(-2)), + static_cast(max_seqlen_q), + static_cast(max_seqlen_kv)}, options); + te_dBias = makeTransformerEngineTensor(dBias); + } + + // construct NVTE tensors + TensorWrapper te_Q, te_K, te_V, te_O, te_dO, te_S, te_dP, te_dQ, te_dK, te_dV; + if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { + // FP8 + auto h_q = Q.size(-2); + auto h_kv = K.size(-2); + auto d = Q.size(-1); + if (set_zero + && ((h_q * d) % block_size == 0) + && ((h_kv * d) % block_size == 0) + && dQ.is_contiguous() + 
&& dK.is_contiguous() + && dV.is_contiguous()) { + mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); + mha_fill(dK, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); + mha_fill(dV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); + } else { + dQ.fill_(0); + dK.fill_(0); + dV.fill_(0); + } + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!scale_S.has_value()) || (!scale_dP.has_value()) + || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; + err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); + } + te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_K = makeTransformerEngineTensor(K.data_ptr(), k_shape, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_V = makeTransformerEngineTensor(V.data_ptr(), v_shape, + qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); + te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, + scale_S.value().data_ptr(), descale_S.value().data_ptr()); + at::Tensor descale_dP = torch::empty_like(scale_dP.value()); + te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, + amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), + descale_dP.data_ptr()); + te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), 
nullptr); + te_dK = makeTransformerEngineTensor(dK.data_ptr(), k_shape, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + te_dV = makeTransformerEngineTensor(dV.data_ptr(), v_shape, qkv_type, + amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); + } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { + // BF16 or FP16 + te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, nullptr); + te_K = makeTransformerEngineTensor(K.data_ptr(), k_shape, + qkv_type, nullptr, nullptr, nullptr); + te_V = makeTransformerEngineTensor(V.data_ptr(), v_shape, + qkv_type, nullptr, nullptr, nullptr); + te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, nullptr); + te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, nullptr); + te_S = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dP = makeTransformerEngineTensor(nullptr, {0}, + DType::kFloat32, nullptr, nullptr, nullptr); + te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, + qkv_type, nullptr, nullptr, nullptr); + te_dK = makeTransformerEngineTensor(dK.data_ptr(), k_shape, + qkv_type, nullptr, nullptr, nullptr); + te_dV = makeTransformerEngineTensor(dV.data_ptr(), v_shape, + qkv_type, nullptr, nullptr, nullptr); + } else { + NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); + } + + // create cu_seqlens tensorwrappers + auto cu_seqlens_q_sizes = cu_seqlens_q.sizes().vec(); + std::vector cu_seqlens_q_shape{cu_seqlens_q_sizes.begin(), cu_seqlens_q_sizes.end()}; + auto cu_seqlens_kv_sizes = cu_seqlens_kv.sizes().vec(); + std::vector cu_seqlens_kv_shape{cu_seqlens_kv_sizes.begin(), cu_seqlens_kv_sizes.end()}; + TensorWrapper te_cu_seqlens_q, te_cu_seqlens_kv, te_qkvso_strides; + te_cu_seqlens_q = makeTransformerEngineTensor(cu_seqlens_q.data_ptr(), cu_seqlens_q_shape, + DType::kInt32, nullptr, nullptr, nullptr); + te_cu_seqlens_kv = makeTransformerEngineTensor(cu_seqlens_kv.data_ptr(), cu_seqlens_kv_shape, + DType::kInt32, nullptr, nullptr, nullptr); + + // convert auxiliary tensors from forward to NVTETensors + NVTETensorPack nvte_aux_tensor_pack; + nvte_tensor_pack_create(&nvte_aux_tensor_pack); + nvte_aux_tensor_pack.size = Aux_CTX_Tensors.size(); + for (size_t i = 0; i < nvte_aux_tensor_pack.size; ++i) { + auto tensor = reinterpret_cast(nvte_aux_tensor_pack.tensors[i]); + tensor->data.dptr = Aux_CTX_Tensors[i].data_ptr(); + std::vector tmp(Aux_CTX_Tensors[i].sizes().vec()); + tensor->data.shape = std::vector(tmp.begin(), tmp.end()); + tensor->data.dtype = GetTransformerEngineDType(Aux_CTX_Tensors[i].scalar_type()); + } + + // create workspace + TensorWrapper workspace; + + // populate tensors with appropriate shapes and dtypes + nvte_fused_attn_bwd( + te_Q.data(), + te_K.data(), + te_V.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQ.data(), + te_dK.data(), + te_dV.data(), + te_dBias.data(), + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + max_seqlen_q, max_seqlen_kv, + attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // allocate memory for workspace + auto workspace_data = allocateSpace(workspace.shape(), workspace.dtype()); + workspace = makeTransformerEngineTensor( + workspace_data.data_ptr(), 
+ workspace.shape(), workspace.dtype()); + + // execute kernel + nvte_fused_attn_bwd( + te_Q.data(), + te_K.data(), + te_V.data(), + te_O.data(), + te_dO.data(), + te_S.data(), + te_dP.data(), + &nvte_aux_tensor_pack, + te_dQ.data(), + te_dK.data(), + te_dV.data(), + te_dBias.data(), + te_cu_seqlens_q.data(), + te_cu_seqlens_kv.data(), + max_seqlen_q, max_seqlen_kv, + attn_scale, p_dropout, + qkv_layout, bias_type, attn_mask_type, + workspace.data(), + at::cuda::getCurrentCUDAStream()); + + // destroy tensor wrappers + nvte_tensor_pack_destroy(&nvte_aux_tensor_pack); + + return {dQ, dK, dV, dBias}; +} + namespace flash_attention { constexpr int warp_size = 32; diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 93196962e0..abc15022b0 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -56,6 +56,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Fused Attention FP8/BF16/FP16 FWD with packed KV"); m.def("fused_attn_bwd_kvpacked", &fused_attn_bwd_kvpacked, "Fused Attention FP8/BF16/FP16 BWD with packed KV"); + m.def("fused_attn_fwd", &fused_attn_fwd, + "Fused Attention FP8/BF16/FP16 FWD with separate Q, K and V"); + m.def("fused_attn_bwd", &fused_attn_bwd, + "Fused Attention FP8/BF16/FP16 BWD with separate Q, K and V"); m.def("fp8_transpose", &fp8_transpose, "Transpose with FP8 I/O"); m.def("gelu", &gelu, "GeLU with FP8 output"); m.def("relu", &relu, "ReLU with FP8 output"); @@ -148,7 +152,22 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::enum_(m, "NVTE_QKV_Layout") .value("NVTE_NOT_INTERLEAVED", NVTE_QKV_Layout::NVTE_NOT_INTERLEAVED) .value("NVTE_QKV_INTERLEAVED", NVTE_QKV_Layout::NVTE_QKV_INTERLEAVED) - .value("NVTE_KV_INTERLEAVED", NVTE_QKV_Layout::NVTE_KV_INTERLEAVED); + .value("NVTE_KV_INTERLEAVED", NVTE_QKV_Layout::NVTE_KV_INTERLEAVED) + .value("NVTE_SB3HD", NVTE_QKV_Layout::NVTE_SB3HD) + 
.value("NVTE_SBH3D", NVTE_QKV_Layout::NVTE_SBH3D) + .value("NVTE_SBHD_SB2HD", NVTE_QKV_Layout::NVTE_SBHD_SB2HD) + .value("NVTE_SBHD_SBH2D", NVTE_QKV_Layout::NVTE_SBHD_SBH2D) + .value("NVTE_SBHD_SBHD_SBHD", NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD) + .value("NVTE_BS3HD", NVTE_QKV_Layout::NVTE_BS3HD) + .value("NVTE_BSH3D", NVTE_QKV_Layout::NVTE_BSH3D) + .value("NVTE_BSHD_BS2HD", NVTE_QKV_Layout::NVTE_BSHD_BS2HD) + .value("NVTE_BSHD_BSH2D", NVTE_QKV_Layout::NVTE_BSHD_BSH2D) + .value("NVTE_BSHD_BSHD_BSHD", NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD) + .value("NVTE_T3HD", NVTE_QKV_Layout::NVTE_T3HD) + .value("NVTE_TH3D", NVTE_QKV_Layout::NVTE_TH3D) + .value("NVTE_THD_T2HD", NVTE_QKV_Layout::NVTE_THD_T2HD) + .value("NVTE_THD_TH2D", NVTE_QKV_Layout::NVTE_THD_TH2D) + .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD); py::enum_(m, "NVTE_Fused_Attn_Backend") .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index d4046ec7da..8ac14758e7 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -74,6 +74,7 @@ class TransformerLayer(torch.nn.Module): are deprecated and will be fully removed in future releases. .. note:: + Argument :attr:`attention_mask` will be ignored in the `forward` call when :attr:`self_attn_mask_type` is set to `"causal"`. 
@@ -624,5 +625,5 @@ def forward( if self.output_layernorm: output = self.layernorm(output) - # output: [b, s, h] + # output: [s, b, h] return output From f575ff935c54307fffcdf6b051f8eba105fb02e2 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 26 Sep 2023 22:47:55 -0700 Subject: [PATCH 058/427] Add release to deprecation warnings (#447) Change deprecation warnings Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/jax/__init__.py | 30 ++++++++++++------- transformer_engine/pytorch/module/base.py | 4 +-- .../pytorch/module/layernorm_linear.py | 8 ++--- transformer_engine/pytorch/module/linear.py | 8 ++--- transformer_engine/pytorch/transformer.py | 4 +-- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/transformer_engine/jax/__init__.py b/transformer_engine/jax/__init__.py index 0459402172..793e6c3f8b 100644 --- a/transformer_engine/jax/__init__.py +++ b/transformer_engine/jax/__init__.py @@ -10,29 +10,39 @@ extend_logical_axis_rules = deprecate_wrapper( flax.extend_logical_axis_rules, - "extend_logical_axis_rules is moving to transformer_engine.jax.flax module") + "extend_logical_axis_rules is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") DenseGeneral = deprecate_wrapper(flax.DenseGeneral, - "DenseGeneral is moving to transformer_engine.jax.flax module") + "DenseGeneral is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") LayerNorm = deprecate_wrapper(flax.LayerNorm, - "LayerNorm is moving to transformer_engine.jax.flax module") + "LayerNorm is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") LayerNormDenseGeneral = deprecate_wrapper( flax.LayerNormDenseGeneral, - "LayerNormDenseGeneral is moving to transformer_engine.jax.flax module") + "LayerNormDenseGeneral is moving to transformer_engine.jax.flax module" + " and will be fully removed in the 
next release (v1.0.0).") LayerNormMLP = deprecate_wrapper(flax.LayerNormMLP, - "LayerNormMLP is moving to transformer_engine.jax.flax module") + "LayerNormMLP is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") TransformerEngineBase = deprecate_wrapper( flax.TransformerEngineBase, - "TransformerEngineBase is moving to transformer_engine.jax.flax module") + "TransformerEngineBase is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") MultiHeadAttention = deprecate_wrapper( - flax.MultiHeadAttention, "MultiHeadAttention is moving to transformer_engine.jax.flax module") + flax.MultiHeadAttention, "MultiHeadAttention is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") RelativePositionBiases = deprecate_wrapper( flax.RelativePositionBiases, - "RelativePositionBiases is moving to transformer_engine.jax.flax module") + "RelativePositionBiases is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") TransformerLayer = deprecate_wrapper( - flax.TransformerLayer, "TransformerLayer is moving to transformer_engine.jax.flax module") + flax.TransformerLayer, "TransformerLayer is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") TransformerLayerType = deprecate_wrapper( flax.TransformerLayerType, - "TransformerLayerType is moving to transformer_engine.jax.flax module") + "TransformerLayerType is moving to transformer_engine.jax.flax module" + " and will be fully removed in the next release (v1.0.0).") __all__ = [ 'fp8_autocast', 'update_collections', 'update_fp8_metas', 'get_delayed_scaling', diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 50d7b9f2fb..8bb9d55f38 100644 --- a/transformer_engine/pytorch/module/base.py +++ 
b/transformer_engine/pytorch/module/base.py @@ -366,7 +366,7 @@ def set_extra_state(self, state: torch.Tensor) -> None: if isinstance(state, list): warnings.warn( "This checkpoint format is deprecated and will be" - "removed in a future release of Transformer Engine" + "removed in the next release (v1.0.0)." ) # Retrieve checkpointed items. @@ -412,7 +412,7 @@ def set_extra_state(self, state: torch.Tensor) -> None: else: warnings.warn( "This checkpoint format is deprecated and will be" - "removed in a future release of Transformer Engine" + "removed in the next release (v1.0.0)." ) # Load extra items. self.fp8_meta.update(state["extra_fp8_variables"]) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 761b0abf6b..b7372f81fe 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -514,7 +514,7 @@ class LayerNormLinear(TransformerEngineBaseModule): .. warning:: Argument :attr:`skip_weight_param_allocation` is deprecated and will - be fully removed in future releases. + be fully removed in the next release (v1.0.0). Parameters ---------- @@ -622,7 +622,7 @@ def __init__( if skip_weight_param_allocation: warnings.warn( "Argument `skip_weight_param_allocation` is deprecated and" - "will be fully removed in future releases. It is ignored" + "will be fully removed in the next release (v1.0.0). It is ignored" "starting from v0.11.", category=DeprecationWarning, ) @@ -827,7 +827,7 @@ def forward( .. warning:: Arguments :attr:`weight` and :attr:`bias` are deprecated and will - be fully removed in future releases. + be fully removed in the next release (v1.0.0). Parameters ---------- @@ -851,7 +851,7 @@ def forward( if weight is not None or bias is not None: raise RuntimeError( "Arguments `weight` and `bias` are deprecated and " - "will be fully removed in future releases." 
+ "will be fully removed in the next release (v1.0.0)." ) with self.prepare_forward(inp, is_first_microbatch) as inp: diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 45a163966b..98ca2015ed 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -448,7 +448,7 @@ class Linear(TransformerEngineBaseModule): .. warning:: Argument :attr:`skip_weight_param_allocation` is deprecated and will - be fully removed in future releases. + be fully removed in the next release (v1.0.0). Parameters ---------- @@ -535,7 +535,7 @@ def __init__( if skip_weight_param_allocation: warnings.warn( "Argument `skip_weight_param_allocation` is deprecated and" - "will be fully removed in future releases. It has ignored" + "will be fully removed in the next release (v1.0.0). It has ignored" "starting from v0.11.", category=DeprecationWarning, ) @@ -701,7 +701,7 @@ def forward( .. warning:: Arguments :attr:`weight` and :attr:`bias` are deprecated and will - be fully removed in future releases. + be fully removed in the next release (v1.0.0). Parameters ---------- @@ -725,7 +725,7 @@ def forward( if weight is not None or bias is not None: raise RuntimeError( "Arguments `weight` and `bias` are deprecated and " - "will be fully removed in future releases." + "will be fully removed in the next release (v1.0.0)." ) with self.prepare_forward(inp, is_first_microbatch) as inp: diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 8ac14758e7..d8a1aa1ad2 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -71,7 +71,7 @@ class TransformerLayer(torch.nn.Module): .. warning:: Arguments :attr:`attention_softmax_in_fp32` and :attr:`apply_query_key_layer_scaling` - are deprecated and will be fully removed in future releases. + are deprecated and will be fully removed in the next release (v1.0.0). 
.. note:: @@ -247,7 +247,7 @@ def __init__( warnings.warn( "Arguments `attention_softmax_in_fp32` and `apply_query_key_layer_scaling`" - "are deprecated and will be fully removed in future releases.", + "are deprecated and will be fully removed in the next release (v1.0.0).", category=DeprecationWarning, ) From dfd29c48fe61e9fe419bb02710b53f064c39d1a3 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 26 Sep 2023 22:48:09 -0700 Subject: [PATCH 059/427] Keep previous FA version (#450) Signed-off-by: Kirthi Shankar Sivamani --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bcccd8208f..5959c2b941 100644 --- a/setup.py +++ b/setup.py @@ -290,7 +290,7 @@ def add_unique(l: List[str], vals: Union[str, List[str]]) -> None: # Framework-specific requirements if "pytorch" in frameworks(): - add_unique(install_reqs, ["torch", "flash-attn>=1.0.6, <=2.2.1"]) + add_unique(install_reqs, ["torch", "flash-attn>=1.0.6, <=2.0.4"]) add_unique(test_reqs, ["numpy", "onnxruntime", "torchvision"]) if "jax" in frameworks(): if not found_pybind11(): From 02a4ccceb02309ae1544562edd689b2edcc89696 Mon Sep 17 00:00:00 2001 From: vasunvidia <108759426+vasunvidia@users.noreply.github.com> Date: Thu, 5 Oct 2023 13:20:16 -0700 Subject: [PATCH 060/427] Atomic gemm and FP8 Reduce Scatter (#449) * Initial commit Signed-off-by: Vasudevan Rengasamy * Repro for RS output mismatch with Single GEMM + Split pipelined RS Signed-off-by: Vasudevan Rengasamy * minor changes for AG->GEMM pipelined overlap Signed-off-by: Vasudevan Rengasamy * Add Atomic Gemm cublasApi attributes and initial implementation of AG->Atomic GEMM Signed-off-by: Vasudevan Rengasamy * AtomicGemm+RS functional with workaround Signed-off-by: Vasudevan Rengasamy * add amax update to layernorm_linear for FP8 unit test accuracy Signed-off-by: Vasudevan Rengasamy * Enable reducescatter2_userbuff_strided variants Signed-off-by: Vasudevan Rengasamy * Bug fix Signed-off-by: 
Vasudevan Rengasamy * AG+AtomicGemm overlap functional but gemm doesnt overlap with comm Signed-off-by: Vasudevan Rengasamy * Add userbuffers_sendrecv kernel variants Signed-off-by: Vasudevan Rengasamy * TransformerLayer API changes to enable AtomicGemm+RS overlap Signed-off-by: Vasudevan Rengasamy * Code cleanup Signed-off-by: Vasudevan Rengasamy * Code cleanup2 Signed-off-by: Vasudevan Rengasamy * [UB] AllGather Atomic GEMM overlap using userbuffer_sendrecv kernels Signed-off-by: Vasudevan Rengasamy * Code cleanup + bug fix for multiatomic sendrecv kernel Signed-off-by: Vasudevan Rengasamy * cleanup Signed-off-by: Vasudevan Rengasamy * Bug fixes Signed-off-by: Vasudevan Rengasamy * [UB] Add shuffling for better AG AtomicGEMM overlap Signed-off-by: Vasudevan Rengasamy * Bug fix for AG AtomicGemm overlap Signed-off-by: Vasudevan Rengasamy * Bug fix for multiAtomicAG and singleAtomicAG Signed-off-by: Vasudevan Rengasamy * Use chunk_i+1 as recv_chunk for multiatomic_AG with shuffling Signed-off-by: Vasudevan Rengasamy * Launch AtomicGEMM after first-chunk AG Signed-off-by: Vasudevan Rengasamy * Rebase to main Signed-off-by: Vasudevan Rengasamy * Add FP8 ReduceScatter kernels, AtomicGEMM+FP8 RS not functional Signed-off-by: Vasudevan Rengasamy * Revert "Add FP8 ReduceScatter kernels, AtomicGEMM+FP8 RS not functional" This reverts commit 80a47a76355440cd5fb4314c96fe9fda632d87f9. 
Signed-off-by: Vasudevan Rengasamy * Add support for NVLS-MC and FP8 Reduce Scatter Signed-off-by: Vasudevan Rengasamy * Bug fix Signed-off-by: Vasudevan Rengasamy * Atomic and Multiatomic FP8 RS functional Signed-off-by: Vasudevan Rengasamy * Remove debug print Signed-off-by: Vasudevan Rengasamy * UB comm initialization hang fix Signed-off-by: Vasudevan Rengasamy * Code cleanup Signed-off-by: Vasudevan Rengasamy * Create new GEMM API for Atomic GEMM Signed-off-by: Vasudevan Rengasamy * CI ready Signed-off-by: Kirthi Shankar Sivamani * more fixes Signed-off-by: Kirthi Shankar Sivamani * license Signed-off-by: Kirthi Shankar Sivamani * Bug fix Signed-off-by: Vasudevan Rengasamy * Revert NVLS-MC Signed-off-by: Vasudevan Rengasamy * Check cu* versions for running atomic gemms Signed-off-by: Kirthi Shankar Sivamani * lint Signed-off-by: Kirthi Shankar Sivamani * fixes Signed-off-by: Kirthi Shankar Sivamani * Cleanup Signed-off-by: Vasudevan Rengasamy * Add experimental warning Signed-off-by: Kirthi Shankar Sivamani * Better wording Signed-off-by: Kirthi Shankar Sivamani * Add warning to c api Signed-off-by: Kirthi Shankar Sivamani * Fix wording Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Vasudevan Rengasamy Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Kirthi Shankar Sivamani --- tests/pytorch/test_onnx_export.py | 4 +- .../common/gemm/cublaslt_gemm.cu | 108 +- .../common/include/transformer_engine/gemm.h | 46 + transformer_engine/pytorch/attention.py | 6 + .../pytorch/cpp_extensions/gemm.py | 26 +- .../pytorch/csrc/comm_gemm_overlap.h | 502 ++- transformer_engine/pytorch/csrc/extensions.h | 26 + .../pytorch/csrc/extensions/gemm.cu | 80 + .../pytorch/csrc/extensions/pybind.cpp | 12 +- .../csrc/userbuffers/userbuffers-host.cpp | 186 +- .../pytorch/csrc/userbuffers/userbuffers.cu | 2949 ++++++++++++++--- .../pytorch/csrc/userbuffers/userbuffers.h | 83 + transformer_engine/pytorch/module/base.py | 19 +- .../pytorch/module/layernorm_linear.py | 
69 +- .../pytorch/module/layernorm_mlp.py | 124 +- transformer_engine/pytorch/module/linear.py | 62 +- transformer_engine/pytorch/transformer.py | 20 + 17 files changed, 3619 insertions(+), 703 deletions(-) diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 727ccce3dd..171b2f23c4 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -506,7 +506,7 @@ def forward(self, inp, weight): self.fp8_tensor_weight, self.weights_type) - ret = fp8_gemm( + ret, _ = fp8_gemm( weight_fp8, self.meta_weight.scale_inv, self.fp8_tensor_weight, @@ -1324,7 +1324,7 @@ def forward(self, inp, weight): self.fp8_tensor_weight, self.weights_type) - ret = fp8_gemm( + ret, _ = fp8_gemm( weight_fp8, self.meta_weight.scale_inv, self.fp8_tensor_weight, diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 7f8b0b723d..95ef55bba4 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include "../common.h" @@ -50,6 +51,10 @@ void cublas_gemm(const Tensor *inputA, bool accumulate, bool use_split_accumulator, int math_sm_count, + int m_split, + int n_split, + bool gemm_producer, + const Tensor *inputCounter, cudaStream_t stream ) { void *A = inputA->data.dptr; @@ -63,6 +68,10 @@ void cublas_gemm(const Tensor *inputA, void *bias_ptr = inputBias->data.dptr; const bool bias = bias_ptr != nullptr; void *pre_gelu_out = outputPreGelu->data.dptr; + void *counter = nullptr; + if (inputCounter != nullptr) { + counter = inputCounter->data.dptr; + } const bool gelu = pre_gelu_out != nullptr; const bool use_fp8 = is_fp8_dtype(inputA->data.dtype) || is_fp8_dtype(inputB->data.dtype); @@ -223,6 +232,27 @@ void cublas_gemm(const Tensor *inputA, NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, 
sizeof(epilogue))); +#if CUDA_VERSION >= 12020 && CUBLAS_VERSION >= 120205 + if (counter != nullptr) { + if (m_split == 0) m_split=1; + if (n_split == 0) n_split=1; + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + operationDesc, CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS, + &m_split, sizeof(m_split))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + operationDesc, CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS, + &n_split, sizeof(n_split))); + if (gemm_producer) { + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + operationDesc, CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER, + &counter, sizeof(counter))); + } else { + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + operationDesc, CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER, + &counter, sizeof(counter))); + } + } +#endif NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( @@ -254,7 +284,6 @@ void cublas_gemm(const Tensor *inputA, workspaceSize, stream)); /* stream */ - NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference)); NVTE_CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(Ddesc)); NVTE_CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(Cdesc)); @@ -320,5 +349,82 @@ void nvte_cublas_gemm(const NVTETensor A, wspace->data.shape[0], accumulate, use_split_accumulator, math_sm_count, + 0, + 0, + false, + nullptr, + stream); +} + +void nvte_cublas_atomic_gemm(const NVTETensor A, + const NVTETensor B, + NVTETensor D, + const NVTETensor bias, + NVTETensor pre_gelu_out, + bool transa, + bool transb, + bool grad, + NVTETensor workspace, + bool accumulate, + bool use_split_accumulator, + int math_sm_count, + int m_split, + int n_split, + bool gemm_producer, + const NVTETensor counter, + cudaStream_t stream) { + NVTE_API_CALL(nvte_cublas_atomic_gemm); + + int cudart_version; + NVTE_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version)); + NVTE_CHECK(cudart_version >= 12020, "Cuda version 12.2 is required for atomic gemm."); 
+ NVTE_CHECK(cublasLtGetVersion() >= 120205, "Cublas version 12.2.5 is required for atomic gemm."); + + using namespace transformer_engine; + const Tensor *inputA = reinterpret_cast(A); + const Tensor *inputB = reinterpret_cast(B); + Tensor *outputD = reinterpret_cast(D); + const Tensor *biasTensor = reinterpret_cast(bias); + Tensor *outputGelu = reinterpret_cast(pre_gelu_out); + const Tensor *inputCounter = reinterpret_cast(counter); + Tensor *wspace = reinterpret_cast(workspace); + + const int m = transa ? inputA->data.shape[0] : inputA->data.shape[1]; + const int k = transa ? inputA->data.shape[1] : inputA->data.shape[0]; + const int n = transb ? inputB->data.shape[1] : inputB->data.shape[0]; + int lda, ldb, ldd; + if (transa && !transb) { // TN + lda = k; + ldb = k; + ldd = m; + } else if (!transa && !transb) { // NN + lda = m; + ldb = k; + ldd = m; + } else if (!transa && transb) { // NT + lda = m; + ldb = n; + ldd = m; + } else { // TT + NVTE_ERROR("TT layout not allowed."); + } + + cublas_gemm(inputA, + inputB, + outputD, + biasTensor, + outputGelu, + m, n, k, + lda, ldb, ldd, + (transa) ? CUBLAS_OP_T : CUBLAS_OP_N, + (transb) ? CUBLAS_OP_T : CUBLAS_OP_N, + grad, wspace->data.dptr, + wspace->data.shape[0], + accumulate, use_split_accumulator, + math_sm_count, + m_split, + n_split, + gemm_producer, + inputCounter, stream); } diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 8cd549b658..5faff43afa 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -54,6 +54,52 @@ void nvte_cublas_gemm(const NVTETensor A, cudaStream_t stream ); +/*! \brief Compute matrix multiplication of 2 matrices with chunking and atomic counters. + * + * \warning Cublas atomic gemm uses a beta API and is not tested for all use cases. 
+ * + * Computes: + * - `D = AB` if both `bias` and `pre_gelu_out` are empty tensors + * - `D = AB + bias` if `pre_gelu_out` is empty and `bias` is not empty + * - `D = GELU(AB + bias)` if both `bias` and `pre_gelu_out` are not empty tensors + * + * \param[in] A The A matrix. + * \param[in] B The B matrix. + * \param[in,out] D Output matrix. + * \param[in] bias Bias tensor. + * \param[in,out] pre_gelu_out Output matrix before GELU activation. + * \param[in] transa Whether A matrix is transposed. + * \param[in] transb Whether B matrix is transposed. + * \param[in] grad Whether this operation is part of the + * gradient computation. + * \param[out] workspace Workspace tensor. + * \param[in] accumulate Whether to accumulate the result into the D matrix. + * \param[in] use_split_accumulator Whether to use split accumulator in the FP8 GEMM. + * \param[in] math_sm_count Number of GPU SMs to use (default=0: use cuBLAS heuristics) + * \param[in] m_split Number of chunks/splits along m-dimension for Atomic GEMM. + * \param[in] n_split Number of chunks/splits along n-dimension for Atomic GEMM. + * \param[in] gemm_producer Whether Atomic GEMM is the producer or consumer. + * \param[in,out] counter counter[chunk_i]=0 indicates chunk_i has been produced. + * \param[in] stream CUDA stream used for the operation. 
+ */ +void nvte_cublas_atomic_gemm(const NVTETensor A, + const NVTETensor B, + NVTETensor D, + const NVTETensor bias, + NVTETensor pre_gelu_out, + bool transa, + bool transb, + bool grad, + NVTETensor workspace, + bool accumulate, + bool use_split_accumulator, + int math_sm_count, + int m_split, + int n_split, + bool gemm_producer, + const NVTETensor counter, + cudaStream_t stream +); #ifdef __cplusplus } // extern "C" #endif diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 625cd8644e..3fb67b990a 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1505,6 +1505,8 @@ def __init__( ub_bulk_dgrad: bool = False, ub_split_rs: bool = False, ub_split_ag: bool = False, + ub_atomic_gemm_rs: bool = False, + ub_atomic_gemm_ag: bool = False, bias: bool = True, normalization: str = "LayerNorm", device: Union[torch.device, str] = "cuda", @@ -1585,6 +1587,7 @@ def __init__( ub_bulk_dgrad=ub_bulk_dgrad, ub_split_ag=ub_split_ag, normalization=normalization, + ub_atomic_gemm_ag=ub_atomic_gemm_ag, **common_gemm_kwargs, ) else: @@ -1615,6 +1618,7 @@ def __init__( ub_bulk_dgrad=ub_bulk_dgrad, ub_split_ag=ub_split_ag, normalization=normalization, + ub_atomic_gemm_ag=ub_atomic_gemm_ag, **common_gemm_kwargs, ) else: @@ -1661,6 +1665,8 @@ def __init__( parallel_mode="row" if set_parallel_mode else None, ub_split_rs=ub_split_rs, ub_split_ag=ub_split_ag, + ub_atomic_gemm_rs=ub_atomic_gemm_rs, + ub_atomic_gemm_ag=ub_atomic_gemm_ag, **common_gemm_kwargs, ) diff --git a/transformer_engine/pytorch/cpp_extensions/gemm.py b/transformer_engine/pytorch/cpp_extensions/gemm.py index c84dd1cb39..2d271c950c 100644 --- a/transformer_engine/pytorch/cpp_extensions/gemm.py +++ b/transformer_engine/pytorch/cpp_extensions/gemm.py @@ -92,22 +92,40 @@ def fp8_gemm( assert ub is not None, 'ub object is None!' 
if ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_AG: fn = ub.bulk_overlap - args = tuple(args + (1,)) + extra_output_tensor = ( + empty_tensor if extra_output_tensor is None else extra_output_tensor + ) + args = tuple(args + (1, extra_output_tensor,)) elif ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_RS: fn = ub.bulk_overlap - args = tuple(args + (0,)) + extra_output_tensor = ( + empty_tensor if extra_output_tensor is None else extra_output_tensor + ) + args = tuple(args + (0, extra_output_tensor,)) elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG: fn = ub.split_overlap_ag extra_output_tensor = ( empty_tensor if extra_output_tensor is None else extra_output_tensor ) args = tuple(args + (extra_output_tensor,)) + elif ub_algo == tex.UbufOverlapAlgo.ATOMIC_GEMM_AG: + fn = ub.atomic_gemm_overlap_ag + extra_output_tensor = ( + empty_tensor if extra_output_tensor is None else extra_output_tensor + ) + args = tuple(args + (extra_output_tensor,)) elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS: fn = ub.split_overlap_rs assert ( extra_output_tensor is not None ), 'SPLIT_PIPELINED_RS requires extra output tensor' args = tuple(args + (True, extra_output_tensor,)) + elif ub_algo == tex.UbufOverlapAlgo.ATOMIC_GEMM_RS: + fn = ub.atomic_gemm_overlap_rs + assert ( + extra_output_tensor is not None + ), 'ATOMIC_GEMM_RS requires extra output tensor' + args = tuple(args + (True, extra_output_tensor,)) _ = fn(*args) if return_output: @@ -204,10 +222,10 @@ def gemm( assert ub is not None, 'ub object is None!' 
if ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_AG: fn = ub.bulk_overlap - args = tuple(args + (1,)) + args = tuple(args + (1, empty_tensor)) elif ub_algo == tex.UbufOverlapAlgo.BULK_OVERLAP_RS: fn = ub.bulk_overlap - args = tuple(args + (0,)) + args = tuple(args + (0, empty_tensor)) elif ub_algo == tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG: fn = ub.split_overlap_ag extra_output_tensor = ( diff --git a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h index 5dd71e4758..edac58a9dd 100644 --- a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h +++ b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h @@ -4,30 +4,32 @@ * See LICENSE for license information. ************************************************************************/ +#include "userbuffers/userbuffers.h" #include #include #include #include +#include #include #include #include #include #include #include -#include "userbuffers/userbuffers.h" #define HALF_BYTES 2 #define UB_MAX_SM 32 -#define CHECK_CUDA(call) \ - do { \ - cudaError_t status_ = call; \ - if (status_ != cudaSuccess) { \ - fprintf(stderr, "CUDA Error at line %d: %s\n", __LINE__, cudaGetErrorString(status_)); \ - exit(1); \ - } \ +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA Error at line %d: %s\n", __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ } while (0) +using namespace torch::indexing; namespace ubuf { enum class COMM_TYPE { RS = 0, AG = 1 }; @@ -36,11 +38,16 @@ enum class UBOverlapAlgo { BULK_OVERLAP_AG = 0, BULK_OVERLAP_RS = 1, SPLIT_PIPELINED_AG = 2, - SPLIT_PIPELINED_RS = 3 + SPLIT_PIPELINED_RS = 3, + ATOMIC_GEMM_RS = 4, + ATOMIC_GEMM_AG = 5 }; -struct UbufCommOverlap : torch::CustomClassHolder { - communicator *_ub_comm; +struct UbufBase { + static inline communicator *_ub_comm{nullptr}; + static inline bool comm_created{false}; +}; +struct UbufCommOverlap : torch::CustomClassHolder, 
UbufBase { int _tp_id; int _tp_size; int _num_splits; @@ -49,24 +56,53 @@ struct UbufCommOverlap : torch::CustomClassHolder { void *_ubuf_ptr; torch::Tensor _ubuf; torch::Tensor output_tensor; + torch::Tensor _ubuf_scale_inv; + bool _ubuf_scale_inv_initialized; + torch::Tensor counter; + torch::Tensor _empty_tensor; at::cuda::CUDAStream _stream_comm = at::cuda::getStreamFromPool(true); std::vector _stream_compute; cudaEvent_t _start_compute, _stop_compute, _start_d2dcopy, _start_comm, _stop_comm; + int comm_sms; + int cga_size; + int use_ce; UbufCommOverlap(torch::Tensor sample, int rank, int tp_size, int num_comm_sm, int comm_cga_size, - int num_splits, bool set_sm_margin, int num_max_streams) { + int num_splits, bool set_sm_margin, int num_max_streams, + torch::Tensor empty_tensor) { // Initialize userbuf communicator - create_communicator_grouped2(&_ub_comm, 1, 1, tp_size, 1); - _ub_comm->use_ce = 0; - _ub_comm->sms = num_comm_sm; - _ub_comm->cga_size = comm_cga_size; + if (!comm_created) { + if (rank == 0) { + printf("!!! [UB] Create UbufCommOverlap Communicator\n"); + } + create_communicator_grouped2(&_ub_comm, 1, 1, tp_size, 1); + comm_created = true; + } + use_ce = 0; + comm_sms = num_comm_sm; + cga_size = comm_cga_size; + _empty_tensor = empty_tensor; // Allocate and register extra userbuffers int ubuf_bytes = sample.numel() * sample.element_size(); _ub_reg = register_user_buffer_collective(reinterpret_cast(&_ubuf_ptr), ubuf_bytes, _ub_comm, true); + if (rank == 0) { + printf("!!! [UB] Register UBuf %d\n", _ub_reg); + } _ubuf = torch::from_blob(_ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options()); + const char *env_p = std::getenv("NVTE_RS_STRIDED_ATOMIC"); + const char *env_q = std::getenv("NVTE_UB_ATOMIC_GEMM_RS"); + if (rank == 0 && env_p != nullptr && env_q != nullptr && env_q[0] == '1') { + if (env_p[0] == '1') + printf("!! Using reducescatter2_userbuff_strided_atomic\n"); + else if (env_p[0] == '2') + printf("!! 
Using reducescatter2_userbuff_strided_multiatomic\n"); + else + printf("!! Using reducescatter2_userbuff_strided\n"); + } + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); for (int i = 0; i < std::min(num_max_streams, num_splits); i++) { cudaStream_t stream; @@ -78,6 +114,7 @@ struct UbufCommOverlap : torch::CustomClassHolder { _num_splits = num_splits; _tp_size = tp_size; _tp_id = (rank % tp_size); + _ubuf_scale_inv_initialized = false; // Set the number of SMs for GEMM with margin cudaDeviceProp prop; @@ -85,6 +122,9 @@ struct UbufCommOverlap : torch::CustomClassHolder { _math_sms = (set_sm_margin) ? prop.multiProcessorCount - num_comm_sm : prop.multiProcessorCount; output_tensor = torch::Tensor(); + auto counter_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); + counter = torch::zeros({num_splits * 2}, counter_options); + counter.index_put_({Slice(None, num_splits)}, 1); // CUDA event creation cudaEventCreateWithFlags(&_start_compute, 0); cudaEventCreateWithFlags(&_stop_compute, 0); @@ -97,13 +137,17 @@ struct UbufCommOverlap : torch::CustomClassHolder { ** Bulk GEMM + COMM ** This function assumes the communication input is pre-copied to _ubuf */ - std::vector bulk_overlap( - at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, - transformer_engine::DType A_type, bool transa, at::Tensor B, at::Tensor B_scale_inverse, - int64_t B_fp8_tensor, transformer_engine::DType B_type, bool transb, at::Tensor D, - at::Tensor D_scale, transformer_engine::DType D_type, at::Tensor D_amax, at::Tensor bias, - transformer_engine::DType bias_type, at::Tensor pre_gelu_out, bool grad, at::Tensor workspace, - size_t workspaceSize, bool accumulate, bool use_split_accumulator, int comm_type) { + std::vector + bulk_overlap(at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, + transformer_engine::DType A_type, bool transa, at::Tensor B, + at::Tensor B_scale_inverse, int64_t B_fp8_tensor, transformer_engine::DType 
B_type, + bool transb, at::Tensor D, at::Tensor D_scale, transformer_engine::DType D_type, + at::Tensor D_amax, at::Tensor bias, transformer_engine::DType bias_type, + at::Tensor pre_gelu_out, bool grad, at::Tensor workspace, size_t workspaceSize, + bool accumulate, bool use_split_accumulator, int comm_type, at::Tensor rs_output) { + _ub_comm->use_ce = use_ce; + _ub_comm->sms = comm_sms; + _ub_comm->cga_size = cga_size; // Get the current userbuf offset char *ubuf_wt_ptr = reinterpret_cast(_ubuf.data_ptr()); int comm_elements = (_ubuf.numel() / 2) * _ubuf.element_size(); // UBUF uses 2Byte element size @@ -121,15 +165,30 @@ struct UbufCommOverlap : torch::CustomClassHolder { if (_comm_type == COMM_TYPE::AG) { allgather2_userbuff_inplace(_ub_reg, 0, comm_elements, _ub_comm, (cudaStream_t)_stream_comm); } else if (_comm_type == COMM_TYPE::RS) { - reducescatter2_userbuff_inplace(_ub_reg, 0, comm_elements, _ub_comm, - (cudaStream_t)_stream_comm); + if (_ubuf.element_size() == 1) { + assert(_ubuf_scale_inv_initialized); + comm_elements *= 2; + float *scale_inv_ptr = reinterpret_cast(_ubuf_scale_inv.data_ptr()); + assert(rs_output.numel() == _ubuf.numel() / _tp_size); + assert(rs_output.size(0) == _ubuf.size(0) / _tp_size); + assert(rs_output.element_size() == 2); + char *rs_output_ptr = reinterpret_cast(rs_output.data_ptr()); + reducescatter2_userbuff_fp8<__nv_fp8_e5m2>(rs_output_ptr, scale_inv_ptr, _ub_reg, 0, + comm_elements, _ub_comm, + (cudaStream_t)_stream_comm); + } else { + reducescatter2_userbuff_inplace(_ub_reg, 0, comm_elements, _ub_comm, + (cudaStream_t)_stream_comm); + } } else { NVTE_ERROR("Not supported communication type."); } - if (A_scale_inverse.numel()) A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + if (A_scale_inverse.numel()) + A_scale_inverse = A_scale_inverse[A_fp8_tensor]; - if (B_scale_inverse.numel()) B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + if (B_scale_inverse.numel()) + B_scale_inverse = B_scale_inverse[B_fp8_tensor]; 
assert(pre_gelu_out.numel() == 0); te_gemm(A, A_scale_inverse, A_type, transa, B, B_scale_inverse, B_type, transb, D, D_scale, @@ -147,6 +206,117 @@ struct UbufCommOverlap : torch::CustomClassHolder { return {D, output_tensor}; } // bulk_overlap + /* + ** Split FPROP GEMM + ReduceScatter + */ + void atomic_gemm_overlap_rs(at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, + transformer_engine::DType A_type, bool transa, at::Tensor B, + at::Tensor B_scale_inverse, int64_t B_fp8_tensor, + transformer_engine::DType B_type, bool transb, at::Tensor D, + at::Tensor D_scale, transformer_engine::DType D_type, + at::Tensor D_amax, at::Tensor bias, + transformer_engine::DType bias_type, at::Tensor pre_gelu_out, + bool grad, at::Tensor workspace, size_t workspaceSize, + bool accumulate, bool use_split_accumulator, bool gemm_overlap, + at::Tensor rs_output) { + _ub_comm->use_ce = use_ce; + _ub_comm->sms = comm_sms; + _ub_comm->cga_size = cga_size; + // Get GEMM dimensions + int m = A.size(0); + int k = A.size(1); + int n = B.size(0); + int m_chunk = m / _num_splits; + int workspace_size_chunk = workspaceSize / _stream_compute.size(); + + // Get input, output, and workspace data pointers + char *input_a_chunk_ptr = reinterpret_cast(A.data_ptr()); + char *output_buf_chunk_ptr = reinterpret_cast(_ubuf.data_ptr()); + char *workspace_ptr = reinterpret_cast(workspace.data_ptr()); + int *counter_ptr = reinterpret_cast(counter.data_ptr()); + char *rs_output_ptr = reinterpret_cast(rs_output.data_ptr()); + int ori_sms = _ub_comm->sms; + + // Catch up the default torch stream + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + CHECK_CUDA(cudaEventRecord(_start_compute, stream_main)); + CHECK_CUDA(cudaEventRecord(_stop_comm, _stream_comm)); + for (int i = 0; i < _stream_compute.size(); i++) { + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[i], _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[i], 
_stop_comm, 0)); + } + + if (A_scale_inverse.numel()) + A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + + if (B_scale_inverse.numel()) + B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + + assert(pre_gelu_out.numel() == 0); + + torch::Tensor input_a = torch::from_blob(input_a_chunk_ptr, {m, k}, A.options()); + torch::Tensor output_d = torch::from_blob(output_buf_chunk_ptr, {n, m}, _ubuf.options()); + // torch::zeros({n, m}, _ubuf.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr, {workspace_size_chunk}, workspace.options()); + at::cuda::setCurrentCUDAStream(_stream_compute[0]); + te_atomic_gemm(input_a, A_scale_inverse, A_type, transa, B, B_scale_inverse, B_type, transb, + output_d, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, + _math_sms, _num_splits /*m_split*/, 0 /*n_split*/, true /*gemm_producer*/, + counter); + for (int i = 0; i < _num_splits; i++) { + const char *env_p = std::getenv("NVTE_RS_STRIDED_ATOMIC"); + if (env_p != nullptr && env_p[0] == '1') { + if (i == _num_splits - 1) { + _ub_comm->sms = UB_MAX_SM; + } + if (_ubuf.element_size() == 1) { + assert(_ubuf_scale_inv_initialized); + float *d_scale_inv_ptr = reinterpret_cast(_ubuf_scale_inv.data_ptr()); + reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e4m3>( + rs_output_ptr, d_scale_inv_ptr, _ub_reg, i * m_chunk, m_chunk, n, m, m, _num_splits, + &counter_ptr[i], _ub_comm, (cudaStream_t)_stream_comm); + } else { + reducescatter2_userbuff_strided_atomic(rs_output_ptr, _ub_reg, i * m_chunk, m_chunk, n, m, + _num_splits, &counter_ptr[i], _ub_comm, + (cudaStream_t)_stream_comm); + } + } else if (env_p != nullptr && env_p[0] == '2') { + if (_ubuf.element_size() == 1) { + assert(_ubuf_scale_inv_initialized); + float *d_scale_inv_ptr = reinterpret_cast(_ubuf_scale_inv.data_ptr()); + reducescatter2_userbuff_strided_multiatomic_fp8<__nv_fp8_e4m3>( + rs_output_ptr, d_scale_inv_ptr, 
_ub_reg, m_chunk, m_chunk, n, m, m, _num_splits, + counter_ptr, _ub_comm, (cudaStream_t)_stream_comm); + } else { + reducescatter2_userbuff_strided_multiatomic(rs_output_ptr, _ub_reg, m_chunk, m_chunk, n, + m, _num_splits, counter_ptr, _ub_comm, + (cudaStream_t)_stream_comm); + } + break; + } else { + consumer(counter_ptr, i, (cudaStream_t)_stream_comm); + // if (i == _num_splits-1) { + // _ub_comm->sms = UB_MAX_SM; + // } + reducescatter2_userbuff_strided(rs_output_ptr, _ub_reg, i * m_chunk, m_chunk, n, m, + _ub_comm, (cudaStream_t)_stream_comm); + } + + rs_output_ptr += m_chunk * rs_output.element_size(); + } + + _ub_comm->sms = ori_sms; + CHECK_CUDA(cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[0])); + CHECK_CUDA(cudaEventRecord(_stop_comm, (cudaStream_t)_stream_comm)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_comm, 0)); + at::cuda::setCurrentCUDAStream(stream_main); + + return; + } // split_overlap_rs + /* ** Split FPROP GEMM + ReduceScatter */ @@ -160,6 +330,9 @@ struct UbufCommOverlap : torch::CustomClassHolder { size_t workspaceSize, bool accumulate, bool use_split_accumulator, bool gemm_overlap, at::Tensor rs_output) { // Get GEMM dimensions + _ub_comm->use_ce = use_ce; + _ub_comm->sms = comm_sms; + _ub_comm->cga_size = cga_size; int m = A.size(0); int k = A.size(1); int n = B.size(0); @@ -174,7 +347,6 @@ struct UbufCommOverlap : torch::CustomClassHolder { char *workspace_ptr = reinterpret_cast(workspace.data_ptr()); char *rs_output_ptr = reinterpret_cast(rs_output.data_ptr()); - int ubuf_offset = 0; int ori_sms = _ub_comm->sms; // Catch up the default torch stream @@ -184,9 +356,11 @@ struct UbufCommOverlap : torch::CustomClassHolder { CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[i], _start_compute, 0)); } - if (A_scale_inverse.numel()) A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + if (A_scale_inverse.numel()) + 
A_scale_inverse = A_scale_inverse[A_fp8_tensor]; - if (B_scale_inverse.numel()) B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + if (B_scale_inverse.numel()) + B_scale_inverse = B_scale_inverse[B_fp8_tensor]; assert(pre_gelu_out.numel() == 0); @@ -223,10 +397,19 @@ struct UbufCommOverlap : torch::CustomClassHolder { CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); // Communication chunk - reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, (i - 1) * output_chunk_size, - m_chunk, n, m, _ub_comm, (cudaStream_t)_stream_comm); + if (_ubuf.element_size() == 1) { + assert(_ubuf_scale_inv_initialized); + float *d_scale_inv_ptr = reinterpret_cast(_ubuf_scale_inv.data_ptr()); + reducescatter2_userbuff_stridedoutput_fp8<__nv_fp8_e4m3>( + rs_output_ptr, d_scale_inv_ptr, _ub_reg, (i - 1) * output_chunk_size, m_chunk, n, m, + _ub_comm, (cudaStream_t)_stream_comm); + } else { + reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, (i - 1) * output_chunk_size, + m_chunk, n, m, _ub_comm, + (cudaStream_t)_stream_comm); + } - rs_output_ptr += m_chunk * _ubuf.element_size(); + rs_output_ptr += m_chunk * rs_output.element_size(); } int last_compute_stream_id = (_num_splits + _stream_compute.size() - 1) % _stream_compute.size(); @@ -236,9 +419,17 @@ struct UbufCommOverlap : torch::CustomClassHolder { // Last communication chunk with max SM _ub_comm->sms = UB_MAX_SM; - reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, - (_num_splits - 1) * output_chunk_size, m_chunk, n, m, - _ub_comm, (cudaStream_t)_stream_comm); + if (_ubuf.element_size() == 1) { + assert(_ubuf_scale_inv_initialized); + float *d_scale_inv_ptr = reinterpret_cast(_ubuf_scale_inv.data_ptr()); + reducescatter2_userbuff_stridedoutput_fp8<__nv_fp8_e4m3>( + rs_output_ptr, d_scale_inv_ptr, _ub_reg, (_num_splits - 1) * output_chunk_size, m_chunk, + n, m, _ub_comm, (cudaStream_t)_stream_comm); + } else { + reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, + 
(_num_splits - 1) * output_chunk_size, m_chunk, n, m, + _ub_comm, (cudaStream_t)_stream_comm); + } } else { for (int i = 0; i < _num_splits; i++) { torch::Tensor input_a_chunk = @@ -259,13 +450,21 @@ struct UbufCommOverlap : torch::CustomClassHolder { CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_comm, _start_comm, 0)); // Communication chunk. Uses MAX_SM at the last chunk - if (i == _num_splits-1) { + if (i == _num_splits - 1) { _ub_comm->sms = UB_MAX_SM; } - reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, i * output_chunk_size, - m_chunk, n, m, _ub_comm, (cudaStream_t)_stream_comm); - - rs_output_ptr += m_chunk * _ubuf.element_size(); + if (_ubuf.element_size() == 1) { + assert(_ubuf_scale_inv_initialized); + float *d_scale_inv_ptr = reinterpret_cast(_ubuf_scale_inv.data_ptr()); + reducescatter2_userbuff_stridedoutput_fp8<__nv_fp8_e4m3>( + rs_output_ptr, d_scale_inv_ptr, _ub_reg, i * output_chunk_size, m_chunk, n, m, + _ub_comm, (cudaStream_t)_stream_comm); + } else { + reducescatter2_userbuff_stridedoutput(rs_output_ptr, _ub_reg, i * output_chunk_size, + m_chunk, n, m, _ub_comm, + (cudaStream_t)_stream_comm); + } + rs_output_ptr += m_chunk * rs_output.element_size(); input_a_chunk_ptr += input_a_chunk_size * B.element_size(); output_buf_chunk_ptr += output_chunk_size * _ubuf.element_size(); } @@ -283,6 +482,12 @@ struct UbufCommOverlap : torch::CustomClassHolder { return; } // split_overlap_rs + void set_ubuf_scale_inv(const torch::Tensor &scale_inv) { + _ubuf_scale_inv = scale_inv; + _ubuf_scale_inv_initialized = true; + } + + bool is_fp8_ubuf() { return (_ubuf.element_size() == 1); } /* ** Helper function to copy input to _ubuf */ @@ -311,7 +516,8 @@ struct UbufCommOverlap : torch::CustomClassHolder { torch::Tensor &get_ubuf_output(int comm_type) { char *ubuf_wt_ptr = reinterpret_cast(_ubuf.data_ptr()); COMM_TYPE _comm_type = static_cast(comm_type); - if (_comm_type != COMM_TYPE::AG && _comm_type != COMM_TYPE::RS) NVTE_ERROR("Invalid 
comm_type"); + if (_comm_type != COMM_TYPE::AG && _comm_type != COMM_TYPE::RS) + NVTE_ERROR("Invalid comm_type"); if (_comm_type == COMM_TYPE::RS) ubuf_wt_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); int output_c_dim0 = (_comm_type == COMM_TYPE::AG) ? _ubuf.size(0) : _ubuf.size(0) / _tp_size; @@ -321,35 +527,51 @@ struct UbufCommOverlap : torch::CustomClassHolder { } }; // UbufCommOverlap -struct UbufP2PCommOverlap : torch::CustomClassHolder { - communicator *_ub_comm; +struct UbufP2PCommOverlap : torch::CustomClassHolder, UbufBase { int _tp_id; int _tp_size; int _ub_reg; int _next_rank, _prev_rank, _rank, _rank_round_tp; int _aggregate2; int _math_sms; + int _self_chunk_id; void *_ubuf_ptr; torch::Tensor _ubuf; + torch::Tensor counter; + torch::Tensor _empty_tensor; std::vector _ubufs; at::cuda::CUDAStream _stream_send = at::cuda::getStreamFromPool(true); at::cuda::CUDAStream _stream_recv = at::cuda::getStreamFromPool(true); std::vector _stream_compute; cudaEvent_t _start_compute, _stop_compute, _stop_send, _stop_recv; + int use_ce; + int sms; + int cga_size; - UbufP2PCommOverlap(torch::Tensor sample, int rank, int tp_size, bool aggregate2, - int num_max_streams) { + UbufP2PCommOverlap(torch::Tensor sample, int rank, int tp_size, int num_comm_sm, + int comm_cga_size, bool set_sm_margin, bool aggregate2, int num_max_streams, + torch::Tensor empty_tensor) { // Initialize userbuf communicator - create_communicator_grouped2(&_ub_comm, 1, 1, tp_size, 1); - _ub_comm->use_ce = 1; - _ub_comm->sms = 1; - _ub_comm->cga_size = 1; + if (!comm_created) { + if (rank == 0) { + printf("!!! 
[UB] Create UbufP2PCommOverlap Communicator\n"); + } + create_communicator_grouped2(&_ub_comm, 1, 1, tp_size, 1); + comm_created = true; + } + use_ce = 1; + sms = 1; + cga_size = 1; + _empty_tensor = empty_tensor; // Create workspace tensor with userbuffer int ubuf_bytes = sample.numel() * sample.element_size(); int ubuf_chunk_bytes = ubuf_bytes / tp_size; _ub_reg = register_user_buffer_collective(reinterpret_cast(&_ubuf_ptr), ubuf_bytes, _ub_comm, true); + if (rank == 0) { + printf("!!! [UBP2P] Register UBuf %d\n", _ub_reg); + } _ubuf = torch::from_blob(_ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options()); // Create tensor chunks for easy management @@ -372,7 +594,7 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { // Set the number of SMs for GEMM with margin cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); - _math_sms = prop.multiProcessorCount; + _math_sms = (set_sm_margin) ? prop.multiProcessorCount - num_comm_sm : prop.multiProcessorCount; _tp_size = tp_size; _aggregate2 = aggregate2; @@ -383,6 +605,26 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { _next_rank = (tp_size + rank + 1) % tp_size + _rank_round_tp; _prev_rank = (tp_size + rank + -1) % tp_size + _rank_round_tp; + auto counter_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); + counter = torch::zeros({tp_size * 2}, counter_options); + counter.index_put_({Slice(None, tp_size)}, 1); + _self_chunk_id = _tp_id; + + const char *env_p = std::getenv("NVTE_AG_P2P_ATOMIC"); + if (rank == 0 && env_p != nullptr) { + if (env_p[0] == '1') { + printf("!!userbuffers_sendrecv_atomic\n"); + } else if (env_p[0] == '2') { + printf("!!userbuffers_sendrecv_multiatomic\n"); + } else if (env_p[0] == '3') { + printf("!!userbuffers_sendrecv_multiatomic_shuffle\n"); + _self_chunk_id = 0; + } else { + printf("!!userbuffers_sendrecv\n"); + } + } + counter.index_put_({_self_chunk_id}, 0); + // CUDA event creation cudaEventCreateWithFlags(&_start_compute, 0); 
cudaEventCreateWithFlags(&_stop_compute, 0); @@ -390,11 +632,144 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { cudaEventCreateWithFlags(&_stop_recv, 0); } + /* + ** Split AllGather + AtomicGEMM using P2P communication + ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. This is + *needed to have AG outputs + ** in each rank to be in the contiguous memory space after all ring exchange + *phases. + */ + torch::Tensor atomic_gemm_overlap_ag( + at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, + transformer_engine::DType A_type, bool transa, at::Tensor B, at::Tensor B_scale_inverse, + int64_t B_fp8_tensor, transformer_engine::DType B_type, bool transb, at::Tensor D, + at::Tensor D_scale, transformer_engine::DType D_type, at::Tensor D_amax, at::Tensor bias, + transformer_engine::DType bias_type, at::Tensor pre_gelu_out, bool grad, at::Tensor workspace, + size_t workspaceSize, bool accumulate, bool use_split_accumulator, at::Tensor B_copy) { + _ub_comm->use_ce = use_ce; + _ub_comm->sms = sms; + _ub_comm->cga_size = cga_size; + // Get GEMM dimensions between TN and NN input layouts + const int m = (transa) ? A.size(0) : A.size(1); + const int k = (transa) ? 
A.size(1) : A.size(0); + const int n_chunk = _ubufs[0].size(0); + + // Get communication and GEMM output chunk sizes + const int comm_bytes = _ubufs[0].numel() * _ubufs[0].element_size(); + + // Get output and workspace data pointers + char *output_ptr = reinterpret_cast(D.data_ptr()); + char *workspace_ptr = reinterpret_cast(workspace.data_ptr()); + int *counter_ptr = reinterpret_cast(counter.data_ptr()); + int workspace_size_chunk = workspaceSize / _stream_compute.size(); + + if (A_scale_inverse.numel()) + A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + + if (B_scale_inverse.numel()) + B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + + at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); + CHECK_CUDA(cudaEventRecord(_start_compute, (cudaStream_t)stream_main)); + + assert(pre_gelu_out.numel() == 0); + // Catch up the default torch stream + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_recv, _start_compute, 0)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _start_compute, 0)); + + torch::Tensor output_chunk = torch::from_blob(output_ptr, {_ubuf.size(0), m}, D.options()); + torch::Tensor workspace_chunk = + torch::from_blob(workspace_ptr, {workspace_size_chunk}, workspace.options()); + for (int i = 0; i < _tp_size; i++) { + // Set the userbuffer id. Buffer under send is the input for the current + // GEMM chunk The initial input chunk is stored _ubuf[rank]. 
This is to + // have the AG output in all ranks to be contiguous after the ring + // exchanges + int send_chunk_id = (_tp_size + _tp_id - i) % _tp_size; + int recv_chunk_id = (_tp_size + _tp_id - i - 1) % _tp_size; + int send_offset = comm_bytes * send_chunk_id; + int recv_offset = comm_bytes * recv_chunk_id; + + if (i < _tp_size - 1) { + const char *env_p = std::getenv("NVTE_AG_P2P_ATOMIC"); + if (env_p != nullptr && env_p[0] == '1') { + userbuffers_sendrecv_atomic(_ub_reg, _ub_reg, send_offset, recv_offset, comm_bytes, + _ub_comm, _next_rank, _prev_rank, &counter_ptr[recv_chunk_id], + (cudaStream_t)_stream_recv); + } else if (env_p != nullptr && env_p[0] == '2') { + if (i == 0) { + userbuffers_sendrecv_multiatomic(_ub_reg, _ub_reg, comm_bytes, comm_bytes, comm_bytes, + _ub_comm, _next_rank, _prev_rank, _tp_size, + counter_ptr, false, (cudaStream_t)_stream_recv); + } + } else if (env_p != nullptr && env_p[0] == '3') { + if (i == 0) { + userbuffers_sendrecv_multiatomic(_ub_reg, _ub_reg, comm_bytes, comm_bytes, comm_bytes, + _ub_comm, _next_rank, _prev_rank, _tp_size, + counter_ptr, true, (cudaStream_t)_stream_recv); + } + } else { + // P2P communication + // userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset, + // comm_bytes, _ub_comm, + // _next_rank, (cudaStream_t)_stream_send); + // userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset, + // comm_bytes, _ub_comm, + // _prev_rank, (cudaStream_t)_stream_recv); + // CHECK_CUDA(cudaEventRecord(_stop_recv, + // (cudaStream_t)_stream_recv)); + // CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_send, + // _stop_recv, 0)); + userbuffers_sendrecv(_ub_reg, _ub_reg, send_offset, recv_offset, comm_bytes, _ub_comm, + _next_rank, _prev_rank, (cudaStream_t)_stream_recv); + producer(counter_ptr, recv_chunk_id, (cudaStream_t)_stream_recv); + } + if (i == 0) { + at::cuda::setCurrentCUDAStream(_stream_compute[0]); + te_atomic_gemm(A, A_scale_inverse, A_type, transa, _ubuf, B_scale_inverse, B_type, transb, + 
output_chunk, D_scale, D_type, D_amax, bias, bias_type, pre_gelu_out, grad, + workspace_chunk, workspace_size_chunk, accumulate, use_split_accumulator, + _math_sms, 0, _tp_size, false, counter); + } + } else { + // GEMM + // userbuffers_send_multiatomic(_ub_reg, 0, _ub_reg, 0, comm_bytes, + // _ub_comm, + // _next_rank, _tp_size, comm_bytes, comm_bytes, + // (cudaStream_t)_stream_send); + // userbuffers_recv_multiatomic(_ub_reg, 0, _ub_reg, 0, comm_bytes, + // _ub_comm, + // _prev_rank, _tp_size, counter_ptr, + // (cudaStream_t)_stream_recv); + if (B_copy.numel() > 0) { + assert(B_copy.numel() == _ubufs[_tp_id].numel()); + assert(B_copy.element_size() == _ubufs[_tp_id].element_size()); + CHECK_CUDA(cudaMemcpyAsync(B_copy.data_ptr(), _ubufs[_tp_id].data_ptr(), + _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(), + cudaMemcpyDeviceToDevice, (cudaStream_t)_stream_send)); + CHECK_CUDA(cudaEventRecord(_stop_send, (cudaStream_t)_stream_send)); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_send, 0)); + } + } + } + for (int i = 0; i < _tp_size; i++) { + if (i != _self_chunk_id) { + consumer(counter_ptr, i, (cudaStream_t)_stream_compute[0]); + } + } + at::cuda::setCurrentCUDAStream(stream_main); + CHECK_CUDA(cudaEventRecord(_stop_compute, (cudaStream_t)_stream_compute[0])); + CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)stream_main, _stop_compute, 0)); + + return D; + } // split_overlap_ag /* ** Split AllGather + GEMM using P2P communication - ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. This is needed to have AG - *outputs - ** in each rank to be in the contiguous memory space after all ring exchange phases. + ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. This is + *needed to have AG outputs + ** in each rank to be in the contiguous memory space after all ring exchange + *phases. 
*/ torch::Tensor split_overlap_ag(at::Tensor A, at::Tensor A_scale_inverse, int64_t A_fp8_tensor, transformer_engine::DType A_type, bool transa, at::Tensor B, @@ -405,6 +780,9 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { transformer_engine::DType bias_type, at::Tensor pre_gelu_out, bool grad, at::Tensor workspace, size_t workspaceSize, bool accumulate, bool use_split_accumulator, at::Tensor B_copy) { + _ub_comm->use_ce = use_ce; + _ub_comm->sms = sms; + _ub_comm->cga_size = cga_size; // Get GEMM dimensions between TN and NN input layouts const int m = (transa) ? A.size(0) : A.size(1); const int k = (transa) ? A.size(1) : A.size(0); @@ -419,9 +797,11 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { char *workspace_ptr = reinterpret_cast(workspace.data_ptr()); int workspace_size_chunk = workspaceSize / _stream_compute.size(); - if (A_scale_inverse.numel()) A_scale_inverse = A_scale_inverse[A_fp8_tensor]; + if (A_scale_inverse.numel()) + A_scale_inverse = A_scale_inverse[A_fp8_tensor]; - if (B_scale_inverse.numel()) B_scale_inverse = B_scale_inverse[B_fp8_tensor]; + if (B_scale_inverse.numel()) + B_scale_inverse = B_scale_inverse[B_fp8_tensor]; at::cuda::CUDAStream stream_main = at::cuda::getDefaultCUDAStream(); CHECK_CUDA(cudaEventRecord(_start_compute, (cudaStream_t)stream_main)); @@ -506,9 +886,10 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { CHECK_CUDA(cudaStreamWaitEvent((cudaStream_t)_stream_compute[0], _start_compute, 0)); for (int i = 0; i < _tp_size; i++) { - // Set the userbuffer id. Buffer under send is the input for the current GEMM chunk - // The initial input chunk is stored _ubuf[rank]. This is to have the AG output in all ranks - // to be contiguous after the ring exchanges + // Set the userbuffer id. Buffer under send is the input for the current + // GEMM chunk The initial input chunk is stored _ubuf[rank]. 
This is to + // have the AG output in all ranks to be contiguous after the ring + // exchanges int send_chunk_id = (_tp_size + _tp_id - i) % _tp_size; int recv_chunk_id = (_tp_size + _tp_id - i - 1) % _tp_size; int send_offset = comm_bytes * send_chunk_id; @@ -581,7 +962,8 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder { torch::Tensor get_ubuf_output(int comm_type) { char *ubuf_wt_ptr = reinterpret_cast(_ubuf.data_ptr()); COMM_TYPE _comm_type = static_cast(comm_type); - if (_comm_type != COMM_TYPE::AG && _comm_type != COMM_TYPE::RS) NVTE_ERROR("Invalid comm_type"); + if (_comm_type != COMM_TYPE::AG && _comm_type != COMM_TYPE::RS) + NVTE_ERROR("Invalid comm_type"); if (_comm_type == COMM_TYPE::RS) ubuf_wt_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); int output_c_dim0 = (_comm_type == COMM_TYPE::AG) ? _ubuf.size(0) : _ubuf.size(0) / _tp_size; diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 274a523ec0..4eaca7c896 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -179,6 +179,32 @@ void te_gemm(at::Tensor A, int math_sm_count ); +void te_atomic_gemm(at::Tensor A, + at::Tensor A_scale_inverse, + transformer_engine::DType A_type, + bool transa, + at::Tensor B, + at::Tensor B_scale_inverse, + transformer_engine::DType B_type, + bool transb, + at::Tensor D, + at::Tensor D_scale, + transformer_engine::DType D_type, + at::Tensor D_amax, + at::Tensor bias, + transformer_engine::DType bias_type, + at::Tensor pre_gelu_out, + bool grad, + at::Tensor workspace, + size_t workspaceSize, + bool accumulate, + bool use_split_accumulator, + int math_sm_count, + int m_split, + int n_split, + bool gemm_producer, + at::Tensor counter +); void fused_cast_transpose(at::Tensor input, at::Tensor scale, diff --git a/transformer_engine/pytorch/csrc/extensions/gemm.cu b/transformer_engine/pytorch/csrc/extensions/gemm.cu index 
1a7630edce..480b8716b2 100644 --- a/transformer_engine/pytorch/csrc/extensions/gemm.cu +++ b/transformer_engine/pytorch/csrc/extensions/gemm.cu @@ -6,6 +6,7 @@ #include "extensions.h" + void te_gemm(at::Tensor A, at::Tensor A_scale_inverse, transformer_engine::DType A_type, @@ -73,3 +74,82 @@ void te_gemm(at::Tensor A, math_sm_count, at::cuda::getCurrentCUDAStream()); } + +void te_atomic_gemm(at::Tensor A, + at::Tensor A_scale_inverse, + transformer_engine::DType A_type, + bool transa, + at::Tensor B, + at::Tensor B_scale_inverse, + transformer_engine::DType B_type, + bool transb, + at::Tensor D, + at::Tensor D_scale, + transformer_engine::DType D_type, + at::Tensor D_amax, + at::Tensor bias, + transformer_engine::DType bias_type, + at::Tensor pre_gelu_out, + bool grad, + at::Tensor workspace, + size_t workspaceSize, + bool accumulate, + bool use_split_accumulator, + int math_sm_count, + int m_split, + int n_split, + bool gemm_producer, + at::Tensor counter +) { + using namespace transformer_engine; + auto te_A = makeTransformerEngineTensor(A.data_ptr(), + {static_cast(A.size(0)), + static_cast(A.size(1))}, + A_type, nullptr, nullptr, + A_scale_inverse.data_ptr()); + auto te_B = makeTransformerEngineTensor(B.data_ptr(), + {static_cast(B.size(0)), + static_cast(B.size(1))}, + B_type, nullptr, nullptr, + B_scale_inverse.data_ptr()); + auto te_D = makeTransformerEngineTensor(D.data_ptr(), + {static_cast(D.size(0)), + static_cast(D.size(1))}, + D_type, D_amax.data_ptr(), + D_scale.data_ptr(), nullptr); + auto te_bias = makeTransformerEngineTensor(bias.data_ptr(), {static_cast(bias.size(0))}, + bias_type); + auto te_counter = makeTransformerEngineTensor(counter.data_ptr(), + {static_cast(counter.size(0))}, + DType::kInt32); + + const auto gelu_shape = pre_gelu_out.data_ptr() == nullptr + ? 
std::vector{static_cast(pre_gelu_out.size(0))} + : std::vector{static_cast(pre_gelu_out.size(0)), + static_cast(pre_gelu_out.size(1))}; + auto te_pre_gelu_out = makeTransformerEngineTensor(pre_gelu_out.data_ptr(), + gelu_shape, + GetTransformerEngineDType( + pre_gelu_out.scalar_type())); + auto te_workspace = makeTransformerEngineTensor(workspace.data_ptr(), + {workspaceSize}, + DType::kByte); + + nvte_cublas_atomic_gemm(te_A.data(), + te_B.data(), + te_D.data(), + te_bias.data(), + te_pre_gelu_out.data(), + transa, + transb, + grad, + te_workspace.data(), + accumulate, + use_split_accumulator, + math_sm_count, + m_split, + n_split, + gemm_producer, + te_counter.data(), + at::cuda::getCurrentCUDAStream()); +} diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index abc15022b0..7e80299d15 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -91,18 +91,24 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { .value("BULK_OVERLAP_AG", ubuf::UBOverlapAlgo::BULK_OVERLAP_AG) .value("BULK_OVERLAP_RS", ubuf::UBOverlapAlgo::BULK_OVERLAP_RS) .value("SPLIT_PIPELINED_RS", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_RS) - .value("SPLIT_PIPELINED_AG", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_AG); + .value("SPLIT_PIPELINED_AG", ubuf::UBOverlapAlgo::SPLIT_PIPELINED_AG) + .value("ATOMIC_GEMM_RS", ubuf::UBOverlapAlgo::ATOMIC_GEMM_RS) + .value("ATOMIC_GEMM_AG", ubuf::UBOverlapAlgo::ATOMIC_GEMM_AG); py::class_(m, "UbufCommOverlap") - .def(py::init()) + .def(py::init()) .def("bulk_overlap", &ubuf::UbufCommOverlap::bulk_overlap) .def("split_overlap_rs", &ubuf::UbufCommOverlap::split_overlap_rs) + .def("set_ubuf_scale_inv", &ubuf::UbufCommOverlap::set_ubuf_scale_inv) + .def("atomic_gemm_overlap_rs", &ubuf::UbufCommOverlap::atomic_gemm_overlap_rs) + .def("is_fp8_ubuf", &ubuf::UbufCommOverlap::is_fp8_ubuf) .def("copy_input_to_ubuf", 
&ubuf::UbufCommOverlap::copy_input_to_ubuf) .def("get_ubuf_output", &ubuf::UbufCommOverlap::get_ubuf_output); py::class_(m, "UbufP2PCommOverlap") - .def(py::init()) + .def(py::init()) .def("split_overlap_ag", &ubuf::UbufP2PCommOverlap::split_overlap_ag) + .def("atomic_gemm_overlap_ag", &ubuf::UbufP2PCommOverlap::atomic_gemm_overlap_ag) .def("copy_input_to_ubuf", &ubuf::UbufP2PCommOverlap::copy_input_to_ubuf) .def("get_ubuf_output", &ubuf::UbufP2PCommOverlap::get_ubuf_output); #else // NVTE_WITH_USERBUFFERS diff --git a/transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp b/transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp index 59afc4b452..7c08070728 100644 --- a/transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers-host.cpp @@ -4,10 +4,13 @@ * See LICENSE for license information. ************************************************************************/ +#include "userbuffers.h" #include +#include #include #include #include +#include #include #include #include @@ -15,9 +18,6 @@ #include #include #include -#include -#include -#include "userbuffers.h" static int oob_bcast(void *comm_context, void *buf, int size, int root) { MPI_Bcast(buf, size, MPI_BYTE, root, @@ -38,20 +38,31 @@ static int oob_gather(void *comm_context, int root, void *sbuf, void *rbuf, int int stringCmp(const void *a, const void *b) { return strcmp((const char *)a, (const char *)b); } -#define CUDACHECK(cmd) \ - do { \ - cudaError_t e = cmd; \ - if (e != cudaSuccess) { \ - printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) -#define NVTE_UB_ERROR(x) \ - do { \ - throw std::runtime_error(std::string(__FILE__ ":") + 
std::to_string(__LINE__) + \ - " in function " + __func__ + ": " + x); \ - } while (false) +#define CUCHECK(cmd) \ + do { \ + CUresult retval = cmd; \ + if (retval != CUDA_SUCCESS) { \ + const char *error_string; \ + cuGetErrorString(retval, &error_string); \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, error_string); \ + exit(EXIT_FAILURE); \ + } \ + } while (0); + +#define NVTE_UB_ERROR(x) \ + do { \ + throw std::runtime_error(std::string(__FILE__ ":") + std::to_string(__LINE__) + \ + " in function " + __func__ + ": " + x); \ + } while (false) int pipe_rank(communicator *comm, int step) { int mynode = comm->myrank / comm->nvsize; @@ -89,12 +100,14 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode (*comm)->push = 1; (*comm)->use_ce = 0; (*comm)->cga_size = 2; - for (int i = 0; i < userbuffers_op_types; i++) (*comm)->basecounter[i] = 0; + for (int i = 0; i < userbuffers_op_types; i++) + (*comm)->basecounter[i] = 0; (*comm)->head = 0; (*comm)->tail = 0; (*comm)->activeproxy = 1; (*comm)->active_nreqs = 0; - for (int i = 0; i < userbuffers_op_types; i++) (*comm)->active_req[i].active = -1; + for (int i = 0; i < userbuffers_op_types; i++) + (*comm)->active_req[i].active = -1; int ret = 0; // split communicator @@ -112,8 +125,10 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode color = 0; for (int n = 0; n < size; n++) { - if (n > 0 && strcmp(host_names[n - 1], host_names[n])) color++; - if (strcmp(host_name, host_names[n]) == 0) break; + if (n > 0 && strcmp(host_names[n - 1], host_names[n])) + color++; + if (strcmp(host_name, host_names[n]) == 0) + break; } free(host_names); @@ -128,14 +143,22 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode cpu_set_t cpuset; CPU_ZERO(&cpuset); int core; - if (mylocal == 0) core = 50; - if (mylocal == 1) core = 58; - if (mylocal == 2) core = 18; - if (mylocal == 3) core = 26; - if (mylocal == 4) core = 114; - if 
(mylocal == 5) core = 122; - if (mylocal == 6) core = 82; - if (mylocal == 7) core = 90; + if (mylocal == 0) + core = 50; + if (mylocal == 1) + core = 58; + if (mylocal == 2) + core = 18; + if (mylocal == 3) + core = 26; + if (mylocal == 4) + core = 114; + if (mylocal == 5) + core = 122; + if (mylocal == 6) + core = 82; + if (mylocal == 7) + core = 90; CPU_SET(core, &cpuset); if (!getenv("NVTE_NODOUBLE")) { @@ -144,7 +167,8 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode else CPU_SET(core + 128, &cpuset); } - if (getenv("NVTE_DOPIN")) pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (getenv("NVTE_DOPIN")) + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); if (ndev == numlocal) { // all visible devices if (cur_dev != mylocal) @@ -175,7 +199,8 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode int datanodegroup_id = myrank / numlocal / datanodes; // data reduction group node belongs, equals 0 for all if both // pipenodes=1 and tensornodes=1 - // mpi communicator only needed for SHARP which is always allreduce1/data-parallel + // mpi communicator only needed for SHARP which is always + // allreduce1/data-parallel MPI_Comm_split(MPI_COMM_WORLD, mylocal + numlocal * datanodegroup_id, rank, &(*comm)->comm_inter); // different rails from same group are in different subcommunicators @@ -192,19 +217,37 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode char *ib_dev_list; int ZIONROCE = getenv("NVTE_ZIONROCE") ? atoi(getenv("NVTE_ZIONROCE")) : 0; int ROCE = getenv("NVTE_ROCE") ? 
atoi(getenv("NVTE_ROCE")) : 0; - if (ZIONROCE) ROCE = 1; + if (ZIONROCE) + ROCE = 1; int DGX_H100 = device_prop.major == 9; switch (mylocal) { - case 0:ib_dev_list = "mlx5_0:1"; break; // NOLINT(*) - case 1:ib_dev_list = (char*)(DGX_H100?"mlx5_3:1":"mlx5_1:1"); break; // NOLINT(*) - case 2:ib_dev_list = (char*)(ZIONROCE?"mlx5_4:1":DGX_H100?"mlx5_4:1":"mlx5_2:1"); break; // NOLINT(*) - case 3:ib_dev_list = (char*)(DGX_H100?"mlx5_5:1":"mlx5_3:1"); break; // NOLINT(*) - case 4:ib_dev_list = (char*)(DGX_H100?"mlx5_6:1":"mlx5_6:1"); break; // NOLINT(*) - case 5:ib_dev_list = (char*)(DGX_H100?"mlx5_9:1":"mlx5_7:1"); break; // NOLINT(*) - case 6:ib_dev_list = (char*)(ZIONROCE?"mlx5_10:1":DGX_H100?"mlx5_10:1":"mlx5_8:1"); break; // NOLINT(*) - case 7:ib_dev_list = (char*)(DGX_H100?"mlx5_11:1":"mlx5_9:1"); break; // NOLINT(*) - default: break; + case 0: + ib_dev_list = "mlx5_0:1"; + break; // NOLINT(*) + case 1: + ib_dev_list = (char *)(DGX_H100 ? "mlx5_3:1" : "mlx5_1:1"); // NOLINT(*) + break; // NOLINT(*) + case 2: + ib_dev_list = (char *)(ZIONROCE ? "mlx5_4:1" : DGX_H100 ? "mlx5_4:1" : "mlx5_2:1"); // NOLINT(*) + break; // NOLINT(*) + case 3: + ib_dev_list = (char *)(DGX_H100 ? "mlx5_5:1" : "mlx5_3:1"); // NOLINT(*) + break; // NOLINT(*) + case 4: + ib_dev_list = (char *)(DGX_H100 ? "mlx5_6:1" : "mlx5_6:1"); // NOLINT(*) + break; // NOLINT(*) + case 5: + ib_dev_list = (char *)(DGX_H100 ? "mlx5_9:1" : "mlx5_7:1"); // NOLINT(*) + break; // NOLINT(*) + case 6: + ib_dev_list = (char *)(ZIONROCE ? "mlx5_10:1" : DGX_H100 ? "mlx5_10:1" : "mlx5_8:1"); // NOLINT(*) + break; // NOLINT(*) + case 7: + ib_dev_list = (char *)(DGX_H100 ? 
"mlx5_11:1" : "mlx5_9:1"); // NOLINT(*) + break; // NOLINT(*) + default: + break; } (*comm)->fifo = reinterpret_cast(malloc(sizeof(ub_request) * NVTE_MAX_REQUESTS)); @@ -215,7 +258,8 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode CUDACHECK(cudaMallocHost((void **)&(*comm)->hostflags, // NOLINT(*) (NVTE_MAX_SMS + 100) * sizeof(int))); - for (int i = 0; i < 100 + NVTE_MAX_SMS; i++) (*comm)->hostflags[i] = 0; + for (int i = 0; i < 100 + NVTE_MAX_SMS; i++) + (*comm)->hostflags[i] = 0; _mm_mfence(); sleep(1); @@ -223,13 +267,16 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode (*comm)->ibnvsize = (*comm)->nvsize; #define NBUF 2 + #define LOCALSIZE 4 * (NVTE_REG0_OFFSET(*comm) + NVTE_REG0_FLAGS + NVTE_REG0_COMMBUFFER * NBUF) // peer pointers + op flags + comm buffer - CUDACHECK(cudaMalloc(&(*comm)->gpu_ptrs, LOCALSIZE)); // flags and pointers, no block data yet + CUDACHECK(cudaMalloc(&(*comm)->gpu_ptrs, + LOCALSIZE)); // flags and pointers, no block data yet CUDACHECK(cudaMemset((*comm)->gpu_ptrs, 0, LOCALSIZE)); CUDACHECK(cudaDeviceSynchronize()); - register_user_buffer_collective(&((*comm)->gpu_ptrs), LOCALSIZE, *comm); // will use handler 0 + register_user_buffer_collective(&((*comm)->gpu_ptrs), LOCALSIZE, + *comm); // will use handler 0 CUDACHECK(cudaMalloc(&(*comm)->send_id, (*comm)->nranks * sizeof(int))); CUDACHECK(cudaMalloc(&(*comm)->recv_id, NVTE_MAX_REGIONS * (*comm)->nranks * sizeof(int))); CUDACHECK(cudaMemset((*comm)->send_id, 0, (*comm)->nranks * sizeof(int))); @@ -243,7 +290,6 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode #define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) CUDACHECK(cudaMalloc(&(*comm)->flags, 2 * GPU_PAGE_SIZE)); unsigned int flag = 1; - // cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)(*comm)->flags); CUDACHECK(cudaMemset((*comm)->flags, 0, 2 * GPU_PAGE_SIZE)); (*comm)->flags = 
reinterpret_cast(((CUdeviceptr)(*comm)->flags + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK); @@ -275,7 +321,8 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode pthread_attr_setschedparam(&attr, ¶m); if (getenv("NVTE_UBDEBUG")) - printf("%d/%d:(%d x %d): DP %d x %d TP %d x %d, DPGROUP %dx%d TPGROUP %dx%d PIPE_ID %d/%d\n", + printf("%d/%d:(%d x %d): DP %d x %d TP %d x %d, DPGROUP %dx%d TPGROUP " + "%dx%d PIPE_ID %d/%d\n", myrank, nranks, myrank / numlocal, myrank % numlocal, (*comm)->my_node, (*comm)->ar_nvrank, (*comm)->my2_node, (*comm)->ar2_nvrank, (*comm)->num_nodes, (*comm)->ar_nvsize, (*comm)->num2_nodes, (*comm)->ar2_nvsize, (*comm)->pipe_id, @@ -300,9 +347,9 @@ void destroy_communicator(communicator *comm) { } int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *comm, bool alloc) { - if (comm->free_region > NVTE_MAX_REGIONS) return -1; + if (comm->free_region > NVTE_MAX_REGIONS) + return -1; int hndl = comm->free_region; - // printf("%d register %d size %lld\n",comm->myrank,hndl,bytes);fflush(NULL); comm->peer_ptr[hndl] = reinterpret_cast(malloc(sizeof(void *) * (comm->nvsize))); if (alloc) { @@ -313,25 +360,22 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * reinterpret_cast(malloc(sizeof(cudaIpcMemHandle_t) * (comm->nvsize))); CUDACHECK(cudaIpcGetMemHandle(&memhndl[comm->nvrank], *gpubuff)); - MPI_Allgather(&memhndl[comm->nvrank], sizeof(cudaIpcMemHandle_t), MPI_BYTE, memhndl, sizeof(cudaIpcMemHandle_t), MPI_BYTE, comm->comm_intra); - for (int i = 0; i < comm->nvsize; i++) if (i != comm->nvrank) CUDACHECK(cudaIpcOpenMemHandle((void **)&(comm->peer_ptr[hndl][i]), // NOLINT(*) memhndl[i], cudaIpcMemLazyEnablePeerAccess)); comm->peer_ptr[hndl][comm->nvrank] = *gpubuff; CUDACHECK(cudaDeviceSynchronize()); - CUDACHECK( cudaMemcpy(reinterpret_cast(comm->gpu_ptrs) + (hndl * comm->nvsize * sizeof(void *)), comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), 
cudaMemcpyHostToDevice)); - CUDACHECK(cudaDeviceSynchronize()); free(memhndl); comm->mem_ptr[hndl] = *gpubuff; + return comm->free_region++; } @@ -352,8 +396,10 @@ int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons void allreduce_nonsharp_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream, int op) { - if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); - // if(comm->myrank==0) fprintf(stderr,"AR2(%d) user call launch_mode=%d\n",op,comm->launch_mode); + if (elements < 64) + NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); + // if(comm->myrank==0) fprintf(stderr,"AR2(%d) user call + // launch_mode=%d\n",op,comm->launch_mode); const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; int blocksize = elements * 2; int maxcredit = 0; @@ -361,19 +407,19 @@ void allreduce_nonsharp_inplace(const int handler, const int offset, const int e blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) / comm->nblocks; // FIXME TUNING blocksize *= comm->alignblock; - if (blocksize < comm->minblock) blocksize = comm->minblock; + if (blocksize < comm->minblock) + blocksize = comm->minblock; maxcredit = (elements * 2 + blocksize - 1) / blocksize; - // if(maxcredit>4) maxcredit=4; - // if(maxcredit>4 && ar_nvsize==1) maxcredit=4; size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit - if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize; - // blocksize=elements*2; + if (blocksize > peerblock * ar_nvsize) + blocksize = peerblock * ar_nvsize; int sms = allreduce2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, stream, op); if (num_nodes > 1 && comm->launch_mode & NVTE_LAUNCH_CPU) { - if (!sms) return; + if (!sms) + return; comm->fifo[comm->head].optype = op; comm->fifo[comm->head].basecounter = 
comm->basecounter[op]; comm->fifo[comm->head].blocksize = blocksize; @@ -399,7 +445,8 @@ void allreduce2_userbuff_inplace(const int handler, const int offset, const int void allreduce_userbuff_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { - if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) + NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); allreduce_nonsharp_inplace(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp); return; @@ -407,7 +454,8 @@ void allreduce_userbuff_inplace(const int handler, const int offset, const int e void reducescatter_userbuff_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { - if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) + NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); int op = userbuffers_allreduceop_nonsharp; const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? 
comm->ar_nvsize : comm->ar2_nvsize; @@ -418,17 +466,20 @@ void reducescatter_userbuff_inplace(const int handler, const int offset, const i blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) / comm->nblocks; // FIXME TUNING blocksize *= comm->alignblock; - if (blocksize < comm->minblock) blocksize = comm->minblock; + if (blocksize < comm->minblock) + blocksize = comm->minblock; maxcredit = (elements * 2 + blocksize - 1) / blocksize; size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit - if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize; + if (blocksize > peerblock * ar_nvsize) + blocksize = peerblock * ar_nvsize; int sms = reducescatter2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, stream, op); if (num_nodes > 1 && comm->launch_mode & NVTE_LAUNCH_CPU) { - if (!sms) return; + if (!sms) + return; comm->fifo[comm->head].optype = op; comm->fifo[comm->head].basecounter = comm->basecounter[op]; comm->fifo[comm->head].blocksize = blocksize; @@ -448,7 +499,8 @@ void reducescatter_userbuff_inplace(const int handler, const int offset, const i void allgather_userbuff_inplace(const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { - if (elements < 64) NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); + if (elements < 64) + NVTE_UB_ERROR("Userbuffer comm for given config not implemented."); int op = userbuffers_allreduceop_nonsharp; const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? 
comm->ar_nvsize : comm->ar2_nvsize; int blocksize = elements * 2; @@ -458,11 +510,13 @@ void allgather_userbuff_inplace(const int handler, const int offset, const int e blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) / comm->nblocks; // FIXME TUNING blocksize *= comm->alignblock; - if (blocksize < comm->minblock) blocksize = comm->minblock; + if (blocksize < comm->minblock) + blocksize = comm->minblock; maxcredit = (elements * 2 + blocksize - 1) / blocksize; size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit - if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize; + if (blocksize > peerblock * ar_nvsize) + blocksize = peerblock * ar_nvsize; int sms = allgather2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, stream, op); diff --git a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu index 2c8e9dc61d..ecd17a45d7 100644 --- a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu @@ -12,22 +12,42 @@ #else #include #endif +#include "userbuffers.h" #include +#include #include -#include "userbuffers.h" #define MAX_THREADS 1024 #define TIMEOUT 200000000000ull -#define CUDACHECK(cmd) \ - do { \ - cudaError_t e = cmd; \ - if (e != cudaSuccess) { \ - printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) +#define ATOMIC_CONSUMER(chunk) \ + if (counters) { \ + if (threadIdx.x == 0 && blockIdx.x == 0) { \ + int old_val; \ + while (0 != (old_val = atomicCAS(((unsigned int *)counters) + chunk, 0, 0))) { \ + } \ + ((unsigned int 
*)counters)[chunk] = 1; \ + asm volatile("fence.sc.gpu;\n"); \ + } \ + if (blockIdx.x == 0) \ + __syncthreads(); \ + } + +#define ATOMIC_PRODUCER(chunk) \ + if (counters) { \ + ((unsigned int *)counters)[chunk] = 0; \ + asm volatile("fence.sc.gpu;\n"); \ + } + template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rw(const int op, const int flagoffset, const int firstrank, @@ -36,8 +56,7 @@ __global__ void __launch_bounds__(MAX_THREADS) __shared__ int4 *userptr[RANKS]; int *flagptr, physgpu, targetgpu, *myptr; int *reduceidptr, reduce_id; - // if(blockIdx.x==0 && threadIdx.x==0) printf("%d/%d(phys %d gpustep %d firstrank %d):RRkernel(d) - // start, size %lld\n",myrank,RANKS,gpustep*myrank+firstrank,gpustep,firstrank,numlines*16ull); + if (threadIdx.x < RANKS) { physgpu = myrank * gpustep + firstrank; targetgpu = threadIdx.x * gpustep + firstrank; @@ -66,7 +85,8 @@ __global__ void __launch_bounds__(MAX_THREADS) int warp = blockIdx.x + (threadIdx.x >> 5); int dest[RANKS]; #pragma unroll - for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); __syncthreads(); for (int line = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); line < numlines; @@ -86,7 +106,8 @@ __global__ void __launch_bounds__(MAX_THREADS) for (int i = 1; i < RANKS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < 8; j++) s[j] += x[j]; + for (int j = 0; j < 8; j++) + s[j] += x[j]; } #pragma unroll for (int i = 0; i < RANKS; i++) { @@ -96,7 +117,8 @@ __global__ void __launch_bounds__(MAX_THREADS) } __syncthreads(); - if (threadIdx.x == 0) __threadfence_system(); + if (threadIdx.x == 0) + __threadfence_system(); __syncthreads(); if (threadIdx.x < RANKS) { @@ -111,7 +133,8 @@ __global__ void __launch_bounds__(MAX_THREADS) } } } - if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; + if (threadIdx.x == 0 && 
blockIdx.x == 0) + *reduceidptr = reduce_id; } // fp16 inplace reduce kernel (Volta,Hopper) template @@ -150,7 +173,8 @@ __global__ void __launch_bounds__(MAX_THREADS) int warp = blockIdx.x + (threadIdx.x >> 5); int dest[RANKS]; #pragma unroll - for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); __syncthreads(); for (int line = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); line < numlines; @@ -169,13 +193,15 @@ __global__ void __launch_bounds__(MAX_THREADS) for (int i = 1; i < RANKS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < 8; j++) s[j] += x[j]; + for (int j = 0; j < 8; j++) + s[j] += x[j]; } userptr[myrank][lineoffset + line] = sum; } __syncthreads(); - if (threadIdx.x == 0) __threadfence(); + if (threadIdx.x == 0) + __threadfence(); __syncthreads(); if (threadIdx.x < RANKS) { @@ -217,7 +243,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userptr[myrank][lineoffset + line + blockDim.x * dest[i]] = val[i]; } } - if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; + if (threadIdx.x == 0 && blockIdx.x == 0) + *reduceidptr = reduce_id; } // fp16 inplace reduce kernel (Ampere) template @@ -227,19 +254,19 @@ __global__ void __launch_bounds__(MAX_THREADS) const int mylineoffset, const int totallines, void **commbuff, const int handleridx) { __shared__ int4 *userptr[RANKS]; - int *flagptr, physgpu, targetgpu, *myptr; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; int *reduceidptr, reduce_id; + int lastSM = 0; if (threadIdx.x < RANKS) { physgpu = myrank * gpustep + firstrank; targetgpu = threadIdx.x * gpustep + firstrank; - const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; reduceidptr = myptr - NVTE_MAX_OPS; // +op; reduce_id = (*reduceidptr) + 1; - flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + 
blockflagoffset; - myptr += blockflagoffset; - - flagptr[physgpu] = reduce_id; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; volatile int *flag = (volatile int *)&(myptr[targetgpu]); userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); clock_t s = clock64(); @@ -252,11 +279,18 @@ __global__ void __launch_bounds__(MAX_THREADS) } } __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } int warp = blockIdx.x + (threadIdx.x >> 5); int dest[RANKS]; #pragma unroll - for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); __syncthreads(); for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; @@ -275,13 +309,15 @@ __global__ void __launch_bounds__(MAX_THREADS) for (int i = 1; i < RANKS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < 8; j++) s[j] += x[j]; + for (int j = 0; j < 8; j++) + s[j] += x[j]; } userptr[myrank][mylineoffset + line] = sum; } - if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; } // fp16 inplace reduce-scatter kernel template @@ -293,19 +329,19 @@ __global__ void __launch_bounds__(MAX_THREADS) const int skiplines, void **commbuff, const int handleridx, void *outbuf) { __shared__ int4 *userptr[RANKS]; - int *flagptr, physgpu, targetgpu, *myptr; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; int *reduceidptr, reduce_id; + int lastSM = 0; if (threadIdx.x < RANKS) { physgpu = myrank * gpustep + firstrank; targetgpu = threadIdx.x * gpustep + firstrank; - const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; myptr = 
(reinterpret_cast(commbuff[physgpu])) + flagoffset; reduceidptr = myptr - NVTE_MAX_OPS; // +op; reduce_id = (*reduceidptr) + 1; - flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; - myptr += blockflagoffset; - - flagptr[physgpu] = reduce_id; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; volatile int *flag = (volatile int *)&(myptr[targetgpu]); userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); clock_t s = clock64(); @@ -318,11 +354,18 @@ __global__ void __launch_bounds__(MAX_THREADS) } } __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } int warp = blockIdx.x + (threadIdx.x >> 5); int dest[RANKS]; #pragma unroll - for (int i = 0; i < RANKS; i++) dest[i] = (i + myrank + warp) & (RANKS - 1); + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); __syncthreads(); for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; @@ -341,24 +384,28 @@ __global__ void __launch_bounds__(MAX_THREADS) for (int i = 1; i < RANKS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < 8; j++) s[j] += x[j]; + for (int j = 0; j < 8; j++) + s[j] += x[j]; } (reinterpret_cast(outbuf))[(line / rowlines) * skiplines + (line % rowlines)] = sum; } - if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; } // fp16 reduce-scatter kernel (out of place) +#if 0 +// All MC kernels here template __global__ void __launch_bounds__(MAX_THREADS) - userbuffers_fp16_sum_inplace_gpu_rr_ag(const int op, const int flagoffset, const int firstrank, - const int myrank, const int gpustep, - const int mylineoffset, const int totallines, - void 
**commbuff, const int handleridx) { - __shared__ int4 *userptr[RANKS]; + userbuffers_fp16_sum_inplace_gpu_mc(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, const int lineoffset, + const int numlines, void **commbuff, const int handleridx, + float4 *mc_ptr) { int *flagptr, physgpu, targetgpu, *myptr; int *reduceidptr, reduce_id; + if (threadIdx.x < RANKS) { physgpu = myrank * gpustep + firstrank; targetgpu = threadIdx.x * gpustep + firstrank; @@ -371,114 +418,322 @@ __global__ void __launch_bounds__(MAX_THREADS) flagptr[physgpu] = reduce_id; volatile int *flag = (volatile int *)&(myptr[targetgpu]); - userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); clock_t s = clock64(); - } - - int warp = blockIdx.x + (threadIdx.x >> 5); - int dest[RANKS]; - - int skipmy = 0; -#pragma unroll - for (int i = 0; i < RANKS; i++) { - int dst = (i + warp + myrank) & (RANKS - 1); - if (dst == myrank) { - skipmy++; - continue; + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } } - dest[i - skipmy] = dst; + reduce_id++; } __syncthreads(); +#define UNROLL_MC 8 + const int loop_step0 = blockDim.x * gridDim.x * RANKS; + const int loop_step = loop_step0 * UNROLL_MC; + const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); + const int end_elem = max(start_elem, numlines); + const int aligned_elem = ((end_elem - start_elem) / loop_step) * loop_step; + const int end_aligned = start_elem + aligned_elem; - for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; - line += blockDim.x * gridDim.x) { - int4 val[RANKS - 1]; - + for (int line = start_elem; line < end_aligned; line += loop_step) { + uint4 val[UNROLL_MC]; #pragma unroll - for (int i = 0; i < RANKS - 1; i++) { - val[i] = userptr[dest[i]][mylineoffset + line + totallines * dest[i]]; - } - + for (int 
i = 0; i < UNROLL_MC; i++) +#if defined(NVTE_UB_FP16) + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val[i].x), "=r"(val[i].y), "=r"(val[i].z), "=r"(val[i].w) + : "l"(mc_ptr + (lineoffset + line + i * loop_step0)) + : "memory"); +#else + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val[i].x), "=r"(val[i].y), "=r"(val[i].z), "=r"(val[i].w) + : "l"(mc_ptr + (lineoffset + line + i * loop_step0)) + : "memory"); +#endif #pragma unroll - for (int i = 0; i < RANKS - 1; i++) { - userptr[myrank][mylineoffset + line + totallines * dest[i]] = val[i]; + for (int i = 0; i < UNROLL_MC; i++) + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"( + mc_ptr + (lineoffset + line + i * loop_step0)), + "r"(val[i].x), "r"(val[i].y), "r"(val[i].z), "r"(val[i].w) + : "memory"); + } + for (int line = end_aligned; line < end_elem; line += loop_step0) { + uint4 val; +#if defined(NVTE_UB_FP16) + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(mc_ptr + (lineoffset + line)) + : "memory"); +#else + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(mc_ptr + (lineoffset + line)) + : "memory"); +#endif + asm volatile( + "multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(mc_ptr + (lineoffset + line)), + "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) + : "memory"); + } + + __syncthreads(); + if (threadIdx.x == 0) + __threadfence_system(); + __syncthreads(); + + if (threadIdx.x < RANKS) { + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&myptr[targetgpu]; + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > 2ull * TIMEOUT) { + printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } } } - if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; -} // 
fp16 inplace reduce kernel (Ampere) + if (threadIdx.x == 0 && blockIdx.x == 0) + *reduceidptr = reduce_id; +} // fp16 inplace reduce kernel (Hopper) MC template __global__ void __launch_bounds__(MAX_THREADS) - userbuffers_fp16_sum_inplace_gpu_rw_ag(const int op, const int flagoffset, const int firstrank, + userbuffers_fp16_sum_inplace_gpu_mc_rs(const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep, const int mylineoffset, const int totallines, - void **commbuff, const int handleridx) { - __shared__ int4 *userptr[RANKS]; - int *flagptr, physgpu, targetgpu, *myptr; + void **commbuff, const int handleridx, float4 *mc_ptr) { + volatile int *flagptr; + int physgpu, targetgpu, *myptr; int *reduceidptr, reduce_id; - int4 *localptr; + uint4 *localptr = reinterpret_cast(commbuff[myrank * gpustep + firstrank + handleridx]); + int lastSM = 0; + if (threadIdx.x < RANKS) { physgpu = myrank * gpustep + firstrank; targetgpu = threadIdx.x * gpustep + firstrank; - const int blockflagoffset = NVTE_MAX_NVLINK * 2 * blockIdx.x; myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; reduceidptr = myptr - NVTE_MAX_OPS; // +op; reduce_id = (*reduceidptr) + 1; - flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset + blockflagoffset; - myptr += blockflagoffset; - userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); - reduce_id++; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } } __syncthreads(); - localptr = userptr[myrank]; + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? 
NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } + const int loop_step0 = blockDim.x * gridDim.x; + const int loop_step = loop_step0 * UNROLL_MC; + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = max(start_elem, totallines); + const int aligned_elem = ((end_elem - start_elem) / loop_step) * loop_step; + const int end_aligned = start_elem + aligned_elem; - int warp = blockIdx.x + (threadIdx.x >> 5); - int dest[RANKS - 1]; - int skipmy = 0; + for (int line = start_elem; line < end_aligned; line += loop_step) { + uint4 val[UNROLL_MC]; #pragma unroll - for (int i = 0; i < RANKS; i++) { - int dst = (i + warp + myrank) & (RANKS - 1); - if (dst == myrank) { - skipmy++; - continue; + for (int i = 0; i < UNROLL_MC; i++) +#if defined(NVTE_UB_FP16) + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val[i].x), "=r"(val[i].y), "=r"(val[i].z), "=r"(val[i].w) + : "l"(mc_ptr + (mylineoffset + line + i * loop_step0)) + : "memory"); +#else + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val[i].x), "=r"(val[i].y), "=r"(val[i].z), "=r"(val[i].w) + : "l"(mc_ptr + (mylineoffset + line + i * loop_step0)) + : "memory"); +#endif +#pragma unroll + for (int i = 0; i < UNROLL_MC; i++) + localptr[mylineoffset + line + i * loop_step0] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += loop_step0) { + uint4 val; +#if defined(NVTE_UB_FP16) + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(mc_ptr + (mylineoffset + line)) + : "memory"); +#else + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(mc_ptr + (mylineoffset + line)) + : "memory"); +#endif + localptr[mylineoffset + line] = val; + } + + if (threadIdx.x == 
0 && lastSM) + *reduceidptr = reduce_id; +} // fp16 inplace reduce-scatter kernel MC + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_mc_rs_oop(const int op, const int flagoffset, + const int firstrank, const int myrank, + const int gpustep, const int mylineoffset, + const int totallines, const int rowlines, + const int skiplines, void **commbuff, + const int handleridx, void *outbuf, float4 *mc_ptr) { + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; + + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x, + threadIdx.x, reduce_id, *flag); + break; + } } - dest[i - skipmy] = dst; } -#define UNROLLAG 4 __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? 
NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } + const int loop_step0 = blockDim.x * gridDim.x; - const int loop_step = loop_step0 * UNROLLAG; + const int loop_step = loop_step0 * UNROLL_MC; const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; const int end_elem = max(start_elem, totallines); const int aligned_elem = ((end_elem - start_elem) / loop_step) * loop_step; const int end_aligned = start_elem + aligned_elem; - for (int line = start_elem; line < end_aligned; line += loop_step) { - int4 val[UNROLLAG]; + uint4 val[UNROLL_MC]; +#pragma unroll + for (int i = 0; i < UNROLL_MC; i++) +#if defined(NVTE_UB_FP16) + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val[i].x), "=r"(val[i].y), "=r"(val[i].z), "=r"(val[i].w) + : "l"(mc_ptr + (mylineoffset + line + i * loop_step0)) + : "memory"); +#else + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val[i].x), "=r"(val[i].y), "=r"(val[i].z), "=r"(val[i].w) + : "l"(mc_ptr + (mylineoffset + line + i * loop_step0)) + : "memory"); +#endif #pragma unroll - for (int j = 0; j < UNROLLAG; j++) val[j] = localptr[mylineoffset + line + loop_step0 * j]; + for (int i = 0; i < UNROLL_MC; i++) + (reinterpret_cast(outbuf))[((line + i * loop_step0) / rowlines) * skiplines + + ((line + i * loop_step0) % rowlines)] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += loop_step0) { + uint4 val; +#if defined(NVTE_UB_FP16) + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(mc_ptr + (mylineoffset + line)) + : "memory"); +#else + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(mc_ptr + (mylineoffset + line)) + : "memory"); +#endif + reinterpret_cast (outbuf)[(line / rowlines) * skiplines 
+ (line % rowlines)] = val; + } + + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) fp16 MC + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_mc_ag(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx, uint4 *mc_ptr) { + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + uint4 *localptr = reinterpret_cast(commbuff[myrank * gpustep + firstrank + handleridx]); + + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + } + __syncthreads(); + const int loop_step0 = blockDim.x * gridDim.x; + const int loop_step = loop_step0 * UNROLL_MC; + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = max(start_elem, totallines); + const int aligned_elem = ((end_elem - start_elem) / loop_step) * loop_step; + const int end_aligned = start_elem + aligned_elem; + for (int line = start_elem; line < end_aligned; line += loop_step) { + uint4 val[UNROLL_MC]; #pragma unroll - for (int j = 0; j < UNROLLAG; j++) + for (int i = 0; i < UNROLL_MC; i++) + val[i] = localptr[mylineoffset + line + i * loop_step0]; #pragma unroll - for (int i = 0; i < RANKS - 1; i++) { - userptr[dest[i]][mylineoffset + line + j * loop_step0] = val[j]; - } + for (int i = 0; i < UNROLL_MC; i++) + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"( + mc_ptr + (mylineoffset + line + i * loop_step0)), + "r"(val[i].x), "r"(val[i].y), "r"(val[i].z), "r"(val[i].w) + : "memory"); } - for (int line = end_aligned; line < end_elem; line += 
loop_step0) { - int4 sum = localptr[mylineoffset + line]; -#pragma unroll - for (int i = 0; i < RANKS - 1; i++) { - userptr[dest[i]][mylineoffset + line] = sum; - } + uint4 val = localptr[mylineoffset + line]; + asm volatile( + "multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(mc_ptr + (mylineoffset + line)), + "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) + : "memory"); } __syncthreads(); - if (threadIdx.x == 0) __threadfence_system(); + if (threadIdx.x == 0) + __threadfence_system(); __syncthreads(); - if (threadIdx.x < RANKS) { + __shared__ int lastSM; + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + else + lastSM = 0; + } + __syncthreads(); + if (lastSM && threadIdx.x < RANKS) { + if (threadIdx.x == 0) + *reduceidptr = reduce_id; flagptr[physgpu] = reduce_id; volatile int *flag = (volatile int *)&myptr[targetgpu]; clock_t s = clock64(); @@ -490,229 +745,983 @@ __global__ void __launch_bounds__(MAX_THREADS) } } } - if (threadIdx.x == 0 && blockIdx.x == 0) *reduceidptr = reduce_id; -} // fp16 inplace allgather kernel (Volta,Hopper) +} // fp16 inplace allgather kernel (Hopper) MC +#else template __global__ void __launch_bounds__(MAX_THREADS) - userbuffers_fp16_sum_inplace_gpu_rr_blocked(const int op, const int flagoffset, - const int firstrank, const int myrank, - const int lineoffset, const int numlines, - void **commbuff, const int handleridx, - const int peerblocklines, int *hostflags, - int *gpuflag, const int numblocks) { - const int basecounter = gpuflag[NVTE_GF_STATE + op]; + userbuffers_fp16_sum_inplace_gpu_mc(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, const int lineoffset, + const int numlines, void **commbuff, const int handleridx, + float4 *mc_ptr) {} +template +__global__ void __launch_bounds__(MAX_THREADS) 
userbuffers_fp16_sum_inplace_gpu_mc_rs_oop( + const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep, + const int mylineoffset, const int totallines, const int rowlines, const int skiplines, + void **commbuff, const int handleridx, void *outbuf, float4 *mc_ptr) {} +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_mc_ag(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx, uint4 *mc_ptr) {} +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_mc_rs(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx, float4 *mc_ptr) {} +#endif -#define REDUCETHREADS (blockDim.x - 32) +template +__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_fp8( + const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep, + const int mylineoffset, const int totallines, const int rowlines, const int skiplines, + void **commbuff, const int handleridx, void *outbuf, float *scale) { + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; + half hscale = (half)*scale; - if (threadIdx.x < 32) { - int *flagptr; - if (threadIdx.x < RANKS) { - if (!blockIdx.x) { - flagptr = reinterpret_cast(commbuff[threadIdx.x + firstrank]); - flagptr[flagoffset + myrank + firstrank] = basecounter; - } - volatile int *flag = (volatile int *)&((reinterpret_cast( - commbuff[myrank + firstrank]))[flagoffset + threadIdx.x + firstrank]); - while (*flag < basecounter) { + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + 
myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x, + threadIdx.x, reduce_id, *flag); + break; } } - __syncthreads(); - - int startblock = 0, endblock = numblocks; + } + __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); - for (int nblock = 0; nblock < endblock; nblock++) { - asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + __syncthreads(); + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS]; - if (threadIdx.x == 0) { - __threadfence(); - if (blockIdx.x) gpuflag[op * NVTE_MAX_SMS * 2 + blockIdx.x] = nblock + basecounter + 1; - } else if (blockIdx.x == 0) { - int expecting = (basecounter + nblock + 1); - if (threadIdx.x < gridDim.x) - while (((volatile int *)gpuflag)[op * NVTE_MAX_SMS * 2 + threadIdx.x] < expecting) { - } - } - if (!blockIdx.x) { - asm volatile("bar.sync 15, %0;" ::"r"(32)); - if (!threadIdx.x) hostflags[0] = nblock + basecounter + 1; - } +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][mylineoffset + line]; } - int cachedflag = basecounter; + int4 sum[2] = {{0, 0, 0, 
0}, {0, 0, 0, 0}}; + half *s = reinterpret_cast(&sum); -#define ALLGATHERFLAG NVTE_GF_IBSHARPDONE +#pragma unroll + for (int i = 0; i < RANKS; i++) { + fp8type *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < sizeof(int4) / sizeof(fp8type); j++) + s[j] += hscale * (half)(x[j]); + } + int hline = 2 * line; + (reinterpret_cast(outbuf))[(hline / rowlines) * skiplines + (hline % rowlines)] = + sum[0]; + hline++; + (reinterpret_cast(outbuf))[(hline / rowlines) * skiplines + (hline % rowlines)] = + sum[1]; + } - if (blockIdx.x == 0 && threadIdx.x < RANKS) { - while (cachedflag < basecounter + numblocks) { - int newflag = ((volatile int *)gpuflag)[ALLGATHERFLAG]; - if (newflag == cachedflag) continue; - cachedflag = newflag; - flagptr[flagoffset + myrank + 32 + firstrank] = cachedflag; + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) (fp8->fp16) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_atomic_fp8( + const int op, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int mylineoffset, const int totallines, const int rowlines, + const int skiplines_out, const int skiplines_in, void **commbuff, const int handleridx, + void *outbuf, float *scale, void *counters, const int numchunks, const int atomicindex) { + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; + half hscale = (half)*scale; + + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + // const int blockflagoffset = MAX_NVLINK * 2 * blockIdx.x; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr); + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; // + blockflagoffset; + } + + for (int 
chunk_i = 0; chunk_i < numchunks; chunk_i++) { + ATOMIC_CONSUMER(chunk_i); + + lastSM = 0; + if (threadIdx.x < RANKS) { + reduce_id++; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x, + threadIdx.x, reduce_id, *flag); + break; + } } } + __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), /*numchunks * */ adder); + if (old_val + adder == NVTE_MAX_SMS * (reduce_id /* + numchunks*/)) + lastSM = 1; + } - if (blockIdx.x == 0 && threadIdx.x == 0) gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; - } else { - const int warp = blockIdx.x + (threadIdx.x >> 5); - int4 *userptr[RANKS]; - int4 *userptrmyrank; + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; #pragma unroll for (int i = 0; i < RANKS; i++) - userptr[i] = reinterpret_cast( - commbuff[((i + myrank + warp) & (RANKS - 1)) + handleridx + firstrank]); - userptrmyrank = reinterpret_cast(commbuff[myrank + handleridx + firstrank]); + dest[i] = (i + myrank + warp) & (RANKS - 1); + __syncthreads(); + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS]; + const int rowlines_in = rowlines / 2; + const int index_in = skiplines_in == 0 + ? mylineoffset + myrank * totallines + line + : (numchunks <= 1 ? 
1 : chunk_i) * mylineoffset + + myrank * (totallines * skiplines_in / rowlines_in) + + (line / rowlines_in) * skiplines_in + (line % rowlines_in); + const int index1_out = chunk_i * mylineoffset * 2 + ((2 * line) / rowlines) * skiplines_out + + ((2 * line) % rowlines); + const int index2_out = chunk_i * mylineoffset * 2 + + ((2 * line + 1) / rowlines) * skiplines_out + + ((2 * line + 1) % rowlines); - int blocklineoffset = 0; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][index_in]; + } - while (blocklineoffset < numlines) { - const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); - const int blocklines = remainder / RANKS; - const int blockstart = lineoffset + blocklineoffset + blocklines * myrank; + int4 sum[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}}; + half *s = reinterpret_cast(&sum); - for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines; - line += REDUCETHREADS * gridDim.x) { - int4 val[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + fp8type *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < sizeof(int4) / sizeof(fp8type); j++) + s[j] += hscale * (half)(x[j]); + } + (reinterpret_cast(outbuf))[index1_out] = sum[0]; + (reinterpret_cast(outbuf))[index2_out] = sum[1]; + } + } + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) (fp8->fp16) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride(const int op, const int flagoffset, + const int firstrank, const int myrank, + const int gpustep, const int mylineoffset, + const int totallines, const int rowlines, + const int skiplines, void **commbuff, + const int handleridx, void *outbuf) { + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; + + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + 
firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x, + threadIdx.x, reduce_id, *flag); + break; + } + } + } + __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; #pragma unroll - for (int i = 0; i < RANKS; i++) { - val[i] = userptr[i][blockstart + line]; - } + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); - int4 sum = val[0]; - half *s = reinterpret_cast(&sum); + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS]; + int index_in = mylineoffset + myrank * (totallines * skiplines / rowlines) + + (line / rowlines) * skiplines + (line % rowlines); #pragma unroll - for (int i = 1; i < RANKS; i++) { - half *x = reinterpret_cast(&val[i]); + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][index_in]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + #pragma unroll - for (int j = 0; j < sizeof(int4) / sizeof(half); j++) s[j] += x[j]; - } + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) + s[j] += x[j]; + } - userptrmyrank[blockstart + line] = 
sum; - } // single block loop + int index_out = (line / rowlines) * skiplines + (line % rowlines); + (reinterpret_cast(outbuf))[index_out] = sum; + } - asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) fp16 + +#if 0 +template +__global__ void +__launch_bounds__(MAX_THREADS) +userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic_fp8( + const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep, + const int mylineoffset, const int totallines, const int rowlines, const int skiplines, + const int numchunks, void **commbuff, const int handleridx, void* outbuf, void *counters, + float* scale) { + if (counters) { + if ( threadIdx.x == 0 ) { + // spin-lock on counter from producer + int old_val; + while (0 != (old_val = atomicCAS(((unsigned int*)counters), 0, 0) )) {} + + // make sure all threadblocks have read/waited on counters. + int old_val2; + atomicInc(((unsigned int *)counters)+numchunks, gridDim.x-1); + while (0 != (old_val2 = atomicCAS(((unsigned int*)counters)+numchunks, 0, 0) )) {} + + // reset counter for next producer. 
+ ((unsigned int*)counters)[0] = 1; + asm volatile ("fence.sc.gpu;\n"); + } + } + __syncthreads(); - blocklineoffset += peerblocklines * RANKS; - } // block loop NVLINK-REDUCESCATTER - const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1); - const int myblockDim = nwarps << 5; - const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1); - const int maxthreadIdx = myblockDim * (RANKS - 1) + 32; - const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1); - const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31); - volatile int *flag = (volatile int *)&((reinterpret_cast( - commbuff[myrank + firstrank]))[flagoffset + mydest + 32 + firstrank]); + __shared__ int4* userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; + half hscale = (half) *scale; - int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)]; + if (threadIdx.x < RANKS) { + physgpu = myrank*gpustep+firstrank; + targetgpu = threadIdx.x*gpustep+firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr-NVTE_MAX_OPS; // +op; + reduce_id =(*reduceidptr)+1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) flagptr[physgpu] = reduce_id; + volatile int* flag = (volatile int*)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu+handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64()-s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", + myrank, blockIdx.x, threadIdx.x, reduce_id, *flag); + break; + } + } + } + __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? 
NVTE_MAX_SMS-gridDim.x+1 : 1; + int old_val = atomicAdd(myptr+(NVTE_MAX_NVLINK*2), adder); + if (old_val+adder == NVTE_MAX_SMS*reduce_id) lastSM = 1; + } - blocklineoffset = 0; - int gathercounter = basecounter + 1; - while (blocklineoffset < numlines) { - const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); - const int blocklines = remainder / RANKS; - const int blockstart = lineoffset + blocklineoffset; -#define UNROLL 6 - int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest]; - int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest]; + int warp = blockIdx.x+(threadIdx.x>>5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) + dest[i] = (i+myrank+warp)&(RANKS-1); - if (threadIdx.x < maxthreadIdx) { - const int start_elem = mythreadIdx + myblockDim * blockIdx.x; - const int end_elem = max(start_elem, blocklines); - const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) * - (myblockDim * gridDim.x * UNROLL); - const int end_aligned = start_elem + aligned_elem; + for (int line = threadIdx.x+blockDim.x*blockIdx.x; + line < totallines; line+=blockDim.x*gridDim.x) { + int4 val[RANKS]; + int index_in = mylineoffset + myrank*(totallines*skiplines/rowlines/2) + + (line/rowlines)*skiplines/2+(line%rowlines); - if (mythreadIdx == 0) { - while (*flag < gathercounter) { - } - gathercounter++; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][index_in]; } - asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim)); + int4 sum[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}}; + half *s = reinterpret_cast(&sum); - for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { - int4 val[UNROLL]; #pragma unroll - for (int i = 0; i < UNROLL; i++) val[i] = peerptr[line + i * myblockDim * gridDim.x]; + for (int i = 0; i < RANKS; i++) { + fp8type *x = reinterpret_cast(&val[i]); #pragma unroll - for (int i = 0; i < UNROLL; i++) myptr[line 
+ i * myblockDim * gridDim.x] = val[i]; + for (int j=0; j < sizeof(int4)/sizeof(fp8type); j++) s[j] += hscale * (half)(x[j]); } - for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) - myptr[line] = peerptr[line]; + int hline = 2*line; + int index_out1 = (hline/rowlines)*skiplines+(hline%rowlines); + (reinterpret_cast(outbuf))[index_out1] = sum[0]; + hline++; + int index_out2 = (hline/rowlines)*skiplines+(hline%rowlines); + (reinterpret_cast(outbuf))[index_out2] = sum[1]; } - blocklineoffset += peerblocklines * RANKS; - } // block loop for NVLINK-ALLGATHER - } // worker warps else block -} // fp16 inplace reduce kernel with SHARP / in blocks -// threadfence and SMs sync to SM0 -#define SMBAR(offset, block) \ - asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); \ - if (threadIdx.x == 0) { \ - __threadfence_system(); \ - if (blockIdx.x) gpuflag[offset + blockIdx.x] = block + basecounter + 1; \ - } else if (blockIdx.x == 0) { \ - int expecting = (basecounter + block + 1); \ - if (threadIdx.x < gridDim.x) \ - while (((volatile int *)gpuflag)[offset + threadIdx.x] < expecting) { \ - } \ - } \ - if (blockIdx.x == 0) asm volatile("bar.sync 15, %0;" ::"r"(32)); + if (threadIdx.x == 0 && lastSM) *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) fp16 +#endif template -__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2( - const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks, - const int commbufoffset, const int flagoffset, const int firstrank, const int myrank, - const int gpustep, const int lineoffset, const int numlines, void **commbuff, - const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag, - const int numblocks) { - const int basecounter = gpuflag[NVTE_GF_STATE + op]; - if (threadIdx.x < 32) { - int *flagptr; - volatile int *localflag = (volatile int *)&( - ((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); 
// NOLINT(*) - // initial intranode barrier - once - if (threadIdx.x < RANKS) { - if (!blockIdx.x) { - flagptr = reinterpret_cast(commbuff[gpustep * threadIdx.x + firstrank]); - flagptr[flagoffset + gpustep * myrank + firstrank] = basecounter; +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic( + const int op, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int mylineoffset, const int totallines, const int rowlines, + const int skiplines, const int numchunks, void **commbuff, const int handleridx, + void *outbuf, void *counters) { + if (counters) { + if (threadIdx.x == 0) { + // spin-lock on counter from producer + int old_val; + while (0 != (old_val = atomicCAS(((unsigned int *)counters), 0, 0))) { } - volatile int *flag = &localflag[gpustep * threadIdx.x + firstrank]; - while (*flag < basecounter) { + + // make sure all threadblocks have read/waited on counters. + int old_val2; + atomicInc(((unsigned int *)counters) + numchunks, gridDim.x - 1); + while (0 != (old_val2 = atomicCAS(((unsigned int *)counters) + numchunks, 0, 0))) { } + + // reset counter for next producer. 
+ ((unsigned int *)counters)[0] = 1; + asm volatile("fence.sc.gpu;\n"); } - __syncthreads(); + } + __syncthreads(); - for (int nblock = 0; nblock < numblocks + headstart; nblock++) { - if (nblock < numblocks) { - // RS happens here - SMBAR(op * 2 * NVTE_MAX_SMS, nblock); - if (!blockIdx.x && !threadIdx.x) - hostflags[NVTE_HF_NVRSDONE + (op & 1)] = nblock + basecounter + 1; - } + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; - if (nblock >= headstart) { - for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) - if (ibflag != myibrank) - while (localflag[NVTE_REG0_IBRS + ibflag] < basecounter + nblock - headstart + 1) { - } - asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); - // REDUCE happens here - SMBAR(op * 2 * NVTE_MAX_SMS + NVTE_MAX_SMS, nblock - headstart); - if (!blockIdx.x && !threadIdx.x) - hostflags[NVTE_HF_NVREDUCEDONE + (op & 1)] = nblock + basecounter + 1 - headstart; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x, + threadIdx.x, reduce_id, *flag); + break; } } - // final part doing NVAG based on responses from NIC-RMW:IBAG + } + __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? 
NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } - if (blockIdx.x == 0) { - for (int nblock = 0; nblock < numblocks; nblock++) { - const int expected = basecounter + nblock + 1; - for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); + + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS]; + int index_in = mylineoffset + myrank * (totallines * skiplines / rowlines) + + (line / rowlines) * skiplines + (line % rowlines); + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][index_in]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) + s[j] += x[j]; + } + + int index_out = (line / rowlines) * skiplines + (line % rowlines); + (reinterpret_cast(outbuf))[index_out] = sum; + } + + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; +} // fp16 reduce-scatter kernel (out of place) fp16 + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_multiatomic( + const int op, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int mylineoffset, const int totallines, const int rowlines, + const int skiplines, const int numchunks, void **commbuff, const int handleridx, + void *outbuf, void *counters) { + for (int chunk_i = 0; chunk_i < numchunks; chunk_i++) { + if (counters) { + if (threadIdx.x == 0) { + // spin-lock on counter from producer + int old_val; + while (0 != (old_val = atomicCAS(((unsigned int *)counters) + chunk_i, 0, 0))) { + } + + // 
make sure all threadblocks have read/waited on counters. + int old_val2; + atomicInc(((unsigned int *)counters) + numchunks + chunk_i, gridDim.x - 1); + while (0 != + (old_val2 = atomicCAS(((unsigned int *)counters) + numchunks + chunk_i, 0, 0))) { + } + + // reset counter for next producer. + ((unsigned int *)counters)[chunk_i] = 1; + asm volatile("fence.sc.gpu;\n"); + } + } + __syncthreads(); + + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int lastSM = 0; + + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + if (blockIdx.x == 0) + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x, + threadIdx.x, reduce_id, *flag); + break; + } + } + } + __syncthreads(); + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? 
NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + } + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; +#pragma unroll + for (int i = 0; i < RANKS; i++) + dest[i] = (i + myrank + warp) & (RANKS - 1); + + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS]; + int index_in = chunk_i * mylineoffset + myrank * (totallines * skiplines / rowlines) + + (line / rowlines) * skiplines + (line % rowlines); + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[dest[i]][index_in]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < 8; j++) + s[j] += x[j]; + } + + int index_out = chunk_i * mylineoffset + (line / rowlines) * skiplines + (line % rowlines); + (reinterpret_cast(outbuf))[index_out] = sum; + } + if (threadIdx.x == 0 && lastSM) + *reduceidptr = reduce_id; + } +} // fp16 reduce-scatter kernel (out of place) fp16 + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_ag(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + if (threadIdx.x < RANKS) { + physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + volatile int *flag = (volatile int *)&(myptr[targetgpu]); + 
userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + clock_t s = clock64(); + } + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS]; + + int skipmy = 0; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + int dst = (i + warp + myrank) & (RANKS - 1); + if (dst == myrank) { + skipmy++; + continue; + } + dest[i - skipmy] = dst; + } + __syncthreads(); + + for (int line = threadIdx.x + blockDim.x * blockIdx.x; line < totallines; + line += blockDim.x * gridDim.x) { + int4 val[RANKS - 1]; + +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + val[i] = userptr[dest[i]][mylineoffset + line + totallines * dest[i]]; + } + +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[myrank][mylineoffset + line + totallines * dest[i]] = val[i]; + } + } + __shared__ int lastSM; + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + else + lastSM = 0; + } + __syncthreads(); + if (lastSM && threadIdx.x < RANKS) { + if (threadIdx.x == 0) + *reduceidptr = reduce_id; + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&myptr[targetgpu]; + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > 2ull * TIMEOUT) { + printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } +} // fp16 inplace reduce kernel (Ampere) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rw_ag(const int op, const int flagoffset, const int firstrank, + const int myrank, const int gpustep, + const int mylineoffset, const int totallines, + void **commbuff, const int handleridx) { + __shared__ int4 *userptr[RANKS]; + volatile int *flagptr; + int physgpu, targetgpu, *myptr; + int *reduceidptr, reduce_id; + int4 *localptr; + if (threadIdx.x < RANKS) { 
+ physgpu = myrank * gpustep + firstrank; + targetgpu = threadIdx.x * gpustep + firstrank; + myptr = (reinterpret_cast(commbuff[physgpu])) + flagoffset; + reduceidptr = myptr - NVTE_MAX_OPS; // +op; + reduce_id = (*reduceidptr) + 1; + flagptr = (reinterpret_cast(commbuff[targetgpu])) + flagoffset; + userptr[threadIdx.x] = reinterpret_cast(commbuff[targetgpu + handleridx]); + } + __syncthreads(); + localptr = userptr[myrank]; + + int warp = blockIdx.x + (threadIdx.x >> 5); + int dest[RANKS - 1]; + int skipmy = 0; +#pragma unroll + for (int i = 0; i < RANKS; i++) { + int dst = (i + warp + myrank) & (RANKS - 1); + if (dst == myrank) { + skipmy++; + continue; + } + dest[i - skipmy] = dst; + } +#define UNROLLAG 4 + __syncthreads(); + const int loop_step0 = blockDim.x * gridDim.x; + const int loop_step = loop_step0 * UNROLLAG; + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = max(start_elem, totallines); + const int aligned_elem = ((end_elem - start_elem) / loop_step) * loop_step; + const int end_aligned = start_elem + aligned_elem; + + for (int line = start_elem; line < end_aligned; line += loop_step) { + int4 val[UNROLLAG]; +#pragma unroll + for (int j = 0; j < UNROLLAG; j++) + val[j] = localptr[mylineoffset + line + loop_step0 * j]; + +#pragma unroll + for (int j = 0; j < UNROLLAG; j++) +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[dest[i]][mylineoffset + line + j * loop_step0] = val[j]; + } + } + + for (int line = end_aligned; line < end_elem; line += loop_step0) { + int4 sum = localptr[mylineoffset + line]; +#pragma unroll + for (int i = 0; i < RANKS - 1; i++) { + userptr[dest[i]][mylineoffset + line] = sum; + } + } + + __syncthreads(); + if (threadIdx.x == 0) + __threadfence_system(); + __syncthreads(); + + __shared__ int lastSM; + if (threadIdx.x == 0) { + const int adder = blockIdx.x == 0 ? 
NVTE_MAX_SMS - gridDim.x + 1 : 1; + int old_val = atomicAdd(myptr + (NVTE_MAX_NVLINK * 2), adder); + if (old_val + adder == NVTE_MAX_SMS * reduce_id) + lastSM = 1; + else + lastSM = 0; + } + __syncthreads(); + if (lastSM && threadIdx.x < RANKS) { + if (threadIdx.x == 0) + *reduceidptr = reduce_id; + flagptr[physgpu] = reduce_id; + volatile int *flag = (volatile int *)&myptr[targetgpu]; + clock_t s = clock64(); + while (*flag < reduce_id) { + if (clock64() - s > 2ull * TIMEOUT) { + printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id, + *flag); + break; + } + } + } +} // fp16 inplace allgather kernel (Volta,Hopper) + +template +__global__ void __launch_bounds__(MAX_THREADS) + userbuffers_fp16_sum_inplace_gpu_rr_blocked(const int op, const int flagoffset, + const int firstrank, const int myrank, + const int lineoffset, const int numlines, + void **commbuff, const int handleridx, + const int peerblocklines, int *hostflags, + int *gpuflag, const int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op]; + +#define REDUCETHREADS (blockDim.x - 32) + + if (threadIdx.x < 32) { + int *flagptr; + if (threadIdx.x < RANKS) { + if (!blockIdx.x) { + flagptr = reinterpret_cast(commbuff[threadIdx.x + firstrank]); + flagptr[flagoffset + myrank + firstrank] = basecounter; + } + volatile int *flag = (volatile int *)&((reinterpret_cast( + commbuff[myrank + firstrank]))[flagoffset + threadIdx.x + firstrank]); + while (*flag < basecounter) { + } + } + __syncthreads(); + + int startblock = 0, endblock = numblocks; + + for (int nblock = 0; nblock < endblock; nblock++) { + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + + if (threadIdx.x == 0) { + __threadfence(); + if (blockIdx.x) + gpuflag[op * NVTE_MAX_SMS * 2 + blockIdx.x] = nblock + basecounter + 1; + } else if (blockIdx.x == 0) { + int expecting = (basecounter + nblock + 1); + if (threadIdx.x < gridDim.x) + while (((volatile int *)gpuflag)[op * NVTE_MAX_SMS * 2 + 
threadIdx.x] < expecting) { + } + } + if (!blockIdx.x) { + asm volatile("bar.sync 15, %0;" ::"r"(32)); + if (!threadIdx.x) + hostflags[0] = nblock + basecounter + 1; + } + } + + int cachedflag = basecounter; + +#define ALLGATHERFLAG NVTE_GF_IBSHARPDONE + + if (blockIdx.x == 0 && threadIdx.x < RANKS) { + while (cachedflag < basecounter + numblocks) { + int newflag = ((volatile int *)gpuflag)[ALLGATHERFLAG]; + if (newflag == cachedflag) + continue; + cachedflag = newflag; + flagptr[flagoffset + myrank + 32 + firstrank] = cachedflag; + } + } + + if (blockIdx.x == 0 && threadIdx.x == 0) + gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; + } else { + const int warp = blockIdx.x + (threadIdx.x >> 5); + int4 *userptr[RANKS]; + int4 *userptrmyrank; +#pragma unroll + for (int i = 0; i < RANKS; i++) + userptr[i] = reinterpret_cast( + commbuff[((i + myrank + warp) & (RANKS - 1)) + handleridx + firstrank]); + userptrmyrank = reinterpret_cast(commbuff[myrank + handleridx + firstrank]); + __syncthreads(); + + int blocklineoffset = 0; + + while (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset + blocklines * myrank; + + for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines; + line += REDUCETHREADS * gridDim.x) { + int4 val[RANKS]; + +#pragma unroll + for (int i = 0; i < RANKS; i++) { + val[i] = userptr[i][blockstart + line]; + } + + int4 sum = val[0]; + half *s = reinterpret_cast(&sum); + +#pragma unroll + for (int i = 1; i < RANKS; i++) { + half *x = reinterpret_cast(&val[i]); +#pragma unroll + for (int j = 0; j < sizeof(int4) / sizeof(half); j++) + s[j] += x[j]; + } + + userptrmyrank[blockstart + line] = sum; + } // single block loop + + asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); + + blocklineoffset += peerblocklines * RANKS; + } // block loop NVLINK-REDUCESCATTER + const 
int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1); + const int myblockDim = nwarps << 5; + const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1); + const int maxthreadIdx = myblockDim * (RANKS - 1) + 32; + const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1); + const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31); + volatile int *flag = (volatile int *)&((reinterpret_cast( + commbuff[myrank + firstrank]))[flagoffset + mydest + 32 + firstrank]); + + int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)]; + + blocklineoffset = 0; + int gathercounter = basecounter + 1; + while (blocklineoffset < numlines) { + const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS); + const int blocklines = remainder / RANKS; + const int blockstart = lineoffset + blocklineoffset; + +#define UNROLL 6 + int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest]; + int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest]; + + if (threadIdx.x < maxthreadIdx) { + const int start_elem = mythreadIdx + myblockDim * blockIdx.x; + const int end_elem = max(start_elem, blocklines); + const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) * + (myblockDim * gridDim.x * UNROLL); + const int end_aligned = start_elem + aligned_elem; + + if (mythreadIdx == 0) { + while (*flag < gathercounter) { + } + gathercounter++; + } + + asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim)); + + for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { + int4 val[UNROLL]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) + val[i] = peerptr[line + i * myblockDim * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLL; i++) + myptr[line + i * myblockDim * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) + myptr[line] = peerptr[line]; + } + blocklineoffset += peerblocklines * 
RANKS; + } // block loop for NVLINK-ALLGATHER + } // worker warps else block +} // fp16 inplace reduce kernel with SHARP / in blocks + +// threadfence and SMs sync to SM0 +#define SMBAR(offset, block) \ + asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); \ + if (threadIdx.x == 0) { \ + __threadfence_system(); \ + if (blockIdx.x) \ + gpuflag[offset + blockIdx.x] = block + basecounter + 1; \ + } else if (blockIdx.x == 0) { \ + int expecting = (basecounter + block + 1); \ + if (threadIdx.x < gridDim.x) \ + while (((volatile int *)gpuflag)[offset + threadIdx.x] < expecting) { \ + } \ + } \ + if (blockIdx.x == 0) \ + asm volatile("bar.sync 15, %0;" ::"r"(32)); + +template +__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2( + const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks, + const int commbufoffset, const int flagoffset, const int firstrank, const int myrank, + const int gpustep, const int lineoffset, const int numlines, void **commbuff, + const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag, + const int numblocks) { + const int basecounter = gpuflag[NVTE_GF_STATE + op]; + if (threadIdx.x < 32) { + int *flagptr; + volatile int *localflag = (volatile int *)&( + ((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); // NOLINT(*) + // initial intranode barrier - once + if (threadIdx.x < RANKS) { + if (!blockIdx.x) { + flagptr = reinterpret_cast(commbuff[gpustep * threadIdx.x + firstrank]); + flagptr[flagoffset + gpustep * myrank + firstrank] = basecounter; + } + volatile int *flag = &localflag[gpustep * threadIdx.x + firstrank]; + while (*flag < basecounter) { + } + } + __syncthreads(); + + for (int nblock = 0; nblock < numblocks + headstart; nblock++) { + if (nblock < numblocks) { + // RS happens here + SMBAR(op * 2 * NVTE_MAX_SMS, nblock); + if (!blockIdx.x && !threadIdx.x) + hostflags[NVTE_HF_NVRSDONE + (op & 1)] = nblock + basecounter + 1; + } + + if 
(nblock >= headstart) { + for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) + if (ibflag != myibrank) + while (localflag[NVTE_REG0_IBRS + ibflag] < basecounter + nblock - headstart + 1) { + } + asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); + // REDUCE happens here + SMBAR(op * 2 * NVTE_MAX_SMS + NVTE_MAX_SMS, nblock - headstart); + if (!blockIdx.x && !threadIdx.x) + hostflags[NVTE_HF_NVREDUCEDONE + (op & 1)] = nblock + basecounter + 1 - headstart; + } + } + // final part doing NVAG based on responses from NIC-RMW:IBAG + + if (blockIdx.x == 0) { + for (int nblock = 0; nblock < numblocks; nblock++) { + const int expected = basecounter + nblock + 1; + for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32) if (ibflag != myibrank) while (localflag[NVTE_REG0_IBAG + ibflag] < expected) { } @@ -722,7 +1731,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ } } - if (blockIdx.x == 0 && threadIdx.x == 0) gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; + if (blockIdx.x == 0 && threadIdx.x == 0) + gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; } else { // sync warp // reducethreads const int warp = blockIdx.x + (threadIdx.x >> 5); @@ -762,7 +1772,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ for (int i = 1; i < RANKS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < sizeof(int4) / sizeof(half); j++) s[j] += x[j]; + for (int j = 0; j < sizeof(int4) / sizeof(half); j++) + s[j] += x[j]; } userptrmyrank[blockstart + line] = sum; @@ -801,13 +1812,15 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ : tempbufptr[i * ibblocklines + line]; half *x = reinterpret_cast(&val[(i + 1) % UNROLLRS]); #pragma unroll - for (int j = 0; j < 16; j++) s[j] += x[j]; + for (int j = 0; j < 16; j++) + s[j] += x[j]; } #pragma unroll for (int i = 1; i < UNROLLRS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - 
for (int j = 0; j < 16; j++) s[j] += x[j]; + for (int j = 0; j < 16; j++) + s[j] += x[j]; } userptrmyrank[tempstart + line] = sum; } @@ -858,9 +1871,11 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { int4 val[UNROLL]; #pragma unroll - for (int i = 0; i < UNROLL; i++) val[i] = peerptr[line + i * myblockDim * gridDim.x]; + for (int i = 0; i < UNROLL; i++) + val[i] = peerptr[line + i * myblockDim * gridDim.x]; #pragma unroll - for (int i = 0; i < UNROLL; i++) myptr[line + i * myblockDim * gridDim.x] = val[i]; + for (int i = 0; i < UNROLL; i++) + myptr[line + i * myblockDim * gridDim.x] = val[i]; } for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) myptr[line] = peerptr[line]; @@ -952,7 +1967,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ for (int i = 1; i < RANKS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < sizeof(int4) / sizeof(half); j++) s[j] += x[j]; + for (int j = 0; j < sizeof(int4) / sizeof(half); j++) + s[j] += x[j]; } userptrmyrank[blockstart + line] = sum; @@ -971,9 +1987,6 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ int4 *tempbufptr = &internalbuf[((nblock - headstart) % maxcredit) * peerblocklines]; const int tempstart = lineoffset + (nblock - headstart) * peerblocklines * RANKS + myrank * blocklines + ibblocklines * myibrank; - // if(threadIdx.x==32) printf("[%d] block%d thread %d offset %d line %d ibblocklines %d ptr - // %lx commbufoffset - // %d\n",myrank,blockIdx.x,threadIdx.x,tempstart,0,ibblocklines,(void*)&tempbufptr[(1-myibrank)*ibblocklines],(1-myibrank)*ibblocklines*16); asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32)); @@ -994,13 +2007,15 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ : tempbufptr[i * ibblocklines + line]; half *x = 
reinterpret_cast(&val[(i + 1) % UNROLLRS]); #pragma unroll - for (int j = 0; j < 16; j++) s[j] += x[j]; + for (int j = 0; j < 16; j++) + s[j] += x[j]; } #pragma unroll for (int i = 1; i < UNROLLRS; i++) { half *x = reinterpret_cast(&val[i]); #pragma unroll - for (int j = 0; j < 16; j++) s[j] += x[j]; + for (int j = 0; j < 16; j++) + s[j] += x[j]; } userptrmyrank[tempstart + line] = sum; } @@ -1048,7 +2063,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ } } - if (blockIdx.x == 0 && threadIdx.x == 0) gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; + if (blockIdx.x == 0 && threadIdx.x == 0) + gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks; } else { // sync warp // reducethreads const int warp = blockIdx.x + (threadIdx.x >> 5); @@ -1105,9 +2121,11 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) { int4 val[UNROLL]; #pragma unroll - for (int i = 0; i < UNROLL; i++) val[i] = peerptr[line + i * myblockDim * gridDim.x]; + for (int i = 0; i < UNROLL; i++) + val[i] = peerptr[line + i * myblockDim * gridDim.x]; #pragma unroll - for (int i = 0; i < UNROLL; i++) myptr[line + i * myblockDim * gridDim.x] = val[i]; + for (int i = 0; i < UNROLL; i++) + myptr[line + i * myblockDim * gridDim.x] = val[i]; } for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x) myptr[line] = peerptr[line]; @@ -1125,102 +2143,134 @@ __global__ void userbuffers_fp16_sum_inplace_gpu_null(const int op, int *hostfla gpuflag[NVTE_GF_STATE + op] = basecounter; while (((volatile int *)gpuflag)[NVTE_GF_IBSHARPDONE] < basecounter) { } -} +} + +#define callranks_block(x) \ + if (comm->ar_nvsize == x) \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked<<>>( \ + userbuffers_allreduceop_sharp, NVTE_REG0_OFFSET(comm), comm->ar_firstgpu, comm->ar_nvrank, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + 
handler * comm->nvsize, blocksize / sizeof(int4) / comm->ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, \ + (elements * 2 + blocksize - 1) / blocksize); + +#define callranks2_block(x) \ + if (ar_nvsize == x) { \ + int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ + int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ + if (headstart > maxcredit) \ + headstart = maxcredit; \ + if (x == 1) \ + headstart = maxcredit; \ + if (headstart > numblocks) \ + headstart = numblocks; \ + if (headstart == 0) \ + headstart = 1; \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked2<<>>( \ + op, maxcredit, headstart, my_node, num_nodes, \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ + (op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ + } + +#define callranks2_block_rs(x) \ + if (ar_nvsize == x) { \ + int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ + int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ + if (headstart > maxcredit) \ + headstart = maxcredit; \ + if (x == 1) \ + headstart = maxcredit; \ + if (headstart > numblocks) \ + headstart = numblocks; \ + if (headstart == 0) \ + headstart = 1; \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked2_rs<<>>( \ + op, maxcredit, headstart, my_node, num_nodes, \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ + (op == userbuffers_allreduceop_nonsharp ? 
NVTE_REG0_COMMBUFFER : 0), \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ + } + +#define callranks2_block_ag(x) \ + if (ar_nvsize == x) { \ + int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ + int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ + if (headstart > maxcredit) \ + headstart = maxcredit; \ + if (x == 1) \ + headstart = maxcredit; \ + if (headstart > numblocks) \ + headstart = numblocks; \ + if (headstart == 0) \ + headstart = 1; \ + userbuffers_fp16_sum_inplace_gpu_rr_blocked2_ag<<>>( \ + op, maxcredit, headstart, my_node, num_nodes, \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ + (op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \ + NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ + offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ + handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ + reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ + } + +#define callranks(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg6 = offset / 8, \ + arg7 = elements / 8; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast(comm->use_rr_kernel ? 
userbuffers_fp16_sum_inplace_gpu_rr \ + : userbuffers_fp16_sum_inplace_gpu_rw), \ + kernelArgs)); \ + } -#define callranks_block(x) \ - if (comm->ar_nvsize == x) \ - userbuffers_fp16_sum_inplace_gpu_rr_blocked<<>>( \ - userbuffers_allreduceop_sharp, NVTE_REG0_OFFSET(comm), comm->ar_firstgpu, comm->ar_nvrank, \ - offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ - handler * comm->nvsize, blocksize / sizeof(int4) / comm->ar_nvsize, \ - reinterpret_cast(comm->hostflags), comm->flags, \ - (elements * 2 + blocksize - 1) / blocksize); +#define callranksMC(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg6 = offset / 8, \ + arg7 = elements / 8; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *arg10 = comm->mc_ptr[handler]; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_mc), kernelArgs)); \ + } -#define callranks2_block(x) \ - if (ar_nvsize == x) { \ - int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ - int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ - if (headstart > maxcredit) headstart = maxcredit; \ - if (x == 1) headstart = maxcredit; \ - if (headstart > numblocks) headstart = numblocks; \ - if (headstart == 0) headstart = 1; \ - userbuffers_fp16_sum_inplace_gpu_rr_blocked2<<>>( \ - op, maxcredit, headstart, my_node, num_nodes, \ - NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ - (op == userbuffers_allreduceop_nonsharp ? 
NVTE_REG0_COMMBUFFER : 0), \ - NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ - offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ - handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ - reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ - } - -#define callranks2_block_rs(x) \ - if (ar_nvsize == x) { \ - int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ - int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ - if (headstart > maxcredit) headstart = maxcredit; \ - if (x == 1) headstart = maxcredit; \ - if (headstart > numblocks) headstart = numblocks; \ - if (headstart == 0) headstart = 1; \ - userbuffers_fp16_sum_inplace_gpu_rr_blocked2_rs<<>>( \ - op, maxcredit, headstart, my_node, num_nodes, \ - NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ - (op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \ - NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ - offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ - handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ - reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ - } - -#define callranks2_block_ag(x) \ - if (ar_nvsize == x) { \ - int numblocks = (elements * 2 + blocksize - 1) / blocksize; \ - int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \ - if (headstart > maxcredit) headstart = maxcredit; \ - if (x == 1) headstart = maxcredit; \ - if (headstart > numblocks) headstart = numblocks; \ - if (headstart == 0) headstart = 1; \ - userbuffers_fp16_sum_inplace_gpu_rr_blocked2_ag<<>>( \ - op, maxcredit, headstart, my_node, num_nodes, \ - NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \ - (op == userbuffers_allreduceop_nonsharp ? 
NVTE_REG0_COMMBUFFER : 0), \ - NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \ - offset / 8, elements / 8, reinterpret_cast(comm->gpu_ptrs), \ - handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \ - reinterpret_cast(comm->hostflags), comm->flags, numblocks); \ - } - -#define callranks(x) \ - if (ar_nvsize == x) { \ - int arg1 = op - NVTE_MAX_OPS, \ - arg2 = NVTE_REG0_OFFSET(comm) - \ - (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ - NVTE_MAX_OPS, \ - arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg6 = offset / 8, \ - arg7 = elements / 8; \ - void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ - int arg9 = handler * comm->nvsize; \ - void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ - reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ - reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ - reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ - reinterpret_cast(&arg9)}; \ - CUDACHECK(cudaLaunchKernelExC( \ - &cfg, \ - reinterpret_cast(comm->use_rr_kernel ? userbuffers_fp16_sum_inplace_gpu_rr \ - : userbuffers_fp16_sum_inplace_gpu_rw), \ - kernelArgs)); \ - } - -#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \ - cudaLaunchConfig_t cfg = {sms, threads, 0, stream, NULL, 0}; \ - cudaLaunchAttribute attribute_ub[2]; \ - attribute_ub[1].id = cudaLaunchAttributeClusterDimension; \ - attribute_ub[1].val.clusterDim.x = sms % comm->cga_size == 0 ? comm->cga_size : 1; \ - attribute_ub[1].val.clusterDim.y = 1; \ - attribute_ub[1].val.clusterDim.z = 1; \ - attribute_ub[0].id = cudaLaunchAttributeCooperative; \ - cfg.attrs = attribute_ub; \ +#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \ + cudaLaunchConfig_t cfg = {sms, threads, 0, stream, NULL, 0}; \ + cudaLaunchAttribute attribute_ub[2]; \ + attribute_ub[1].id = cudaLaunchAttributeClusterDimension; \ + attribute_ub[1].val.clusterDim.x = sms % comm->cga_size == 0 ? 
comm->cga_size : 1; \ + attribute_ub[1].val.clusterDim.y = 1; \ + attribute_ub[1].val.clusterDim.z = 1; \ + attribute_ub[0].id = cudaLaunchAttributeCooperative; \ + cfg.attrs = attribute_ub; \ cfg.numAttrs = comm->sm_arch >= 9 ? 2 : 1; int allreduce_userbuff_inplace_gpu(const int handler, const int offset, const int elements, @@ -1232,10 +2282,12 @@ int allreduce_userbuff_inplace_gpu(const int handler, const int offset, const in const int ar_nvsize = comm->nvsize; const int ar_firstgpu = comm->ar_firstgpu; const int ar_nvrank = comm->ar_nvrank; - if (elements < 8) return 0; + if (elements < 8) + return 0; int sms = sms = comm->sms; int warps = comm->threads / 32; - if (warps < comm->ar_nvsize) warps = comm->ar_nvsize; + if (warps < comm->ar_nvsize) + warps = comm->ar_nvsize; if (comm->launch_mode & NVTE_LAUNCH_GPU) { if (comm->ar_nvsize == 1) @@ -1259,109 +2311,502 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; - - if (elements < 8) return 0; + + if (elements < 8) + return 0; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) + warps = ar_nvsize; + if (num_nodes > 1) { + callranks2_block(1) callranks2_block(2) callranks2_block(4) callranks2_block(8) + } else { + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks(2) callranks(4) callranks(8) + } + return sms; +} + +#define callranks_ag(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + (comm->use_rr_kernel ? 0 : arg4 * arg7); \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast(comm->use_rr_kernel ? userbuffers_fp16_sum_inplace_gpu_rr_ag \ + : userbuffers_fp16_sum_inplace_gpu_rw_ag), \ + kernelArgs)); \ + } + +#define callranks_agMC(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + uint4 *arg10 = reinterpret_cast(comm->mc_ptr[handler]); \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_mc_ag), kernelArgs)); \ + } + +#define callranks_rs(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs), kernelArgs)); \ + } + +#define callranks_rsMC(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7; \ + void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ + int arg9 = handler * comm->nvsize; \ + void *arg10 = comm->mc_ptr[handler]; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_mc_rs), kernelArgs)); \ + } + +#define callranks_rs_oop(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7, arg8 = rowelements / 8, arg9 = strideelements / 8; \ + void **arg10 = reinterpret_cast(comm->gpu_ptrs); \ + int arg11 = handler * comm->nvsize; \ + void *arg12 = output; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop), \ + kernelArgs)); \ + } + +#define callranks_rs_oop_fp8(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 16 / x, \ + arg6 = offset / 16 + arg4 * arg7, arg8 = rowelements / 8, arg9 = strideelements / 8; \ + void **arg10 = reinterpret_cast(comm->gpu_ptrs); \ + int arg11 = handler * comm->nvsize; \ + void *arg12 = output; \ + float *arg13 = scale; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12), \ + reinterpret_cast(&arg13)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_fp8), \ + kernelArgs)); \ + } + +#define callranks_rs_oopMC(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = 
NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8 + arg4 * arg7, arg8 = rowelements / 8, arg9 = strideelements / 8; \ + void **arg10 = reinterpret_cast(comm->gpu_ptrs); \ + int arg11 = handler * comm->nvsize; \ + void *arg12 = output; \ + void *arg13 = comm->mc_ptr[handler]; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12), \ + reinterpret_cast(&arg13)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_mc_rs_oop), \ + kernelArgs)); \ + } + +#define callranks_rs_oop_atomic_fp8(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 16 / x, \ + arg6 = offset / 16, arg8 = rowelements / 8, arg9 = strideelements_out / 8, \ + arg10 = strideelements_in / 16; \ + void **arg11 = reinterpret_cast(comm->gpu_ptrs); \ + int arg12 = handler * comm->nvsize; \ + void *arg13 = output; \ + float *arg14 = scale; \ + void *arg15 = counters; \ + int arg16 = numchunks, arg17 = atomicindex; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12), \ + reinterpret_cast(&arg13), reinterpret_cast(&arg14), \ + reinterpret_cast(&arg15), reinterpret_cast(&arg16), \ + reinterpret_cast(&arg17)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast( \ + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_atomic_fp8), \ + kernelArgs)); \ + } + +#define callranks_rs_oop_stride(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8, arg8 = rowelements / 8, arg9 = strideelements / 8; \ + void **arg10 = reinterpret_cast(comm->gpu_ptrs); \ + int arg11 = handler * comm->nvsize; \ + void *arg12 = output; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride), \ + kernelArgs)); \ + } + +#if 0 +#define callranks_rs_oop_stride_atomic_fp8(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 16 / x, \ + arg6 = offset / 16, arg8 = rowelements / 8, arg9 = strideelements / 8, arg10 = numchunks; \ + void **arg11 = reinterpret_cast(comm->gpu_ptrs); \ + int arg12 = handler * comm->nvsize; \ + void *arg13 = output; \ + void *arg14 = counters; \ + float *arg15 = scale; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12), \ + reinterpret_cast(&arg13), reinterpret_cast(&arg14), \ + reinterpret_cast(&arg15)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast( \ + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic_fp8), \ + kernelArgs)); \ + } +#endif + +#define callranks_rs_oop_stride_atomic(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8, arg8 = rowelements / 8, arg9 = strideelements / 8, arg10 = numchunks; \ + void **arg11 = reinterpret_cast(comm->gpu_ptrs); \ + int arg12 = handler * comm->nvsize; \ + void *arg13 = output; \ + void *arg14 = counters; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12), \ + reinterpret_cast(&arg13), reinterpret_cast(&arg14)}; \ + CUDACHECK(cudaLaunchKernelExC( \ + &cfg, \ + reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic), \ + kernelArgs)); \ + } + +#define callranks_rs_oop_stride_multiatomic(x) \ + if (ar_nvsize == x) { \ + int arg1 = op - NVTE_MAX_OPS, \ + arg2 = NVTE_REG0_OFFSET(comm) - \ + (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ + NVTE_MAX_OPS, \ + arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ + arg6 = offset / 8, arg8 = rowelements / 8, arg9 = strideelements / 8, arg10 = numchunks; \ + void **arg11 = reinterpret_cast(comm->gpu_ptrs); \ + int arg12 = handler * comm->nvsize; \ + void *arg13 = output; \ + void *arg14 = counters; \ + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ + reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ + reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ + reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ + reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ + reinterpret_cast(&arg11), reinterpret_cast(&arg12), \ + reinterpret_cast(&arg13), reinterpret_cast(&arg14)}; \ + CUDACHECK( \ + cudaLaunchKernelExC(&cfg, \ + reinterpret_cast( \ + userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_multiatomic), \ + kernelArgs)); \ + } + +int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, + const int elements, const int blocksize, communicator *comm, + cudaStream_t stream, int op) { + // schedule GPU kernel only + // CPU/SHARP part is responsibility of caller + + const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes; + const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 8) + return 0; + int sms = ar_nvsize == 1 ? 
2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) + warps = ar_nvsize; + + if (num_nodes > 1) { + callranks2_block_rs(1) callranks2_block_rs(2) callranks2_block_rs(4) callranks2_block_rs(8) + } else { + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs(2) callranks_rs(4) callranks_rs(8) + } + return sms; +} + +void reducescatter2_userbuff_strided(void *output, const int handler, const int offset, + const int rowelements, const int colelements, + const int strideelements, communicator *comm, + cudaStream_t stream) { + const int elements = rowelements * colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements * 2; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 64) + return; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) + warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs_oop_stride(2) callranks_rs_oop_stride(4) callranks_rs_oop_stride(8) +} +void reducescatter2_userbuff_strided_atomic(void *output, const int handler, const int offset, + const int rowelements, const int colelements, + const int strideelements, const int numchunks, + void *counters, communicator *comm, + cudaStream_t stream) { + const int elements = rowelements * colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements * 2; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 
1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + + if (elements < 64) + return; + int sms = ar_nvsize == 1 ? 2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) + warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs_oop_stride_atomic(2) callranks_rs_oop_stride_atomic(4) + callranks_rs_oop_stride_atomic(8) +} + +#if 0 + template + void reducescatter2_userbuff_strided_atomic_fp8( + void* output, float *scale, const int handler, const int offset, const int rowelements, + const int colelements, const int strideelements, const int numchunks, void *counters, + communicator* comm, cudaStream_t stream) { + const int elements = rowelements*colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements; + const int ar_firstgpu = op == userbuffers_allreduceop_nonsharp ? + comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? + 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? + comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? + comm->ar_nvrank : comm->ar2_nvrank; + + assert(comm->sm_arch >= 9); + if (elements < 128) return; + int sms = ar_nvsize == 1 ? 
2 : comm->sms; + int warps = comm->threads/32; + if (warps < ar_nvsize) warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps*32, stream); + callranks_rs_oop_stride_atomic_fp8(2) + callranks_rs_oop_stride_atomic_fp8(4) + callranks_rs_oop_stride_atomic_fp8(8) + } +#endif +template +void reducescatter2_userbuff_strided_universal_fp8(void *output, float *scale, const int handler, + const int offset, const int rowelements, + const int colelements, + const int strideelements_out, + const int strideelements_in, const int numchunks, + const int atomicindex, void *counters, + communicator *comm, cudaStream_t stream) { + const int elements = rowelements * colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + assert(comm->sm_arch >= 9); + if (elements < 128) + return; int sms = ar_nvsize == 1 ? 2 : comm->sms; int warps = comm->threads / 32; - if (warps < ar_nvsize) warps = ar_nvsize; - if (num_nodes > 1) { - callranks2_block(1) callranks2_block(2) callranks2_block(4) callranks2_block(8) - } else { - SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); - callranks(2) callranks(4) callranks(8) - } - return sms; -} - -#define callranks_ag(x) \ - if (ar_nvsize == x) { \ - int arg1 = op - NVTE_MAX_OPS, \ - arg2 = NVTE_REG0_OFFSET(comm) - \ - (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ - NVTE_MAX_OPS, \ - arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ - arg6 = offset / 8 + (comm->use_rr_kernel ? 
0 : arg4 * arg7); \ - void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ - int arg9 = handler * comm->nvsize; \ - void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ - reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ - reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ - reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ - reinterpret_cast(&arg9)}; \ - CUDACHECK(cudaLaunchKernelExC( \ - &cfg, \ - reinterpret_cast(comm->use_rr_kernel ? userbuffers_fp16_sum_inplace_gpu_rr_ag \ - : userbuffers_fp16_sum_inplace_gpu_rw_ag), \ - kernelArgs)); \ - } + if (warps < ar_nvsize) + warps = ar_nvsize; -#define callranks_rs(x) \ - if (ar_nvsize == x) { \ - int arg1 = op - NVTE_MAX_OPS, \ - arg2 = NVTE_REG0_OFFSET(comm) - \ - (op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \ - NVTE_MAX_OPS, \ - arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ - arg6 = offset / 8 + arg4 * arg7; \ - void **arg8 = reinterpret_cast(comm->gpu_ptrs); \ - int arg9 = handler * comm->nvsize; \ - void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ - reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ - reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ - reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ - reinterpret_cast(&arg9)}; \ - CUDACHECK(cudaLaunchKernelExC( \ - &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs), kernelArgs)); \ - } - -#define callranks_rs_oop(x) \ - if (ar_nvsize == x) { \ - int arg1 = op - NVTE_MAX_OPS, \ - arg2 = NVTE_REG0_OFFSET(comm) - \ - (op == userbuffers_allreduceop_nonsharp ? 
2 : 1) * NVTE_REG0_SINGLENODE + \ - NVTE_MAX_OPS, \ - arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 8 / x, \ - arg6 = offset / 8 + arg4 * arg7, arg8 = rowelements / 8, arg9 = strideelements / 8; \ - void **arg10 = reinterpret_cast(comm->gpu_ptrs); \ - int arg11 = handler * comm->nvsize; \ - void *arg12 = output; \ - void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), \ - reinterpret_cast(&arg3), reinterpret_cast(&arg4), \ - reinterpret_cast(&arg5), reinterpret_cast(&arg6), \ - reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ - reinterpret_cast(&arg9), reinterpret_cast(&arg10), \ - reinterpret_cast(&arg11), reinterpret_cast(&arg12)}; \ - CUDACHECK(cudaLaunchKernelExC( \ - &cfg, reinterpret_cast(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop), \ - kernelArgs)); \ - } + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs_oop_atomic_fp8(2) callranks_rs_oop_atomic_fp8(4) callranks_rs_oop_atomic_fp8(8) +} -int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, - const int elements, const int blocksize, communicator *comm, - cudaStream_t stream, int op) { - // schedule GPU kernel only - // CPU/SHARP part is responsibility of caller +template +void reducescatter2_userbuff_strided_atomic_fp8(void *output, float *scale, const int handler, + const int offset, const int rowelements, + const int colelements, const int strideelements_out, + const int strideelements_in, const int numchunks, + void *counters, communicator *comm, + cudaStream_t stream) { + reducescatter2_userbuff_strided_universal_fp8( + output, scale, handler, offset, rowelements, colelements, strideelements_out, + strideelements_in, 1, numchunks, counters /*nullptr*/, comm, stream); +} +template +void reducescatter2_userbuff_strided_multiatomic_fp8( + void *output, float *scale, const int handler, const int offset, const int rowelements, + const int colelements, const int strideelements_out, const int 
strideelements_in, + const int numchunks, void *counters, communicator *comm, cudaStream_t stream) { + reducescatter2_userbuff_strided_universal_fp8( + output, scale, handler, offset, rowelements, colelements, strideelements_out, + strideelements_in, numchunks, 0, counters /*nullptr*/, comm, stream); +} - const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes; - const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node; +void reducescatter2_userbuff_strided_multiatomic(void *output, const int handler, const int offset, + const int rowelements, const int colelements, + const int strideelements, const int numchunks, + void *counters, communicator *comm, + cudaStream_t stream) { + const int elements = rowelements * colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements * 2; const int ar_firstgpu = op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; - if (elements < 8) return 0; + if (elements < 64) + return; int sms = ar_nvsize == 1 ? 
2 : comm->sms; int warps = comm->threads / 32; - if (warps < ar_nvsize) warps = ar_nvsize; + if (warps < ar_nvsize) + warps = ar_nvsize; - if (num_nodes > 1) { - callranks2_block_rs(1) callranks2_block_rs(2) callranks2_block_rs(4) callranks2_block_rs(8) - } else { - SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); - callranks_rs(2) callranks_rs(4) callranks_rs(8) - } - return sms; + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + // if(comm->use_mc && (comm->memflags[handler] & NVTE_UB_MEM_MC_CREATED)) { + // //callranks_rs_oopMC(2) + // //callranks_rs_oopMC(4) + // //callranks_rs_oopMC(8) + // } else { + // if(comm->memflags[handler] & NVTE_UB_MEM_UC_CONTIG) { + // //callranks_rs_oopUCPTR(2) + // //callranks_rs_oopUCPTR(4) + // //callranks_rs_oopUCPTR(8) + // } else { + callranks_rs_oop_stride_multiatomic(2) callranks_rs_oop_stride_multiatomic(4) + callranks_rs_oop_stride_multiatomic(8) + // } + //} } int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset, @@ -1378,10 +2823,12 @@ int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; - if (elements < 8) return 0; + if (elements < 8) + return 0; int sms = ar_nvsize == 1 ? 2 : comm->sms; int warps = comm->threads / 32; - if (warps < ar_nvsize) warps = ar_nvsize; + if (warps < ar_nvsize) + warps = ar_nvsize; if (num_nodes > 1) { callranks2_block_ag(1) callranks2_block_ag(2) callranks2_block_ag(4) callranks2_block_ag(8) @@ -1402,13 +2849,15 @@ void allgather2_userbuff_inplace(const int handler, const int offset, const int const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? 
comm->ar_nvrank : comm->ar2_nvrank; - if (elements < 64) return; + if (elements < 64) + return; int sms = ar_nvsize == 1 ? 2 : comm->sms; int warps = comm->threads / 32; - if (warps < ar_nvsize) warps = ar_nvsize; + if (warps < ar_nvsize) + warps = ar_nvsize; SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); - callranks_ag(2) callranks_ag(4) callranks_ag(8) + callranks_ag(2) callranks_ag(4) callranks_ag(8) } void allgather2_userbuff_inplace_sliced(const int handler, const int offset, const int elements, @@ -1436,13 +2885,15 @@ void reducescatter2_userbuff_inplace(const int handler, const int offset, const const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; - if (elements < 64) return; + if (elements < 64) + return; int sms = ar_nvsize == 1 ? 2 : comm->sms; int warps = comm->threads / 32; - if (warps < ar_nvsize) warps = ar_nvsize; + if (warps < ar_nvsize) + warps = ar_nvsize; SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); - callranks_rs(2) callranks_rs(4) callranks_rs(8) + callranks_rs(2) callranks_rs(4) callranks_rs(8) } void reducescatter2_userbuff_stridedoutput(void *output, const int handler, const int offset, const int rowelements, const int colelements, @@ -1457,21 +2908,124 @@ void reducescatter2_userbuff_stridedoutput(void *output, const int handler, cons const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; - if (elements < 64) return; + if (elements < 64) + return; int sms = ar_nvsize == 1 ? 
2 : comm->sms; int warps = comm->threads / 32; - if (warps < ar_nvsize) warps = ar_nvsize; + if (warps < ar_nvsize) + warps = ar_nvsize; SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); - callranks_rs_oop(2) callranks_rs_oop(4) callranks_rs_oop(8) + callranks_rs_oop(2) callranks_rs_oop(4) callranks_rs_oop(8) } void reducescatter2_userbuff(void *output, const int handler, const int offset, const int elements, communicator *comm, cudaStream_t stream) { reducescatter2_userbuff_stridedoutput(output, handler, offset, elements, 1, 0, comm, stream); } +template +void reducescatter2_userbuff_stridedoutput_fp8(void *output, float *scale, const int handler, + const int offset, const int rowelements, + const int colelements, const int strideelements, + communicator *comm, cudaStream_t stream) { + const int elements = rowelements * colelements; + const int op = userbuffers_allreduceop_nonsharp2; + const int blocksize = elements; + const int ar_firstgpu = + op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu; + const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize; + const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize; + const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank; + assert(comm->sm_arch >= 9); + if (elements < 128) + return; + int sms = ar_nvsize == 1 ? 
2 : comm->sms; + int warps = comm->threads / 32; + if (warps < ar_nvsize) + warps = ar_nvsize; + + SETUP_LAUNCH_CONFIG(sms, warps * 32, stream); + callranks_rs_oop_fp8(2) callranks_rs_oop_fp8(4) callranks_rs_oop_fp8(8) +} + +template +void reducescatter2_userbuff_fp8(void *output, float *scale, const int handler, const int offset, + const int elements, communicator *comm, cudaStream_t stream) { + reducescatter2_userbuff_stridedoutput_fp8(output, scale, handler, offset, elements, 1, 0, + comm, stream); +} + +template void reducescatter2_userbuff_fp8<__nv_fp8_e5m2>(void *output, float *scale, + const int handler, const int offset, + const int elements, communicator *comm, + cudaStream_t stream = 0); +template void reducescatter2_userbuff_fp8<__nv_fp8_e4m3>(void *output, float *scale, + const int handler, const int offset, + const int elements, communicator *comm, + cudaStream_t stream = 0); +#if 0 +template void reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e4m3>( + void* output, float *scale, const int handler, const int offset, + const int rowelements, const int colelements, const int strideelements, + const int numchunks, void *counters, communicator* comm, cudaStream_t stream = 0); +#endif +template void reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e4m3>( + void *output, float *scale, const int handler, const int offset, const int rowelements, + const int colelements, const int strideelements_out, const int strideelements_in, + const int numchunks, void *counters, communicator *comm, cudaStream_t stream = 0); +template void reducescatter2_userbuff_strided_multiatomic_fp8<__nv_fp8_e4m3>( + void *output, float *scale, const int handler, const int offset, const int rowelements, + const int colelements, const int strideelements_out, const int strideelements_in, + const int numchunks, void *counters, communicator *comm, cudaStream_t stream = 0); +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_pullsendrecv(int myrank, int peer, int *recv_id, 
int *send_flagptr, + int *recv_flagptr, int4 *srcptr, int4 *dstptr, const int lines) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + atomicAdd_system(send_flagptr, 1); + } + +#define UNROLLCOPY 8 + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = lines; + const int aligned_elem = (end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1)); + const int end_aligned = start_elem + aligned_elem; + + if (threadIdx.x == 0) { + const int signal_id = (*recv_id) + 1; + volatile int *flag = (volatile int *)recv_flagptr; + clock_t s = clock64(); + while (*flag < signal_id) { + if (clock64() - s > TIMEOUT) { + printf("[%d from %d] pullrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, + *flag); + break; + } + } + if (lines == 0) { + *recv_id = signal_id; + return; + } // otherwise need an extra kernel + } + __syncthreads(); + + if (end_elem <= start_elem) + return; + + for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) + val[i] = srcptr[line + i * blockDim.x * gridDim.x]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) + dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + } + for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) + dstptr[line] = srcptr[line]; +} + __global__ void kuserbuffers_pullsend(int myrank, int peer, int *send_id, int *flagptr) { - atomicAdd(flagptr, 1); + atomicAdd_system(flagptr, 1); } __global__ void kuserbuffers_inc(int *id) { @@ -1514,14 +3068,17 @@ __global__ void __launch_bounds__(MAX_THREADS) } __syncthreads(); - if (end_elem <= start_elem) return; + if (end_elem <= start_elem) + return; for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { int4 val[UNROLLCOPY]; #pragma unroll - for (int i = 0; i < UNROLLCOPY; i++) val[i] = srcptr[line + i * blockDim.x * gridDim.x]; + for (int i = 0; i < 
UNROLLCOPY; i++) + val[i] = srcptr[line + i * blockDim.x * gridDim.x]; #pragma unroll - for (int i = 0; i < UNROLLCOPY; i++) dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + for (int i = 0; i < UNROLLCOPY; i++) + dstptr[line + i * blockDim.x * gridDim.x] = val[i]; } for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) dstptr[line] = srcptr[line]; @@ -1539,18 +3096,22 @@ __global__ void __launch_bounds__(MAX_THREADS) for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { int4 val[UNROLLCOPY]; #pragma unroll - for (int i = 0; i < UNROLLCOPY; i++) val[i] = srcptr[line + i * blockDim.x * gridDim.x]; + for (int i = 0; i < UNROLLCOPY; i++) + val[i] = srcptr[line + i * blockDim.x * gridDim.x]; #pragma unroll - for (int i = 0; i < UNROLLCOPY; i++) dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + for (int i = 0; i < UNROLLCOPY; i++) + dstptr[line + i * blockDim.x * gridDim.x] = val[i]; } for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) dstptr[line] = srcptr[line]; } __syncthreads(); - if (threadIdx.x) return; + if (threadIdx.x) + return; __threadfence_system(); - atomicAdd(flagptr, 1); // otherwise need local SM sync before sending flag - } else { // 0 bytes and 1 SM only + atomicAdd_system(flagptr, + 1); // otherwise need local SM sync before sending flag + } else { // 0 bytes and 1 SM only atomicAdd_system(flagptr, 1); } } @@ -1559,7 +3120,8 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *f const int signal_id = (*recv_id) + adder; *recv_id = signal_id; volatile int *flag = (volatile int *)flagptr; - if (*flag >= signal_id) return; + if (*flag >= signal_id) + return; clock_t s = clock64(); while (atomicAdd_system(flagptr, 0) < signal_id) { if (clock64() - s > TIMEOUT) { @@ -1569,13 +3131,203 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *f } } -#define CUDACHECK(cmd) \ - do { \ - cudaError_t e = cmd; \ 
- if (e != cudaSuccess) { \ - printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_pushsendrecv(int *send_id, int *send_flagptr, int4 *srcptr, int4 *dstptr, + const int lines, int myrank, int peer, int *recv_id, + int *recv_flagptr, int adder) { + if (lines) { + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = lines; + const int aligned_elem = + ((end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1))); + const int end_aligned = start_elem + aligned_elem; + if (end_elem > start_elem) { + for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) { + val[i] = srcptr[line + i * blockDim.x * gridDim.x]; + } +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) { + dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + } + } + for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) { + dstptr[line] = srcptr[line]; + } + } + __syncthreads(); + if (threadIdx.x) + return; + __threadfence_system(); + atomicAdd_system(send_flagptr, + 1); // otherwise need local SM sync before sending flag + } else { // 0 bytes and 1 SM only + atomicAdd_system(send_flagptr, 1); + } + + if (blockIdx.x == 0 && threadIdx.x == 0) { + const int signal_id = (*recv_id) + adder; + *recv_id = signal_id; + volatile int *flag = (volatile int *)recv_flagptr; + if (*flag >= signal_id) + return; + clock_t s = clock64(); + while (*flag < signal_id) { + if (clock64() - s > TIMEOUT) { + printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, + *flag); + return; + } + } + } +} + +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_pushsendrecv_atomic(int *send_id, int *send_flagptr, int4 *srcptr, int4 *dstptr, + const int lines, int myrank, int peer, int 
*recv_id, + int *recv_flagptr, int adder, void *counters) { + if (lines) { + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = lines; + const int aligned_elem = + ((end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1))); + const int end_aligned = start_elem + aligned_elem; + if (end_elem > start_elem) { + for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) { + val[i] = srcptr[line + i * blockDim.x * gridDim.x]; + } +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) { + dstptr[line + i * blockDim.x * gridDim.x] = val[i]; + } + } + for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) { + dstptr[line] = srcptr[line]; + } + } + __syncthreads(); + if (threadIdx.x) + return; + __threadfence_system(); + atomicAdd_system(send_flagptr, + 1); // otherwise need local SM sync before sending flag + } else { // 0 bytes and 1 SM only + atomicAdd_system(send_flagptr, 1); + } + + if (blockIdx.x == 0 && threadIdx.x == 0) { + const int signal_id = (*recv_id) + adder; + *recv_id = signal_id; + volatile int *flag = (volatile int *)recv_flagptr; + // if(*flag>=signal_id) return; + clock_t s = clock64(); + while (*flag < signal_id) { + if (clock64() - s > TIMEOUT) { + printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, + *flag); /*return;*/ + } + } + + // Decrement atomic val to signal current output tile finish + if (counters) { + ((unsigned int *)counters)[0] = 0; + asm volatile("fence.sc.gpu;\n"); + } + } +} + +__global__ void __launch_bounds__(MAX_THREADS) + kuserbuffers_pushsendrecv_multiatomic(int *send_id, int *send_flagptr, int4 *srcptr, + int4 *dstptr, const int lines, int myrank, int peer, + int *recv_id, int *recv_flagptr, int adder, + void *counters, int nchunks, int send_stride, + int recv_stride, bool shuffle) { + for (int chunk_i = 0; 
chunk_i < nchunks - 1; chunk_i++) { + int send_chunk_id = shuffle ? chunk_i : (nchunks + myrank - chunk_i) % nchunks; + int recv_chunk_id = shuffle ? chunk_i + 1 : (nchunks + myrank - chunk_i - 1) % nchunks; + int send_offset = (send_chunk_id * send_stride) / 16; + int recv_offset = ((shuffle ? recv_chunk_id : send_chunk_id) * recv_stride) / 16; + + if (lines) { + const int start_elem = threadIdx.x + blockDim.x * blockIdx.x; + const int end_elem = lines; + const int aligned_elem = + ((end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1))); + const int end_aligned = start_elem + aligned_elem; + if (end_elem > start_elem) { + for (int line = start_elem; line < end_aligned; + line += blockDim.x * gridDim.x * UNROLLCOPY) { + int4 val[UNROLLCOPY]; +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) { + val[i] = srcptr[send_offset + line + i * blockDim.x * gridDim.x]; + } +#pragma unroll + for (int i = 0; i < UNROLLCOPY; i++) { + dstptr[recv_offset + line + i * blockDim.x * gridDim.x] = val[i]; + } + } + for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x) { + dstptr[recv_offset + line] = srcptr[send_offset + line]; + } + } + __syncthreads(); + if (!threadIdx.x) { + __threadfence_system(); + atomicAdd_system(send_flagptr, + 1); // otherwise need local SM sync before sending flag + } + } else { // 0 bytes and 1 SM only + atomicAdd_system(send_flagptr, 1); + } + + // wait for message to arrive. + if (blockIdx.x == 0 && threadIdx.x == 0) { + const int signal_id = (*recv_id) + adder; + *recv_id = signal_id; + volatile int *flag = (volatile int *)recv_flagptr; + // if(*flag>=signal_id) return; + clock_t s = clock64(); + while (*flag < signal_id) { + if (clock64() - s > TIMEOUT) { + printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, + *flag); /*return;*/ + } + } + } + + // Producer must update counters. 
+ if (blockIdx.x == 0 && threadIdx.x == 0) { + // Decrement atomic val to signal current output tile finish + if (counters) { + ((unsigned int *)counters)[recv_chunk_id /*chunk_i+1*/] = 0; + asm volatile("fence.sc.gpu;\n"); + } + } + + // sync all CTAs before moving to next chunk. + if (threadIdx.x == 0) { + int old_val2; + atomicInc(((unsigned int *)counters) + nchunks + chunk_i, gridDim.x - 1); + while (0 != (old_val2 = atomicCAS(((unsigned int *)counters) + nchunks + chunk_i, 0, 0))) { + } + } + __syncthreads(); + } +} + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) #define INTRANODE(peer) ((peer / comm->nvsize) == (comm->myrank / comm->nvsize)) @@ -1611,7 +3363,8 @@ void userbuffers_send(const int srchandler, const size_t srcoffset, const int ds comm->hostflags + userbuffers_sendop); return; } - if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) return; + if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) + return; if (comm->push == 0) { kuserbuffers_pullsend<<<1, 1, 0, stream>>>(comm->myrank, peer, &(comm->send_id[peer]), reinterpret_cast(flagptr)); @@ -1633,10 +3386,145 @@ void userbuffers_send(const int srchandler, const size_t srcoffset, const int ds } } +void userbuffers_sendrecv(const int srchandler, const int dsthandler, const size_t send_offset, + const size_t recv_offset, const size_t bytes, communicator *comm, + const int send_peer, const int recv_peer, cudaStream_t stream) { + bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0); + int send_peerlocal = send_peer % comm->nvsize; + int recv_peerlocal = recv_peer % comm->nvsize; + void *flagptr_send = + (comm->peer_ptr[0][send_peerlocal]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + void *flagptr_recv = + (comm->mem_ptr[0]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + 
recv_peer * NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + + void *send_srcptr = (comm->mem_ptr[srchandler]) + send_offset; + void *send_dstptr = (comm->peer_ptr[dsthandler][send_peerlocal]) + send_offset; + if (comm->use_ce) + CUDACHECK(cudaMemcpyAsync(send_dstptr, send_srcptr, bytes, cudaMemcpyDeviceToDevice, stream)); + SETUP_LAUNCH_CONFIG(signalonly ? 1 : comm->sms, signalonly ? 1 : 1024, stream); + + int *arg1 = &comm->send_id[send_peer]; + int *arg2 = reinterpret_cast(flagptr_send); + int4 *arg3 = reinterpret_cast(send_srcptr); + int4 *arg4 = reinterpret_cast(send_dstptr); + int arg5 = signalonly ? 0 : bytes / 16; + int arg6 = comm->myrank; + int arg7 = recv_peer; + int *arg8 = &comm->recv_id[recv_peer * NVTE_MAX_REGIONS + dsthandler]; + int *arg9 = reinterpret_cast(flagptr_recv); + int arg10 = signalonly ? 1 : comm->sms; + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), + reinterpret_cast(&arg3), reinterpret_cast(&arg4), + reinterpret_cast(&arg5), reinterpret_cast(&arg6), + reinterpret_cast(&arg7), reinterpret_cast(&arg8), + reinterpret_cast(&arg9), reinterpret_cast(&arg10)}; + CUDACHECK( + cudaLaunchKernelExC(&cfg, reinterpret_cast(kuserbuffers_pushsendrecv), kernelArgs)); + //} +} + +void userbuffers_sendrecv_atomic(const int srchandler, const int dsthandler, + const size_t send_offset, const size_t recv_offset, + const size_t bytes, communicator *comm, const int send_peer, + const int recv_peer, void *counters, cudaStream_t stream) { + assert(comm->push && comm->use_ce == 0); + bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0); + + int send_peerlocal = send_peer % comm->nvsize; + int recv_peerlocal = recv_peer % comm->nvsize; + void *flagptr_send = + (comm->peer_ptr[0][send_peerlocal]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + void *flagptr_recv = + (comm->mem_ptr[0]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + recv_peer * NVTE_MAX_REGIONS + 
dsthandler) * + sizeof(int)); + + void *send_srcptr = (comm->mem_ptr[srchandler]) + send_offset; + void *send_dstptr = (comm->peer_ptr[dsthandler][send_peerlocal]) + send_offset; + if (comm->use_ce) { + CUDACHECK(cudaMemcpyAsync(send_dstptr, send_srcptr, bytes, cudaMemcpyDeviceToDevice, stream)); + } + SETUP_LAUNCH_CONFIG(signalonly ? 1 : comm->sms, signalonly ? 1 : 1024, stream); + + int *arg1 = &comm->send_id[send_peer]; + int *arg2 = reinterpret_cast(flagptr_send); + int4 *arg3 = reinterpret_cast(send_srcptr); + int4 *arg4 = reinterpret_cast(send_dstptr); + int arg5 = signalonly ? 0 : bytes / 16; + int arg6 = comm->myrank; + int arg7 = recv_peer; + int *arg8 = &comm->recv_id[recv_peer * NVTE_MAX_REGIONS + dsthandler]; + int *arg9 = reinterpret_cast(flagptr_recv); + int arg10 = signalonly ? 1 : comm->sms; + void *arg11 = counters; + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), + reinterpret_cast(&arg3), reinterpret_cast(&arg4), + reinterpret_cast(&arg5), reinterpret_cast(&arg6), + reinterpret_cast(&arg7), reinterpret_cast(&arg8), + reinterpret_cast(&arg9), reinterpret_cast(&arg10), + reinterpret_cast(&arg11)}; + CUDACHECK(cudaLaunchKernelExC(&cfg, reinterpret_cast(kuserbuffers_pushsendrecv_atomic), + kernelArgs)); +} + +void userbuffers_sendrecv_multiatomic(const int srchandler, const int dsthandler, + const size_t send_stride, const size_t recv_stride, + const size_t bytes, communicator *comm, const int send_peer, + const int recv_peer, const int nchunks, void *counters, + bool shuffle, cudaStream_t stream) { + assert(comm->push && comm->use_ce == 0); + + int send_peerlocal = send_peer % comm->nvsize; + int recv_peerlocal = recv_peer % comm->nvsize; + void *flagptr_send = + (comm->peer_ptr[0][send_peerlocal]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + void *flagptr_recv = + (comm->mem_ptr[0]) + + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + recv_peer * 
NVTE_MAX_REGIONS + dsthandler) * + sizeof(int)); + + SETUP_LAUNCH_CONFIG(comm->sms, 1024, stream); + + int *arg1 = &comm->send_id[send_peer]; + int *arg2 = reinterpret_cast(flagptr_send); + int4 *arg3 = reinterpret_cast((comm->mem_ptr[srchandler])); + int4 *arg4 = reinterpret_cast((comm->peer_ptr[dsthandler][send_peerlocal])); + int arg5 = bytes / 16; + int arg6 = comm->myrank; + int arg7 = recv_peer; + int *arg8 = &comm->recv_id[recv_peer * NVTE_MAX_REGIONS + dsthandler]; + int *arg9 = reinterpret_cast(flagptr_recv); + int arg10 = comm->sms; + void *arg11 = counters; + int arg12 = nchunks; + int arg13 = send_stride; + int arg14 = recv_stride; + bool arg15 = shuffle; + void *kernelArgs[] = {reinterpret_cast(&arg1), reinterpret_cast(&arg2), + reinterpret_cast(&arg3), reinterpret_cast(&arg4), + reinterpret_cast(&arg5), reinterpret_cast(&arg6), + reinterpret_cast(&arg7), reinterpret_cast(&arg8), + reinterpret_cast(&arg9), reinterpret_cast(&arg10), + reinterpret_cast(&arg11), reinterpret_cast(&arg12), + reinterpret_cast(&arg13), reinterpret_cast(&arg14), + reinterpret_cast(&arg15)}; + CUDACHECK(cudaLaunchKernelExC( + &cfg, reinterpret_cast(kuserbuffers_pushsendrecv_multiatomic), kernelArgs)); +} + __global__ void __launch_bounds__(MAX_THREADS) kuserbuffers_alltoall(void **baseflagptrs, int flagoffset, int4 *basesrcptr, void **dstptrs, size_t dstoffset, const int lines, const int myrank) { - if (blockIdx.x == myrank) return; + if (blockIdx.x == myrank) + return; int4 *dstptr = reinterpret_cast(dstptrs[blockIdx.x] + dstoffset); int *flagptr = reinterpret_cast(baseflagptrs[blockIdx.x] + flagoffset); const size_t myblockoffset = blockIdx.x * lines; @@ -1652,14 +3540,18 @@ __global__ void __launch_bounds__(MAX_THREADS) for (int line = start_elem; line < end_aligned; line += blockDim.x * UNROLLCOPY) { int4 val[UNROLLCOPY]; #pragma unroll - for (int i = 0; i < UNROLLCOPY; i++) val[i] = srcptr[line + i * blockDim.x]; + for (int i = 0; i < UNROLLCOPY; i++) + val[i] = 
srcptr[line + i * blockDim.x]; #pragma unroll - for (int i = 0; i < UNROLLCOPY; i++) dstptr[line + i * blockDim.x] = val[i]; + for (int i = 0; i < UNROLLCOPY; i++) + dstptr[line + i * blockDim.x] = val[i]; } - for (int line = end_aligned; line < end_elem; line += blockDim.x) dstptr[line] = srcptr[line]; + for (int line = end_aligned; line < end_elem; line += blockDim.x) + dstptr[line] = srcptr[line]; } __syncthreads(); - if (threadIdx.x) return; + if (threadIdx.x) + return; __threadfence_system(); atomicAdd(flagptr, 1); @@ -1702,7 +3594,8 @@ void userbuffers_recv(const int srchandler, const size_t srcoffset, const int ds sizeof(int)); bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0); bool intranode = INTRANODE(peer); - if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) return; + if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) + return; if (comm->push == 0 && intranode) { void *dstptr = (comm->mem_ptr[dsthandler]) + dstoffset; void *srcptr = (comm->peer_ptr[srchandler][peerlocal]) + srcoffset; @@ -1728,7 +3621,45 @@ void userbuffers_alltoall_recv(communicator *comm, cudaStream_t stream) { (comm->mem_ptr[0]) + ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * userbuffers_alltoall) * sizeof(int)); - if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) return; + if (!(comm->launch_mode & NVTE_LAUNCH_GPU)) + return; kuserbuffers_pushrecv<<<1, 1, 0, stream>>>(comm->myrank, -1, reinterpret_cast(flagptr + 4), reinterpret_cast(flagptr), comm->nranks - 1); } + +// producer +static __global__ void producer_kernel(void *atomic_ptr, int chunk_i) { + // Decrement atomic val to signal current output tile finish + if (blockIdx.x == 0 && threadIdx.x == 0) { + ((unsigned int *)atomic_ptr)[chunk_i] = 0; + } + + // COMM kernel need to explicitely flash gmem. 
+ // GEMM kernel already executed, and can not see gmem + // change without COMM kernel explicitely make change + asm volatile("fence.sc.gpu;\n"); +} + +// consumer +static __global__ void consumer_kernel(void *atomic_ptr, int chunk_i) { + // Wait for producer to change the val to 0, which signal producer ready + if (blockIdx.x == 0 && threadIdx.x == 0) { + int old_val; + while (0 != (old_val = atomicCAS((unsigned int *)atomic_ptr + chunk_i, 0, 0))) { + } + ((unsigned int *)atomic_ptr)[chunk_i] = 1; + asm volatile("fence.sc.gpu;\n"); + } +} + +void producer(void *atomic_ptr, int chunk_i, cudaStream_t stream) { + dim3 block(1); + dim3 grid(1); + producer_kernel<<>>(atomic_ptr, chunk_i); +} + +void consumer(void *atomic_ptr, int chunk_i, cudaStream_t stream) { + dim3 block(1); + dim3 grid(1); + consumer_kernel<<>>(atomic_ptr, chunk_i); +} diff --git a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h index d6ec23c40d..7f635771c9 100644 --- a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h +++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.h @@ -24,6 +24,18 @@ #define NVTE_LAUNCH_CPU 2 #define NVTE_MAX_NVLINK 8 +#define UB_MEM_UC_CONTIG 1 +#define UB_MEM_MC_CREATED 2 +#define UB_MEM_ALLOCATED 4 + +#define NVTE_UB_MEM_UC_CONTIG 1 +#define NVTE_UB_MEM_MC_CREATED 2 +#define NVTE_UB_MEM_ALLOCATED 4 + +#ifdef UCP +#include +#endif + // region 0 flag offsets #define NVTE_REG0_OPFLAGS 1024 #define NVTE_REG0_RECV (NVTE_REG0_OPFLAGS * userbuffers_op_types) @@ -35,6 +47,10 @@ #define NVTE_REG0_IBRS 32 #define NVTE_REG0_IBAG 512 +#if defined(UCP) || !defined(NOSHARP) +#undef REG0_COMMBUFFER +#define REG0_COMMBUFFER (1024*1024*16) +#endif // gpuflags map offsets #define NVTE_GF_STATE 16000 #define NVTE_GF_IBSHARPDONE 0 @@ -81,6 +97,19 @@ struct communicator { void *mem_ptr[NVTE_MAX_REGIONS]; void **peer_ptr[NVTE_MAX_REGIONS]; + + int memflags[NVTE_MAX_REGIONS]; // UC,MC, user/lib allocated + 
+ CUmemGenericAllocationHandle *uchandles[NVTE_MAX_REGIONS]; + void* ucbase_ptr[NVTE_MAX_REGIONS]; // only for cuMem allocated memory + size_t mem_size[NVTE_MAX_REGIONS]; + + void* mc_ptr[NVTE_MAX_REGIONS]; + void* mc_baseptr; + CUmemGenericAllocationHandle mc_handle; + size_t mc_offset, mc_maxsize; + int use_mc; // 1: use MC if available, 0: override not to use MC + int ar_nvsize, ar_firstgpu, ar_nvrank; // number of gpus(and first gpu in a group) of gpus per node in reduction subgroup // (_splitar init used) would be equal to (nvsize,0) for regular comm_create @@ -120,6 +149,8 @@ struct communicator { }; typedef struct communicator communicator; +void producer(void *atomic_ptr, int chunk_i, cudaStream_t stream); +void consumer(void *atomic_ptr, int chunk_i, cudaStream_t stream); int create_communicator(communicator **comm); /* creates communicator, allocates all internal buffers if necessary */ @@ -191,6 +222,45 @@ void reducescatter2_userbuff_stridedoutput(void *output, const int handler, cons const int rowelements, const int colelements, const int strideelements, communicator *comm, cudaStream_t stream = 0); +template +void reducescatter2_userbuff_stridedoutput_fp8(void* output, float* scale, const int handler, + const int offset, const int rowelements, + const int colelements, const int strideelements, + communicator* comm, cudaStream_t stream = 0); +template +void reducescatter2_userbuff_fp8(void* output, float* scale, const int handler, const int offset, + const int elements, communicator* comm, cudaStream_t stream = 0); +#if 0 +template +void reducescatter2_userbuff_strided_atomic_fp8(void* output, float *scale, const int handler, + const int offset, const int rowelements, + const int colelements, const int strideelements, + const int numchunks, void *counters, + communicator* comm, cudaStream_t stream = 0); +#endif +template +void reducescatter2_userbuff_strided_atomic_fp8(void* output, float *scale, const int handler, + const int offset, const int 
rowelements, + const int colelements, const int strideelements_out, + const int strideelements_in, const int numchunks, + void *counters, communicator* comm, + cudaStream_t stream = 0); +template +void reducescatter2_userbuff_strided_multiatomic_fp8( + void* output, float *scale, const int handler, const int offset, const int rowelements, + const int colelements, const int strideelements_out, const int strideelements_in, + const int numchunks, void *counters, communicator* comm, cudaStream_t stream = 0); +void reducescatter2_userbuff_strided( + void* output, const int handler, const int offset, const int rowelements, const int colelements, + const int strideelements, communicator* comm, cudaStream_t stream = 0); +void reducescatter2_userbuff_strided_atomic( + void* output, const int handler , const int offset, const int rowelements, const int colelements, + const int strideelements, const int numchunks, void *counters, communicator* comm, + cudaStream_t stream = 0); +void reducescatter2_userbuff_strided_multiatomic( + void* output, const int handler, const int offset, const int rowelements, const int colelements, + const int strideelements, const int numchunks, void *counters, communicator* comm, + cudaStream_t stream = 0); /* everything should be 16byte aligned = 8 elts aligned output is strided: row starts separated by stride elements*/ @@ -208,6 +278,19 @@ void userbuffers_send(const int srchandler, const size_t srcoffset, const int ds void userbuffers_recv(const int srchandler, const size_t srcoffset, const int dsthandler, const size_t dstoffset, const size_t bytes, communicator *comm, const int peer, cudaStream_t stream = 0); +void userbuffers_sendrecv( + const int srchandler, const int dsthandler, const size_t send_offset, const size_t recv_offset, + const size_t bytes, communicator* comm, const int send_peer, const int recv_peer, + cudaStream_t stream = 0); +void userbuffers_sendrecv_atomic( + const int srchandler, const int dsthandler, const size_t 
send_offset, const size_t recv_offset, + const size_t bytes, communicator* comm, const int send_peer, const int recv_peer, void *counters, + cudaStream_t stream = 0); +void userbuffers_sendrecv_multiatomic( + const int srchandler, const int dsthandler, const size_t send_offset, const size_t recv_offset, + const size_t bytes, communicator* comm, const int send_peer, const int recv_peer, + const int nchunks, void *counters, bool shuffle, cudaStream_t stream = 0); + // alltoall split send and recv to allow for overlap // send kicks in sending data to the destination - invoke on same stream as data generation diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 8bb9d55f38..7076e59600 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -124,6 +124,8 @@ def initialize_ub( fp8_buf = [ "qkv_fprop", "qkv_dgrad", "proj_dgrad", "fc1_fprop", "fc1_dgrad", "fc2_dgrad" ] + if bool(int(os.getenv("NVTE_UB_FP8_RS", "0"))): + fp8_buf.append ("proj_fprop") # Default overlap methods for layers methods = { "ring_exchange":["qkv_fprop", "fc1_fprop", "proj_dgrad", "fc2_dgrad"], @@ -153,8 +155,12 @@ def add_ub( sample_buffer, # Sample userbuffer rank_id, # Rank id tp_size, # TP size + num_sm, # Number of communication SMs + cga_size, # CGA cluster size + set_sm_margin, # Set SM margin aggregate, # Aggregate 2X GEMM chunks _NUM_MAX_UB_STREAMS, # Max concurrent GEMM streams + torch.Tensor(), # empty tensor to pass to counters ) else: ub_obj = tex.UbufCommOverlap( @@ -166,6 +172,7 @@ def add_ub( num_splits, # Number of communication splits set_sm_margin, # Set SM margin _NUM_MAX_UB_STREAMS, # Max concurrent GEMM streams + torch.Tensor(), # empty tensor to pass to counters ) _ub_communicators[name] = ub_obj @@ -676,10 +683,12 @@ def grad_output_preprocess( grad_output_mat = grad_output.view((-1, grad_output.shape[-1])) gather_grad_output = row_parallel_mode and ctx.sequence_parallel + if 
gather_grad_output: + ub_overlap_ag = ctx.ub_split_ag or ctx.ub_atomic_gemm_ag # No-FP8 case: bgrad is fused with wgrad for this case. if not ctx.fp8: if gather_grad_output: - if not ctx.ub_split_ag: + if not ub_overlap_ag: grad_output_mat, _ = gather_along_first_dim( grad_output_mat, ctx.tp_group ) @@ -698,8 +707,8 @@ def grad_output_preprocess( and ctx.fp8_meta["recipe"].override_linear_precision.wgrad ): assert ( - not ctx.ub_split_ag - ), "override_linear_precision.wgrad not supported with ub_split_ag" + not ub_overlap_ag + ), "override_linear_precision.wgrad not supported with UB AG overlap" grad_output_mat, _ = gather_along_first_dim(grad_output_mat, ctx.tp_group) # FP8 case with gather: unfused bgrad, cast, transpose for efficient gather elif gather_grad_output: @@ -707,7 +716,7 @@ def grad_output_preprocess( grad_bias = grad_output_mat.sum(dim=0) else: grad_bias = None - if ctx.ub_split_ag: + if ub_overlap_ag: grad_output_c = ctx.ub_obj_gradout.get_ubuf_output(0) else: grad_output_c = torch.empty_like(grad_output_mat, dtype=torch.uint8) @@ -718,7 +727,7 @@ def grad_output_preprocess( fp8_dtype_backward, out=grad_output_c, ) - if not ctx.ub_split_ag: + if not ub_overlap_ag: grad_output_c, _ = gather_along_first_dim(grad_output_c, ctx.tp_group) grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) else: diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index b7372f81fe..71af058415 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -83,6 +83,7 @@ def forward( ub_bulk_dgrad: bool, ub_split_ag: bool, normalization: str, + ub_atomic_gemm_ag: bool, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible in_features = ln_weight.numel() @@ -100,11 +101,12 @@ def forward( if ln_bias is not None: ln_bias = cast_if_needed(ln_bias, activation_dtype) - if ub_split_ag: 
+ if ub_split_ag or ub_atomic_gemm_ag: tp_world_size = get_distributed_world_size(tp_group) if tp_world_size == 1 or (not is_grad_enabled) or return_layernorm_output: ub_split_ag = False - if ub_split_ag: + ub_atomic_gemm_ag = False + if ub_split_ag or ub_atomic_gemm_ag: dim_size = list(inputmat.size()) dim_size[0] = dim_size[0] * tp_world_size ub_obj_lnout = get_ub("qkv_fprop") @@ -112,6 +114,8 @@ def forward( else: ln_out_dtype = torch.uint8 if fp8 else inputmat.dtype ln_out = torch.empty_like(inputmat, dtype=ln_out_dtype) + if ub_atomic_gemm_ag: + assert fp8, "AtomicGemm overlap supported only for FP8 GEMM." fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) @@ -139,7 +143,7 @@ def forward( fp8_dtype_forward, ) # Column Parallel Linear - if ub_split_ag: + if ub_split_ag or ub_atomic_gemm_ag: ln_out_total = ub_obj_lnout.get_ubuf_output(1) ln_out = torch.empty_like(ln_out) elif parallel_mode == "column" and sequence_parallel: @@ -173,6 +177,8 @@ def forward( tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward) + ub_algo = tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None + ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ub_atomic_gemm_ag else ub_algo out = tex.fp8_gemm( weight_fp8, fp8_meta["scaling_fwd"].scale_inv, @@ -187,9 +193,9 @@ def forward( bias=bias, use_bias=use_bias, use_split_accumulator=_2X_ACC_FPROP, - ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None, - ub=ub_obj_lnout if ub_split_ag else None, - extra_output_tensor=ln_out if ub_split_ag else None, + ub_algo=ub_algo, + ub=ub_obj_lnout if (ub_split_ag or ub_atomic_gemm_ag) else None, + extra_output_tensor=ln_out if (ub_split_ag or ub_atomic_gemm_ag) else None, ) else: # Cast for native AMP @@ -339,6 +345,14 @@ def backward( fp8_dtype_backward = get_fp8_te_dtype( ctx.fp8_meta["recipe"], fprop_tensor=False ) + out_index, meta_tensor, out_te_type, out_type = ( + None, None, None, ctx.activation_dtype) + if ctx.ub_bulk_wgrad and 
ub_obj_dgrad.is_fp8_ubuf(): + out_index = tex.FP8BwdTensors.GRAD_INPUT1 + meta_tensor = ctx.fp8_meta["scaling_bwd"] + out_te_type = fp8_dtype_backward + out_type = torch.uint8 + ub_obj_dgrad.set_ubuf_scale_inv(meta_tensor.scale_inv[out_index]) # DGRAD: Evaluated unconditionally to feed into Linear backward _ = tex.fp8_gemm( @@ -350,12 +364,15 @@ def backward( ctx.fp8_meta["scaling_bwd"].scale_inv, tex.FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, - ctx.activation_dtype, + out_type, get_workspace(), out=dgrad, use_split_accumulator=_2X_ACC_DGRAD, ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_AG if ctx.ub_bulk_dgrad else None, - ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None + ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None, + out_index=out_index, + fp8_meta_tensor = meta_tensor, + D_dtype = out_te_type, ) else: # DGRAD: Evaluated unconditionally to feed into Linear backward @@ -387,6 +404,15 @@ def backward( if weight.requires_grad: if ctx.fp8: # WGRAD + extra_output_tensor = None + if ctx.ub_bulk_wgrad: + if ub_obj_dgrad.is_fp8_ubuf(): + dim_size = list(ub_obj_dgrad.get_ubuf_output(0).size()) # RS output + extra_output_tensor = torch.empty( + dim_size, dtype=ctx.activation_dtype, device=dgrad.device) + dgrad = extra_output_tensor + else: + dgrad = ub_obj_dgrad.get_ubuf_output(0) if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: ln_out_total_t = tex.fp8_transpose(ln_out_total, fp8_dtype_forward) wgrad = tex.fp8_gemm( @@ -405,7 +431,8 @@ def backward( use_split_accumulator=_2X_ACC_WGRAD, ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, - ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None, + extra_output_tensor=extra_output_tensor ) else: ln_out_total_c = tex.cast_from_fp8( @@ -426,7 +453,8 @@ def backward( out=weight.main_grad if ctx.fuse_wgrad_accumulation else None, ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, - ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else 
None + ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None, + extra_output_tensor=extra_output_tensor ) else: # WGRAD @@ -443,12 +471,14 @@ def backward( ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None ) + if ctx.ub_bulk_wgrad: + dgrad = ub_obj_dgrad.get_ubuf_output(0) # Reduce-scatter output - - if ctx.ub_bulk_wgrad: - dgrad = ub_obj_dgrad.get_ubuf_output(0) # Reduce-scatter output # Column Parallel Linear - elif ctx.parallel_mode == "column" and ctx.tensor_parallel and handle is not None: + if ((not ctx.ub_bulk_wgrad) + and ctx.parallel_mode == "column" + and ctx.tensor_parallel + and handle is not None): handle.wait() # LayerNorm gradient @@ -504,6 +534,7 @@ def backward( None, None, None, + None, ) @@ -616,6 +647,7 @@ def __init__( ub_bulk_dgrad: bool = False, ub_split_ag: bool = False, device: Union[torch.device, str] = "cuda", + ub_atomic_gemm_ag: bool = False, ) -> None: super().__init__() @@ -642,12 +674,18 @@ def __init__( self.ub_bulk_wgrad = ub_bulk_wgrad self.ub_bulk_dgrad = ub_bulk_dgrad self.ub_split_ag = ub_split_ag + self.ub_atomic_gemm_ag = ub_atomic_gemm_ag - if ub_bulk_wgrad or ub_bulk_dgrad or ub_split_ag: + if ub_bulk_wgrad or ub_bulk_dgrad or ub_split_ag or ub_atomic_gemm_ag: assert ( tex.userbuf_comm_available() ), "Userbuffer communication backend not available." + if ub_atomic_gemm_ag: + warnings.warn( + "Atomic gemm uses a beta API from cublas and is not tested for all use cases." 
+ ) + if tp_group is None: self.tp_size = tp_size if tp_size == 1: @@ -909,6 +947,7 @@ def forward( self.ub_bulk_dgrad, self.ub_split_ag, self.normalization, + self.ub_atomic_gemm_ag, ) out = fwd_fn(*args) diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index ea9f7b5b2b..2daf73f11c 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -4,6 +4,7 @@ """LayerNormMLP API""" import os +import warnings from typing import Union, Optional, Callable, Tuple, List, Dict, Any import torch @@ -107,7 +108,9 @@ def forward( ub_bulk_wgrad: bool, ub_bulk_dgrad: bool, ub_split_rs: bool, + ub_atomic_gemm_rs: bool, ub_split_ag: bool, + ub_atomic_gemm_ag: bool, activation: str, normalization: str, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: @@ -130,20 +133,25 @@ def forward( if ln_bias is not None: ln_bias = cast_if_needed(ln_bias, activation_dtype) - if ub_split_ag: + if ub_split_ag or ub_atomic_gemm_ag: tp_world_size = get_distributed_world_size(tp_group) if tp_world_size == 1 or (not is_grad_enabled) or return_layernorm_output: ub_split_ag = False - if ub_split_ag: + ub_atomic_gemm_ag = False + ub_overlap_ag = ub_split_ag or ub_atomic_gemm_ag + if ub_overlap_ag: ub_obj_lnout = get_ub("fc1_fprop") ln_out = ub_obj_lnout.get_ubuf_output(0) else: ln_out_dtype = torch.uint8 if (fp8 and not return_layernorm_output) else inputmat.dtype ln_out = torch.empty_like(inputmat, dtype=ln_out_dtype) - if ub_split_rs: + if ub_split_rs or ub_atomic_gemm_rs: tp_world_size = get_distributed_world_size(tp_group) if tp_world_size == 1: ub_split_rs = False + ub_atomic_gemm_rs = False + if ub_atomic_gemm_rs or ub_atomic_gemm_ag: + assert fp8, "AtomicGemm overlap supported only for FP8 GEMM." 
fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) @@ -171,7 +179,7 @@ def forward( fp8_dtype_forward, ) # Column Parallel Linear - if ub_split_ag: + if ub_overlap_ag: ln_out_total = ub_obj_lnout.get_ubuf_output(1) ln_out = torch.empty_like(ln_out) elif set_parallel_mode and sequence_parallel: @@ -223,6 +231,8 @@ def forward( fp8_dtype_forward, ) + ub_algo = tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None + ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ub_atomic_gemm_ag else ub_algo fc1_out = tex.fp8_gemm( fc1_weight_fp8, fp8_meta["scaling_fwd"].scale_inv, @@ -237,9 +247,9 @@ def forward( bias=fc1_bias, use_bias=use_fc1_bias, use_split_accumulator=_2X_ACC_FPROP, - ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None, - ub=ub_obj_lnout if ub_split_ag else None, - extra_output_tensor=ln_out if ub_split_ag else None, + ub_algo=ub_algo, + ub=ub_obj_lnout if ub_overlap_ag else None, + extra_output_tensor=ln_out if ub_overlap_ag else None, ) gelu_out = activation_func( @@ -249,18 +259,29 @@ def forward( fp8_dtype_forward, ) - if ub_split_rs: + fc2_out_index, fc2_meta_tensor, fc2_te_type, out_type = ( + None, None, None, activation_dtype) + if ub_split_rs or ub_atomic_gemm_rs: ub_obj_fc2out = get_ub("fc2_fprop") fc2_out = ub_obj_fc2out.get_ubuf_output(1) dim_size = list(gelu_out.size()) dim_size[0] = dim_size[0] // tp_world_size dim_size[1] = fc2_weight.size(0) rs_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + + if ub_obj_fc2out.is_fp8_ubuf(): + fc2_out_index = tex.FP8FwdTensors.GEMM2_OUTPUT + fc2_meta_tensor = fp8_meta["scaling_fwd"] + fc2_te_type = fp8_dtype_forward + out_type = torch.uint8 + ub_obj_fc2out.set_ubuf_scale_inv(fc2_meta_tensor.scale_inv[fc2_out_index]) else: dim_size = list(gelu_out.size()) dim_size[1] = fc2_weight.size(0) fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + ub_algo=tex.UbufOverlapAlgo.ATOMIC_GEMM_RS if 
ub_atomic_gemm_rs else None + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else ub_algo _ = tex.fp8_gemm( fc2_weight_fp8, fp8_meta["scaling_fwd"].scale_inv, @@ -270,15 +291,18 @@ def forward( fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM2_INPUT, fp8_dtype_forward, - activation_dtype, + out_type, get_workspace(), bias=fc2_bias, use_bias=use_fc2_bias, use_split_accumulator=_2X_ACC_FPROP, out=fc2_out, - ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else None, - ub=ub_obj_fc2out if ub_split_rs else None, - extra_output_tensor=rs_out if ub_split_rs else None, + ub_algo=ub_algo, + ub=ub_obj_fc2out if ub_split_rs or ub_atomic_gemm_rs else None, + extra_output_tensor=rs_out if ub_split_rs or ub_atomic_gemm_rs else None, + out_index=fc2_out_index, + fp8_meta_tensor = fc2_meta_tensor, + D_dtype = fc2_te_type, ) else: # Cast for native AMP @@ -394,11 +418,12 @@ def forward( ctx.ub_bulk_wgrad = ub_bulk_wgrad ctx.ub_bulk_dgrad = ub_bulk_dgrad ctx.ub_split_ag = ub_split_ag + ctx.ub_atomic_gemm_ag = ub_atomic_gemm_ag ctx.requires_dgrad = inp.requires_grad ctx.normalization = normalization # Row Parallel Linear - if ub_split_rs: + if ub_split_rs or ub_atomic_gemm_rs: fc2_out = rs_out elif set_parallel_mode and sequence_parallel: fc2_out, _ = reduce_scatter_along_first_dim(fc2_out, tp_group) @@ -447,11 +472,15 @@ def backward( dim_size[0] = dim_size[0] * tp_world_size ub_obj_lnout = get_ub("fc1_dgrad") ub_obj_lnout.copy_input_to_ubuf(ln_out, 1) - if ctx.ub_split_ag: + ub_overlap_ag = ctx.ub_split_ag or ctx.ub_atomic_gemm_ag + if ub_overlap_ag: tp_world_size = get_distributed_world_size(ctx.tp_group) if tp_world_size == 1: ctx.ub_split_ag = False - if ctx.ub_split_ag: + ctx.ub_overlap_ag = False + ub_overlap_ag = ctx.ub_split_ag or ctx.ub_atomic_gemm_ag + + if ub_overlap_ag: dim_size = list(grad_outputs[0].size()) dim_size[0] = dim_size[0] * tp_world_size ctx.ub_obj_gradout = get_ub("fc2_dgrad") @@ -497,6 +526,8 @@ def backward( 
ctx.fp8_meta["recipe"], fprop_tensor=False ) + ub_algo = tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None + ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ctx.ub_atomic_gemm_ag else ub_algo # FC2 DGRAD; Unconditional fc2_dgrad = tex.fp8_gemm( fc2_weight_t_fp8, @@ -510,10 +541,10 @@ def backward( ctx.activation_dtype, get_workspace(), use_split_accumulator=_2X_ACC_DGRAD, - ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None, - ub=ctx.ub_obj_gradout if ctx.ub_split_ag else None, + ub_algo=ub_algo, + ub=ctx.ub_obj_gradout if ub_overlap_ag else None, ) - if ctx.ub_split_ag: + if ub_overlap_ag: grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) # FC2 WGRAD if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: @@ -595,11 +626,19 @@ def backward( ) dgelu_t = None + out_index, meta_tensor, out_te_type, out_type = ( + None, None, None, ctx.activation_dtype) fc1_dgrad_size = list(dgelu.size()) fc1_dgrad_size[1] = fc1_weight.size(1) if ctx.ub_bulk_wgrad: # allocate dgrad output ub_obj_dgrad = get_ub("fc1_wgrad") fc1_dgrad = ub_obj_dgrad.get_ubuf_output(1) # AllGather output + if ub_obj_dgrad.is_fp8_ubuf(): + out_index = tex.FP8BwdTensors.GRAD_INPUT2 + meta_tensor = ctx.fp8_meta["scaling_bwd"] + out_te_type = fp8_dtype_backward + out_type = torch.uint8 + ub_obj_dgrad.set_ubuf_scale_inv(meta_tensor.scale_inv[out_index]) else: fc1_dgrad = torch.empty( fc1_dgrad_size, dtype=ctx.activation_dtype, device=fc1_weight.device @@ -614,12 +653,15 @@ def backward( ctx.fp8_meta["scaling_bwd"].scale_inv, tex.FP8BwdTensors.GRAD_OUTPUT2, fp8_dtype_backward, - ctx.activation_dtype, + out_type, get_workspace(), out=fc1_dgrad, use_split_accumulator=_2X_ACC_DGRAD, ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_AG if ctx.ub_bulk_dgrad else None, - ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None + ub=ub_obj_lnout if ctx.ub_bulk_dgrad else None, + out_index=out_index, + fp8_meta_tensor = meta_tensor, + D_dtype = out_te_type, ) else: # 
FC2 DGRAD; Unconditional @@ -703,6 +745,15 @@ def backward( if fc1_weight.requires_grad: if ctx.fp8: # FC1 WGRAD + extra_output_tensor = None + if ctx.ub_bulk_wgrad: + if ub_obj_dgrad.is_fp8_ubuf(): + dim_size = list(ub_obj_dgrad.get_ubuf_output(0).size()) # RS output + extra_output_tensor = torch.empty( + dim_size, dtype=ctx.activation_dtype, device=fc1_dgrad.device) + fc1_dgrad = extra_output_tensor + else: + fc1_dgrad = ub_obj_dgrad.get_ubuf_output(0) if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: ln_out_total_t = tex.fp8_transpose(ln_out_total, fp8_dtype_forward) fc1_wgrad = tex.fp8_gemm( @@ -724,6 +775,7 @@ def backward( ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None, + extra_output_tensor=extra_output_tensor, ) else: ln_out_total_c = tex.cast_from_fp8( @@ -747,6 +799,7 @@ def backward( ub_algo=tex.UbufOverlapAlgo.BULK_OVERLAP_RS if ctx.ub_bulk_wgrad else None, ub=ub_obj_dgrad if ctx.ub_bulk_wgrad else None, + extra_output_tensor=extra_output_tensor, ) else: # FC1 WGRAD @@ -768,11 +821,14 @@ def backward( fc1_wgrad, _, _ = fc1_wgrad_outputs else: fc1_wgrad, fc1_bias_grad, _ = fc1_wgrad_outputs + if ctx.ub_bulk_wgrad: + fc1_dgrad = ub_obj_dgrad.get_ubuf_output(0) # Reduce-scatter output # Column Parallel Linear - if ctx.ub_bulk_wgrad: - fc1_dgrad = ub_obj_dgrad.get_ubuf_output(0) # Reduce-scatter output - elif ctx.set_parallel_mode and ctx.tensor_parallel and handle is not None: + if ((not ctx.ub_bulk_wgrad) + and ctx.set_parallel_mode + and ctx.tensor_parallel + and handle is not None): handle.wait() # LayerNorm gradient @@ -832,6 +888,8 @@ def backward( None, None, None, + None, + None, ) @@ -947,8 +1005,10 @@ def __init__( ub_bulk_wgrad: bool = False, ub_bulk_dgrad: bool = False, ub_split_rs: bool = False, + ub_atomic_gemm_rs: bool = False, ub_split_ag: bool = False, device: Union[torch.device, str] = "cuda", + ub_atomic_gemm_ag: bool = False, ) -> None: 
super().__init__() @@ -969,12 +1029,24 @@ def __init__( self.ub_bulk_dgrad = ub_bulk_dgrad self.ub_split_rs = ub_split_rs self.ub_split_ag = ub_split_ag - - if ub_bulk_wgrad or ub_bulk_dgrad or ub_split_rs or ub_split_ag: + self.ub_atomic_gemm_rs = ub_atomic_gemm_rs + self.ub_atomic_gemm_ag = ub_atomic_gemm_ag + + if (ub_bulk_wgrad # pylint: disable=too-many-boolean-expressions + or ub_bulk_dgrad + or ub_split_rs + or ub_split_ag + or ub_atomic_gemm_rs + or ub_atomic_gemm_ag): assert ( tex.userbuf_comm_available() ), "Userbuffer communication backend not available." + if ub_atomic_gemm_rs or ub_atomic_gemm_ag: + warnings.warn( + "Atomic gemm uses a beta API from cublas and is not tested for all use cases." + ) + if tp_group is None: self.tp_size = tp_size if tp_size == 1: @@ -1189,7 +1261,9 @@ def forward( self.ub_bulk_wgrad, self.ub_bulk_dgrad, self.ub_split_rs, + self.ub_atomic_gemm_rs, self.ub_split_ag, + self.ub_atomic_gemm_ag, self.activation, self.normalization, ) diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 98ca2015ed..2d9dbac057 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -77,6 +77,8 @@ def forward( is_grad_enabled: bool, ub_split_rs: bool, ub_split_ag: bool, + ub_atomic_gemm_rs: bool, + ub_atomic_gemm_ag: bool, ) -> torch.Tensor: # Make sure input dimensions are compatible in_features = weight.shape[-1] @@ -88,10 +90,13 @@ def forward( update_fp8_weights = is_first_microbatch is None or is_first_microbatch - if ub_split_rs: + if ub_split_rs or ub_atomic_gemm_rs: tp_world_size = get_distributed_world_size(tp_group) if tp_world_size == 1: ub_split_rs = False + ub_atomic_gemm_rs = False + if ub_atomic_gemm_rs or ub_atomic_gemm_ag: + assert fp8, "AtomicGemm overlap supported only for FP8 GEMM." 
# Cast for native AMP inputmat = cast_if_needed(inputmat, activation_dtype) inputmat_no_fp8 = inputmat @@ -155,18 +160,29 @@ def forward( fp8_dtype_forward, ) - if ub_split_rs: + proj_out_index, meta_tensor, proj_out_tetype, proj_out_pttype = ( + None, None, None, activation_dtype) + if ub_split_rs or ub_atomic_gemm_rs: ub_obj_projout = get_ub("proj_fprop") out = ub_obj_projout.get_ubuf_output(1) dim_size = list(inputmat_total.size()) dim_size[0] = dim_size[0] // tp_world_size dim_size[1] = weight.size(0) rs_out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + + if ub_obj_projout.is_fp8_ubuf(): + proj_out_index = tex.FP8FwdTensors.GEMM1_OUTPUT + meta_tensor = fp8_meta["scaling_fwd"] + proj_out_tetype = fp8_dtype_forward + proj_out_pttype = torch.uint8 + ub_obj_projout.set_ubuf_scale_inv(meta_tensor.scale_inv[proj_out_index]) else: dim_size = list(inputmat_total.size()) dim_size[1] = weight.size(0) out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + ub_algo=tex.UbufOverlapAlgo.ATOMIC_GEMM_RS if ub_atomic_gemm_rs else None + ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else ub_algo _ = fp8_gemm( weight_fp8, fp8_meta["scaling_fwd"].scale_inv, @@ -176,15 +192,18 @@ def forward( fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, - activation_dtype, + proj_out_pttype, get_workspace(), bias=bias, use_bias=use_bias, use_split_accumulator=_2X_ACC_FPROP, out=out, - ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else None, - ub=ub_obj_projout if ub_split_rs else None, - extra_output_tensor=rs_out if ub_split_rs else None, + ub_algo=ub_algo, + ub=ub_obj_projout if (ub_split_rs or ub_atomic_gemm_rs) else None, + extra_output_tensor=rs_out if (ub_split_rs or ub_atomic_gemm_rs) else None, + out_index=proj_out_index, + fp8_meta_tensor = meta_tensor, + D_dtype = proj_out_tetype, ) else: # Cast for native AMP @@ -245,11 +264,12 @@ def forward( 
ctx.parallel_mode = parallel_mode ctx.tp_group = tp_group ctx.ub_split_ag = ub_split_ag + ctx.ub_atomic_gemm_ag = ub_atomic_gemm_ag ctx.tp_size = tp_size ctx.requires_dgrad = inp.requires_grad # Row Parallel Linear - if ub_split_rs: + if ub_split_rs or ub_atomic_gemm_rs: out = rs_out elif parallel_mode == "row" and sequence_parallel: out, _ = reduce_scatter_along_first_dim(out, tp_group) @@ -275,11 +295,12 @@ def backward( fwd_scale_inverses, ) = ctx.saved_tensors - if ctx.ub_split_ag: + if ctx.ub_split_ag or ctx.ub_atomic_gemm_ag: tp_world_size = get_distributed_world_size(ctx.tp_group) if tp_world_size == 1: ctx.ub_split_ag = False - if ctx.ub_split_ag: + ctx.ub_atomic_gemm_ag = False + if ctx.ub_split_ag or ctx.ub_atomic_gemm_ag: dim_size = list(grad_output.size()) dim_size[0] = dim_size[0] * tp_world_size ctx.ub_obj_gradout = get_ub("proj_dgrad") @@ -323,6 +344,8 @@ def backward( ctx.fp8_meta["recipe"], fprop_tensor=False ) + ub_algo = tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None + ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ctx.ub_atomic_gemm_ag else ub_algo if ctx.requires_dgrad: if ctx.fp8: dgrad = fp8_gemm( @@ -337,8 +360,8 @@ def backward( ctx.activation_dtype, get_workspace(), use_split_accumulator=_2X_ACC_DGRAD, - ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ctx.ub_split_ag else None, - ub=ctx.ub_obj_gradout if ctx.ub_split_ag else None, + ub_algo=ub_algo, + ub=ctx.ub_obj_gradout if ctx.ub_split_ag or ctx.ub_atomic_gemm_ag else None, ) else: dgrad, _, _ = gemm( @@ -366,7 +389,7 @@ def backward( if ctx.fp8: # WGRAD if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: - if ctx.ub_split_ag: + if ctx.ub_split_ag or ctx.ub_atomic_gemm_ag: grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) wgrad = fp8_gemm( inputmat_t_total, @@ -436,6 +459,8 @@ def backward( None, None, None, + None, + None, ) @@ -529,6 +554,8 @@ def __init__( ub_split_rs: bool = False, ub_split_ag: bool = False, device: 
Union[torch.device, str] = "cuda", + ub_atomic_gemm_rs: bool = False, + ub_atomic_gemm_ag: bool = False, ) -> None: super().__init__() @@ -550,12 +577,19 @@ def __init__( self.parameters_split = parameters_split self.ub_split_rs = ub_split_rs self.ub_split_ag = ub_split_ag + self.ub_atomic_gemm_rs = ub_atomic_gemm_rs + self.ub_atomic_gemm_ag = ub_atomic_gemm_ag - if ub_split_rs or ub_split_ag: + if ub_split_rs or ub_split_ag or ub_atomic_gemm_rs: assert ( tex.userbuf_comm_available() ), "Userbuffer communication backend not available." + if ub_atomic_gemm_rs or ub_atomic_gemm_ag: + warnings.warn( + "Atomic gemm uses a beta API from cublas and is not tested for all use cases." + ) + if tp_group is None: self.tp_size = tp_size if tp_size == 1: @@ -774,6 +808,8 @@ def forward( torch.is_grad_enabled(), self.ub_split_rs, self.ub_split_ag, + self.ub_atomic_gemm_rs, + self.ub_atomic_gemm_ag, ) out = linear_fn(*args) diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index d8a1aa1ad2..cded3bf53f 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -263,6 +263,22 @@ def __init__( ub_bulk_dgrad = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_BULK_DGRAD", "1"))) ub_split_ag = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_SPLIT_AG", "1"))) ub_split_rs = ub_tp_comm_overlap and bool(int(os.getenv("NVTE_UB_SPLIT_RS", "1"))) + ub_atomic_gemm_rs = (ub_tp_comm_overlap + and bool(int(os.getenv("NVTE_UB_ATOMIC_GEMM_RS", "0")))) + assert ( + not (ub_split_rs and ub_atomic_gemm_rs) + ), "Only one type of RS overlap NVTE_UB_SPLIT_RS/NVTE_UB_ATOMIC_GEMM_RS should be enabled." + ub_atomic_gemm_ag = (ub_tp_comm_overlap + and bool(int(os.getenv("NVTE_UB_ATOMIC_GEMM_AG", "0")))) + assert ( + not (ub_split_ag and ub_atomic_gemm_ag) + ), "Only one type of AG overlap NVTE_UB_SPLIT_AG/NVTE_UB_ATOMIC_GEMM_AG should be enabled." 
+ + if ub_atomic_gemm_rs or ub_atomic_gemm_ag: + warnings.warn( + "Atomic gemm uses a beta API from cublas and is not tested for all use cases." + ) + bias_dropout_fusion = bool(int(os.getenv("NVTE_BIAS_DROPOUT_FUSION", "1"))) self.layer_number = layer_number self.output_layernorm = output_layernorm @@ -323,6 +339,8 @@ def __init__( "ub_bulk_dgrad" : ub_bulk_dgrad, "ub_split_ag" : ub_split_ag, "ub_split_rs" : ub_split_rs, + "ub_atomic_gemm_rs" : ub_atomic_gemm_rs, + "ub_atomic_gemm_ag" : ub_atomic_gemm_ag, } self.self_attention = MultiheadAttention( @@ -377,6 +395,8 @@ def __init__( ub_bulk_dgrad=ub_bulk_dgrad, ub_split_rs=ub_split_rs, ub_split_ag=ub_split_ag, + ub_atomic_gemm_rs=ub_atomic_gemm_rs, + ub_atomic_gemm_ag=ub_atomic_gemm_ag, activation=activation, normalization=normalization, device=device, From 8eae4ce2b8fdfbbe525fc8bfecb0df5498cc9687 Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Sat, 7 Oct 2023 01:20:16 +0800 Subject: [PATCH 061/427] [JAX] Enhance Dropout in TransformerLayer. (#444) * [JAX] Enhance Dropout in TransformerLayer. 1. Fixed missing setup of dropout RNG key in TransformerLayer and LayerNormMLP. 2. Allowing a separate dropout rate for FC1's output and other hiddens. 
Signed-off-by: Ming Huang * Fix wrong fp8 scale in _update_fp8_metas_impl Signed-off-by: Ming Huang * Fix typo Signed-off-by: Ming Huang --------- Signed-off-by: Ming Huang Co-authored-by: Kirthi Shankar Sivamani --- tests/jax/test_helper.py | 9 ++++---- tests/jax/test_layer.py | 8 +++++++ tests/jax/test_praxis_layers.py | 3 +++ transformer_engine/jax/flax/module.py | 6 +++++- transformer_engine/jax/flax/transformer.py | 22 +++++++++++++++----- transformer_engine/jax/fp8.py | 6 +++--- transformer_engine/jax/praxis/transformer.py | 4 ++++ 7 files changed, 44 insertions(+), 14 deletions(-) diff --git a/tests/jax/test_helper.py b/tests/jax/test_helper.py index 91ca06a90e..815aab6099 100644 --- a/tests/jax/test_helper.py +++ b/tests/jax/test_helper.py @@ -72,11 +72,10 @@ def get_fp8_scale(fp8_max, amax, scale): amax = np.array(amax) scale = np.array(scale) - exp = np.floor(np.log2(fp8_max / amax)) - FP8Helper.MARGIN - sf = np.round(np.power(2, np.abs(exp))) - sf = np.where(amax > 0.0, sf, scale) - sf = np.where(np.isfinite(amax), sf, scale) - return np.where(exp < 0, 1 / sf, sf) + sf = (fp8_max / amax) / (2**FP8Helper.MARGIN) + sf = jnp.where(amax > 0.0, sf, scale) + sf = jnp.where(jnp.isfinite(amax), sf, scale) + return sf amax_meta_shape = (num_of_meta, FP8Helper.AMAX_HISTORY_LEN) scale_meta_shape = (num_of_meta, 1) diff --git a/tests/jax/test_layer.py b/tests/jax/test_layer.py index a635c687b7..4f9e224663 100644 --- a/tests/jax/test_layer.py +++ b/tests/jax/test_layer.py @@ -167,6 +167,7 @@ def forward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e-08): if k == 'dropout_rate': te_layer_attrs['attention_dropout'] = v te_layer_attrs['hidden_dropout'] = v + te_layer_attrs['intermediate_dropout'] = v elif k == 'fuse_mlp_wi': continue else: @@ -174,6 +175,7 @@ def forward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e-08): ref_layer_cls = partial(RefEncoderLayer, dtype=dtype, **attrs) layer_cls = partial(TransformerLayer, 
hidden_dropout_dims=(sequence_dim,), + intermediate_dropout_dims=(sequence_dim,), layer_type=TransformerLayerType.ENCODER, self_attn_mask_type='padding', dtype=dtype, @@ -212,6 +214,7 @@ def forward_backward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e- if k == 'dropout_rate': te_layer_attrs['attention_dropout'] = v te_layer_attrs['hidden_dropout'] = v + te_layer_attrs['intermediate_dropout'] = v elif k == 'fuse_mlp_wi': continue else: @@ -219,6 +222,7 @@ def forward_backward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e- ref_layer_cls = partial(RefEncoderLayer, dtype=dtype, **attrs) layer_cls = partial(TransformerLayer, hidden_dropout_dims=(sequence_dim,), + intermediate_dropout_dims=(sequence_dim,), layer_type=TransformerLayerType.ENCODER, self_attn_mask_type='padding', dtype=dtype, @@ -381,6 +385,7 @@ def forward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e-08): if k == 'dropout_rate': te_layer_attrs['attention_dropout'] = v te_layer_attrs['hidden_dropout'] = v + te_layer_attrs['intermediate_dropout'] = v elif k == 'fuse_mlp_wi': continue else: @@ -388,6 +393,7 @@ def forward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e-08): ref_layer_cls = partial(RefDecoderLayer, dtype=dtype, **attrs) layer_cls = partial(TransformerLayer, hidden_dropout_dims=(sequence_dim,), + intermediate_dropout_dims=(sequence_dim,), layer_type=TransformerLayerType.DECODER, dtype=dtype, **te_layer_attrs) @@ -426,6 +432,7 @@ def forward_backward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e- if k == 'dropout_rate': te_layer_attrs['attention_dropout'] = v te_layer_attrs['hidden_dropout'] = v + te_layer_attrs['intermediate_dropout'] = v elif k == 'fuse_mlp_wi': continue else: @@ -433,6 +440,7 @@ def forward_backward_runner(self, data_shape, dtype, attrs, rtol=1e-05, atol=1e- ref_layer_cls = partial(RefDecoderLayer, dtype=dtype, **attrs) layer_cls = partial(TransformerLayer, hidden_dropout_dims=(sequence_dim,), + 
intermediate_dropout_dims=(sequence_dim,), layer_type=TransformerLayerType.DECODER, dtype=dtype, **te_layer_attrs) diff --git a/tests/jax/test_praxis_layers.py b/tests/jax/test_praxis_layers.py index 12ad919077..5a1bf41fb2 100644 --- a/tests/jax/test_praxis_layers.py +++ b/tests/jax/test_praxis_layers.py @@ -957,6 +957,7 @@ def generate_praxis_p_and_flax_cls(self, dtype, attrs): layernorm_type = attrs[TransformerLayerAttr.LN_TYPE] hidden_dropout = 0.0 attention_dropout = 0.0 + intermediate_dropout = 0.0 mlp_activations = attrs[TransformerLayerAttr.ACTIVATION] kernel_init = WeightInit.Gaussian(1.0) use_bias = attrs[TransformerLayerAttr.USE_BIAS] @@ -991,6 +992,7 @@ def generate_praxis_p_and_flax_cls(self, dtype, attrs): layernorm_type=layernorm_type, hidden_dropout=hidden_dropout, attention_dropout=attention_dropout, + intermediate_dropout=intermediate_dropout, mlp_activations=mlp_activations, use_bias=use_bias, bias_init=bias_init, @@ -1007,6 +1009,7 @@ def generate_praxis_p_and_flax_cls(self, dtype, attrs): layernorm_type=layernorm_type, hidden_dropout=hidden_dropout, attention_dropout=attention_dropout, + intermediate_dropout=intermediate_dropout, mlp_activations=mlp_activations, mha_kernel_init=TransformerEngineBaseLayer.generate_params_init( "mha_kernel", kernel_init), diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index d95bece5ad..89da212367 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -739,6 +739,8 @@ class LayerNormMLP(TransformerEngineBase): activations: Sequence[Union[str, Callable]], default = ('relu',) The sequence of activation functions to apply after the first linear transformation. Each activation has its own transformation layer. + intermediate_dropout_rng_name: str, default = 'dropout' + The key in given RNGs via flax.linen.Module.apply that for generating Dropout masks. 
intermediate_dropout_rate: float, default = 0.1 Dropout probability for the dropout op after the :attr:`activations`. intermediate_hidden_dropout_dims: Sequence[int], default = () @@ -779,6 +781,7 @@ class LayerNormMLP(TransformerEngineBase): bias_axes_2: Tuple[str, ...] = ('embed',) return_layernorm_output: bool = True activations: Sequence[Union[str, Callable]] = ('relu',) + intermediate_dropout_rng_name: str = 'dropout' intermediate_dropout_rate: float = 0.1 intermediate_hidden_dropout_dims: Sequence[int] = () axis: Union[Iterable[int], int] = -1 @@ -985,7 +988,8 @@ def fp8_meta_generator(): z = jnp.reshape(z, (*z.shape[:-2], -1)) z = nn.Dropout(rate=self.intermediate_dropout_rate, - broadcast_dims=self.intermediate_hidden_dropout_dims)( + broadcast_dims=self.intermediate_hidden_dropout_dims, + rng_collection=self.intermediate_dropout_rng_name)( z, deterministic=deterministic) # DenseGeneral 2 diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index 2a3d5979fd..451d7731b1 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -883,6 +883,10 @@ class TransformerLayer(nn.Module): Dimensions that will share the same dropout mask for hidden attention_dropout: float, default = 0.1 Dropout probability for the dropout op during multi-head attention. + intermediate_dropout: float, default = 0.1 + Dropout probability for the dropout op after FC1 layer. + intermediate_dropout_dims: Sequence[int], default = () + Dimensions that will share the same dropout mask for hidden after FC1 layer. dropout_rng_name: str, default = 'dropout' The key in given RNGs via flax.linen.Module.apply that for generating Dropout masks in the Multi-Head Attention. 
@@ -963,6 +967,8 @@ class TransformerLayer(nn.Module): hidden_dropout: float = 0.1 hidden_dropout_dims: Sequence[int] = () attention_dropout: float = 0.1 + intermediate_dropout: float = 0.1 + intermediate_dropout_dims: Sequence[int] = () dropout_rng_name: str = 'dropout' mha_kernel_init: Initializer = None mlp_kernel_init: Initializer = None @@ -1078,6 +1084,8 @@ def __call__(self, else: mha_name = 'self_attention' + inputs = _with_sharding_constraint(inputs, (BATCH_AXES, SEQLEN_AXES, HIDDEN_AXES)) + # [batch, length, emb_dim] -> [batch, length, emb_dim] x, residual = MultiHeadAttention( num_heads=self.num_attention_heads, @@ -1113,14 +1121,15 @@ def hidden_dropout(x, deterministic): assert -x_shape_len <= dims < x_shape_len return nn.Dropout(rate=self.hidden_dropout, - broadcast_dims=self.hidden_dropout_dims)(x, - deterministic=deterministic) + broadcast_dims=self.hidden_dropout_dims, + rng_collection=self.dropout_rng_name)(x, deterministic=deterministic) x = hidden_dropout(x, deterministic) if self.drop_path > 0.0: drop_path_shape = _generate_drop_path_shape(x.shape, batch_dim) x = nn.Dropout(rate=self.drop_path, - broadcast_dims=drop_path_shape)(x, deterministic=deterministic) + broadcast_dims=drop_path_shape, + rng_collection=self.dropout_rng_name)(x, deterministic=deterministic) x = x + residual mlp_input = x @@ -1156,6 +1165,8 @@ def hidden_dropout(x, deterministic): y = hidden_dropout(y, deterministic) mlp_input = y + residual + mlp_input = _with_sharding_constraint(mlp_input, (BATCH_AXES, SEQLEN_AXES, HIDDEN_AXES)) + # MlpBlock residual = mlp_input z, ln_out = LayerNormMLP( @@ -1167,8 +1178,9 @@ def hidden_dropout(x, deterministic): return_layernorm_output=self.apply_residual_connection_post_layernorm, intermediate_dim=self.mlp_hidden_size, activations=self.mlp_activations, - intermediate_dropout_rate=self.hidden_dropout, - intermediate_hidden_dropout_dims=self.hidden_dropout_dims, + intermediate_dropout_rng_name=self.dropout_rng_name, + 
intermediate_dropout_rate=self.intermediate_dropout, + intermediate_hidden_dropout_dims=self.intermediate_dropout_dims, dtype=self.dtype, scale_axes=(W_NO_SHARD_AXES,), ln_bias_axes=(W_NO_SHARD_AXES,), diff --git a/transformer_engine/jax/fp8.py b/transformer_engine/jax/fp8.py index 83aad88c07..c64bcbd6d0 100644 --- a/transformer_engine/jax/fp8.py +++ b/transformer_engine/jax/fp8.py @@ -310,11 +310,11 @@ def _update_fp8_metas_impl(fp8_metas: Collection) -> Collection: amax = fp8_meta_arrays[fp8_amax_idx][..., 0:1] scale = fp8_meta_arrays[fp8_scale_idx] - sf = (fp8_max / amax) / (2 ** FP8Helper.MARGIN) + sf = (fp8_max / amax) / (2**FP8Helper.MARGIN) sf = jnp.where(amax > 0.0, sf, scale) sf = jnp.where(jnp.isfinite(amax), sf, scale) - fp8_meta_arrays[fp8_scale_idx] = scale - fp8_meta_arrays[fp8_scale_inv_idx] = 1 / scale + fp8_meta_arrays[fp8_scale_idx] = sf + fp8_meta_arrays[fp8_scale_inv_idx] = 1 / sf return jax.tree_util.tree_unflatten(treedef, fp8_meta_arrays) diff --git a/transformer_engine/jax/praxis/transformer.py b/transformer_engine/jax/praxis/transformer.py index 9bf9628490..b16c4e731e 100644 --- a/transformer_engine/jax/praxis/transformer.py +++ b/transformer_engine/jax/praxis/transformer.py @@ -137,6 +137,8 @@ class TransformerLayer(TransformerEngineBaseLayer): hidden_dropout: float = 0.1 hidden_dropout_dims: Sequence[int] = () attention_dropout: float = 0.1 + intermediate_dropout: float = 0.1 + intermediate_dropout_dims: Sequence[int] = () dropout_rng_name: str = 'dropout' mlp_activations: Sequence[str] = ('relu',) use_bias: bool = False @@ -190,6 +192,8 @@ def setup(self) -> None: hidden_dropout=self.hidden_dropout, hidden_dropout_dims=self.hidden_dropout_dims, attention_dropout=self.attention_dropout, + intermediate_dropout=self.intermediate_dropout, + intermediate_dropout_dims=self.intermediate_dropout_dims, dropout_rng_name=self.dropout_rng_name, mha_kernel_init=TransformerEngineBaseLayer.generate_params_init( "mha_kernel", self.params_init), From 
61a6a188914bf56cd3aa05cc77d1e88412c9bb0c Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 19 Oct 2023 14:44:31 -0700 Subject: [PATCH 062/427] [PyTorch] rm unused docs (#484) RM unused docs Signed-off-by: Kirthi Shankar Sivamani --- docs/api/pytorch.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst index e31f44fef5..aea66b257f 100644 --- a/docs/api/pytorch.rst +++ b/docs/api/pytorch.rst @@ -29,7 +29,6 @@ pyTorch :members: forward, set_context_parallel_group, set_tensor_parallel_group .. autoapiclass:: transformer_engine.pytorch.InferenceParams(max_batch_size, max_sequence_length) - :members: swap_key_value_dict .. autoapiclass:: transformer_engine.pytorch.CudaRNGStatesTracker() :members: reset, get_states, set_states, add, fork From 719f422f802086d995446431388849b2749c4d94 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Fri, 20 Oct 2023 01:14:51 -0700 Subject: [PATCH 063/427] Fix incorrect dtype in LayerNormLinear (#483) Signed-off-by: Tim Moon Co-authored-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/module/layernorm_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index a910946218..a8e83631bc 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -112,7 +112,7 @@ def forward( ub_obj_lnout = get_ub("qkv_fprop") ln_out = ub_obj_lnout.get_ubuf_output(0) else: - ln_out_dtype = torch.uint8 if fp8 else inputmat.dtype + ln_out_dtype = torch.uint8 if (fp8 and not return_layernorm_output) else inputmat.dtype ln_out = torch.empty_like(inputmat, dtype=ln_out_dtype) if ub_atomic_gemm_ag: assert fp8, "AtomicGemm overlap supported only for FP8 GEMM." 
From 1214da0e47662a1d1aa9fad1b622ca59a707a651 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Fri, 20 Oct 2023 13:11:04 -0500 Subject: [PATCH 064/427] Incorrect use of extend_fsdp_sharding_meta() in cross_fused_attn() (#482) fixed incorrect use of extend_fsdp_sharding_meta() in cross_fused_attn() Signed-off-by: Alp Dener --- transformer_engine/jax/fused_attn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/jax/fused_attn.py b/transformer_engine/jax/fused_attn.py index aaca58b2d5..3951d87274 100644 --- a/transformer_engine/jax/fused_attn.py +++ b/transformer_engine/jax/fused_attn.py @@ -206,7 +206,7 @@ def cross_fused_attn(q: jnp.ndarray, tp_dims=([2, 3, None, None], [2]), dp_axis_name=dp_axis_name, tp_axis_name=tp_axis_name) - sharding_meta = extend_fsdp_sharding_meta(sharding_meta, {0: 0, 2: 0}) + sharding_meta, _ = extend_fsdp_sharding_meta(sharding_meta, {0: 0, 2: 0}) inputs_ = tuple( jnp.reshape(x, new_shape) if x is not None else None From ebfeaad52204ce687f908e4fdbcf8caff704f1b8 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Fri, 20 Oct 2023 07:37:22 +0200 Subject: [PATCH 065/427] Better way of checking cuDNN version (#485) * Ability to check cuDNN version from Python Signed-off-by: Przemek Tredak * Modify the fused attention test to not use the CUDNN_VERSION env variable which is specific to NGC containers Signed-off-by: Przemek Tredak --------- Signed-off-by: Przemek Tredak --- tests/pytorch/test_fused_attn.py | 10 +++++++++- transformer_engine/pytorch/csrc/common.h | 1 + transformer_engine/pytorch/csrc/extensions.h | 2 ++ transformer_engine/pytorch/csrc/extensions/misc.cu | 4 ++++ transformer_engine/pytorch/csrc/extensions/pybind.cpp | 1 + 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index a3a2656d0b..ac868b83d9 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -44,7 +44,15 @@ fp8_available, 
reason_for_no_fp8 = fp8.FP8GlobalStateManager.is_fp8_available() _flash_attn_version = packaging.version.Version(version("flash-attn")) _flash_attn_2_available = _flash_attn_version >= packaging.version.Version("2") -_cudnn_version = [int(i) for i in os.environ['CUDNN_VERSION'].split('.')] + +def _get_cudnn_version(): + cudnn_version_encoded = ext.get_cudnn_version() + cudnn_major = cudnn_version_encoded // 1000 + cudnn_minor = (cudnn_version_encoded - cudnn_major * 1000) // 100 + cudnn_patch = cudnn_version_encoded - 1000 * cudnn_major - 100 * cudnn_minor + return [cudnn_major, cudnn_minor, cudnn_patch] + +_cudnn_version = _get_cudnn_version() class ModelConfig: diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index 7c17f1f34c..d40f3db45b 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 4eaca7c896..d1789cedb2 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -524,6 +524,8 @@ at::Tensor scaled_upper_triang_masked_softmax_backward(at::Tensor output_grads_, size_t get_cublasLt_version(); +size_t get_cudnn_version(); + bool userbuf_comm_available(); void placeholder(); diff --git a/transformer_engine/pytorch/csrc/extensions/misc.cu b/transformer_engine/pytorch/csrc/extensions/misc.cu index e6275d1159..48aa98bbf1 100644 --- a/transformer_engine/pytorch/csrc/extensions/misc.cu +++ b/transformer_engine/pytorch/csrc/extensions/misc.cu @@ -13,6 +13,10 @@ size_t get_cublasLt_version() { return cublasLtGetVersion(); } +size_t get_cudnn_version() { + return cudnnGetVersion(); +} + bool userbuf_comm_available() { // TODO(ksivamani) check on python side #ifdef NVTE_WITH_USERBUFFERS diff --git 
a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 7e80299d15..fd117782ab 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -77,6 +77,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Misc m.def("get_cublasLt_version", &get_cublasLt_version, "Get cublasLt version"); + m.def("get_cudnn_version", &get_cudnn_version, "Get cuDNN version"); m.def("userbuf_comm_available", &userbuf_comm_available, "If userbuf backend is available"); // Data structures From 7eca973ae8dcf6b62d755db18096a41f47b40337 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Mon, 23 Oct 2023 14:23:05 -0700 Subject: [PATCH 066/427] [PyTorch] Fixes and tests for FP8 + activation recompute (#487) * initial test fix Signed-off-by: Kirthi Shankar Sivamani * Drop eval for selective checkpointing tests Signed-off-by: Kirthi Shankar Sivamani * Remove redundant recompute for FA Signed-off-by: Kirthi Shankar Sivamani * CI fix; Decouple fused attention and numerics tests Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- tests/pytorch/test_fused_attn.py | 36 ++++- tests/pytorch/test_numerics.py | 152 ++++++++++++---------- transformer_engine/pytorch/attention.py | 13 -- transformer_engine/pytorch/fp8.py | 23 ++++ transformer_engine/pytorch/module/base.py | 20 +-- 5 files changed, 154 insertions(+), 90 deletions(-) diff --git a/tests/pytorch/test_fused_attn.py b/tests/pytorch/test_fused_attn.py index ac868b83d9..fd37bd371c 100644 --- a/tests/pytorch/test_fused_attn.py +++ b/tests/pytorch/test_fused_attn.py @@ -25,8 +25,6 @@ QKVLayout, fused_attn_bwd, fused_attn_fwd, - fused_attn_bwd_qkvpacked, - fused_attn_fwd_qkvpacked, ) import transformer_engine.pytorch.fp8 as fp8 from transformer_engine.pytorch.module.base import ( @@ -38,13 +36,24 @@ init_method_normal, scaled_init_method_normal, ) +from 
transformer_engine.pytorch.distributed import _set_cuda_rng_state, CudaRNGStatesTracker import transformer_engine_extensions as tex -from test_numerics import get_dummy_cuda_rng_tracker, reset_rng_states + +# Only run FP8 tests on H100. fp8_available, reason_for_no_fp8 = fp8.FP8GlobalStateManager.is_fp8_available() _flash_attn_version = packaging.version.Version(version("flash-attn")) _flash_attn_2_available = _flash_attn_version >= packaging.version.Version("2") + +seed = 1234 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +# Record initial RNG state from script run. +_cpu_rng_state = torch.get_rng_state() +_cuda_rng_state = torch.cuda.get_rng_state() + + def _get_cudnn_version(): cudnn_version_encoded = ext.get_cudnn_version() cudnn_major = cudnn_version_encoded // 1000 @@ -52,6 +61,13 @@ def _get_cudnn_version(): cudnn_patch = cudnn_version_encoded - 1000 * cudnn_major - 100 * cudnn_minor return [cudnn_major, cudnn_minor, cudnn_patch] + +def reset_rng_states() -> None: + """revert back to initial RNG state.""" + torch.set_rng_state(_cpu_rng_state) + _set_cuda_rng_state(_cuda_rng_state) + + _cudnn_version = _get_cudnn_version() @@ -210,6 +226,13 @@ def _run_dot_product_attention(dtype, bs, config, backend, ckpt_attn, bias_type) else: bias = None + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + + def get_dummy_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER + block = ( DotProductAttention( config.num_attention_heads, @@ -733,6 +756,13 @@ def _run_dpa_fp8_ref(dtype, bs, config, backend): cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) op_grad = torch.load('op_grad.pt').cuda().view(bs, config.seq_len, -1).transpose(0,1) + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + + def get_dummy_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER + 
block = ( DotProductAttention( config.num_attention_heads, diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 21ee0968d9..02fb63e71f 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -12,6 +12,7 @@ import torch.nn as nn from torch.nn import Parameter +from transformer_engine.pytorch.fp8 import fp8_autocast, FP8GlobalStateManager from transformer_engine.pytorch.utils import ( init_method_normal, scaled_init_method_normal, @@ -25,6 +26,10 @@ from transformer_engine.pytorch.distributed import _set_cuda_rng_state, CudaRNGStatesTracker +# Only run FP8 tests on H100. +fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() + + seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -90,20 +95,11 @@ def assert_allclose(l1: List[torch.Tensor], l2: List[torch.Tensor], atol: float) def reset_rng_states() -> None: - # revert back to initial RNG state. + """revert back to initial RNG state.""" torch.set_rng_state(_cpu_rng_state) _set_cuda_rng_state(_cuda_rng_state) -_DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() -_DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) - - -def get_dummy_cuda_rng_tracker(): - """Get cuda rng tracker.""" - return _DUMMY_CUDA_RNG_STATE_TRACKER - - class TorchScaledMaskedSoftmax(nn.Module): def __init__(self) -> None: super().__init__() @@ -343,41 +339,21 @@ def forward( return x -def _test_e2e_selective_recompute(block, bs, dtype, config, recompute=False): +def _test_e2e_selective_recompute(bs, dtype, config, fp8, recompute=False): reset_rng_states() - - te_inp_hidden_states = torch.randn( - config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True - ).cuda() - te_inp_hidden_states.retain_grad() - te_inp_attn_mask = get_causal_attn_mask(config.seq_len) - - te_out = block( - te_inp_hidden_states, - attention_mask=te_inp_attn_mask, - checkpoint_core_attention=recompute, - ) - loss = te_out.sum() - loss.backward() - 
torch.cuda.synchronize() - - outputs = [te_out, te_inp_hidden_states.grad] - for p in block.parameters(): - if p.requires_grad: - outputs.append(p.grad) - return outputs - - -@pytest.mark.parametrize("dtype", param_types) -@pytest.mark.parametrize("bs", batch_sizes) -@pytest.mark.parametrize("model", model_configs.keys()) -def test_gpt_selective_activation_recompute(dtype, bs, model): - config = model_configs[model] + FP8GlobalStateManager.reset() sigma = 0.023 init_method = init_method_normal(sigma) output_layer_init_method = scaled_init_method_normal(sigma, config.num_layers) + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + + def get_dummy_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER + block = ( TransformerLayer( config.hidden_size, @@ -395,38 +371,19 @@ def test_gpt_selective_activation_recompute(dtype, bs, model): params_dtype=dtype, ) .cuda() - .eval() ) - outputs = _test_e2e_selective_recompute(block, bs, dtype, config, recompute=False) - outputs_recompute = _test_e2e_selective_recompute(block, bs, dtype, config, recompute=True) - assert_all_equal(outputs, outputs_recompute) - - -def _test_e2e_full_recompute(block, bs, dtype, config, recompute=False): - reset_rng_states() - te_inp_hidden_states = torch.randn( config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True ).cuda() te_inp_hidden_states.retain_grad() te_inp_attn_mask = get_causal_attn_mask(config.seq_len) - if recompute: - te_out = te_checkpoint( - block, - False, # distribute_saved_activations - get_dummy_cuda_rng_tracker, - None, # tp_group - te_inp_hidden_states, - attention_mask=te_inp_attn_mask, - checkpoint_core_attention=False, - ) - else: + with fp8_autocast(enabled=fp8): te_out = block( te_inp_hidden_states, attention_mask=te_inp_attn_mask, - checkpoint_core_attention=False, + checkpoint_core_attention=recompute, ) loss = te_out.sum() loss.backward() @@ -442,13 
+399,33 @@ def _test_e2e_full_recompute(block, bs, dtype, config, recompute=False): @pytest.mark.parametrize("dtype", param_types) @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) -def test_gpt_full_activation_recompute(dtype, bs, model): +@pytest.mark.parametrize("fp8", all_boolean) +def test_gpt_selective_activation_recompute(dtype, bs, model, fp8): + if fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) + config = model_configs[model] + outputs = _test_e2e_selective_recompute(bs, dtype, config, fp8, recompute=False) + outputs_recompute = _test_e2e_selective_recompute(bs, dtype, config, fp8, recompute=True) + assert_all_equal(outputs, outputs_recompute) + + +def _test_e2e_full_recompute(bs, dtype, config, fp8, recompute=False): + reset_rng_states() + FP8GlobalStateManager.reset() + sigma = 0.023 init_method = init_method_normal(sigma) output_layer_init_method = scaled_init_method_normal(sigma, config.num_layers) + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + + def get_dummy_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER + block = ( TransformerLayer( config.hidden_size, @@ -466,11 +443,54 @@ def test_gpt_full_activation_recompute(dtype, bs, model): params_dtype=dtype, ) .cuda() - .eval() ) - outputs = _test_e2e_full_recompute(block, bs, dtype, config, recompute=False) - outputs_recompute = _test_e2e_full_recompute(block, bs, dtype, config, recompute=True) + te_inp_hidden_states = torch.randn( + config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True + ).cuda() + te_inp_hidden_states.retain_grad() + te_inp_attn_mask = get_causal_attn_mask(config.seq_len) + + with fp8_autocast(enabled=fp8): + if recompute: + te_out = te_checkpoint( + block, + False, # distribute_saved_activations + get_dummy_cuda_rng_tracker, + None, # tp_group + te_inp_hidden_states, + 
attention_mask=te_inp_attn_mask, + checkpoint_core_attention=False, + ) + else: + te_out = block( + te_inp_hidden_states, + attention_mask=te_inp_attn_mask, + checkpoint_core_attention=False, + ) + loss = te_out.sum() + loss.backward() + torch.cuda.synchronize() + + outputs = [te_out, te_inp_hidden_states.grad] + for p in block.parameters(): + if p.requires_grad: + outputs.append(p.grad) + return outputs + + +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +@pytest.mark.parametrize("fp8", all_boolean) +def test_gpt_full_activation_recompute(dtype, bs, model, fp8): + if fp8 and not fp8_available: + pytest.skip(reason_for_no_fp8) + + config = model_configs[model] + + outputs = _test_e2e_full_recompute(bs, dtype, config, fp8, recompute=False) + outputs_recompute = _test_e2e_full_recompute(bs, dtype, config, fp8, recompute=True) assert_all_equal(outputs, outputs_recompute) @@ -565,8 +585,8 @@ def _test_e2e_checkpointing(bs, dtype, config, checkpoint=False, steps=10, path= def test_gpt_checkpointing(dtype, bs, model): config = model_configs[model] outputs = _test_e2e_checkpointing(bs, dtype, config, checkpoint=False) - outputs_recompute = _test_e2e_checkpointing(bs, dtype, config, checkpoint=True) - assert_all_equal(outputs, outputs_recompute) + outputs_checkpoint = _test_e2e_checkpointing(bs, dtype, config, checkpoint=True) + assert_all_equal(outputs, outputs_checkpoint) def _test_e2e_gpt_accuracy(block, bs, dtype, config): diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 0d2dbe0bc8..6f1aafe3f0 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -2164,19 +2164,6 @@ def forward( ) if use_flash_attention: - if checkpoint_core_attention: - return self._checkpointed_attention_forward(self.flash_attention, - query_layer, - key_layer, - value_layer, - 
attention_mask=attention_mask, - qkv_layout=qkv_layout, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_kv=cu_seqlens_kv, - attn_mask_type=attn_mask_type, - cp_group=self.cp_group, - cp_global_ranks=self.cp_global_ranks, - cp_stream=self.cp_stream) return self.flash_attention(query_layer, key_layer, value_layer, diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index 24c97be6e9..c89ff10968 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -75,6 +75,29 @@ class FP8GlobalStateManager: dp_amax_reduce_forward_idx = 0 dp_amax_reduce_backward_idx = 0 + @classmethod + def reset(cls) -> None: + """Reset the global state""" + cls.FP8_ENABLED = False + cls.FP8_CALIBRATION = False + cls.FP8_RECIPE = None + cls.FP8_DISTRIBUTED_GROUP = None + cls.IS_FIRST_FP8_MODULE = False + cls.FP8_AUTOCAST_COUNTER = 0 + cls.FP8_CURRENT_CONTEXT_ID = 0 + cls.FP8_AUTOCAST_DEPTH = 0 + cls.global_fp8_buffer = {} + cls.fp8_tensors_recompute_buffer = [] + cls.amax_forward_global_reduce_func = None + cls.buffer_delete_key_fwd = None + cls.buffer_delete_key_bwd = None + cls.amax_reduce_handle_fwd = None + cls.fp8_available = None + cls.reason_for_no_fp8 = "" + cls.dp_amax_reduce_interval = None + cls.dp_amax_reduce_forward_idx = 0 + cls.dp_amax_reduce_backward_idx = 0 + @classmethod def is_fp8_available(cls) -> Tuple[bool, str]: """Return if fp8 support is available""" diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 73b0bcdb76..5803cfa2f9 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -28,6 +28,7 @@ gather_along_first_dim, is_fp8_activation_recompute_enabled, in_fp8_activation_recompute_phase, + get_distributed_world_size, ) from ..cpp_extensions import ( fp8_cast_transpose_fused, @@ -77,9 +78,7 @@ def _prepare_backward( _amax_reduce_handle_bwd = None # Update amax and scale; Skip all setup for global amax reduction - if 
not fp8_meta["recipe"].reduce_amax: - amax_and_scale_update(fp8_meta, False) - else: + if fp8_meta["recipe"].reduce_amax and get_distributed_world_size(fp8_meta["fp8_group"]) > 1: # From previous iteration FP8GlobalStateManager.copy_amax_from_global_buffer(fp8_meta, forward=False) amax_and_scale_update(fp8_meta, False) @@ -89,11 +88,14 @@ def _prepare_backward( fp8_meta["autocast_id_bwd"] = fp8_meta["autocast_id_fwd_stack"].pop(0) FP8GlobalStateManager.add_amax_to_global_buffer(fp8_meta, forward=False) + else: + amax_and_scale_update(fp8_meta, False) with torch.cuda.nvtx.range(name + " backward"): yield - if fp8 and fp8_meta["recipe"].reduce_amax: + if (fp8 and fp8_meta["recipe"].reduce_amax + and get_distributed_world_size(fp8_meta["fp8_group"]) > 1): if fp8_meta["first_module"]: _amax_reduce_handle_bwd = FP8GlobalStateManager.global_amax_reduction( fp8_meta, @@ -549,7 +551,8 @@ def prepare_forward( # Previous iteration was grad_enabled if self.fp8_meta.get("update_amax_and_scale_fwd", False): - if self.fp8_meta["recipe"].reduce_amax: + if (self.fp8_meta["recipe"].reduce_amax + and get_distributed_world_size(self.fp8_meta["fp8_group"]) > 1): FP8GlobalStateManager.copy_amax_from_global_buffer(self.fp8_meta, forward=True) amax_and_scale_update( self.fp8_meta, True, update_weight_scale_inv=update_weight_scale_inv @@ -562,7 +565,8 @@ def prepare_forward( if self.fp8 and self.training: # Setup for amax reduction - if self.fp8_meta["recipe"].reduce_amax: + if (self.fp8_meta["recipe"].reduce_amax + and get_distributed_world_size(self.fp8_meta["fp8_group"]) > 1): self.fp8_meta["first_module"] = FP8GlobalStateManager.is_first_fp8_module() if self.fp8_meta["first_module"]: # Wait for the prior AMAX reduction to finish @@ -588,7 +592,6 @@ def prepare_forward( self.fp8 and self.training and is_fp8_activation_recompute_enabled() - and not in_fp8_activation_recompute_phase() ): FP8GlobalStateManager.copy_forward_fp8_meta_tensors_for_recompute(self.fp8_meta) @@ -599,7 +602,8 @@ 
def prepare_forward( FP8GlobalStateManager.restore_fp8_meta_tensors(self.fp8_meta) return - if self.fp8 and self.training and self.fp8_meta["recipe"].reduce_amax: + if (self.fp8 and self.training and self.fp8_meta["recipe"].reduce_amax + and get_distributed_world_size(self.fp8_meta["fp8_group"]) > 1): FP8GlobalStateManager.set_fp8_context_id(self.fp8_meta["autocast_id_fwd"]) reduce_func = partial( FP8GlobalStateManager.global_amax_reduction, From d58c08c72d289cb80f9c4fb729a2bda80b78b6ca Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Tue, 31 Oct 2023 11:08:34 -0700 Subject: [PATCH 067/427] [PyTorch] Experimental FP8 tensor class (#452) * Experimental FP8 tensor Co-authored-by: Tim Moon Co-authored-by: Sudhakar Singh Co-authored-by: Przemyslaw Tredak Signed-off-by: Kirthi Shankar Sivamani * Add fp8 tensor to ci test Signed-off-by: Kirthi Shankar Sivamani * review comments and tests Signed-off-by: Kirthi Shankar Sivamani * Minor changes Signed-off-by: Kirthi Shankar Sivamani * Default to FP8 usage Signed-off-by: Kirthi Shankar Sivamani * Fix docs Signed-off-by: Kirthi Shankar Sivamani * Naming changes Signed-off-by: Kirthi Shankar Sivamani * minor fix Signed-off-by: Kirthi Shankar Sivamani * Fix transpose caching Signed-off-by: Kirthi Shankar Sivamani * Debug transpose caching Handle case where transpose cache is updated externally. 
Signed-off-by: Tim Moon * Rename FP8GlobalStateManager.with_fp8_parameters Signed-off-by: Tim Moon * remove Float8Tensor from import API Signed-off-by: Kirthi Shankar Sivamani * Avoid caching FP8 transposes if not required Signed-off-by: Tim Moon * Fix import error in FP8 tensor tests Signed-off-by: Tim Moon * Fix transpose caching and checkpointing bug Signed-off-by: Kirthi Shankar Sivamani * Improve caching and fix distopt case Signed-off-by: Kirthi Shankar Sivamani * Update transformer_engine/pytorch/float8_tensor.py Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * Remove recursive logic Signed-off-by: Kirthi Shankar Sivamani * Fix cache reset bug Signed-off-by: Kirthi Shankar Sivamani * Store FP8 attributes in dict Easier for multiple tensors to share, e.g. detached tensors. Signed-off-by: Tim Moon * Make sure scale_inv is 1D tensor Signed-off-by: Tim Moon * Make sure scale_inv is 1D tensor Signed-off-by: Tim Moon * Fixes and detach recipe Signed-off-by: Kirthi Shankar Sivamani * Set default fp8 data type Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani Signed-off-by: Tim Moon Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Kirthi Shankar Sivamani Co-authored-by: Sudhakar Singh Co-authored-by: Przemyslaw Tredak --- docs/api/pytorch.rst | 2 + qa/L0_pytorch_unittest/test.sh | 1 + tests/pytorch/test_float8tensor.py | 318 ++++++++ tests/pytorch/test_numerics.py | 133 +++- tests/pytorch/test_onnx_export.py | 2 +- tests/pytorch/test_torch_save_load.py | 4 +- transformer_engine/pytorch/__init__.py | 1 + transformer_engine/pytorch/distributed.py | 10 +- transformer_engine/pytorch/float8_tensor.py | 689 ++++++++++++++++++ transformer_engine/pytorch/fp8.py | 63 +- transformer_engine/pytorch/module/base.py | 81 +- .../pytorch/module/layernorm_linear.py | 79 +- .../pytorch/module/layernorm_mlp.py | 119 ++- transformer_engine/pytorch/module/linear.py | 87 ++- 14 files changed, 
1448 insertions(+), 141 deletions(-) create mode 100644 tests/pytorch/test_float8tensor.py create mode 100644 transformer_engine/pytorch/float8_tensor.py diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst index aea66b257f..f179569251 100644 --- a/docs/api/pytorch.rst +++ b/docs/api/pytorch.rst @@ -35,6 +35,8 @@ pyTorch .. autoapifunction:: transformer_engine.pytorch.fp8_autocast +.. autoapifunction:: transformer_engine.pytorch.fp8_model_init + .. autoapifunction:: transformer_engine.pytorch.checkpoint .. autoapifunction:: transformer_engine.pytorch.onnx_export diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 268a534a82..54ba2a09c0 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -12,3 +12,4 @@ PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pyt pytest -v -s $TE_PATH/tests/pytorch/test_jit.py pytest -v -s $TE_PATH/tests/pytorch/test_fused_attn.py NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py +pytest -v -s $TE_PATH/tests/pytorch/test_float8tensor.py diff --git a/tests/pytorch/test_float8tensor.py b/tests/pytorch/test_float8tensor.py new file mode 100644 index 0000000000..dc48c886cf --- /dev/null +++ b/tests/pytorch/test_float8tensor.py @@ -0,0 +1,318 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +from collections.abc import Iterable +from typing import Any, Dict, List, Tuple, Union + +import pytest +import torch + +import transformer_engine.common.recipe +import transformer_engine.pytorch as te +from transformer_engine.pytorch.float8_tensor import Float8Tensor +from transformer_engine.pytorch.fp8 import FP8GlobalStateManager +import transformer_engine_extensions as tex + +# PyTorch tensor dtypes +_dtypes: List[torch.dtype] = [torch.float32, torch.float16, torch.bfloat16] +# TE FP8 dtypes +_fp8_dtypes: List[tex.DType] = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2] + +# Numerical tolerances with FP8 types +_tols: Dict[tex.DType, Dict[str, float]] = { + tex.DType.kFloat8E4M3: dict(rtol=0.125, atol=0.0675), # epsilon = 0.0625 + tex.DType.kFloat8E5M2: dict(rtol=0.25, atol=0.125), # epsilon = 0.125 +} + +def _to_list(x: Union[Iterable, Any]) -> List: + """Convert to list if iterable, otherwise put in singleton list""" + if isinstance(x, Iterable): + return list(x) + else: + return [x] + +# Types that can be interpreted as tensor dims +DimsType = Union[Iterable[int], int] + +# Check if FP8 is supported +fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() + +@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) +class TestFloat8Tensor: + + @staticmethod + def setup_class(cls) -> None: + # Configure RNG + seed = 1234 + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + def test_constructor( + self, + dims: DimsType = 1, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale_inv: float = 0.375, + dtype: torch.dtype = torch.float32, + ) -> None: + """Call constructor and perform sanity checks""" + dims = _to_list(dims) + tensor = Float8Tensor( + data=torch.zeros(dims, device="cuda", dtype=torch.uint8), + fp8_dtype=fp8_dtype, + fp8_scale_inv=torch.full([1], scale_inv), + dtype=dtype, + ) + assert list(tensor.size()) == dims, "Incorrect dims" + assert tensor.dtype == dtype, "Incorrect nominal dtype" + assert 
tensor.is_cuda, "Incorrect device" + + def _test_quantize_dequantize( + self, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale: float = 3.5, + dtype: torch.dtype = torch.float32, + dims: DimsType = 23, + ) -> None: + """Check numerical error when casting to FP8 and back""" + + # Initialize random data + x_ref = 2 * torch.rand(_to_list(dims), dtype=dtype, device="cpu") - 1 + + # Cast to FP8 and back + x_fp8 = Float8Tensor.to_float8( + x_ref, + fp8_dtype=fp8_dtype, + scale=torch.full([1], scale), + ) + x_fp8 = x_fp8.from_float8().cpu() + + # Check results + torch.testing.assert_close(x_fp8, x_ref, **_tols[fp8_dtype]) + + # Make sure we are not trivially passing the test + with pytest.raises(AssertionError): + torch.testing.assert_close(x_fp8, -x_ref, **_tols[fp8_dtype]) + + @pytest.mark.parametrize("fp8_dtype", _fp8_dtypes) + @pytest.mark.parametrize("dtype", _dtypes) + def test_quantize_dequantize_dtypes( + self, + fp8_dtype: tex.DType, + dtype: torch.dtype, + ) -> None: + self._test_quantize_dequantize(fp8_dtype=fp8_dtype, dtype=dtype) + + @pytest.mark.parametrize("scale", [0.375, 1, 3.5]) + def test_quantize_dequantize_scales(self, scale: float) -> None: + self._test_quantize_dequantize(scale=scale) + + @pytest.mark.parametrize("dims", [[], 1, 311, [7,11], [7,5,3], [2,3,5,3]]) + def test_quantize_dequantize_dims(self, dims: DimsType) -> None: + self._test_quantize_dequantize(dims=dims) + + def test_fp8_meta( + self, + dtype: torch.dtype = torch.float32, + dims: DimsType = 23, + ) -> None: + """Construct Float8Tensor using FP8 metadata and perform basic checks""" + + # Get FP8 metadata from linear module + fp8_dtype = tex.DType.kFloat8E4M3 + recipe = transformer_engine.common.recipe.DelayedScaling( + fp8_format=transformer_engine.common.recipe.Format.E4M3, + ) + with te.fp8_autocast(enabled=True, fp8_recipe=recipe): + module = te.Linear(32, 32) + _ = module(torch.zeros([8, 32], device="cuda")) + fp8_meta = module.fp8_meta + fp8_meta_index = 
tex.FP8FwdTensors.GEMM1_WEIGHT + fp8_meta_key = FP8GlobalStateManager.get_meta_tensor_key(forward=True) + + # Initialize random data + dims = _to_list(dims) + x_ref = 2 * torch.rand(dims, dtype=dtype, device="cpu") - 1 + + # Make Float8Tensor + x_fp8 = Float8Tensor.to_float8( + x_ref, + fp8_meta=fp8_meta, + fp8_meta_index=fp8_meta_index, + ) + x_ref = x_fp8.from_float8() + assert list(x_fp8.size()) == dims, "Incorrect dims" + assert x_fp8.dtype == dtype, "Incorrect nominal dtype" + assert x_fp8.is_cuda, "Incorrect device" + assert x_fp8._fp8_dtype == fp8_dtype, "Incorrect FP8 dtype" + + # Change FP8 metadata scale + fp8_meta[fp8_meta_key].scale[fp8_meta_index] = 2 + fp8_meta[fp8_meta_key].scale_inv.fill_(123) + + # Check results + torch.testing.assert_close(x_fp8, x_ref, **_tols[fp8_dtype]) + with pytest.raises(AssertionError): + # Make sure we are not trivially passing the test + torch.testing.assert_close(x_fp8, -x_ref, **_tols[fp8_dtype]) + + # Check if scaling factor is updated after in-place ops + x_fp8 += 0 + fp8_meta[fp8_meta_key].scale[fp8_meta_index] = 4 + fp8_meta[fp8_meta_key].scale_inv.fill_(321) + assert x_fp8._scale_inv.item() == 0.5, "Incorrect FP8 scale_inv" + torch.testing.assert_close(x_fp8, x_ref, **_tols[fp8_dtype]) + y = x_fp8.detach() + y += 0 + assert x_fp8._scale_inv.item() == 0.25, "Incorrect FP8 scale_inv" + torch.testing.assert_close(x_fp8, x_ref, **_tols[fp8_dtype]) + + def test_basic_ops( + self, + dims: DimsType = 23, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale: float = 3.5, + dtype: torch.dtype = torch.float32, + ) -> None: + """Test basic out-of-place ops""" + + # Initialize random data + dims = _to_list(dims) + x_ref = 2 * torch.rand(dims, dtype=dtype, device="cpu") - 1 + y_ref = 2 * torch.rand(dims, dtype=dtype, device="cpu") - 1 + x_fp8 = Float8Tensor.to_float8( + x_ref, + fp8_dtype=fp8_dtype, + scale=torch.full([1], scale), + ) + y_fp8 = Float8Tensor.to_float8( + y_ref, + fp8_dtype=fp8_dtype, + scale=torch.full([1], 
scale), + ) + x_ref = x_fp8.from_float8() + y_ref = y_fp8.from_float8() + + # Exact operations + torch.testing.assert_close(-x_fp8, -x_ref, rtol=0, atol=0) + torch.testing.assert_close(x_fp8.abs(), x_ref.abs(), rtol=0, atol=0) + + # Operations with numerical error + tols = _tols[fp8_dtype] + torch.testing.assert_close(x_fp8 + y_fp8, x_ref + y_ref, **tols) + torch.testing.assert_close(x_fp8 - y_fp8, x_ref - y_ref, **tols) + torch.testing.assert_close(x_fp8 * y_fp8, x_ref * y_ref, **tols) + torch.testing.assert_close(x_fp8 + y_ref, x_ref + y_ref, **tols) + torch.testing.assert_close(x_ref + y_fp8, x_ref + y_ref, **tols) + torch.testing.assert_close(torch.sin(x_fp8), torch.sin(x_ref), **tols) + + # Make sure we are not trivially passing tests + with pytest.raises(AssertionError): + torch.testing.assert_close(x_fp8 + y_fp8, x_ref - y_fp8, **tols) + + def test_inplace_ops( + self, + dims: DimsType = 23, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale: float = 3.5, + dtype: torch.dtype = torch.float32, + ) -> None: + """Test in-place ops""" + + # Initialize random data + dims = _to_list(dims) + x_ref = 2 * torch.rand(dims, dtype=dtype, device="cpu") - 1 + y_ref = 2 * torch.rand(dims, dtype=dtype, device="cpu") - 1 + x_fp8 = Float8Tensor.to_float8( + x_ref, + fp8_dtype=fp8_dtype, + scale=torch.full([1], scale), + ) + y_fp8 = Float8Tensor.to_float8( + y_ref, + fp8_dtype=fp8_dtype, + scale=torch.full([1], scale), + ) + x_ref = x_fp8.from_float8() + y_ref = y_fp8.from_float8() + + # In-place operations + tols = _tols[fp8_dtype] + x_fp8 += y_ref + x_ref += y_ref + torch.testing.assert_close(x_fp8, x_ref, **tols) + x_ref = x_fp8.from_float8() + x_fp8 -= y_fp8 + x_ref -= y_fp8 + torch.testing.assert_close(x_fp8, x_ref, **tols) + x_ref = x_fp8.from_float8() + x_fp8 *= 2 + x_ref *= 2 + torch.testing.assert_close(x_fp8, x_ref, **tols) + x_ref = x_fp8.from_float8() + + # Make sure we are not trivially passing tests + x_ref += 123 + with pytest.raises(AssertionError): + 
torch.testing.assert_close(x_fp8, x_ref, **tols) + + @pytest.mark.parametrize("dims", [[33, 41], [5, 7, 11]]) + @pytest.mark.parametrize("transpose_dims", [(0, 1), (-2, -1), (0, 0)]) + def test_transpose( + self, + dims: DimsType, + transpose_dims: Tuple[int, int], + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale: float = 1, + dtype: torch.dtype = torch.float32, + ) -> None: + """Test transpose""" + + # Initialize random data + dims = _to_list(dims) + x_ref = 2 * torch.rand(dims, dtype=dtype, device="cpu") - 1 + x_fp8 = Float8Tensor.to_float8( + x_ref, + fp8_dtype=fp8_dtype, + scale=torch.full([1], scale), + ) + x_ref = x_fp8.from_float8() + + # Perform transpose + y_fp8 = x_fp8.transpose(*transpose_dims) + y_ref = x_ref.transpose(*transpose_dims) + + # Check results + tols = dict(rtol=0, atol=0) + torch.testing.assert_close(y_fp8, y_ref, **tols) + + # Make sure we are not trivially passing the test + if transpose_dims[0] != transpose_dims[1]: + with pytest.raises(AssertionError): + torch.testing.assert_close( + y_fp8, + x_ref, + **tols, + ) + + # Check transpose caching + if x_fp8.dim() == 2 and transpose_dims[0] != transpose_dims[1]: + x_fp8 += 0.5 + x_ref = x_fp8.from_float8() + torch.testing.assert_close( + x_fp8.transpose(*transpose_dims, update_cache=True), + x_ref.transpose(*transpose_dims), + **tols, + ) + torch.testing.assert_close( + x_fp8.transpose(*transpose_dims, update_cache=True), + x_ref.transpose(*transpose_dims), + **tols, + ) + x_fp8 += 0.5 + x_ref = x_fp8.from_float8() + torch.testing.assert_close( + x_fp8.transpose(*transpose_dims, update_cache=True), + x_ref.transpose(*transpose_dims), + **tols, + ) diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 02fb63e71f..474f0a95b9 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -12,7 +12,7 @@ import torch.nn as nn from torch.nn import Parameter -from transformer_engine.pytorch.fp8 import fp8_autocast, FP8GlobalStateManager +from 
transformer_engine.pytorch.fp8 import fp8_autocast, FP8GlobalStateManager, fp8_model_init from transformer_engine.pytorch.utils import ( init_method_normal, scaled_init_method_normal, @@ -339,7 +339,7 @@ def forward( return x -def _test_e2e_selective_recompute(bs, dtype, config, fp8, recompute=False): +def _test_e2e_selective_recompute(bs, dtype, config, fp8, fp8_model_params=False, recompute=False): reset_rng_states() FP8GlobalStateManager.reset() @@ -354,24 +354,26 @@ def get_dummy_cuda_rng_tracker(): """Get cuda rng tracker.""" return _DUMMY_CUDA_RNG_STATE_TRACKER - block = ( - TransformerLayer( - config.hidden_size, - 4 * config.hidden_size, - config.num_attention_heads, - layernorm_epsilon=config.eps, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - hidden_dropout=0.1, - attention_dropout=0.1, - kv_channels=config.embed, - apply_residual_connection_post_layernorm=False, - output_layernorm=False, - get_rng_state_tracker=get_dummy_cuda_rng_tracker, - params_dtype=dtype, + with fp8_model_init(enabled=fp8 and fp8_model_params): + block = ( + TransformerLayer( + config.hidden_size, + 4 * config.hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.eps, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + hidden_dropout=0.1, + attention_dropout=0.1, + kv_channels=config.embed, + apply_residual_connection_post_layernorm=False, + output_layernorm=False, + get_rng_state_tracker=get_dummy_cuda_rng_tracker, + params_dtype=dtype, + fuse_qkv_params=True, + ) + .cuda() ) - .cuda() - ) te_inp_hidden_states = torch.randn( config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True @@ -400,18 +402,19 @@ def get_dummy_cuda_rng_tracker(): @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("fp8", all_boolean) -def test_gpt_selective_activation_recompute(dtype, bs, model, fp8): 
+@pytest.mark.parametrize("fp8_model_params", all_boolean) +def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, fp8_model_params): if fp8 and not fp8_available: pytest.skip(reason_for_no_fp8) config = model_configs[model] - outputs = _test_e2e_selective_recompute(bs, dtype, config, fp8, recompute=False) - outputs_recompute = _test_e2e_selective_recompute(bs, dtype, config, fp8, recompute=True) + outputs = _test_e2e_selective_recompute(bs, dtype, config, fp8, fp8_model_params, recompute=False) + outputs_recompute = _test_e2e_selective_recompute(bs, dtype, config, fp8, fp8_model_params, recompute=True) assert_all_equal(outputs, outputs_recompute) -def _test_e2e_full_recompute(bs, dtype, config, fp8, recompute=False): +def _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params=False, recompute=False): reset_rng_states() FP8GlobalStateManager.reset() @@ -426,7 +429,8 @@ def get_dummy_cuda_rng_tracker(): """Get cuda rng tracker.""" return _DUMMY_CUDA_RNG_STATE_TRACKER - block = ( + with fp8_model_init(enabled=fp8 and fp8_model_params): + block = ( TransformerLayer( config.hidden_size, 4 * config.hidden_size, @@ -441,9 +445,10 @@ def get_dummy_cuda_rng_tracker(): output_layernorm=False, get_rng_state_tracker=get_dummy_cuda_rng_tracker, params_dtype=dtype, + fuse_qkv_params=True, ) .cuda() - ) + ) te_inp_hidden_states = torch.randn( config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True @@ -483,14 +488,15 @@ def get_dummy_cuda_rng_tracker(): @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("fp8", all_boolean) -def test_gpt_full_activation_recompute(dtype, bs, model, fp8): +@pytest.mark.parametrize("fp8_model_params", all_boolean) +def test_gpt_full_activation_recompute(dtype, bs, model, fp8, fp8_model_params): if fp8 and not fp8_available: pytest.skip(reason_for_no_fp8) config = model_configs[model] - outputs = _test_e2e_full_recompute(bs, dtype, config, 
fp8, recompute=False) - outputs_recompute = _test_e2e_full_recompute(bs, dtype, config, fp8, recompute=True) + outputs = _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params, recompute=False) + outputs_recompute = _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params, recompute=True) assert_all_equal(outputs, outputs_recompute) @@ -871,6 +877,7 @@ def test_linear_accuracy(dtype, bs, model): else: assert_allclose(te_outputs[0], torch_outputs[0], 5e-2) + @pytest.mark.parametrize("dtype", param_types) @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @@ -911,6 +918,7 @@ def test_rmsnorm_accuracy(dtype, bs, model, eps): else: assert_allclose(te_outputs[0], torch_outputs[0], 2e-2) + @pytest.mark.parametrize("dtype", param_types) @pytest.mark.parametrize("bs", batch_sizes) @pytest.mark.parametrize("model", model_configs.keys()) @@ -1110,3 +1118,72 @@ def test_gpt_cuda_graph(dtype, bs, model): assert_allclose(out, graphed_out, 1e-3) assert_allclose(params, graphed_params, 1e-3) assert_allclose(grads, graphed_grads, 1e-3) + + +def _test_gpt_fp8_parameters(bs, dtype, config, fp8_model_params): + reset_rng_states() + FP8GlobalStateManager.reset() + + sigma = 0.023 + init_method = init_method_normal(sigma) + output_layer_init_method = scaled_init_method_normal(sigma, config.num_layers) + + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + + def get_dummy_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER + + with fp8_model_init(enabled=fp8_model_params): + block = ( + TransformerLayer( + config.hidden_size, + 4 * config.hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.eps, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + hidden_dropout=0.1, + attention_dropout=0.1, + kv_channels=config.embed, + apply_residual_connection_post_layernorm=False, 
+ output_layernorm=False, + get_rng_state_tracker=get_dummy_cuda_rng_tracker, + params_dtype=dtype, + fuse_qkv_params=True, + ) + .cuda() + ) + + te_inp_hidden_states = torch.randn( + config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True + ).cuda() + te_inp_hidden_states.retain_grad() + te_inp_attn_mask = get_causal_attn_mask(config.seq_len) + + with fp8_autocast(enabled=True): + te_out = block(te_inp_hidden_states, attention_mask=te_inp_attn_mask) + loss = te_out.sum() + loss.backward() + torch.cuda.synchronize() + + outputs = [te_out, te_inp_hidden_states.grad] + for p in block.parameters(): + if p.requires_grad: + outputs.append(p.grad) + return outputs + + +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +def test_gpt_fp8_parameters(dtype, bs, model): + if not fp8_available: + pytest.skip(reason_for_no_fp8) + + config = model_configs[model] + + outputs = _test_gpt_fp8_parameters(bs, dtype, config, False) + outputs_fp8_params = _test_gpt_fp8_parameters(bs, dtype, config, True) + assert_all_equal(outputs, outputs_fp8_params) diff --git a/tests/pytorch/test_onnx_export.py b/tests/pytorch/test_onnx_export.py index 4774cd39ab..dd50f15e43 100644 --- a/tests/pytorch/test_onnx_export.py +++ b/tests/pytorch/test_onnx_export.py @@ -147,7 +147,7 @@ def set_layer_scale(module: torch.nn.Module, scale: float, num_gemms: int): """Initialize the FP8 quantization scales in module""" NB_SCALES_PER_GEMM = 3 # One scale per: input, weights, and output GEMM tensors. 
nb_total_scales = num_gemms * NB_SCALES_PER_GEMM - module.fp8_init(num_gemms) + module.init_fp8_metadata(num_gemms) module.fp8_meta["scaling_fwd"].scale = torch.ones( nb_total_scales, dtype=torch.float32, device="cuda") / scale module.fp8_meta["scaling_fwd"].scale_inv = torch.ones( diff --git a/tests/pytorch/test_torch_save_load.py b/tests/pytorch/test_torch_save_load.py index f35b60ede2..2732db6ad9 100644 --- a/tests/pytorch/test_torch_save_load.py +++ b/tests/pytorch/test_torch_save_load.py @@ -16,7 +16,7 @@ import torch import transformer_engine.pytorch as te import transformer_engine_extensions as tex -from transformer_engine.pytorch.cpp_extensions import fp8_gemm, cast_to_fp8, cast_from_fp8 +from transformer_engine.pytorch.cpp_extensions import fp8_gemm, cast_to_fp8 from transformer_engine.pytorch.module.base import get_workspace from transformer_engine.pytorch.module.base import TransformerEngineBaseModule @@ -93,7 +93,7 @@ def forward(self, inp, weight): model_in = Test_TE_Export(precision, True) with te.fp8_autocast(enabled=True): - model_in.fp8_init() + model_in.init_fp8_metadata() # scaling fwd model_in.fp8_meta["scaling_fwd"].scale = torch.ones(3, dtype=torch.float32, device="cuda") * scale_fwd model_in.fp8_meta["scaling_fwd"].scale_inv = torch.ones(3, dtype=torch.float32, device="cuda") / scale_fwd diff --git a/transformer_engine/pytorch/__init__.py b/transformer_engine/pytorch/__init__.py index 8ff601f6f1..b29853a3a7 100644 --- a/transformer_engine/pytorch/__init__.py +++ b/transformer_engine/pytorch/__init__.py @@ -13,6 +13,7 @@ from .attention import MultiheadAttention from .transformer import TransformerLayer from .fp8 import fp8_autocast +from .fp8 import fp8_model_init from .export import onnx_export from .distributed import checkpoint from .distributed import CudaRNGStatesTracker diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index abc3936e25..1d93d03f3f 100644 --- 
a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -83,14 +83,16 @@ def initialize_affine_weight_gpu( weight: torch.Tensor, init_method: Callable, get_rng_state_tracker: Callable, - partition_dim: int, + partition_dim: int = 0, stride: int = 1, + set_tp_attributes: bool = True, ) -> None: """Initialize affine weight for model parallel on GPU.""" - set_tensor_model_parallel_attributes( - tensor=weight, is_parallel=True, dim=partition_dim, stride=stride - ) + if set_tp_attributes: + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) if get_rng_state_tracker is None: init_method(weight) diff --git a/transformer_engine/pytorch/float8_tensor.py b/transformer_engine/pytorch/float8_tensor.py new file mode 100644 index 0000000000..1868bb4ed2 --- /dev/null +++ b/transformer_engine/pytorch/float8_tensor.py @@ -0,0 +1,689 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +"""Tensor class with FP8 data""" +from __future__ import annotations +from typing import Any, Dict, Optional + +import torch +from torch.utils._pytree import tree_map +import transformer_engine_extensions as tex + +from .constants import TE_DType +from .fp8 import FP8GlobalStateManager + + +aten = torch.ops.aten +c10d = torch.ops.c10d + + +def _make_fp8_attr_property_funcs(name: str) -> Any: + """Make accessors for an FP8 attribute + + We store FP8 attributes in a dictionary so we can share them + between tensors with the same data, e.g. detached tensors. For + convenience, we also expose them as property attributes. This + function creates the accessors for property attributes. 
+ + Parameters + ---------- + name: str + Key in dictionary of FP8 attributes + + """ + def get_func(self) -> Any: + return self._fp8_attrs[name] + def set_func(self, value: Any) -> None: + self._fp8_attrs[name] = value + def del_func(self) -> None: + del self._fp8_attrs[name] + return dict(fget=get_func, fset=set_func, fdel=del_func) + + +class _FromFloat8Func(torch.autograd.Function): + """Cast from FP8 to other dtype""" + @staticmethod + def forward( + ctx, + tensor: Float8Tensor, + dtype: Optional[torch.dtype] = None, + ) -> torch.Tensor: + if dtype is None: + dtype = tensor.dtype + data = tensor._data.contiguous().view(1,-1).detach() + out = tex.cast_from_fp8( + data, + tensor._scale_inv, + tensor._fp8_dtype, + TE_DType[dtype], + ) + out = out.view(tensor.size()) + return out + + @staticmethod + def backward(ctx, grad): + # Assume that we want gradients in full precision + return grad, None + + +class _ToFloat8Func(torch.autograd.Function): + """Cast to FP8 from other dtype""" + @staticmethod + def forward( + ctx, + tensor: torch.Tensor, + fp8_meta: Optional[Dict[str, Any]] = None, + fp8_meta_forward: bool = True, + fp8_meta_index: Optional[int] = None, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale: Optional[torch.Tensor] = None, + amax: Optional[torch.Tensor] = None, + scale_inv: Optional[torch.Tensor] = None, + ): + + # Manually compute scale-inverse if needed + if scale is not None and scale_inv is None: + if isinstance(scale, torch.Tensor): + scale_inv = scale.reciprocal() + else: + scale_inv = 1 / scale + + # Extract data from FP8 meta tensors if provided + if fp8_meta is not None: + fp8_meta_key = FP8GlobalStateManager.get_meta_tensor_key( + forward=fp8_meta_forward, + ) + if fp8_meta_index is None: + raise ValueError( + "To initialize Float8Tensor with FP8 meta tensors, " + "the FP8 meta tensor index must also be provided" + ) + if scale is None: + scale = fp8_meta[fp8_meta_key].scale[fp8_meta_index] + if amax is None: + amax = 
fp8_meta[fp8_meta_key].amax_history[0][fp8_meta_index] + if scale_inv is None: + scale_inv = fp8_meta[fp8_meta_key].scale_inv[fp8_meta_index] + scale_inv = scale_inv.detach().view(1).clone() + + # Check input tensor + tensor = tensor.contiguous().cuda().detach() + if tensor.dtype not in (torch.float32, torch.bfloat16, torch.float16): + tensor = tensor.float() + + # Check scale + if not isinstance(scale, torch.Tensor): + if scale is None: + scale = 1 + scale = torch.full( + [1], + scale, + dtype=torch.float32, + device=tensor.device, + ) + if scale.numel() != 1: + raise ValueError( + "Attempted to initialize Float8Tensor with invalid scale tensor" + ) + scale = scale.to(device=tensor.device, dtype=torch.float32) + + # Check scale-inverse + if scale_inv is None: + scale_inv = scale.reciprocal() + scale_inv = scale_inv.to(device=tensor.device, dtype=torch.float32) + + # Check amax + if amax is None: + amax = torch.empty_like(scale) + if not (amax.numel() == 1 and amax.is_cuda and amax.dtype == torch.float32): + raise ValueError( + "Attempted to initialize Float8Tensor with invalid amax tensor" + ) + + # Cast data to FP8 + data = tex.cast_to_fp8( + tensor.view(1,-1), + scale, + amax, + scale_inv, + fp8_dtype, + ) + data = data.view(tensor.size()) + + # Construct FP8 tensor + return Float8Tensor( + data=data, + fp8_meta=fp8_meta, + fp8_meta_forward=fp8_meta_forward, + fp8_meta_index=fp8_meta_index, + fp8_dtype=fp8_dtype, + fp8_scale_inv=scale_inv, + dtype=tensor.dtype, + ) + + @staticmethod + def backward(ctx, grad): + # Assume that we want gradients in full precision + return grad, None, None, None, None, None, None, None + +class _IdentityFunc(torch.autograd.Function): + """Identity function + + If constructor keyword-arguments are provided, then construct a + new Float8Tensor using the provided tensor's attributes. 
+ + """ + + @staticmethod + def forward( + ctx, + tensor: Float8Tensor, + init_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.Tensor: + + # Return input tensor if constructor kwargs are not provided + ctx.input_dtype = tensor.dtype + if init_kwargs is None: + return tensor + + # Construct new tensor if constructor kwargs are provided + default_kwargs = dict( + data=tensor._data, + fp8_meta=tensor._fp8_meta, + fp8_meta_forward=tensor._fp8_meta_forward, + fp8_meta_index=tensor._fp8_meta_index, + fp8_dtype=tensor._fp8_dtype, + fp8_scale_inv=tensor._scale_inv, + dtype=tensor.dtype, + ) + for key, val in default_kwargs.items(): + if key not in init_kwargs: + init_kwargs[key] = val + return Float8Tensor(**init_kwargs) + + @staticmethod + def backward(ctx, grad): + return grad.to(ctx.input_dtype), None + + +class Float8Tensor(torch.Tensor): + """Experimental tensor class with FP8 data + + The tensor presents as having a standard, higher-precision dtype, + but the data itself is (scaled) FP8. For most tensor operations, + the data will be cast to the nominal dtype before performing the + operation. + + Parameters + ---------- + data: torch.Tensor + Raw FP8 data in a uint8 tensor + fp8_attrs: dict, optional + FP8 metadata, primarily managed by Float8Tensor. If + provided, all other FP8 configuration is ignored. + fp8_meta: dict, optional + FP8 metadata object, primarily managed by TE modules. + fp8_meta_forward: bool, default = `True` + Whether to access the FP8 metadata for the + forward pass. Ignored if fp8_meta is not + provided. + fp8_meta_index: int, optional + Index to access in FP8 meta tensors. Required if + fp8_meta is provided and otherwise ignored. + fp8_dtype: transformer_engine_extensions.DType, tex.DType.kFloat8E4M3 + FP8 format. + fp8_scale_inv: torch.Tensor + Reciprocal of the scaling factor applied when + casting to FP8, i.e. the scaling factor that must + be applied when casting from FP8 to higher + precision. 
Can be inferred from fp8_meta if + provided. + dtype: torch.dtype, default = torch.float32 + Nominal tensor datatype. + + """ + + def __new__( + cls, + *, + data: torch.Tensor, + fp8_attrs: Optional[Dict[str, Any]] = None, + fp8_meta: Optional[Dict[str, Any]] = None, + fp8_meta_forward: bool = True, + fp8_meta_index: Optional[int] = None, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + fp8_scale_inv: Optional[torch.Tensor] = None, + dtype: torch.dtype = torch.float32, + ): + + # Check that data buffer is valid + if data.element_size() != 1: + raise ValueError( + "Float8Tensor requires data buffer with 8-bit dtype " + f"(got dtype={data.dtype})" + ) + if data.requires_grad: + raise ValueError( + "Float8Tensor requires non-differentiable data buffer" + ) + data = data.cuda() + + # Initialize tensor object + self = torch.Tensor._make_wrapper_subclass( + cls, + data.size(), + strides=data.stride(), + storage_offset=data.storage_offset(), + dtype=dtype, + layout=data.layout, + requires_grad=data.requires_grad, + device=data.device, + ) + self._data: torch.Tensor = data + + # Initialize dict of class attributes + # Note: We store FP8 attributes in a dictionary so we can + # share them between tensors with the same data, e.g. detached + # tensors. + self._fp8_attrs: dict = {} + if fp8_attrs is not None: + self._fp8_attrs = fp8_attrs + return self + + # FP8 meta tensors + if fp8_meta is not None and fp8_meta_index is None: + raise ValueError( + "To initialize Float8Tensor with FP8 meta tensors, " + "the FP8 meta tensor index must also be provided" + ) + self._fp8_meta: Optional[Dict[str, Any]] = fp8_meta + self._fp8_meta_forward: bool = fp8_meta_forward + self._fp8_meta_index: Optional[int] = fp8_meta_index + + # FP8 dtype + assert ( + fp8_dtype in (tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2) + ), f"Unsupported fp8_dtype {fp8_dtype}." 
+ self._fp8_dtype: tex.DType = fp8_dtype + + # Cached transpose + self._transpose: Optional[Float8Tensor] = None + + # FP8 scale-inverse + self._scale_inv: Optional[torch.Tensor] = fp8_scale_inv + if self._scale_inv is None and self._fp8_meta is not None: + fp8_meta_key = FP8GlobalStateManager.get_meta_tensor_key( + forward=self._fp8_meta_forward, + ) + scale_inv = self._fp8_meta[fp8_meta_key].scale_inv[self._fp8_meta_index] + self._scale_inv = scale_inv.detach().view(1).clone() + if self._scale_inv is None: + raise ValueError( + "Attempted to initialize Float8Tensor without specifying scale-inverse" + ) + if not isinstance(self._scale_inv, torch.Tensor): + self._scale_inv = torch.full( + [1], + self._scale_inv, + dtype=torch.float32, + device=self._data.device, + ) + if self._scale_inv.numel() != 1: + raise ValueError( + "Attempted to initialize Float8Tensor with invalid scale-inverse tensor" + ) + self._scale_inv = self._scale_inv.to( + device=self._data.device, + dtype=torch.float32, + ) + + return self + + @classmethod + def make_like( + cls, + tensor: Float8Tensor, + *, + data: torch.Tensor, + fp8_attrs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Float8Tensor: + """Use attributes of a Float8Tensor to create another Float8Tensor + + See constructor for list of keyword arguments. 
+ + """ + default_kwargs = dict( + fp8_meta=tensor._fp8_meta, + fp8_meta_forward=tensor._fp8_meta_forward, + fp8_meta_index=tensor._fp8_meta_index, + fp8_dtype=tensor._fp8_dtype, + fp8_scale_inv=tensor._scale_inv, + dtype=tensor.dtype, + ) + for key, val in default_kwargs.items(): + if key not in kwargs: + kwargs[key] = val + return Float8Tensor(data=data, fp8_attrs=fp8_attrs, **kwargs) + + def __repr__(self): + return ( + "Float8Tensor(" + f"fp8_dtype={self._fp8_dtype}, " + f"scale_inv={self._scale_inv.item()}, " + f"data={self.from_float8(dtype=self.dtype)}" + ")" + ) + + def from_float8(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: + """ + Construct plain PyTorch tensor from Float8Tensor + + By default the resulting tensor's dtype is the + Float8Tensor's nominal dtype. + """ + return _FromFloat8Func.apply(self, dtype) + + @classmethod + def to_float8( + cls, + tensor: torch.Tensor, + *, + fp8_meta: Optional[Dict[str, Any]] = None, + fp8_meta_forward: bool = True, + fp8_meta_index: Optional[int] = None, + fp8_dtype: tex.DType = tex.DType.kFloat8E4M3, + scale: Optional[torch.Tensor] = None, + amax: Optional[torch.Tensor] = None, + scale_inv: Optional[torch.Tensor] = None, + ): + """Construct Float8Tensor from plain PyTorch tensor""" + return _ToFloat8Func.apply( + tensor, + fp8_meta, + fp8_meta_forward, + fp8_meta_index, + fp8_dtype, + scale, + amax, + scale_inv, + ) + + def float(self) -> torch.Tensor: + return self.from_float8(dtype=torch.float32) + + def bfloat16(self) -> torch.Tensor: + return self.from_float8(dtype=torch.bfloat16) + + def half(self) -> torch.Tensor: + return self.from_float8(dtype=torch.float16) + + def cpu(self) -> torch.Tensor: + return self.from_float8().cpu() + + def clone(self) -> Float8Tensor: + return _IdentityFunc.apply(self, {"data": self._data.detach().clone()}) + + def expand_as(self, other: torch.Tensor): + if other is self: + # Note: expand_as is hackily used to create dummy autograd nodes + # and access the 
backward graph (see + # https://github.com/pytorch/pytorch/blob/238fb660851268f44ff88127887041fea352fe48/torch/nn/parallel/distributed.py#L1026). + # We equally hackily add a dummy function to handle this + # case. + return _IdentityFunc.apply(self) + return super().expand_as(other) + + def _transpose_no_cache(self) -> torch.Tensor: + """ + Swap tensor dimensions + + For basic 2D matrix transposes, an optimized transpose kernel + is applied and a Float8Tensor is returned. + """ + + # Use optimized kernel for basic 2D transpose + # TODO Support differentiation # pylint: disable=fixme + return Float8Tensor.make_like( + self, + data=tex.fp8_transpose( + self._data.contiguous().detach(), + self._fp8_dtype, + ), + ) + + def transpose( + self, + dim0: int = 0, + dim1: int = 1, + *, + update_cache: Optional[bool] = None, + ) -> torch.Tensor: + """ + Swap tensor dimensions + + For basic 2D matrix transposes, an optimized transpose kernel + is applied and a Float8Tensor is returned. + + Parameters + ---------- + dim0: int, default = 0 + The first dimension to be transposed + dim1: int, default = 1 + The second dimension to be transposed + update_cache: Optional[bool], default = None + If set to `True`, the result is computed and stored in a cache. + If set to `False`, the result is computed only if the cache is + empty, otherwise the cache is returned. If set to `None`, the + result is not cached. Caching is only supported for basic 2D + transposes and the cache is reset after any in-place operations. + """ + + # Handle non-2D transposes + if -self.dim() <= dim0 < 0: + dim0 += self.dim() + if -self.dim() <= dim1 < 0: + dim1 += self.dim() + if self.dim() != 2 or dim0 == dim1: + if update_cache is not None: + raise ValueError( + "Transpose caching is only supported for basic 2D transposes " + f"(ndims={self.dim()}, dim0={dim0}, dim1={dim1})" + ) + return super().transpose(dim0, dim1) + + # No caching. 
+ if update_cache is None: + return self._transpose_no_cache() + + # Update cache. + if update_cache or self._transpose is None: + self._transpose = self._transpose_no_cache() + + return self._transpose + + @torch.no_grad() + def reset_fp8_meta_scale_inv(self) -> None: + """Replace FP8 meta tensor scale-inverse with cached value + + The FP8 meta tensor scale_inv entry corresponding to this + tensor is replaced with the scale_inv value used to construct + the tensor. + + """ + if self._fp8_meta is None: + return + fp8_meta_key = FP8GlobalStateManager.get_meta_tensor_key( + forward=self._fp8_meta_forward, + ) + scale_inv = self._fp8_meta[fp8_meta_key].scale_inv[self._fp8_meta_index] + scale_inv.view(1).copy_(self._scale_inv.view(1)) + + def to_dtype(self, dtype: torch.dtype) -> Float8Tensor: + """Create `Float8Tensor` with given nominal dtype + + The new tensor has the same underlying FP8 data. + + """ + return Float8Tensor.make_like( + self, + data=self._data, + fp8_attrs=self._fp8_attrs, + dtype=dtype, + ) + + def _reset_caches(self) -> None: + """Reset cached values + + Should be called after any in-place operation. 
+ + """ + self._transpose = None + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs=None): + + # In-place copy op + if func == aten.copy_.default: + + # Check tensors + dst = args[0] + src = args[1] + if not isinstance(dst, Float8Tensor): + raise RuntimeError("Expected to copy into Float8Tensor") + if not isinstance(src, torch.Tensor): + raise RuntimeError("Expected to copy from tensor") + if not dst._data.is_contiguous(): + raise RuntimeError("Transformer Engine cast kernels require contiguous data") + + # Make sure input is in expected format + if isinstance(src, Float8Tensor): + src = src.from_float8() + src = src.expand(dst.size()) + src = src.to( + device=dst.device, + memory_format=torch.contiguous_format, + ) + + # Update scaling factor if FP8 meta tensors are available + if dst._fp8_meta is None: + scale = dst._scale_inv.reciprocal() + else: + fp8_meta_key = FP8GlobalStateManager.get_meta_tensor_key( + forward=dst._fp8_meta_forward, + ) + scale = dst._fp8_meta[fp8_meta_key].scale[dst._fp8_meta_index] + dst._scale_inv = scale.detach().view(1).reciprocal() + + # Cast to FP8 + tex.cast_to_fp8_noalloc( + src.view(1,-1), + scale, + dst._data.view(1,-1), + torch.empty_like(dst._scale_inv), # amax + dst._scale_inv, + dst._fp8_dtype, + ) + + # Nothing to return for in-place ops + dst._reset_caches() + return None + + # Slice op + # TODO Consider additional bookkeeping so we invalidate caches # pylint: disable=fixme + # if these slices are modified in-place + if func == aten.slice.Tensor: + tensor = args[0] + data = tensor._data + data_slice = data.__torch_dispatch__( + func, + types, + [data] + list(args[1:]), + kwargs, + ) + return Float8Tensor.make_like(tensor, data=data_slice) + + # Detach op + if func == aten.detach.default: + # Simply return a new Float8Tensor with the same attrs + return Float8Tensor.make_like( + args[0], + data=args[0]._data, + fp8_attrs=args[0]._fp8_attrs, + ) + + def maybe_unwrap(t): + if isinstance(t, Float8Tensor): 
+ return t.from_float8() + return t + + def maybe_update_inplace(arg, new_arg, schema_arg): + """Update values of FP8 tensors + + Keep the same FP8 scaling factors. + + """ + if( + isinstance(arg, Float8Tensor) and + isinstance(new_arg, torch.Tensor) and + hasattr(schema_arg, 'alias_info') and + hasattr(schema_arg.alias_info, 'is_write') and + schema_arg.alias_info.is_write + ): + arg.copy_(new_arg) + arg._reset_caches() + + # In-place op + if func._schema.is_mutable: + # Cast to higher precision, perform op, and cast values + # back to original FP8 buffers + new_args = tree_map(maybe_unwrap, args) + new_kwargs = tree_map(maybe_unwrap, kwargs) + schema_args = func._schema.arguments + args_len = len(args) + out = super().__torch_dispatch__(func, types, new_args, new_kwargs) + for arg, new_arg, schema_arg in zip(args, new_args, schema_args): + maybe_update_inplace(arg, new_arg, schema_arg) + for kwarg, new_kwarg, schema_arg in zip(kwargs, new_kwargs, schema_args[args_len:]): + assert kwarg == new_kwarg == schema_arg.name, "name of the kw argument should match" + maybe_update_inplace(kwargs[kwarg], new_kwargs[new_kwarg], schema_arg) + return None + + # Default op + # Note: cast to higher precision and perform op + args = tree_map(maybe_unwrap, args) + if kwargs is not None: + kwargs = tree_map(maybe_unwrap, kwargs) + out = super().__torch_dispatch__(func, types, args, kwargs) + return out + + def _get_data(self) -> Float8Tensor: + """Get tensor data property""" + return super().data + + def _set_data(self, tensor: torch.Tensor) -> None: + """Set tensor data property + + Cast tensor to FP8 and store in FP8 buffer. + + """ + with torch.no_grad(): + self.copy_(tensor) + + # Cast to FP8 when setting Float8Tensor.data + data = property(_get_data, _set_data) + + # Accessors for objects in self._fp8_attrs + # Note: We store FP8 attributes in a dictionary so we can share + # them between tensors with the same data, e.g. detached tensors. 
+ # For convenience, we also expose them as property attributes. + _fp8_meta = property(**_make_fp8_attr_property_funcs("fp8_meta")) + _fp8_meta_forward = property(**_make_fp8_attr_property_funcs("fp8_meta_forward")) + _fp8_meta_index = property(**_make_fp8_attr_property_funcs("fp8_meta_index")) + _fp8_dtype = property(**_make_fp8_attr_property_funcs("dtype")) + _transpose = property(**_make_fp8_attr_property_funcs("transpose")) + _scale_inv = property(**_make_fp8_attr_property_funcs("scale_inv")) + + # Do not force the Float8Tensor type on the returned tensor + __torch_function__ = torch._C._disabled_torch_function_impl diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index c89ff10968..c7d4524113 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -17,7 +17,7 @@ from .jit import jit_fuser -__all__ = ["fp8_autocast"] +__all__ = ["fp8_autocast", "fp8_model_init"] def check_fp8_support() -> Tuple[bool, str]: @@ -59,6 +59,7 @@ class FP8GlobalStateManager: FP8_CALIBRATION = False FP8_RECIPE = None FP8_DISTRIBUTED_GROUP = None + FP8_PARAMETERS = False IS_FIRST_FP8_MODULE = False FP8_AUTOCAST_COUNTER = 0 FP8_CURRENT_CONTEXT_ID = 0 @@ -277,6 +278,11 @@ def is_fp8_calibration(cls) -> bool: """Is FP8 calibration""" return cls.FP8_CALIBRATION + @classmethod + def with_fp8_parameters(cls) -> bool: + """Should the parameters be stored as FP8""" + return cls.FP8_PARAMETERS + @classmethod def is_first_fp8_module(cls): """Returns `True` only the first time when called multiple @@ -400,6 +406,11 @@ def fp8_autocast_enter( fp8_group: Optional[dist_group_type] = None, ) -> None: """Set state and tracking variables for entry into FP8 region.""" + if cls.FP8_AUTOCAST_DEPTH == 0: + if callable(cls.amax_forward_global_reduce_func): + cls.amax_reduce_handle_fwd = cls.amax_forward_global_reduce_func() # pylint: disable=not-callable + cls.delete_key_from_amax_buffer(forward=True) + cls.FP8_ENABLED = enabled 
cls.FP8_CALIBRATION = calibrating cls.FP8_RECIPE = get_default_fp8_recipe() if fp8_recipe is None else fp8_recipe @@ -419,11 +430,6 @@ def fp8_autocast_exit(cls): """Set state and tracking variables for exit from FP8 region.""" cls.FP8_AUTOCAST_DEPTH -= 1 - if cls.FP8_AUTOCAST_DEPTH == 0: - if callable(cls.amax_forward_global_reduce_func): - cls.amax_reduce_handle_fwd = cls.amax_forward_global_reduce_func() # pylint: disable=not-callable - cls.delete_key_from_amax_buffer(forward=True) - @classmethod def copy_forward_fp8_meta_tensors_for_recompute(cls, fp8_meta: Dict[str, Any]) -> None: """Copy the scaling factors and amaxes for recompute forward phase @@ -477,9 +483,45 @@ def restore_fp8_meta_tensors(fp8_meta: Dict[str, Any]) -> None: fp8_meta["scaling_fwd"].scale_inv = fp8_meta["updated_scale_inv_fwd"] +@contextmanager +def fp8_model_init(enabled: bool = True) -> None: + """ + Context manager for FP8 initialization of parameters. + + Example usage: + + .. code-block:: python + + with fp8_model_init(enabled=True): + model = transformer_engine.pytorch.Linear(768, 768) + + Parameters + ---------- + enabled: bool, default = `True` + when enabled, Transformer Engine modules created inside this `fp8_model_init` + region will hold only FP8 copies of its parameters, as opposed to the default + behavior where both higher precision and FP8 copies are present. Setting this + option to `True` may result in lower memory consumption and is especially + useful for scenarios like: + + * full model training using optimizer with master weights, where the high + precision copies of weights are already present in the optimizer. + * inference, where only the FP8 copies of the parameters are used. + * LoRA-like fine-tuning, where the main parameters of the model do not change. + + This functionality is *EXPERIMENTAL*. 
+ """ + try: + _fp8_parameters = FP8GlobalStateManager.FP8_PARAMETERS + FP8GlobalStateManager.FP8_PARAMETERS = enabled + yield + finally: + FP8GlobalStateManager.FP8_PARAMETERS = _fp8_parameters # pylint: disable=used-before-assignment + + @contextmanager def fp8_autocast( - enabled: bool = False, + enabled: bool = True, calibrating: bool = False, fp8_recipe: Optional[DelayedScaling] = None, fp8_group: Optional[dist_group_type] = None, @@ -508,7 +550,7 @@ def fp8_autocast( Parameters ---------- - enabled: bool, default = `False` + enabled: bool, default = `True` whether or not to enable fp8 calibrating: bool, default = `False` calibration mode allows collecting statistics such as amax and scale @@ -523,7 +565,10 @@ def fp8_autocast( """ try: fp8_state = FP8GlobalStateManager.get_fp8_autocast_state() - FP8GlobalStateManager.fp8_autocast_enter(enabled, calibrating, fp8_recipe, fp8_group) + FP8GlobalStateManager.fp8_autocast_enter(enabled=enabled, + calibrating=calibrating, + fp8_recipe=fp8_recipe, + fp8_group=fp8_group) yield finally: FP8GlobalStateManager.set_fp8_autocast_state(fp8_state) # pylint: disable=used-before-assignment diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 5803cfa2f9..1dbc40dc70 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -36,6 +36,7 @@ cast_to_fp8, ) from ..constants import dist_group_type +from ..float8_tensor import Float8Tensor _2X_ACC_FPROP = False _2X_ACC_DGRAD = True @@ -451,21 +452,29 @@ def set_fp8_weights(self) -> None: setattr( self, weight_cast_attr, - torch.empty( - shape, - device=torch.cuda.current_device(), - dtype=torch.uint8, - ), + Float8Tensor( + data=torch.empty( + shape, + device=torch.cuda.current_device(), + dtype=torch.uint8, + ), + fp8_dtype=tex.DType.kFloat8E4M3, + fp8_scale_inv=1, + ) ) setattr( self, weight_transpose_attr, - torch.empty( - shape[1], - shape[0], - device=torch.cuda.current_device(), - 
dtype=torch.uint8, - ), + Float8Tensor( + data=torch.empty( + shape[1], + shape[0], + device=torch.cuda.current_device(), + dtype=torch.uint8, + ), + fp8_dtype=tex.DType.kFloat8E4M3, + fp8_scale_inv=1, + ) ) def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> None: @@ -483,12 +492,17 @@ def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> N # This routine is shared across FP8 and FP8_calibration paths so should not actually # assume FP8 execution. - def fp8_init(self, num_gemms: int = 1) -> None: + def init_fp8_metadata(self, num_gemms: int = 1) -> None: """Initialize fp8 related metadata and tensors during fprop.""" + self.fp8_parameters = FP8GlobalStateManager.with_fp8_parameters() self.fp8 = FP8GlobalStateManager.is_fp8_enabled() self.fp8_calibration = FP8GlobalStateManager.is_fp8_calibration() self.fp8_meta["fp8_checkpoint"] = self.fp8 or self.fp8_calibration + if self.fp8_parameters and not self.fp8_initialized: + self.fp8_meta["num_gemms"] = num_gemms + self.init_fp8_meta_tensors() + if self.fp8 or self.fp8_calibration: # FP8 init has already been run and recipe is the same, don't do anything. if (self.fp8_initialized @@ -536,7 +550,7 @@ def prepare_forward( assert self.tp_group_initialized, "TP group not initialized." self.set_activation_dtype(inp) - self.fp8_init(num_gemms=num_gemms) + self.init_fp8_metadata(num_gemms=num_gemms) # Create persistent tensors for fp8 weights and their transposes # only when fp8 weight caching is used. @@ -765,7 +779,7 @@ def noop_cat(self, def get_fp8_weights_empty_tensors( self, is_first_microbatch: Union[bool, None], - ) -> List[torch.Tensor]: + ) -> List[Float8Tensor]: """ Returns empty tensors to be later used to store fp8 version of weights and their transposes (for the bwd pass) for this batch (or microbatch). 
@@ -781,23 +795,42 @@ def get_fp8_weights_empty_tensors( fp8_weight_tensors = [] for shape in self.fp8_weight_shapes: fp8_weight_tensors.append( - torch.empty( - shape, - device=torch.cuda.current_device(), - dtype=torch.uint8, + Float8Tensor( + data=torch.empty( + shape, + device=torch.cuda.current_device(), + dtype=torch.uint8, + ), + fp8_dtype=tex.DType.kFloat8E4M3, + fp8_scale_inv=1, ) ) - fp8_weight_tensors.append( - torch.empty( - shape[1], - shape[0], - device=torch.cuda.current_device(), - dtype=torch.uint8, + Float8Tensor( + data=torch.empty( + shape[1], + shape[0], + device=torch.cuda.current_device(), + dtype=torch.uint8, + ), + fp8_dtype=tex.DType.kFloat8E4M3, + fp8_scale_inv=1, ) ) return fp8_weight_tensors + def state_dict(self, *args, **kwargs) -> Dict: + """Get dictionary containing module state""" + state = super().state_dict(*args, **kwargs) + + # Convert Float8Tensors to plain tensors + # Note: Float8Tensors don't serialize well, especially if they + # contain references to FP8 metadata. 
+ for key, val in state.items(): + if isinstance(val, Float8Tensor): + state[key] = val.from_float8() + + return state @abstractmethod def forward(self): diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index a8e83631bc..d4746ba3a0 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -23,7 +23,7 @@ _2X_ACC_DGRAD, _2X_ACC_WGRAD, ) -from ..fp8 import get_fp8_te_dtype +from ..fp8 import get_fp8_te_dtype, FP8GlobalStateManager from ..utils import ( divide, get_default_init_method, @@ -43,6 +43,7 @@ from ._common import _apply_normalization +from ..float8_tensor import Float8Tensor __all__ = ["LayerNormLinear"] @@ -79,10 +80,11 @@ def forward( fwd_ln_sm_margin: int, bwd_ln_sm_margin: int, zero_centered_gamma: bool, + normalization: str, + primary_weights_in_fp8: bool, ub_bulk_wgrad: bool, ub_bulk_dgrad: bool, ub_split_ag: bool, - normalization: str, ub_atomic_gemm_ag: bool, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible @@ -159,28 +161,43 @@ def forward( ) bias = cast_if_needed(bias, bias_dtype) if use_bias else bias - if update_fp8_weights: + if primary_weights_in_fp8: + # Weight is already in FP8 + weight.reset_fp8_meta_scale_inv() + weight_fp8 = weight + weight_t_fp8 = None + if is_grad_enabled: + weight_t_fp8 = weight_fp8.transpose(update_cache=is_first_microbatch) + + elif update_fp8_weights: + # Need to cast weights to FP8 + weight_fp8 = Float8Tensor( + data=weight_fp8._data, + fp8_meta=fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + ) if is_grad_enabled: tex.fp8_cast_transpose_fused( weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - cast_out=weight_fp8, - transpose_out=weight_t_fp8, + cast_out=weight_fp8._data, + transpose_out=weight_t_fp8._data, ) else: - weight_t_fp8 = None - weight_fp8 = tex.cast_to_fp8( + 
weight_fp8._data = tex.cast_to_fp8( weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, - fp8_dtype_forward) + fp8_dtype_forward, + ) + weight_t_fp8 = None ub_algo = tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ub_atomic_gemm_ag else ub_algo out, _ = tex.fp8_gemm( - weight_fp8, + weight_fp8._data, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, @@ -356,7 +373,7 @@ def backward( # DGRAD: Evaluated unconditionally to feed into Linear backward _ = tex.fp8_gemm( - weight_t_fp8, + weight_t_fp8._data, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, @@ -544,6 +561,7 @@ def backward( None, None, None, + None, ) @@ -646,10 +664,10 @@ def __init__( return_layernorm_output: bool = False, parameters_split: Optional[Union[Tuple[str, ...], Dict[str, int]]] = None, zero_centered_gamma: bool = False, + device: Union[torch.device, str] = "cuda", ub_bulk_wgrad: bool = False, ub_bulk_dgrad: bool = False, ub_split_ag: bool = False, - device: Union[torch.device, str] = "cuda", ub_atomic_gemm_ag: bool = False, ) -> None: super().__init__() @@ -666,6 +684,7 @@ def __init__( self.return_layernorm_output = return_layernorm_output self.parameters_split = parameters_split self.zero_centered_gamma = zero_centered_gamma + self.primary_weights_in_fp8 = FP8GlobalStateManager.with_fp8_parameters() self.ub_bulk_wgrad = ub_bulk_wgrad self.ub_bulk_dgrad = ub_bulk_dgrad self.ub_split_ag = ub_split_ag @@ -719,18 +738,30 @@ def __init__( self.layer_norm_bias = None self.reset_layer_norm_parameters() - self.weight_tensor = torch.empty( + temp_weight = torch.empty( self.out_features, self.in_features, device=device, dtype=params_dtype) initialize_affine_weight_gpu( - self.weight_tensor, + temp_weight, init_method, get_rng_state_tracker, partition_dim=1 if self.parallel_mode == "row" else 0, stride=1, ) + if self.primary_weights_in_fp8: + self.init_fp8_metadata() + 
self.fp8_meta["update_amax_and_scale_fwd"] = True + + self.weight_tensor = Float8Tensor.to_float8( + temp_weight, + fp8_meta=self.fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + ) + else: + self.weight_tensor = temp_weight + if self.use_bias: self.bias_tensor = torch.empty( self.out_features, @@ -769,10 +800,17 @@ def __init__( bname = pname + "bias" slice_end = slice_begin + slice_size - - self.register_parameter( - wname, Parameter(self.weight_tensor[slice_begin:slice_end]) - ) + # NOTE(future): Figure out a way to support slicing when weights + # are of `Float8Tensor` class + if self.primary_weights_in_fp8: + assert len(parameters_split) == 1, ("Slicing operation is not " + "supported in Float8Tensor " + "class!") + self.register_parameter(wname, Parameter(self.weight_tensor)) + else: + self.register_parameter( + wname, Parameter(self.weight_tensor[slice_begin:slice_end]) + ) set_tensor_model_parallel_attributes( tensor=getattr(self, wname), @@ -833,7 +871,7 @@ def get_fp8_weights_scratchpad( `is_first_microbatch` is not `None`) or return empty fp8 weight tensors (if `is_first_microbatch is None`) """ - if not self.fp8: + if not self.fp8 or self.primary_weights_in_fp8: return [None, None] if is_first_microbatch is None: @@ -877,6 +915,8 @@ def forward( """ with self.prepare_forward(inp, is_first_microbatch) as inp: + assert self.fp8 or not self.primary_weights_in_fp8, \ + "Need to run inside fp8_autocast region when weights are stored in FP8." 
bias_tensor = ( self.bias if self.parameters_split is None else self.bias_tensor if not torch.is_grad_enabled() @@ -927,10 +967,11 @@ def forward( self.fwd_ln_sm_margin, self.bwd_ln_sm_margin, self.zero_centered_gamma, + self.normalization, + self.primary_weights_in_fp8, self.ub_bulk_wgrad, self.ub_bulk_dgrad, self.ub_split_ag, - self.normalization, self.ub_atomic_gemm_ag, ) out = fwd_fn(*args) diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index d41c8d39df..40256dba6a 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -20,7 +20,7 @@ _2X_ACC_DGRAD, _2X_ACC_WGRAD, ) -from ..fp8 import get_fp8_te_dtype +from ..fp8 import get_fp8_te_dtype, FP8GlobalStateManager from ..jit import ( bias_gelu_fused, bgrad_dgelu_fused, @@ -47,6 +47,7 @@ from ..constants import dist_group_type, TE_DType from ..jit import no_torch_dynamo +from ..float8_tensor import Float8Tensor from ._common import _apply_normalization @@ -105,14 +106,15 @@ def forward( fwd_ln_sm_margin: int, bwd_ln_sm_margin: int, zero_centered_gamma: bool, + activation: str, + normalization: str, + primary_weights_in_fp8: bool, ub_bulk_wgrad: bool, ub_bulk_dgrad: bool, ub_split_rs: bool, ub_atomic_gemm_rs: bool, ub_split_ag: bool, ub_atomic_gemm_ag: bool, - activation: str, - normalization: str, ) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]: # Make sure input dimensions are compatible in_features = ln_weight.numel() @@ -196,45 +198,68 @@ def forward( fc1_bias = cast_if_needed(fc1_bias, bias_dtype) if use_fc1_bias else fc1_bias fc2_bias = cast_if_needed(fc2_bias, bias_dtype) if use_fc2_bias else fc2_bias - if update_fp8_weights: + if primary_weights_in_fp8: + # Weights are already in FP8 + fc1_weight.reset_fp8_meta_scale_inv() + fc2_weight.reset_fp8_meta_scale_inv() + fc1_weight_fp8 = fc1_weight + fc2_weight_fp8 = fc2_weight + fc1_weight_t_fp8 = None + fc2_weight_t_fp8 = None 
if is_grad_enabled: + fc1_weight_t_fp8 = fc1_weight_fp8.transpose(update_cache=is_first_microbatch) + fc2_weight_t_fp8 = fc2_weight_fp8.transpose(update_cache=is_first_microbatch) + + elif update_fp8_weights: + # Need to cast weights to FP8 + fc1_weight_fp8 = Float8Tensor( + data=fc1_weight_fp8._data, + fp8_meta=fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + ) + fc2_weight_fp8 = Float8Tensor( + data=fc2_weight_fp8._data, + fp8_meta=fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM2_WEIGHT, + ) + if is_grad_enabled: + # Fused cast-transpose kernels tex.fp8_cast_transpose_fused( fc1_weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - cast_out=fc1_weight_fp8, - transpose_out=fc1_weight_t_fp8, + cast_out=fc1_weight_fp8._data, + transpose_out=fc1_weight_t_fp8._data, ) - tex.fp8_cast_transpose_fused( fc2_weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM2_WEIGHT, fp8_dtype_forward, - cast_out=fc2_weight_fp8, - transpose_out=fc2_weight_t_fp8, + cast_out=fc2_weight_fp8._data, + transpose_out=fc2_weight_t_fp8._data, ) else: - fc1_weight_t_fp8 = None - fc1_weight_fp8 = tex.cast_to_fp8( + fc1_weight_fp8._data = tex.cast_to_fp8( fc1_weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, ) - fc2_weight_t_fp8 = None - fc2_weight_fp8 = tex.cast_to_fp8( + fc1_weight_t_fp8 = None + fc2_weight_fp8._data = tex.cast_to_fp8( fc2_weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM2_WEIGHT, fp8_dtype_forward, ) + fc2_weight_t_fp8 = None ub_algo = tex.UbufOverlapAlgo.SPLIT_PIPELINED_AG if ub_split_ag else None ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ub_atomic_gemm_ag else ub_algo fc1_out, _ = tex.fp8_gemm( - fc1_weight_fp8, + fc1_weight_fp8._data, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, @@ -283,7 +308,7 @@ def forward( ub_algo=tex.UbufOverlapAlgo.ATOMIC_GEMM_RS if ub_atomic_gemm_rs else None ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if 
ub_split_rs else ub_algo _ = tex.fp8_gemm( - fc2_weight_fp8, + fc2_weight_fp8._data, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM2_WEIGHT, fp8_dtype_forward, @@ -530,7 +555,7 @@ def backward( ub_algo = tex.UbufOverlapAlgo.ATOMIC_GEMM_AG if ctx.ub_atomic_gemm_ag else ub_algo # FC2 DGRAD; Unconditional fc2_dgrad, _ = tex.fp8_gemm( - fc2_weight_t_fp8, + fc2_weight_t_fp8._data, fwd_scale_inverses, tex.FP8FwdTensors.GEMM2_WEIGHT, fp8_dtype_forward, @@ -645,7 +670,7 @@ def backward( ) # FC1 DGRAD: Unconditional _ = tex.fp8_gemm( - fc1_weight_t_fp8, + fc1_weight_t_fp8._data, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, @@ -908,6 +933,7 @@ def backward( None, None, None, + None, ) @@ -1020,12 +1046,12 @@ def __init__( micro_batch_size: Optional[int] = None, set_parallel_mode: bool = False, zero_centered_gamma: bool = False, + device: Union[torch.device, str] = "cuda", ub_bulk_wgrad: bool = False, ub_bulk_dgrad: bool = False, ub_split_rs: bool = False, ub_atomic_gemm_rs: bool = False, ub_split_ag: bool = False, - device: Union[torch.device, str] = "cuda", ub_atomic_gemm_ag: bool = False, ) -> None: super().__init__() @@ -1043,6 +1069,7 @@ def __init__( self.activation == 'gelu') self.set_parallel_mode = set_parallel_mode self.zero_centered_gamma = zero_centered_gamma + self.primary_weights_in_fp8 = FP8GlobalStateManager.with_fp8_parameters() self.ub_bulk_wgrad = ub_bulk_wgrad self.ub_bulk_dgrad = ub_bulk_dgrad self.ub_split_rs = ub_split_rs @@ -1102,19 +1129,30 @@ def __init__( else: fc1_output_features = self.size_per_partition # FC1 init - self.fc1_weight = Parameter( - torch.empty(fc1_output_features, hidden_size, device=device, dtype=params_dtype) - ) - self.fp8_weight_shapes.append(self.fc1_weight.shape) + fc1_temp_weight = torch.empty( + fc1_output_features, hidden_size, device=device, dtype=params_dtype) initialize_affine_weight_gpu( - self.fc1_weight, + fc1_temp_weight, init_method, get_rng_state_tracker, - partition_dim=0, - 
stride=1, + set_tp_attributes=False, ) + if self.primary_weights_in_fp8: + self.init_fp8_metadata(num_gemms=2) + self.fp8_meta["update_amax_and_scale_fwd"] = True + + fc1_temp_weight = Float8Tensor.to_float8( + fc1_temp_weight, + fp8_meta=self.fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + ) + + self.fc1_weight = Parameter(fc1_temp_weight) + set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) + self.fp8_weight_shapes.append(self.fc1_weight.shape) + if self.use_bias: self.fc1_bias = Parameter( torch.empty(fc1_output_features, device=device, dtype=params_dtype) @@ -1127,19 +1165,27 @@ def __init__( self.fc1_bias.zero_() # FC2 init - self.fc2_weight = Parameter( - torch.empty(hidden_size, self.size_per_partition, device=device, dtype=params_dtype) - ) - self.fp8_weight_shapes.append(self.fc2_weight.shape) + fc2_temp_weight = torch.empty( + hidden_size, self.size_per_partition, device=device, dtype=params_dtype) initialize_affine_weight_gpu( - self.fc2_weight, + fc2_temp_weight, output_layer_init_method, get_rng_state_tracker, - partition_dim=1, - stride=1, + set_tp_attributes=False, ) + if self.primary_weights_in_fp8: + fc2_temp_weight = Float8Tensor.to_float8( + fc2_temp_weight, + fp8_meta=self.fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM2_WEIGHT, + ) + + self.fc2_weight = Parameter(fc2_temp_weight) + set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) + self.fp8_weight_shapes.append(self.fc2_weight.shape) + if self.use_bias: self.fc2_bias = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) @@ -1192,7 +1238,7 @@ def get_fp8_weights_scratchpad( `is_first_microbatch` is not `None`) or return empty fp8 weight tensors (if `is_first_microbatch is None`) """ - if not self.fp8: + if not self.fp8 or self.primary_weights_in_fp8: return [None, None, None, None] if is_first_microbatch is None: @@ -1235,6 +1281,8 @@ def forward( """ with self.prepare_forward(inp, is_first_microbatch, num_gemms=2) as inp: + 
assert self.fp8 or not self.primary_weights_in_fp8, \ + "Need to run inside fp8_autocast region when weights are stored in FP8." # Fetch the fp8 weights placeholders (for linear/gemm) weight1_fp8, weight1_t_fp8, weight2_fp8, weight2_t_fp8 = \ self.get_fp8_weights_scratchpad( @@ -1279,14 +1327,15 @@ def forward( self.fwd_ln_sm_margin, self.bwd_ln_sm_margin, self.zero_centered_gamma, + self.activation, + self.normalization, + self.primary_weights_in_fp8, self.ub_bulk_wgrad, self.ub_bulk_dgrad, self.ub_split_rs, self.ub_atomic_gemm_rs, self.ub_split_ag, self.ub_atomic_gemm_ag, - self.activation, - self.normalization, ) out = fwd_fn(*args) diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 5e2cab22fe..b14877e74b 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -20,7 +20,7 @@ _2X_ACC_DGRAD, _2X_ACC_WGRAD, ) -from ..fp8 import get_fp8_te_dtype +from ..fp8 import get_fp8_te_dtype, FP8GlobalStateManager from ..utils import ( divide, get_default_init_method, @@ -45,6 +45,8 @@ from ..constants import GemmParallelModes, dist_group_type from ..jit import no_torch_dynamo +from ..float8_tensor import Float8Tensor + __all__ = ["Linear"] @@ -57,9 +59,9 @@ class _Linear(torch.autograd.Function): @staticmethod def forward( ctx, - weight: torch.Tensor, - weight_fp8: Union[torch.Tensor, None], - weight_t_fp8: Union[torch.Tensor, None], + weight: Union[Float8Tensor, torch.Tensor], + weight_fp8: Union[Float8Tensor, None], + weight_t_fp8: Union[Float8Tensor, None], inp: torch.Tensor, bias: torch.Tensor, use_bias: bool, @@ -75,6 +77,7 @@ def forward( activation_dtype: torch.dtype, parallel_mode: Union[str, None], is_grad_enabled: bool, + primary_weights_in_fp8: bool, ub_split_rs: bool, ub_split_ag: bool, ub_atomic_gemm_rs: bool, @@ -141,24 +144,38 @@ def forward( ) bias = cast_if_needed(bias, bias_dtype) if use_bias else bias - if update_fp8_weights: + if 
primary_weights_in_fp8: + # Weight is already in FP8 + weight.reset_fp8_meta_scale_inv() + weight_fp8 = weight + weight_t_fp8 = None + if is_grad_enabled: + weight_t_fp8 = weight_fp8.transpose(update_cache=is_first_microbatch) + + elif update_fp8_weights: + # Need to cast weights to FP8 + weight_fp8 = Float8Tensor( + data=weight_fp8._data, + fp8_meta=fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + ) if is_grad_enabled: fp8_cast_transpose_fused( weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - cast_out=weight_fp8, - transpose_out=weight_t_fp8, + cast_out=weight_fp8._data, + transpose_out=weight_t_fp8._data, ) else: - weight_t_fp8 = None - weight_fp8 = cast_to_fp8( + weight_fp8._data = cast_to_fp8( weight, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, ) + weight_t_fp8 = None proj_out_index, meta_tensor, proj_out_tetype, proj_out_pttype = ( None, None, None, activation_dtype) @@ -184,7 +201,7 @@ def forward( ub_algo=tex.UbufOverlapAlgo.ATOMIC_GEMM_RS if ub_atomic_gemm_rs else None ub_algo=tex.UbufOverlapAlgo.SPLIT_PIPELINED_RS if ub_split_rs else ub_algo _ = fp8_gemm( - weight_fp8, + weight_fp8._data, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, @@ -245,6 +262,9 @@ def forward( if is_grad_enabled: fp8_wgrad = fp8 and not fp8_meta["recipe"].override_linear_precision.wgrad + if fp8: + assert hasattr(weight_t_fp8, "_data"), \ + "_data attr doesn't exist (before save for bwd)" ctx.save_for_backward( inputmat_no_fp8 if weight.requires_grad and not fp8_wgrad else None, inputmat_t if weight.requires_grad and fp8_wgrad else None, @@ -294,6 +314,9 @@ def backward( weight_t_fp8, fwd_scale_inverses, ) = ctx.saved_tensors + if weight_t_fp8 is not None: + assert hasattr(weight_t_fp8, "_data"), \ + "_data attr doesn't exist (after restore in bwd)" if ctx.ub_split_ag or ctx.ub_atomic_gemm_ag: tp_world_size = get_distributed_world_size(ctx.tp_group) @@ -349,7 
+372,7 @@ def backward( if ctx.requires_dgrad: if ctx.fp8: dgrad, _ = fp8_gemm( - weight_t_fp8, + weight_t_fp8._data, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, @@ -470,6 +493,7 @@ def backward( None, None, None, + None, ) @@ -554,9 +578,9 @@ def __init__( params_dtype: Optional[torch.dtype] = None, parallel_mode: Optional[str] = None, parameters_split: Optional[Union[Tuple[str, ...], Dict[str, int]]] = None, + device: Union[torch.device, str] = "cuda", ub_split_rs: bool = False, ub_split_ag: bool = False, - device: Union[torch.device, str] = "cuda", ub_atomic_gemm_rs: bool = False, ub_atomic_gemm_ag: bool = False, ) -> None: @@ -570,6 +594,7 @@ def __init__( self.return_bias = return_bias self.apply_bias = bias and not return_bias self.parameters_split = parameters_split + self.primary_weights_in_fp8 = FP8GlobalStateManager.with_fp8_parameters() self.ub_split_rs = ub_split_rs self.ub_split_ag = ub_split_ag self.ub_atomic_gemm_rs = ub_atomic_gemm_rs @@ -609,18 +634,31 @@ def __init__( self.sequence_parallel = (self.tp_size > 1) and sequence_parallel - self.weight_tensor = torch.empty( + temp_weight = torch.empty( self.out_features, self.in_features, device=device, dtype=params_dtype) + # TODO(ksivaman): This functionality works with FP8 outside TE. 
initialize_affine_weight_gpu( - self.weight_tensor, + temp_weight, init_method, get_rng_state_tracker, partition_dim=1 if self.parallel_mode == "row" else 0, stride=1, ) + if self.primary_weights_in_fp8: + self.init_fp8_metadata() + self.fp8_meta["update_amax_and_scale_fwd"] = True + + self.weight_tensor = Float8Tensor.to_float8( + temp_weight, + fp8_meta=self.fp8_meta, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + ) + else: + self.weight_tensor = temp_weight + if self.use_bias: self.bias_tensor = torch.empty(self.out_features, device=device, dtype=params_dtype) else: @@ -657,9 +695,17 @@ def __init__( slice_end = slice_begin + slice_size - self.register_parameter( - wname, Parameter(self.weight_tensor[slice_begin:slice_end]) - ) + # TODO(ksivaman): Add indexing op to torch dispatcher for float8 + if self.primary_weights_in_fp8: + assert len(parameters_split) == 1, ("Slicing operation is not " + "supported in Float8Tensor " + "class!") + self.register_parameter(wname, Parameter(self.weight_tensor)) + else: + + self.register_parameter( + wname, Parameter(self.weight_tensor[slice_begin:slice_end]) + ) set_tensor_model_parallel_attributes( tensor=getattr(self, wname), @@ -697,13 +743,13 @@ def __init__( def get_fp8_weights_scratchpad( self, is_first_microbatch: Union[bool, None], - ) -> List[torch.Tensor]: + ) -> List[Float8Tensor]: """ Fetch the fp8 weight tensor placeholders if they exist (when `is_first_microbatch` is not `None`) or return empty fp8 weight tensors (if `is_first_microbatch is None`) """ - if not self.fp8: + if not self.fp8 or self.primary_weights_in_fp8: return [None, None] if is_first_microbatch is None: @@ -747,6 +793,8 @@ def forward( """ with self.prepare_forward(inp, is_first_microbatch) as inp: + assert self.fp8 or not self.primary_weights_in_fp8, \ + "Need to run inside fp8_autocast region when weights are stored in FP8." 
bias_tensor = ( self.bias if self.parameters_split is None else self.bias_tensor if not torch.is_grad_enabled() @@ -790,6 +838,7 @@ def forward( self.activation_dtype, self.parallel_mode, torch.is_grad_enabled(), + self.primary_weights_in_fp8, self.ub_split_rs, self.ub_split_ag, self.ub_atomic_gemm_rs, From 66d91d5219f295ec1e2e714a4926ddb67a2b8f80 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 24 Oct 2023 12:11:53 -0700 Subject: [PATCH 068/427] [paddle] add documentation (#489) * paddle documentation Signed-off-by: Kirthi Shankar Sivamani * minor fix Signed-off-by: Kirthi Shankar Sivamani * review comments Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- docs/api/framework.rst | 1 + docs/api/paddle.rst | 34 ++++++++++++ transformer_engine/paddle/fp8.py | 38 ++++++++++++++ transformer_engine/paddle/layer/attention.py | 32 +++++++----- transformer_engine/paddle/layer/layernorm.py | 28 +++++++++- .../paddle/layer/layernorm_linear.py | 47 ++++++++++++++++- .../paddle/layer/layernorm_mlp.py | 52 ++++++++++++++++++- transformer_engine/paddle/layer/linear.py | 34 +++++++++++- transformer_engine/paddle/layer/softmax.py | 27 +++++++--- .../paddle/layer/transformer.py | 12 +++-- transformer_engine/paddle/recompute.py | 14 ++++- 11 files changed, 288 insertions(+), 31 deletions(-) create mode 100644 docs/api/paddle.rst diff --git a/docs/api/framework.rst b/docs/api/framework.rst index 81d980e089..e298535ed0 100644 --- a/docs/api/framework.rst +++ b/docs/api/framework.rst @@ -10,3 +10,4 @@ Framework-specific API pytorch jax + paddle diff --git a/docs/api/paddle.rst b/docs/api/paddle.rst new file mode 100644 index 0000000000..0ce6ce2284 --- /dev/null +++ b/docs/api/paddle.rst @@ -0,0 +1,34 @@ +.. + Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +paddle +====== + +.. 
autoapiclass:: transformer_engine.paddle.Linear(in_features, out_features, **kwargs) + :members: forward + +.. autoapiclass:: transformer_engine.paddle.LayerNorm(hidden_size, eps=1e-5, **kwargs) + +.. autoapiclass:: transformer_engine.paddle.LayerNormLinear(in_features, out_features, eps=1e-5, **kwargs) + :members: forward + +.. autoapiclass:: transformer_engine.paddle.LayerNormMLP(hidden_size, ffn_hidden_size, eps=1e-5, **kwargs) + :members: forward + +.. autoapiclass:: transformer_engine.paddle.FusedScaleMaskSoftmax(attn_mask_type, mask_func, **kwargs) + :members: forward + +.. autoapiclass:: transformer_engine.paddle.DotProductAttention(num_attention_heads, kv_channels, **kwargs) + :members: forward + +.. autoapiclass:: transformer_engine.paddle.MultiHeadAttention(hidden_size, num_attention_heads, **kwargs) + :members: forward + +.. autoapiclass:: transformer_engine.paddle.TransformerLayer(hidden_size, ffn_hidden_size, num_attention_heads, **kwargs) + :members: forward + +.. autoapifunction:: transformer_engine.paddle.fp8_autocast + +.. autoapifunction:: transformer_engine.paddle.recompute diff --git a/transformer_engine/paddle/fp8.py b/transformer_engine/paddle/fp8.py index abf347042a..9ec3037236 100644 --- a/transformer_engine/paddle/fp8.py +++ b/transformer_engine/paddle/fp8.py @@ -15,6 +15,10 @@ from .constants import dist_group_type from .fp8_buffer import FP8MetaFwdBuffer, FP8MetaBwdBuffer, FP8RecomputeBuffer + +__all__ = ['fp8_autocast'] + + # FP8 support _is_fp8_available = None _reason_for_no_fp8 = "" @@ -166,6 +170,40 @@ def fp8_autocast( ) -> None: """ Context manager for FP8 usage. + + .. code-block:: python + + with fp8_autocast(enabled=True): + out = model(inp) + + .. note:: + + Support for FP8 in the Linear layer of Transformer Engine is currently limited to tensors + with shapes where both dimensions are divisible by 16. In terms of the input to the full + Transformer network, this typically requires padding sequence length to be multiple of 16. 
+ + .. note:: + + When :attr:`fp8_recipe.reduce_amax==True`, any module must not be invoked more than once + inside a single `fp8_autocast` region. This is unsupported behavior because the amax + reduction is handled during the exit of the `fp8_autocast` context. Calling the same + module more than once inside an `fp8_autocast` region overrides the amax tensors + before reduction can occur. + + Parameters + ---------- + enabled: bool, default = `False` + whether or not to enable fp8 + calibrating: bool, default = `False` + calibration mode allows collecting statistics such as amax and scale + data of fp8 tensors even when executing without fp8 enabled. This is + useful for saving an inference ready fp8 checkpoint while training + using a higher precision. + fp8_recipe: recipe.DelayedScaling, default = `None` + recipe used for FP8 training. + fp8_group: paddle.distributed.collective.Group, default = `None` + distributed group over which amaxes for the fp8 tensors + are reduced at the end of each training step. """ try: _global_fp8_state.enter(enabled, calibrating, fp8_recipe, fp8_group) diff --git a/transformer_engine/paddle/layer/attention.py b/transformer_engine/paddle/layer/attention.py index 8c9be22748..02aa53b042 100644 --- a/transformer_engine/paddle/layer/attention.py +++ b/transformer_engine/paddle/layer/attention.py @@ -29,6 +29,9 @@ from ..recompute import recompute +__all__ = ["DotProductAttention", "MultiHeadAttention"] + + class FusedAttnFuncPackedQKV(paddle.autograd.PyLayer): """Function for FusedAttention with packed QKV input""" @@ -129,7 +132,7 @@ def backward(ctx, d_out): class DotProductAttention(paddle.nn.Layer): - """Dot Product Attention Layer + """ Allows the model to jointly attend to information from different representation subspaces as described in the paper: `Attention Is All You Need `_. @@ -150,8 +153,7 @@ class DotProductAttention(paddle.nn.Layer): attention_type: {'self', 'cross'}, default = `self` type of attention operation. 
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine` - backend to use for attention operation. - + backend to use for attention operation. """ def __init__(self, @@ -215,17 +217,17 @@ def forward( Parameters ---------- query_layer : paddle.Tensor - Query tensor. + Query tensor. key_value_layer : paddle.Tensor - Key tensor. + Key tensor. attention_mask : Optional[paddle.Tensor], default = `None` - Boolean tensor used to mask out softmax input when not using attention. + Boolean tensor used to mask out softmax input when not using attention. core_attention_bias_type: str, default = `no_bias` - only support no_bias type currently, {`no_bias`} + only support no_bias type currently, {`no_bias`} core_attention_bias: Optional[paddle.Tensor], default = `None` - Bias tensor for Q * K.T - set_zero: bool, defautl = `True` - Whether to use the fast path to set output tensors to 0 or not. + Bias tensor for Q * K.T + set_zero: bool, default = `True` + Whether to use the fast path to set output tensors to 0 or not. """ backend = self.backend @@ -358,7 +360,9 @@ def _pd_forward( class MultiHeadAttention(paddle.nn.Layer): - """Attention w/ QKV and Proj Gemms + """ + Multi-head Attention (MHA), including Query, + Key, Value and Output projection. Parameters ---------- @@ -387,7 +391,8 @@ class MultiHeadAttention(paddle.nn.Layer): zero_centered_gamma: bool, default = `False` whether to zero initialize the gamma of the layernorm operation. backend: {'transformer_engine', 'paddle'}, default = `transformer_engine` - backend to use for attention operation. + backend to use for attention operation. If set to 'paddle', a framework + only no-FP8 path is executed with limited optimization. Parallelism parameters ---------------------- @@ -542,7 +547,6 @@ def forward( """ MultiHeadAttention Layer. 
- Parameters ---------- hidden_states : paddle.Tensor @@ -555,7 +559,7 @@ def forward( only support no_bias type currently, {`no_bias`} core_attention_bias: Optional[paddle.Tensor], default = `None` Bias tensor for Q * K.T - set_zero: bool, defautl = `True` + set_zero: bool, default = `True` Whether to use the fast path to set output tensors to 0 or not. recompute_core_attention: bool, default = `False` If true, forward activations for core attention are recomputed diff --git a/transformer_engine/paddle/layer/layernorm.py b/transformer_engine/paddle/layer/layernorm.py index 89c03ee25c..77c164e48a 100644 --- a/transformer_engine/paddle/layer/layernorm.py +++ b/transformer_engine/paddle/layer/layernorm.py @@ -63,7 +63,33 @@ def backward(ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None class LayerNorm(paddle.nn.Layer): r""" Applies Layer Normalization over a mini-batch of inputs as described in - the paper `Layer Normalization ` + the paper `Layer Normalization `__ + + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * \gamma + \beta + + :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of + size :attr:`hidden_size` + + Parameters + ---------- + hidden_size : int + size of each input sample. + eps : float, default = 1e-5 + a value added to the denominator of layer normalization for numerical stability. + weight_attr: Union[paddle.ParamAttr, None], default = None + optional `paddle.ParamAttr` for weight. + bias_attr: Union[paddle.ParamAttr, None, bool], default = None + optional `paddle.ParamAttr` for bias. + zero_centered_gamma : bool, default = 'False' + if set to 'True', gamma parameter in LayerNorm is initialized to 0 and + the LayerNorm formula changes to + + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * + (1 + \gamma) + \beta + backend: {'transformer_engine', 'paddle'}, default = `transformer_engine` + backend to use for layer normalization operation. 
""" def __init__( diff --git a/transformer_engine/paddle/layer/layernorm_linear.py b/transformer_engine/paddle/layer/layernorm_linear.py index 1d13ee093f..e1b46aaa18 100644 --- a/transformer_engine/paddle/layer/layernorm_linear.py +++ b/transformer_engine/paddle/layer/layernorm_linear.py @@ -40,7 +40,7 @@ saved_tensor_allow_none, ) -__all__ = ["LayerNormLinear", "_layernorm_fwd_fp8_cast", "_layernorm_bwd"] +__all__ = ["LayerNormLinear"] def _layernorm_fwd_fp8_cast( @@ -331,6 +331,42 @@ def backward( class LayerNormLinear(TransformerEngineBaseLayer): r""" Applies layer normalization followed by linear transformation to the incoming data. + + Parameters + ---------- + in_features : int + size of each input sample. + out_features : int + size of each output sample. + eps : float, default = 1e-5 + a value added to the denominator of layer normalization for numerical stability. + weight_attr: Union[paddle.ParamAttr, None], default = None + optional `paddle.ParamAttr` for weight. + bias_attr: Union[paddle.ParamAttr, None, bool], default = None + optional `paddle.ParamAttr` for bias. + return_layernorm_output : bool, default = `False` + if set to `True`, output of layernorm is returned from the forward + together with the output of the linear transformation. + Example use case: residual connection for transformer module is + taken post layernorm. + zero_centered_gamma : bool, default = 'False' + if set to 'True', gamma parameter in LayerNorm is initialized to 0 and + the LayerNorm formula changes to + + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * + (1 + \gamma) + \beta + backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine' + if set to 'paddle', a framework only no-FP8 path is executed with limited optimization. + + Parallelism parameters + ---------------------- + tp_group : ProcessGroup, default = `None` + tensor parallel process group. 
+ parallel_mode : {None, 'Column', 'Row'}, default = `None` + used to decide whether this Linear layer is Column Parallel Linear or Row + Parallel Linear as described `here `_. + When set to `None`, no communication is performed. """ def __init__( @@ -503,7 +539,14 @@ def _pd_forward( return out def forward(self, *args, **kwargs): - """forward""" + """ + Apply layer normalization to the input followed by a linear transformation. + + Parameters + ---------- + inp : paddle.Tensor + Input tensor. + """ if self.backend == 'transformer_engine': return self._te_forward(*args, **kwargs) if self.backend == 'paddle': diff --git a/transformer_engine/paddle/layer/layernorm_mlp.py b/transformer_engine/paddle/layer/layernorm_mlp.py index 85364552cc..c4752f6406 100644 --- a/transformer_engine/paddle/layer/layernorm_mlp.py +++ b/transformer_engine/paddle/layer/layernorm_mlp.py @@ -39,6 +39,7 @@ saved_tensor_allow_none, ) + __all__ = ["LayerNormMLP"] @@ -549,7 +550,47 @@ def backward( class LayerNormMLP(TransformerEngineBaseLayer): r""" - Applies layer normalization followed by linear transformation to the incoming data. + Applies layer normalization on the input followed by the MLP module, consisting of + 2 successive linear transformations, separated by the GeLU activation. + + Parameters + ---------- + hidden_size : int + size of each input sample. + ffn_hidden_size : int + intermediate size to which input samples are projected. + eps : float, default = 1e-5 + a value added to the denominator of layer normalization for numerical stability. + weight_attr: Union[paddle.ParamAttr, None], default = None + optional `paddle.ParamAttr` for weight. + bias_attr: Union[paddle.ParamAttr, None, bool], default = None + optional `paddle.ParamAttr` for bias. + activation : str, default = 'gelu' + activation function used. + Options: 'gelu', 'geglu', 'relu', 'reglu', 'squared_relu', 'swiglu'. 
+ return_layernorm_output : bool, default = `False` + if set to `True`, output of layernorm is returned from the forward + together with the output of the linear transformation. + Example use case: residual connection for transformer module + is taken post layernorm. + zero_centered_gamma : bool, default = 'False' + if set to 'True', gamma parameter in LayerNorm is initialized to 0 and + the LayerNorm formula changes to + + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * + (1 + \gamma) + \beta + backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine' + if set to 'paddle', a framework only no-FP8 path is executed with limited optimization. + + Parallelism parameters + ---------------------- + set_parallel_mode : bool, default = `False` + if set to `True`, FC1 is used as Column Parallel and FC2 is used as Row + Parallel as described `here `_. + tp_group : paddle.distributed.collective.Group, default = `None` + tensor parallel process group. + """ def __init__( @@ -753,7 +794,14 @@ def _pd_forward( return out def forward(self, *args, **kwargs): - """forward""" + """ + Apply layer normalization to the input followed by a feedforward network (MLP Block). + + Parameters + ---------- + inp : paddle.Tensor + Input tensor. 
+ """ if self.backend == 'transformer_engine': return self._te_forward(*args, **kwargs) if self.backend == 'paddle': diff --git a/transformer_engine/paddle/layer/linear.py b/transformer_engine/paddle/layer/linear.py index 9644f9c4e7..1c4ba3ef9b 100644 --- a/transformer_engine/paddle/layer/linear.py +++ b/transformer_engine/paddle/layer/linear.py @@ -38,7 +38,7 @@ saved_tensor_allow_none, ) -__all__ = ["Linear", "_linear_fwd", "_linear_fwd_fp8", "_linear_bwd", "_linear_fwd_non_fp8"] +__all__ = ["Linear"] def _linear_fwd_fp8( @@ -541,6 +541,29 @@ def backward(ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None class Linear(TransformerEngineBaseLayer): """ Applies a linear transformation to the incoming data :math:`y = xA^T + b` + + Parameters + ---------- + in_features : int + size of each input sample. + out_features : int + size of each output sample. + weight_attr: Union[paddle.ParamAttr, None], default = None + optional `paddle.ParamAttr` for weight. + bias_attr: Union[paddle.ParamAttr, None, bool], default = None + optional `paddle.ParamAttr` for bias. + backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine' + if set to 'paddle', a framework only no-FP8 path is executed with limited optimization. + + Parallelism parameters + ---------------------- + tp_group : ProcessGroup, default = `None` + tensor parallel process group. + parallel_mode : {None, 'Column', 'Row'}, default = `None` + used to decide whether this Linear layer is Column Parallel Linear or Row + Parallel Linear as described `here `_. + When set to `None`, no communication is performed. + """ def __init__( @@ -658,7 +681,14 @@ def _pd_forward( return out def forward(self, *args, **kwargs): - """forward""" + """ + Apply the linear transformation to the input. + + Parameters + ---------- + inp : paddle.Tensor + Input tensor. 
+ """ if self.backend == 'transformer_engine': return self._te_forward(*args, **kwargs) if self.backend == 'paddle': diff --git a/transformer_engine/paddle/layer/softmax.py b/transformer_engine/paddle/layer/softmax.py index 33b0293e0a..b48dd26259 100644 --- a/transformer_engine/paddle/layer/softmax.py +++ b/transformer_engine/paddle/layer/softmax.py @@ -18,9 +18,14 @@ scaled_softmax_backward, ) + +__all__ = ["FusedScaleMaskSoftmax"] + + THREADS_PER_WARP = 32 THREADS_PER_BLOCK = 128 + _default_causal_mask = {} @@ -112,12 +117,22 @@ def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, Non class FusedScaleMaskSoftmax(paddle.nn.Layer): """ - fused operation: scaling + mask + softmax - - Arguments: - attn_mask_type: attention mask type (pad or causal) - mask_func: mask function to be applied. - softmax_in_fp32: if true, softmax in performed at fp32 precision. + Scaled and masked softmax module for paddle with fused optimizations. + + Parameters + ---------- + attn_mask_type : str, default = `causal` + type of attention mask, can be 'causal', 'padding', or 'no_mask'. + mask_func : callable + custom callable for applying the mask to the softmax input. + `masked_input=mask_func(inp, mask)`. + softmax_in_fp32 : bool, default = True + perform softmax computation in fp32. + layernorm_epsilon : float, default = 1e-5 + a value added to the denominator of layer normalization + for numerical stability. + backend: {'transformer_engine', 'paddle'}, default = `transformer_engine` + backend to use for operation. """ def __init__( diff --git a/transformer_engine/paddle/layer/transformer.py b/transformer_engine/paddle/layer/transformer.py index ada4107648..95c592c672 100644 --- a/transformer_engine/paddle/layer/transformer.py +++ b/transformer_engine/paddle/layer/transformer.py @@ -8,9 +8,9 @@ import paddle from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd -from . 
import LayerNormMLP, LayerNorm, MultiHeadAttention -from ..constants import AttnMaskTypes, LayerTypes, dist_group_type -from ..distributed import get_tp_group_and_world_size, track_rng_state +from transformer_engine.paddle.layer import LayerNormMLP, LayerNorm, MultiHeadAttention +from transformer_engine.paddle.constants import AttnMaskTypes, LayerTypes, dist_group_type +from transformer_engine.paddle.distributed import get_tp_group_and_world_size, track_rng_state class TransformerLayer(paddle.nn.Layer): @@ -33,6 +33,10 @@ class TransformerLayer(paddle.nn.Layer): dropout probability for the dropout op after FC2 layer. attention_dropout: float, default = 0.1 dropout probability for the dropout op during multi-head attention. + weight_attr: Union[paddle.ParamAttr, None], default = None + optional `paddle.ParamAttr` for weight. + bias_attr: Union[paddle.ParamAttr, None, bool], default = None + optional `paddle.ParamAttr` for bias. self_attn_mask_type: {'causal', 'padding'}, default = `causal` type of attention mask passed into softmax operation. apply_residual_connection_post_layernorm : bool, default = `False` @@ -62,6 +66,8 @@ class TransformerLayer(paddle.nn.Layer): it controls the type used to allocate the initial parameters. Useful when the model is trained with lower precision and the original FP32 parameters would not fit in GPU memory. + backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine' + if set to 'paddle', a framework only no-FP8 path is executed with limited optimization. 
Parallelism parameters ---------------------- diff --git a/transformer_engine/paddle/recompute.py b/transformer_engine/paddle/recompute.py index cf42505bc8..b4d22f5240 100644 --- a/transformer_engine/paddle/recompute.py +++ b/transformer_engine/paddle/recompute.py @@ -11,7 +11,9 @@ from .constants import RecomputeFunctionNames from .fp8 import get_global_fp8_state -__all__ = ['recompute', 'is_in_recompute_phase'] + +__all__ = ['recompute'] + _DISABLE_RECOMPUTE = int(os.getenv("NVTE_DISABLE_RECOMPUTE", "0")) @@ -35,6 +37,16 @@ def recompute(function, *args, **kwargs): """ This is a wrapper of paddle.distributed.fleet.utils.recompute. It provides necessary state information for fp8 layers. + + Parameters + ---------- + function: Callable + paddle module used to run the forward and backward passes using + the specified :attr:`args` and :attr:`kwargs`. + args : tuple + tuple of paddle tensors for inputs to :attr:`function`. + kwargs : dict + dictionary of string keys for keyword arguments to :attr:`function`. """ assert not _DISABLE_RECOMPUTE, "Recompute is disabled. " \ f"Got NVTE_DISABLE_RECOMPUTE={_DISABLE_RECOMPUTE}." 
From 96b31f87a111459c3132839945ba1707664c48f1 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 19 Jan 2024 09:03:24 -0800 Subject: [PATCH 069/427] Avoid using torch.compile for roll and fill_ (#609) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/fp8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index c7d4524113..e01e42bce4 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -583,7 +583,7 @@ def _update_amax_history(amax_history: torch.Tensor) -> torch.Tensor: return amax_history -@jit_fuser +@torch.jit.script def _default_get_amax( amax_history: torch.Tensor, amax_compute_algo: str, @@ -625,7 +625,7 @@ def _compute_scaling_factor_inverse( return torch.where(non_weight_mask, 1.0 / scale, scale_inv) -@jit_fuser +@torch.jit.script def _fused_amax_and_scale_update( amax_history: torch.Tensor, scale: torch.Tensor, From bbafb02097e6ca1605c3c0cad84d59dbbcb6e94b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 19 Jan 2024 09:07:52 -0800 Subject: [PATCH 070/427] Changed VERSION to 1.2.1 Signed-off-by: Przemek Tredak --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 26aaba0e86..6085e94650 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.0 +1.2.1 From 29413187eb6a84a8032032e7f033371f6f83e47c Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 19 Jan 2024 09:03:24 -0800 Subject: [PATCH 071/427] Avoid using torch.compile for roll and fill_ (#609) Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/fp8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index 7bec34c861..d4d82cf0be 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -583,7 +583,7 @@ def _update_amax_history(amax_history: 
torch.Tensor) -> torch.Tensor: return amax_history -@jit_fuser +@torch.jit.script def _default_get_amax( amax_history: torch.Tensor, amax_compute_algo: str, @@ -625,7 +625,7 @@ def _compute_scaling_factor_inverse( return torch.where(non_weight_mask, 1.0 / scale, scale_inv) -@jit_fuser +@torch.jit.script def _fused_amax_and_scale_update( amax_history: torch.Tensor, scale: torch.Tensor, From f26690abfcb863a78bfb32f91f4121537b2d07a3 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Wed, 17 Jan 2024 11:58:22 -0600 Subject: [PATCH 072/427] [PyTorch] Deferred Initialization via `device='meta'` option (#596) * Implemented deferred initialization via `device='meta'` option for te.Linear and added new PyTorch example to demonstrate its use with FullyShardedDataParallel execution. Signed-off-by: Alp Dener * correcting Float8Tensor initialization and fixing linting errors Signed-off-by: Alp Dener * removed duplicate code from upstream rebase, local tests passing Signed-off-by: Alp Dener * improved comments/documentation for FSDP example Signed-off-by: Alp Dener * converted reset_parameters() into a base module function Signed-off-by: Alp Dener * fixed Float8Tensor creation with deferred init, all tests passing locally Signed-off-by: Alp Dener * extended deferred initialization to all TE modules Signed-off-by: Alp Dener * fixed linting errors Signed-off-by: Alp Dener * removed unnecessary reference to the parent module of parameter, added clarifying comments in parameter reset Signed-off-by: Alp Dener --------- Signed-off-by: Alp Dener --- examples/pytorch/fsdp/README.md | 53 +++++ examples/pytorch/fsdp/fsdp.py | 195 ++++++++++++++++++ transformer_engine/pytorch/module/_common.py | 19 +- transformer_engine/pytorch/module/base.py | 49 +++++ .../pytorch/module/layernorm.py | 17 +- .../pytorch/module/layernorm_linear.py | 57 +++-- .../pytorch/module/layernorm_mlp.py | 92 ++++----- transformer_engine/pytorch/module/linear.py | 48 ++--- transformer_engine/pytorch/module/rmsnorm.py | 
15 +- transformer_engine/pytorch/utils.py | 15 ++ 10 files changed, 441 insertions(+), 119 deletions(-) create mode 100644 examples/pytorch/fsdp/README.md create mode 100644 examples/pytorch/fsdp/fsdp.py diff --git a/examples/pytorch/fsdp/README.md b/examples/pytorch/fsdp/README.md new file mode 100644 index 0000000000..d492ea4a57 --- /dev/null +++ b/examples/pytorch/fsdp/README.md @@ -0,0 +1,53 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +# Basic Example for Using PyTorch Fully Sharded Data Parallel mode with Transformer Engine + +```bash +# FSDP without deferred initialization: +# Duplicate modules initialized on each device. Load on device memory reduced only after +# torch.distributed.fsdp.FullyShardedDataParallel mode shards model parameters. +$ torchrun --standalone --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) fsdp.py +# Sample output on 8xL40S: +# [GPU-0] WORLD_SIZE = 8 +# [GPU-0] TransformerEngine Model: +# TransformerLayer( +# (self_attention): MultiheadAttention( +# (layernorm_qkv): LayerNormLinear() +# (core_attention): DotProductAttention( +# (flash_attention): FlashAttention() +# (fused_attention): FusedAttention() +# (unfused_attention): UnfusedDotProductAttention( +# (scale_mask_softmax): FusedScaleMaskSoftmax() +# (attention_dropout): Dropout(p=0.1, inplace=False) +# ) +# ) +# (proj): Linear() +# ) +# (layernorm_mlp): LayerNormMLP() +# ) +# [GPU-0] Pre-FSDP memory use = 83.935232MiB +# [GPU-0] Post-FSDP memory use = 10.491904MiB +# [GPU-0] Iter. 1 +# [GPU-0] Iter. 2 +# [GPU-0] Iter. 3 +# [GPU-0] Training Time: 6.647654296875s +# [GPU-0] Avg. Iter. Time: 2.2158847656250003s +# [GPU-0] Peak memory use = 3000MiB + +# FSDP with deferred initialization: +# Modules initialized with empty parameters via `device='meta'` option. 
Zero load on device +# memory until torch.distributed.fsdp.FullyShardedDataParallel mode triggers a reset on +# already sharded model parameters. +$ torchrun --standalone --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) fsdp.py --defer-init +# Sample output on 8xL40S: +# [GPU-0] WORLD_SIZE = 8 +# ... +# [GPU-0] Pre-FSDP memory use = 0.0MiB +# [GPU-0] Post-FSDP memory use = 10.491904MiB +# ... +``` + +**NOTE:** This example has `fp8_autocast()` enabled by default. To run on GPUs without FP8 support +(e.g.: A100), add the `--no-fp8` option to the commands shown above. diff --git a/examples/pytorch/fsdp/fsdp.py b/examples/pytorch/fsdp/fsdp.py new file mode 100644 index 0000000000..5d30be6c97 --- /dev/null +++ b/examples/pytorch/fsdp/fsdp.py @@ -0,0 +1,195 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import argparse +from functools import partial + +import torch +import torch.distributed as dist +from torch import nn +from torch.distributed.fsdp import FullyShardedDataParallel, MixedPrecision +from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy + +import transformer_engine.pytorch as te +from transformer_engine.common.recipe import Format, DelayedScaling + +def lowercase(s): + return str(s).lower() + +def torch_dtype(d): + typemap = { + 'fp32' : torch.float32, + 'float32' : torch.float32, + 'fp16' : torch.float16, + 'float16' : torch.float16, + 'bf16' : torch.bfloat16, + 'bfloat16' : torch.bfloat16 + } + if lowercase(d) not in typemap.keys(): + raise TypeError + return typemap[lowercase(d)] + +te_layer_map = { + 'linear': te.Linear, + 'layernorm': te.LayerNorm, + 'rmsnorm': te.RMSNorm, + 'layernormlinear': te.LayerNormLinear, + 'layernormmlp': te.LayerNormMLP, + 'multiheadattention': te.MultiheadAttention, + 'transformerlayer': te.TransformerLayer +} +def te_layer(l): + if lowercase(l) not in te_layer_map.keys(): + raise 
TypeError 
+ return te_layer_map[lowercase(l)] + +def get_layer_args(args): + hidden_size = args.num_heads * args.head_dim + layer_args = (hidden_size, ) + layer_kwargs = { + 'params_dtype': args.dtype, + 'device': 'meta' if args.defer_init else 'cuda' + } + if args.layer_type in [te.Linear, te.LayerNormLinear, te.LayerNormMLP]: + ffn_hidden_size = 3 * hidden_size if args.num_layers == 1 else hidden_size + layer_args += (ffn_hidden_size, ) + layer_kwargs['bias'] = True + if args.layer_type == te.LayerNormMLP: + layer_kwargs['seq_length'] = args.seq_length + elif args.layer_type == te.MultiheadAttention: + layer_args += (args.num_heads, ) + layer_kwargs['fuse_qkv_params'] = True + elif args.layer_type == te.TransformerLayer: + layer_args += (3 * hidden_size, args.num_heads) + layer_kwargs['fuse_qkv_params'] = True + layer_kwargs['seq_length'] = args.seq_length + return layer_args, layer_kwargs + +def parse_fsdp_args(): + parser = argparse.ArgumentParser(description="Run Transformer Engine modules with the " + + "torch.distributed.fsdp.FullyShardedDataParallel strategy.") + parser.add_argument("-t", "--layer-type", type=te_layer, default=te.TransformerLayer, + choices=list(te_layer_map.values()), + help="TE module type used to construct the test model.") + parser.add_argument("--no-fp8", action="store_true", default=False, + help="Disables the te.fp8_autocast() context.") + parser.add_argument('-i', "--num-iters", type=int, default=3, + help="Number of dummy 'training' iterations.") + parser.add_argument('-b', "--batch-size", type=int, default=32, + help="Input batch size.") + parser.add_argument('-s', "--seq-length", type=int, default=1048, + help="Input sequence length.") + parser.add_argument('-n', "--num-heads", type=int, default=16, + help="Number of attention heads.") + parser.add_argument('-d', "--head-dim", type=int, default=128, + help="Dimension of each attention head (number of KV channels).") + parser.add_argument('-l', "--num-layers", type=int, default=1, + 
help="Number of modules chained together with nn.Sequential.") + parser.add_argument("--seed", type=int, default=1234, + help="PyTorch RNG seed.") + parser.add_argument("--defer-init", action="store_true", + help="Defer module parameter initialization until after FSDP sharding.") + parser.add_argument('-v', "--verbose", action="store_true", default=False, + help="Print out information from all GPUs instead of only the root GPU-0.") + parser.add_argument("--dtype", type=torch_dtype, default=torch.bfloat16, + help="Data type for input tensor and Transformer Engine module parameters.") + return parser.parse_args() + +def train(args): + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + # Initialize torch.distributed global process group + dist.init_process_group(backend="nccl") + torch.cuda.set_device(local_rank) + if local_rank == 0: + print(f"[GPU-0] WORLD_SIZE = {world_size}\n\n", end='') + torch.manual_seed(args.seed) + + # Construct a simple homogeneous model (only one layer type) with NO PARALLELISM + layer_args, layer_kwargs = get_layer_args(args) + if args.num_layers > 1: + te_layer_list = [] + for i in range(args.num_layers): + if args.layer_type in [te.MultiheadAttention, te.TransformerLayer]: + layer_kwargs['layer_number'] = i+1 + te_layer_list.append(args.layer_type(*layer_args, **layer_kwargs)) + te_model = nn.Sequential(*te_layer_list) + else: + # Single layer model + te_model = args.layer_type(*layer_args, **layer_kwargs) + if local_rank == 0: + print(f"[GPU-0] TransformerEngine Model:\n{te_model}\n", end='') + + # Print out allocated device memory before the model parameters are sharded by FSDP + pre_mem_use = torch.cuda.memory_allocated(device=f"cuda:{local_rank}") * 1e-6 + if local_rank == 0 or args.verbose: + print(f"[GPU-{local_rank}] Pre-FSDP memory use = {pre_mem_use}MiB\n", end='') + + # Wrap the model with FSDP + # NOTE: The TE model itself has no inherent parallelism. 
FSDP shards model parameters and + # controls all communication. + all_gpus = dist.new_group(backend='nccl') + fsdp_wrap_policy = always_wrap_policy + if args.layer_type == te.TransformerLayer: + # NOTE: FSDP causes illegal memory access without this special policy for Transformers + fsdp_wrap_policy = partial(transformer_auto_wrap_policy, + transformer_layer_cls={te.TransformerLayer}) + te_model = FullyShardedDataParallel(te_model, + process_group=all_gpus, + use_orig_params=True, + mixed_precision=MixedPrecision( + param_dtype=args.dtype, + reduce_dtype=torch.float32, + ), + sync_module_states=True, + auto_wrap_policy=fsdp_wrap_policy) + + # Print out allocated device memory after the model parameters are sharded + post_mem_use = torch.cuda.memory_allocated(device=f"cuda:{local_rank}") * 1e-6 + if local_rank == 0 or args.verbose: + print(f"[GPU-{local_rank}] Post-FSDP memory use = {post_mem_use}MiB\n", end='') + + # Fp8 setup for TE + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=32, amax_compute_algo="max") + + # Optimizer must be created after the model is wrapped in FSDP and the parameters are sharded + optim = torch.optim.Adam(te_model.parameters(), lr=0.0001) + + # Start and time dummy "training" iterations + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start.record() + for i in range(args.num_iters): + # Generate a random input batch + x = torch.rand(args.seq_length, args.batch_size, + args.num_heads*args.head_dim).to(dtype=args.dtype).cuda() + # fp8_autocast needs to be given the FSDP process group for amax reductions + with te.fp8_autocast(enabled=not args.no_fp8, fp8_recipe=fp8_recipe, fp8_group=all_gpus): + y = te_model(x) + loss = y.sum() + # calculate gradient and take training step outside the fp8_autocast context + loss.backward() + optim.step() + del x + if local_rank == 0: + print(f"[GPU-0] Iter. 
{i+1}\n", end='') + end.record() + torch.cuda.synchronize() + + # Print out "training" time and peak memory use stats + train_time = start.elapsed_time(end)/1000. + max_memory_alloc = int(torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") * 1e-6) + if local_rank == 0 or args.verbose: + print(f"[GPU-{local_rank}] Training Time: {train_time}s\n" + + f"[GPU-{local_rank}] Avg. Iter. Time: {train_time /args.num_iters}s\n" + + f"[GPU-{local_rank}] Peak memory use = {max_memory_alloc}MiB\n\n", end='') + + +if __name__ == "__main__": + args = parse_fsdp_args() + train(args) diff --git a/transformer_engine/pytorch/module/_common.py b/transformer_engine/pytorch/module/_common.py index edc3da120d..d2ab776288 100644 --- a/transformer_engine/pytorch/module/_common.py +++ b/transformer_engine/pytorch/module/_common.py @@ -4,12 +4,14 @@ """Internal function used by multiple modules.""" -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, Callable +from dataclasses import dataclass import torch from .. import cpp_extensions as tex from ..fp8 import get_fp8_te_dtype +from ..utils import get_default_init_method def _get_normalization_func(normalization: str, fp8_output: bool, @@ -187,3 +189,18 @@ def _noop_cat( # Perform no-op concat return _NoopCatFunc.apply(split_ranges, full_tensor, *tensors) + + +@dataclass +class _ParameterInitMeta: + """ + Stores essential metadata needed to support deferred parameter initialization. 
+ """ + init_fn: Optional[Callable] = get_default_init_method() + get_rng_state_tracker: Optional[Callable] = None + fp8_meta_index: Optional[int] = None + + def __post_init__(self): + """Safeguard reference to the parameter's parent module and initialization function.""" + if self.init_fn is None: + self.init_fn = get_default_init_method() diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index cf9634b2cc..ad1f383617 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -16,6 +16,7 @@ import torch.nn.functional as F import transformer_engine_extensions as tex +from ._common import _ParameterInitMeta from ..export import is_in_onnx_export_mode from ..fp8 import ( get_default_fp8_recipe, @@ -234,6 +235,8 @@ def __init__(self) -> None: self.fp8_meta["async_amax_reduction"] = bool( int(os.getenv("NVTE_ASYNC_AMAX_REDUCTION", "0")) ) + self.param_init_meta = {} + self.primary_weights_in_fp8 = FP8GlobalStateManager.with_fp8_parameters() def set_meta_tensor(self, fwd: bool) -> None: """Init scales and amaxes for fwd | bwd.""" @@ -746,6 +749,52 @@ def get_fp8_weights_empty_tensors( ) return fp8_weight_tensors + def register_parameter(self, name, param, **kwargs): + """ + Thin wrapper around PyTorch parameter registration to stash additional parameter + metadata used in deferred initialization. + """ + super().register_parameter(name, param) + self.param_init_meta[name] = _ParameterInitMeta(**kwargs) + + def reset_parameters(self, defer_init: Optional[bool] = False) -> None: + """ + Reset all module parameters to initial values. Unless deferred initialization + is specified, all parameters on a 'meta' device are also materialized on a real cuda + device before the values are reset to their initial values.
+ """ + if defer_init: + return + + for name, param in self.named_parameters(recurse=False): + # Ensure parameter is on a real device + if param.device == torch.device('meta'): + param = param.to(device='cuda') + + # Initialize the parameter values on device + init_fn = self.param_init_meta[name].init_fn + get_rng_state_tracker = self.param_init_meta[name].get_rng_state_tracker + if get_rng_state_tracker is None: + init_fn(param) + else: + with get_rng_state_tracker().fork(): + init_fn(param) + + # If primary weights are in fp8, wrap the parameter as Float8Tensor + fp8_meta_index = self.param_init_meta[name].fp8_meta_index + if self.primary_weights_in_fp8 and fp8_meta_index is not None: + param = Float8Tensor.to_float8( + param, + fp8_meta=self.fp8_meta, + fp8_meta_index=fp8_meta_index + ) + + # Redo parameter wrap in case we broke it above + # NOTE: Currently this can only be broken when primary weights are in Fp8 but + # re-applying the nn.Parameter() wrap is a no-op when the input is already + # a parameter so we always re-apply it just for extra safety. + setattr(self, name, torch.nn.Parameter(param)) + @abstractmethod def forward(self): """Needs override.""" diff --git a/transformer_engine/pytorch/module/layernorm.py b/transformer_engine/pytorch/module/layernorm.py index 653e23f4f3..fac941306f 100644 --- a/transformer_engine/pytorch/module/layernorm.py +++ b/transformer_engine/pytorch/module/layernorm.py @@ -4,6 +4,7 @@ """LayerNorm API""" import os +import warnings from typing import Union, Tuple, Optional import torch @@ -139,7 +140,8 @@ def __init__( ) setattr(self.weight, "sequence_parallel", sequence_parallel) setattr(self.bias, "sequence_parallel", sequence_parallel) - self.reset_layer_norm_parameters() + + self.reset_parameters(defer_init=(device == 'meta')) # These many SMs are subtracted from the total SM count when calling forward # and backward LayerNorm C APIs. 
These envvars can be used to prevent the LN @@ -150,12 +152,25 @@ def __init__( def reset_layer_norm_parameters(self) -> None: """Init LN params""" + warnings.warn( + ("This method will be deprecated in an upcoming release. " + "Update your code to use LayerNorm.reset_parameters() instead."), + DeprecationWarning, + stacklevel=2 + ) if not self.zero_centered_gamma: init.ones_(self.weight) else: init.zeros_(self.weight) init.zeros_(self.bias) + def reset_parameters(self, defer_init=False) -> None: + """Init LayerNorm parameters""" + if defer_init: + return + init.constant_(self.weight, float(not self.zero_centered_gamma)) + init.zeros_(self.bias) + @no_torch_dynamo() def forward(self, inp: torch.Tensor) -> torch.Tensor: """LayerNorm FWD""" diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index d36d5a9923..2e6803f992 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -25,6 +25,7 @@ from ..utils import ( divide, get_default_init_method, + init_method_constant, cast_if_needed, assert_dim_for_fp8_exec, clear_tensor_data, @@ -33,7 +34,6 @@ set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, - initialize_affine_weight_gpu, reduce_scatter_along_first_dim, gather_along_first_dim, ) @@ -749,43 +749,25 @@ def __init__( self.sequence_parallel = (self.tp_size > 1) and sequence_parallel self.eps = eps - self.layer_norm_weight = torch.nn.Parameter( + layer_norm_weight = torch.nn.Parameter( torch.empty(in_features, device=device, dtype=params_dtype) ) - setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) + self.register_parameter('layer_norm_weight', layer_norm_weight, + init_fn=init_method_constant(float(not self.zero_centered_gamma))) + setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition if 
self.normalization != "RMSNorm": - self.layer_norm_bias = torch.nn.Parameter( + layer_norm_bias = torch.nn.Parameter( torch.empty(in_features, device=device, dtype=params_dtype) ) - setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + self.register_parameter('layer_norm_bias', layer_norm_bias) + setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition else: self.layer_norm_bias = None - self.reset_layer_norm_parameters() - temp_weight = torch.empty( + self.weight_tensor = torch.empty( self.out_features, self.in_features, device=device, dtype=params_dtype) - initialize_affine_weight_gpu( - temp_weight, - init_method, - get_rng_state_tracker, - partition_dim=1 if self.parallel_mode == "row" else 0, - stride=1, - ) - - if self.primary_weights_in_fp8: - self.init_fp8_metadata() - self.fp8_meta["update_amax_and_scale_fwd"] = True - - self.weight_tensor = Float8Tensor.to_float8( - temp_weight, - fp8_meta=self.fp8_meta, - fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, - ) - else: - self.weight_tensor = temp_weight - if self.use_bias: self.bias_tensor = torch.empty( self.out_features, @@ -794,9 +776,6 @@ def __init__( else: self.bias_tensor = torch.Tensor().to(dtype=params_dtype, device=device) - with torch.no_grad(): - self.bias_tensor.zero_() - # Configure parameter splits self.weight_names = [] self.bias_names = [] @@ -861,7 +840,10 @@ def __init__( if is_subview: weight = weight[split_start:split_end] weight = torch.nn.Parameter(weight) - self.register_parameter(self.weight_names[i], weight) + self.register_parameter(self.weight_names[i], weight, + init_fn=init_method, + get_rng_state_tracker=get_rng_state_tracker, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT) # Construct bias parameter if needed if self.use_bias: @@ -892,8 +874,13 @@ def __init__( del self.weight_tensor del self.bias_tensor - self.fp8_weight_shapes.append(torch.Size((self.out_features, self.in_features))) + 
if self.primary_weights_in_fp8: + self.init_fp8_metadata() + self.fp8_meta["update_amax_and_scale_fwd"] = True + + self.reset_parameters(defer_init=(device == 'meta')) + self.fp8_weight_shapes.append(torch.Size((self.out_features, self.in_features))) # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM @@ -911,6 +898,12 @@ def __init__( def reset_layer_norm_parameters(self) -> None: """Init LN params""" + warnings.warn( + ("This method will be deprecated in an upcoming release. " + "Update your code to use LayerNormLinear.reset_parameters() instead."), + DeprecationWarning, + stacklevel=2 + ) if not self.zero_centered_gamma: init.ones_(self.layer_norm_weight) else: diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index e5e884cd22..8f88d725ad 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -30,6 +30,7 @@ from ..utils import ( divide, get_default_init_method, + init_method_constant, cast_if_needed, assert_dim_for_fp8_exec, clear_tensor_data, @@ -38,7 +39,6 @@ set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, - initialize_affine_weight_gpu, reduce_scatter_along_first_dim, gather_along_first_dim, ) @@ -1170,91 +1170,76 @@ def __init__( # LN init self.eps = eps - self.layer_norm_weight = Parameter( + layer_norm_weight = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) ) + self.register_parameter('layer_norm_weight', layer_norm_weight, + init_fn=init_method_constant(float(not self.zero_centered_gamma))) setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) if self.normalization != "RMSNorm": - self.layer_norm_bias = Parameter( + layer_norm_bias = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) ) - setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + 
self.register_parameter('layer_norm_bias', layer_norm_bias) + setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition else: self.layer_norm_bias = None - self.reset_layer_norm_parameters() + # FC1 init if self.activation in ['reglu', 'geglu', 'swiglu']: fc1_output_features = 2 * self.size_per_partition else: fc1_output_features = self.size_per_partition - # FC1 init - fc1_temp_weight = torch.empty( - fc1_output_features, hidden_size, device=device, dtype=params_dtype) - - initialize_affine_weight_gpu( - fc1_temp_weight, - init_method, - get_rng_state_tracker, - set_tp_attributes=False, - ) - if self.primary_weights_in_fp8: - self.init_fp8_metadata(num_gemms=2) - self.fp8_meta["update_amax_and_scale_fwd"] = True - - fc1_temp_weight = Float8Tensor.to_float8( - fc1_temp_weight, - fp8_meta=self.fp8_meta, - fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, + fc1_weight = Parameter( + torch.empty( + fc1_output_features, hidden_size, device=device, dtype=params_dtype ) - - self.fc1_weight = Parameter(fc1_temp_weight) + ) + self.register_parameter('fc1_weight', fc1_weight, + init_fn=init_method, + get_rng_state_tracker=get_rng_state_tracker, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT) set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) self.fp8_weight_shapes.append(self.fc1_weight.shape) if self.use_bias: - self.fc1_bias = Parameter( + fc1_bias = Parameter( torch.empty(fc1_output_features, device=device, dtype=params_dtype) ) - set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) + self.register_parameter('fc1_bias', fc1_bias) + set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) # pylint: disable=access-member-before-definition else: self.fc1_bias = torch.Tensor().to(dtype=params_dtype, device=device) - with torch.no_grad(): - self.fc1_bias.zero_() - # FC2 init - fc2_temp_weight = torch.empty( - hidden_size, self.size_per_partition, device=device, 
dtype=params_dtype) - - initialize_affine_weight_gpu( - fc2_temp_weight, - output_layer_init_method, - get_rng_state_tracker, - set_tp_attributes=False, + fc2_weight = Parameter( + torch.empty(hidden_size, self.size_per_partition, device=device, dtype=params_dtype) ) - - if self.primary_weights_in_fp8: - fc2_temp_weight = Float8Tensor.to_float8( - fc2_temp_weight, - fp8_meta=self.fp8_meta, - fp8_meta_index=tex.FP8FwdTensors.GEMM2_WEIGHT, - ) - - self.fc2_weight = Parameter(fc2_temp_weight) + self.register_parameter('fc2_weight', fc2_weight, + init_fn=output_layer_init_method, + get_rng_state_tracker=get_rng_state_tracker, + fp8_meta_index=tex.FP8FwdTensors.GEMM2_WEIGHT) set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) self.fp8_weight_shapes.append(self.fc2_weight.shape) if self.use_bias: - self.fc2_bias = Parameter( + fc2_bias = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) ) + self.register_parameter('fc2_bias', fc2_bias) # RPL if self.set_parallel_mode: - setattr(self.fc2_bias, "sequence_parallel", sequence_parallel) + setattr(self.fc2_bias, "sequence_parallel", sequence_parallel) # pylint: disable=access-member-before-definition else: self.fc2_bias = torch.Tensor().to(dtype=params_dtype, device=device) + if self.primary_weights_in_fp8: + self.init_fp8_metadata(num_gemms=2) + self.fp8_meta["update_amax_and_scale_fwd"] = True + + self.reset_parameters(defer_init=(device == 'meta')) + # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM if self.set_parallel_mode and self.apply_bias: @@ -1262,9 +1247,6 @@ def __init__( else: self.gemm_bias_unfused_add = False - with torch.no_grad(): - self.fc2_bias.zero_() - if self.bias_gelu_nvfusion: set_jit_fusion_options() if seq_length and micro_batch_size: @@ -1281,6 +1263,12 @@ def __init__( def reset_layer_norm_parameters(self) -> None: """Init LN params""" + warnings.warn( + ("This method will be deprecated in an upcoming release. 
" + "Update your code to use LayerNormMLP.reset_parameters() instead."), + DeprecationWarning, + stacklevel=2 + ) if not self.zero_centered_gamma: init.ones_(self.layer_norm_weight) else: diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 2a28d67292..2cad516881 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -23,7 +23,6 @@ from ..fp8 import get_fp8_te_dtype, FP8GlobalStateManager from ..utils import ( divide, - get_default_init_method, cast_if_needed, assert_dim_for_fp8_exec, clear_tensor_data, @@ -32,7 +31,6 @@ set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, - initialize_affine_weight_gpu, reduce_scatter_along_first_dim, gather_along_first_dim, ) @@ -82,7 +80,7 @@ def forward( ub_split_ag: bool, ub_atomic_gemm_rs: bool, ub_atomic_gemm_ag: bool, - ub_name: str, + ub_name: str ) -> torch.Tensor: # Make sure input dimensions are compatible in_features = weight.shape[-1] @@ -625,6 +623,10 @@ def __init__( if any([ub_atomic_gemm_rs, ub_atomic_gemm_ag]): assert ub_name is not None, "Userbuffer name [string] is not set." self.ub_name = ub_name + self.get_rng_state_tracker = get_rng_state_tracker + if device == 'meta': + assert parameters_split is None, ("Cannot split module parameters " + "on 'meta' device.") if ub_split_rs or ub_split_ag or ub_atomic_gemm_rs: assert ( @@ -655,44 +657,17 @@ def __init__( elif self.parallel_mode == "row": self.in_features = divide(self.in_features, self.tp_size) - if init_method is None: - init_method = get_default_init_method() - self.sequence_parallel = (self.tp_size > 1) and sequence_parallel - temp_weight = torch.empty( + self.weight_tensor = torch.empty( self.out_features, self.in_features, device=device, dtype=params_dtype) - # TODO(ksivaman): This functionality works with FP8 outside TE. 
- initialize_affine_weight_gpu( - temp_weight, - init_method, - get_rng_state_tracker, - partition_dim=1 if self.parallel_mode == "row" else 0, - stride=1, - ) - - if self.primary_weights_in_fp8: - self.init_fp8_metadata() - self.fp8_meta["update_amax_and_scale_fwd"] = True - - self.weight_tensor = Float8Tensor.to_float8( - temp_weight, - fp8_meta=self.fp8_meta, - fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT, - ) - else: - self.weight_tensor = temp_weight - if self.use_bias: self.bias_tensor = torch.empty(self.out_features, device=device, dtype=params_dtype) else: self.bias_tensor = torch.Tensor().to(dtype=params_dtype, device=device) - with torch.no_grad(): - self.bias_tensor.zero_() - # Configure parameter splits self.weight_names = [] self.bias_names = [] @@ -757,7 +732,10 @@ def __init__( if is_subview: weight = weight[split_start:split_end] weight = torch.nn.Parameter(weight) - self.register_parameter(self.weight_names[i], weight) + self.register_parameter(self.weight_names[i], weight, + init_fn=init_method, + get_rng_state_tracker=get_rng_state_tracker, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT) # Construct bias parameter if needed if self.use_bias: @@ -788,6 +766,12 @@ def __init__( del self.weight_tensor del self.bias_tensor + if self.primary_weights_in_fp8: + self.init_fp8_metadata() + self.fp8_meta["update_amax_and_scale_fwd"] = True + + self.reset_parameters(defer_init=(device == 'meta')) + self.fp8_weight_shapes.append(torch.Size((self.out_features, self.in_features))) # For RPL, bias has to be added after TP collectives diff --git a/transformer_engine/pytorch/module/rmsnorm.py b/transformer_engine/pytorch/module/rmsnorm.py index 8da16d1c38..cad357de04 100644 --- a/transformer_engine/pytorch/module/rmsnorm.py +++ b/transformer_engine/pytorch/module/rmsnorm.py @@ -4,6 +4,7 @@ """RMSNorm API""" import os +import warnings from typing import Union, Tuple, Optional import torch @@ -141,7 +142,8 @@ def __init__( ) ) setattr(self.weight, 
"sequence_parallel", sequence_parallel) - self.reset_rms_norm_parameters() + + self.reset_parameters(defer_init=(device == 'meta')) # These many SMs are subtracted from the total SM count when calling forward # and backward RMSNorm C APIs. These envvars can be used to prevent the LN @@ -152,11 +154,22 @@ def __init__( def reset_rms_norm_parameters(self) -> None: """Init RMSNorm params""" + warnings.warn( + ("This method will be deprecated in an upcoming release. " + "Update your code to use RMSNorm.reset_parameters() instead."), + DeprecationWarning, + stacklevel=2 + ) if not self.zero_centered_gamma: init.ones_(self.weight) else: init.zeros_(self.weight) + def reset_parameters(self, defer_init=False) -> None: + """Reset RMSNorm parameters""" + if defer_init: + return + init.constant_(self.weight, float(not self.zero_centered_gamma)) @no_torch_dynamo() def forward(self, inp: torch.Tensor) -> torch.Tensor: diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 6250c07d60..819b3d4827 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -40,6 +40,21 @@ def get_default_init_method() -> Callable: return init_method_normal(0.023) +def init_method_constant(val: float) -> Callable: + """Init method to set all tensor elements to a constant value.""" + if val == 1.0: + def init_(tensor: torch.Tensor) -> Callable: + return torch.nn.init.ones_(tensor) + elif val == 0.0: + def init_(tensor: torch.Tensor) -> Callable: + return torch.nn.init.zeros_(tensor) + else: + def init_(tensor: torch.Tensor) -> Callable: + return torch.nn.init.constant_(tensor, val) + + return init_ + + def init_method_normal(sigma: float) -> Callable: """Init method based on N(0, sigma).""" From f6dd3fff261cf8b22d59ed952adf1a77ffcbfa60 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Thu, 18 Jan 2024 13:41:24 -0800 Subject: [PATCH 073/427] make TransformerLayer accept a `bshd` or `sbhd` tensor format (#557) * make 
TransformerLayer accept a `bshd` or `sbhd` tensor format Signed-off-by: Sudhakar Singh * Fixes from feedback Signed-off-by: Sudhakar Singh * more feedback fixes Signed-off-by: Sudhakar Singh * remove incorrect info from docstring Signed-off-by: Sudhakar Singh * fix from feedback Signed-off-by: Sudhakar Singh --------- Signed-off-by: Sudhakar Singh --- tests/pytorch/fused_attn/test_fused_attn.py | 11 ++- tests/pytorch/test_numerics.py | 77 +++++++++++++++++++++ transformer_engine/pytorch/attention.py | 46 ++++++++++-- transformer_engine/pytorch/transformer.py | 12 ++++ 4 files changed, 137 insertions(+), 9 deletions(-) diff --git a/tests/pytorch/fused_attn/test_fused_attn.py b/tests/pytorch/fused_attn/test_fused_attn.py index 3f8962504b..296d9ff214 100644 --- a/tests/pytorch/fused_attn/test_fused_attn.py +++ b/tests/pytorch/fused_attn/test_fused_attn.py @@ -666,10 +666,10 @@ def test_transformer_layer(dtype, model_configs, model, ckpt_attn, qkv_format, f @pytest.mark.parametrize("dtype", param_types_lean) @pytest.mark.parametrize("model_configs", [model_configs_te_layer]) @pytest.mark.parametrize("model", ["te_1_2", "te_2_0"]) -def test_te_layer_misc(dtype, model_configs, model): +@pytest.mark.parametrize("qkv_format", ["bshd", "sbhd"]) +def test_te_layer_misc(dtype, model_configs, model, qkv_format): """Test TransformerLayer module with miscellanous settings""" ckpt_attn = True - qkv_format = "bshd" fused_qkv_params = True RoPE = True test_transformer_layer(dtype, model_configs, model, @@ -705,7 +705,7 @@ def _run_transformer_layer( config: ModelConfig, backend: str, ckpt_attn: bool, - qkv_layout: str, + qkv_format: str, workspace_opt: bool, fused_qkv_params: bool, RoPE: bool, @@ -724,6 +724,10 @@ def _run_transformer_layer( # Create input tensor inp = torch.randn(config.max_seqlen_q, config.batch_size, config.hidden_size, dtype=dtype, device="cuda", requires_grad = True) + # In case the format to be tested is batch-first, need to transpose the + # input tensor. 
+ if qkv_format == "bshd": + inp = inp.transpose(0,1) # Create seqlens if "padding" in config.attn_mask_type: @@ -815,6 +819,7 @@ def _run_transformer_layer( qkv_weight_interleaved=False, ub_tp_comm_overlap=False, bias=True, + attn_input_format=qkv_format, ) .to(dtype=dtype, device="cuda") ) diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index acc3cbeda3..de7c84695c 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -1197,3 +1197,80 @@ def test_gpt_fp8_parameters(dtype, bs, model): outputs = _test_gpt_fp8_parameters(bs, dtype, config, False) outputs_fp8_params = _test_gpt_fp8_parameters(bs, dtype, config, True) assert_all_equal(outputs, outputs_fp8_params) + +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model", model_configs.keys()) +def test_transformer_layer_hidden_states_format(dtype, bs, model): + config = model_configs[model] + + sigma = 0.023 + init_method = init_method_normal(sigma) + output_layer_init_method = scaled_init_method_normal(sigma, config.num_layers) + + # Set `torch.manual_seed` to make sure the weights are identical to the + # other layer. Set `*dropout` values to 0 to make sure the forward pass + # is identical to the other layer. + torch.manual_seed(0) + block_sbhd = ( + TransformerLayer( + config.hidden_size, + 4 * config.hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.eps, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + hidden_dropout=0, + attention_dropout=0, + kv_channels=config.embed, + apply_residual_connection_post_layernorm=False, + output_layernorm=False, + hidden_states_format="sbhd" + ) + .to(dtype=dtype) + .cuda() + ) + + # Set `torch.manual_seed` to make sure the weights are identical to the + # other layer. Set `*dropout` values to 0 to make sure the forward pass + # is identical to the other layer. 
+ torch.manual_seed(0) + block_bshd = ( + TransformerLayer( + config.hidden_size, + 4 * config.hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.eps, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + hidden_dropout=0, + attention_dropout=0, + kv_channels=config.embed, + apply_residual_connection_post_layernorm=False, + output_layernorm=False, + hidden_states_format="bshd" + ) + .to(dtype=dtype) + .cuda() + ) + + for (n1, p1), (n2, p2) in zip(block_bshd.named_parameters(), block_sbhd.named_parameters()): + assert torch.all(torch.eq(p1, p2)), f"{n1}, {n2} not identical" + + x_sbhd = torch.randn( + config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True + ).to(dtype).cuda() + + x_bshd = x_sbhd.transpose(0,1).contiguous() + + # To make sure forward is also identical (just in case some module decides + # to act fancy) + torch.manual_seed(0) + y_sbhd = block_sbhd(x_sbhd) + + # To make sure forward is also identical (just in case some module decides + # to act fancy) + torch.manual_seed(0) + y_bshd = block_bshd(x_bshd) + + assert_all_equal([y_bshd], [y_sbhd.transpose(0,1).contiguous()]) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 750bc0403c..9316b32864 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1034,11 +1034,34 @@ def _rotate_half(x: torch.Tensor) -> torch.Tensor: return torch.cat((-x2, x1), dim=-1) -def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: +def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor, tensor_format: str = "sbhd") -> torch.Tensor: """ - input tensor t is of shape [seq_length, ..., dim] - rotary positional embeding tensor `freqs` is of shape [seq_length, ..., dim] + Parameters + ---------- + t: torch.Tensor + input tensor on which rotary positional embedding will be applied + freqs: torch.Tensor + rotary positional embeding tensor 
`freqs` is of shape + `[seq_length, ..., dim]` + tensor_format: {'sbhd', 'bshd'}, default = 'sbhd' + is `bshd` if `t` is of shape `[bs, seq, ...]`, or `sbhd` if `t` is + of shape `[seq, bs, ...]`. + """ + assert tensor_format in ("sbhd", "bshd"),("Only formats `sbhd` or `bshd` " + "are supported for input tensor " + "`t`.") + max_seq_len = freqs.shape[0] + cur_seq_len = t.shape[1] if tensor_format == "bshd" else t.shape[0] + + # Only apply the rotary embeddings up to the sequence length of the running + # input. + assert cur_seq_len <= max_seq_len, (f"Rotary Embeddings only supported " + "upto {max_seq_len} sequence length!") + freqs = freqs[:cur_seq_len].to(t.dtype) + if tensor_format == "bshd": + freqs = freqs.transpose(0,1) # [seq, 1, 1, dim] -> [1, seq, 1, dim] + rot_dim = freqs.shape[-1] # ideally t_pass is empty so rotary pos embedding is applied to all tensor t t, t_pass = t[..., :rot_dim], t[..., rot_dim:] @@ -2821,6 +2844,14 @@ class MultiheadAttention(torch.nn.Module): The device on which the parameters of the model will allocated. It is the user's responsibility to ensure all parameters are moved to the GPU before running the forward pass. + qkv_format: str, default = `sbhd` + dimension format for `query_layer`, `key_layer` and `value_layer`, + {`sbhd`, `bshd`}. `s` stands for the sequence length, `b` batch size, + `h` the number of heads and `d` head size. `sbhd` and `bshd` formats + are used for when sequences in a batch are of equal length or padded to + equal length. Please note that these formats do not reflect how + tensors `query_layer`, `key_layer`, `value_layer` are laid out in memory. + For that, please use `_get_qkv_layout` to gain the layout information. 
Parallelism parameters ---------------------- @@ -2899,9 +2930,11 @@ def __init__( bias: bool = True, normalization: str = "LayerNorm", device: Union[torch.device, str] = "cuda", + qkv_format: str = "sbhd", ) -> None: super().__init__() + self.qkv_format = qkv_format self.attn_mask_type = attn_mask_type self.window_size = window_size self.window_size = check_set_window_size(attn_mask_type, self.window_size) @@ -3045,6 +3078,7 @@ def __init__( kv_channels, num_gqa_groups=self.num_gqa_groups, attention_dropout=attention_dropout, + qkv_format=self.qkv_format, tp_size=tp_size, get_rng_state_tracker=get_rng_state_tracker, sequence_parallel=sequence_parallel, @@ -3398,14 +3432,14 @@ def forward( # apply relative positional encoding (rotary embedding) if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb - query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb) + query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb, self.qkv_format) + key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb, self.qkv_format) context_layer = self.core_attention( query_layer, key_layer, value_layer, - qkv_format='sbhd', + qkv_format=self.qkv_format, cu_seqlens_q=None, cu_seqlens_kv=None, attention_mask=attention_mask, diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index f1c6194d29..addaf31689 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -168,6 +168,14 @@ class TransformerLayer(torch.nn.Module): The device on which the parameters of the model will allocated. It is the user's responsibility to ensure all parameters are moved to the GPU before running the forward pass. + attn_input_format: {'sbhd', 'bshd'}, default = 'sbhd' + This controls whether the dimensions of the + intermediate hidden states is 'batch first' ('bshd') or + 'sequence first' ('sbhd'). 
`s` stands for the sequence + length, `b` batch size, `h` the number of heads, `d` + head size. Note that these formats are very closely + related to the `qkv_format` in the `MultiHeadAttention` + and `DotProductAttention` modules. Parallelism parameters ---------------------- @@ -253,6 +261,7 @@ def __init__( activation: str = 'gelu', normalization: str = "LayerNorm", device: Union[torch.device, str] = "cuda", + attn_input_format: str = "sbhd", ) -> None: super().__init__() @@ -331,6 +340,8 @@ def __init__( self.get_rng_state_tracker = get_rng_state_tracker + self.attn_input_format = attn_input_format + attention_args = ( hidden_size, num_attention_heads, @@ -360,6 +371,7 @@ def __init__( "ub_split_rs" : ub_split_rs, "ub_atomic_gemm_rs" : ub_atomic_gemm_rs, "ub_atomic_gemm_ag" : ub_atomic_gemm_ag, + "qkv_format" : self.attn_input_format, } self.self_attention = MultiheadAttention( From b25611bd4ad36706552cdfb7c4798879e5eb0a5b Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Fri, 19 Jan 2024 23:29:30 -0800 Subject: [PATCH 074/427] Fix failing CI due to PR #557 merge (#616) fix failing tests due to PR #557 Signed-off-by: Sudhakar Singh Co-authored-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> --- tests/pytorch/test_numerics.py | 4 ++-- transformer_engine/pytorch/attention.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index de7c84695c..215cae2b97 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -1225,7 +1225,7 @@ def test_transformer_layer_hidden_states_format(dtype, bs, model): kv_channels=config.embed, apply_residual_connection_post_layernorm=False, output_layernorm=False, - hidden_states_format="sbhd" + attn_input_format="sbhd" ) .to(dtype=dtype) .cuda() @@ -1248,7 +1248,7 @@ def test_transformer_layer_hidden_states_format(dtype, bs, model): kv_channels=config.embed, apply_residual_connection_post_layernorm=False, 
output_layernorm=False, - hidden_states_format="bshd" + attn_input_format="bshd" ) .to(dtype=dtype) .cuda() diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 9316b32864..cf7bee8c66 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1034,7 +1034,11 @@ def _rotate_half(x: torch.Tensor) -> torch.Tensor: return torch.cat((-x2, x1), dim=-1) -def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor, tensor_format: str = "sbhd") -> torch.Tensor: +def apply_rotary_pos_emb( + t: torch.Tensor, + freqs: torch.Tensor, + tensor_format: str = "sbhd" + ) -> torch.Tensor: """ Parameters ---------- @@ -1056,8 +1060,10 @@ def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor, tensor_format: st # Only apply the rotary embeddings up to the sequence length of the running # input. - assert cur_seq_len <= max_seq_len, (f"Rotary Embeddings only supported " - "upto {max_seq_len} sequence length!") + if cur_seq_len > max_seq_len: + raise Exception(f"Rotary Embeddings only supported upto {max_seq_len} " + "sequence length!") + freqs = freqs[:cur_seq_len].to(t.dtype) if tensor_format == "bshd": freqs = freqs.transpose(0,1) # [seq, 1, 1, dim] -> [1, seq, 1, dim] From c6f0a1f555ab315493032b0a77b0985654d42964 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Sun, 21 Jan 2024 01:47:13 -0800 Subject: [PATCH 075/427] Activation offloading to CPU's for the Linear, Layernorm Linear and the Layernorm MLP modules (#571) * Added support activation offloading to CPU's Signed-off-by: Selvaraj Anandaraj * Moving CPU offloading library to TE Signed-off-by: Selvaraj Anandaraj * Restructured code, added switch to choose between weight/activation offloading Signed-off-by: Selvaraj Anandaraj * Removed arg during constructor Signed-off-by: Selvaraj Anandaraj * Fix nit-pick errors Signed-off-by: Selvaraj Anandaraj * Documentation fixes Signed-off-by: Przemek Tredak * Fix to the code block in 
docs Signed-off-by: Przemek Tredak * Added offloading unit test Signed-off-by: Selvaraj Anandaraj * Fixed formatting Signed-off-by: Selvaraj Anandaraj * wgrad fusion fix, minor errors and lint Signed-off-by: Kirthi Shankar Sivamani * Errors, test, lint Signed-off-by: Kirthi Shankar Sivamani * RM test file Signed-off-by: Kirthi Shankar Sivamani * Fixed stray PyT tensors in LayernormMLP getting offloaded Signed-off-by: Selvaraj Anandaraj * Fixed typi Signed-off-by: Selvaraj Anandaraj * Fix offloading for rmsnorm, rm test Signed-off-by: Kirthi Shankar Sivamani * Fix errors Signed-off-by: Kirthi Shankar Sivamani * Float8Tensor compatible offloading Signed-off-by: Kirthi Shankar Sivamani * Cleanup Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Selvaraj Anandaraj Signed-off-by: Przemek Tredak Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Selvaraj Anandaraj Co-authored-by: Przemyslaw Tredak Co-authored-by: Kirthi Shankar Sivamani --- docs/api/pytorch.rst | 2 + tests/pytorch/test_sanity.py | 24 +- transformer_engine/pytorch/__init__.py | 1 + transformer_engine/pytorch/cpu_offload.py | 506 ++++++++++++++++++ .../pytorch/module/layernorm_linear.py | 27 +- .../pytorch/module/layernorm_mlp.py | 38 +- transformer_engine/pytorch/module/linear.py | 26 +- 7 files changed, 615 insertions(+), 9 deletions(-) create mode 100644 transformer_engine/pytorch/cpu_offload.py diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst index 7c81c2f071..9b291e6d0a 100644 --- a/docs/api/pytorch.rst +++ b/docs/api/pytorch.rst @@ -40,3 +40,5 @@ pyTorch .. autoapifunction:: transformer_engine.pytorch.checkpoint .. autoapifunction:: transformer_engine.pytorch.onnx_export + +.. 
autoapifunction:: transformer_engine.pytorch.get_cpu_offload_context diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index f1e172b36b..593231d6d1 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from typing import Optional +from contextlib import nullcontext import torch import pytest @@ -20,6 +21,7 @@ TransformerLayer, RMSNorm, LayerNorm, + get_cpu_offload_context, ) from transformer_engine.common import recipe @@ -215,7 +217,7 @@ def _test_sanity_e2e_gradient_accumulation_fusion(block, dtype, config, fp8_reci assert torch.count_nonzero(p.main_grad) > 0, "Gradient not accumulated." -def _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad): +def _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad, cpu_offload): te_inp_hidden_states = torch.randn( config.seq_len, config.batch_size, config.hidden_size, dtype=dtype, requires_grad=True ).cuda() @@ -223,9 +225,16 @@ def _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad): if skip_wgrad: _disable_wgrads(block) + if cpu_offload: + offload_context, sync_function = get_cpu_offload_context(enabled=True) + else: + offload_context = nullcontext() + sync_function = lambda x: x + use_fp8 = fp8_recipe is not None - with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe): + with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe), offload_context: te_out = block(te_inp_hidden_states) + te_out = sync_function(te_out) loss = te_out.sum() loss.backward() torch.cuda.synchronize() @@ -449,9 +458,11 @@ def test_sanity_layernorm_mlp(dtype, fp8_recipe, model, skip_wgrad, @pytest.mark.parametrize("activation", all_activations) @pytest.mark.parametrize("normalization", all_normalizations) @pytest.mark.parametrize("parallel_attention_mlp", all_boolean) +@pytest.mark.parametrize("cpu_offload", all_boolean) def test_sanity_gpt(dtype, fp8_recipe, model, skip_wgrad, zero_centered_gamma, bias, activation, - 
normalization, parallel_attention_mlp): + normalization, parallel_attention_mlp, + cpu_offload): config = model_configs[model] if fp8_recipe is not None: @@ -489,7 +500,7 @@ def test_sanity_gpt(dtype, fp8_recipe, model, skip_wgrad, .cuda() ) - _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad) + _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad, cpu_offload) def test_sanity_gpt_126m(): @@ -512,6 +523,7 @@ def test_sanity_gpt_126m(): activation="gelu", normalization="LayerNorm", parallel_attention_mlp=False, + cpu_offload=False, ) @@ -713,7 +725,7 @@ def test_sanity_drop_path(dtype, fp8_recipe, model, skip_wgrad): .cuda() ) - _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad) + _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad, False) @pytest.mark.parametrize("dtype", param_types) @@ -751,7 +763,7 @@ def test_sanity_fused_qkv_params(dtype, fp8_recipe, model, skip_wgrad): .cuda() ) - _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad) + _test_sanity_e2e(block, dtype, config, fp8_recipe, skip_wgrad, False) @pytest.mark.parametrize("dtype", param_types) diff --git a/transformer_engine/pytorch/__init__.py b/transformer_engine/pytorch/__init__.py index 43ad38e108..16bd128734 100644 --- a/transformer_engine/pytorch/__init__.py +++ b/transformer_engine/pytorch/__init__.py @@ -17,6 +17,7 @@ from .export import onnx_export from .distributed import checkpoint from .distributed import CudaRNGStatesTracker +from .cpu_offload import get_cpu_offload_context # Register custom op symbolic ONNX functions from .te_onnx_extensions import ( onnx_cast_to_fp8, diff --git a/transformer_engine/pytorch/cpu_offload.py b/transformer_engine/pytorch/cpu_offload.py new file mode 100644 index 0000000000..dcede62ef7 --- /dev/null +++ b/transformer_engine/pytorch/cpu_offload.py @@ -0,0 +1,506 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +"""Functionality for CPU offloading of tensors saved for backward pass.""" +from typing import Any +from contextlib import nullcontext +import torch + +from .float8_tensor import Float8Tensor + +__all__ = ['get_cpu_offload_context'] + +CPUOffloadEnabled = False + + +class CpuOffloadSavedTensorHook: + """Contex-manager that executes a pair of pack/unpack hooks for saved tensors. + + In this context, the ``on_save_for_backward`` method will be called every time + a tensor is saved for backward (this includes intermediary results saved using + :func:`~torch.autograd.function._ContextMethodMixin.save_for_backward` but + also those recorded by a PyTorch-defined operation). + + The ``on_get_saved_tensors`` method will be called when the backward function + of this op attempts to retrieve the saved tensor from context (this includes + :func: `torch.Tensor.backward()` or :func: `torch.autograd.grad()`. It takes the + as input the return value of the ``on_save_for_backward``, and is meant to return + an identical copy of the tensor being saved by ``on_save_for_backward`` in terms of + size, device and element values. + + Example: + + >>> import torch + >>> from typing import Any + >>> + >>> class DummyHook(CpuOffloadSavedTensorHook): + ... + ... def on_save_for_backward(self, tensor: torch.Tensor) -> Any: + ... logging.info("On save", tensor) + ... return (tensor,) + ... + ... def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: + ... logging.info("On get", saved_state) + ... tensor, = saved_state + ... return tensor + ... + >>> a = torch.ones(5, requires_grad=True) + >>> b = torch.ones(5, requires_grad=True) * 2 + >>> with DummyHook(): + ... y = a * b + ... 
+ On save tensor([1., 1., 1., 1., 1.], requires_grad=True) + On save tensor([2., 2., 2., 2., 2.], grad_fn=) + >>> y.sum().backward() + On get (tensor([1., 1., 1., 1., 1.], requires_grad=True),) + On get (tensor([2., 2., 2., 2., 2.], grad_fn=),) + + """ + + def __init__(self) -> None: + self.inside_context = False + + def __enter__(self): + global CPUOffloadEnabled + CPUOffloadEnabled = True + + self.inside_context = True + torch._C._autograd._push_saved_tensors_default_hooks( + self.on_save_for_backward, + self.on_get_saved_tensor + ) + + def __exit__(self, *args: Any): + global CPUOffloadEnabled + CPUOffloadEnabled = False + + self.inside_context = False + torch._C._autograd._pop_saved_tensors_default_hooks() + + + def on_save_for_backward(self, tensor: torch.Tensor) -> Any: + """On save for backward.""" + raise NotImplementedError("`on_save_for_backward: Callable[[torch.Tensor], Any]`" + "is not implemented in CpuOffloadHook class. Inherit " + "this class and implement your custom hooks") + + def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: + """On get saved tensor.""" + raise NotImplementedError("`on_get_saved_tensors: Callable[[Any], torch.Tensor]`" + "is not implemented in CpuOffloadHook class. Inherit " + "this class and implement your custom hooks") + + +class CpuOffloadHookWithOffloadHandler(CpuOffloadSavedTensorHook): + """Context-manager that offloads/recovers tensors through an offload hander. + + The hook just offloads/recovers the tensor object to the handler through `tensor_push` + and `tensor_pop` interface. How the offload-handler manages the offloading, recovering + or prefetching timing is transparent to this hook. 
+ """ + def __init__(self, offload_handler, handler_extra_kwargs={}, debug=False) -> None: # pylint: disable=dangerous-default-value + self.debug = debug + self.offload_handler = offload_handler + self.handler_extra_kwargs = handler_extra_kwargs + super().__init__() + + def on_save_for_backward(self, tensor: torch.Tensor) -> Any: + retrieve_identifier = self.offload_handler.tensor_push( + tensor, + **self.handler_extra_kwargs + ) + return retrieve_identifier + + def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: + tensor = self.offload_handler.tensor_pop( + saved_state, + **self.handler_extra_kwargs + ) + return tensor + + +class OffloadHandler: + """A base class for CPU offload-handler.""" + def __init__(self) -> None: + pass + + def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any: + """Tensor push.""" + raise NotImplementedError("`tensor_push is not implented in OffloadHandler class. " + "Inherit this class and implement your custom tensor_push.") + + def tensor_pop(self, tensor_tag: Any, **kwargs): + """Tensor pop.""" + raise NotImplementedError("`tensor_pop is not implented in OffloadHandler class. " + "Inherit this class and implement your custom tensor_pop.") + + +class GroupCommitFunction(torch.autograd.Function): + """this is a dummy op with output identical to input. + However, it is necessary for marking a timepoint for offload handler to + accomplish all synchronizations. Implementing it as a function is necessary + because we need to actions in both forward and backward. 
+ """ + @staticmethod + def forward(ctx, tensor, cpu_offload_handler): + cpu_offload_handler.on_group_commit_forward() + ctx.cpu_offload_handler = cpu_offload_handler + # return the identical tensor + return tensor + + @staticmethod + def backward(ctx, grad_output): + cpu_offload_handler = ctx.cpu_offload_handler + cpu_offload_handler.on_group_commit_backward() + return grad_output, None + + +group_prefetch_offload_commit = GroupCommitFunction.apply + + +class SynchronizedGroupOffloadHandler(OffloadHandler): + """Offload Handler that offloads/reloads in a synchronized way. + The device-to-host and host-to-device copying happen in the same stream + as the computation kernels, thus the copying will block computation. + """ + def __init__(self, + num_offload_group, + tensor_need_offloading_checker=(lambda _: True), + debug=False + ) -> None: + super().__init__() + + self.num_offload_group = num_offload_group + self.tensor_need_offloading_checker = tensor_need_offloading_checker + self.debug = debug + + self.groupid_reset() + + def groupid_reset(self): + """Groupid reset.""" + # Data structures to label saved tensors and book-keep their cpu copies. + # Currently, on push, create a new cpu tensor and copies; on pop, copies + # the tensor back to gpu and deletes the cpu tensor. 
+ # These will increment whenever `group_commit()` is invoked + self.current_group, self.tensor_count_current_group = (0, 0) + self.tensor_tag_to_state = {} + + def on_group_commit_forward(self): + """On group commit forward.""" + # finishing up with updating current group and tensor count + self.current_group += 1 # increment + self.tensor_count_current_group = 0 # reset + + def on_group_commit_backward(self): + """On group commit backward.""" + self.current_group -= 1 + assert self.current_group >= 0 + + @staticmethod + def offload(src_tensor, pin_memory=True): + """Offload.""" + fp8_offload = isinstance(src_tensor, Float8Tensor) + + cpu_backup = torch.empty( + src_tensor.size(), dtype=torch.uint8 if fp8_offload else src_tensor.dtype, + layout=src_tensor.layout, device="cpu", pin_memory=pin_memory) + + if fp8_offload: + cpu_backup = Float8Tensor.make_like(src_tensor, data=cpu_backup) + + cpu_backup.copy_(src_tensor, non_blocking=pin_memory) + state = (src_tensor.device, cpu_backup) + return state + + @staticmethod + def reload(state, non_blocking=None): + """Reload.""" + dev, cpu_backup = state + if non_blocking is None: + non_blocking = cpu_backup.is_pinned() + return cpu_backup.to(dev, non_blocking=non_blocking) + + def tensor_push(self, tensor: torch.Tensor, **kwargs): + """Tensor push.""" + # obtain a unique tensor tag + tensor_tag = (self.current_group, self.tensor_count_current_group) + self.tensor_count_current_group += 1 + assert tensor_tag not in self.tensor_tag_to_state + if (self.current_group < self.num_offload_group + and self.tensor_need_offloading_checker(tensor)): + state = SynchronizedGroupOffloadHandler.offload(tensor) + self.tensor_tag_to_state[tensor_tag] = state + else: + # will be offloaded together after group commit + self.tensor_tag_to_state[tensor_tag] = tensor + return tensor_tag + + def tensor_pop(self, tensor_tag, **kwargs): + """Tensor pop.""" + assert tensor_tag in self.tensor_tag_to_state + state = 
self.tensor_tag_to_state.pop(tensor_tag) + if isinstance(state, tuple): + tensor = SynchronizedGroupOffloadHandler.reload(state) + else: + tensor = state + return tensor + + +class AsyncDoubleBufferGroupOffloadHandler(SynchronizedGroupOffloadHandler): + """Compared to synchronize, this uses more memory because of the buffer but + achieves better performance due to the overlapping. D2h and h2d copying are + completely hidden behind computation if computation time of a layer is longer + than host-device communication time. Bulk offloading with delay and bulk reloading + with prefetch are implemented. """ + def __init__(self, + num_offload_group, # must be <= actual number of groups (number of commits) + num_prefetch_group=1, + tensor_need_offloading_checker=(lambda t: True), + debug=False + ) -> None: + super().__init__(num_offload_group=num_offload_group, + tensor_need_offloading_checker=tensor_need_offloading_checker, + debug=debug) + self.num_prefetch_group = num_prefetch_group + + # prepare for tensor buffer + self.tensor_id_to_tensor_buf_double_bufs = [] + for _ in range(2): + self.tensor_id_to_tensor_buf_double_bufs.append({}) + + # allocate streams and events for synchronization + self.d2h_stream = torch.cuda.Stream() + self.h2d_stream = torch.cuda.Stream() + self.h2d_finish_events = [] + self.compute_stream_bwd_start_events = [] + for _ in range(self.num_offload_group): + self.h2d_finish_events.append(torch.cuda.Event()) + self.compute_stream_bwd_start_events.append(torch.cuda.Event()) + self.d2h_final_event = torch.cuda.Event() + + def get_tensor_buf_for_offloaded_tensor(self, tensor, tensor_tag): + """Get tensor buffer for offloaded tensor.""" + group_id, tensor_id = tensor_tag + # obtain ping-pong buffer + id_buf_map = self.tensor_id_to_tensor_buf_double_bufs[(group_id % 2)] + + if not tensor_id in id_buf_map: + allocate_new_buf = True + else: + tensor_buf = id_buf_map[tensor_id] + if not (tensor_buf.size() == tensor.size() and tensor_buf.dtype == 
tensor.dtype): # pylint: disable=simplifiable-if-statement + allocate_new_buf = True + else: + allocate_new_buf = False # in this case, reuse the old buffer + + if allocate_new_buf: + # supposed to only execute once + fp8_offload = isinstance(tensor, Float8Tensor) + buffer = torch.empty( + tensor.size(), dtype=torch.uint8 if fp8_offload else tensor.dtype, + layout=tensor.layout, device=tensor.device) + + if isinstance(tensor, Float8Tensor): + id_buf_map[tensor_id] = Float8Tensor.make_like(tensor, data=buffer) + else: + id_buf_map[tensor_id] = buffer + + return id_buf_map[tensor_id] + + + def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any: + # obtain a unique tensor tag + tensor_tag = (self.current_group, self.tensor_count_current_group) + self.tensor_count_current_group += 1 + assert tensor_tag not in self.tensor_tag_to_state + + if (self.current_group < self.num_offload_group + and self.tensor_need_offloading_checker(tensor)): + # first copy the tensor to tensorbuf, so that the original tensor will not be deleted + tensor_buf = self.get_tensor_buf_for_offloaded_tensor(tensor, tensor_tag) + tensor_buf.copy_(tensor) + if hasattr(tensor,"weight_offloading"): + tensor_buf.weight_offloading = True + if hasattr(tensor,"activation_offloading"): + tensor_buf.activation_offloading = True + # Here we just save it, and at commit, bulk_offload_group will handle it + self.tensor_tag_to_state[tensor_tag] = tensor_buf + else: + self.tensor_tag_to_state[tensor_tag] = tensor + return tensor_tag + + def tensor_pop(self, tensor_tag, **kwargs): + """Tensor pop.""" + assert tensor_tag in self.tensor_tag_to_state + tensor = self.tensor_tag_to_state.pop(tensor_tag) + # the tensor should have been copied back in on_group_commit_backward() + # which invokes bulk_reload_group. 
+ assert not isinstance(tensor, tuple) + return tensor + + def bulk_offload_group(self, group_to_offload): + """Bulk offload group.""" + with torch.cuda.stream(self.d2h_stream): + for tensor_tag, state in self.tensor_tag_to_state.items(): + group_id, _ = tensor_tag + if group_id == group_to_offload: + assert not isinstance(state, tuple) + tensor_on_device = state + + # if offload, return the reference to cpu copy + if self.tensor_need_offloading_checker(tensor_on_device): + state = SynchronizedGroupOffloadHandler.offload(tensor_on_device) + self.tensor_tag_to_state[tensor_tag] = state + + def synchronize_on_group_commit_forward(self, current_group): + """Synchronize on group commit forward.""" + # the host should wait for the copying of previous group + # to avoid overwriting buffer + previous_group = current_group - 1 + if previous_group < self.num_offload_group: + torch.cuda.synchronize() + # TODO (guyueh): this part is originally designed to reduce the peak memory usage. # pylint: disable=fixme + # however, uncommenting this part will cause illegal access, have not figured out why. 
+ + if previous_group + 2 >= self.num_offload_group: + # this buffer is no longer required + self.tensor_id_to_tensor_buf_double_bufs[(previous_group % 2)] = {} + + # the copying of this group should wait for the computation stream event + if current_group < self.num_offload_group: + # perform bulk offloading + self.bulk_offload_group(current_group) + if current_group == self.num_offload_group - 1: + self.d2h_stream.record_event(self.d2h_final_event) + + def on_group_commit_forward(self): + """This function will cause host device synchronization""" + # handle synchronization events + self.synchronize_on_group_commit_forward(self.current_group) + + # during forward, the next_group_to_fetch always points to the min of + # the last commited group, and the last offloaded group + self.next_group_to_fetch = min(self.current_group, self.num_offload_group -1) + + super().on_group_commit_forward() + + def bulk_reload_group(self, group_to_reload): + """Bulk reload group.""" + assert group_to_reload < self.num_offload_group + if group_to_reload == self.num_offload_group - 1: + self.h2d_stream.wait_event(self.d2h_final_event) + with torch.cuda.stream(self.h2d_stream): + # move back tensors + for tensor_label, state in self.tensor_tag_to_state.items(): + group_id, _ = tensor_label + if group_id == group_to_reload: + if isinstance(state, tuple): + recovered_tensor = SynchronizedGroupOffloadHandler.reload(state) + self.tensor_tag_to_state[tensor_label] = recovered_tensor + + def on_group_commit_backward(self): + # first decrement the current group. + # after last commit in forward, the group will +1; in backward it -1. + # Finally it should be decremented to 0. 
+ self.current_group -= 1 + assert self.current_group >= 0 + + # decide the range of group to prefetch + should_prefetch_until_group = self.current_group - self.num_prefetch_group + should_prefetch_until_group = max(should_prefetch_until_group, 0) + + # do prefetch + for group_num_to_prefetch in range( + self.next_group_to_fetch, should_prefetch_until_group - 1, -1 + ): + # record the event in the compute stream, for h2d to wait + torch.cuda.current_stream().record_event( + self.compute_stream_bwd_start_events[group_num_to_prefetch]) + + # start of h2d should wait for the compute and the d2h + self.h2d_stream.wait_event(self.compute_stream_bwd_start_events[group_num_to_prefetch]) + + #recover tensors (copy back from host) + self.bulk_reload_group(group_num_to_prefetch) + + # record an event for the backward of this layer to wait + self.h2d_stream.record_event(self.h2d_finish_events[group_num_to_prefetch]) + + # always is set to -1 at the end of the backward + self.next_group_to_fetch = min(self.num_offload_group - 1, should_prefetch_until_group - 1) + + # wait for the current group + if self.current_group < self.num_offload_group: + torch.cuda.current_stream().wait_event(self.h2d_finish_events[self.current_group]) + + +def get_cpu_offload_context( + enabled: bool = False, + num_layers: int = 1, + offload_activations: bool = True, + offload_weights: bool = True): + """ + This function returns the CPU Offload context and the synchronizer function that needs to be + used after every transformer layer. Returns `nullcontext()` if offloading is not enabled. + + Usage: + + .. code-block:: python + + cpu_offload_context, cpu_offload_synchronizer = get_cpu_offload_context(enabled=True) + + with cpu_offload_context: + te_layer.forward(inp_tensor) + cpu_offload_synchronizer() + + Parameters + ---------- + enabled: bool, default = `False` + When set to True, CPU Offloading functionality is enabled. 
+ num_layers: int, default = 1 + Determines the number of transformer layers + you want to offload activations/weights for. + offload_activations: bool, default = `True` + When set to `True`, offloads the activations for the TE layer. + offload_weights: bool, default = `True` + When set to `True`, offloads the weights for the TE layer. + + """ + + def tensor_need_offloading_checker_activations(tensor): + return hasattr(tensor,"activation_offloading") + + # This includes the Gradient Accumulation Buffer + def tensor_need_offloading_checker_weights(tensor): + return hasattr(tensor, "weight_offloading") + + def tensor_need_offloading_checker_all(tensor): # pylint: disable=unused-argument + return (hasattr(tensor,"activation_offloading") or hasattr(tensor, "weight_offloading")) + + if offload_activations and offload_weights: + tensor_need_offloading_checker = tensor_need_offloading_checker_all + elif offload_activations: + tensor_need_offloading_checker = tensor_need_offloading_checker_activations + elif offload_weights: + tensor_need_offloading_checker = tensor_need_offloading_checker_weights + else: + raise ValueError( + "CPU Offloading is enabled while it is not " + "mentioned what to offload (weights/activations)") + + cpu_offload_handler = AsyncDoubleBufferGroupOffloadHandler( + num_offload_group=num_layers, + num_prefetch_group=1, + tensor_need_offloading_checker=tensor_need_offloading_checker + ) + + def group_prefetch_offload_commit_async(tensor): + return group_prefetch_offload_commit(tensor,cpu_offload_handler) + + if enabled: + return ( + CpuOffloadHookWithOffloadHandler(offload_handler=cpu_offload_handler), + group_prefetch_offload_commit_async, + ) + return nullcontext(), group_prefetch_offload_commit_async diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 2e6803f992..0431b8e046 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ 
b/transformer_engine/pytorch/module/layernorm_linear.py @@ -42,7 +42,6 @@ from ._common import _apply_normalization, _noop_cat from ..float8_tensor import Float8Tensor - __all__ = ["LayerNormLinear"] @@ -68,6 +67,7 @@ def forward( fp8_calibration: bool, fp8_meta: Dict[str, Any], fuse_wgrad_accumulation: bool, + cpu_offloading: bool, tp_group: Union[dist_group_type, None], tp_size: int, sequence_parallel: bool, @@ -239,12 +239,27 @@ def forward( ) if is_grad_enabled: + if cpu_offloading: + if fuse_wgrad_accumulation: + weight.main_grad.weight_offloading = True + if fp8: + weight_t_fp8.weight_offloading = True + ln_weight.weight_offloading = True + weight.weight_offloading = True + + inputmat.activation_offloading = True + if normalization == "LayerNorm": + mu.activation_offloading = True + rsigma.activation_offloading = True + ln_out.activation_offloading = True + ctx.save_for_backward( inputmat, ln_weight, mu, rsigma, weight, + weight.main_grad if cpu_offloading and fuse_wgrad_accumulation else None, weight_t_fp8, ln_out, fp8_meta["scaling_fwd"].scale_inv.clone() if fp8 else None, @@ -254,6 +269,7 @@ def forward( ctx.fp8 = fp8 ctx.fp8_meta = fp8_meta ctx.fuse_wgrad_accumulation = fuse_wgrad_accumulation + ctx.cpu_offloading = cpu_offloading ctx.is_first_microbatch = is_first_microbatch ctx.use_bias = use_bias ctx.sequence_parallel = sequence_parallel @@ -298,11 +314,16 @@ def backward( mu, rsigma, weight, + main_grad, weight_t_fp8, ln_out, fwd_scale_inverses, ) = ctx.saved_tensors + if ctx.cpu_offloading and ctx.fuse_wgrad_accumulation: + weight = torch.nn.Parameter(weight, False) + weight.main_grad = main_grad + # Primary weights are in FP8. 
if ctx.fp8 and weight_t_fp8 is None: weight_t_fp8 = weight.transpose(update_cache=ctx.is_first_microbatch) @@ -582,6 +603,7 @@ def backward( None, None, None, + None, ) @@ -992,6 +1014,8 @@ def forward( is_first_microbatch ) + from ..cpu_offload import CPUOffloadEnabled + if torch.is_grad_enabled(): fwd_fn = _LayerNormLinear.apply args = [] @@ -1013,6 +1037,7 @@ def forward( self.fp8_calibration, self.fp8_meta, self.fuse_wgrad_accumulation, + CPUOffloadEnabled, self.tp_group, self.tp_size, self.sequence_parallel, diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index 8f88d725ad..050ac21a92 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -51,7 +51,6 @@ from ..float8_tensor import Float8Tensor from ._common import _apply_normalization - __all__ = ["LayerNormMLP"] @@ -95,6 +94,7 @@ def forward( fp8_calibration: bool, fp8_meta: Dict[str, Any], fuse_wgrad_accumulation: bool, + cpu_offloading: bool, tp_group: Union[dist_group_type, None], tp_size: int, sequence_parallel: bool, @@ -420,6 +420,26 @@ def forward( clear_tensor_data(gelu_out) if is_grad_enabled: + if cpu_offloading: + if fuse_wgrad_accumulation: + fc1_weight.main_grad.weight_offloading = True + fc2_weight.main_grad.weight_offloading = True + if fp8: + fc1_weight_t_fp8.weight_offloading = True + fc2_weight_t_fp8.weight_offloading = True + ln_weight.weight_offloading = True + fc1_weight.weight_offloading = True + fc2_weight.weight_offloading = True + fc1_bias.weight_offloading = True + + inputmat.activation_offloading = True + if normalization == "LayerNorm": + mu.activation_offloading = True + rsigma.activation_offloading = True + ln_out.activation_offloading = True + fc1_out.activation_offloading = True + gelu_out.activation_offloading = True + ctx.save_for_backward( inputmat, ln_weight, @@ -429,8 +449,10 @@ def forward( fc1_out, gelu_out, fc1_weight, + 
fc1_weight.main_grad if (cpu_offloading and fuse_wgrad_accumulation) else None, fc1_weight_t_fp8, fc2_weight, + fc2_weight.main_grad if (cpu_offloading and fuse_wgrad_accumulation) else None, fc2_weight_t_fp8, fc1_bias, fp8_meta["scaling_fwd"].scale_inv.clone() if fp8 else None, @@ -440,6 +462,7 @@ def forward( ctx.fp8 = fp8 ctx.fp8_meta = fp8_meta ctx.fuse_wgrad_accumulation = fuse_wgrad_accumulation + ctx.cpu_offloading = cpu_offloading ctx.is_first_microbatch = is_first_microbatch ctx.use_fc1_bias = use_fc1_bias ctx.use_fc2_bias = use_fc2_bias @@ -492,13 +515,22 @@ def backward( fc1_out, gelu_out, fc1_weight, + fc1_weight_main_grad, fc1_weight_t_fp8, fc2_weight, + fc2_weight_main_grad, fc2_weight_t_fp8, fc1_bias, fwd_scale_inverses, ) = ctx.saved_tensors + if ctx.cpu_offloading and ctx.fuse_wgrad_accumulation: + fc1_weight = Parameter(fc1_weight, False) + fc2_weight = Parameter(fc2_weight, False) + + fc1_weight.main_grad = fc1_weight_main_grad + fc2_weight.main_grad = fc2_weight_main_grad + # Primary weights are in FP8. 
if ctx.fp8 and fc1_weight_t_fp8 is None: fc1_weight_t_fp8 = fc1_weight.transpose(update_cache=ctx.is_first_microbatch) @@ -993,6 +1025,7 @@ def backward( None, None, None, + None, ) @@ -1336,6 +1369,8 @@ def forward( is_first_microbatch ) + from ..cpu_offload import CPUOffloadEnabled + if torch.is_grad_enabled(): fwd_fn = _LayerNormMLP.apply args = [] @@ -1362,6 +1397,7 @@ def forward( self.fp8_calibration, self.fp8_meta, self.fuse_wgrad_accumulation, + CPUOffloadEnabled, self.tp_group, self.tp_size, self.sequence_parallel, diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 2cad516881..87c78aa151 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -45,7 +45,6 @@ from ..float8_tensor import Float8Tensor - __all__ = ["Linear"] @@ -68,6 +67,7 @@ def forward( fp8_calibration: bool, fp8_meta: Dict[str, Any], fuse_wgrad_accumulation: bool, + cpu_offloading: bool, tp_group: Union[dist_group_type, None], tp_size: int, sequence_parallel: bool, @@ -266,12 +266,26 @@ def forward( saved_inputmat = inputmat else: saved_inputmat_t = inputmat_t + if cpu_offloading: + saved_inputmat_t.activation_offloading = True else: saved_inputmat = inputmat_no_fp8 + + if cpu_offloading: + if fuse_wgrad_accumulation: + weight.main_grad.weight_offloading = True + if fp8: + weight_t_fp8.weight_offloading = True + weight.weight_offloading = True + + if saved_inputmat is not None: + saved_inputmat.activation_offloading = True + ctx.save_for_backward( saved_inputmat, saved_inputmat_t, weight, + weight.main_grad if cpu_offloading and fuse_wgrad_accumulation else None, weight_t_fp8 if fp8 else None, fp8_meta["scaling_fwd"].scale_inv.clone() if fp8 else None, ) @@ -279,6 +293,7 @@ def forward( ctx.fp8 = fp8 ctx.fp8_meta = fp8_meta ctx.fuse_wgrad_accumulation = fuse_wgrad_accumulation + ctx.cpu_offloading = cpu_offloading ctx.is_first_microbatch = is_first_microbatch ctx.use_bias = 
use_bias ctx.sequence_parallel = sequence_parallel @@ -315,10 +330,15 @@ def backward( inputmat, inputmat_t, weight, + main_grad, weight_t_fp8, fwd_scale_inverses, ) = ctx.saved_tensors + if ctx.cpu_offloading and ctx.fuse_wgrad_accumulation: + weight = torch.nn.Parameter(weight, False) + weight.main_grad = main_grad + # Primary weights are in FP8. if ctx.fp8 and weight_t_fp8 is None: weight_t_fp8 = weight.transpose(update_cache=ctx.is_first_microbatch) @@ -515,6 +535,7 @@ def backward( None, None, None, + None, ) @@ -862,6 +883,8 @@ def forward( is_first_microbatch ) + from ..cpu_offload import CPUOffloadEnabled + if torch.is_grad_enabled(): linear_fn = _Linear.apply args = [] @@ -880,6 +903,7 @@ def forward( self.fp8_calibration, self.fp8_meta, self.fuse_wgrad_accumulation, + CPUOffloadEnabled, self.tp_group, self.tp_size, self.sequence_parallel, From cc289dc55df47189ec3bb6ec3b7332d76004951f Mon Sep 17 00:00:00 2001 From: Marks101 <46690260+Marks101@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:05:24 +0100 Subject: [PATCH 076/427] [PyTorch] Fix bias initialization introduced in #596 (#622) Signed-off-by: Markus Schnoes --- transformer_engine/pytorch/module/layernorm_linear.py | 6 ++++-- transformer_engine/pytorch/module/layernorm_mlp.py | 9 ++++++--- transformer_engine/pytorch/module/linear.py | 4 +++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 0431b8e046..589c787b74 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -781,7 +781,8 @@ def __init__( layer_norm_bias = torch.nn.Parameter( torch.empty(in_features, device=device, dtype=params_dtype) ) - self.register_parameter('layer_norm_bias', layer_norm_bias) + self.register_parameter('layer_norm_bias', layer_norm_bias, + init_fn=init_method_constant(0.0)) setattr(self.layer_norm_bias, 
"sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition else: self.layer_norm_bias = None @@ -873,7 +874,8 @@ def __init__( if is_subview: bias = bias[split_start:split_end] bias = torch.nn.Parameter(bias) - self.register_parameter(self.bias_names[i], bias) + self.register_parameter(self.bias_names[i], bias, + init_fn=init_method_constant(0.0)) if parallel_mode == "row": bias.sequence_parallel = sequence_parallel else: diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index 050ac21a92..54de8f16f8 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -1213,7 +1213,8 @@ def __init__( layer_norm_bias = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) ) - self.register_parameter('layer_norm_bias', layer_norm_bias) + self.register_parameter('layer_norm_bias', layer_norm_bias, + init_fn=init_method_constant(0.0)) setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition else: self.layer_norm_bias = None @@ -1240,7 +1241,8 @@ def __init__( fc1_bias = Parameter( torch.empty(fc1_output_features, device=device, dtype=params_dtype) ) - self.register_parameter('fc1_bias', fc1_bias) + self.register_parameter('fc1_bias', fc1_bias, + init_fn=init_method_constant(0.0)) set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) # pylint: disable=access-member-before-definition else: self.fc1_bias = torch.Tensor().to(dtype=params_dtype, device=device) @@ -1260,7 +1262,8 @@ def __init__( fc2_bias = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) ) - self.register_parameter('fc2_bias', fc2_bias) + self.register_parameter('fc2_bias', fc2_bias, + init_fn=init_method_constant(0.0)) # RPL if self.set_parallel_mode: setattr(self.fc2_bias, "sequence_parallel", sequence_parallel) # pylint: 
disable=access-member-before-definition diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 87c78aa151..88eb6080e8 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -26,6 +26,7 @@ cast_if_needed, assert_dim_for_fp8_exec, clear_tensor_data, + init_method_constant, ) from ..distributed import ( set_tensor_model_parallel_attributes, @@ -764,7 +765,8 @@ def __init__( if is_subview: bias = bias[split_start:split_end] bias = torch.nn.Parameter(bias) - self.register_parameter(self.bias_names[i], bias) + self.register_parameter(self.bias_names[i], bias, + init_fn=init_method_constant(0.0)) if parallel_mode == "row": bias.sequence_parallel = sequence_parallel else: From bbadf40304e20f0640885b64e8fd0fbeedc8a6ad Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 23 Jan 2024 15:30:47 -0600 Subject: [PATCH 077/427] [PyTorch] Fix for deferred init bug causing NeMo MLPerf LLM crash (#619) * added missing parameter materialization on real device for LayerNorm and RMSNorm Signed-off-by: Alp Dener * added new unittest for deferred initialization and modified parameter materialization to support standalone execution outside of FSDP Signed-off-by: Alp Dener * restored tensor parallel attributes that were being wiped out by the parameter reset Signed-off-by: Alp Dener * fixed incorrect order of fp8 metadata initialization Signed-off-by: Alp Dener * added deferred init unittest to the QA script Signed-off-by: Alp Dener --------- Signed-off-by: Alp Dener --- qa/L0_pytorch_unittest/test.sh | 1 + tests/pytorch/test_deferred_init.py | 87 +++++++++++++++++++ transformer_engine/pytorch/module/base.py | 2 +- .../pytorch/module/layernorm.py | 11 ++- .../pytorch/module/layernorm_linear.py | 41 ++++++--- .../pytorch/module/layernorm_mlp.py | 25 ++++-- transformer_engine/pytorch/module/linear.py | 33 ++++--- transformer_engine/pytorch/module/rmsnorm.py | 6 +- 8 files changed, 168 
insertions(+), 38 deletions(-) create mode 100644 tests/pytorch/test_deferred_init.py diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 729b4b8992..51b7b6235e 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -8,6 +8,7 @@ set -e pip install pytest==6.2.5 onnxruntime==1.13.1 pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py +pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py pytest -v -s $TE_PATH/tests/pytorch/test_jit.py pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py diff --git a/tests/pytorch/test_deferred_init.py b/tests/pytorch/test_deferred_init.py new file mode 100644 index 0000000000..cbc761a27c --- /dev/null +++ b/tests/pytorch/test_deferred_init.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +import pytest +import torch +import torch.distributed as dist + +import transformer_engine.pytorch as te + +_core_modules = [ + te.LayerNorm, + te.RMSNorm, + te.Linear, + te.LayerNormLinear, + te.LayerNormMLP, +] + +_composed_modules = [ + te.MultiheadAttention, + te.TransformerLayer, +] + +batch_size = 32 +seq_length = 2048 +num_heads = 16 +head_dim = 64 +dtype = torch.bfloat16 + +class TestDeferredInit: + + @staticmethod + def get_module_args(module): + hidden_size = num_heads * head_dim + args = (hidden_size,) + kwargs = { + 'params_dtype': dtype, + 'device': 'meta' + } + if module in [te.Linear, te.LayerNormLinear, te.LayerNormMLP]: + ffn_hidden_size = 2 * hidden_size + args += (ffn_hidden_size, ) + kwargs['bias'] = True + if module == te.LayerNormMLP: + kwargs['seq_length'] = seq_length + elif module == te.MultiheadAttention: + args += (num_heads, ) + kwargs['fuse_qkv_params'] = True + elif module == te.TransformerLayer: + args += (3 * hidden_size, num_heads) + kwargs['fuse_qkv_params'] = True + kwargs['seq_length'] = seq_length + + return args, kwargs + + @pytest.mark.parametrize("module_type", _core_modules+_composed_modules) + def test_zero_memory_init( + self, + module_type: torch.nn.Module, + ) -> None: + """Test deferred initialization via device='meta'.""" + # This should not allocate any memory on CUDA device until we call reset_parameters() later. 
+ args, kwargs = TestDeferredInit.get_module_args(module_type) + module = module_type(*args, **kwargs) + assert torch.cuda.memory_allocated(device=0) == 0.0, ( + f"Initializing {module_type.__name__} with device='meta' prematurely allocated " + "memory on CUDA device" + ) + del module + + @pytest.mark.parametrize("module_type", _core_modules) + def test_reset_parameters( + self, + module_type: torch.nn.Module, + ) -> None: + """Test parameter reset for core modules that have been initialized with device='meta'.""" + # Core modules own their own parameters so calling reset_parameters() here should + # materialize them on CUDA device. + args, kwargs = TestDeferredInit.get_module_args(module_type) + module = module_type(*args, **kwargs) + with torch.no_grad(): + module.reset_parameters() + assert torch.cuda.memory_allocated(device=0) > 0.0, ( + f"{module_type.__name__}.reset_parameters() failed to materialize parameters " + "on CUDA device" + ) + del module diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index ad1f383617..f77e07a68f 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -769,7 +769,7 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None: for name, param in self.named_parameters(recurse=False): # Ensure parameter is on a real device if param.device == torch.device('meta'): - param = param.to(device='cuda') + param = torch.empty_like(param, device='cuda') # Initialize the parameter values on device init_fn = self.param_init_meta[name].init_fn diff --git a/transformer_engine/pytorch/module/layernorm.py b/transformer_engine/pytorch/module/layernorm.py index fac941306f..6178199be6 100644 --- a/transformer_engine/pytorch/module/layernorm.py +++ b/transformer_engine/pytorch/module/layernorm.py @@ -138,8 +138,7 @@ def __init__( dtype=params_dtype, ) ) - setattr(self.weight, "sequence_parallel", sequence_parallel) - setattr(self.bias, 
"sequence_parallel", sequence_parallel) + self.sequence_parallel = sequence_parallel self.reset_parameters(defer_init=(device == 'meta')) @@ -168,7 +167,15 @@ def reset_parameters(self, defer_init=False) -> None: """Init LayerNorm parameters""" if defer_init: return + + if self.weight.device == torch.device('meta'): + self.weight = torch.nn.Parameter(torch.empty_like(self.weight, device='cuda')) + setattr(self.weight, "sequence_parallel", self.sequence_parallel) init.constant_(self.weight, float(not self.zero_centered_gamma)) + + if self.bias.device == torch.device('meta'): + self.bias = torch.nn.Parameter(torch.empty_like(self.bias, device='cuda')) + setattr(self.bias, "sequence_parallel", self.sequence_parallel) init.zeros_(self.bias) @no_torch_dynamo() diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 589c787b74..2de860cf73 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -776,14 +776,12 @@ def __init__( ) self.register_parameter('layer_norm_weight', layer_norm_weight, init_fn=init_method_constant(float(not self.zero_centered_gamma))) - setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition if self.normalization != "RMSNorm": layer_norm_bias = torch.nn.Parameter( torch.empty(in_features, device=device, dtype=params_dtype) ) self.register_parameter('layer_norm_bias', layer_norm_bias, init_fn=init_method_constant(0.0)) - setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition else: self.layer_norm_bias = None @@ -876,22 +874,10 @@ def __init__( bias = torch.nn.Parameter(bias) self.register_parameter(self.bias_names[i], bias, init_fn=init_method_constant(0.0)) - if parallel_mode == "row": - bias.sequence_parallel = sequence_parallel else: bias = 
torch.Tensor().to(dtype=params_dtype, device=device) setattr(self, self.bias_names[i], bias) - # Configure tensor parallelism - set_tensor_model_parallel_attributes( - tensor=weight, - is_parallel=True, - dim=1 if parallel_mode == "row" else 0, - stride=1, - ) - if parallel_mode == "column": - set_tensor_model_parallel_attributes(bias, True, 0, 1) - # Concatenated tensors are not needed if not splitting # into multiple parameters if not is_subview: @@ -935,6 +921,33 @@ def reset_layer_norm_parameters(self) -> None: if self.layer_norm_bias is not None: init.zeros_(self.layer_norm_bias) + def reset_parameters(self, defer_init=False): + super().reset_parameters(defer_init=defer_init) + + if not defer_init: + # Set parallelism attributes for layer norm parameters + setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) + if self.normalization != "RMSNorm": + setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + + # Set parallelism attributes for linear weights + for weight in self.weight_names: + set_tensor_model_parallel_attributes( + tensor=getattr(self, weight), + is_parallel=True, + dim=1 if self.parallel_mode == "row" else 0, + stride=1, + ) + + # Set parallelism attributes for linear biases + if self.use_bias: + for bias in self.bias_names: + if self.parallel_mode == "row": + setattr(getattr(self, bias), "sequence_parallel", self.sequence_parallel) + elif self.parallel_mode == "column": + set_tensor_model_parallel_attributes(getattr(self, bias), True, 0, 1) + + def get_fp8_weights_scratchpad( self, is_first_microbatch: Union[bool, None], diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index 54de8f16f8..d48ee4887d 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -1208,14 +1208,12 @@ def __init__( ) self.register_parameter('layer_norm_weight', layer_norm_weight, 
init_fn=init_method_constant(float(not self.zero_centered_gamma))) - setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) if self.normalization != "RMSNorm": layer_norm_bias = Parameter( torch.empty(hidden_size, device=device, dtype=params_dtype) ) self.register_parameter('layer_norm_bias', layer_norm_bias, init_fn=init_method_constant(0.0)) - setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) # pylint: disable=access-member-before-definition else: self.layer_norm_bias = None @@ -1234,7 +1232,6 @@ def __init__( init_fn=init_method, get_rng_state_tracker=get_rng_state_tracker, fp8_meta_index=tex.FP8FwdTensors.GEMM1_WEIGHT) - set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) self.fp8_weight_shapes.append(self.fc1_weight.shape) if self.use_bias: @@ -1243,7 +1240,6 @@ def __init__( ) self.register_parameter('fc1_bias', fc1_bias, init_fn=init_method_constant(0.0)) - set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) # pylint: disable=access-member-before-definition else: self.fc1_bias = torch.Tensor().to(dtype=params_dtype, device=device) @@ -1255,7 +1251,6 @@ def __init__( init_fn=output_layer_init_method, get_rng_state_tracker=get_rng_state_tracker, fp8_meta_index=tex.FP8FwdTensors.GEMM2_WEIGHT) - set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) self.fp8_weight_shapes.append(self.fc2_weight.shape) if self.use_bias: @@ -1264,9 +1259,6 @@ def __init__( ) self.register_parameter('fc2_bias', fc2_bias, init_fn=init_method_constant(0.0)) - # RPL - if self.set_parallel_mode: - setattr(self.fc2_bias, "sequence_parallel", sequence_parallel) # pylint: disable=access-member-before-definition else: self.fc2_bias = torch.Tensor().to(dtype=params_dtype, device=device) @@ -1312,6 +1304,23 @@ def reset_layer_norm_parameters(self) -> None: if self.layer_norm_bias is not None: init.zeros_(self.layer_norm_bias) + def reset_parameters(self, defer_init=False): + 
super().reset_parameters(defer_init=defer_init) + + if not defer_init: + # Set parallel attributes for layer norm parameters + setattr(self.layer_norm_weight, "sequence_parallel", self.sequence_parallel) + if self.normalization != "RMSNorm": + setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel) + + # Set parallel attributes for linear parameters + set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) + set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) + if self.use_bias: + set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) + if self.set_parallel_mode: + setattr(self.fc2_bias, "sequence_parallel", self.sequence_parallel) + def get_fp8_weights_scratchpad( self, is_first_microbatch: Union[bool, None], diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 88eb6080e8..68c5bf1a1d 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -767,22 +767,10 @@ def __init__( bias = torch.nn.Parameter(bias) self.register_parameter(self.bias_names[i], bias, init_fn=init_method_constant(0.0)) - if parallel_mode == "row": - bias.sequence_parallel = sequence_parallel else: bias = torch.Tensor().to(dtype=params_dtype, device=device) setattr(self, self.bias_names[i], bias) - # Configure tensor parallelism - set_tensor_model_parallel_attributes( - tensor=weight, - is_parallel=True, - dim=1 if parallel_mode == "row" else 0, - stride=1, - ) - if parallel_mode == "column": - set_tensor_model_parallel_attributes(bias, True, 0, 1) - # Concatenated tensors are not needed if not splitting # into multiple parameters if not is_subview: @@ -804,6 +792,27 @@ def __init__( else: self.gemm_bias_unfused_add = False + def reset_parameters(self, defer_init=False): + super().reset_parameters(defer_init=defer_init) + + if not defer_init: + # Set parallelism attributes for linear weights + for weight in self.weight_names: + 
set_tensor_model_parallel_attributes( + tensor=getattr(self, weight), + is_parallel=True, + dim=1 if self.parallel_mode == "row" else 0, + stride=1, + ) + + # Set parallelism attributes for linear biases + if self.use_bias: + for bias in self.bias_names: + if self.parallel_mode == "row": + setattr(getattr(self, bias), "sequence_parallel", self.sequence_parallel) + elif self.parallel_mode == "column": + set_tensor_model_parallel_attributes(getattr(self, bias), True, 0, 1) + def get_fp8_weights_scratchpad( self, is_first_microbatch: Union[bool, None], diff --git a/transformer_engine/pytorch/module/rmsnorm.py b/transformer_engine/pytorch/module/rmsnorm.py index cad357de04..4b1b2c749a 100644 --- a/transformer_engine/pytorch/module/rmsnorm.py +++ b/transformer_engine/pytorch/module/rmsnorm.py @@ -141,7 +141,7 @@ def __init__( dtype=params_dtype, ) ) - setattr(self.weight, "sequence_parallel", sequence_parallel) + self.sequence_parallel = sequence_parallel self.reset_parameters(defer_init=(device == 'meta')) @@ -169,7 +169,11 @@ def reset_parameters(self, defer_init=False) -> None: """Reset RMSNorm parameters""" if defer_init: return + + if self.weight.device == torch.device('meta'): + self.weight = torch.nn.Parameter(torch.empty_like(self.weight, device='cuda')) init.constant_(self.weight, float(not self.zero_centered_gamma)) + setattr(self.weight, "sequence_parallel", self.sequence_parallel) @no_torch_dynamo() def forward(self, inp: torch.Tensor) -> torch.Tensor: From ffdd519647701a34ec05e5cea54a0f35ecfbe64e Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Wed, 24 Jan 2024 09:32:11 -0600 Subject: [PATCH 078/427] [PyTorch] Workaround for incorrect output from torch.cuda.is_bf16_compatible() on V100s and TU102s (#626) * replaced torch.cuda.is_bf16_compatible() with explicit sm_80 check via torch.cuda.get_device_capability() Signed-off-by: Alp Dener * implement te.utils.is_bf16_compatible() to replace torch.cuda counterpart Signed-off-by: Alp Dener --------- Signed-off-by: 
Alp Dener --- tests/pytorch/fused_attn/test_fused_attn.py | 3 ++- tests/pytorch/test_numerics.py | 3 ++- tests/pytorch/test_sanity.py | 3 ++- transformer_engine/pytorch/utils.py | 6 ++++++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/pytorch/fused_attn/test_fused_attn.py b/tests/pytorch/fused_attn/test_fused_attn.py index 296d9ff214..42ffb32ad1 100644 --- a/tests/pytorch/fused_attn/test_fused_attn.py +++ b/tests/pytorch/fused_attn/test_fused_attn.py @@ -41,6 +41,7 @@ get_device_compute_capability, init_method_normal, scaled_init_method_normal, + is_bf16_compatible, ) import transformer_engine_extensions as tex from transformer_engine_extensions import NVTE_Fused_Attn_Backend @@ -194,7 +195,7 @@ def _is_unfused_attention_supported(config: ModelConfig) -> bool: } param_types = [torch.float16] -if torch.cuda.is_bf16_supported(): +if is_bf16_compatible(): # bf16 requires sm_80 or higher param_types.append(torch.bfloat16) param_types_lean = [torch.bfloat16] diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 215cae2b97..4f5a9807c1 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -17,6 +17,7 @@ init_method_normal, scaled_init_method_normal, attention_mask_func, + is_bf16_compatible, ) from transformer_engine.pytorch import ( DotProductAttention, LayerNormLinear, LayerNormMLP, Linear, @@ -53,7 +54,7 @@ def __init__(self, hidden_size, eps, num_attention_heads, embed, num_layers, seq } param_types = [torch.float32, torch.float16] -if torch.cuda.is_bf16_supported(): +if is_bf16_compatible(): # bf16 requires sm_80 or higher param_types.append(torch.bfloat16) batch_sizes = [1, 2] diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index 593231d6d1..ae960369c4 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -13,6 +13,7 @@ from transformer_engine.pytorch.utils import ( init_method_normal, scaled_init_method_normal, + is_bf16_compatible, ) 
from transformer_engine.pytorch import ( LayerNormLinear, @@ -101,7 +102,7 @@ def is_fp8_supported(self): ] param_types = [torch.float32, torch.float16] -if torch.cuda.is_bf16_supported(): +if is_bf16_compatible(): # bf16 requires sm_80 or higher param_types.append(torch.bfloat16) all_boolean = [True, False] diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 819b3d4827..824508077b 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -222,3 +222,9 @@ def assert_dim_for_fp8_exec(tensor: torch.Tensor) -> None: "Tensor dimensions are not compatible for FP8 execution: " f"({tensor.shape[0]} % 8 != 0, {tensor.shape[1]} % 16 != 0)" ) + +def is_bf16_compatible() -> None: + """Replaces torch.cuda.is_bf16_compatible() with an explicit + check on device compute capability to enforce sm_80 or higher. + """ + return torch.cuda.get_device_capability()[0] >= 8 From 8571f6999ffea902166d82dffe4ab0675d86e35f Mon Sep 17 00:00:00 2001 From: Marks101 <46690260+Marks101@users.noreply.github.com> Date: Wed, 24 Jan 2024 18:48:21 +0100 Subject: [PATCH 079/427] [PyTorch] forward attention_type in MultiHeadAttention (#621) [PyTorch] fix forward attention_type in MultiheadAttention Signed-off-by: Markus Schnoes Co-authored-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index cf7bee8c66..7bf0678898 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -3090,6 +3090,7 @@ def __init__( sequence_parallel=sequence_parallel, tp_group=tp_group, layer_number=self.layer_number, + attention_type=self.attention_type, ) # Linear From 18186b410ad968b21ed841a5a03bd5574b96ab12 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Wed, 24 Jan 2024 10:13:08 -0800 Subject: [PATCH 080/427] Fix compatibility with pyTorch 2.0 
(#627) Signed-off-by: Przemek Tredak --- transformer_engine/pytorch/jit.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/jit.py b/transformer_engine/pytorch/jit.py index 5fb1768ba6..684004a27e 100644 --- a/transformer_engine/pytorch/jit.py +++ b/transformer_engine/pytorch/jit.py @@ -17,7 +17,12 @@ no_torch_dynamo = lambda recursive=True: lambda func: func if torch.__version__ >= "2": import torch._dynamo - no_torch_dynamo = lambda recursive=True: lambda f: torch._dynamo.disable(f, recursive=recursive) + if torch.__version__ >= "2.1": + no_torch_dynamo = lambda recursive=True: lambda f: \ + torch._dynamo.disable(f, recursive=recursive) + else: + # no "recursive" option in pyTorch 2.0 - it acts as if recursive was True + no_torch_dynamo = lambda recursive=True: torch._dynamo.disable def set_jit_fusion_options() -> None: From bcbe9b0365b649695a325f720423b4fa61d37527 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 24 Jan 2024 16:55:50 -0800 Subject: [PATCH 081/427] Revert "Avoid redundant computation for cu_seqlens (#535)" This reverts commit fad3044bde1547eae9543a6a3f80401e59bb629e. 
Signed-off-by: Przemek Tredak --- transformer_engine/pytorch/attention.py | 32 +++++++++++-------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 7bf0678898..a8300bad87 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1621,24 +1621,20 @@ def forward( query_layer_packed, key_layer_packed, value_layer_packed) cu_seqlens_q, cu_seqlens_kv = _cu_seqlens_q, _cu_seqlens_kv else: - if self.layer_number == 1: - if cu_seqlens_q is None: - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * max_seqlen_q, - step=max_seqlen_q, - dtype=torch.int32, - device=query_layer.device) - if cu_seqlens_kv is None: - cu_seqlens_kv = torch.arange( - 0, - (batch_size + 1) * max_seqlen_kv, - step=max_seqlen_kv, - dtype=torch.int32, - device=key_layer.device) - _cu_seqlens_q, _cu_seqlens_kv = cu_seqlens_q, cu_seqlens_kv - else: - cu_seqlens_q, cu_seqlens_kv = _cu_seqlens_q, _cu_seqlens_kv + if cu_seqlens_q is None: + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, + dtype=torch.int32, + device=query_layer.device) + if cu_seqlens_kv is None: + cu_seqlens_kv = torch.arange( + 0, + (batch_size + 1) * max_seqlen_kv, + step=max_seqlen_kv, + dtype=torch.int32, + device=key_layer.device) elif qkv_format == 'thd': assert not context_parallel, "thd format not supported with context parallelism!" 
assert (cu_seqlens_q is not None and cu_seqlens_kv is not None From e7319f55e3f41886a2a9ceb3c7a45fd809daffb0 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Fri, 26 Jan 2024 12:59:40 -0800 Subject: [PATCH 082/427] Fix pipeline parallelism with FusedAttn (#635) Signed-off-by: Przemek Tredak --- transformer_engine/pytorch/attention.py | 86 +++++++++++-------------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index a8300bad87..469791c5d5 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1587,32 +1587,30 @@ def forward( assert ( max_seqlen_q == max_seqlen_kv ), "Maximum sequence length for Q and KV should be the same." - if self.layer_number == 1: - if cu_seqlens_q is None: - assert (attention_mask is not None - ), "Please provide attention_mask for padding!" - _cu_seqlens_q, _indices_q = get_cu_seqlens_and_indices(attention_mask) - else: - _cu_seqlens_q = cu_seqlens_q - _indices_q = get_indices(max_seqlen_q, cu_seqlens_q) + if cu_seqlens_q is None: + assert (attention_mask is not None + ), "Please provide attention_mask for padding!" + _cu_seqlens_q, _indices_q = get_cu_seqlens_and_indices(attention_mask) + else: + _cu_seqlens_q = cu_seqlens_q + _indices_q = get_indices(max_seqlen_q, cu_seqlens_q) _cu_seqlens_kv = _cu_seqlens_q query_layer_packed, key_layer_packed, value_layer_packed = PackTensors.apply( _indices_q, query_layer, key_layer, value_layer ) else: - if self.layer_number == 1: - if cu_seqlens_q is None or cu_seqlens_kv is None: - assert (attention_mask is not None - ), "Please provide attention_mask for padding!" 
- _cu_seqlens_q, _indices_q = get_cu_seqlens_and_indices( - attention_mask[0]) - _cu_seqlens_kv, _indices_kv = get_cu_seqlens_and_indices( - attention_mask[1]) - else: - _cu_seqlens_q = cu_seqlens_q - _cu_seqlens_kv = cu_seqlens_kv - _indices_q = get_indices(max_seqlen_q, cu_seqlens_q) - _indices_kv = get_indices(max_seqlen_kv, cu_seqlens_kv) + if cu_seqlens_q is None or cu_seqlens_kv is None: + assert (attention_mask is not None + ), "Please provide attention_mask for padding!" + _cu_seqlens_q, _indices_q = get_cu_seqlens_and_indices( + attention_mask[0]) + _cu_seqlens_kv, _indices_kv = get_cu_seqlens_and_indices( + attention_mask[1]) + else: + _cu_seqlens_q = cu_seqlens_q + _cu_seqlens_kv = cu_seqlens_kv + _indices_q = get_indices(max_seqlen_q, cu_seqlens_q) + _indices_kv = get_indices(max_seqlen_kv, cu_seqlens_kv) query_layer_packed = PackTensors.apply(_indices_q, query_layer) key_layer_packed, value_layer_packed = PackTensors.apply( _indices_kv, key_layer, value_layer @@ -2030,39 +2028,33 @@ def forward( global _cu_seqlens_q, _cu_seqlens_kv if (cu_seqlens_q is not None and cu_seqlens_kv is not None): # use cu_seqlens when both cu_seqlens and attention_mask are present - if self.layer_number == 1: - _cu_seqlens_q, _cu_seqlens_kv = cu_seqlens_q, cu_seqlens_kv + _cu_seqlens_q, _cu_seqlens_kv = cu_seqlens_q, cu_seqlens_kv elif attention_mask is not None: if self.attention_type == "self": - if self.layer_number == 1: - _cu_seqlens_q = get_cu_seqlens(attention_mask) - _cu_seqlens_kv = _cu_seqlens_q + _cu_seqlens_q = get_cu_seqlens(attention_mask) + _cu_seqlens_kv = _cu_seqlens_q else: - if self.layer_number == 1: - _cu_seqlens_q = get_cu_seqlens(attention_mask[0]) - _cu_seqlens_kv = get_cu_seqlens(attention_mask[1]) + _cu_seqlens_q = get_cu_seqlens(attention_mask[0]) + _cu_seqlens_kv = get_cu_seqlens(attention_mask[1]) else: raise Exception("Please provide attention_mask or cu_seqlens for padding!") cu_seqlens_q, cu_seqlens_kv = _cu_seqlens_q, _cu_seqlens_kv else: - 
if self.layer_number == 1: - if cu_seqlens_q is None: - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * max_seqlen_q, - step=max_seqlen_q, - dtype=torch.int32, - device=query_layer.device) - if cu_seqlens_kv is None: - cu_seqlens_kv = torch.arange( - 0, - (batch_size + 1) * max_seqlen_kv, - step=max_seqlen_kv, - dtype=torch.int32, - device=key_layer.device) - _cu_seqlens_q, _cu_seqlens_kv = cu_seqlens_q, cu_seqlens_kv - else: - cu_seqlens_q, cu_seqlens_kv = _cu_seqlens_q, _cu_seqlens_kv + if cu_seqlens_q is None: + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, + dtype=torch.int32, + device=query_layer.device) + if cu_seqlens_kv is None: + cu_seqlens_kv = torch.arange( + 0, + (batch_size + 1) * max_seqlen_kv, + step=max_seqlen_kv, + dtype=torch.int32, + device=key_layer.device) + _cu_seqlens_q, _cu_seqlens_kv = cu_seqlens_q, cu_seqlens_kv qkv_dtype = TE_DType[query_layer.dtype] From f15b70744a0aebe5aca9d3466ba81805cd36f3de Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Mon, 29 Jan 2024 16:00:01 -0800 Subject: [PATCH 083/427] Fixed offloading for PyT version/ Added Attention activation offloading support/ Native FP8 support (#632) * Fixed offloading for PyT version/ Added Attention activation offloading support/ Native FP8 support Signed-off-by: Selvaraj Anandaraj * Removed activation offloading for fused attention Signed-off-by: Selvaraj Anandaraj * Fixed the illegal memory access issue for activation offloading of attention Signed-off-by: Selvaraj Anandaraj * Removed the version guard Signed-off-by: Selvaraj Anandaraj * Pipeline failures fix Signed-off-by: Selvaraj Anandaraj * Fixed lint erros Signed-off-by: Selvaraj Anandaraj * Lint error fix Signed-off-by: Selvaraj Anandaraj --------- Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj --- transformer_engine/pytorch/attention.py | 24 ++++++++++ transformer_engine/pytorch/cpu_offload.py | 46 +++++++++++++------ 
.../pytorch/module/layernorm_linear.py | 2 +- .../pytorch/module/layernorm_mlp.py | 3 +- transformer_engine/pytorch/module/linear.py | 2 +- 5 files changed, 59 insertions(+), 18 deletions(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 469791c5d5..b7a98de0cd 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1662,6 +1662,14 @@ def forward( deterministic=self.deterministic ) else: + + from .cpu_offload import CPUOffloadEnabled + if CPUOffloadEnabled: + tensor_list = [query_layer, key_layer, value_layer, cu_seqlens_q, cu_seqlens_kv] + for tensor in tensor_list: + if tensor is not None: + tensor.activation_offloading = True + with self.attention_dropout_ctx(): fa_optional_forward_kwargs = {} if _flash_attn_2_3_plus: @@ -1848,6 +1856,15 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, rng_gen) + from .cpu_offload import CPUOffloadEnabled + if CPUOffloadEnabled: + tensor_list = [q, k, v, out, cu_seqlens_q, cu_seqlens_kv] + qkv_layout = 'sbhd_sbhd_sbhd' + for tensor in tensor_list: + if tensor is not None: + tensor.activation_offloading = True + + ctx.save_for_backward(q, k, v, out, cu_seqlens_q, cu_seqlens_kv) ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen_q = max_seqlen_q @@ -2722,6 +2739,13 @@ def forward( assert (not context_parallel), \ "Context parallelism is only implemented with Flash Attention and Fused Attention!" + from .cpu_offload import CPUOffloadEnabled + if CPUOffloadEnabled: + warnings.warn( + "Attention activation Offloading is only implemented" + "with Flash Attention and Fused Attention!" 
+ ) + if _NVTE_DEBUG: print("[DotProductAttention]: using unfused DPA") if use_unfused_attention: diff --git a/transformer_engine/pytorch/cpu_offload.py b/transformer_engine/pytorch/cpu_offload.py index dcede62ef7..b2635bb9bf 100644 --- a/transformer_engine/pytorch/cpu_offload.py +++ b/transformer_engine/pytorch/cpu_offload.py @@ -184,6 +184,7 @@ def groupid_reset(self): # the tensor back to gpu and deletes the cpu tensor. # These will increment whenever `group_commit()` is invoked self.current_group, self.tensor_count_current_group = (0, 0) + self.torch_tensor_count = 0 self.tensor_tag_to_state = {} def on_group_commit_forward(self): @@ -310,24 +311,35 @@ def get_tensor_buf_for_offloaded_tensor(self, tensor, tensor_tag): def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any: - # obtain a unique tensor tag - tensor_tag = (self.current_group, self.tensor_count_current_group) - self.tensor_count_current_group += 1 - assert tensor_tag not in self.tensor_tag_to_state - if (self.current_group < self.num_offload_group - and self.tensor_need_offloading_checker(tensor)): - # first copy the tensor to tensorbuf, so that the original tensor will not be deleted - tensor_buf = self.get_tensor_buf_for_offloaded_tensor(tensor, tensor_tag) - tensor_buf.copy_(tensor) - if hasattr(tensor,"weight_offloading"): - tensor_buf.weight_offloading = True - if hasattr(tensor,"activation_offloading"): - tensor_buf.activation_offloading = True - # Here we just save it, and at commit, bulk_offload_group will handle it - self.tensor_tag_to_state[tensor_tag] = tensor_buf + torch_stray_tensor = isinstance(tensor,(torch._subclasses.fake_tensor.FakeTensor, + torch._subclasses.functional_tensor.FunctionalTensor)) + + if not torch_stray_tensor: + # obtain a unique tensor tag + tensor_tag = (self.current_group, self.tensor_count_current_group) + self.tensor_count_current_group += 1 + assert tensor_tag not in self.tensor_tag_to_state + + if (self.current_group < self.num_offload_group + and 
self.tensor_need_offloading_checker(tensor)): + # first copy the tensor to tensorbuf, + # so that the original tensor will not be deleted + tensor_buf = self.get_tensor_buf_for_offloaded_tensor(tensor, tensor_tag) + tensor_buf.copy_(tensor) + if hasattr(tensor,"weight_offloading"): + tensor_buf.weight_offloading = True + if hasattr(tensor,"activation_offloading"): + tensor_buf.activation_offloading = True + # Here we just save it, and at commit, bulk_offload_group will handle it + self.tensor_tag_to_state[tensor_tag] = tensor_buf + else: + self.tensor_tag_to_state[tensor_tag] = tensor else: + tensor_tag = (-1,self.torch_tensor_count) + self.torch_tensor_count += 1 self.tensor_tag_to_state[tensor_tag] = tensor + return tensor_tag def tensor_pop(self, tensor_tag, **kwargs): @@ -350,6 +362,10 @@ def bulk_offload_group(self, group_to_offload): # if offload, return the reference to cpu copy if self.tensor_need_offloading_checker(tensor_on_device): + if hasattr(tensor_on_device,"weight_offloading"): + delattr(tensor_on_device,"weight_offloading") + if hasattr(tensor_on_device,"activation_offloading"): + delattr(tensor_on_device,"activation_offloading") state = SynchronizedGroupOffloadHandler.offload(tensor_on_device) self.tensor_tag_to_state[tensor_tag] = state diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 2de860cf73..6836ef6d22 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -242,7 +242,7 @@ def forward( if cpu_offloading: if fuse_wgrad_accumulation: weight.main_grad.weight_offloading = True - if fp8: + if fp8 and weight_t_fp8 is not None: weight_t_fp8.weight_offloading = True ln_weight.weight_offloading = True weight.weight_offloading = True diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index d48ee4887d..3a0e5cb559 100644 --- 
a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -424,8 +424,9 @@ def forward( if fuse_wgrad_accumulation: fc1_weight.main_grad.weight_offloading = True fc2_weight.main_grad.weight_offloading = True - if fp8: + if fp8 and fc1_weight_t_fp8 is not None: fc1_weight_t_fp8.weight_offloading = True + if fp8 and fc2_weight_t_fp8 is not None: fc2_weight_t_fp8.weight_offloading = True ln_weight.weight_offloading = True fc1_weight.weight_offloading = True diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 68c5bf1a1d..f2c955bfc0 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -275,7 +275,7 @@ def forward( if cpu_offloading: if fuse_wgrad_accumulation: weight.main_grad.weight_offloading = True - if fp8: + if fp8 and weight_t_fp8 is not None: weight_t_fp8.weight_offloading = True weight.weight_offloading = True From df9c29e6a2cff8413acfc8c471a8f0417ebecec5 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Wed, 31 Jan 2024 08:20:19 -0800 Subject: [PATCH 084/427] Update FindCUDNN.cmake for cuDNN 9 (#640) * update cudnn cmake for v9 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add back license information Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- transformer_engine/cmake/FindCUDNN.cmake | 82 ++++++++++++++++-------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/transformer_engine/cmake/FindCUDNN.cmake b/transformer_engine/cmake/FindCUDNN.cmake index 6d7455919e..065174e62a 100644 --- a/transformer_engine/cmake/FindCUDNN.cmake +++ b/transformer_engine/cmake/FindCUDNN.cmake @@ -8,25 +8,29 @@ find_path( CUDNN_INCLUDE_DIR cudnn.h HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${CUDAToolkit_INCLUDE_DIRS} 
PATH_SUFFIXES include + REQUIRED ) -function(find_cudnn_library NAME) - string(TOUPPER ${NAME} UPPERCASE_NAME) +file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version.h" cudnn_version_header) +string(REGEX MATCH "#define CUDNN_MAJOR [1-9]+" macrodef "${cudnn_version_header}") +string(REGEX MATCH "[1-9]+" CUDNN_MAJOR_VERSION "${macrodef}") +function(find_cudnn_library NAME) find_library( - ${UPPERCASE_NAME}_LIBRARY ${NAME} + ${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}" HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES lib64 lib/x64 lib + REQUIRED ) - - if(${UPPERCASE_NAME}_LIBRARY) + + if(${NAME}_LIBRARY) add_library(CUDNN::${NAME} UNKNOWN IMPORTED) set_target_properties( CUDNN::${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR} - IMPORTED_LOCATION ${${UPPERCASE_NAME}_LIBRARY} + IMPORTED_LOCATION ${${NAME}_LIBRARY} ) - message(STATUS "${NAME} found at ${${UPPERCASE_NAME}_LIBRARY}.") + message(STATUS "${NAME} found at ${${NAME}_LIBRARY}.") else() message(STATUS "${NAME} not found.") endif() @@ -35,24 +39,18 @@ function(find_cudnn_library NAME) endfunction() find_cudnn_library(cudnn) -find_cudnn_library(cudnn_adv_infer) -find_cudnn_library(cudnn_adv_train) -find_cudnn_library(cudnn_cnn_infer) -find_cudnn_library(cudnn_cnn_train) -find_cudnn_library(cudnn_ops_infer) -find_cudnn_library(cudnn_ops_train) include (FindPackageHandleStandardArgs) find_package_handle_standard_args( - CUDNN REQUIRED_VARS - CUDNN_INCLUDE_DIR CUDNN_LIBRARY + LIBRARY REQUIRED_VARS + CUDNN_INCLUDE_DIR cudnn_LIBRARY ) -if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY) +if(CUDNN_INCLUDE_DIR AND cudnn_LIBRARY) - message(STATUS "cuDNN: ${CUDNN_LIBRARY}") + message(STATUS "cuDNN: ${cudnn_LIBRARY}") message(STATUS "cuDNN: ${CUDNN_INCLUDE_DIR}") - + set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found") else() @@ -71,11 +69,45 @@ target_include_directories( target_link_libraries( CUDNN::cudnn_all INTERFACE - CUDNN::cudnn_adv_train - CUDNN::cudnn_ops_train 
- CUDNN::cudnn_cnn_train - CUDNN::cudnn_adv_infer - CUDNN::cudnn_cnn_infer - CUDNN::cudnn_ops_infer - CUDNN::cudnn + CUDNN::cudnn ) + +if(CUDNN_MAJOR_VERSION EQUAL 8) + find_cudnn_library(cudnn_adv_infer) + find_cudnn_library(cudnn_adv_train) + find_cudnn_library(cudnn_cnn_infer) + find_cudnn_library(cudnn_cnn_train) + find_cudnn_library(cudnn_ops_infer) + find_cudnn_library(cudnn_ops_train) + + target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn_adv_train + CUDNN::cudnn_ops_train + CUDNN::cudnn_cnn_train + CUDNN::cudnn_adv_infer + CUDNN::cudnn_cnn_infer + CUDNN::cudnn_ops_infer + ) +elseif(CUDNN_MAJOR_VERSION EQUAL 9) + find_cudnn_library(cudnn_cnn) + find_cudnn_library(cudnn_adv) + find_cudnn_library(cudnn_graph) + find_cudnn_library(cudnn_ops) + find_cudnn_library(cudnn_engines_runtime_compiled) + find_cudnn_library(cudnn_engines_precompiled) + find_cudnn_library(cudnn_heuristic) + + target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn_adv + CUDNN::cudnn_ops + CUDNN::cudnn_cnn + CUDNN::cudnn_graph + CUDNN::cudnn_engines_runtime_compiled + CUDNN::cudnn_engines_precompiled + CUDNN::cudnn_heuristic + ) +endif() From 5b90b7f5ed67b373bc5f843d1ac3b7a8999df08e Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Fri, 2 Feb 2024 20:36:10 -0800 Subject: [PATCH 085/427] Update cudnn-frontend to 1.0.3 to fix cuDNN v9 SDPA NaNs (#650) * Update cudnn frontend to 1.0.3 to fix cudnn v9 Nans Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * make d_out contiguous for bwd Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove cudnnDestroy to let torch handle it Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * Update transformer_engine/pytorch/attention.py Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> * Update 
transformer_engine/pytorch/attention.py Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> * Update transformer_engine/pytorch/attention.py Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> --- 3rdparty/cudnn-frontend | 2 +- transformer_engine/common/fused_attn/utils.h | 5 ----- transformer_engine/pytorch/attention.py | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index 9f82dda5c0..a86ad708db 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit 9f82dda5c029d15a5f371f0fe003dc0c74a0c987 +Subproject commit a86ad708db725e4d29919bb6fadf8e6cdfa5dc06 diff --git a/transformer_engine/common/fused_attn/utils.h b/transformer_engine/common/fused_attn/utils.h index 9da0dc553a..44288dd754 100644 --- a/transformer_engine/common/fused_attn/utils.h +++ b/transformer_engine/common/fused_attn/utils.h @@ -152,11 +152,6 @@ class cudnnExecutionPlanManager { } ~cudnnExecutionPlanManager() { - static thread_local std::once_flag flag; - std::call_once(flag, [&] { - if (handle_ != nullptr) { - cudnnDestroy(handle_); - }}); } private: diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index b7a98de0cd..27c031e267 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1733,6 +1733,7 @@ def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, @staticmethod def backward(ctx, d_out): + d_out = d_out.contiguous() qkv, out, cu_seqlens = ctx.saved_tensors if not 
ctx.aux_ctx_tensors[0].is_contiguous(): ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous() @@ -1802,6 +1803,7 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql @staticmethod def backward(ctx, d_out): + d_out = d_out.contiguous() q, kv, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors if not ctx.aux_ctx_tensors[0].is_contiguous(): ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous() @@ -1883,6 +1885,7 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql @staticmethod def backward(ctx, d_out): + d_out = d_out.contiguous() q, k, v, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors if not ctx.aux_ctx_tensors[0].is_contiguous(): ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous() From 5da878d2c0c39127eef89b1fb8530ea7629dd4ea Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Thu, 29 Feb 2024 23:57:32 -0800 Subject: [PATCH 086/427] Create a small tutorial on how to accelerate HF Llama models with Transformer-Engine (#615) --- .../te_llama/media/llama_for_causal_lm.svg | 1 + docs/examples/te_llama/media/llama_zoom.svg | 1 + .../te_llama/media/llamadecoderlayer.svg | 1 + docs/examples/te_llama/media/model_change.svg | 1 + docs/examples/te_llama/media/swiglu.svg | 1 + docs/examples/te_llama/media/swiglu_te.svg | 1 + .../te_llama/media/tellamadecoderlayer.svg | 1 + .../te_llama/media/transformer_llama.png | Bin 0 -> 971304 bytes .../te_llama/media/transformer_vs_llama.svg | 1 + docs/examples/te_llama/media/weight_swap.svg | 1 + docs/examples/te_llama/te_llama.py | 172 +++++ ...tutorial_accelerate_hf_llama_with_te.ipynb | 697 ++++++++++++++++++ docs/examples/te_llama/utils.py | 180 +++++ docs/index.rst | 1 + 14 files changed, 1059 insertions(+) create mode 100644 docs/examples/te_llama/media/llama_for_causal_lm.svg create mode 100644 docs/examples/te_llama/media/llama_zoom.svg create mode 100644 docs/examples/te_llama/media/llamadecoderlayer.svg create mode 100644 
docs/examples/te_llama/media/model_change.svg create mode 100644 docs/examples/te_llama/media/swiglu.svg create mode 100644 docs/examples/te_llama/media/swiglu_te.svg create mode 100644 docs/examples/te_llama/media/tellamadecoderlayer.svg create mode 100644 docs/examples/te_llama/media/transformer_llama.png create mode 100644 docs/examples/te_llama/media/transformer_vs_llama.svg create mode 100644 docs/examples/te_llama/media/weight_swap.svg create mode 100644 docs/examples/te_llama/te_llama.py create mode 100644 docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb create mode 100644 docs/examples/te_llama/utils.py diff --git a/docs/examples/te_llama/media/llama_for_causal_lm.svg b/docs/examples/te_llama/media/llama_for_causal_lm.svg new file mode 100644 index 0000000000..22cc438490 --- /dev/null +++ b/docs/examples/te_llama/media/llama_for_causal_lm.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/llama_zoom.svg b/docs/examples/te_llama/media/llama_zoom.svg new file mode 100644 index 0000000000..6134ecfe1c --- /dev/null +++ b/docs/examples/te_llama/media/llama_zoom.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/llamadecoderlayer.svg b/docs/examples/te_llama/media/llamadecoderlayer.svg new file mode 100644 index 0000000000..189369917d --- /dev/null +++ b/docs/examples/te_llama/media/llamadecoderlayer.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/model_change.svg b/docs/examples/te_llama/media/model_change.svg new file mode 100644 index 0000000000..6f0bed1927 --- /dev/null +++ b/docs/examples/te_llama/media/model_change.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/swiglu.svg b/docs/examples/te_llama/media/swiglu.svg new file mode 100644 index 0000000000..75b0a277a6 --- /dev/null +++ b/docs/examples/te_llama/media/swiglu.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git 
a/docs/examples/te_llama/media/swiglu_te.svg b/docs/examples/te_llama/media/swiglu_te.svg new file mode 100644 index 0000000000..5a846f2a0b --- /dev/null +++ b/docs/examples/te_llama/media/swiglu_te.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/tellamadecoderlayer.svg b/docs/examples/te_llama/media/tellamadecoderlayer.svg new file mode 100644 index 0000000000..f93f49b720 --- /dev/null +++ b/docs/examples/te_llama/media/tellamadecoderlayer.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/transformer_llama.png b/docs/examples/te_llama/media/transformer_llama.png new file mode 100644 index 0000000000000000000000000000000000000000..a6c25639740a652157d22ec3aecd92656d2d6d34 GIT binary patch literal 971304 zcmV)yK$5?SP)Px#1ZP1_K>z@;j|==^1poj532;bRa{vGt)&Kw*)&UsN%IyFE|D{PpK~#8Nti1=g zT~~E2Y|D}**^=CQw{fEzj4>q!Y>G|j)dUQ-DGrceFklS90Rn^;Lho2zw--q^4tYsP zeJO+%2qm}|*^(?*OwY^w#~5?XwdUUETuFZayWcmyeeN!6&A!H7d!2oDnH^ncf$7oF zf)Se@!~ekK=o%XzS!I)BtMLEYy1$R9(X~R}H#0s`k@siBb(14&go$;lZDM4#e;!?J zQ{!uGW)f`x(~~%VivM>pJ+TJo;Tp&|KC;5b)-4w%MpsJqp+GJ1^VRB-m zVq$!qjpMuto;&8Y5g_Xr+8kT!Z98z@2yD0xx~_q&Yau_j<7;poj*XA3wK3Rt6yUYI zM%S;wXPh&!7S|`R9k!=7>7(&6o0%B1*@;nGKZz|c158bhLRRzzOcU}Ou?ARO0Dq3J z6sE>jzz!>51K4Y=V`3fTSi3yI_m8b{jB>lyb)+m)qv&^R9PLfm%-EQ%$L9?J$hl#B zRB}pI?i-n~*^yD3SqD8}7wRVcXqRG(Q3A0b8*y9q2WTroF;g&x>qlW%Vr1PKVI8m* zvaW$miM6ohI`p+}-Keb%z@{0i1L%7t`sVfY$=F)R0(sW~Y-cU|K{#fzT35n3FZUj2`GMW{^-h;HUf;UV%#u)+#i!;@X3hV zmNpLf|M0`exUC*o-q$+DRwGsc?&C4aKeh&Pq;n>L>9wdI zJxs2KT&oCNU)mU7&Gt(lkZS_5;Bf>w5kC%>WeS*ChyT&<4E#0&Th73?vjnbljMO@o zu^#noJz|rf&ROq>&p7V?<2=-*^{_E93p=ha;OAlWYzqC*)&qciL_T*0ZEQea)NgtW z_R;koE3CDwDXe>es7-Sr`WR;(Aa+zMB5n|qQJ?&L^Z~hKhtWyMFaelyPhg>x$>>402%#zMC3B?Wpj+2Z8uN42-WG zM;j&nAm2D*N4WraZXl+#uf!x`#^Vci#+JUChP@OUum`aL*oZ#XLr-Fs@r84zvCqdt z$~7TOqyK5xkM)lG6`w_Bo;wBGDgOY^S=tJDiu@$TSlgk0^&w>OF$VPym_dCEOd&RS zJpAZ2xz%WnB~$SE6zT;rO?v{~Hpe%{#~_$D>(O=wW1eG#&VemG-^P%;uqiMzK54V4 
z?bI)7kn$Drthj|eV?V(<=6-Qs5Mz)xYDE{UBdrDos2|78Db$b2Rj|Pdj5*6!*!W7= zpG$_3M%ASHUk|!`Oc{##L^KE3tp0a=FD3 zF;m*0{rjN*y5PP+P^SL>M5m6R|JiZWMb;~939s3(zijGxOGJJsNASPK0ALnAVJ%wE z*oV(l?-0uws+D8#{luiLpP0fp?sYTzWXwk&K6W|uKQPL%0yY|heL0Sgv95uCkgv^* zudxl_bXQHmrqg5gwb@a-YIY2muq$UK?aJvX+c-6ibEeTZ#wqa!=skfP)t3KvAs&?e zr=2*a4Azs#?Ljc_BKHWz2J&jU4noBwUxU0y-ryi=rEa6QKnKogXQ0|b(9k|d{)=2<}G7=y}s5@yIK zSs~wg9GiSOq!Ov9HxA$QblTP!!fxtVa9Uf3OoiN5V}Y#=E2$TlAMnNDLB!3QoXn z9Ebr)u|we8{$LOVjOR_kp0sVmUVJ8z<$yR_21Y{U76p#QWD-6hIG6?4_811QaSeR< zpZx&&oJkaHZ=XI5c+5L-#x;d#J7H$QML+Bdu`m~Do&iajmW%y$j^LKJOMdg=OWGdhI$xW zE{9~H?C=lcE@F=Gz5&?PiN4!-f}~*W1VHi}WUS?b=Lbn0iL%GDfd7NYiZsLKLrF8* zm?iK(<4Gi&vaq(nHj}uqoCImkgs&?zn0@Fm(JiO6J5^#Qum->MNDPcWWol7n@! z)bbd(BOZcmMJ?VwX~t*Nt}Z+t{f5)YvI8-RZNy?G%~RA7h?@iI#+o0s$U$BFbFEDk z+*e35CZ>bitZI!W^V|q7dZ?e$H44`(Qho<8cUT+pt;KsM(6PaXZB% zlSnhht1B^CE78Vk z+XxPK6?olMGwW>Q4D1a8y>S}kd}X$ZIJdZV`=lBD>N?6gJOLUJbkfo% z&0}ytCCwN}f&?BKnx&!m|odV&LN zDafu5x45%kOZfgwnmHh_P;xLDhdh%^qC%P@nK^?;a*zXAgc`}nq!eiesFMaBwFg}p z6DVAq)emTPpZj6=Oqxj$Xm2M9IJY2XoJpWXqR|ftYmnx3_p1Y288Z@RlI4sM*o!_KrJZOy#s+0YTPzxtyc6g%laN}>fmVQA$E;6=?4DvH zuJxJvat=U}B;uO390>DFFu{EuV=c~d`#XbNkTaY#J0?m|7imVnD)8__moy_@6hnv+ z4AvyMBEFIvZNfIQ1v9XNh%vYDT@YqR$iwUW#(;1V?XlM#BnYWIt1ePb7AT&} z@qxM*=V9QB8c1T8NwdKEryApE{qg*^k6Y#|pHMhTEkXVURF6oo;0xIVx^@Y306Ez6 zy>5_Ts}{3`G?$naDM4P$MVdWFTYU`KHAAU5ZhdwT=M^wcIMXcB3?F8~Typ|?Imv}C zILCG3cm{uHTyvX~G?Mg?Fgp+nXoELm93V;3DZ>QzI}sm)ElAdZLijQ{8IT=_8cErL zG%HWD9q2#EQ=bg7MzWRzjWl~)OzJuxYbt3*dvlRyBF4BJJB(u|&CnTJLX&1C7B$0O z@}@}ITa%7)4xXiMS$J&d6k$q&hG$VpVwAirbVNHby=d~6cnMBNOq zVSJTsoLp@ireW9V5fJ845aw|ZW)O7FFzYD?`-QIJ5G2jcc?QAb4H4511i>-F=JbK` z3gbTWOr$xcqrIp&m=K)R8w0@{EyxXJsH5^(&U^sK*A4}a_erxGtRe0`;to+kUjly$ z#F50gk!R|`VC3C?H<&eZTu5^x&L{-vgUFI;({|-YTbxDbXAy)H7rd^1#xWs| z^W5I~PE=U%0*wIVwuZrHmghcK1vv1shQzH0=Fw-= z79yYL{kARZm(MaIMzCc(jvx*R%{C)ONxE3{bpu}sWkw9T;pgV#h(Wd;h`4p2K9C^y z2s5F2QP9^V_vfJQIERq_1tg7%Vb1jW%~o6!G3YbHeVs_4+qf1tD(hi;k!C=%&GbFC 
zPMFcQ;*oJl!tAKT4_n1-vQNlRQT`f@1p?5}~G|zxEvo6qnzC6YC%5m;Cb$c?-l?U>H`=)Cj z{!g26OQKA;4ZCsFAVOTovXDq}M(d+}=JmPle2g(BnCd9 z5@s$l+yLK@Omis)c}BByyS8$QIq?6EIMW|aUj0VcZO{74Cyg2d6tmEsdKdrj7;E2f zqU_)?C&^vXEZ?U%2w!L79Q9ne!YyJhNb|&4SrQ7&jxC2hSAaCH0%2YQvJ4wfku;B0 z((L_uU$JyJIA0%`560(yjolF%$8dYo{!>JdW=~8eXe0rQ0O*l2oEn6Ad!(5N$I6if zAw=GnApv{s^r7oz1R4N+?9;!JUjqb0N+ok$uukqj1Z$$y{@zL5ng1 z`E%HZci6EuPIs8No{zm$bWyliz*Ia7f;{9z<{(#Rg%b2Ao`F~bFSo;B&16)wwn!TN zubF0?!`bmdnx_z?B+OGI%Z0W?V-OQKCemDzImoiW{}Z@$VibCl$n{AxZNvc2auOob zoE*p93Mkwr0H zgU@TwrbsjTSdN%ki5OY~a6L4M?kLFe_?jzVA784%n+MM7jS@JAws4|CTX}4G;;~O#skd7jsKeA?VEP7C0_?d&Y%k0ZLoHXOQQkZgFgs%D=aRaD8_K>gZf_7!>A^y>i zd;#=xlKb=0qhXjqKN0gR&fJPDXy9xJ((W>|4tY_Gb;RotOY0dgD3&6y=ywLT3-X+a ztrKP(LyR=i%o{V#R3r8K3|C;l(Rr#v zeqKeK1O57K5+3)A9mr7ZsU;xkoZKG&SH8j?5hr|lmE+NU3StG~M*GF`1HX9+4Em91 z`QsQ$nia#2OqfZcBW_}h2x8&n!s99Oeh}$=*r46yg?tg8v{A%(oI{ZCVT<^ggw1`d zgO3$kPMSrW@jqjLeRAD5^N+P+9M_Hk+M+$)c>Bz0vq|Kh^0y##R<1CoVI$7|HKObP zOK!@RCmaOCM<*hU+<-MJw71j^jCkE3yjSzvu3 zVZO|e@#8rteG}r^VLk}15nw*-EOHs+L^#5Hq*aP+y zvMo_Gx#o7V6wwSNGwc^LMI$(mWO?oKm9}=pN?SwXjPq8b?UnHL3fOKHWLSfKNp?A_ z%)x(R4GA-su{a3{d$cvwASwWFC^WNyYbhshf=I5{adsNeqixzz=eW&fZ+*u399%F! 
z^N}zgLytoKQM4n{jBDy(P4N6;1L@0sHD+2-lMl-0THts-5dZt^p39W;FFtp`crGzr zuMe<40-t?G1Nq5w;Tv9;2hJdDF0FZ@haM<3}{QfNOtI>)(3zCmCyKe;baa` zwT3g66jwo-q2p@kTA<|`<#Ua>6odMbCE&BlJ;*=)Ib>rk^3PrSR6AhHxZ`9PNpj{I z^b2*bzJU!0#i(M4D7FydfBMET7%Ovw*Vb`Ray9#B(2pIzqFa^Nn_F$m0)}NyRcAEw=<06KUp?Z_PHZgiTlBrUZ3q0;G9j4FH)$ei>&W zgWte$GNQ#_^GQbD&(B(QpqPg4fNDPE@!H=-)Yoh?*`D!;*yK0^(#%5QaOFsvStMe{ zg2X6Db8BQSRHs0x_&f~#>mo4v$b&=;3>h>1uffRs%lp;WIQD@!PwNpC3%C;=-KoP6 z89@|l-$hmj4O)T+ASv+2#g3Rw6FE>aGHBRN1U__TK&2wV1ZHs;1SM5N61l0)`9Esb@@!cM;k2$ z@xQL4j$HFQjy5q^16o#sc74F6KbnpmJxkok&O>|4XBfA0v>=quu#B%|L?h6ZW0 zuXEKG<)FXa=Y+>O+J+C5zbVIg(;5SPV305eNSe!e%F9NYtBruFH)Thkef0Gmc1|GV z96&F=10q7uDeP8d*%>5xeCuGIJE4tOY+r)ykY?ol4D%fOhqgZyj48wzkZXW$sJZ3; z%0m#B0w**r&YKO`m5J%lYnUxBuZnCp>T-QdH%k*~_xf$_L`_NcJ zqMX}Ia`dkuXset@L9BGMSo^_e+1b&JUy_+N<`OX;x9b7uont8HU3?w@Wad+m8aEh= zxQ?`sI6fCb|BR5U?(6kkdjo*D!V42;?8fFc)*SUN`oO&PRAav+ZeH4z;rRFUYJG_x>swkU#A0Sf<6 zC+g0ij;G~ zS7P5vTZ@9c7W#7y^IF99IuKkUNVBgEMa*+xO6}`2?IF7YUr#xL z)D(Y8=5h^Vh`VaB~cw_GKiPJFsB6bGh$DNp$ z7{~vVg)yvJmWZ?0IPUj2ce$~};8;nsCNc22qaW?kY}?>^4&RrKMT`q2#!zqc3dXTU znt4;kL6vWfsH9mn7U!a#j^n0&3}F6=IKu|<-31Y6z-vm&>BwilIq`bSSxovQ6I%!L zX6@q4EBXvn@>SrrXHL_8j_8MRP0}1o7Gl{#jdS)*8DwMF6*i%La$OFvJ~tl#vPFYp z7q%jq3HVbW#ya%DG1@<#RuR=t9iLF=B40(*DaJ#hPnJod^YK2SjXpviqVDg>;CeHs zda@Vfy!QG8yVM(-1llg?n%fNS(dAD~&S7ZRLUwK;OCZY^OVa}W`Z%wFI=@^SehQsL znh|&Y9V3p(M0=_b{vWId4O!nfUq08iPnrqnQfdJ6R&fP;=GY=617yo>IK6!%n){>X_M9MHI4`{1`0e>K;oHV0Rty@D9^3gEgze<9kqQK$+N2^G7oZS$? 
zKtut{9nXc1FayV-EM#hH520sEeho~3h;$PR^0GfABDd%ZnBr$9Z(j}# zI0(8F7y%!Qn_OMOOwe|;A?=q2Gxdo!(cffAcG}m^Tcv`mWe+i^mV{Kmz+ZjR>_N`} z^K&>*Z%79e?+kJvVCfPOh(0o2Q1%|!otT(@N+{=fVSyEnpI zVzKxmZdNAwBiifIdEe z#Qv}m{Xw+a<7oT1Tq}s7>Rz=|i^pzU&V{bcyx7wh;R9}qufh%;+XT|_)Nii0B+c=- zKVqR%bD*E=Lc+XKgjG+YBu55iYH~H;7-xvkXDrv^KFT7yu|MY90B{V)bsi@!%3$gJ zbL^nZO-I>@d5%82wuDcy)fd*#)^O0zxSq8}?Sa$~u)FG~W^lPh8yc87SU1wlwVgql zC)SN8;_Q=1T9Sr5jTwB^1N^U?RzAC~1aE*@V>PD0HmwFjM!?@DLti1^3CPLGD$fIM z8{Gc%H*H<)%|1{ooHWB9Luez8BH|I25AJ`0~v=OmxF5#lRoM>9T_}tgktAz>_LY@ z!n8C1=#pk`3t-EEye(}8M~d3-XfpSa+aLt>vP!f9ap+LWiuA zzit?W5oFn4L+Q&Pn%|)Zd>1h|yf(O=^l!e-QO0)J3FEU!v%_PZHYFDTU{8M=2si=k zD?77o$qzuE_^kRs=z5%+&X512j*xTs_gWhCzV#kvw1vFt$8A2~7wp?bPA!2$5)zNP zD`_U_#K|6%+1U*d41p-*(eW_!g&8=94uA|KTanm&ud3e>bQ7N7c^Ofl^0&Gp$w;J? zh~@5F;G=!$+m(TiLQtgS3XnX~zyj}3u+_P^CIX4y7WG+r+7Ge>Fqo)#q8-gHLjSNa z5u|wxj#pa>E)PNy%uJetFb8SYIwAx~e8ZgZ;)-Mt!7Bs}ehbn}!tCS+g%5p`V6Z*5 zuUFC#w0*Iw>kru(I|VU-@=H@}V+O~0zLu`gR_J>gWN?P|s-)TVi3IbuJfXQPr{fEu z`6Ea(2cF5*u)TnrKKdNT{TwHl4UFlsL<8VsR=x&GFU2n}$>$67g_V3;2H~^Kd@}|g z(Q*LgI%B@hiuz0eH4Z2bV+;C+onny7{;hcfG{Ve*0oOtweye5AN1ELi1+Z@nyiv5G=yo771pCVVU4(A}ag!sG(L-Ao*@YgD!FP;{VSQ+J2Iq-P zqKNw|9r0O!uH$@^2ivI7Vjpzj(jd*uDX!@Uk4=wXzX5FV?2wJPl@ErY)wJv{`K;J# zFeV2F*ovO*9N7nur}(PbgLR~j$PFO%+pV?K8G15D%ekyYt?ob%&PMBt&?~Uz=cZQ| zO)dxa@!SmT%d&Lh5P27$nS-<~Z>ZziNq!lKuec;>t~ztuvsEvi7)M-xJd3u1BzWzL z&o~Ed#28TRhqhEdhmdBUo#OkN(Js#80IS(Cl4hWhX15`qHaThbS?3^Zije|N%G8`j zuF?*S&480;$XCGEFvfeQ`K76D&iX#Z2<@EuVO?+Szz;;SeU2F?8E)5MV<*R2#SE`y zZ7II@*+Tb25J&$Kr4wd{j~&$q!+B^Vegn}W?fJ_ z?0<+}esk|*awg4`yVU@K{lZ?IeGZ-*@g>`o&l$9p2s3hT0=9I!fvAs8;u->^wb-_` z;kN6;<2m=k&knyH{GuG^j}gtRo9JK|X?EaR#6x@$Jg+_of9;|fPb3@fP%B|BaMHno zfS}Yxbo3x15CRGgB05gU0%Z8L>0CHLGtD)*N!maXr4)`8_+Z)o7Z5Od>AMO@HxVa9 zOwj%Sa1G9n$K)z{ICl#BbsTr`@$KCy6e%a7EI4os5J{qe3W4P>^lqe?Kd!+g7lWjk zHgp?A@eR^kNlytl1^^f6^HnfGnw`+{PKw7{(mHAO$NVJB)2k5>AeB+f36Tp7q?&cc z=SVvKg5SpR3EMa}UP1Cq+q#{+00z7$c%V5!L0ix*NHYfs0sSi|Q*y@aI1%l%IPlL= z5cP!(w8c^ep3 
zG51Cw(lig*xZP0uU{jwJCK)J10qxb#kh7A&LPS{yWjn;6NDc>Uk!75RILoA&@2d4WirNWT zCXnme;{UoV1vb@+EZk!aY5jm ztW*0UAmTI;<~)XHw3+^tk3Be9b*LZKfut{yZRXdwzS3Dbz*dRS1u9t%uqM@9CW!` z{CS@_E;1sv!Y-V2^IU4+_@fsFFb}~of-p~wPUD&>Ac!+(yjkx6whfzQTgZlLGt2CV z>uWpQVyq@no*fC&jQ!+98nbb{1IQ1%1!7XO0{!no0@s<51L^Qz{(^uci`*ip^sQVD zCMX&zzPX`-Sj)vA(@bFsnm{*E*iN*CetPG2NpnhY$ilZ(G=dvS#-LdSN+jMS2+st- ztv+L)5;7%Ttd-)nwF!M9VT(QZu1!HUgC4xoM`0#>R@W0S>lg*KQ(V!u+lxBWP7Pd} z>4R1zt`m-U8$iq|Yq37J2%h*NBB4it-Ot=K?D_?mX()^8>VCJy$aJq`S{O+k=m+MP=pNSG(Nt{MKKZ)M}+SAMHnsu$xBmCikl|IwlutNLvqXhgVK=i3WO){|{LT+sHk7f{@N!M&TkLfscqfc-h zb4ZB@8*xCLWPGkAVa9efVv-<%R}QX4znl%H%mC~hUqEu*8DEh{2`r8(E+zjEBdv(L zJw4VU&+xevBhMYm$NwQS3l~ua2mVa@)aZo3K~qEng)m+Z$MvBcRNDZ}{?R@Mi863` zktH!8F+scfjPr1vlV%(r2T1{`86St9AT*Uw>l{B{kB|Y6iIjV^NzrG(HbTjJ=5Vzo zpwGCbwlnNn0O$ERWlLKXyT!o8n}aq$Hn|1bv)Vp^b}DlFKK!4Nk5|yw4eA>*u|`H6 zX!?XbSZ{$6CyYtm?4o_itu3yF42~G^&|mc<<)w_F3uR&eiKA6SNeNW2;Cn+QYVvDYRAe4LDI@ZS{V=@6xu~ zAn+O2t52MlZ5;4?47=hpZ35lgW;jRfJED$z+_(*h$ZKBTCOc>^^s7GlfKAm`@o&W9 z0O&Kct?>%y!4A$n8fhj(qN#U*f0-4~XCL!D51T(ccf9|&8O#$7p<5T!$#rd85;6kB zN+3xK8xUU0aywq@0^!#N+9=1(Fz}ds9rXrV@v#8>t|#O;&vo_doHGEWo!T#QCS(bD z=p$dx>re7IsTJb|Z{i&5z+teZE-5C6W&?ait*bevu^`(B;8?HY06C0o%DCibl4aV} ztN}Ay?C&UdNVBF7b zNgl{JxpoC~XoOj$8PL0khd`tmHi&j3S-qf<$Q1j~UfRdOmih!D8AHE@Ak4KO$q7y8 ztZNi)1_o^z@1(77>Rv>k(1I)Dhr41N^4TZExT=aDpz z;yMvil4h=DPRRQ-3(b5$+m)m%$sCz5lR)#QsX6QH%ZN%5s>kUck5i8^$XQ6U2s6Oh z?a@`(mRQpgk_x^nngcBL?&nZLO?t&vlhq3Y_w`WXEpo?aAWBgRvPVCGbWD!HZ$Kr@ zfIbIlMq6IQT3;&ADm>WY{7jnbqi6L2zoHG1X0+$C648GG+bnSqagh-NLe4p%=6d8b z&Mz_41=|k$b|Jg#c!PflvD6*<7JWDJRS>dMSH^ePz5=m}>mC@2p9@s~C=1$G9f6!( z;v9M)e|4^(R|nV%>{G0=rg#ijA|_esgTc6(%VyEPu0@+wo_e#x>v)_tgG{g!w|%)p z>c=3rfX{_A2PmU%rq~xy?SOr%l^j4hSQnxkq}hpcv`1vSc6H+aKF-XII(`7WCT2rp z8G&k4s?CJQm0#QZi+zwGUrXrqSJ4LQ0zXHe9Mt_PVT#NJ&^s9Ya{(nuI5kh zeT*WB^dIj*EyS?^zYNP|N`8~<@SBPPlH`($HOG23>gS-tuW_5U7!63LB!BebYnJ0Y zUKjpB9|5+ndeQ~szwJ=TZwVkXt!gB*cWk_X3YfmE{iIXU6_hys*Ri%|lJ4DFyFr0Cfu ze-5bFRNWfFZecqQfTnXM%`t#7Flft6nq`9&Sbmp?ZN47GrhYM`fWO&KXo<;69>=y6 
zZ2E(3&5V!R2=x8b$N$XU`|dxoOE0M>Yk7*rsc}A}f z&0wKD2mZ(LMwWae{p9@{K&kjZk5kX4gV>+Y@+qA!J=&lcGt2>g*^jTNnjV9$AkK~C`T*L< zJALBgmH>OiAh~m-IosC#5|DiG6@4A3 z=va__px;U8f%~1@6&WH2f!H?}vaKS{yJ%xDF|3W~%j**C3VRS?o0w_dX&_&i+cmC% z{_K70A2tHG&AyEo63RZ^^rv=no4Ky);XiP-KNa zncSS20pMEiacsv| zatmEnLIx1z+0_#u&J)@?VMb7M5Qlso;Q3JuZQ=0{MNY(&gEa=^R(MM62!4(RUXY$l znqvu0^#}bZzmXu3ERPn_>^|(13YsOK=V$-C>8U zV42!x#?I)d(VFEUwl?vx($eKj1nO`+sEe`Gj7Yl^BQu04(?|g zaZU4i@P|Hz<#GO@lO6TR8%D0@k2U>uEbN^ZH74|*3t=ZmseJa zWO=}(xu=`f#wQzh>@EA1|7rVT#|Fe8;Nwq;p#q;EZS7?L(0);GJXaWBJ#=i7ZC40w z%|X^2N4sv&wnD!Tq4~+jB#uonKA|7=g?MshdO%sxp< z_^n88{k0-mB7i|LFtrM8ieL>w@A@U7L_rI}%mU{(5>Au{pmUhmEx9vtl1I=f@SvMJ z2l~neUhE<{a$v-IJq{iu$XrUHbz%kD*RxR+;#!b{FcT!pB433-NH*w#ez`z@dTPQ> zf9z?t%Pzavym|Aq-DjV@?WHe$iLG9-!oId~BhFu^b-;YbF_$fjaTa?y`dR@SuN<}Y zAj%t7PhmR+(mV;}^LFY(}b%tmJ0IXo_jwmbv- z>60Kv0^>P+7NnW8&D{F_p#*h+G_xI#dmls`zE&A1Tj>S(+H;a;F4ePB)%q$ z1&KgABpl@$krqHRsmWgs$SSaHq7)z$B`#SQB+cGNzn$j$hJ3u9SK5l$ZeYCA-eE(W z&)Ozp;P9B@{;qv1!uAD}fpxDxcsY6Nw%yP@S781@#z54C?ZXGJfagE!0b>k*P8$4+uuhu&SdeDMUDiVwqHp99ZOS)N z#Cyr^Rm@XA*Oj$KHxcmX17DuewX^HNE&hjoK6W{r z9F}A0f7KiTxy#%x7}G3sBh7?rbcrX8Jw<=cyyy50d&ZjPOmzH)37;jK%bjyO@;KJ_ z5(7RdwUT3fe*01#BOcc*(h8eT0rxS6rv> z&nO2P;wB>O;7>E~ne``ZG6davnMoh&_!xm&&NcE;3t9gd6RC}EU21-8GPf~*d(f9F zsUPwy-xP2Hs&8l6y7cL9d;z#GaTLGlLJk4^C1k}-iK9%I)fSNbv1jbwW2>nH0tZn)X z&|kR*W~gu0I*v``9*F$}%D0P?<+%n0CnbS+WtYAXg#eAgB@CQNa}Xg%3#1O}aRArF z{%}^bm*LAV;4l=}C>$IpNRnqqCb8wWBe?}xTgzEa7?sNc0IuO}1ZxofbDdt);zoib zCT8FJs6ki~xnsw)U&kVV0qW(!fPRbq`L2_;%Rv(8ufq{wufB~2h;jgW__Bra%XXAg zvaJ>&BJdHjBU&~ z%TM$(<5zOFSQ}lvd`{A=SlljYCSc1f6VQnlVpwbe{Fze{=SE;tJNWFRAM!)5ART&? zrEA=`*;j-Y-1a{a=fyhTxkz)qrp}u5&^f0d+R6hJ(FS=O*Y)V9Lkt`qTSH<=q@viq z`XJisW3b(ZA?H&j$(H;G9kve+U=5!C@Qr{$s*>g)&h#bxtr)<$nJ{Z$;Q-`=Pb1A! 
zxMmXnXOi4So@r*RboBs+=__z7DWti8H#YQB*0aSa@7HaqyWQ49Mv3DjCC1YveEHaiZk0d$F3*Ur+r)Faoe`LW6j z#F#%PV2l2f=Flzr>DapTsaMtl@qg48*0ET=$gyVz@fPhdjt1H*ap^js4cQOZ1M#%S z^9K4u_pV(@KBG=-k2HHbL7E5c51{?X+XmR9_<%%tc6_oR`G;@G>0KAGuw76W)JRPP z!Hxm$-=!anVWb8c&IwAqOHxoWxlv_YKx7d6n9xjY1x5%^ly_qml;H7iezyJLu)NN< zfx!eLgJ`fH-^V~AApv1Vk*`?CE%q~+NSX_|BD7Qh+i|_(i$Gv7sS!*AD5Qy~g>D!@ z1N!zxwOTJ@#e$L=lZy=78iDX@;C@NWPpnMCCirB!o`d9zlijL-R%jFd%XSW2R>+1;}|eOp!f@W6Dy(H z6-b~JqqcF)v|YJo#x|^)vh^g*E5{%McwI#uNjCx$<;_qU$$BpdI-|&g@U%nwduNAwD~{I4Z`fixB18iL2i3D+NvGsL%v1Z z_4)+vPd4q(;D$c~G1%sIy}+@+cM;?=2$si?ldLX5;QriaLc~=DWQgNUHg9JTJcmB- z`>eG=RQ)CQTnK#}UIR!Rh+(AJW?}O_ao& zru62MqH6~whojxvevvuXQ~z-#BFC)D7_mE?G`kJjxdG&Vr|uTppe?TzUN1UkdS@Thueg4-?fV-WjeMfu~6(W%qAIe4&#L+EC^Y}1okIlrY}z8I@y%IEB@=& zp`Am>tk2Mc-)Z90%)Wfw@-??n2MNFNaHxOSFX|0B0G9zx2O#UPS?laW{VzS}2Pe&} zIgx)Zv*(9{a&cK#Je4AR?BiQCXe<4=&^X3_kY=@ov9gU>%|H2?fd9zOX!3>6GCOJJ z)=6`|hG=mxH}X>_{TLIs1>Cm_*Zg|Hc4R{}u(-hi*lH~(3_OrbBr7aXNCc5){vQdI z9MOzgoD>Ps-wy<74k9Q(M$Ix4>JLgH(JN^#v&;@I*FYkXAQzI%Uy$SDC%qbyc_St9Kr#|rs1lG8XuzlEYY83)`^`u?3ZdRmu!%FyJC6{MFUnT-{ zb=^wBDG43j0?b&!;D>8+F^JgclV&H*j7Qo(`~uJ~&|kh4_!;McGK`^~aSu3>FQtRen%uG^^(_R;ZiMvc!5$WZieal(4yvF1yHaEw5n zXjxZ@&)SCSgtrlT2+EbYz-xWLA^Z`&+m2{!n%hl*>;vC+2O%FQ{4;&bc;#S6;vD3d zvD=7phQHR3^3ujFu7~Ls@ED;V6KUq>3ct>mLzKakI_t+-TFr2TJ$Nqlr2kM`I6e>$ zfHW%)YI~GB{ORcG&^?yH{8S$67e+H_9x~I+`dqH#cW>Bs_+P{eHXveOBhB^h*6fcS-fd~(CLZjWZ0t}RLpP#)m;Dje5mw~=gx zx)txd_TM!?jRS%-#{^s-*;ibik!?|Jf|w9-E{M-vTg;7G_lA*X5at2WT&$Rnj_MTEptP!k7cgO);ur3^8@4yaSnYE zH7Ci7KAT*WlQ~O}I6HzgGcH@*p`PBB#+9n0+dXU__I0uxXg={J8FQ27X2&G+5cupr zke^J=8xH`GfF}Y!kyF% zUIGq?FW6*E%E=_m1n-!jq>2X&B;pKX75KV7IHBn1YhIbu&{MiX4??;yd6Br8h{WKA zKKSUrPeem+w-N2dw)@~9Ov2DWqDwuU$hb54&G5NMGk-pl1*NW00;>JcZ(pt`#KZRc z7XfjcKBC-82KW^FNt*d(zdw5CWwvzBJ#7B``L=M;B75y?UuzQx!pkqe-1gmfU#)ds zwrrUldDM~in8!TEE`9skZ7techrxiat69$*77*u+Wwv>C6>erQfa~!-lV7?O(yWAG z5;9f-kV*P67HG2}dY5BFkY@L7De6M_rNt`!JdT^5?UClVW*A!k1*K>aL7JU7Q_og( zSunhKL{Y%C$_@0{C(VTzXYef={MNnRrV)sNxW+4GrwwQqpKbGE=EcJ8?eW%y@2~QQ 
zJ}DLHZYa=xGA$Nr47T$TJ52`;8<0=y5#^?4y!C>EJ$A9F+FdCfZVa$tS#4 zw$5ZBp;PFkj~DjbM?>*%0Ion$zaPbC8Sw;31dEU#BAys$USs&i8s?sSRbsTZlSoIk zH4*X-1+R?(tWTPm6R;uR`k*a>`A2_Ok{pAV|Ihup=@1bH$_z6XO#b=|r; zRa?r3h-1XR!1{~-otV*(sT(H6(0$oqb6 z@K=s@p>wL=F5k*y#vCp01-DYw4Y z0=x7{vrx`)B%O-w8<4vLvCscUVRsT{#B2x9PoQri<{izSj%E{nF8P{Cb9uVke)E8B zdOnO{93VuR!>4mVdI78(BE-1{H`6f3p^@kqvSzI{J2!y~|Bjhcr*%^S~^# z$5(9^pB?-RI~BwmEr#ht#If*Qpf8*HLHjYAouOZ=1pMX(DE1{pnqhM-b%p(F4eB9Z z6Z<^hAd3KbnOmJ0pzb-Q7-t2#8An`}T#LHHdgN`zxvpEp@i5ZtpWP=8C(U_Hv(}sA zys&E@S=S7G&!}<{=+9V`E&YFxW(XFf0}6Vem!QYRDry|efe{2bq2i{(fNdTCF!1|Y z8Enybr>GUf<~x#R=x#vNeUN21X^uN3zgr^lkAb!i4dQAq64}8o8gvUm*+TxT6OoB$ zMsCmH1)_oWERVVn3=Wsab;Y(gu)UnU*sFj6KL#7>R|P3^NeZ^XJ~Nrmg*b|F2b>|- z8!phVz7$#Z7kTJYlAu*z{feD*{DbO}jORT2Iri`W{_pmqAN|O_`qi)6vBw^({YM^o zq&?w@Pqe3;`4s#8H@w!qa>W<)j@{`It|cG04Qrh=Z&)>H8`k(waq~w~{MAte{700> zJuOfnzkKE~N#984qPL?ZXC}=WwBR!p;f`JgVC-=KEdBIis>w!#FYvh( zsmQ}7C)z}jln+AJbWWVdT%#Xi2~As99|Wk1%KgWqzEscj$@;_-V2lywfYeDPYVFBYMV zAXJeTc_89-L7MBD-f?^`h{usW(Bpi?bBkS{Ey}=to+p`-eKWE?1#VCJ(0`7aKULpe zf0`y|m6c z?*wVCwn}n_+U|Yau#DzhTDwW zl&^_2Hy|INFKaVnSTlK}_zaA33x8#ZAQRy&Np*i{a9m~{c;TFCu5Z~_trEIl4jlvqg~p(hpcZOZRXuk zYdgSeL7F)mOL&ou**Y3ZzQjG^l1T8`$nJz(bSjD)OITj z1lNP9_yk!FKxYl|3ak`F#fh0T$4qlLY!tTSz!Ic6$fFXy$Rn9jAr8bbL8KY>Tm>6} z$oM*M$gAZhENZK@j&|epl)dzYFR*!qG(Y@d53z55>znqIAOFy9xZyhchkyKgJL{=u z>e!;ii|mAx9%N5^^5gBLuXvID*~dR*S8v#8S52(1q`pNN5mwEW)v;-;cL;L zcL$N59&<6UFz=)<;}x=I_&3{K{$>+wi@!wD3R%{V=K$kDnsH2Ix5RaQaeNveSx_qx zKx~F@`+3H8NUFkLltaapvl(-cX3c02j6cba*e7JCKEg*mQ0_ipzZjfYFa4&?{lceJ z_2~}%NFNoSv9?4Tc`~7c>Z>l;XC}@3pEslauIA#$;*;D~=O&2*eIm?3nzK)wn0THk z*LsB6V;kp`IC6Al4WgR$^8EGq;-h5#Og$%KGGUH)s_O=D%{bx_GDMrnuRfTjSu5&? 
zb1G>r*JN2FPa&fsq^Ui^Wptl2##(++F3LC@J}c|7$^U)K?2+an%QK?(XpBJ{B+Gpn z^#xqcx?#T)L6C(?nhDqea)*8u)Q9yc+h6+iV?L8+#<17WVaPS1lOKTV!fnb|=oO?{ zlQux;?oT`;jy;cvs0r~_pG=bT{|@*X=#yrD^=Br{u!rhNvw1-Exai<^a{Gxi*SBVX zka30+#4Sj(lQ@q(##gm5Kf`Z*(%h*XbD`u>jY;NMX*Uq|AM_346dY`9?Q*nLzPkkb z<|nTl6S+iH^#S{9zdT3h_7TTvqkh~B*5^Qui;z2BbkNUbuQ7=AdR~d&62qS3%=G|C zvlHYlX?9-)X;w|>)F(%$eirbChN~A?JDG=+yC3Vj=rv#j6x8cRXw+i3TKqwn2|YFz zSoqzLJg*uY&L+-B2qCZ11^pWI9I^+T&kg&Na&~=0wpv4hfPGdVqbxR6xZlF^+zCx z%3wT>LPnyXFHM&JNuKpQsFIn~vgt$tZbIpaItpI!EzIg*Xrd zF~HaE@r@$<#V`NWFD4FT;Bo2#{TZbEMZo>`-dn_Y&pr3hn>Ds=-C{Rhca7b8%Z+yP z%{SP)-~CQoyvIVFv-f^W?Wp69u!lbE!S?F&Uk%?(+J@2fHoa=n)~}hcE7x)6d97Uu z|7rMz1#}|uV$M-N#WX$#I!lDYPS|RoM}I29+!n&DFWUxb#@2HMHi+wkoK>)I6uaV2 zwnan^2RH*6ZFzg{2iYEW=RoQOlluzLHgDMSD*|gj>Z@%dZu(HZDjvAy|2P+N$3U+c z3Hr181@`4lDf5xBCcn9FbM1_}*L>ukTm1__kMh~=@5Md}zUpK6yTr1$i7f~9Y8QY5 zJ=YWp{=a6K;gj0G?8Q6-N!I%2Se8+UGi>PcqWuoI&u#8o<^=PAwsUB(g-un@xJ9g1 z!YmXbsvHS>vpvWJfBO350$ z@`do2cCzOLL?n*(qJH!z?Z@^N>rQwYm`m;xw|UW)EjqsuA=X`jY>!)mW!m z7jWGj={(5hHH!W&Hmuj6zvSnfE3#{+OTyjM<2iFVqOApYg)@wqaX%omryM%4fzx=8FAkAH+ptKq&0n9>B3OwJoP-nOh z>{0ZIoIufz8zn0~H{_T|kOH;W+t>D&Eg(gaWO@{V&%TVomokjXp-d2;?bW)dMJ^3cajyfe-?Lxh;PgRV<*f3ezTV|&yX8_%!t_)zdMD`RFG)T7yyV8i@Y9f z$E$V*5l<`}iZ!;G;Ijpe{;>dyb1l+^Bm{Uo zq2sBKauTuWPa4oZV%(o@l(ykp4oFjo1O1W|5X*3K;fTCq(TbTl&CbCN%1gwGWJWx6 zi!)=rgziXUjmMXk#B&ydBND_7Hur|G`+;4q&5fAD7Yn zPIwG*OIhJZ&g89IE{u;f`F!moa|7oBiic8LcL=o)S$Ivd*Kqy17#^3!H}ofD@+X<_ z4f=_vLj8K_BtLR4FgPx?n>FUY>zZ{AL>uA^z$nJnzzB4Xo1;p|5r(GV-zwq1n43wn6J~;hxjj+xIvG#HqIP=yWbH&h1lnO7SCXbTJvd4B z&IOoPZhz~(!d}lu`K4BBK7qtg|8YIoF z@5oI)A&Xz3T@T_+kVMNLy*y!inm|)MaO7BZ`*o4Wm#_nKGscHqveNOng2kZ}c|tQ> zCHT7ul#P@X-Z83CaeZ^ljuCyvw;JpE9mvC3*>y08lV$`1XSss37LYWHbQZ`!HIdLh z69;lhSN!jV19qTI2+kN4TU476xGo!=B*AAV8Z+cUlkyfFO1lMec7lIn=kS)iD#~ineF~(LrQgbNx7;iYnWgJ>Qg#VSdu!~>s zH-?!slTc5gP0nCc5*RwDSh=oR3yYl5z)D*&kC|Wi++>bI1Gxpn9`ldq>0Iw4;*3u; zrdFaaJ%)9{%t!6im-2ZLbs3zzxxL!W5Bp?%ck?NT3-#nv9L`XB4pTM)vamh~{G4wFYYCR#aUv<@e9Eds=zUadT)PTo4XBE}2 z9oXhJ)z`YMWL5hKlv)CmM>OfX< 
zG$s(D4(B@QGtG$iQrksD&{wn*17tjkZ38A`lFuU!c8rj_#i8q;W5CBn#v5`GGC4?? zGh9~B4Zm(jzeY?EAzY5IrK9~nqAk<_@9wt%fYzv zI14?8A+C=xl0-Qp{OqI|?b3Dy5ts5u4?e@(>TQr_C(Ma7LoYo|f<2n9_`j2z1dhWV zURw|okexB0ST1oKq&X1#JziW^uQ@HIqMilXn@=bg>uv%18@w_0n;q)I8E3ZbHO|rH z0-isP@RyGt{n{X~4i(?H?{h39AEw$Na*KXLmpI4U;b*j!i8*7Ww$<&Uk>k=X?N-lC zsK;FfJ-I@inW68I=iU5>Wn)p7SesKGP-gfv+i=j1)TNC#AxDoiQ|T^zT_nLkCpipw z;EN!_@YOg(C{~1<3mt=0(Ae2IU9>{h{%^-23ldDx`qO(|EP0F>P1RTxlD|rtL})=nbQjAkGu`ze}2V&IAb&Vn5=>xgq((QtB0l8gO_4=t~tPa&7Xi(S2e z!oECM>m2zXveu&CCC$7!)A8sNGIaHTY_N~pwb&h~Ij#KZ#v5<+d}f0M95U!U$RRs> zTXpUDDo2p!mJjG(GP#{asKQo~i^rmHQ3mLeu63|ICt04u+%m6RrUu9y1rqHjuOXjM zWNv`Gv_a9m!DZ^k7C{n;0uA3!ppdHuq93)ZoTFa?-ly9zeB5eit7lz!e)?eQgx^WC zmU!V9`|3X(?6*CveJxcf@!Jq>db_dTANdDqZc8&F$Le{^1NySJPCCAD=#Q$=HqL8L z+Z-ORsLjx|OPU$?h<6UEB`?`e9xVS~k!IPY#9`K}#X*V15@*Uo5j<`Ql4ky2K)bW{ zK$jgs-`%0zOP_rmn{JIL`;Ca7I~bb;_p?prcDIiDV|n^J;Cc?^JK*CQ!Ma04e3ckW z_UR+H5l>~;;s15WhwNt`58(^8$vn~$q2`k=X=WV3K0wr5e!+YeG2JK4KDN1ivc9rG zCqIKUd(ENj8V6EL1WC?hS+W&5X^UKM2FDTfE13##yiQgS=(8S!+yeH8PVw}uZ(B#y z%ThDtqwp2BuzMHnTK7He!gf6lbn{Q&GKR(N3a~%)$+9?)Av`u5IW~sJoA*&-sh-;h zWyF?!M@+!K{B_!>MZRQ=?~IQ+HHPz1qfk?P8E5l@COHCVm$4U1UKNL7!x9HXk2Hr8 z9#riDH)vgpBsekDF&d;AHI!*V5{carwg@m3YOAzS;s5)f8t4Wp9jPx5!04GYdtt*d z46aIu7`R!ORh(J;peuq{pc6X-oXh1lpx$UV5#}Jxz9!Z&2C~Yn21g9=fkv9MY&p2R z7&&PsVP0o_w=^DC|dzO`{2A|gu|JLUSwivgeNH5(wiD6JD}Ik&Cc3OUi^}} zMB};7`dzzY^KG{E_S@|CTW+!~x8G`8L7ca2z7@E^%zkFy`S#!2i(YuP9dhteJND>< z?C2v7wiiC{Iq=blUA^IIo1Iv18z*P%%5f0qaS&(R9S)FYASY9Oy&&x7#e_0r5K%HX zTJjKWzz}d<5ad9oSOK}%4vSqMgdKT*oJ&LjCn8R3yz2Z`yqz#R$@Nzuy4|})rwmdW ztQb4k#~6`aU?1iU@--eY_Gg-X-7jQjp8(@LmMTaOoD*>&z%IG}i2p8vq(%(GS0#r@ zJb6>r+JxN^SN`g`Mw%%fi)P1$VJq1;*P@JUH`g_0n*BQph+)X&geCwv2^{k_icUpl zN6`W2$*!m?-2uw|)&)spDbR?wSQ?^%l6FIfJ*I8`afH6*e4(2o?ni9L(Y^qGc1SbqfaAkR zvtDxspVlvn!dI0v^Qj7a6C{NF3?r08EC&^$Q>=^FnWXCyljAKH{7k-Y~l6{kj$pC5Av528|Zp(8=jQK+NqQ(|ux8%+o zh;}Mr#x-lX<#+=*2!HbG=aZE2-TnUle)omyX9eS^*atdBz2z9f$Mw06b3bkBYrSEU z65IL?4Ep2*%y81|bwW33rT<~5u>wRMS29*&HQ+vi|9}{~d?vX!4&huU&js8EeKgY? 
z{w$zf+y6WkKE*M{r~Fg^eda>DwzP#lLm!U69nwq`om$_`4t;yHbG#4Qw#6L#ihlCB z!+`4~m&0Z7Sm&7EsmH)nY}ZV{|2)=w!S=#O`i>anZ%bYoucf9Y`vz%dJna(Sm~F^` zgW^>NI)^u;#*(rfgF&d8)C!cIu;uzJP)0tV$ZZ99m)H8F3@oY~AiJcQNjeF#%(q35 zG>?-wt1u5C&49jKR!B1m0Og^aUNl%Rx}=%wPyI_}=!b*iN)*h_EHBzxg+5?A64Lqw z62RqT;bFV6B*QTR!c5XijDRo)X&yR@JjbkZMigek3yc$Hk!I-3J9gSSieu<$p;JGR^mGJU=M`WM@5>*j0i zt(UycjyPf;JLZ@J?cjs=wzHn{JGKUTeEsWxW$P!VMVNy$ucuGpOBM(cX8P4*i!;sT zZ7i@;TSCx?geQ$BPM0*d7@{wwKjOj(b1PCrF0`~s#93}5uZzn2b6iFN_nGEyvGw!W zW{{Fb{JeOzyl|r9WM?qX(mur~^NMZc_Db%eUB)MErNb zeQdw&b2*?R5QD!4E49o0O(vqA-?t%cg{|ADo$m_Dq_G#mjQnBwr z1196B+ot|!yCC4xYdEk#7od#+5u?yW*HE{<{w)W>hx&3FW1n9v97nDR@M$H?1Y^%* zQSqpL99@5R>yc#1eQk$KX1R76lr7n)kx@S$H&I;T6JPZ6ffsh>_)|%9mpFIVhwlKe zN9YiG4bGysb39+NoL&FaIAeSv4uUlMejd;E_w)NP6h6qv$76D7F8Iy0kHsT}G*{qS zl4jWx{^t3;R*;$u)I&EPxS7M)y% zuKr;UuRCRbY1?Cl*Q9>qoOE6vxxWVPk7JDg$N}<#0Bwx?(Y9S1?b1hl3B)?sxRPd| z6W<{N^bGEhUO9)tiMuTa1Yi&m>B}s>wzo7?6s3_`P_>};0M7}XgdHMOpcx{!)kS_Z z5`iq@3`aR4v2hH9nWUNIS;U!llrSO-b`a+v&$# z?B`$_1Fh=|dx6xRe%fPgw_WFnIKT66-}uYFw6B4XZv<&3cz4ak z{92O1LKI1)q|Kzc6cXsdca%d z#+nz)ZtSPFfV^K&E>NzJd64+DJmBZ@f7qmg|A(D;E(<){a1zNS3ys*e*mkmv<0LJk zu8(Y^7=f)3zb$_X26BLiwg-=S{x&;D{Iw$KXa%mD=l(ypZg;m$(L*=w+#+v*QRFgT zN!D^W25968Y}S@%K;D98-vIl=cEHB8ZIiFTWovO(05a1^A$`sK17Rxqdw#<1(xvZf z??2%EN9=nWvDP(a$^s&u*dN5C?+-_A*W*>6asAZli2-y;v&+Ks{YDG>V27^10DSgY zKjb81oNJh4mRY2k!1;)~+74q8*XnIU9pE^0qU@QVN&nP$CX#(J+5FYeAB)=lat-X_ zGt+J-Ki=Al8shH_=lDTT7XsJC>~bZ}1Z`Jp?+(Da6QsGXTVKBl#+nfNM7<(tN61xU z%Lx$k+HFa_ay;SsGLGc2FO%hrI37?OCq&LgY;swlW|<2_QqZTq$XjG^JF|XzosW3+ z`U$%b#a0de+P(NPx!3YyPn^sA$##yMY~ViFk~OqU4n}=*JHajy6FoHBHn9E#ajw@_ zz>h#B&C+!^{MC^mH{>jBg?*#V_J>2_dd67iGV_1>0QC{Jsx^%^%P97%uTafE-bRcPoLFfy%?%+_ zyO|qGnjvT*%_{VnG^a+w7!=k68(S)XU}b^wq-=17X(+bS5c_MpKqyv=Lmw=_=qr|P zL;yR{gItso!Ox``dMEKp=+z<3V=xpx0|TTP$-}~t4d-yjX2PpqKtz992MW@m_f4{( zkSHgx$ixhDAAQoy^~^rYjKZ>#q}jn)e%A@X0lR4pCbmrcD0YJd!BLZsHpKweC(Tiq zI)wl_aTZ|X?0S3KC2zH5OO}c>KmPR7?b>U8YFln6Vcul-+;yk@?B08A+m<`*&aK<* 
zj?K5**3Gxrz4vUk|MP$Dvx$i-?2)HD$PPGg4`7KMc+g(<_De3dzq;}&+c-Ii0x@dq z$2nuy2KGQC>h#D8$lmVAsY@fxEykRb4i2Dw((DDc2V5WUYrU-)gnXtMGWBE(+Y!0w z`BEaru!C}Vvgcw&qKnw$;6{JqxccE%z&<2&?F+i|3qAGGB@-U!_zyrd(%cF^+s0?f zj{bmA$jNst>sQ9uFJd-s{zQ~B26BMFb-CabmIeM&4$BY*s@ zFSG%!t)#ixuF8V`8yNEqoFVku6Lu`|1N-=FCv|sXqFJYEFNfcJW!)TlWWAcR%)n66 zjO%$V3A})|3RHKizuedLhc;jj)s#w@@fr1ckTiS!t@=ne!ex$HF4B*E{dgHhD&u&Z z+s9nTv>!g}l4kZ<+Js!H9q^r#X1>dqq?vk9A6#2WGyGQJgL<=BB0fjp z&vodZo3dtLk4?rGE4e>;!#jBe$2^jIZ1iVTM2a^?N+e&(%{8oZA`y z%Krl7@z2}`Te7yOt`J2poG)8NO!UyrTZifI(AZg|n=dd<(&Kq_pK1RQ)JNu+zJTmCyzvCb4BBFSlI`dVZk4l)y%eLcyPif?IYUR^c1ZK| zT7Nv(h%*FDAX%=T!>}@Lf1n{EIB+hMtAs;Em;+umSc4m)wH<8`LyN^2+Qn8u$SwM9 zPJxmL`dH^l!c0u9#Q%U^H?wXWhVo?1pd6k+Xs@)@;0cjQb6r~s1kuop@<5T<5ovDV zdwKZ-GMZ&xiNdmqg$oX4BGZP+PO=qbrMy9!JB6YAUke5TCDs>mU>o)gU=143hed^t z_ov3jZPgXa?ZGFVYzr6cZp)S~vke&-O?bIrxl;6XLg?@4MjWFZl4kje1($gwe}vqSC)qFCJP$ar zrkyv_I!3u^2Lf@Cb#wi^hBdUf^O+fkbcY;#U%To)$@4JMjB7-gNd|dcLiQtbrp@?q zIRWe-qKx(f#YQn;`JfeYcKwp;sU|@re>cZ-+z=rPk8vAq#{L}wKHHvts-N`ZT%?)T zRpQ*xU!IZcO#VNX%k}jlvff#)9G9K_fb^&TialFQ;j`?*wtQUnF)#GPbtyJGW0;!(}I#zk`Igc3?)D?bla$UF? 
zbPRtHK1OtX-i4o^;nza$>?i!+^iBIa@^|Y}_n5QjCUT-`Wqy7Y9zkq&g|F}N%>yc&@O9YAs4xEAF-T*@+Xa+YK z)dlqcaxgH$Y2jGHl^rw~9aW4F?E+orJ90AUXI)WXTKj%I+jF_82llTTchbB9*8w8U z+z-;s$AdXxm9&t>?EpFC;O5XiX;$$m1`V`A5Ad2?aB%%F#PN(#+^CGM0uj>tZBd}0 zm;d}Q0-w4j$ICWK{=qxy_#6dC_9H7e_g1t2 z`K4X=U*ES2Uinfxc>g7~-(Gvz!rkWC)6aa0ty!_$)=y4ohFx}q?VNBpLF^} zFC{x~1ajYK3;xdbAZd;}VBXm13pFClJ<`m94fa%Q0$jG>>wNj^k{rB1$O`;bb~(OF z3@3OU@V3=Ha~SsO>(uH&0pyQ7&d7ceU^Cs=!Ip8Y>*;pt%TtSRt|78bLXyi1Bd)`K z4Qgz^K(&kSfE~kUuF2J!^kGP`2KcMhm{;&sA&bf<`HR4Lkdbx8zi1S7FzFHY24Y~U zUjkjb96nyI&!LY$J?QI0JJAllNa90|0-zAIL}3z^|QuAwHF}2 zGH>8JV7BC!&%krrMZ|MP_@)g|%?9B^m)ql|A2YF!7}{<@J~j;e(Ts8Ux9XRUsh*+@ z)l}F^V;S@&Ja^FtuL~O`J7v<`uQl;G2=k6ebC$_(q+<;3>R0ui!=`?ok9U=HmT`~Z zL`=kHcWiR9oCz~)KwL*cSY;q!8^&mSuYjbv))&a(@!Pj0uE&k1+mD}Jci9ObVV*&} z2CDox7IqTwf7rbP$f+OAs-!tUU6`AVG&5J3L-~d?5V6@ueRB%Xw>!ZJsazW(28O}c zKkX7886g3VvsRcom;peV zeW7$XgYZEShEIgqzXk6KvGLki8bGu>qXexxsq46Ggh9}uL|ofNejGXp!XuzSM?f%n z{p}Yxe>L`lFi);Ru&qFFCDJT1JqKxK-_x=IbcCL9*CD`I449ViB|+jdP{{`WkLv{% z3m)gzp;zKX+XDM?!s7!glNW$YUQl2^%8Vkyb*aR-ic2e6OdLU31Mfnr(jKj&p)n!U%TR~Hnz%tW_D_gljj-Ol55<8jD{|O7Jr4%K*#Q&!$-y>%8e}VENRZM zNk6&|-M=|e2|i9_vd$=B8M9sT>!gA4jU2-DKA<_eiQKN^HG0g5<6aChX-16krVFv7 z_nnF`XVUEX%jFm955@Ko#&5oh`vMrl(YS`J_i5-HGpJf4F{uAJIcVsx9E^CH>`jy z=u65UIy7DQ+2bbHMI!9X9BIDMn`qGgH2gS>GzTFcAkBck{tZAIv}+I-J|-t|o`wwV zM^O0Lw;@l&gU5=?)pD42CduGU^eXABUospLFCH^(f3zEIcgGbX@|ov`EUc?I20w0> zG)Il?kY@CuylvNY0sYGLM6NZNC=1-*)2**#(+i+)Dk1C2+dYTALR^u=5!mPLd3_7t zIb6=UW}5R%q1zKSL>n>o5Rq$9&xjx_PFhOrC}#P8*b;Uv=kJI#&oa&c#+xQWu;ucM zu#WZ1Q##Trp#S17qw&*f*)3h4M6xl7+6du9jCxD6!zDGM}#0{3nyqGq*JOh+z=IV zD4o^NoE^Y9YM1TBtVMvw%j^n_L{dCaNizyImu9R$prqv)6P&3=F{)oi0pd$8{CQ6r z&R@v`Jpp$pj0dCn0FM8me}CX2sq%q_#h?pcb4lQQAhzBIw{9B`4j)`SI3O2fb$M|d zHg>|y&$J5*RRjwYs5#&Nk3?ir!G?XpoJeyJXZ>s^bWp*Dt@!wRa(vv*Ip=KKV~@qQ zc=2MJn3ypBeei=H6lp&4s3Yvy zfgG|e?E<|B`c6JXUG0!&)R(Xmi83%insH2T;lTNPe5`>SwOB-XU6AHLmWlDrm;oS* z+my1!XCG)Cl$W3m4$o_D@xS7R?RZ?&_)sh|cUf15!E>z2gZ?8P6ce>A`VUC4uR4w6 
zUhC5}1Hk#XKG$K7aeSr@h!=jRgQPi*cYd%z`CqP~jA$q4S*;(;ar__h*5e#k$~iv4 z!F!Kc z>2v5Awx;}{LwwFCb}k@Ej+~Z{cpbK!1&tczey46NMnsr-<4MxYU*TLc0r|#pJ;x-E z!5FKw#Dlu>I@$VRy^eI;i#Mn#U&2J|;$fhhhj^+`I(4lxF9 zAF_{Pl6H;y7oVdKo)hQAx&8BD18$4G)h={kyfHug=j0n^xa1h^__9QjW{*48%cvdx zB$_dTS`H}3s8@kLD^>`QW)VCTr;f9W!>dpbM$RgMra>H?)94y-df@&D@*r^JZ@bvDu}krmVqKti{nnlxyFpMqS)4;hvT_s4@6>eC9cIFuZ8}U zV|-Kw!vt+$Lm4n^u<5y_r%{BP*3U{&hCQySy=c^ z*--=qUyG+#OTnR0Of)cJO9DL!BB*65v@fp7(6^6apIR)?x4>coeKn|qEcbueD8rKq zenEefe`b2p-uK==wj&NdOv^Ohap|QZ&Hwzb|6+SC+0<TV`*0^IPrIM?V%MdRII2 z;KS`*m;I4#-gLYD_Se6%ZCf|ny?1W0pK-ayfB(w9^KbubXPxl`+kM_Hwq(%)JNTge z?9CVdp zc!a;0sERM#T*2>kNeyD95@`BXHUj7?UaO*03RZ%OTtN^$73_3Q_<3+JW^wWY#hi>n z<0KjKt-{SsnQd-G#tSwM{M})^#^JDxkz=nRa5ODuE(0;m>c_SiOYgHiIv<(9Eqk}%>IqbXFe?hxZn;Z2N zSBf(XaE!Gv4C3iEVI-?AD+e6?y)bJrbN)GR~WgiI~{vqko>Dn8r;q z+Ha&e{7_^s;9GvW^~n)&)B)|SL4P>9wu?3*$2`y4O&V+gJ)50qlNN96E5~~`P7ouw zmZU~x5ST#yB__*_7UPFM@>J{e5Oi{o;}C9$hv_pH;s&eP$92m&?tn~1N42e(^%^3g zR7eedM_Y9LNgo%W9p6X$;Tq*=yS}2QgP-rY9zJO?k>er98E?w?zaNv$2J}LnCd59C z=jgZ9Ov1-rL3*}aYw)<}qutC>AA;8gpdYsCJH-oMw_4*xx`!?O2ADt6L0R;s5%du< zFh1j2*{A3)S&E-?p3H?|IZWSnZBmfpp~k=Z3f~P!vs-SL{j{1>Yn^Px81!R})A_aj zMN~eCy8`ehQf_Nr)2`!Z*M)L7c>@FaiZ=9%xX^KKh%`e01ZF19BF-@oNtoiKav?j? 
z3=$1a!?TDbsIVdEBjHIFh8fL+0HC58AlzAwIG={h%KEz|xhBsCnKA(38WGK8TrSm^ z;?LRg%Zxe~V0$cH69_^i;aYw14mv}pwJTRZAChS5?M0(~8=aqNR8Kn29(76aP9SuG z9_`M>_0sG!h$7$RJOX(~xSkohP`4n>Y{NgpJ{lBJC=ig@(KQiqU536`KwrRLG4RZV zIHwkQ7OTl|Te1AB_TZCG(#-N{k2%dYZ{1>d-gB2d^x+TFz5@?D#QxxoZ?dzV@-&;b z>u$Dm$ufJ+vz~3=`|fw_zkl;9yLa1Wt#SVOy?2`Z&u{ITAN|12KI_T0aQ?11caiPA z_Y!;c`L8s-)0$s~9N~MfVe`<7fz5ALR~-}LCXVr3bu7dsK*F47C}MV^pV*$5jm$XO zqe}gRb*FGdj^r5j_HyAEMw*G}S2D=IXfGiOAior;{)&u@^B~G?X@=*P$80NbUDAv= zgUou`p}~ssHaqx?gco4V=ns*3kTCmxotyj;zG*r*RIGddJJ_$!dh9%DPAIl?xLyrW zoQrP;`fgyH)nb~^8Q^f9zZZ6H(yZA~oEP$PK#e6@{>z~rQ~rh=0_X7w0b-8!?CRq& zqBt40uhfJF=p4Re4p%u4N31)w4Yo(S#9+f3+WPReaa{)`R)iRAeV}h-ri*r705KhiXq(dF&hCv@xgLDI~LD9!#xTZonVrT2O* z>qUThO2ktRg8t?)*b99-pJ3c2nG!w8;_m(MZg*>0mchtgUvdmdF>>|zzNAR23g3NG>gyvX&bds?2~NdwnJ>z z8_~HjR1dUY{ojF)DLMXyIqVVe_M%O5r(pPL+RfT`m{|iGD6ZL6S3S@&PRJ)8i;euostwu*g6D@u!O*FIcd^MzNoN zPks8+v~S+LdA4-vQhV)dUTaT1>uI)d!9rWI#}a!aNcF_{h}nPt8l-uf-2?Lc^Sic! zH~-SEzxF5g@ehB{9`VqVY~k*^+Oj2!?Y!SR7y40e*qBQ+l3v?`%bawp#KjA$zoMu? zYrJX0l-P<9`L~Y;Mw(e1n!mmM9g$|j{Y&8bjEH}oSMsM86BJVsW|zGcH^Sxg_1GlK zg)jrUfh_HDfQvz=g0_R)P39)IuOmhtaSMH^n?OD3i-Z{bwuc&bD*7=9_vGQR4*7F} zI>R9>gOFkU+823|FuAPUyf;0=q>5ZMH|6V*oC8N&PU! 
za~;a0nK2TiS$gzIGycy+Sz{*p=tDVIVnT6&o2+4aWrU8>GucQ$3{$UMlN^X;fH}=2 z3wm1)V<7Z|PWtjI^iutVy!53L*C5LQ+AC~TVlK&)5%Pw;g(7S9b%_Os_t& zxmuw29#bFcRr>b28Fdx@*#T*m?a)_8UfKfoY5C#%m`@S!nKV0D9{kQuCCzISXsA0vfvQ3G(mJg5%+iooH03{YO*nWMxd1N`R3DQhCBIf&Tw{bAbJd9I_YoL$FGXXMj>E9&e=c`$r zG~;AFQrH1$&c#8Fp_53odLUt$DaF8F4M)Mla1odVFpM5;%aGoWkd&(lS3t#LtqITN z^<0}w;_Sii4xhvTBw~uK3QcKSB#_R8j30d8d+kY2c)Xo_{DbWAr#;60_+9U`nTc^5 zS-sNMuDrsooSCdk81z~$v_*YH*wMz$BKi)JOSuAc#01(EXf5>-@oW}BF7}juX@$1>^Hyp zjh*+ZSL*-0_VQm2+;`u7?cz7R$;=zx7W9KIusv@oBJTPKeFJeLCw&TGuEZI7 z#G5o?5DVl)S4@@o4ARUW_8CT+9o*NhaWK|fOtalNNi$K->%}i05p!-|PzDlh3!T1Ogqj*U+2-IVRG|*lh(k#G+DYtx3g?vCfd19P+TzSlN z$WceH=BuImI)vE=={S~c+-j-U&#t{3j4vHyPVt%w>?`tn`|O7xVG6{)9Rb%%Ce*D_ z$@gb{GHR|UC*7XI(6^mm&v?qQn(ZAn3cEy3LiSXDhz@DSwGk(b zUDi^P9)UVbz>nRd}xJGSZ z|9!n?;RDw#2y>wMw)S6bg_uh+^-+BhD6-aAp#NbnuT9K1KHUkBq<62+_Ur;}AwCFy zVjW;EmikSyhI-K_&AI(rj}mho%>i{n7#Y%CTRO&S~vX4)6-;^Cj7k|100%gOYcF`eKYmn$ajRj5G%zK%X=ZBg_sY z83_uBiUP--;DMQy599DN6;1th5b`**CNCY=dSJ9KLZCj}NHPdMPk{i3QFO+}$ z${Bm`Nypm0`z*Cx(Z*BHIs>{dw~Z4Mnt5Zf$@-}Ha0^sor?}+6%g>=#SI_#T5gt=< zYoKjBhT341f8nwHbb12LaeKo~fL`@h+90aULq;I$QP0JBofv}M{YOYLNN9pA_Xsm$ z$qPe#fvgn@9n(3vAQ3K`ua)oGBH+Z?2lbFS%cSc|8?9)2vCIWC?;8%DgX??!4+Hy) zzFm&KUPLC#fimFWx{!4MXnzI-ptF<**J&0G_KweljsbI$=5g$+-y2~2Zf7oelFcP| z*Pb~x`p3f7L-%p8EvTfl91{U7c5by4eo5^#yG1?5KGqjrBe0J^^g(38Or(By0JWQD znHy;?V+`$t_7rR%)mE%SO{R=0N0O<^TJzG6w^|;=+1E7J>k`7QU7)QPC&d{eR=hqGorgbLkK2ZA zwW-#qO^t8uQkzh_l=^k3T2*_5qGBeoYSbRJM_b+07SxD6YQ$&-u~)3vVnmYny#IpF ziSwN2x$o;@{C<;6HdX4Z=>r4$;0P}eb>%uzzavqkC*$%1%k%PdGw&mFrCiTsXZTO2 zgvW?4s>8RLudm`h_8JQJpILmIcJHvzpOuJ1=yeEndxs_mo^YbXJ=?oPR_xlP1cF#L zQ=uhnQSGfP)jv!E=~l$o=jC*C%S0eu7N>91j2;!tyYS1yw%lsph(vFF2QHpklPo&C*-YlgZ&0dz!>lM5_N)eEUEhqT^?a6v^vz%#1@~I<+ z8+mScg#>`Cz=okLRig_sAH;3j^g60IVj>a$8{Ni=i1;Y*rXvc$(-TT+y};*yLdxdr zPE}fn8#}nsk;d8i^Q+_V#(W~4TSZgb%`#%{2c&}({-YS!>^J=){juKtx#cc?3eameh!sesUWJM=?`!yWp4!U5_gA-^U0~M>9Il+kzqS$8FBqXVFcy*XOkbl zyvk;({;i5~L;Ji#Dl7YS27Y}JY7m;ecTEZO4HiBCyM5bQWFV%e`JB2|$}z6q_=!#U 
zO9vM!-h|a+$OqeFGQ0Xewkqiu)uV+#Ml|JZ&6{dycPBn4)`?vWRl)&Tcw}au7XSyT zg}7((hi#y2EK{{9(f76lBbi@#8P-+=Q;U7Ia&L8DxejDMXYL&RahB`12mu?#Wqp0o#_0Vmdsk1C*P~Am9US9xpON|RxXl`y$XKu##y`>j}RG5U@S-{!Eocw_maY>sgGoeLUNx3q7Sw*b$6CZg? zp}NA%yM`6+wFj~Ln=jJe=exNMpLL^XQ@N7~qW-zH-NNQDx&9`?HXZKfGq&K6c1x5T zQiMJxxLZ>UjSD*V1B=3))u(W|+%$Oi`Vg|0gCI)(y8~8KF8@ok5s1WoxUUbWlT)Gq z=ws)&_z)`#tLNC4!2i;jHVxJF)4q9D{I%w`5tQr<6MwA74mg1=_$3%UOY9yG)GgI| z+lxjQAdSx4*qdmNAr1YdB(=hVqU!&z>;q3Z*;RG!;4$IvH`1^Jp}>0Pamk6A=E@i~ zct(&3T&2xD<7^QcO9Uj3evTrcCHZjzx+SJBcy^^kv&G)v?>UWS)#`E&Mh1Q}Z8gYa zD5~0mJH!~+@1!{!S*JzGR80;mHP-aapw%bc-zY~|T}@DqZj}MBQu?#Aw1>`1Y!l}5 zZonZY@DQfCe7<_R-Q3u^E{j)%AFDzJMCdWx?FZ_bgxzmDZZWqDLWVulYK|TJOSJc! zQFX+#dCSkC4{RBD5z4*FlY+NX)y;D;L3hid41qZ zZfQ?ZP=(f26cqpXExhONAn}J+k-8IQQLxH!XaOW%xiy4K?b4prGo&%~*#(pFOA!D0 zx)L1<>f&2MiX819`F*Prq{*O4S@d|W@Lh`n0&%?k_Q=(+bGPD><+R9AomqjIR*i$c z&iYU~KH-i#TSmB^s2Ce{F@y&>_CwzB!|7v_K(C~W^ka>51|^Q8q=A1ZjO0kZlsk>* zsQ0px6YIr&kkA?b_9~EQtcZiRz7oy!Vdo5sQ$$9gBm(0sW-9nquSe2>RWHtbu zW*;r2xFAJ+9@&Ai!hYwAi1aR^(0npvU?K3{R&nw889}W7LBwaUyN~qJIkyf@_(Dl+ zXnbw?UJ5E>vyaBI{H+y|`pd|yq^Z)iIv`XYs4p=p7%VG!Irc8-XiZHsT24(BQ$!f&jrmSv^X8U;`qF;Z7sakTO*;3>JDy)f zwFO)v21*74XfK^=>sWMDh!a>H*M5HgJlQ;CA#`K1w7&Fe%^0%Ef}VcH{(JH25=Pi3 zw=W`mng>;Ui|U!=L*5DmO?RpY8Ke$zSZFQlO`qK%{NYAY6k92)@%+#Fw)pz1iOm!~ z^Z%Wg&iu#cTJTXoBnIVhKX}yQk4b(alS%ta%9LA?VE!8~K0@pItW!Ort9(+{Dm= z$VBU#K|#dpO| zU)4noxP6_j&3;vg1RR~1N(knf7w68e>uRT2r``R+v6uz7y446;w%uwfRsAv37!JhW z_q`_j&uKnWbJl+l`o;=WkKx>E5fp`gQcu|Yd>^zn@SL1)QxR^i3Zg>~L_>DTsmjLV z*VL9zVXyfE!iIwx>ayH?n@fLJgIMDt)@D0Yy3u+$nmpI}-igfCf3VD<_Sf1$8zXUf zp*QX*#_VdVADxo&ZA-d4F&Vfiys_q(O-w+}d)!4cG+K-Ky)vi1>2Z^Z(vG?9r>OQB zp%2p)ivJ!3KKDoaG&rxRi4+t+m)RSiaK9)eqB@#Amu0G#$_u+u4`qj|<^rwa7Pxvh5z2vI=3u59SuU z*w9Pbs6zU3Bezl@D~D_oN0bi1e>37Sq4MzjHf-90tK;RayYkD_ik=P{lV>;7wMBK| z^(iKFE`>5X&doHDrk|END2hyvd4z^4jKbflrFHq9f^YdS9nkoUMcTfTCREYhG96mE zo4)ylompS@5zq8B*m6KXL{|xU4hw`ixJ}5xNWwfm(GvD&;Sz6#OL8P-3cMa8slYq6 zY^hdSJ;v{2f*0FxKPgN#pRV$R-L_mReC4k~T+U8ajqU5Yqi*tf 
zDwK>_tZ&(Gnr}ZIA%jsGgddOtN&@E7TP7crVbbnIW9v+1QStcFQtw-h-z5e;*If5| zZYOEMo2aOd4QLs)ADISeKbQ-0JX}ixXnJyvvy) z)7UjHE69ox5h|L7iw)PvP#=EFcZ0=Mh=v`A`*wXDLe&BqtjFWSK7&quR*?tdRE9NTQd@8P?=H3 zeAPst@x2R-cy}AE>T2 zP@%irvw+xm|Le*Gk6-#MPz;Ai*_ydZT8(7De^zIjv^!KUvD@AHqQMEF>J)Z#KN>eY8@-!D|x+W!Zxkm_v0C8 z4w3IZ!csyXYBr2x-nd$@sR2)aCJGVGk^M$SsaakG#$2V@?Mycz_$s$Sv6}mY-DO8m z3{K;&=V(te-kJbDx-L9op#rs6V7imd&ahdg{@!9kck&B^@XOY(7UPE;-<$qD2sDC!~45&*a4{ZI}n~2IE*P z*A=}#>v;eRi^teg!>AaA%qMAs4~`6}`l{3>o)rz|8Dg%Y*eb~r<<*Q4UpsKyF+Mfr z_!{E>zCU5VW(QIS9Ogsf2<(cs57124|<*p0Jm>&J~WGP8GoOE33_YJi7X5{Zq+-ic@E+70Ns3q_iD26>m120TsB9Y?r z1%`ZN3F-e@h(p58p)H$1RSj*r^;lG?UlL?-UM_U9&_~Qi^HUTRDXzr)D)H$FJn3W9 zG>vCT$A>u+r8k223wV(n!0vGMdrx+Mqhy6i;O}x*WJLJ`7+DU=cb7<(94Wd;MD3QZ zyB9j`Wo+DixKQRy24Wyu5bqWu|_4_!jTKid$+=uPs9l^-MS&!uyUlrAc3y`HG}X6-stIyAHL z?yz3zbF;Q#JgAo~uhCmC&$NgaJ@!^5wpjBsuWj2uGmV4RV#+sNkmt}H{YA-Fgml>} zI}w6PVQibWJcgWd{=cbD?s_1Yejh1_e?Q{t?fgl0)Yl@n$2Bz`m=@<1TIm;(DJkD*iaW#6M+fObmXP4vog|DAA zqoA{h`DWhO3ocCU&Dc=P2n224I15y+yfAxB^D2J72NB@r`90*BAV8uxn@xkAQxkUG zbv#k9DJ5KW4C`O!?FBP@eZTg$75?`zsJs{o5LT`-aNWsvT88`1Nq_q{e;hZ6OsjcxAZAeD*_dSkt8OE)z&$2~ z>*PWE>$8nM^649{Nd`sgJLW498~N_DLyn|?@V{k3;Nw5$M9=DfY3*m)7{u1ImeWh` zT|8f9o%hQsA=sHM;uK-K%eQ#W7e40CRx2JZSJTVHJwr*@^%e@*C-^|9R9^j0?-|KJ z(Ve_TzHTh5!oimO^fo#~d{~WjfdXyLhsVo=7VHEUtKiPCE~F9%2&;eNR2^v(dFjY2 z+@l%9JTd-VrY6?Mx|(lg=z)W#jG2SxPjBJi@|em%;UNK~CE&-YC1G$r^o?reRirYT zUU8J0R+YX?3VXt@>&JrOw5F`NzogxTcSP&~^)p(OullBYVM7UTD7|g0!1I1OM?WLc ziFvG2No*ETH>XP4{bdNo*BAr>>L|kiIQ!H3x!q=81{rmzfY9v1w_nR!51b#f{Nc*7 zV7i9Yuy$e&lDXZAb_BlhYtDGTRxXi?0tn#ulJ_ODW&NtQ_R(T!7|!GX>=s1MD4@S+t~&VBTZ8k2`2xxs$8X+R)C?-kYU* zSF^dZS9xr$n|0go!Q*ehF}g;c)SC`iYF*}7S(T=O-s_zHY%ruMkON||ImzFzZOTU5e+FK(l1B792nm%mmJg4Gra}9>f zJ48%D!lCJpyu73HQ&YK=e;(yilO>J;#k_QAdI(>&I8jjx&u3?idOUC^lbN7@#_IC; zG7j>(dOzgV7!FM+kzP~^+52E_a4wvlYKj%^N!Stzs+lsnlojm>WDY{xYQc2RJbVfQQt}fFX*GARS z^!-NU^fwr8CS|6tO3AF=?o1a-Y!OsBFrXQG#=dcCvh@71x>+=UX-%P0)-A}tQBtwY z3Dpz7S;I5E7Fc*#2BD!Fv-@=y&|Gj19hNiQCiMuP&@YTKN5Kat8n+H)$>x 
zz_z$5B&?zM_3k(#{mxDf7bL}vH~ntB$oEL-^Ax}*{JAx5IYmk#~D~|NC&3@}|f~jTOu=f>%vc9>@wzEntFfZ9h^yApi+D7R&=UZxKMCvx z1ICR8XikS&4WK?ANM@^s6kzVe4U2x$y%VEOTZC#Y(;bh5wk~Q&kuBIvI?9kr0>$ zxJdAunZ$jxEFgec>qllwDX;G?hR!f7wT2kF0XAVVvKAe^)rTq(>Xp91yRM09+h=+< zw)Q{#BX{Dqrj5%V#=IoVZ5(e|%1N)~>`$GY?I^{uvn4P{mZwJ8)2CbR93%~AFr_Hh z?NJ5K#~4yl)BgF8$3f+KF7ShM zg93A!&uTfV5LhQ)vB&e*B8v$V+_OS$$6)+A z3V!Re-(S*lRl6_fVxbY|NG9=_+?h^SqPxaVoaJ%F>N40zgfd@~jdl?;IiRzXo?xZ{ zw$JJ&$1=XGcReNUdivtM$0&MUJvC8cq68ezI#lqKkEKd5Y~qAxCsnB;j6Y$wlzh>@ z%j;h1+9F@%o^Y>!@pP0zw za|-88w*A$E7^Ms?Ir{>D*fUSLjLlwBT+W6^rijM=`gg1 z7JuPv7p?I@_EKCAwuKXG?F%71^b;-?svSzD6%g>R#LPn5wMDrG?y~E&+}%vtU7hA` zKcwQLQPssXw>wmg!P?~Tk{@J|snp$fG7gM-%!G<}3KpF^P}G^iwYxmDHWHq`N+UqC zkMc9Ki$J)GBh638I^sE?BAOep!t&RdTI+`9Mbv9MZn<$L%<>ig;B>)`%MI!7l_4`%^%pv3?@F`ZjcSF z`K9QnV1V3hOs)tv@?Pu=+!{LY261_XTMbRNvkE(|@nsyV%J+L{&`UkeNI^ekjfG5q z-F7}f<(L2KS0ws3w9)%Z6l=a;X)KqFv@_X2)uY;gif&jSUBS_2xAo_!a|8BWAp zyoa-I3+%9Ue~1Gi_}x;xeH9(Nnj+%gUMY`1$U6qZ2pOCk68bybhs^0&eSUW73dX;7 z(%lve?mIS63jISw2@lhl1WI8UA3S1-eiS?XA5ea*{eJ=4e_MKEEbp9RdA~o{Ggovz z#7Vj+`B0lEos3H_MrwRNg7d`3W_p+K_Q*t{)QVUB!V?QsdTa2W+%mZ1YimW=C>{NqH0zxHMk4{+GStV7Ya0ybv&>i@e%dMm1c|Y zKHGYej~e05@bFW#&+2*J!m3uYmpJ3{gLLYFP!N4!$nqS zGp=$do+8g2;@Y=o%<7s$t6F|v)7@A-WNuu(lGH(Bwf7NU%<$RsYGtFP)3AT#qjimr zVTl4~AH=|&do9x?W?UGDaM?O)R<$wwOt8?ehdQTLX*HmT)xuqtDtL^5KMkC{*L_v! zusVL4t(_pyvi^HVFHdzwyV0*!wwl;hjhnLdtL6sn+u+kV)2RIyEbB(nLVy((o5Ps~ zq3fFt*m}fA_n^)s%x((cML%6*eI3^*xF2O`iFpGP`!u3sklJYX#Nf8rvYB~*WC=Y? 
zXz4<^Cxc$&6GjsA5g9Gx{<&zZxk(UnxfVslp3A?!*tFcJzyPT!_QfeU0l5wxB~HiR zH<`J|ryV?hZQ$7{9EP8r+~V%<$`J`CGhFW^n-=U)6kgpTY%BJ*iOsUEtzQqk$V-O{ z^IBt7atLfd3_h z=zzpX&d6@ecfXVec@?XEiS`^u4wmXG67fru(wXj;b(nAIM_r0@sWCdde?#SZny)f= zv{Qy~pAl5|U}1!yinSMUSl>CVSnv-&p=6AQ*e-X_kHHz6#^J;4L(N!aLKeH^$kmKb zal&8ecGOLj91B0omzs7y3;A-pIGa71 z=_4~rnnomZOyItxrjsuv2RWSL4TecFZe>J@aZG_r$9t)Yj`0y!uI4pX*C`pUJzzaZkz02z%vuB_`Js!;{h>H`~s%^vKKXTh9o>B>@|{lVVA4qYR7qa zE^(8vSX)<0x{;3TeNm!Y^k(G{(s_ZMkveG+=()5;Ugako1HUzHe4*p&F;KeBC#Dgv zxPvhbXyVvZYDM*8_(c*9D6v(Cwp=hPC$3t>3wdFSE?mMfe2#vO7sHYwIvYJe^F2V?jYWv7V zN(A?A1bz~Q7+N@F)jXRE^v6-Hw$z3anM%oaEx3(BpGg+gX{fDVV|e47kF8&OA1oB* zM!=ARzSxH>G~=84wI;Ym7Y+CQVB>)$&S~K%Kk&Vl@F~*D&lFcKe*bIaZGYazblcvW zVN=s@1yd^S8F1`H#?Y?(x1wI|6y3pjgif^a9FE(#CAAVj@O3nv{U7^{a?@? z(~+#YnI<&Eb{>D3?zZpv02f^txp=Co|`C*95n9e zBF8O-hp#3sw*9XbO#*!vU3(FI0I>%#YqN%@>L%p#X@(1C1U8Dyd~%N0-9)w1VO|q$ z$u`Ja-M4}PK-*dY^22k}x@~HVw+Auy;r}X};G;X$`rO%}O~r{7-fNrGA|o?&8Pc}` z?~`4^KP?6~va0`tHMlWnAWv+p0(`3%`&_9NC(V^7m`Sz3ezsny;qA2Rt--sr2euca zb%s0E*`GT%zeg+NQMkK(`P9wjp{6ro64;S4mwko^fvSyHv59RCDYbb&(VbL}y2F;l zaEa#AR8y-t*Hsascr82nuS|*Gog#3^2742@tl0+~7UH%np#=k&5+?51A%`m}LJN{) zMi^&nr(U^2L3SR38Xh*t1|{|FXQR` zS_k@!tTCr(Vh;>o2Zo&4H%6H|x(B}^3^~qIQxW+H!06X7A938EqJijhfG$hp^!f$C zEhERg*0TC)FCD{WX2Xv8TKC%tvz_!pj1#R$TqCL-GTj+J-7K$-UbviUoXslT z-T$*e$P4X0T4rlq+*|nF3q(E#_ZItjv3y`)b+Zf9(Q8j1d*gahWP6PJ%q!d0mnFqP zeft#d(}%_aW%tbZREK9@=y-`ZNxUq#IuEroO{J0czaIN5&1mVh;K}5NcyCXkK2dOX zqq>SV=mlSLZvskqL~oA|D)hWgNbsgZYGPdZw(~#b3H5v~8k}-rS05 z7k(GZwo=g;sjx^jz=Y5K?;0W(S-tqtTdP{EB>)rS*Sg!-pQ<|W(EC0rJe@#Dz?NvGpWK)cww(>dZt}&QN z8M+;GdMPZXX;v`QAq;<1qS8To8Iv6uXe`pQ@X1OypwD^E3+z*S8d-EJk1nx}ynkl<9ugE!Uvs}uk(q#DLw0R;QZ9WZn94;nq2p5tQR_N z{I7h`kq&Qs1$osHS=YZ?SEav}Oio)(9?z@_Bd{&`S-<3`F-`u)n#{CTa`c|W7ueNW zfK<^iDbOb+?6(&~xW*UAOi%LfG*vM*nhIH!MzEAOV*Ph(%=tph2{lc99H9@%_|tO0 z=xJ0i=zfJ$8}OUS0M|T>KkFn-%M;j|nANN5WN(G+y#~IRWSd_c{aajKOwkSyFD}1@ zzPxQm=BFM2ZOagO&i2^wyqGz7rG5QT-zy5+i)8=i6cLu>-(o*XI}%oTc+EIE-|qS@f@U1sWRu{=F>8TuCx 
zF!BI_Ke{jwyj@sR8#T@vEjLx=!SyzVigcy|!14ZgqX7Tk=yyBymW73rIMrE)@I9+s zX03f2?DQG(cpoBm?H~w5{0Jq3Nb@9OA=$^;cg~|KWM&8ma0o7S+-fn+Ym5_SM?S-&iuZXlmRTekX2BhAUvzU_xE*tZH#T)P~}D>{TU zKVms_({jNXrGwpqNL^DJ6kF`5d2@I_d~Dn}N=j!@=ctoy`lobdb$+7hVA%R_@|5H; zw(^JSV)wA4YfdO@Zp6LJ%u zb+ni7{;*W%#22`?92fs<*?WZw15PzrrB{LB*p>w+$>KNB3CEPxla%!Qv3yA}xFu?~ zTIktAf2t;4E&R0H5$p#$F2rp*r)ISMI3tDcyITJ$ZE&0m|2AKDA<^Q#GlIb3`F8g- zefwC3ku%X<0!1&OeR^JcWc!E(-`r@Zh6_aL~qrNkIrg$9m z`Z9q2YIxd7%VlLs+>iC9{7fINPJ4}Ky`gyev)=?=o{)Kdj?|otR`n&t!{#VW8SZeP^UB+qDx?aS@^2-bnJqfEm`v}9otg|V1HT@x? za}Cmp!wUivMeIlK#y*bq%GmX%>eJOk1P(|lP^m;bFTP^D1jj3fw=Agae5Ly&qB9)x#PAT>;7!GFeB-K+e!Q~PU{i^?dvt$8|AXZn z>c3U{oUC^0i&X#Q-YaW<`sH-D$NjYXMc(XplURkyV7}%NYIPy{sKCtBaR;N+WuId& z!huLsSJ$L1)jix{avPpqtw6d|?YxCGr=&7g4Dr+WnUr zr=8x7@niq#un|CGjkF?}cBd8%&P!VLHlzLqpyjIPpH|vW`NU?&G2YOohjb^*6BXn# zxo?+tyWC16R^|GFr&cGBd9hJpdLln5%Ti1*Frw#PrS(5v59}pPJVCG@kZSOMsk{!8 z+cvKH`UaJEyd$$?&fd) zwD@E+O#pw;#PEz;d)@nvj(#CQ|CqS{8P;4^qfRd?2&DOsJC>K~%@QM_`AIUmQ5blL zN8F_Fy)S0zEDC@2;~maBM9dK%ufYKaby#{#xxS%>--hqS(R zy5rzNcXypF2FPigXN&ic1n^Zl0y#rqa8rd5_%czmhw9QX^W4KD)dE!tmFoNXL+gtHk|oa5U3+BV^|&Fv*%Af_ z*Q02{esdb|GoOQ!X6fWcA|0|!97rJD0@LdL3yBOxjKaMIk&!tRmFeO*N$2tM^Twlc z4N@-(zQC%n`7(X=0+5EHC_Hug7oZoe_>45znB4EpdkBIpVRTCx7hrAk zHK#^|D=f&@SkV6iuI&xbfpuZiJTsK9O_B>~knISPhmR-Wwo~|CJ4db1!vhfcinKYmK*^Eb~20npX_*p zp>?m9b6BeBYd0E6;`7YcHy-5p8w)yG97S{T+%K5aofPlYqze2C*uJrM-1P=Jx<}SeH9zm#O>_2g;-gu1Ih+_E) zwG%UiLDE(#sQaufkV^4o-v>asp?ZFAt}kTSqg`nDTlH#cdZLT2B~2Jiqzg$THn%lj zVnf5;NnluKU%;fm_p}R8w`Oj!!t1CiWj$j5IGfkx1nth)BjnpXiQJ#{v*_aGxMoD~aK7O@ zDb@E~Z-U>W_MC06nI&#r46t$j4-J2YHdSI~ZPIIb2Z_!YWuXj@yUz23kZ@ zUv)WNd-bxaw5Ym4`k4nLIPEQIBPo{*yaoICsv0!+XI}wIYh#&XV~Be_JkQ|@T@%W zEd+qW+*-~mF*w*#$%yJK3bEjYY$dY-_Yt@ok~l)dkNA`l zh~NR?RF8$m@@-(CysPK0MclUcO=7k9|$!VJE6cR7__QU`H z5x=mWgwukWwg`>C#D5zptk8!@C4Zx5vh?Feu`d875&{PK;{crOyo@dwEJKQlLstAD zcy-GLHt0DK-+pE7QV=-bvN9^GQAqOAi z|MS!+fwIUkD)cM#P^r|Y2H1A($X6n&z;=GtopR37#pj2z%2Lkxwu*aRcQ5Uifv@gjK~PZSq-~^(`l5Qv!5PAr)`3c4?sVb}dZ>NvBi45~a3xemjb^V{Z 
zRMdWz_f6&&Vmxtd&Vh{dDz6mm5ZsOXeTdW)Rma8^bHDAogO40DXG}j~RJ!Rq1I)39 zzawNp*r;*(7NY{&BJ~$UpMbFi-{3Pt-d(C9E1C9ekgKI{Jlk_3<^9w50day1^ZT_l zAu^bcMWrjk7=O8wb#^JU{K27gc;s)k_U*;`QQFI=X5)QmMyM??ATH|DuNCF!r@S0C zU)p(enqT*x(oVCIKiI@!_i;q6U;R@`VjPG2^?}_oUG|O0?F6q!m4+vjM_-JK@-1a8 zc$8*x)#C+3l=`D?#Zyk}JVKcb;77r}Y}w-5y2(m0{>)VUvkAAjELZ8$XNLoTI22jH zZXqX8X`OD9t%MDE_R$!x?b6l~o39(L*6&?IkSH4*w)^<&IpAXY9`)t=k#)=tFS@zi zCfJua_xQp&~mL`Y)aG zj31?^_%Spxy6jI~Zzw$JV^Pm@k*)5!*3Enm?GAN%H~1;GQ%7J`J)JMgqDu<_VK#si z|I(wVG#r=kqsHWF(XE$M94Akub9eg99nPb%|NkNUlCWP_C5wsK z@xx;Nzhn;N_$K6_{kFS)p$zP|R@Vp{q*Cu&#r4OUM%!(8zN1HjowWYdQh3!XS z*GXILzMO=eW?rXn%$M1U>jQ^er&%!uSt8pf{GWLfRNV$jkKke>HiFkf{(jzoWyf=^RDT_goDJ$VegXHaYcBIt&n|J@|D}Dv@c-kKGPc^ zvwA`{-`j>!bt;zD{JZ>lS06Up>!%!9t5sGi8RfG-w~*SQ_D)0L5BMkTNBxo~2ZOO9 zy0$iPy7+RjEZb-TkJ!AOhgtIhZ|JNsK$FM35oHa1f+i z5Q=UdJsr}Bx3~TETFp};f}dGRj?vQ%?mGzx;A;6dkM(jreU+QBMpK=X7}l*qkHfu} zza}TT9~3aLug=g#`M><%=Y0GU!khCFmb=xE#q$^XHIy+nUuh(({N)?F&0aNTf!?tc z>y*d9z4Yya_Z>8!C>Zpw%Oq#8yib_{sT{;by>VAbr4Z{=2&syh@Lp=Y_xj+)fJue^ z%Ph5HG@&oy+WiQ(m4|5+(Zhe^jSR*LRr#|9gzI+s#};cp<%Lf^J= z7>uk9Iev(q(UcF5U-;y|t7r@qD#N}&FSAM!TZ#V>2I6&muv)7=8NQ<>eN|Z8T|aKk z^J*fF7pv8~aMo@H4hwh-JuW5Zg4<3d@k;C)VB4xX&HW}ynoEfm@12Lh6TJWi-tJJy z5DVG0yz)G=ZS4XzUd4L!t;I5Ii~)Z@59&!(J7#hOL%Tzz#At9{ zQRfInCRc2J%$=Y@y+bk4K^tZz&rvhw^YhH$G(8W4oD7q)BCx~TP9Ex`TR!e+OANiv zfdjyBdmHv;VWqca#@P~{4Lxy&d%*z^2BvWFi2gE)bF@ncdhR4jHWmnK5@48s_iz}` zD)#d`SE=o5Wnh6~bETQHg&HY@fQAzm@jHuGbH23x3$47LFZM?8mgEzb@XeUC#`)m+ zUM^g1xd43NBR>1FpetJ^e%QFwi;YgC6okqRdtH1YW}(r!5V-XPLHRj>Ivb^;R8oEO zyqN$R+?QE-U}>zDC#v$>pwEi_jxUG%WwfJS`bL7IocZgRppd+qEC0@s%gr(CyM40&2eNF17`O`NPEq& z>g`_+q4r0-!mKiT+{PqxgW^9L;Un9apPI{&JQe;M_v$m;{KQNG;R z`34xK7HC}zc|a4Cy%nfl@s+N^izWpm!#<=u02q0a9EUTVGcKn#0TTMyZuuCB#E0Z# zV+n~YWGzYq>I8xg$v@R}GFo6Q1n)H{^~yt19=^gDUJ1W9QA)VyUeIV(@X=CYWDc?` z7{)Dhs(A3lrY9+~aQ#Kv&$K!Gk=0MiiyxUQpB5g4`q1C5I${i4D$tKbg!}pt z4%FVsOD3s3p1LC$z~mx_?$er_!-@l9AA?p3;^kK8^c6>Y0Pr_1+?y{*o_^vuW*K0V za^Pq_8q|dh=4^;+P&TPMzHw2c^51FWMzK%-&fJs}c{&c-W>Nn_(ZT(hQx(lf`=?Uj 
z;refWPRG|BSx=shHHM$$EABqu#c*r~KU5R^2Gcxr!2Jnv8`;U)a8erS<F~vs>8| zSv+Rw@xFp)Pkx=f)H9wA<1ze;f$K^mHRdr(js4oXKO|k=>Krb(K?%Q|To66nG%kMS7eX|I zYys7)Ds}Sx@Ci;>f2#H+kLG#D|JzG|joA=!puH_q7G`cCC8?#+f#zb+$gw~ydK!ebx|Vvuy4kYmb8$g)dZGuAS)_7?5@1(zK&+%dG z?8~2&c9-;zm{5sUMp?X4$UlhkVD8k2jVfkzO9;J(Tm7JL(>lXm>*o3bvsh z7Jt+}gs+C?$kZ}9hE^`99QDe>Mje7WFAyH%dys1%eu67jqJ#Q-zU0~Ee9 ziK<7nUq%onyhj^v`h^s?9q)ddewzkuYFkN6LmZbL8fb1ULF+dznBaI0!YGK8HBu3~ zZ!Mf-WpKj!@b+*Rd+lig{Bm(n8edhYt7u^Lt7fDL2*KAo|M#RicVVpq^(CPopz=RFc5P=xH9fv&z79#7G@AK3_o<$a%{0X;)5~$&-`fD zarbufyWsQ83Oc8eMHPvn4h>JYC|TPc6Bt%0_kPO#uU6C1mQi`7k9i}GD97kIWnN^n zC>rs$J=~XMuUecCwxWCYo$oJ*&i8(m{cXmJU$H!hTSniT_eu}Q7q^W2WjZ8TuEs?~ z1!*rQVxA>^QlV>&Y)N)#T!|3Izc0AQi|8OnfMgdV%Sh~xaSc&({Y@+P>Z>ry3R5B@ z{8B6N=0}{d!ln+3GpzIhBfi!Ou%w6MmAgqqEqNb-gF+EM$*Yx`96g(we>_wzOLhHC zSTkMleRxLCt|Q5^T7E#0(y_i2(-%ZFKs)Qrr_t81S0yfY>MgBiA;p?ivrc{4&b!Y@Jsy+{)~U zbb4;VFXPm029|(COYf$C?z@wP{`Ibe;ivnOKI?d@OUO@E-Z3VPt*@+MuJ>Pb5)CU8 zqAEC-{d)>^$-AfGUd+d(7v0lRhkC?23UOr*E#j)%59keVy{2aN?cjf`WzBp4nQ^cv z73Ti{(m*Z0q`y%9@5jgB@(hCQMBD6#pv*;X&UNE)!Rr)dw!!x*~wj@2gol8aXb-iOA1TP zWSl|9Iz}r7(N2lMnveLri!*YcCFz2lHROWtNJl^0&~AKI%w&&*gt)-~QG&?Y?{Ow(G9_sa=V4xW1W$`48UsdVA+(m+Dn^mt6c7 zJO8{_+B432nw@#Z8TPo-PSb4bqJ{Hq@uCI3UA(}KJp2&5=)zaoH~;tF+x_?40rI>F zMwl3J`AG$sXzUYjehkC1lx#hoSw3;t)G!@SL5?l z@XKF}kLd?gNEn!?{E61ll^|0qL6TQ~8HJi;`75|%yZec*kU+n2Hbish^AInQ~Xma|ZI66OaTcZ_D}xd!~mBM!5NKICLO_0gy3 z#$w*C^K7?W7l=4N{{_#nZ~yzh*lpKeXFvM(ckNsM_>cCTzxx~e%P)P_X5M*;{mY+R zW>>x8RW^10i|n5+d!v2ht*^6hyz`BA^}GJS{_K^{wI>{QfE^5(UU>G?eW}LAuh};? 
ze9dOZC!yCGn?<~Agq|B%FpzYkC=g>DR7WO|s3@Q{z8U*{V1Fr+Q7}3&jrjBRx+D)s zQY5**1`p?yeCh!4IV=_(UO+TJr9vG=Kgh#4FpA^E806pp%Qwk*K|s!+P&$0ZiC8D{ zoPPL1WR)bDq&Xv#=7#)*TlFVBDTC*tNHYg84rm;}VnEOxJ?sSgQ?}Y(=_mS%Vir)| z7CX}hnKbvYJ;EHwqPI z)=?i*AMRV z{oVeeedgg9Vr>j<`k2Qp2`2mr)Yn1sdio-+OLmoD7JB{Sb#5}cnWFU&&1Kk^<=GTm2@i7u&I;gAk0M`Y@?mp7W_Lgj?abL38ii{;#>TV{`f|Y zu5P$C%axIA<31#uG!rub>tP3or%Ia3c^o@KX8m3OboH2r?v*SDJihdXHQ2iXZPdDx zfj&W+X_yQaH^M(x;DUBn07KLQOJlYD6-=INSdwU-n~;m#fN(%RIfQDECa)vTb<)g7 z8cv!y+w4m+d;kZaxCZrJ+jEg-KH8*CED+qTpB%NXZJ4n?|Fe(UQAZsiLU;Uwj1++f=_U1N7`y4JSb_#?X?`|iK%CcEp-8||vUn6zg<{~S9AgmmA1_qU^uJW}s_ zK6tN%_V|ZA*xq{K8|?h&o@)<1>I6GtzeDUHCqCHDeDaz0+Sk0+zVzA8+F#DBx4)m= zX#Y5~!Tx%B+Wul<+^!xUx37^zGcYmmuIEkG$}ic(%CDf8cU&YZ2Mvg0ig{mZ^-j7^j&j*>!9?D0&_uE1O(6+YkY%2y zZ6&QcN1CJPgnmTi9d8t93(nL;QLF<3i#`zV3MN|Lt)F2vA@t9K)G)M1t01E!&V?us zCD3Eo2eQ0=yle|`&NZ*q9-`DD=3Vy)3rU@^x=Nz z7WSbn=OE3rGiS+io^2P33&xiBLs^<`O>V+%p62_@S0)$W#@K|@m$HcBhu_Y2g(_?7_`F}GDvBU#Uwju zz`>=+``TUvLXnxXHGD_%2#kI9Q_m6+TeNVY{oc7Rv8%4yU_boN@7rJf)nD3EpL(W_ z@tw~L<}c8jA|C$mhwE(-&wlo^?d8Asa(e~v;upTyo_Y2&?ex=52Vp!O=N({s?!o07 z^X*{|J<-1Kxlh=po3F87+_%N<+H#}ayY(i!dou_$i1W7Ve`woo`H5{Nk-mGgU4PT{ z_Q}tF&K~>NC)!?n?qx^t3vq`XXh-g|r=4`jA@=g$eU|<4n=ZE3y!gfT5RhsvS>T%A z!wx#YUiq?@+2=p;30v{`FW9F(`seni?|Hxd>3iR2pTj<`W!^Xi@~k&(Oxj;;yb?Cp zXd5Os=n2AF5a-pf%L>@!^I!h5opAEWHh=d8`t`r1doHzCop*svj7{6r#I$|!3!k%- zPddT&-*2Bvn7QRlbC73lxol(4B};AB-FCJ43w=HKN8b0x_VcYVlHa!<{?Qxlo-cgR{`Kt_*z_yUv8k6o)23eb92-0L1vc^O3+>O&J=e}U z{&-skBG0y{`y(HEvVHb*pSCM!XAuJ^kW3Evn9C;kxOxNPf(hDUzQuD4PKs$I6=DQ! 
z^ow4O=g!SqNyLnZ%YGXfjMwgUg)oyWLk5CGnWWi&*53DVEmxK$W|~A&3t|5MLz??D z%|xA5FL@U-Q-{ze25jEAcu{jiUu;{mqn(?1T$60Dvx61EV;cBCX!M0Fe2!%tLRqhw ze~Laxg5U5%R>nw`1&~jJa1dOJ$XY}>=pV|2{&5|VYgU&u6U{dLHSrD6kE2U8=uh4V zIa#ixx%!R%FR@0OK&}?w*av^Ptvs%>?J}}F5mQ~l93W}-r6DnxX%;A5pW7G)&5Db7 z5h2$Q*(f{h+iRN`I5$vyLdXWRM~b5clBh0OiZ=4+K#*qL9M4IbeWqEj1kKkDkv}8< zbiKzRh)*TVUYBGC9MgvFC3>#(1GM}GE6GhfGB?OTDiI;0t>$C}+7&?B`Mn3FX3;AFY+ zGd*oX3<~IrQ0|n^e!jO)oqhc`;t};naa~C>>tmlZdmec%`B*vh4Lp!*7Y}Y4mS8|p z@nn1;U*L#j83H90(TGJ(sy1<4B8s7g=PS)O-jKU z>u2JQr8${8&^EsH&u9yd*f7D@vrXBvp7C^BP`-4wY{{N>;RUa z?QV+}FBEp$Z8yE2_^6|fwnseTk@n;>&a~$|>vN7y4D_DFlw!%wl(Pd(Kh|JX;_nP;4C&wbX@?4k=_X&3*& z8||HMzs&yRJs+@-e*`4^3tzSkt41y$p3){?r(p5 z`Q;!`>+H&n>+MfJ_dXGn{{GFcvuob-Hrx8C57@>Fo@Z0%Kg-5m z_6(ai_n9{NQhYx5dA8=2=h~Mpc)9)Y3x3yLaOxxMsHOZR!(HraGlgf#MOGlMvd{IagEQ4D<2&40Hr|+ZN%&CIBVel;~3&D;xWhh_DFNYYxJi+ zK^|gYQqnW_fiZ!DD+V9zchVd{ezABK(##vcOqvCJ_99C%+Y0Vnq?v2O^jM3z1|2GK zE?enV^i}>9o$7$Ce!P%z>vDSA+_t>uHf&zn8!~7SaXdfS^0k!&&f%;g2SJhLF`E|f ze+F)twX}@64|GX0<-k6`{Xrtv9!WQnS7c$VC|=Np{Ehvc0TTa1cP&?e{;|H7cFpyW zGlCf;J3%yPFTF-kYtjkMICD#%VK8C4C0<-Evn2D>ld(X7P_8T2Huyjg;3Mf6z&pAC-C!+7FTs+DGj)st7kzTTThC&ML1^Q| zqz0rvyyzl3?G5hUp&pzXE_WrkBY+v~3`|Xm8F0dy)`r&r^!yjUg zf$Wce=tJ!3k3GfCdHf^oNsm0np7xmI?Yw6{$=>&t*V$WNbDq8E|Ksf~p!7DfZQ-`t z%p7A(u^q4zJ77D8n3Gn4-TpWd7*-@xMo!1H!C6PvGTA1pbPQOPb|HJxa*#fMiM3 zNONe0&`+>o74&apJLzDxnGy%o*3Zs zMwoqBy#JNVP&s}f%p=XbZ~vzgnb!9(NJa+%X{P`EU)>oMvVIs>ITlbam684AC=>Dr3hW`8jkk2pG%I0> zlBQVoMbD*^UNcYe*PaMOoHNM3dEKglNR1a4>EyS(gvqXpg>3D3d0ldQ)UO!6FR^WH zU*VR`20AaxrF=i*Eyb`yni=kL5cm3Qayk%QGhw2*a7nYq2m2TK3FR#=8}*f(Ao+>> zM}1L`u}<|@(OADct9L4x-J|_zE>yc}yXq4xFZ&wxht7rVujRC_Zhx;wnj^Mih`yM_ z287v2vwS+1bt`^t@>x4U8}2cNYX`pmf1ETE%x4v>W2)211qRLQB&&xIVfNsjcXh`w&wg!exTKj8CeHlM2d`V#eb%3D9i6f- zQ?66eOnTtF=o9Av{Ox8NC;jnKeAcz2>QzW{a4-z*xH^m+la5fwB}hmiPJ+$;IE(&w z&fB$7!tB2sLTxcz!@98ZQpO9qAn<~78k{clILmLc@qS3LKFC0>gP4(h`4!xzzlMq+ zcWG3PmvEduei$>SCgAqlY9TIH9O8*Umn~Bgx7<<#6)Tn(^2cPGk~DTDlZa#Q&@v~TkO9=UW5 
z|KPRGzg@%k-@S{U2;Y7ACK2Vg@ZDE$;;UvIrP!FSfcgNP1OEGoqNEFGR z8^wzjM4`e35Jxu)543E7g9rBGB@8?b!#PeDDE2eB%|o``87XUp)`cE}w;$h&(?zZ4}Or>4$Tpdn08? zXPoTU2FLoeqHSB8?A;cpdbYvguC4Ikkbc-YekAJBO${$knKX7hE~KAB>T&XQ@&{e7 zb%Fx#gsd$!``aMl4is`uVn+QgIuJ$So)7!4=dT2}jKTF+iAfKx&kJcLM3aCd0wGng zV=P3L4KiC9CdbkLW7Xv@L2>1b38$|*TN;q&@IXqs6B~qe9?m;DxVisx8NS{fR9v{^ z*~oHEh((;kGc4R*Y|^aup|Za0)-SpyGDtFE=9{-zIyQZ{exbVK^VBC#cm9CjEvaz_ z37y4PJ-ehi8~C{2x9l)kOg53%n^6uU@R?{oVj)&r8KlRo)9Dn8>E!=JhGLLrC48K> zS@!I8#A=IWv6(g_4{n?pba19Ni3DG-PWK$^)+Nn+^A#n`y!1yVnW%hp&}X~KXE42e z1K(td9!b{lcxa&GG&Y%!%Z%|2!pNH|hYzYNG)D6H=f@)xG;d5cM06867wfzaj+NN< za<313m37T_36o|8#V+Rp;UYo1=7+K1wr94qd=iT+hix7%pVRs5;Bwpth2?Y8g`2bf z2(FH-ziyu`7G2V8zAygo*w}@V=0AYbr&#{Q`QFv5k63kixz+s72y#wveKly4>hZ?i z;GG#a7gCNqnJ9=(GN~;bKO|koslYJ|O9Ja-M^A9+KoA@*aprTe*MM<7T%VQ+J_DkL zlr<+_um?s|*i2T;*(@hX#99}p!Dh~;$GA@++4z44a-QV$ox^h;2Ft4bXMF_wZW(qs ze#wUQaW*{#hY#$-+*#An>VYOGQLM21oOY2Sg;1VIA%8g{K6h?9mWK#*JQ3)8M4pLE z$K@3iE?ffj?z{(m`VPdnaS0ebdK~)p?T1z^TA}tWwNQcxCKJb-E0xFURSR+L%d7a0 zfBk}Ae*GH1^ADu_^G71n-{P;oeu2OK@;QF^>H}E^{o6O6;?+mbWABQ27}2pA+T30V zO=^_H-Ia+rmoJRA4Q|8GjvY|Ha#i%W_dYzgXAQo~+J|?KEXVst7vs&nvyeHn7ZNA+ zz-K47;H7Qz@#NYmcwxtETv37y(?!XuA7g7)p( zOWzyRzZb8)_BuZM=skS+{9}0W^K(CHCHDNTyG--rVR9|7bxtTgK0q0V)kbXk_f&G;qLGd~%{+yB6 zTzsCliXbM~Hv+%o#9;6ruXFG^$-@x#U4=^|Lm)C>kXa;7kR0H2PC(cBkC-g@rETxL zfd98R;w?c)G81QgRHge&n!~Xd3zmqvkN%AT)nV`-mq{K`$fv{Xf9{9GkzN-N=5P#$ zAIGxJ=~|h+3nTRFa2{d}EC!b}Gn63HVC#dw%geJ?!Qh)6>NDYikIAwv&0x?uk)3A( z-NB5GGkKNSWhy5inRpEzKT@>EilQ4&*Pc#7enwFHs{X={APu~Zj|U3fS3R@bREMwI zDCnG*Bcm`mOfK;U{yK*P{yGb9v-yj6Z7jrUiyT#F3)Sl)$bsSUT|u&n zpL_NWo#W$y&BJiwo+0&kbvaP~WLvXcIrd0zIgw_LaW2pGc=ehuW&`<9ARqfn5Z78C z!|Jy9W~gte9BOlhd;GePw%9irmj4WnKK=+1J=t;Yo5i6nx#RxwdFD}SJ09zzHa5sI z4bIhGIYYQTwH50*mVa5UiuOHNTH^IV$}qgT$cN-f2OZ=0jwGHu{qpjmoJg~bLtfkj zq?!B9c~GIGnFuqJW)Hl)(R|9yk8!?XIcRy%5XaI2tN;xLE$jxz^+~fjl6>2i8fD1L z9S@2Q%uxKzSeGO-5|8WpI~RV^W}^6+u~2s?lCmS>(%9X*6qOBbPEubxDBYobWOf+(0j9}48lM+7++auaEeqitMV 
zZrUgE9M3b+ymTW!N|h{&dm1!EpFaIDeE3KVA36d9`t?WK)~!+ht~*e^Y$=p0R|*p+ zjl)-8Uc-O<{dc_g<{Nn9`DgI)tIy(-mmkBmmmk5e?>vWJ-})22ec?R5edaiRc=`yw zy1X0jC9cNPJEtRcX)i*}M_+=gkqatUnWelYohH)ryr+wQNPw z;ZnT((w{MV_Dp%9vQnigc=*vr@WXds;PW@0#ha-I@$`l{xHzL9E=}%%$EWneBjbAE z@`#?u7}OQJ+ux6!ogcu??rpKFcPH%X+Z9^}_QCx(l|vyS%_WN!!s0~>$q#t;ii41Y zm+YF%eoprGd_a9(k6n*6ThJSr%-Za$PbO)fp#w@793T4fj)BY4*6}$%LD6v`!5;W= zt$wk92TU6nyz^XE!24bd$;~TYrvq#Nu0xZo#h1ol_(s-`Jpw1D(DD-^WtMP7M+A-^ zgMWa7 z{*?T{;1g%bfx!UEKda0GtNu@Qa-Na?a-QG;$|D%mPo8N`g&kAEtYCsn13(CTj0tl>mUA);y7T^9Q(%@ zVk1^tvi_X)DLQ3+8uYw>J;wz5n|o~xSb3JkCCx&d#ed13?8Br}e;_7XGErt*~@|I&+60cGv-eUws-Ux!EBZdZVrucT?o{s$l&>n=O+O)c4geBGIGOC zeX-nP{um(0jZ(kJ5+I20pbb0b`lGce?Rv+ugJo7|3^QE$T=4N5L{vxKaqgd+yNxva zeI>9>T^ol$^0{M>A=H+GSpF4@G>dGsH3a<)Y|mq`v?u$H4oSb)BhAvL=kM; zgqNWBqv~OT%-;jHmsZ?PD*VBokB2^F9oXQWXCnzwi8up84Cd=3_}?5UTFzwS zWOnk%L7YBz7^jaO#m>!}(Y<3w6e+;7%5g-H<7LJQusr zd#hqemph3VHpc$pt#D>iPdu_}B;Gtc71t8y;@hNo_$hfleo0%1A2XNX%X4e-TG9#} zT0RWjo85vg_f}T^Jfu(SgV%P=!MB+ka4mH) zJ~=ZRAD^0q4^Pg(hgAQy^kw+w;zoS?=pJ0n+JU5vv(dBJUAU=0Jo0}1@zBC?cxd_n{As}`JT!FxGRE}4m8nCJGPJvp=6&5-V|TZ<*iP6# zq!(tjYk_L{^U7NBqNR&s*_xF|iK2VH_fM9_z5B=bT-T_@5Tgw!GdRr^+!y+xS;E7KnsJeAp*y1b7PTaH?A1+qa!>> zyYc9q^GUN4=aRsv?-nZ(Lk>i59^^bHNsdLj#?=ucTl_M7lB@8SQbcB(MgMF^57}(v z9rx=I!kbD654`dikjmMbPWeu#?-R78|Fvu|penq6NMAXZYHk(VQEp77q@8xcCCrL6 zBh5r|btamwXI*=K6!d{?>FLgXl`(}A|A`bNpP+%jAf!2fk!HD(iui|Xo2V@ez8szp zus_5m&6>-7U-7PE#Hu$II(B^L zdPtQ)mj_>#pT=48EnAjh;O{ky-%**I2Ky+kN1Bt)C~3Ck8T?X{OPXWV=e8+73i5A| zX7%GR{G3Q|eJUFqdxgPyl6LYto`yiz z@Q=U68k0h8`H1+H2YE8)=)sqb{M1F*{$^+OEqRZG&RZRv8)A`W`|^>? 
zc0^xV2}rbiPC$t*FnV^V5dwvZQXK~tC#K-B0Ap}+xgKe@0p1+K-8V>C8X$Nu<>Z@u z{HWX&@vp6&I&>HZckaf-QKL}3a%JT4ej?hy#F@WEoQs!j5c1Ob{6wPj89-Yt_CkNKY?tb^;;+&qic4!vsL$PS#0&Lx~ zNr*F(<`TtF3h^y7|T z@y+1GB|jtXBwmS|7~tc4-d{5p(NQ#h4gNq#@CmaA_ngGHQ=cVj7vZ*yH0x|J=QI1( zs}bh2ls6bmdhBPH)jup3hGmuG;y0F4Nq1yZA8Yz&4Qn#%M1#)E?RnwMl&l@`(0@ ze+W#*Au1DU%f?4lD&glAB*G1Zs8$NDF+N0|z-m;Ts2Oh~~o0#wZcSv(~1it0Q zzI)sU$CWpJMEmEr@5Kuf&6xIZVD@7~>kI3a{8IYN`(FRKpI+IRq?rk`gXp*vYRiL+ zVUA1&_C;3@zD%S)eSw{}lxO*vOrD5-SU%>T%0cy-zy|yk9eLltMna^xvS4AM+GFrSXgcrfK8>_P8b z9OE*9ruuZ9C4v9vev^Z+Gq4VXcrYxuIN7oTk*xA7J1om_+9%@Azod8c!F|}fV;fd4 zTY?E=Mxkq`4yae}4iqDT$Rv7nb1Lj79jGNbk>= zSK#H{!|=qKUU+5K7s`Vw4AT|^{$9=;|* z{zc++e3?2MpQSIt_mA(!^Cy>M<&=K7wMIqxMl$2pTWVs|sNq<)VhQ#=_#h@tnu20Q zN}&FocVX|&op|xNKjFpapT--nJcpN`dmKGG>36X=tXTs5$Ip27{3$FS(E|sGC_lP+ zAznGQ9oHU7#{c>F1$=qw7`FFnj{V)5;lWPLv7>ckY-w{Zj*sYuZZ*oI1l5x-J{NKq zibw9^h0*wd7FfT2BeGI>{UrG)`M5v8v9HTM`<6nJ1D{hr^Amyil=_trW&WQa>lP^x z-7Q~lk1uuEgzmG?#fJK!{p`Jc%w2qv{4hkADPANtq@2l1`_!-Oq&YGuP(qaiCkcwd zG3LdpfcI4fliQWk)-tAT8N+}y6TEuD;A7l3)xX$+?Ls<~ z&m2-N=DMr#h0UE`jGm(mx&y4NRa3GB^Y>dD~y zlR?U{F9wiYCceqgeLY5PvTukFv9Az0_9pQROqgX$ajkjIA9HV=PlV&W~5an zxyj}HD7k^_a^dM(^zQm~EO33S1Jecjl(y@~p?8kUiXSuRTKU8x`!jC^R3DW!xOM8* z(WwWMBg~s957Sm=lxbhy=b?Qg%1WBmpWWqye!HPI2~^&dOF-w`xK?>6u81${x9tfm z7ay0hT%TuuCd`sM$qsalj;R)*lwlLvdj!FOObj8`AkBm17bI{0!Q*f8q6Q3THGft5{3QNjDHbh@4tt0Xu8 z_h0VlJ?HkFoB~eX-sWqL)0V-Bj?VEqXNHlQQ$(aso;W5;6Zi`fMGEUTgLzp-t~g%0 zkr#yvDq-ed6XbQkg$w0F$r6RpvwJ5TI=CB;o=?Wh52xe#3n%gFS5l1mN>a&6fUe9kMqk%;DucY`0K-)@Yl=B@GFt#zn+_k-?FCR z=kzIrsrWH%27XMLgYT0FM1FtFSbz_XOvH1W`s1xb+!aPRH8CQ*c|=N+=+t zH8<|O>voKuGz!btuEE+3Td{xdK|J%=pz0Ukoi5s_eVL$u@WSFMi3k7#|lnbB`I}<1G^!`@tjk*FyOx z&8^(^p!TGCZ8lkEkkvOr@a!H1*CBQw-{D-sGn@XbEai%HBC_dBoKLVls4qS&hiV^~ zC^A`2a=`!F%(I+lyRf}H=+isi>K8SFFKc+YSaO@ee1rBq-Ne$5^hI){!bqNhkMV4e zmz!Oi3wdp_Oj!FoGPn~Np#+wh4bT$YD;t($miB)5OU)Bg4g$0Q1a=v^9LZ=Opd@l zVFc&7PL3_*94=9g4dx}*KZIFbKJ_p;Azkqv`#e4VxBejz87jtBL+dWk~YW$3;k zXhYlOzM1}mI)i>1MDM;X2u_``;L76rHJ{`5$XAu;=pm 
z@{N4?MsC!Dyc8zQ$#jwqvXIiDP{S-JIgd9p6-`upjXSNcq?r>OJCP-O7Tla7fX-Mc zG0u@N6Z}Aty28%pgJdy-2t3CKC1ek-;|NHh(S#$;K*19w&Fok{iFd%HSsw9u*QD{F z!O4!~;`7oMnK`F^QaMgoJjkEV%EX>MJJG*iU(~Ku8^ww;Q7(&8B}&Qca;cKV1;vZ; zmnVAS__2dHd}tpwZCQ(TYgc0Rf`!<)U;)<5oq<^shM{xI#<;CqNepUqJD%7+9shW0 z9scpeV*LK-3jF=CmH1yzuEF1~EXHpa=HVA2&A*+SgI}`d5ILTM9|%9D&%+M{Cd}WZ zDZbB`kMHQ%7fCbmF_GcVlV{-5Gt=;K;xv3t=iWFn887S|j}Oi)!dFC^uMuhf;>2Wp zd3*xCpzy485`#pm$e z>o4G4!dq{@j(4xVg*RS#5%0bI26k@Wg1GoN^c^%1-~9L^l1P88TC~LC+0(IZ$s}A( zdk`PI_%MF|;CXyN&-aB_HO93RsgYx=cChg++l zTzoDRi;qXK{6%qZ-TQEK-!Wt+Wm2G|638$7fjsPcv67hil72X zGa+aWDZVb`$48VP8{G3Q+2ehi9W?SBB|ITHy?ujkPpVh<>6qsmA$aAofov}IlHDYi zNY3D=Q<^ufN1FL*oy;D4vL=(>j4+1=70({ANpqAqJ4Aux3E56;!tiY3=z;rS(0oUJ z$hj^8JLl$9BV86RG4(_U-(d0oUTpct*!Ke;R=LTS>A1BKE9R6mGjzO?Ysnriu?xV* zbih^UaqhS4H!k>`_LXfMkPn`r(^=<0ejhw;dipWAI`wCmJ-CCt$;GqC(K@6a&rV#1 z$DBk4wReVON+Q_O8SR(qQFJN#RQY^cJ3e53;O<9Z^_mX`?PM?OLT+G4ch`f-$)wqp zDV8s}aAh@lV#z4`+*XJ*^R<>&o!L=`@VO9~#MgNn91rSokEvd)@6tE&XVbs-&*C6} zT<`0G;MMm>;QIW2kx%IL8c*(0K=WVr=M3_L=XfvA$1z!s66Tns znZP7b*GnmEX#yu^J}&_#&dXa?M3^wMG6Kg_n3F~caxD0PLUpP0#DFKe4>_JaXe@9H zVq`HsDTW=uOY{N?7+L3Ljzf^yA(BZS=LOIK&`7ge2g}X6GQ!L^1{{oZj*P*?`2rE= zE0->k(N1IUu07bjV<$GOU55!{$DodUd7}U!AFAJ65smJxkCx5vM-w8)H&w5Iq6G?~ zM6P(`mznA~lq3Q;zh4`Cdww_m>*2NdHK{-TlG+Quqzu6?siW}MvkCb9{0#hlaV~y4 zKNmk|&B70a@3Us&+sqmGE`yLhn?Qt_zy$i+v^headHVyAp za6A#{dH5z{IX*u<4_}|2g|AOe!MCRp@b#%F`115@BHD}b{o{v_x_vQD?cI(UQ)Z%Q zkuu1`Uko9+D^@6j2@}U4BmE5C`12EZlkoN%FW|j*UcuEjU&5y!y^GY;L=-7n7>%0T zk9XdC52?v1Xx*$i=1rY~$zum#;q=jX_Iwil^V3)H=IPD2xO6zqFBpuZ8U1i#U~k$s7T1JtB;dr6xJWr3CD(imw!D_VzVC{kWa(e=ixkvEvQ{z*91qq3lc{jtp!#Ha1|dS4Esyf`HgNE=3|^+; z4l4TP1P_*D*={~*HsY+{b8K6)gA8~Ewt;u^;o3s<$Z~KFW4Vcd8%Z{zXYlhN!3J3+ z&0G)LDvF?uKo%|Q+kytlz>fpy7(`^5{61E^3FumP#@Y2f?vFwB77VCLbOOH!2JbNV zoca*SMDd_wVgQrXwHcQ+-M9$p@;H_CNi)l5vQoQjhsd>l9W1NYCw@tof5?alGlTbo zPfL2bh{(sB!0#K|oqWP>qNJ~$O-wI_K!@skwr{dT%Nyi6iicPeQ!Y$Sd!lSG|H%oi ztR{=J8Ie!^r<|tCMP0f1dW%QSuS)p*cR{WPYEx(Tt&Z&dXB&k3r@GSqSsW<*I{dPP 
zP+b93_Uy+5etVj~(@q~Ff1;at%T0nHmZLr<{_fa7h^FL1D(i#laEP;sTisV-#`NHjBefFh>~T4-v-s^8^(bIgbi2*HkmtCSu)+yd;av z4u`;rNFBx{%!;T(1+YhW202dGnl9~AdonZN%dW=BOijUs%uFHCm(OJp&SLM5?P%Db zJ__Z{i+s6r$rm5^hfbIv=gyZ4WlI)Ag(8Jet4MxyuTusYb9&()=_~NV?$P*eOLzSJ zKxh2@NN4;jaR~mIJQDwrHUa-}b~=7bpMqbrX5h!nnM9hW6KS4`@3JQ2yUeNhHe)uv zNtuOjQ)lCQBF^8Z6Itec!cQ6V@Llpu{6gEGiBNx&Jd4Qn0xG)*Unb7SH-zub5a~{w zfp1Sw#g`{0EDMj++MS)zoP@`HEOr0?q z85xOq^6~R{@wrFw#w*X_y|-V*wU0l*OD{f$J8r9m%9YCDsmC6|LzyY))T9xn3>%Cw z!}_Cn;|4geWi9aAm-ze&>3!Y+9Gln)hX{wqcg4wZU644s9ZnBxgXFPYaFPi0;W3?Y zAYmXD4e5<)#fqVH!IJ3Kp(pu98qOxAlI_@k*;j1s8j+a9U_bbuAJcI%Qy*1dQ$m+X z0hmRI4&dg$8Z-Y*xUuIUCedI>d|Nq9EWDpHY4(ybf#a124<*S&yac*0gPgs4;Q6|) zVKW&l$y;Alr(pzra%DC>UJoJC9685zFiB(LtWUudGU&6OxIGU>GHzIwf0s0SxyK$q zb6(KAL0ifdVLSXWX;$!HLb;Sd_3@zWgY0DplV*jTW7}$fv-eX;Gxt;Fmt4htkO7bL z8-ep2)x+Bm(R1JqwgGA8*wGg*18_SLzm4I`>ce5MTzq4qFK$}AMdD5fbFfrH>hf$I zj8CRJ-tuvUm6Ng|8wBvnh)q~7wsjc3z34n|`Iza(MM$3{kM8sMVdJNQ+hhOb7x}_< zP~V*X2x%rCaY?hT-Sz6CW2Q?ZDGINDq&I^QXX~p2zb>y$AJPs%{3iQ#4F5VWr|7IJ zL-~Qv13YLxqrQ_Y!5Du%upfnUT2$8X{~RMhoalAjZ;MBlGOcVkBi#tB zpKLs@uT;NFd=&vP4ijejp9n%a1rx(P&TyTRA%xjmx2P}193$|xBnyFmrj%zQ`4K)F zhhHiZ9gHAS78HBho>5j3cG2Zj&sk1icqjx!7Z0RQw!L_t(L zpkCGT7}NYVyu4-@e%aI)Kg@4}zbtNsf3E3-e{Jc3f9~o>?O8=)3K~7=JBCyXPjrl56#OM)uI&Fz+wW88Tqcp^ zv^n@TWgcC(kjgJ0Qawkm|4xYa9DJEF7vE$p$B!8+@cUB-aN+m{96Nmo33KM6bcu2( znYRFn#OFnPo;)a7p&S}M&;(t(cf_dS12KK_1T3CE4{KMi!LFSSkUu^S^Jh%L z)>RA9>HhlY)v_u2^z4qRH&?;<;Y0B2r|;mGmoDMlvXMAJg!$OSE;u&68;*_ah?MbN zaGVJ9(Fr|qYI1L!p57nZMt8%40bS6hK|K^NSQxdc*TS0RD{(QMKifriNjX(!^}`cj&v-ZUAXnwF&h9}8KQDT>yAa5 zMgP%q9zyF_39=5H^cbxBrE)%)EE}S_3gqKKS+=D2C~UA-`vhb;`dLpdEA>%bK|gio z*M9z)VSRLc$R-r{QLYWBKGBQ723uJdDL0%JJ4ToTsNb0G13wbd!+m!AS0M6}>=Ik{ z#S=pia{-h%TN}PUBf7KygIO^LEC&s&E@5(Uy>kp9o(z`3^S>xer@1X3k$X;xcmj9Nd4G^cRF zAjg+vPFvS_u0lANnH(Eo7Sc>`NwX1W5uT0i)no7z6AR&$3z6Z-v7AUVci8H3AxS<) zfEX;u4AC}YljfY^l4c{$@)k488gWi_GIqAW;bhi;>mXfaQ3BHSb>1UGl8@iCmg5AL z|6E!cE@fqryl1eNh~og|KpMZ!ty{5Z{#@L8TP+kUkYAQ&+*c$HM|#x6r!()vw^MJ! 
z&$H{}=f%zO^UC)4b#p)bv}*)@+CLh9IW`f$oSKB6&P>AhM4-P}MSxfQt#nt%X@;dzd?0&rY>=_(6u^W8`^hLc|wQyJU8pxL`4tWdak)I%! zS#&}^LZRI8D3Y%bN)#=Ho61!{QMzHspD!LIiWNeo@}+Teg|etu=T8SXd9fs{K3>GI1%{q&gH012bP@ugu{u} z@o$E{o8b93{m*qVG2(T&dcDG4stW!O>Xm+aF&T|f_qc*&<-SL7qonzI2=p69wBGB& z$vrMy`MFNBt<7?~b_jkUkiq{G9NoE4{pmU19^OFjlI19P7R~%h!F^>N8hK@+PsdpH znEfDIQ|@y8$j@;R!fXSs^--VF*v%)$fr8uBStqt#AeY7;_m}r$_y_Be?IeSClr%FU zGtORI1bq*~^>6+^WG|A-{K60=$o9VxV>`}EYHZ(y*;4zb?f8iF)zv>kWTNZ1KYhSgL z_1BH5Sdbf6*(aTNInmCUz8GA;=|&L{C`EA2?N4C$fW+#{X{r~~YfjLm41Ar{<3eQ& z!9N$Y7l4j?<+y()qrvoOaL@TNF_$C4Y@aUVuca}8w*Q6qkPa-iqqbvM{*>6_94Np2 z!7{OI?6XXksUB~pSwKF>@`uM9l8t;Wi`<8(K}ux%_k_>9MJ#vc-EF^aByiSsS}=2q&~lX+UE{B$_No+ zZ{H=%2EPmsCgM1O%#7;|R1E91Au2;?P4e|1!5${fcFxu|N1aq0l^l&9lj$-L{-4?w z(tJ{hGU?2=wl6UG#ZGFk^O@;HoVTNDmC8acOA*OzSh^TiwXKJX!|%gelN;ilF?Zsv z(Rbs`sm*Y8X$QQ&t}otOGZOEt8IJe2jKtfU2I1qm|qYB#8u7;-7 z%cC&$rC8oLw7vHhBy?eCY2hjhT2 z-Yv1JYhx^Ea~D?kYKfHtyJGf$UT9Xg9!eD|fr{m-p-0yNICGo^+vId2WF9&8kY@Y* z>v4$;8q_`CWPkj>CCwrKagGa-k@Dp8+>SRWxDe7z$-*TE8q1C^`-l!;K@#I{(;BHY za58aaGsse;NJ6!xEwSO&&HZElrh1&%j3S6(s-I&%0wc|41Am{|GgxvLP@AF~2WE>X ztiBt9lY0!3cRWZkr?%LhrVsu@-W+A?sP+2fDv=JPuKkc}B64{!N@j(sLh$sUYE8EF>K|3a)peud6D za^2iF1OKmb2%H#TxcXPRUH9u5;hAQZg|<|GcBtH<7lyYVg|rbN zn1&Ex_I$;aHSFVD7hlJEP`e5^Zm7R*Tq*1r`J(sT^Z@$5oi}~4&zQVAtIUWJsq2wu z+p7IdM+_xgu_3yXYuMkNz8Sne2C#M*cAbKKh4Y5KTuOc+{R(t$Q2S;-r$1`NYu#Ng zr$m_{@3Qeeos*enu7l&7wo#sFIaOjSlmk84e8L=+RsB@`RsC0%t&!ga{_g33^AYX4 zWZCkm`>9IXmnUVOo($P-9DvEPLG49$C#Vc3i8PZVnDSU4*%DkuNCyb)G@-|`J_17P zgBo+|^&y>8xaEy38~&IyXGcy0mBDR0R{Q*7pFM^MK&03LJc0~5C*LphiAMl)&?x-i zlL3Uvo#Zt`CqiV|g&b$Y%N?6K3J+YFsfidna+nb2g1K{}Ua_Ls+NTp9pF0wdPw#=u z5zUc4q$Lsuv_|6SPB=KR6*ly%hb`^vVNdtFad==O92<5oGNv}ipVsumtJ?YY9GFCyh%b*%!4Jtqlv8IYVNRb;gjq@RcNuf=eWrrHy74m+=bwo% z|8#aSetu{*etB{W{_^5J{PxO0eEf#~w`+_8Bl{s~>M$(n zav$2>To6kKw!rI$SK<3(M7R$w##{Sl;pxp&khydW9-P`AtA@76__p_;b*-|fS|~S) z(+xvG(nF!V1yCTq0E!eUf|}JTqfYrEnAPQ8JhLVNPtF{SBRyMS;oY?{^X^($-LWxN zb#H=s?e4~ku1&F~drNE^+#Rd>biu;D-7&C5b5tvJ6DpLfib@shV(#qaIG=Hj{E!B8 
zBF?EiGbUs?jkdH;L`^wB8^lbAk>I3Lz~~UBn>G+?K0jOYVVQ zG{>1-Zdn&r-wh#$+!G}Fpgrxo-aw>L;x;{|on#x3e?&<04dDdY&if9t667dp4h9Nt z*Bw-%gLyD8We4|9ZtlIDCA#E6RdQT*xIPjk00A@(0sXKiqZztv`^{eoTQObx?i$3w%i)m^gW{+pSB4ue2EPq2XGPCv8%IS>Mbv-ghS1Z4>RV%)k$p)9 z`=KHunZYDEKxPWXx{RS(Ms>(sBOt`t2{X?|GQxQ{+9ww-Y0il_+qvtKW;*Z2n7}_5 zjTz5Rq)#3qav5UD&ugoBIf4>JT`?f=eqmB!JS@y@_ZW!=gqemMiyw(J|CBi!zh=$FZ&?fR%Y~)*?a>YR z>8b7b{f*=Jua7hFw|A28zdp=F*18c`-}`Pnwq!Iu*fIwnZ=Qyaw@$+QyC&n^12gf- zu|@cF_cZ*lc@%!$HWXj(9f526$Kdlr{6gV;e4Dro-=wa@J4ff^?26Ht*`*=c*QtU^ z`SPM<{(LB1v=C~PD}nne6vE+zUU*~kR6IFr0FDo6gFSuPVMCAB*wViPmi2Cl$?fh# zLhD9Y*{?e`4(*Fo{kvmXzb=^Cy&bwXx({_K*GBoWwQ+x=);M+aq!4HRo%)RAbg~5p zuOcAKlmH~Sg8=>`(#&PdhdE|^pK*M}j#*4u-G2QsNOLA75g{{>T|s>^o8ZKn59*8CU#%k?N7siE1cpn3Trys+5aaJ!oOb=J`(w<@P0xR!J;#5k1@ zCCiNLGtEXog1UoxL-li?sT}{1f=ifVk>*750Vd7XH}ebLcm2poJcdzB->0ge{PZ2DT|Gz9kw?gCx0Q>S6R4i>Azn$wc~9~KEHl~lgM5x5d6wli0;RF>?~>-6 zV0v+nsqb^Y*heBDyI6cFNOsSElnlmL_wQZf=GGel|42x(z~OhM7lYS-R}a!3LfD3+ zXYvvGC0O>_(AuP(YgLyyqW2X*Ru(4@jSRqDhEz7_MFn8A-f^()6S; z&bcnIwF~WkOw!C?(rhn-m_cI^W&u^kzowGpRqQHI5+OCfoz(A8a%V|Fen&kxiVMKO z&d~oQF?-G|f_ly!fQ=QY*MkNV$(jvnsGuzcE5N{yA(S-h+oI_VYX9FM%^{G^WZ+=E zk#6lS(>Dm|E$Ia5o=7vVgXT5Navk+i1|PD6{Nj?YCj#{)mHKh|)KN-!$1!`(G?Xe+ z0tE`@!|k`I`femFkfH45)~SUGWvUQyz8RCpC*YxTm&txK(59pkc!rq*Gnq*<1tR&N%_6uyd84G+ zPh#XRhERg?&ilUZAt%zzKb~Nu*@&}CYNE&%^8!v{(Scl(j*=RM{-RxgbQ)|d3Ch=E z{8)2inGor(k>(pC&H-tbL`?}bN`?%a*F?wOOrJ}fxt}g!W*EVa!qxj9!H-*mr$wITbM zM)HxI=F=h4?9{_P7y3py!Qz6!Gs^O#IaGfN)oC|M{<0>jJCl5hq3u71{37hTUXF=%Q^3as=1Yp84wk{|TeK|)T-Nqgt|a0n$prFS zvPD1|StktcJKL4@;^~5~^Ps*KfNklM=1~8*ZH^z6LFLvPBZm5_SC5_;U<3}&8LYl^Bbc~p1r`9t@I}bSPsXyvIa-M$H_5CXRK4Z%jXw!%ovH} zjQs0>OqzMUkk;qbLuI93*gEn)q%3HQYpFBv>FEi$c6uDXOiI94iBs@> z(hU5NI1@jenSo!3H2+3~`S;8@_~-cr_}4?L@Xsf<;_uHN#DBei7XSM2BKFVijvZ6m z;*I^&@b1Q8xVm8=-rw4f$Z{`yxThyR**6%U>>7bPK?bSggGHUfuw-HOMT49B7Iy>WM;c$6j5T&-|!^thuU4vy=Artqo}0^HdcQVUHnIy6 zdbdW~d+Ool(l?=8$#SS)_bwbebPVS+&eA|ai9MMDGns;t0+t6Ip9B$vbg@s84dq8L 
z_+=EH`IXsUenC$m>%JsTEU9TCVqejuWQm$VF+=b_QRso|j}m6_e~PhyG}CcvLt@j5 z=P&~Q^Fhbu#*KVgNGJQWn`jJtp0+~loNb6Sb1YHYy04J+YTPgGpMmm;zFQEHB?ODR zFlm;bxF!f;W*w2twiQxL`|cxhS-(tuw8zZ0PLm<$2|?x4goJd~SQA9S3T$S8DQmu)Cgi6p0n{`bnApt@~MGe6bhn{+13 zvQNiPupQ}I0qxUuaswp#2;}u;rkp9Wg3(QL^kq#td#J23Y@` z=UGSOH%gcpoJZC7%nx1r1Rz@oF=ZHGHnQr1kC7eOf4u=!!F96_s2}+IB$LJ}gRk=l z9@WLL7hd&Sy+lN~9x5w0XuKt#V1H4v&fx33I;dZ=F8#EAnaBDUx~bs!;c<(RMDdeM zIt-0#v`=yhS)uYSY1Y^c^lQY~j|abB{6C$idiiN4lV+bd(=`%PWCtV8%oR9pEl>G5 zi~Lsd7L~IbQieNTGS~-1R+2A+`Wxs}Z745X3js1h#Nb$?e);3OA{J3rklk2cq9f05 zscnY(l|u}vPXUbUD9>S7pQJysrT@Jc5%B+^{(5C1deC#!2l2N+_YBP+w3U82TRG0K zTtoVyeJ0=RCz8LoPm)u@a<~v1JO9g%C6O%2jZKI;9P4IFD#Q`mj zqe)Z}3>jMj*@Y>L1PqMG<;1wGLR`Xgjv> zJro4omR!T_nXwhF<1uW&l^I8NLNS6GgB?tM3Gmc0`RWMI77I=uLDHFHBquuw5p7ABQ=AF_GuhDcnEv9@5GY1^HHU8MHDVk7?rD6N4@&@p+&0> zxcA;Bs9Lc)s+O;Wd+(@&f!$hR@${isK4UPJjPHSYBimx$*fy9wrWGa)X^xS7o1tgh z`_S(G`e<}(P1LPi9<@ssK{+CnGrBdvXBXDvlSCrTCq@%S;!aiG`Q9P;Y*LII1jHUfCM3{%;%RNK!+0J2femwm@2|pj7i;p*t z#KA5#@TX;i@Y0UiSlX)@Zp#~o2E_|tVynBbaX@RNPwRvCw$H@d>nGyWfTq~ny&?AW zX^E{pTVPF>=2+gdJ?8f6g0^?uDlZlmAo5(SU_5G9DT6-k?#IaPZ85l0JKT0tMciDr zJW7=;g|TDC;o{kgWTP}9s{{!~O$_2MWJjHqBfCjpvtLsDkZ-zRLaVcL=8x)YMiQ7b z`GmnhKB`Y3+*ws7;VwBg1TkQc!IWazjdMTNDLG_P8=2IGY~_CYMF%(&NI57vGN)Nen|CB8k~dUaavv3C6SxP7lqmMs=kZXMi> zd}a6pJA?C)+FYIti3|bsQMYGLe<1SmsQpj6c5SZz8@PN_b|u{IaU0|XLclfm*gli# zl;fm_6QnO8;k-{W({U#1*1q~A4;)k{x9#at5$KqokTGFS_wuN`Jmj?%f+rKpXf|{a z*pSXUZHX*WCK%`#od4xTNG1;go`q5};rhNI){`EF48qp-zXgtcg-=jvj7&R4GVo^t zNoI+uGsi_g?)b#y!GEG*eLF>@`4oSFhjKpom=bUXm8ENZGD~DImB?TUkwHe_V}TC& z=FUi{zNlz22MFohj*(rHsm|miy162fz~En=)Tbjn_7XUTz5L<(D#7f`KIG>X2RYfD z;mQz@LxvF@B|*Nvxs2r2oIv^2BhC8N2L6;9f#hV_782ml$z&_?jVI72tA?YaeKaVkikJxvG1=ObHX}3;BRPTl$cVr)b z#IDSZC`ZN8Q!HfbZzPvSA@Y*`46e-T4+hG)wB-pB_qgfAJr+FC(Wg!sMxr@Sh`#9> z)(Pco<8Qodi9f}7hGgI;`a+s11`Kg$4j+LL=bSog;J32f877}Ji{p5SO*-rmTMD#n z@RSfWSgf+H0;9-Anq{CO*?o}AoA-UyJt-@XLY2@aZ8U%|wnrJ(_^ek50um zN2kbAiTAe;#q&!$;K|ud@C@PU`7QD6^6q$ec|T+<=!bQ^?#AKKt?|O9@pymR1bn)Q 
z>e@mCn@IC#M2f%MHymH>9f7ZRkHpuz7_`4@IQ>5iU+fx;&-aYLC%Z=D>qA7&_fJ8_ zu)Fcv^1k?B*AygA?TR6{mqSgGxl8q%aB#{%oLxY5oZNt4PH)7+Q~O|l*ZXj&PYdks z`2ewxMdEXzV$p)=+ol;3`u9eM26v%a znKCF@vLtHPx)sL{AII6$OiIRS$mE~cCO?xkfn+}kEZ(wzkp0+iEP)%+Xh1T;%rHL- zC8$W!GVoIfPB222sa^L68p5D~nPhU4634Kc)I#x2@taA0m`Q_*5@(MzlUzP&Hn7a( zqt}fElGFBOHpzo4kICfXf08ttUVYtiz0S4Pjti?ZO7z1B$Viwpa~|S+#JPy-Ap~Sb z!M>p~u{}vHY1eY9CzE4uw-16GWLMJc{;%XzKm(oml!EGJa?GSzCQC?X9%+{ASuU@A zD=)K7LY#wh0eCRwrEQAfx)|W`!n3jU#Spe3`6lbk@B59CW*P|ppOR*VHx7BEIRNQnMrdBkwGQ5q#w_}LZq4MGe33Pagj~$ zmdB`0Ti5JD);N21w`EkZ?405~W$^Qh2lG$Y?gm#5m-uL01vjR)WWz{iK$5eANwef! zl3jTT`w`2fesAmk<%ZEG!yY&uNPg~*epy0e$t@#sD`P(Oi*ysplP(+`7&Hg6ji?V! z-(AQUB4m)aB%?r&@i~|5g-LT*$9{dzu_!G6aJ+|X>Din8LY}0PPX0Rx`WzcUtSbV} zmfasDp2^qE&Vg=%V+@Hk@{?GknGq$;yuROFkWC~R_=Zu&bC)zH9X%#L$r zw7&md6eO~nKW`r7%U1w-3lv8FeEE={jxh-?S|A?93&o>U(fp`gy#gBET^pS`G{mqW z9WZ0sKrCJ|63f<%!E)Npo75L`NA<(Zk;BljWh)dcR0IVI7DT@IcoZP=IjHqL_~wa| z`1-vvv(_`%^!(N8)xDD z3n%dQ!|7N*XBuuPe-ldN&xhi9;!(M1VGQrk8sodR$DP$GqjafKC{eUHrYB6ph0OEh z1DwP;fwM2D4<(cD>*E~q4fYwbqa|$@VbZKn5@q06;%D<@ceQMtn4ekd7X1dow#kTiSwhqv0Un^*aNl9xgHcJr$CfzR1>PX3i^S@(hd z?VLdJjx*EDwm(6B>kt4X0d%dj8w{F!lP`}d1+hsIlVl#M$GZ(Q4GsE)ve-AnixjvyF(oFs0n97kfN7~gj%?h4r zwi^Kjks&5ayc{f<_K6Vw-;(B-Ai20MS=UTRv@P|6k;L{Ddosdy_U(OwIDj{c>&A}R z{*Uah_L__Q5?Bjz8N*Y z7t%~|_$BgQh8_&LRew$0G*3G@F{ z-iwa_RMrRUJ~m|5oNX)MJfM)JEW9PV#Tr9y1fhP%fFI``awg3h+wxU0&&H-3Mqqbu zEF@piWm+k8O+c6h;z!iBr$_nm9=b+BF+g(24H5Z|PnylCS!muyJLuy2q?w`0j&@!Y zKmrKFpn(BRDQ=SdBUp0KUxu-SLjQB}5Lg**i<7F0pnbpX>$ES*_l#v_qyJld=DZ5M z<1_hW%Aa(^(6z~QM??BML;B}^UjLRzb;x;Y_mnK#IEj-dj^gC;W5`TRL*nt%=-#0# zYE`a=s^zN4cZy5J=fQ2|%AixDd$DN3a2#5{5SLDF#Vc11;o8e7`0;8c{`&D*{NvLs z{OyZ${BZ3QzJ2Q`K7aB6-bmVx#FcX}wsUKgh|euQB2tXVcS8Tx`23kO`2LaI_#$;F zuAQ2WH@Ei1wS&X(-RTMVHYovL5I#IM3UBZ0hnF^W#-BI##0Lk);J5TS_*>Q-{477U zy$Ii(U5RTc%kk3ANqA&wAH2GGBtF(yvv8{ho9G%t+@1^d<>*@P(cK=3< z?$ZNBh%gt-O*bF;^OJ5%U~2D<=+NkHlrB*crHdEGy?5PB{+}$f%{l|3L`eo3@_(6) z6WjAK@-y)zCCe^pHX>^N69V~!JHQxh);35sv`=*fWZA&UIw)r`>?W~LtdktP6vLKe 
z*dQMuJP23}#Eb)1X8PaSW(0DF* zLXs|nI1lJH$kTbs8#=&QE>ds;c4gK!-MkveAac+(0dyeY9A@SG{2QZ=P?-qX)fr|Q zFlgYXfp*Q??E*M;cSw{x{Z)JjVVV%1=aP8?M)Nc=DLx~x(fvpea zvaAm!QIu=Xc;8N!figRBjzyRmt{?yV;A>s7?0{pCGzZvbTW zJOm=u|AVACBnQc*W7sJggOn&U+#fdM8!1_?kOSPAW*0g6K-3R3w{TpkT{XV+g)s%$ zT6C4uwH{>rrZS{^$pP$((!LVn5JE9T=e;sYnic#`ms^JI6p&_Wk8>mYXb9O!v*thA z_sZz^h_lCqjv)rKu?x17LYGhJe!zDOEUQ%`3&GB!%=drRq}q?u#O(KGALNVEO`5$S@rtS66O#_O zIV&bin%#s=m69{&G{fw6!juqJ2JO3L{s87IA=1o_!h;HRiXSZ{lLF=U@!7FyaB$xa zmt*8e+GkA`sx(r(*M=FxHC9+=C&}>j>{On^iqm>>@rz>orG-SQ`wW#yB7DrX@lE*KQ(N)tBkS<?Z8#(h_^SJ%Iha9>AvVbZ%r@Ts^fOS5vm*rPO`6 zcyd36^zDIsM4U?&E`nmY<8b>;CDHiyTDYln8A16nWwC7GVqD_2%^Zj_h><@KILM4N zs~?z8aU3Z4Wdiqw7}t-?m)u0w|Zo7 z57n#3+_-RKAu2D!@`npoCKvxVq*=)mgY$!vgS>sN({h%dw_mLWv@Eqy0na^i-4mdmQ$Z4C1sGt$lT4+eIv2(7a+cRk-mI#E)JpFOS;w9DiP3 zvVL)ypiSP-PMU+b3817o3_B;Xkgn{hB7@)g2>Ylan(NdqwC|l)((FR@VqZ2@6GQ4(z9!?5{O`c=!2nv1-iQT$Ww7<_@(~FSQTqq|j*X}s zu}QN|OmKO=L2>hg>BT)JF-O-2q*-I#h4_cBW3oB*->q>cBzZ{*3X)sMG9_#;k?`BW{K6v7 zs3>vffU|zGvb3Bnw@qgxNKe3pJVsv>McOri7W5&0CdCNONi$Qc`%fnG7Pq zpF&B7i1W!rA}1DIBJtQ6Y+k<&?OJsr0#{OImMiDajo}UJ;L@rw z_$GA?zCO7K-<@CrIvrmgnTBsqOv7*K^YD*zi}7EVSKwb4mg9e2SdIUEX+8dRWh;Jv zVn6=%^fCO$W5@8HkDbOpE+5DDseAD5fo1sR@*d!&llbM*1|rPM@nhyPe0(?oS9c7< z_s6E-pBanrkIY5*TlymWC4C{aGatXE&B5=P^YEAR3-Qav#rTQH@-G(_;720RKU`dj zZ!Rpw$4PVW%Fa=Ejp})C??`-cD1k`tG<-#5`70*H`$pr7J)?*qkH&|)M-fTq&(4h| z;yfCk?;V5BcM@r){c8`7!^g56kFQ9sH`ew=#>ji{!ops7W%&@iw0tODTQeN5tQd;N zr}f0K?hSE(XPG-afW4jC5NU3UU0qsXZ`bD7*S!gL_H2d+hjql2Wn=OD-UWF2@Jc+9 zya&e*Y{lL8)I=spk?4t`IY@SWOr(Z2(v#S!uls-Y%_@gc&3@QOsHdkzm&m0vCA)j z`GlEZ%Q3?3t9&+L5fh}dDEyZVJ)*%rsYH|Q%XP8-ZSAWLkmmnBsGJ~jJf;je!SgTP zcVi&*^#>nxoaB`FVbHZai0jvLl&}kNrgFTFJ{UW>&*W=XzxQ>3Fli=x@!DqR=1gT% zIsYBf9ER$W$)Nj$)SYQgWS{UzvwQ){?Cp_e`2rM=V-9%MF{gMlxN_(i8$f+pV zyktjp==Gbf*DvRg{@B+zCMZ4(7B6nhw35aO`f3OQ7 ztN;U(0;@knnn}^giIl`W(#&_sMw%JnZ+sHi3HNelhbkf_$g$iX5h?`<2bT=5032Fjzc>);N-ro*t2>$=8hSI zJ}q0KPU*5JMr65ao?IAw_pP|FJOMu?Z^S>7R^XStoPw+S6Y$Z&srdN81blIDGQK%H 
zS!RsCArk%7;iyhd9^PMWEm;^txFDEvp@U0`7y^ukf>}RWrX6xfg4-dAhF2ha*7jf zC+x>kr&q7k8Ft^|>-yz1K;cL5`t1d0-`kuJ!6= zm|hgzZVr8?J>t_2ufHF0B5Xj^7YWCfb)9B=36l z=_qlg|H&UEek68uO}{(KEyS7a!MdUT`TdQILphRWg87-myr)x_G~0`2Vta2tXQ2asvODRX~~<#4Si4Iypk}$e6F&=qL3IDvRciK(-*ihDfug zA4V|i83niL!er5H_L*k2y})b7#v=7)C(ls*ep&DOur1g&o=-aVV%T*iKik~x^Y!4z zl_64h0FEn?fn?@#oFn~yN9xcCASTWHREOGe;>qoU<>#oPc!UJf9~+vdr8$Z=tp>&oI(V z*Lb8^f`p&grH=Fl!k=+Oj&o7{r-HH)HI*<2WXUsar#+yigzn};7x5pg{>AHNa- z{p-F7_>aRg@Q=fD2s82bL(}m4!36y4=oI|x$YlKU&?Nl+;CL!G5x?!8fZuma!2j7Z z6F9N}|0i)d{&)IH{O$BS{N?m4{4HrV{+2cmKNG%8orllU7vjt0+4w$VK7Ky86u+Kd zN!yk9p2+g|M4Z1(TS^3(khTopq_4pDSsUTgT#qEfa8c z{RAWrX^lOd8xcuvMWmU~r6rN(=GfP*Id*q%gB{&kA!X74e4e}muN+x{Cl4)0^0xU{ zJ!=d`_UVjzHL9Xwu@boZ*4t69P90ROPytm4Jli~Uz(8ESc!_M9EcQH`o+kF^0Znhj z$UnWO7nY!8pZt@5%*G|4vEas&#atMCj*~`^aD#+FZSer=_;9RxW7bV|%6BQ9dUfeW zK%8AUt~+{*DxY-GNSDU{dK@G7~{IAT-O5+`Z<4wljSCz+F&gC zCz7j34kgP5&Iw%32D0ojhQT`JBu?^>{sU-^;epd#YbgVIj-;6pwL2f9eB_vTX4 zlV%3bGM_nl6p8Y!XF7j^XPCpJ+29^GeTLgr=pgLq%mv#q0{14)2lZRN3G3IV(0m>E zz8eGb)FsC88wUHo$>fb+eB31f3X;L~3u=c4?U$D%j0qv&x*Zus&g+n7ffJiT zn#raNCY-MQeVJmxD`P$;FR+VGcre@YQwH)47cOaz#t*}dF&DAq(lOSx4VC5kBapmD zvRZt)Hi*Wnl4g#Tz&HO0;hAQBAu@O=GFTF!xKYxqG6nuDpYrr{V6mw2$VepJ@`nY& z9~TJEa!24@AH-W=dq1!HgxNV4f!1N%$KKzG_@r+igj6t#(xSI^fYu$b}@sZZ@IBav+FP(EscNN}PFnT1YboNV@wi zQY%lgic(Bbl=%E_W)i}(nA6-CX;#pAN^U&U%rEirEM$6W3NBr|fFnnapm)!nxT#ce zlqiq~rQ+kz=YhMhf9YgoZkdNychAMutqC|W=su*4Y>L-c4aMiX67a)=)9@RSEJZ{?c^N%D`gS>m9_-`Bm(^RjJ5b{)_VMYegpn~X)}Jm zybZsc+kzj@ZN`u1x8VD8Tk$R7o3q>SMb>tFes&i=$k>ZFQupAI18cE#R8LIq+ZL1h zwZr({tud-gGbHqEg-KmoV04>C7~b|i^nTz@bh+mibf{MiE$UQ8gDPck2cdeILa0_e z7jDiIhkHuo!NpZXlBrLh@JoP1p05!;+dBatQor8bGy=~r?Tz!3S|WAieaINo7|Ej= z;^{>_@&5MlcyH?jJT|j84)$t{z1^E(rx4|q*iCZm>C&9Ya|`V2)dmL!bwc{|p?G}l z3_P=a5uV(=6n{Fp1%EoTA6a|1p;xo}am!8R(7H(z+*7X}s#UIxa;3|lPOaKFbl@N! 
ze&`A^(=*7Ar?|9%!=QCg%u?N9KMrEb zd{+F{tIJOe^1rraORgmGd-;ID*M)rF>WGjfslyvEErGg;G}y?_ z{s<;>6e_3Y9e*Hpa*8XHl4WY&%Q1R`Zp4mtN9TRJIr}0rpXUTl^3bsWnqSyX0byn^ z5um}8bDUgjG6k}k9uzDCe~BO{qk}a20NIdyDomP>;mipp&Buf=D`__3Om*|u2RxV$ zxPA~zCeuSqm@OU1tP%B{p5Ayl^sOVrJq56WnbbnkU>Fp(s9n4HglLG zWSJ6e$-nuRkU7Qz!b~yYfxnomq;)lm>YD4Ozy8JPbz%jr#k4l=)kUdRbAzrhu@LT94q$fU$+96x_||``i}mNv+*3Iw&KEM$arxX?Y}vdCcinX-a_5di zfxK}y*eD2AfNi=lYYq9|U3wk3+Ac&QR7Tc$M1mo1M98ej zjLn^I#>Nhn@xrPBbmK7=FVPK0@~GC>)1x7Fc5i}RJz8KVk>y>4{k@1d_iBQ@eOhDR zfKEuBG!z$SkH?eCr{m9C7vrUU>+#5rwb(N|0o6(rMTtU%(YQea)VuXIBFUV6Qd5DflPSCudq}j89kO9xGoVUDd zmEdT7VZzMdV>yv#8c6vU0lh&_NvX|jbA7Jv_?*7HXyqCD_*n@>&ZL>z^NBOtoBH_w zfHVhrpY}aF=zz<!5yAKB}T#)2CUHw;H69lxkvIvynf;wvJPUdL_aG}FwanW2x8 zsV?#tMp!9FsCHCx~wur5p0Axqb0pbVYe4sazB!`TvyrpB* zhUFm3Ls8OfuzGTWE1UGwAXkO6sY734P+um6Ga7?)I;F4v7d*1h3L!`a{G^UY^d_!vQhy;B% zJt{O;lOK3?;IYi=5A{J}hw_ovAM3mMzu2B2q&bNMU?dX}{3Ft=!4yWgGdBPe*d@#i zrvqXf3I52n1T_RD7E*{NPAbm>7dwdzJQ}nT5Y$=whk$wCzQheHY#&X0Wg;*cM6Nj8ls6B`WIm`+F@wxX6W6bDLOW7gmz8u#{*3oqd|i^ zQTML8xVv#fG;Y-rjoY+C-KMQj<<9$1oNy?b;HDdOv{UgWKTbux>~hI|QkthazL-U}O>b zKR32Nk??*vHE{^eFPnpnlg6P`Ub0Jp0;qgbc~rZ(DymefjG9%eVbl5zc>J+Pk&%{4 zNRe--rl&AzPA0<4cjV-&4q@gORy8Rw$mb-0H7;U;$_0cf@L%ycip!ixbMT_A!SP#y zUkAkw!xDrIl5*Z_+fV+~4nvMR*DFb)!G>pMXrQr~YDSoFhT$enhQ(e~M)#Nxe?W~O zhddQMW{|qv|9Q(~z?)I?;rD^$^zx489;a@{Ryrut|D^jcX|_I?{94|J00lw%zSTvz z@tJ0}fz34Q;HTu=9S8&4D`5?jW*&HL&1RH1@LJ~Rn&yBwldK+L=I`Cwd4~Cb5ds$~ zgUJ&?)TRWUdCj78lKVZ#z)LzZgszF!Nw64j5v$yF;rLxJei5OMkO1}-_9eTH+cMJ3 zFv5J6x85=TMMV!G`P?==pVA*Xp)xT@v-%RnD#fH5j}ntsXDmqn)ECA79DmdoNJ%!j zITKJHxs51k&i>;-d>zT^$iT?%C$7F&4(*GOW*Xa-EE{2VNHamd-V!Cw+2!CF%4l4% zuj@@hfZ*HAOqf05L}lgkdx;KdPCSa_Ge@c4InOk6+u3s<{V(UJf8;L~7e<&z z+Jjd|_V##9vl3@N$B7O!hnXH3nxko)u^hqj`!Va(NwLG@H9sNdxZ-v#&$_wxG$AgT z3hPh)lHxqLWR=;xbxzh1M>-V16dgcGrX!eNR_pPF&jvon9SoA8&LpscRbg&oV#riI zmE(H-!G{CF3&tp1r!l#tuO7H9`%DS4)FjKT#9%CGMgESMh!3g<)7l7;a=%cf{W1ohThH=|^c{D>!l zoIihl6e?H*C8%8KJVcla4Qf?Ity;IB&OJnsJM}`l$_=};o3oXo_u#EY1l@hBFbe;R8Z{xh~c`5HDo@)DM% 
zKZcoylQD7EK@8ir9|N`=K)3Y=&}P|gG@P;mH98DH+3I(oTA8wPqp)*)4}5uij(mph zvwair!PcR8b8TO|xuHKk*fkO#?jDT~wvWJv)aTdN^v6^4JL8qr1Mm)!=6BW%#TVPB z;QjRpbd%5+J3BVSfgY`Js()u>4egB!!~5Yfk>tyxhTx&m!*ONw2s})9Wb`mx9x)J? zh7ZEop#zXPY#_2`PDhI>RpeXN`Qme-K%s)DShhSWl`M-Ejqj% z@KTawSx#1R8qOxAX$;BWMuC`2iJ1H+iTsHZ9={2kNrCD6x*#BwOeB4OrE}ufk%7tv zD~83W8%qj}FY|Y;lan+LX8dU0pvl7O)R|@_%M5ubmjRaq?yUY^H%^(64Nm?nq`KP1LS{#s(q){No>CA|Fdm1H}mWZWi^4 zpU!0QKHrRz?n3@R*Scd_6tUJb@-?x>m~=v*7cD2bpWBQjBgYn<5BVYg?}4u|aCv*g zot4Nxyg>TkHoWmg%6r#X{WcCW_VdjQSn#v-~b}83XB#p!v@btY_9W%?$E!YRf0yvLsaDlVk^h>``#}+%~7U@8-d!A^~6xUxprWf-?1Os4Vvq7 zv0yJmX9FMid|K)=pYr^R;hv8L;iwWLd|m8*DX1=%pZ0l-)g_VIAF0Rdi^h^7nQmfJ z7*s|l5=h^q4_Tf=`i+xc7@~_^Xo3ikK~8Vqx6_0X?x4cGhRf>Atg8U+9Cwzkx1`{~ z4>$n_067xv``?VEPKQY|?^E5Lf!Xk)q)0u{J8T!Oa#>C#FN!E>mVS}4C85cShI+@D z$_b0)OFn*#2om1`p2CE2W6_{~1C%UQ0(o=i34X>pj`s5wM82ZMQMz(vR3w60wa%@$ zrQV&W*RT=pY4QN>YtjPuHgAEu?{A9wci)G*>fVW4s@K9zM9_*9E{ZZGOQL$En{Zp@ za=7EB;<&R!9yBhI8$<7?grv#6aP7eb_{-sW_-%Ire%U?&pRXN-=VtXo;*hqOS-%=O zlqrS=h4bQ$QiV~gLK&1RT^uFLltjr&l~B3L|mgnjG6R%@t))N?U<`RY^UcrcrXEFNRpD{Z5DfHiP2(?>u zMX}=CkC@pQgEJM_)Jd6+2jmL78DOD6bH%V|3qNO+^IR9zs|hzC%?gJAy7f|944%GsuC+OVC zV>Ea%VLnXpc#J^T$rB!G(*+;*1`4ZR5wzzbR{sLf$G;6C~`xb2$lTm(5$|98u~&_}hr&EAIDp5YQA)1wV2vDyttv*ot{QPRxt z^*{&{W=5Da%R1O>q?y|E!Q>@{eB0U%v%S!xq?uxb;=#4OUC)EIy{{4; z$fNbD4PuQkOqzoiH$B<>HnP`a@#XuU2b;~**i~>HcAb3=%O77nSX)*un5E|NSalan z=1_kf|8_~U{DNPUG^X^U14p64%{5V|a1oR&S{Nk@=0}+V`B5%!JW5l0RddH-bfelh zJ+&WR-aH-e?^}dFZJdsE16!kQjbf-)G%qUMR0(xz-Hv-3-;Wj@JE7x{;pjGH2Kueo zfC)#EFz4dqnE&XXFz2y9Vfw?*V(LTBV)`S`Va^jT5?;b=+RlCaB`kRA6)b-C4J>`` zZ7hG`DpvhjvGULFV%hUovG`B#V9~R0W5JVeV&0>#V%{TE{_)qa^69s+`h|}$@2Ph& zH0>#LIeHOYk6%IG(~n_7*7I0&{yB7>vJeGts*GH@;xMCsD|~)t9^TkI1aA>|M)|Yd zqj8N$^hdi#;e(w_nnw{~9*(P9N8ptmqwp#ne{1(hyuW88KHf71U+kNVr)G3VM*kLg zYIq;KFn9o7?B5^H4egDmM)tt7qkG}Gv3>CTxPEwHJb|{)jOs<`iDyRk#G^yH;L@Nj zNE_M%Q%S!qZ!UrQ6$<0_QbkasP=1sq@?5!CQA`{$7LQ+k5>H%w43~*8UrNoug|sYW zreq*3nPM!3Vl0LHi+t!D`QHT^BrcsG;!OLoNHYhv__)NBB21bY!T&aEtIIMB^uK_U 
zjF)J*9`bKPm^3>v;tqryOG1_n7F(R;DTbI#`K0*>#UsxKkxW9Gsa-yzwrjR?5#&<+ zUtVN(u3>0hY?JKu3$p1>;p>X(jMb(tF;gNF%JV*gTy1UndLRUZnc5RXNHdiQ);p_@ z7^8)?=@(&Qg>U3QS}$ro~(X_oe>P1oQ4 z2qtg%IQQ4h6WlkqeBf71n(fJxk>pg$e*vgG`v>cvWMi0a82rB*r<`--371EjLw)pS zpp{_rHFUl2d%X2xH`Et}Z+9;z24vZwq*bB%RpLxJ%_FZj0?mDaO@#1Eym7mnm&v|F zmQzj;k&`7Eblhp*l{tIe;XYjttW!UqQT?vYU5NhK{*jo<2@-2E21UTfBYthZ6${a_ zQr^Zg1NnTczA0(uNhxn)ipu3on#o5AE@@^Y5|}*uq*=*u?BwYaX^&Wk8i~=l0IV1; z!}SE6_B+BI%xQK=z(M#wK$_VRZGg!EfpV!Hk~Pb#_m4<(O3G=RN+zPrvkTPLrL+|E z>f90e3*<$7p#r#%_NVUHhBcY#*z)kh*zxd_*m>ni?0)PSY`gL}wm$R-Heb4eZM5Bf ziHPy#$Fcw6r?HPn*S;$cWB;R%Vc(0uWA@%tW@Wk7n;ZW*%+}o-n?rhc; z_jl-pdph(*^@i^2{o7KnAF+H(y-*Rj`vK|R*W@F;IMVLjTesW3{ zmR))byPtjzM_+seTS=a2DHkv{CWzrQBqyZ%X>(n33@$W)o(mKZ%)H&tb~h z7cesMG4wff4n6m0p#Ooh7<=Rr=A5{QZc`VaV1>$Z<1wLYW4wDH0k3Twh!2Pyf3$_ zbi^yed*HR9J@Lw@Zg_coH@rTnC*GXg8*eZYdf|1#8wq{rcptnru@_#Q&p$0rQLgQEwd`z_T`fn=>&tPJ{f=#BlG_uvZM_&oK{L%5WAR!DPJ zdOGwjKC{ zlI82=d_|C}tu0><1k;g^<1~mfVJ180M4Fidu%9VmQD|yvZ(Vs4Q4{n%jr%RXIC`f&X$I~maWO?Lvei}&c>#x=8% zje!>Tye`_GVYd1sax(NsTmE~bIUvr|zALAXps!|oJ7+cu%6hQ=82qxVkHD`i&j=AE zU$GZgU6`K;xGn0752{x_hspj(xyAfT{EgSzpW<&t6F5InKRF+{`NsTCzjouv9zp2I z6UR0oM3|{hreprlfIrjhL7$)*l)%~qfjdLakhSJ)3s27hR7VQd3C}ubmzm3kj+y=$ z?r{g%uJK^D{hvm*7!jY&9_O*HWkkyb2+#z$VkP`jq6akVp-%Zloze~ z_Qck+M3x`9f@MUU7GJuIRp%eVx(g3ur1+K-hHoQEa(HB>LiG*hz$W z*Coc|geS23@}G!Ezlzt@Jn=(G6<`fNIe{#%Y=@Q#xhx%)K6?m2_8yH8>4 z&XX9w^9084IEFD>4+}P_=-GzZmx1s;6RcJMEEUGqWggk|d$S)v{YkfD~**g*M?i_{B_f5iA`}kJ`C*afV zqw(R^QFwowVKhG4HWr_39D`3cjK;Msqw(3UvGVJJ@9&?8&mNqLOXFJNsqyXc;`q*Z zE1?IWE1@&qp4`R9TM4?oI<*JhnbKYGc0xB?ozNK{Q2E#B+ULe~#@#L5x|FK`LOiU&pO4h+LAj^i> z3B!*$2eHZpWP|_LGFILnltR#0vRMa-CnuI<;FZrSDKxey4yc@xW*!iXG!wk#89GpT zbvgrvKX7n8G2phX@u_j@_B#rc0om%0$f?kqA03}bPP#r!n6+Ia%<=^YB9cOy#U5lk zFP;=Ev*iH>_c!n}D`&b3u4B-7#|M316q|Ek?wrv&WwxTe%oz z3{E-GJ^7p4rvMz!oNL&w9A`?Hqomp7RnklZnPMu*K_b=9zmCa>Nt)#j)*;PIm{VlX zH^R(8OO07jc)_n*0~>?u3oB8f9Su6|bKdR`vb7&BssxV zu-=ha8(vb+fi%l!^C)Tap#M;28m5dJg#vkVp~h{uVEu!8F?rQ$^qM&jou@BA$Eows 
zapH7zm@pL`CrwAE$urP_w(aQu4&$ex{pcy^JZ37ojHdsGO+@>_6VY*S0^0Q-hqeR8 z;hwGoQMq}0w45>*`(FP5dq4gL3;ui+2~WL&geTv^;It=k@2b7Hf8}nBI(-f^&pnKU zj4Vvd$i%qJEKDG>KI8EhG3Du(G5O_JFzbaEF*f}ix^3KrX0w;#9;&y&lsUL-%4{^8 zG7pU=%tEtqGtqq1EHoWD4fm5Q4f~JB-91L2ZpQ(*wN($)Zqfm@8n(tQ_qIg!dmccQ zJMTxuI~t+mFqP|)%s1POwGF=K#c~?QRD7rsCL)=xcT;bQ2Dm{s95Jt zRIYOuDpk1^MN3seu7ZV;D|aqgJ3P64Q+%1c249_CiXReJ;+vz3@bR7*xVmjJ-X^@U zbrN1&HyY2b9Ev9v^~dE|{gIK-6Q@RZ#O|T3u(EGs%lYDd9alW9AJ|0iNSr2`S_cCs%-o5=O+6T9KLv0d=! z_#U`4cO)jYY=G*8N$xzk5SKp=HS5+uV(J+@_SmE3GrDdulM-VF`y~ycytL;Gk!JQO z@p0N3Y0k(gu@EedVx40IL_=Z6?K&HjTu5T^VoA0%7}{4mIF6Jw2gF%pixWI2Vvav5 zPXkvPljbu_oP(KWT|a1`vhKjab#PrVptA?z7-d*LqOg8)nP_Y?ELRxZbCwVMfy;pn zWYIcpP;BV3^oS96r z^OlG&L%&Evl=gGa(z%_m$lt2yXwQ$ zi{*>hq}gUvP3PIc?@yRtDX3qvG{c)^HqtD)$OxSxGir01A~nL+ZpacJ-8Y3ppYM>sr2mC9hZC#ckP|eP6!g<1kr`t$sJI>|o?%`yYEM-3bj!6mWG1LwU`*zLzP?a!p(IXplaRwP`zG5)V#AXZmHW4 zweKL@-VnFl-Wa#v-V}G-(G+!VBhq=xohVAzc3H3zd*1yVOWybdLmzz}Bc6E$gCBbi z37MBMX46j8>oXXA<}AU2JqK{)+$EfQ@@ZWB)1Q#`)KfV1>~q-p_*2;U@S|9B;Q|(& zPR6hmo6&a6bllrxFm7w#6}8*8!_5ygL#1X-QU3m>sMNRxs?=|WvbFC`3`&hCkKW_82W8AO1YI8P_yJWcV|v<|fGfOn=7 zVJ7gg*JpIZt21d!|G!0q`t5}FcqgF)UZ2nwFN|-G$0v2g#hC-KuHOS_RH-Nm#K)mb z*`mluPsH=jK7-7ZH1e%8$f87fmXc!@lN<6SejKa59SokD2*|IUSaSp1;1c6Mf?X5D zpn>9>e4XRZ;z{C+WC`EIQbOl}*Fb}kPntOfId*uUIDL%dW^mk6pCnePeP=D9J8<|f z(b2U*%zFE7Y?7?O|G@wsYaolpxIqV@0M{$;&ZfG=&$iKP-D8IAx(*{&S*wqsq#$R~ z?COm3l0KPp{E6ySo2m_zFsBMKyq}uRmfsxNtPO_$D}cwTO;0a@9RpG$gOm_y)uH}S zPHg?S@Y{C$+Yt8Sz?WmeZaO87oa1iaT=?fXugk!zePFm`O@1Vc`XMvKoZ1Hfd(4?QDFLUjg*$_SF~r6}fAn1@<`lF~ z<%1v5p=-Td;r1mAI!`*5IOM!e@p7Z2Injgvbq+7KIWJ0{lKQk>S6+tMD2nTyGr09g zAIOh*Yd*%yo5FoGcy@I`$AkKKpT`u+DWp%kF66(=o02Ivc~6UCJ~8q^=Oben!>%zI z_~tEp9~o4ha+{59LYN6wk1HQ<`A6}5vP^XfQRZW`Ww8InxdXEcR)Bo_&kIO~o+HK3 z#pWbAL6j@WI9#vQ5mGt{6*#fh&7@gATSW*FXDah=k>-dZqX>+xbx2(zBkRQm6lYK& zdHK$UXTZ-=KhBd;FQ%oSWs}B20DJW9jrl89pyus&qRP#;;okZ!ap!G~aA&;+s9*nX zG`PDV?rP8wcQ$B*yY9K4wvFjn6Wn`GQ#8E0DH;*(uHS@?HAVG1>f^3_{b!qeaWh)=%w0kh^T!?daMux-N*y!PD7xcbs7NIrTL 
z3uev6;K4)CwRb;s=-M6Kx^%()jhdhqk@1Q|>TfDrjZgu1+**g^yB&4zxD^#@)}BED5fx3}fZjXMj*qXzAlBa|eRBJfL) z{CPSqQ?)=o++MN>>X$2rB5`?8lKRlIdw(n*I~f<3t;8#fXX5IDVR&Z&k>)ww@XpNc zczZ@yyfvc>-kRRoLl?ZsBzgvs=9xsQY5V%jPIzq=m7%gcJN@pI4!AnC9p0VX23IGx z!8;Q=;n|Vx@yx{DNFU!H6PnOX5cO+$!X&);-1Epv<K9~>Ya>ljWFyTIJ2WsDX{ItRX=W&CW>DMSz+t30h(r3H zu9fyV7x>4O$OO>DoPEF#$G3~1t}y)b1d)m15kntdnd{YaqhMXfI?mwwU2+_YG@Bg0 zo(Phc96yr5!5=8XggNtc8kNy+G)H|v<g6TwU$hM^GNcUzNv#lYU z-W)$OlIM;yGS+zh?m}jy9RGLWe_4Y3S`o}Dxw>#)stti-g5?ky!-UyhX7x$4=vxuQ z6JO6d2(x{Dqv70aHA{;U!YrtDJ+&D!=S-ag!y+!GY^8i)%so1 ztTWC|a-p)JL`Y7b6KR&DPX#G(!BP_Ic7l)1SYs@JKDTWa5dx_8ti;(Q10s$UFG>-^z&cv&wu?3@4x*X7EGCi7+=Yus7}y708sCLcomyc=pB`8~WC%8nACEbM z2H?&LaWfwGdX5LtVEgr4-n`6Pi19xnFwH$7}32art}+( zg+qs7-KYWBGlul=7uNJejP3lI2$IY z&oVoMh6j;{?HaPRGt-Vo=6u>Qs( z%{ETRSP|*35AlDtEwxL1FanwC6D{>u9)o0j^GLHBKU|O6RO?VsznMfye;vXsKB%zA z??&3>=dYbQqfnw8n=}WYx?QjwuJ4dd!bDl&&h*BD`D`E$?Nj~N%P)SP3I30VuuoBa z^8H)VZ6XoJMDp>(lP74Oe@KUM0%z#h8QC(9;|!7FMB)R9M3fark$C1P)f0m>ldlFo z${=}9k{=N=Ca=u+SKT#hIc08O*6rt;=U-A+&_tA?(nPp}l;{Ul{o=<5X$+mp6 z==muBb_XwM65d4qBQIHUK46GXX&bKo{yoxcaPk1b>Ii&{+Tc9nwH36f{7ol^6=BlM zb!a{EL1kox8ewMe|7xi4Cy!3zTxv4zX;5DXVdp-* zuwwH@H0#(NWvW&}#hO)7xms1+QnLnXRIiGPl`5e`*|NB)N>x;?b33Ztb_Z$@iLP~L zebl(EE=pIZjIyYQ`|H+5pHA)2r$c*GELxeXd%f_c%(Tu?{m3)1yH2KO(qlXV--_9L4w0$QIZ{CiR>o(xnvNAwZ3fseYo;5vJY}MPMU3|*(J>iBh5;fnK&zH zR{e(c%=T2f+pjxbpER@HT-&g1J%4b(HzuzC_#kBfLBr|5+0W?dxl5)IzB<1VN#E?Bp0Ti<_p-;Eh1%nG+$c9`5IQ|#QL z;67NN+1|FiOJ|CeILnVlkbetd7P3ran18mN%ABS$ienCr;tY{x!I>jClX!$k4zFc) z{EA}T{4*fT4AL8c3A2I;H0LSu57LoOnmzE6r)Yn#2erL{uhaeqvPFoq7gKJ`i=Msf zeBH$W*Kan_ev%9x-1=g{u470(^y1Hr!C1$5Le+d!p}GrvnDe^Hq4I@%g~_T%c%x%g zS~A&8Kz;c`(#-xvWyrrYP68XL-gQ6B;TUwiBa6PHpm8j5=jp=62^%Y1#PKha=}$;2 ze1mw9k9CvNGUQEnd4YjvnLN;ObBI(}mArKjS)`VE8- zwrkq~t5>f@hmIXksZx2=s&O;w)ToNu)vKXa^%^K!stl3IcoZ#O8nte}6Ls%th&%4S zPqr1S*F=++Z7_b$a*SQM6O#^RV079u=yTx(bh`8$x?O%6LoPjmNtstLZtrP~Uv~hT zPo(1AZ@?Iykgx6+V6I z1w4A;G+ue(A$=pXp(QIxSbnllxJf4#(#c1S7~cpu4}_Ak2gv~x 
z+wS^2A4?9Pb8u#Oo(Rsy9TMdGO@q&-*e6We3+|vikP%61EF#Gkh%j`B-rClaXfkybsQ~ zW+T$sjgsb!Q`t$gpSQjGET;=;CNjeVyA1H|nr4CymVs>;YG3j}$<3-?nQ8Xw^y}s= z$?Fcz2G{QcQ2im|Og_cSki0QK;ORj0;2!r7CC$D)xZmUxY8#J8Qk>e4aHO43`(KYV z%O}#LKHi!>v~I0m+loTIO)TIzQle&kU>W^cF`qP$4)`4-dC!P-!n%oq2j?8} z9YPl6qD+C3X7LsBJJmbeH9)k#*GIq^_`XTYmvAlfD zePBCDEU51YC`U72OJP)25oZW&6JN#`Nr%%JOi-&cRv*>F1N zgWF;KN8#FoL3Q}&NFI{Sosi{mjOF!poA!#+nu5d zOMtrX)x+0%$_#hHK%MjhT#xxqAmH}69tJC&0P9`wa= zJcamevJ9+LhAEkkscgK@hL<-l$&eXja9(e7mPm8c`*lroN|`W!~2Jw$jE6Ve{Tw4^H-vVJdyF4}-GOE+L|=4HI|?a#RO!_WBZZ@=Qn zv+0;MxG#3hNx+^ZQ?X?7K(uLc3q}p@Ohox93Z8xV$1fk_r_Wx(qZ#`#qGxj)-ns%w z2lkr!S&5zP0E24Pmnix8B zIhLMG!BWz9jeGA!i2?=Dpj=s0raG$T$&H@()<^mHT)6-4df2;d9S*ErhVApFVArH! zI5uhklE(H&(&#=oFm3>9l_`eWwQt3Swd-(V>qb1ZWhGu+zW`U4CE)7P;Y5T7;hhEj z@%9{^Y33Q`PC}AjBRoH~4W6Fd3QtUGg=eRA#Ge=Rz)OpJ;Oe5@_;67_yiJ7rjWy%( z^5$9i)8=`&xN-(E*3ZH0VQo>lL>vlJALGd%Dp#q1jI(LD^4Oy|#e{$diz!2p7YyM<7myG}5R*Y-XW2==3~Ezj zoAwWmCeF66{iUrJAErCI!4tn0d-0ZJ5%4)G>s^yw z&jgdjmxt=4t@VTZPy3uFcxjiscu#e^H=7EokM+pyTKl}EHVi?$DEJYy zdvoc-v!ReEvkk$>kU`@i0LP2(>*Tlo=kJ&_(+!zE#ZcdJ*Z7_Z5#|7Ah*0y=3|`AD zTW^-xmeMd;Q(t61WgBYSx>jN9o!lBoE~oxhbRL1ph{zWE9(woj5+2Bqw>0Nlt>?2yq{% zoZa|~uaWLj1!Q9n`~xrs)@^t_IopRo%Qy{InQN*mN3-_ZFvp7w=kO55!7 zWP6^;r)@fo;c5JfhSC?-HPD+~TB7+8&LI(;G zMwG&x*JS-i;Ch&F`vasasDr5Ug%U{wb`2{kAk3^7Ql6in3}iG`mdrwFpoZ`Vq?xxt z(0g);^K<*uCo(V_{2YPrye?#Apk3QGLYnWms~(oDSdPx!yP-V2{(W`eiOiLJz-@bhxKYjc>-g!C+Gl#Um`k6!U;h!Hv%X=E& zj_P&Ls6jK_->@axwdssoYu81IqNPzFZxIy8R}}g35K)dVgxo}$bIXi#F(SXk5udvl z5#d5ah>0-g<`)NvDCcIvtm~P1IR?)xGkK27og2A{1an`D6e@xeB`V?Ox=pb3KqA(q zT)_N2529R+>L^vX5bi8{6RPCNL&P}_JsQ+S-BKk{wRBM;`yH{ccXwtX#UpxSBF7A&v z=k>&!v$_yE;-zV=@pM9SJUz7~t|YX;#R;u&c~U!ENNA16rnkqRsobk`y5sGY!|~F( z1U#~C29lOf!;xh(uy6TH>|HkpLwdGGiG1-Wl8=|I#G!VrTVz=V|G){q0WJxc1C(!8 zIgyedrFl1CLBj9|H79ciu@Z@pW=hm9bb!@_WJx@94MF0E%Mth(-_^T$KnJ=Y27~yb znBeo?(DBJWb#WF)|Oyt8~isI!9USn$48AOlVt|zw(*Y6Cv z-Z%znCYWqy8%97BgLp8g52&xm6BEA;k2t%eSwJ^vC&=Gu%gZ%RQyB&?)!_BavZnbY z5oOxS3xG_(`RO;shAzFJc71T08v8+6JK)3q|g>>-eu?MGYkAtu^# 
z9bIoE+$GLNo|QCPPUQ9Eq;t~Y8G+he*P3%Vx{l8CO}qaREaylnr{LVF^0J@XOuo+M zGil~Lr#>z>BqTSFZ2@5>2w_eZxbgePpf#$!30i>ULti6jd5I;&Ue z7Du7u*+7aVfK%&1@gop zFQL{=<*;((7#x~D2m5Et$BsG6ux-(1>{@piOPB1%u!&31XW~NiU`$wmu486k*{;KQ z{MEPd_9vecdHx2EKlu!f9zTt}hmPaW=~T4q+7o#Ta=Piaj)VI8O3lgC&K**20x$+_p5#)HfNytTHnMpIRb&lgNZ%CPZ$VX)g z<}Zo@#Verg;PF_WauFLcE@Hy6l_*@I1S%FOio42`MePCwlr% zZHC>0`{T^Ckw}|1h{$p;TpHI057YL-tf6RK`)1ryxdyhc-+|-Xb|G#120XT5F@#@S@M3~#+nTgHt?DW=naz-0mp4cFxB}4kCJA6%*N2vk!p&U zUVL+`Gh|jv1_dhTo#Q;^%PoVL5@$E}xzKi59wP{f5J8NQbi?3JYw=cPaC4{RU#EZa za*OqYWL1#-URl2RvV`vvsIV^>aSmpjGgxkKz+mJ=nuYLCzxb11@<}X$^-pGzy?zHI z(TGwQLYjT~X`uEk3vwnJjv~k4MdD@lr)>axW`<$J{7yaia?$K{(06F>)!N7 zzRC5mPaA;zx@)m;+gDc_`ofX6FDLW7H2P-bE(Th0Wf&q?v6g z%MZwoQCyESn{Tm={I&MJ&(V1yiwyP|o@r*{%J!_6d_tDx(6zC0w>`yR z+`vq;f)#V)q?xXzvX;!*5&Z1}gd+XJc+1SE)<53`gJZe>`gatzfW6#8~ zI52Sxc263I?bD}Y>)fT7Icg>b^_z(PBWI)Qu&L-iayoj9oq@K4Mx*bf=~%V@Fpd!M z-16XY%v`b#y+=&OeXV++QOnNg+^q*{)~boyZmW)UYv$t951z+={o`|d_1R09Ib{gC zc506Ce`(apcNJq&aUX#4+*}L0o>i zA)xd5h*0Mz5}iN3IP&H$D&S=tx$+dGvIUSkUm@f#KYPKyKrdxgE+HeD=uwXiD%c$!iy_L;l)LL@hp+$=ccvC zb5mO5nMn`emAM`9@}h23S1VkZ-VP7VB*HwS6CS1Q6Elf4Pj7+e=CsG#D+l6*RDZozDuD^}>EshQnM8N^ z{8)_r%p=AEZy@yl_aJMzBC*2%b1a03Qh-=VCl*pUAxrF&zq|3~opy2&DcQUjb>h{^@bU^+~hZjQh&++l`C#MRXq6L}zuWe)chMpkrd}j}sBXEO7k8 zh3ne}KAydgIYUXj`bEe_ta~C|ZhJN!<%EBWG<$I_^^z`xBr{p|NV5=UBF!#w*8C8T z7awBhKz8d>bZnFt)M$KEzl`k@G3)mbt31QEuh*`$?ZjmEIAPMvv(4VJAUVbaJA56J zI=yKpTTpp7R=GVP%ubtr|7BS!{ZB-e;l_gHEgx)Gw~wA*nvKl2*hWnLLbIl1AI;s& z9~j)$F#^B)!92#BEzRaD?C%PfG;@v)coV^~k$)}qJr-%^Um#?i1vyH|A(LcY3ZpkA zOq}@`=O-;MPva!FbB?#CiXK6w_O(sYg%2gnJk!kaZ%-XqA4ZllQz#$t4;Ce}zWx}& z%fE`yT7Jnd)MvJ>rwfiX{W3~29p`*Pdh~4XgRU{Z4D`l66DG})>%6{kev3<`!JDV~ zIcf5vVm5>(1FgsfE05RVgg}avN6i9np9Io@Q=RUQNi+S=&ggYG3UgM2-bJ`cN;{Mt z++hI3Ks&z6s`i&U(SXGtKko%@u-FxM&efnlc%E z`u9VHiWN|{Y*`d0!dE1J0hAyDSh`qABFOh(`oyVNwRk0l^cjd66)U4e!TgvuaUAvU z1nS&U6E~GAiC(RnVcCej*fh33){X6h6%z+w_J{%K-=-aUw(Lg4c`SMl9E0wIMxxu` z5$G^*DDsuN3HeHuMZ3O(FmKgH44*g~vzKndn(c?NW$!U;-nI+VXU;_JTW>-63MJ6J 
zM=Q*mKLG;GI~`FK-k>zT72{Hy44oR3@LyMiY6?l@IZ8@hFn3 z01ELppoxUH7%~zYE+%nmC9pqi~CVA zR~+WHdjO9SxxX@|4=xjNeRxU_`o9;R7~KWWP9KQLP3od%iPG4#ZU^@7cn~MHZNs^Z ztMT~inRsf^AUr<36P}vZ4$n+$i)RyB;jK;e&+cY;YLX_gX4hX!HB7mRQcav zi983SQvLE$FeP*SL=VN6bKb2}W_~F?WGMv2y3F)&a+M^?0}aU|g97!#xz6I2;l{KN z9`rdMxS)EJOzYO|w@a2?7y%a9IIao0J~)8Z5yZF*RE z)b$N7KCQ~Jx2N}QEg@vNlTfQd89OZ{+kEV<5U zHvst>m1h!bKQdwcb0O<^d8UcBI*7T@y12i#Ue;hX3UV>k$2z3;d}2bf>o`F&8&P&a zzG?pB`fDt3nQ;GNz|Z{#l{2sr9V30KUwVE^V4rr!OB*M4-I2{R#0NUT9) zAw-xNY*)8`o_)+l>{l6{T>_%V`xIwBX=ZS4=QgNao^2+3@X~=;e&dp6&dUaS_r*Ua z_N4z=*9O)L`zQ6E>+$oQXH);ilpwmMV~GS=s=-@2ucVo6p*j?vCo2e|q?tfv?S@%r zn~B)U@>pG_rTLNdL3&E2{F6f9t&rwaM`zcEOPX2tN`w_0ld{kK6Mc|vB<7OoIG3e< znC;CcL>CIn@4SR1oTph=sy}^dbcRVYRpi{gu+x~4U1Mg*1!M#YuqZ+bBt)I=f0{J2 zTrNyTGh6_wQCXOaA29^XA)^XM|VqH*Tu7$*sy*r5lJGBM3zR4 z8I67e`lH-UW9q%jdH2fByIX z0{;ygGzf(X6u?cTi=c1IhFCtT50(t;j46ZKU})$2(fOV`(Br;_7}BjLdiCgu?!5{9 z`k?#3ekef%m&toPk=qAacR>GvBhiPjY~402U$+%&*KNYaORQs?_NESKYu|~sd6()mMDp0g^Qp-o;)a;KR=2SndKMsisdVa zBK)~H+7`-_TbGu^=SBWJM5yx=M?AkAn7cF`wIMJr2hcLU4DNi|fVxLV&ot zyBiVW?ru0ilbm>Pw*WyxfCzDScMt6U-qq7H{XFL+aPRM3-}=s4`^-EuJ>At+)zy1? 
zdU{Se0q37}3a&c2fLnHsHxsp={tqtCpBGhcy#`-4sf|x-mc-|cisR!(h4D$fV)(Mw zQ<&MJEIK`Q2d_IB{dx|-b3;brt$~B^PVX-Gq*DWY+@dJHYW^e%^W*ru-h=ps1bJ=$ z>R3IXI=+4834Hww33b~d_^MrTeDiE+eABTczUxv9zxODK#XU=7S=W+S-lZ%S_N<1V z`qsmU+E1h4HD}_wGxFlX0=aQwjvUBEdGERNR=hhc1JfqcPf3;)q&0VWEPd%M`p}!i zSo*(4nw^ii_IFIu9JPV;Hj=af*KR|Eq&1yHgqXmfW*q^M)%h?VCxeqCZb!$EA-v8I z9rtw!;ocp{94k_wbWy!Sq8!5qMIvm!ULG%_%Oh#?y4I6sH%Brvbj`6yvpNPpy6QWm zW5MZ6<)+GL@O2}!R;XWOU$9X~{#H7Rwamj{ao5EtO|wZQ%@Jrs&7bi;p$-V z-EQ7e{ZQu}X?Dn#z8+p;U^hbKI~sEwDJWJ}*2v^=L^^4XjcJBgWfQ}u7(Re^3c>Q_ zu@u?jN`=N_UQ!AbltP$^P+=Zwcv;oQwzmI9)EN;xK3&0z`_m^+#B-xY;*?YLl3zBI zE?dS(^OHk7{OT9mr|I#m46eJE4(aTF<55QU3B zh2kW<*WGxX$&f2sE}VAK=_p#bBp&_GlW5SQ6?zUBgznvYqFc8f=+mb!9)9#uTz2^t zxb^niapzrk;U1Fs2kw6ePd`xzPd@r2F2CS%oPWl-D3B{3&d7Hn&N(r^jr?A6`YE{J zr2IJNgj_f=dp4YsGrKLXc1oVy$eklMa^*hB&2`Rq4sztX5IJ(6hXVO8!dWL@g8OfN z7`@td#7C3I;nEAw!zH&|hd%FQVDQW@(f7+w@!}U>p>U;YygnaJKJ8TGE|3dnoO2>B zH`1IPm-5D>ak0X<s1`|cgJSr!Q4xGnzbHPd^#s0aS_(spKa6V& zN zlIS1Xmt?x~__ar6%`@x4xLww@EarA7hhMwb!5j69=6F?6tInj zotTHkq^OhfQ2IC+28iQJ0L6e%KY}aM#iR5*G-8vU-d&@5x_}&#W)FWZB$cTk3HFbe zgAP7LUdhF>M3P0C!%w)nW%5$vuzg`t$9S#pl>O7NDS8yj&N&_aLpq_3Rc@$X2iAx6 z5{WegbqnE%OvdZZ2RzAl9~_8m75RxLNz#LxlS-XO{u7dBH$uc?Mo@U3a)&;e?yE6T zN%QRx-f#XEku6Lw)V*J3&Y*fRO}ehUk9dD~kpEL2Bh87Qq{zeig@oD27~#{T*DD5X z>-%a4*)fsJUXY4lly2&1nG^c~-Q*7kzB;S%= zolZhh=E~zS`krwoUXvwsPU%9~yuIvPT1<)b?sT8%-Hk4jFjL<`@=0?SuZ%B6`qXTK zIQH;5FSCPm#X1M@L{~QwMv^ml^EW}DzA`d+Y(k8zh_Z+B2;;0`hAm$xZfD?cwoCIH zwmGG7ahUxqW%KELycINcNA=efUVrnYX)K{1I`Di@$83`_xWnLTZ|+C%jWO)toFSbd zyfL!c_sh4q`8p2n<%9$-=n!-o!Gc9()Z@WBMv7Bw8#%UzuiWuQn!WRch~E-5C=YJE zgGBIU>x+Q;jJEctP0S$CeHG`Qf4&jcLPZLrZ258|zNh1ad?(;klIa_6x&cooDY~(-T91`$*PRWkT3b2gqIdBfidEKX;L{Z)}{QLBL$Y@vr zAJ;E~PwN%MrwxkWqlQKCMZG66qfv3ZR3mZW&oz?#@FusRlWDQ(1g4hiwnel@VTPjxKrRRt@0R>q1R6|lTV zIV|Z_iYQ64T?#8Zm&N?nW${yo8fg9Swe~shGje3ddAza6OL_l&`z@F{i2;P|YD$J% zhG#N^LSos@f!RU(36w@QcBCL33d&I|iaYnz&fCh+^_0y>GvkiU0cQD$e%**qIO?No zCA%M?QVbV4P8n?$s7wDiKh&@4<#n9E)zgz^TMmwKH61Z&=DxjoPkG!6i;_!raIy=& 
zv$hV)Vl%hVx&L2~X2IDpgpPUWx06|Z==>+JkuERGk?ihwRBi+(k7UaX zrT2M;e&CR_6MG~ck1`7^Gx9Mb%;tA44_i*nZoU$7=i8Mdd~7H5v9KR^Qx{U9K|#-1 zG7ZZ4x=Q}YMnY&Kk>)9Zef&B1GGMx$2=gD2J8mO8ImmywHUA@YjR$|H-fuWCJ+{nQqo3@ zoo(_2R+JSHJYn`QC!mlX;XF7OND}zt9*#6CVDP%QvJP=@qC>@jF*Nw!z&pMtu%UB@ zFw$(!5-5tW5uQL{9`5n7uJcad%1R%Bwh?ERJC&G1=bbWsEZ%r+3<>I0Mw;(?;C_@W zT?!Xnd@-)P;tD+Yz=OE$&f9Rx=_ljzD=x;Mfqijs-!2~FFgehTyZ|m zVV-A_gjRasUNk8D7;Ytzex*hceBQJIKB-d-pEM|jj~f-k=XIaNw+%{QdY$sP=gia5 zr+E#0I=Um~4sMFoBb#B}s0LUww3^LJ{-)kT__6V$*wCjMHukHAwS7p8dzHtsUKO#d zZzU}4Rf+pmu%=g4tm|G0YkKH+1uW`Q4h#E~!m^$vvASDXEbCYiKen!j@eNDjnL>Bt zt_#k@**UV|RO;ZwT-i{eObLAW-aF>+QzlO09X%T)eRMqAfIh7ql4f;Eo;3S_C@28? z{dC{;#7=k0e%~6o5VQ%`jvXu@F_2B9|2hJ~eM6EafhWw&&jL1O5YpOiERVHQ%A8iV zhkl3GB~biOJPBip?-+diP)FmC!N8mlEQb{G!#3*kkStz*9(L2tvQj!1;&{_DQVFx5 z-;&Y6kt*qdc^TPN=M;@>F!sbZ?iP=<)ks(xX(mJpM3%=AMw&$sB20Kga|;vVY;(B1 z>|SqC=c>o|A!)X?oTZQGe5#Jq;p!THw%PRWkQHfG47C^=!OIhp<`~qc(}DEljyb!@ zop>jN6P|P;`#~hcSxH3Q=!aYn#hK+9p&$l93iZns7_AL$$mQI`{w^VZ|zuh3!vC! za&g~$Ugbsz-s^m0Xv^s7vCUI8BICjILQIrBeGv)cmr5uB4m zL05s1d27yzHPkf2`w9u+@r1mFq?vLvzpzslZ0W36NzU-GlGC7<3#7PA4#DH$sIxD3 ziZw(MXc}H$;(PRwM~pOIeeE@Ps?gJT_~D1~?>q0rDJP$dd?(~Zwd$2IckXN>%SR9H z#bFZW!~1vQ(B7RmO0sv)#_m{C)*Q4qaccaYX_oHm#CvfL2H`_-`WgWy_Ys1?OLgd+xmtci(eA?tkb}+xM5Ujl`l)UdNA{N@`} zZPN}{-f%r`y6$QlxxV$Xb1|w@ee|qe%1E=0mA>m%bg5Jv_nndx!%IGZubNfDr?rdW zGfmNGKoVWMAik?t5}!7zfyd4{9eo>?!u+>8Vcm#2*gUK%)(oqFwZkjmkFF&!qroHi zsmWtl*Sj({_T@Q}<&}LZUDiar&woCJ9Vv8G2wLi-i5xG&MK0+#fxfK|k5675Aj ztKx&kMe$AVCYUk23*KnolyC&$wX6LBJj8)T8u50Cef9z zgKkE`I*D+(&X4jEVdr8&(%O9jb!;whyEvAB_|DaAmx(k}FVVSJ_DEl;aCMDIvk=AaIF6^0X4NB^Z#24!os6#4%0(VI1@D$5KBM!|Niu`12mFjrg{tZ34TgWS*fejt$Bfk-k(=4sRF9 zZT52Z3H{1QbMV_k*VtCEEYOw9NB+j+>9RxwE!3-<){*>nPeGUuuMgqc40S_aO^5t9 zMR&2@weQKIBhIPo$@4g6as!K5!}&VNG70n4u^EI$k?AK9pLAQso(X~@&z?vNnSF-& zrq$cG!Q{(|ihr~_>)@vE&_@Een8f-;q&aTGhD2xH$MnK!I%(E9Jq$(1PMuV2;a%MFR<{sr~*Q0*br7h3h$8wD0U3KF$Yi( z2|F%#Gi-u*s*=wKX`!q#KD8R z(WH4}oOI4~&nn?8=!B*W|bR>7*C<*}l7c`WPW 
zu!017RqqNU&*iX`zZa1tFC=MRN}{~FS0$|O$@2qi;QMan(Y4@J=vVFm46gDtCbX@O z_Xc!9lM+wh;*;_sABpqXXP<@VUwjS|CTHNyv9IIp@vq^nabqxcoEo(Yc0MrwH>6p} ziZpAX{9}^lEIMOPnl-ZJ1HJ>xR%ghdACu-N4urwj+E>(B`QL(d0D%mGh;-QA6C?rB zkD|D`Yi^w8wyC{2@-cykNpk>4njQR$Bc{8Egd|X%)p1cyi(9TPg5sB-CvKzl*8aUM zlKdTb%*z}>^-llo;dJE4vd07q=$T2gd_tX-Z0&D^Y=r<3=1A8f4EmbaSYE4_26c^f zoU!g03nSZUgju9n`>sQBFgc_2si2-TpOAi`4t}DLMJ!Jx&8#E++o1ZW{yqi=ww7+) zbUtrqZJ8H!>iUML9!4&dhuaScrJh!LKQ+?q{rhha%HiN{!V-wg1U^bRCBNBH^$l$A zJ73pFq?2Zmt_g%E&D3Rlt@2UbR1fA6+JMJ=9n;&Xhkud~+mCQhcvIV&eAnNMq}Yt* zrymo-o7{Ay_pkc;nlIbRr!1+&Cl$$~_C&W~920zO3rVw+&0S;Z*(MSMPd3uv+f3Yt zGf&sQC%=6eLPC_s6S~>amT3rYn`4q@=_B|Z+cE`-t>nj^G$-k-=AJ@s(y zB!sQ!9K z@-wS@NSd8~MVft^&Ni8(c@iryi9!qdG_)hnbOHt;&C$`K>e`wk#KDRVZygRZv@J7_ z;EY3|Sdi|ZRH*9&lC43;)^9tX+R?gEVHwyPIl6|hPMF8-0}6Bykyxz-D?;FxRGMmr zqs%m-3(BtIPA1C9hUDf3vTWt4eUWBsL{g#}kZW8DI@8qoQlasZO-%FN?BzRjvU^LL;H8)z}{^*vVXhb@V;$0 zdSC~MG70lOg6H?`-OAJl@!fY{<6k%Z3#XobGR{2d1l)4z1sL7EJznnD86!J9gGVkr z56{;sh1tVeqvO-JqT-!bpk|?m@c6yAD~`Ldi6txE}hZ6XLqz~-xfFAauY7O>T=xs@PlaBvKgj*_%7xz`2)2Y*2dWv zosDa6x&~KXcO~w6@Gf-h-T~#SmPM`lRk3jC@A!Jghp1e+I0`;`7w)_1YCLq)wJ7nQ zhf(mM2l2q2_u$?;?nS|ZMbNN)2Xvk^4xPUJ1l@oB7JYyD0VC(kM*q)0M#~poz}>}5 za|kXY1L#JVEZkTkE=^o_pNFsFYF zJYW6}Jaf-^B=b4(&>8tq>A_nuv_n%gt5OCROp6$QRWE( z+qi8FY~v9$eZ$X5)aSQFqT>~J*v@UP9NVgm7_wa&xbNO^&>T2@a#aCG?bN|KTWgPM zbFSUFFD0s;JkK_(G+rmx(OPGu4%b5?%p^G6w`l|1YQ$M2#K@i1lXX;GEp9RYEZ_)> zgP&U}Jv*I83iLs+_nn;|X;NpNkYv}nCxn?gRa~S_1S6qw zyo!D+n48Vz18-p*(~mWfG|Mk!56U7~J){S2Wjhbw@rE6+lTX*U>sJRwJVNV2*v+%gS;36hX2l222xAxnUhdR5E0*bX%%E zR;Sdh*~Yf|t$hbqHwR}=$;RL6G_4-4Y-b1AOS<&_uKFcaO?J!f0TxD2>N`#o@^LMl&3O&_}|G(U5VZV`Fb*^+4;U(GK?^Lq^31! 
z-=KZhc0`ce)Dr#HNV7(nWd}lAowM@}idD8B`LFq-2km7`!byjwU$^zKFa@f6l+Hhq zHx5L-$S!{0`H$?x^HCZ*C!3r9$W~4kJ7(wI*0s?DK3+Io5OMz0A>)re;G$KTGLdGx z#=-X;HZPpp#0EMqU5hN!HW75K#dU2xSbpLk8eq-9v~d&2AN7Kth_eW>M?{vh7-=?N zp)U3P3pY5!K7;kt%aHa$q{*&$$=FHKJdvb%!W)7}GxcFFh0;fa*Qh^k#o6Q!?#Y*n zv9S)-E+WXz@-?e3Zy0Pndb?KVSU0Pq>$|c|n8*_(%@c_Uti&W%Dt;l59gc^HvvpX} zXp|?<4hp6en1ZArs-zj0hV+I?1r5P%d>0WQbyQdeRHfKZRRKEt6jefoGGn>NBkm3- zfkqy@VyuDl+LSw_03sMtuo*wXRF*3!z)u@Hf!9z+y!)D#F&VGDGzNFy{%_IF?0~h-g5_b0LCZF+(V=rYyfuz@Xp`PXk&=aQ_4QZb(yK1T1(#lg z5~WL`eTTN_(W4_?d|@aad+;9IcKtQD@3tH9fVTg-nnd{)6f9O4MN5}Q`SP_gk&F-gc?SA__6gd){4&b4YL0(B{t(W%>3W=h!>xGiKTqMg&TX;&qgSx()izi< zqA9-ZQx^rVxd1syJa0MsGz_a%96g`D1MTj;4xcuwW^Z-m6z$xJyT+d1$zxAxcuAp*Kr$;y#fmWIHzwxxI{89rp>2?h3 zV|9w`+h$Tc*N%dc>f5mk7cE; z!u~cU&7nO_rlfA^aCS-j%MnnoTUCND~Lt;Pg>mI4?)%C%G2HiXfN}!j!+|wU# zZzn_S+eVn(ej;Bg{P{4Q;!qstJV=}~gYhV8dyarPSiMC6r7!6_&|SJ;`ITqV;dCY4 z$)?9@<3y}){Z7hGwTVZRmQemF`4HYm?Pk@dRScsH@u$L-E(O#v)9E_(9pMv7KM3Pt z{?9V>B-F(emFw%~Wv5Oe-|(Z%)5c9SxRo*3zB7R_ASBF$o`88wWqJPTm=S3DrfwWe ze%ebT%p%RnNHcX{{a*st5Ar(NJ(M^0>%hjL&NaHOQT7SO_2}Qk6#AL*K_c6P3>FX} zgM@jaIu|*B9D$DSA9*>_>eaE(L0_>}eCHV$2J# z;Er2vMegicpEx(pI_-4Su3inFefR+huwL>5$?!g#YH{emE)wGHHp0Ar&sN(W+ON^& zod%uTyKA#;b^IWS^B$%mcF<6Ced#5%B?)i&8i{h7XK~}>MR3+tHzC)lXCv2Xry}PmC*!oU&cfZ-Ux!Hp zJ7dd7FJR&5#+cWy4rcVKh0?cPX(P>7oSYwhDitI-eFz;MxB(wDu7s}|l_6;^Vx;+- z21W5@%_s3wo2qE=@V&V1q}=$RV>zrCUJJ`fdKdRDN77uCq`4e^>sTJ&wt6}Mo2Kc6XT~xf|8k~|n z8}ey6KBB~Z_n>0I$I+&KO}svA0Nxxt0OLmtN5+dUvVrSG6(4w9;|zV(!;xMWy!;$j z|1kplVwO4_Bd%=6)=uc(^yApqdA)T4Y7cDJA!!yV4oR~o&Ym>uw?_DkZ1eimfW!8y zfWyFIWLbfV@`a??6X%dLyI`ogxsF%?_aBpHmC5aKNwZ>D8fnf7-fq zN%|hj8o}4a)kk%R>KeiK1OB(<_5Dzap3~}NH-_FXgcRMld}0Lk_fSV4P9d=4;Wb1k zQ>J54xnX~sPMUq%Vxwh({_UfiD__msm<_Hm4PDnGa zQ|vj09YSPHBIyrN8%V&sxD9RSMwhc7&5BFh#@r%+C(WU3rr$K*Iu^X{5`yNRCPlh8 zs9uo`?6>!aW7XHERsLZ<4vsWOX#!}3`FNyRX{2M_WNVZ;YtkIq!t!I=qkOtPwN9=+ z<^z;h?bwm#35H{nW+EWWnz|E`=8!NOSdLZ`adjqX7DSv4l*i;}y+YD#14y(9?d@%n z__VWif++5=o_@-(>qA|C?C=)3;9JO+F`mkc5Pxw-8G2`61$@JF1k}SHb8vTO-f>(o 
z*47kj#&*UH8-;_8xrUP%BC4b+tK$^djX-Fw8Ow?Ytqc_t0X14{kP%rPA0d>LY3#8Q z^Kbzm0Q1wM1{p-35!K5!8mV*Vm51`8BTSlxQNu>zmg{dqo*X%mn|S((C-C*>pONtH zH?sWa;R86ZM-+Ok7FuPSW_KR5W-;RAG z&%3yP^shsB@zs}c`IT4Tth3I*IcJ@Yvrah?rJj5gBS>_IcWH-~l}q9A+it+s=bnwT z3KT%wR;{po+g7Yzy$TOLbe|FDe7W^vU~b%S%U!tqnwwC)b|ZA_H2@tucSna#UC^;h zSG4ca33VFP#hVk}!pN6Kp?2fOc(z+Nbm-9)Otb-a2 z8lfiBRc+cDwcB<@<1W3?dC(Aa8#){vh7LpfQ6tcv+a~Qhpk%dbxaQIOaPl=*BKMi+ zBj-uyBLAruB5#4SkoTmMk-tD*CS8I zQ{xxp%Z*N@9>McfpF*bxZp6C{%Hqq0rSNtAqWGdgF??087-rNdikYpephK~zYzes6 z8$E^P!|P&6FA`=F-X#O7V{X?mc_DuRYJ=t*t2kfHWI%Hd3rOz%sIc8`*Pkk0ZlQPl$rN+7Zi~sLm;cG`o>zjTnTa z*#$%wYy~%3k%ma_q&&rhC@!#lrx2P5Zlsy@rtDcEAzRvKU5J!C;xbaTRK6z)sifJ%5#~(9nQ=C~9&x#p*TctN z_uI?#&q=edgUci7ob1sxbtYT-Cx9N_?t&-HaR(N`lV)3ODX@{VoAxPtR8|UNojQ2i zdHZ>}r61FCP?r#49?mx$;yhSin@X4!8Dl>-Uf_N>=QbsO*_ZVceBKWFt^A3&-b^3) z?Xi3@D>yQr?g#!n^D~`!yD=Ww$2Sri${$=`=t;BMD*IYO_EJBq!L`Y>p-t(GLAwVC zNprgFsj@lwLb!CUZ6t_}={&DVg?xZ@h+tDd0~`2NZ2U?#S&?S@O?{`r=_LKQj`=p^ z%J@Gd&2f4MlQ+IDNE1QiHhpS_Ly#uoU|QqaMr)%1m=MO#C|69Hjev%5<%@^?FQhq~ z?;hSD;z|KIMXWHW0tN%sv zmIEi|%ZFCYnqlX5t#|cPCu;xPCqFhuDIZA z+43qC2`M-oOyVoL2{hCz=_Cn;;G1)_Y`E$tL%0!Cf+-2Ih3Dh+p)zKa%F(o-IkTTp3GykQ95;EO^o^ zqTDM2$+ED-NHw=Szo17MdpU4%uS!@rpgvx%RTwv)k;|5w%gK7>&XyBxn>52`Q>I`t ziSpD}h%ql=^6RfSU!#wO9YLmcMaUNw^z6|ZebW=CRESimjqq9p3?B2OD`livP}}gt zneE5ge-sbW;YnuPNd&UlT+JxRTU!oDvl|geMw(Mcn>}e3+(@(ArnXVRmu2NyT!>lDJLb58`M%Lt~cgovd>Z}yLS zYYgp?kTW69H?plgsjz{e5Ka%)<|Cx^&;0$*Ah{){WKQa2@O^DWDki)=hIx;n{zRPF zhP^Hc=_eu0BFqygvnS2&rpFChY3s=G@sO20GRYU|IjoQ4BCHXPAO0W1o4_QBsIBM-QRG z^AEHM+bL z`}b^*NURpjQF(NK^cG~)ER3(4mnUg1g)bYF#*Bt#@J-zk z_^xFY46k0+KEK_y)I(T4vI&;;C1LJY4T}fX#2>w?;iont+K=O>R)w*=SCvep*+!H} zifx{A4>tZWS2-SV7` zyaaj0`Nw3D$D{V|MoLqm<30{pTnOchJ9sCTg00p@wO|&&y$qN>MMKBjNZ&sp%_(s~ zzyBXdvyc^OHbmw5dLAE6Cyp>PFh>MAW$GJ$8+n%gtUlh>4xV&)_~WX(#U{enTX8dM z(kzl3kY$gEGz&ONXd^$_ZynM{J{v*(pv!|Oz=a4+g zS6o&aS6-60Ba6zH<(j<(?dzC_(y$IOQe_fCIYK1mrj>1N%IPTuX?n_XgxSLgnO^!$ zpyNgaWmkIy?1-o%&Ay+D28h(AD3c>RB0elLs-xHOKY^1qb6w*1h74{QFOlX6<2-5h 
zU_R2J`6F#5(){McV~}Qj-R-|8&gpZB6GESm9fJr@@Y_GhPOn=6>MpKhDq$Y?hI=8~ zf#+#+e_d#MFO&C|blFXQrOBc#rTz}(432Bx+PI6ODTF!DgAmr`c%(Ugx}j^>hC;&Z zVRZ~ibEF4HnxnpjvN*!*xmQ%S$wM2k-eIgx+P44VUm969P=}1ae@B`LZ2bG*w+od3;&|lYCvlIy1bEN=xZ~e<;>KHUz;!qM3zuAV8P2}&T%301DLD1i6LIp1 zd2n+6yvUy?4{~cMyIds4d2%6J-WVhS2_Qi_l+GEAQ zrue;66@1gCBtC0e7_A?>38&`BhTOD4+1szhxTclS>+#z#uEvx2N~F1AS$xregt<{U z%&1=)-!!X$SL#)=k>+O#-H&A>n_)Q#^V0s+v3Ni&EbLtaKRi<$Up0CHKeTw7B)JMn zaWXx_&uJEMw)L80(p(-(2+fVQ`O$=tYaUzHvn-Z&=kdNZ@M7)9@yK;&Q_triTaFyK z^`<-MW9(GMv!kJ(sxuT>)+nAQB#x9sz8%2Yf`XqqfVlm*peFLpZmkz7Ub;32!EAkUMzN?ETy8HL=-0fs>zG+9}ls ze4R83F1*TeC}*@}pb=7^kJ&@^k^ME&ETsCYL#l2asCSX0;8<3+5n|g1^8YQUo8$SW zA-+LKw@-v%lr?N0ek)`puMo?e2KQt_vUxd}Z~9G^X%M)fVzsY3#+S1MZUaRfg&4jxZyg?nDMm{=6$=ili-RJJEE*98QOXR>_(0g@~mkYI;QQGO>41o z*&_UTcwY)>KCpK?4({HJL%TOIZ({Ez?BBNqyZC$W{;fECWDgD>-V4on-nnBF)~#KJ zRV)9%vL*AeV%Z;9vy!BF`8=#zF^A-N9#*Unsa}IIuf2kkPdf!CpL{9`6gUMZop>tp z<~fmdBI%8h!|IrusI1?9~eLgO_;6j{# z!8thdoKtc7$*19hGtbAx=bVdk&N!9lPsN!h_XoAZys-nZ;hhmU_~G+d|56wH(WfE4Yg-yKS`WW$joGQ4VoT((Lr)JG!e9}P-=_gf(z5~?DGtNw$RhIW(%5Hj2%_qz|l*f+K zMyY*-bmEYr6OBfOgy$GA-?tmbgfM&3Y$S!c_tP@`lm{ctB%)c9W=EVu-Fo@{3A{aJ zqtN~i+NXb*>>~Bt(jP3|#H88THt}7OV}oqwh;rz+Zn|U$kt5bO`W^F#kW*;i+M~+) z@9O79se|?u#0w8cJd+!GK_krxc>MJ@Fz$8PFSWh+JhJi~!TVKK@Yf}B1ili*a(7N? 
zkkJmKTqQv4QpX zaXW0!u1%+sW^4DyAkA)x#mu_(*V%8yNXp~Cx*3yZ>QW=+;WQ0LnyE|HAxoc$orSGc<%AJxWQbKxzT00fh(($r9E8lJwS@ct?ng;tl8>C9Fh> z69nm!M(Z2v__(vn8Z0|PD36m#ii;Gla@4WY&||6eX_KbnjWKWHx@)gDlBVU`3Kb%; zTC#|A`A-to&2ChfMAwZZyU}D%fHi{bN0_zMi+}oniKb>8B~f0oa4z=j++qY;L|N%I zMMHV++qDq~_iW_;M(o|Q0lRmv$DX}Bc6d9E9NmpQB++~JZpGexTd{lRdhFV|3Y*p~ z#`-mjuy*AFtXe)7%a;C%B}-=$zhm{9C0Ig&UbJ{|Tzbh>xbVD7an|YQ;o@^I#}yY} zi)${s0heERHHq$}xZvDNaq7uu@+RR##w*558in@p1S|vXjrNksyz4rZa(`AoJ_)e zV$N*1`(IaJSmUaAvEEY{UiBWlQ2Rc-)Zj4;s`eoMm@)*rXJugT&u`=K4{u`oJHzqY zz-E})u>yW*Sq!h$E`|$FE@1Y!|I!QaX2VJ(&9~zDGWTFc>&hg}74S{dikQ))B1v;u zeABEVCN`~%D|749+jn5e=x2;HFX>wi3rL#h^{9d$o+)N?nnjv5r+G#1%8npwl?;(# z65&NXN@GcnvYAM;8*wg&MI_9^LZ)4y^xcYMeut;=)w88g?3(j&?S<#zyz|cIO+y~E zYS9W4r%l0YB+cW-PsP-?-r-&KWcs6CG-W4AzjmX>0ohToS5Tmj`l${+sD(&Kvm;SH z`1v;H5w^1sG3ijdVEZuAtad`Ujx!?7%tM{L1ziRvH$pa!WqZ;rMDnv^5^?u$v;R+| z*>0Gf>?!pn!nwTZWKp9mVV!KfLkVEI|aFRzoV&V_L4d@ONt+LVXrdUsAbq&y;X zmd?eytm0mJ{!Tv*7PFYf>q4DZQs4iUG)tF3UeZ;TpuEgmHr8lpJTj^}#&V>>kvsWQ zDtw)MCn$u(*>{YgKCHgc_4bh!`#^}`rm#5Ttn(@}vbE`%$U-&`pSPDMt3JZ_eS+7= zq<=)3rSC}ZM$SZ-9Z8En=}i4Q((L`5x}jcE39~279+W-Oh0{kO-#>vT&4RN(^P~?X zM5{!95*7nq!Q*=NtStx zgxMp#-6#+5GalZL(vj8qSPt(e(lvGNj;F?im@Apzx!CXJ2ytxE%yJ_5auZN4+TI2{ z3~JxheI$$XgTz0^>l()Hzaz~t0@5smyv^$`U5A4D1J{qZKE+5g+m(@K^^de! zLYnCZOrN#Sq~_}(U)K%7c;>IwM{Lb&#uP!Z#gXQWVE$1i(kymO!FXX9z#ZOBQBe4j zhYhpGjY9c3MTdq+lrP0NhYas9Qyd{Mlqbsx0tE+(_kl1(B2U(3jD775Or0zD?MFU=t1;+JcRnmtxvGZ(;8InfMEPap>?a zp4)~$Nuu%RE`ntI&@Rfh5nH#c#M*U>ux!QeSWNP~V9{(WSok}>|L!}yH~mu#=|2+f zTX#mYdacm3ZcDUm*aq#Ibw-a4{n5Y22()hA6;J-BB%XZoX*~7RQ+SHoqD4xgY^f@! 
zTA?oLRd0&&#jD`f>u)sY0y)+2ly*v!BG_H*j zH(rU0PRxk{xw7HvvrfX(SDlUWSL8&8N6*6h?H@?6cTD<7MofI}tm6eG5B( z9D}W&j>gS;QZ7Sodh86Ht z(+Zf;tP*B4EQha~RlsBt=j&N^(K~L$V&=81Z&fVnQx$*ot%14SE90luCGc&tr!lkH zQ#R7PvQHKJaLMX^m9erfNphd^_POl^T}#>&4I71yu#{NRQ^Z=txf~V>eac}$j}n;Q zxd^_X%*8K16PKKnAD3Ksk-a?nWWj;W)B>Ln;`r`yPSQ1bA zxnDjkbzWK$AdEQs;2XgQc}JEQ;Dof`Cxiha9Vwl1*p{O6?qwhaSst^XBrq^(O33SO zK6OHxDU*o8>k48Fa0<*8{D9nO@Yc)y|{#9>yd$ zn#gSsKmJ>Ux^uER-K2C#>S!2%<1gf8We>`cS-!VY=J)Bh(uUWn6Z31R9utwSQgz3D z+V1a&vyC?LTv8wQ5iw~_mBk~agAM&jI*Rn>AH)9Fr1>qjlYls9MVf6?=s2X==Nrqy zah-&S9N zdh)XU7cje02aXs?MmLqq5okxEbu*fk4MUrxK%>Kv?h>%P@tG0o_E?DJ&8nfyW#FT??s+mC!l7HdYC`&50d4hIDBv)c5d5%{kyl2{JOQ4MVvi6 zdA3nyo4>rB#CVfo`LIlJVuP}jnBTEh9iHGG#}cAgG~45p`G}PWd5%|58zMXuY97=uxxUd6CsucBX{7tyKxP_$~=6V)p;#)JQU1UFxE9d5tw8r*W#c_{nX zZTRrz-uV5Uk@)VVPWZA{V`S7ThZ;9shP%!`3Fn=V4Y!?p25R1O1D>n?D8BDi2CH7I ziq$Vv#qQ}{ux@&99QfreY@7KKwtPMU+uj+B1;blnZtvPy(6ItOXZF6oLmNn8my=5ibcKQi;{KjiBe`G5x>8qvT zsu^jX)2$MIZd(%HH!qCun?8xf-71hYSH-Hnm9e^i6|CxC39ANF#)57o@kf^uSl+t= ziE=qC@2UCC6>PhV(EMh5DUg^i(%h#k=5;TP*}dzb;k`HFyqr03$$1yzgxq;?@x|w3 z@aRDp|L#Og88;apz4;D(I3UdoI&Rd^2YC7>ebmFxKXb&Hgft4kf*w2CTl(5pdS0)< zn%?G80!JEr2N`#i#~{r%(i{caNxZ>KC(Yw*%}*zf)}i)mpNFK`jX1mDXJEc=*`my( zIqWDr2^5Y^nz>cGW*I)j8;ncThyAvWNWqwP zp1@se5T6I-T}0wE^L9)Z3R=s43;HX~4yk2Z-%wInBh+cEC@ zIk$o*%{E1Y@Uk3Frr4j-^!(qDW=+d5(i{`#%%oY5Qq{4!X)rvVCYz9+zomDQ&5<1gxj|so=+^>#kv`saTJ%4xDo50Z^-mVTNb!?v38WH;Se*-!1o%GEYPtT*OQ-Dd{tK8|sK z`|jEVSv~!U&u3w$W5G?+NyxJQoM!LVrD>|k2%uJYX}>(qi!5u(Ka(e z!tCIppfYeS+I;(=$kPDX7hv6D(MV&YLtkBh9RY zJx(Q#N18QTf_**Xzl3025fPI<{F<06PO%Mf88Rvnm^ zf~IXVL+8y2OhM{s*`T%0>#J9-fpg9{8+mi*!?W$$W9QCYB&0`4ICo;tP7-BJz1X=4 z2lwiWexCR`0)2#J`N%<%SrXrEo7Z6Dx@Fk3VFk8rS;K2LV6UcK=xcyRxVK?{M3~*k zGD&i@-Mf#+4s0W7-io~kHsjzClHI>{VE6v@*t~ri1`qF!pJsmv(qHV}y}`(~5p@#h zLr3@H(2@Nl)%%P%|M}M*{B?9E@TW$k_u}XQ{yw}PAhBM)=vNHs+n!|kQQUFe1vvAB zT=tUPjh9`I8s+MsRjZEZ)oUPz3>c2So%*18%@(NebY(nt=i|8j%3E;Vg%{%LvroqL z=jBGxJ5R$$qZ?xGgs%8@bR&G)t2%z_*9hY)KZfp)-GJiPoP$gAWy5_JosZX>*Tipw 
zn_$VX+E_WHA~ro=6T8N>!@6-Dap=33uxG||*!Ag9Y@gZ}OGmfD@4ai0&{o3t-Rk0j zE6y>}eATH1FtEatc)sj|=zQO0_?+Z;M$^jprg>!}%U?GlX>L*;@3pFmyUslo_g#G{ z<_v3zr6kOno-w~~P5j=iB4)KCX>Q?2^P;ZhNtj9EMVR|l!pi;?v1U+Jo3H#w7mYN# z=^4vOnB7zjhb1J_BF%pg^LjHaiSwLprSV&jT6m*jIb2g9H%`iz&*qZn$dMfttCq#Y z>1rbr@cx_A*p?#FY|9$zTmS5|*l-=`5z&f*rUCLi zL#E&ng6&7aD&ATzZJDy>7LPY5t zq8P(inZz1(luk}PGPKS}Hxbe2TjMQd^c|qt$~rvmd)rVC@t{Clp5$b_<-Q2BeT7UN zOC)nR0O9AmhDagJ(upSk(N_A?>MBmJkvxeg@ZR*+{eWYjPxhmM)8j z@3=z!{Z}K+8Xz&!takviCe3~{NYgX?G#M|~alvat;%qiBUAr4PBhG4bv{l?M#n(ss zB05Go1lBqBSFdkg&#hZ7)#WQp;l2aQ_M|BW^Q2$KBh9R{M%?WNB9iZ)fo1zX)nr!Q z(I|6Fm?MZ}s4m{eLOW<*Z6-vdGyAl@be{<#%mRIhvY4K;fcO3QSXAymh53m5M~FzX z&2^5B>AY_%p`Rxs&LL@*&165>(b+V%3GI+VoK*(%bv#5qoRDTCijvRk(tO32P2I%D zA|JAJ(z9d~m?na2r|D20J}-~7yyCJYhik(LVNQj7)~;vS&c3n*^Neh1yh`UHox6HR zF`KXf4<55EV`WT` z42AHnVhSs0$4n7UQAU0v$9D!}844D0;B^&pZ{Tvvb2e8c+NzRvovt?}I>`CWKq|}1 zlk5(0!CU_jg8u|-s40(n2++VS-o1kq+Ek~^>FaOUhLgNvbtlleRxFV z_@L%7?tElF?n*w`(&tlO(TOy$BoDF2>f)%dvA83G<%yMwZ`)>{>^Hy&1c9t;g;? zo3KZOndn z{x1xueKWorSq7`d*T?J;mGNW0D)_N)1Crc&7+2|Dyj=DUw0QV>T$wL7?mq7vOlwyQ z^LtmrqCPdSyniiheW4|`y!I?sz1|iFz8r!5-;BWaPx@oyv~F1NLNol{uND?`uYkFI z>!QT9=b3Eh7RZBcWgo`a>W`trzpubYO{(DAmR0agi>mmdSvh>$tSr8$TNcxsSHry* zpMx7OI14`xXoh9_+;y+Yn9H(%)kyQRCGdT#LYUe7X)Np_(yUSDidfmN99Hx%kCg*8 zGF<`lJD0>tP2cEQ3CnC*x{6pT!rV)%Xq0DN92WL1$MoFqPU76D9A@^ci;8z$W$R1l z%dZzNbKv3&&&Tu6jl}dR(=lzrJ8UnLNr>1EGDw;k=*Q8&$I~a(SUq8OjoJ|y-=5L~ zXiSWv-|i(2hm>HEhB%021(C(5eOVi`KohoOjZlQ7*%O6?Fk6Qd&FfTM{G;xeC z(%ne=mkX0oI}!5Kjx4?IuKudCIyl81*U_nSi^)0RV^atP`>A#_t{|`aQci=5jjF$k zm60y|wbZX45l;a-9|u=oA@$L|x24O+It9}?uZ>_YPsQ=s9kb4i>HW3RQ`|9H;7^6! zM3_IY&Lji=>t7!B6hL~674x-cC_ zK9U9EblR$%NN>`MdkKl4UNh{*(cjofSBX68$lVCDBg|8IOuyaolL;hSiVkAkxjyTk zB9@)HZ-hgbZV4CnwAFx>gW87k7jCVN zZy{TH2FHVhn953|lV{TX0MAE82^()pBNXHa zQ5tQ>vXM-mJOO=s^}sb(UyUwZy5iYqpT(BV8%bDqVUOk*lPqfl*`{NVG#}baqD(R? 
z;(O$fMwfSE)B06dyLu@$tXqj)J2#Q=^1A&bzWXqzr4W22QNP#7oD6JU-oQ> zrTuDRai1!f)2ljWlQhq2UkWpyDTK42&!Nn}^{Q8B2ku(aCy^T0B82rMd}rx9%7ipCZK&@^*TV5gbD|>=0Vj*^ z074v(G}~{}A=AckIe6l%{p7_OZ7B=u=dVjA&FSSkl9AXoscS--J#^fs%M6`M$g%_L zP5(?lnWA#NZbLuu$1)+h&i4!6&;Jf_ov9<~B<#RNUL0w5zUW8wRi5*A=a1n?b2@2G z*FmDY|BW=8%)ZVBPnrcsnkT4DMt#JF|>gpg)~zqDUg5i zob;$`>F04@P2EVwbRDG1AHmwRNM5i_yH5Ia^>FE(-E?Cwvixd9n*C_AY^R%a#x&aM zSfp79HCcF=Y@OgXaSm?(dS&?S*J*%hc>k-+HaPgY@h}yG8 zmVKLdWv9Xo93-UK#UPdw#n1FM>*JKA%Y?*#6R6!Q7P#M`t&B8BeT=_h522g=Om$aW z<2Fvm`u$g=ndP{fT-PqreJxvPwDio<5zfDIFYP$k+!CI%kAsBpft*5n*vJbj>fz?4 z9HSuw@Wx4$93oQoI0Xo%kHIo&^mNi3A6K0+coQ&jJf3aS3ORFR$Ak$JY&p5zySI}t zZ?}xk-0=$bPb=Atn*t%ss3Ga67*uEKS)-1=Gwac+_(`q~a z7mni3qX)5fr>1o5B3a&sy}W)e&w0YUdp8O0?u|CxLfETOW0Li~dv}n0vrLlagNL_a z_rA4~4r4Q3#K_V8jIbaZaOUw-e^_|^9Izk50|VS+7Mevm{*T(hGj1{$JQy`aq!Dw*#2H`tQp%5 z3!iI*1%v8fVYjkaIkX9$dHgoyAnD9cJX7R8OlnXJeIB_5Z&xae87-^hizXHEMdMQV zv`%4sUZ)J+Z&4lpx#}XEmOUHZ?@}8}2h}6Ft%A9|Ymziq#!u}_8 z%+({(ToJ1VRKV&%Rj^_}6)fml%80U&lA=lU!-<$OVIn4s&mdtQhw&s3S(9e9 zBiAO<187!cQV83TkP_hHHs#w@_&WdtUK}h57m#>7Lgta?NFEz)_RDR#z?@2&eO``8 zWVIqgdVNCJsEW4XKDU~BphwL1mN+{N_gK%76v5hmS~+@rmyl-FSDgjrix9>_#W#PQ zO#{%Db?|WHpXpdfL8D5BNCysy3|Z+~aHQGG;EpAvSshH+F)}auNK95Dq>yIm$P;Gm zOIJK@pgf6eo-`AZ+eUD7JJ^f%g9_i(~SS)!He{%#mnC}=t=R*HXh0dje zF<-XOmSu!XZP_{MB(9&y>hztCL{?rGL3Q!_HmxUuucuFAkewqxo)qcF=~tbvWMDZ$ zRG$39t-r0l^i53VcCzW{za-5(=Id@ZCdxzm{#>;0>)~W3;yB`nlq1dBx^^r=Wm7bw zts@UAljjMWMiy-+(ryW9c1Y|%y|S$tVWuyIQ>Z*)_A$=sKT2;pP4p#rduENm7IrP` zDt)BD`wY`Z>CDfg_>_z^yEy7%Sv2Cv>m0PN>!WrU=`@ZLJg+4?p$*6m9=A!KGVGcM|aP` z(Y2FsXkK67_fFXMQ6tQIqXuTZ+zg8{Mq=@6y)mWEU3l@4ocOp*Ui?=746JB<1=e)F z8S8r7jZOU?#D;zkWA)%dSk%85mJhCswL@xQNk0+PhBg&qN&l+&u6G?geBr6M=EQ6m zUi%6BKCmW!<+;@(nqcGTmRL2aDV9Ii44cMx#KBJnW81WDSod~YEE-iGiwD=XX%H)i zHAJ`K_aP6-r@kK8sNlabwP{%le)=A~P~uU1+qxz`Z&)6m)h~`uY8Aql^~&KR66qq> zUv6I!%xGH`%ZE3`e3JD!nx3K6GoCGjAKR2BX)caA9m-*4-|ARJva2Z?tNK^Knn9JZ za$r^b(XEt`mxb2 zQL|bdeDu!yn36HsUO=5XaS|p*qs?D z%C?t@Gz-Tj&FUYQ%2JZTpA+uF1Loxl@jBh8Fa32COxMw*qzA(?0Ibn3}*`UtZ}sDGb6 
zU2n%nsvXi1>QS(pW_v=Sa>RjroZ>c$qb`QYpHvr5ng!-br0dniil~gp-{bOPSu*=p zI+FZYhT3}cJFHLa&uMMaBlJPR#nq%8g!pHqIRx`lUA+#{5z0@$NJrwEp`Rzdo;*EE z@OPv+B+MRhI@a5B&g7kh1U}B`eIR#J=SZ`@)!}^K>E86kZ8qIm=_2~SG;ey z&PdwIc&QL!7SupRnqM3}0+(HKk&&8`C5pt;BKGaq)QlZ54v;+K=pJm`xDr49{5`(< z=5w@e`wVWl=?0v0?s>>x;AG@I@g$sZ(kaN5SLDcjFyy*xuQ7w{-@TIrcPoi3$#6iL z_eAS43%m9R{JocCn&SdT-;aq%O?<)M>=5{RYaxYePy9ew0 zK7h3Yp2C{J6|r$xJ!~3S8*2vE!|Fi|uy$ZAtRGq%^M^D>iGN*y%kyPJkBax=m%%mh zbFWHRGm^xARCC)dAK3_--fEA7AN0r8iJh?Vt=3pPoP>E$bu8^uo&>ug23LI4MwvyN zt37lJrZlgB(WM^6h{E^b+cq^xn9JhRI>qp5ts?lec4>UvqB_doew}@O`?&_CSVn8i z?^zYU^{#5uGk$DW7C*HuiH>ZC-OFBh9M^R>JB5jx>8hyr6q&Ea;In z779H}V_wfv_^nr2%v(ov#E zjRuJ@PtlD4b?-ZgFs>zf41>P*6TN!fsthlC)^IWiDSBqia)g=k{v*jrMB+cyMAY`lhund`ZgV6vf^Rnw;med4r7nEiOTiI(`88H zaIw(KWO32!ARY0wJRg$f@$^|i{*vP34z5m#Ke~Yc9e2Mq58K+4>XTOQl;$V<i&leCJf!TF{WwrNI+s?fdB&+JEA)hmk#dHe@Hk zdW)UB{`Qfu9zCoV__o_f^Pk+`N7DKI_g~_RuRp`nMGGMtugR7zJGUf5IdUVXzO^-;{gNOCTC*ORCeShx4ww>#-Q!fVcI*lyvBVpdZYonXCv3)B^^9~&5R-@5- zNSJqOiU`wU{}Sw)`xX|DYmC{$%HWUZ>to*Y?eY7I{V?mL-kAMjXUrbi1apQ}$Gkyh z@pI?L@onpS@NJ7b@LSuvv8?;USkwD4Y#dMwn+Dgw`T-5Fjf8n)|GHR5(!61CU2GUo z6Dx6S7F>6o_%<5GcYeqH0h7rxMZg^9sX@m`9TI0aGeXt{=BR0I= z5=(~H#L_`ku%u5ptQb-UFV`)Elc`Is@=^NE>oB2ddAwTjNs{LK@ny?uMw&mXT@0TS zpVTgi&stVRjk|9$lHI>X5iB0r7IV6jH217*Bh4bsKej20A6k~cFKtR;MW1R$npgGJ z%YYTIYCuJ-8d#M?xGd&(FNr^Tlrk*rL0yqN3k&oGKw@F{GFU{2IM44{5(_BvT%MoX ztt965Duf>>#k4Lp#rMAMsXxH7P?N$>y3JdgT3O8emi}JNT2SI89dKA zdwX~t`%z}Sr0qH?)-CL`D2v%5MTUqpQwPbTaREniGkQ5`_BrCvat^ z`j^*PJk=qrn~~+rq}lmj;AfEzLn!~O231lK5ggj_Z{XAj-2YA8lpaH!W`WQ*e7=65@}GhE#AOCh++@5BW2i8han}YWJrr{- z<~X<&mK@hgk5kfvFE=5{gx$ou*yQ3+;+I|?`)%o?YdukpFi~Hhwthvvt?L5N?=%%7 zSwY5;W_R-z+0SgK7)JRb!h9^G>qH1ivyTmK6{ckMrI<7mt}k=`=Rwl!g10sF&_FV@ zj@vmYm4XssCzaJuw?|fuL5}I}=Dm}?#ym6vr4Ch%B{HG~q698rsWYQdWE?fr2)=`N z0Y-{;*O&qMJInQ&y)(!GNSdci7>9QzPe7vvb#Thb`Hjpx_UMBouv>BT&wV&#gjq|^ z?Kb=|cNRYV{PVC#dnM8V%;ouGu=6!2$Xz?dl zG`<~vA5{tSpR0;FFEqdp&$q>lmwMv6mwVxd7dm6s^R4jf$oiN&tSS}_tAr&a&wunR 
ziuv81BAI*&OZybU`hjI_gn83|CfM4i4mOh@t{*`1Oai^WR~2j;`3%}VdMnPM9vVJ= zEq)qM4ZrrShIJy%Lz`edv1Vu^tbd^e_D|`C9pj(H`d6D`>F^p@I+$hlEr(@;Y9gaq zNu0@>6wPaX>ZU6(zF8%_S*-}3Fa9t-<^E?4%HWII#qnjGlK6!9qD5uYyZ2^Wy`p2e z|6sw$_9U{^N&d^=rw(QCL)$W#)vg?VXjKBUT9>eq=GFbHMsu1;oQV|!Dr2c$2JBS^ z^GTR(M7gI(vnS4Jq&XgmE`xbY|7*_@__arI%LW8UpAY@X*TYsGiTe`D9M&x&$PUMz)9kk)!YLT(m8B_u%zri^!KlSl-@;9C3Cd z^_d~U==)tym_?wCG&5Gjq?vLjggdaAz%sK!$P+wi4wp(4Vm<%QPOa2JS3+t)5RMHzdT%gER8hB{uW=uW9g(>H-|o5GyvmctM5l_045;KZr|I*>GHoJ z&1$E9KP1h9;)8ta??`iqY~e_U5IT+zIv(yZs6(h^ACR?m^Fp|<<5|Ez3h~4_el$X1 zeL22-4=>}jNlLIFe$cL}L z`UJpXTQ%daqx(ph_u7gXpMUZmKKk%o6f3DO24+L9JSQNx$a2mTkRxY4xWdr#zA$kzJD`p>r)?F`qadRfpxI1e{F2+Sp{2% zw?ePN_u@?IqT2nJ;=A6JF?T>M8)4owxCu55YRo(uVeQC9*fXvJc8qO{buTxKIO?kQR$_w#!lM2WnIew$U6ZoicS$y8GG`^}^5?|LXjgM-V z#Fs5AqvfM_*hq8zC-25@L)zij4ps17+oJfcby3WGrZ|3hrUYg*c^W^rEQS@m^>Gt@ z)T9zt4y=gfB+bi7npY04hUEjRU}2x~Muz<;vnS1ebmxskgayQ+Zd%*96y|g@IAcxN*D2;joeX8yJW$^mCU82`2b?CdGo29atAj}I5S%V# zH<3=}ZF5_hhtet^%IxzEWe@8R+Cs3G#TAPwb1F&x-$`>!#B|JLJ~qU(|v zYx{w`u`e>c*SFuAzs9;qg&!PnqtT2Vk$(xHZ+bh1@MZihScb2+*(K$CRDXj%rt&0D zX3}h<%|0DvPO<$0E&&t)4VbjN)z*2b>IEw~7T}TuPV8-=u5J zXkX5pWVG>6Ehi$Wv#j1q+=&U-8< zU237xono}krzgxpXq*^KuaPOApO?WSc91CWO7VVeL z#15CF`JKrbRNfdAEA$ld=gDa!nLWC7uqhgc4(-C>L%VVG$bKwex&YIsPQd4%e~gl) zOOd=~N4^sZkZ7NX+_^}ebLA&dJ_UJl7qH(VcNd<2F~+?30@kivW-kJYG#}a*t)j6< zt8^qI&RRudj|lJ1P1wDY$98Td(PlcE1HILzi)aLT=MEC#{kw38WctANb=bdc9rkTq zkG<|pj=VpChiV#SCCSUZHocu);&99k3WhgZd#5f!m| zR0Zx=!ur8=v8I1>66U(t)VCVe4ycYb{c2)Muj<%7q&Y@cD2Q`&WJB3I&c(MqDq!*8 z`ZmJ6X+T44AvO*sX&zP|JKt=B9dEbB+7}y8zUo*ss5}<;uYg4Zs^Y6&HE{JQZrQe5 zFE|5Z>X*Tk`lT_cdSQInpcFo9R0?0$FNJR!l*LE&NT6F)M)%?m8$qq~z|ELFv;%%_ zTM@r>EQQ~?l*4Zw%i))2OWP>(ybfifF9M1*SLDq>1rp^-STU$7RtzRcnwR(IjY6;T zm`g(J$?|+dd7_+QA?voVYkAC38C^@`mmUstx|QK|l`xw(6|XccgR4%;ZKV0!GtR{D zK|?Wp%G79-nZaWG_+X@2z8v{>&@RF@_)igDAGUiJ@G^+32(#eYp0z)wH$ucbd@xeL z(~E$5ktmfk6V}dM;1-TYnr#F$3i2MuBhBF>&ScG0A`0kn{RmH*{fmH^bQ1^Ku;cVl 
zKAs#TgxN^5I@O58g$Uzbh@@_j4Awa(WySUJb&28YD%*LIa2oj2kf`D6-cFZY!~%EA3~;;ix}l8m^ukmYv{iLYgr+rz)(lL_hP z)D|ND$wZu^yc0XB4a??fU_DJP$;xlWUv>=^hs@P?YD!|4X-0y z+evJi_-5#D4vG9uK8Msao}3E4-tKsKjbs-|%bZ6FX%2{UYC2a&xV)Z0#{$??sVF|A zkU(#b;~~^%gse!jreCQIXxgjb+k`#!qaGch1f%8C{$h$Dyg;^@(R zIC6L|{ycgBKmGU}rca-QFTeN{ojP|$u3Wj0J9j>uknbcz?wlteSB|_!rq4S4Tof-- z(mrDG$DH4=YS|(j+`G%5QRaOl%p%PRan`6Zv1>DlvOvPTgXec@4zxz5-MKwls*dM& zZ6`>OcWhsS?OWGk=k|^E$?n}dHsF9>^4q%}e`$%jT}yCq=?v`nZamga?2EWe=X7AJh0RQw!L_t(SLR}Xt2Q;&(8Jqf5#+m_o zp{^P>cB_aTgPLJX)j~KwcQ%y1?HtVPRS7GHH^zoRjj)+zZzVPjB#9nUA3I)eg&l9T z!m8)%V%f;5SUjXW7Lvp+Bx(L(Kt0@kft&7f-B~BQ0ySg;={d6F;mgm(jGnAhms*(LgLzVhg*~cbQI9HE!tJu2Rj{Hr zQDl+x|G4}uI2E1k4hxaWl7XaVm8V1OxCyA(|6+}-bAP_k3IAVrcIiJNfRbS z%g&8uN2?BmK1@IMgvo+)&_@4T#BECLh})#?t23NPh&X3Oo^?KIf4t7fCuQ-|E8L5K zo;1HrDA@Q8SMB|uk!F20FeN@jaGg>*QahWleH$dJVnY-IJc&!DXQb*ec@tfosXLdz zb+GA#Suf$~D{a0b43QG|0(d>R_?OtlBeg!Xk&X1Y80H9b0_P7N?s`|YBg_s_dDOR? zn!%XNn41!>sdpFeoy|1D>`1dCQDb>+>QlsYxz%@wIEi#Ju130cUy_~Y*S6vBK=(o8V_Or$xDILB?r>X^utB3p1Q0nf#CO<>E}1%7Jri|twMKuf5_ zv4-teZQjK=rQ`L&>#}u}CW6Xyz7dq;{q7%FEz)zmXNZPFx#x2$$cpc*667djU|fQn535y25MH{V-d4h0RGK zr%ZQH|2rMZC=9?Bgk_{GNV5f987aQjU2k8d3M#Aeyuth(s3*#PnBPH0a7Y|XhM*B- zn85HMah6kP&R|HK-<>)UFOD9CTW`GHh|D!tT!Ljw7XUbFqs$_=Uwr;C-g|cnKKb|q zy#M|OC|tORrOlq*O&Q3UBNxs)=X_MHRE_l;kNLmN!`xqg!|%WRjCHG)m?E&VCPT-g^#D67Che!3tntWh&VUE(iiHJ%#z#^8=tF( zO{42$%X8cxT@7o7)xyewjY*hEqWe|Cn*Norx?dHn?@<9;1~s}sQPvX7vwHDa^W=kv|T?LooOUYC<~6U^ zD?_r}5WjaSMbcXyi@H_D;%-&2v^z<0uPRv9y8>49uZYz{Dq+*}wXyY;hOBEntQuMa zOIiLhBhJ;ZsAmQH;EG0=MVLKlUdTKa^{R@woy*~;4#n|vr_z|+u^i@htBN^&D&w2B zk7HKnqWHCUEqvIkIqtjW66B;@C*;Y4cCFiD+O&7r(Yn#*jPVR;3NYrg3_|kZs4b?q z$K$on5K5Qc<{d;>M3yHI3|3jfrBz$<PWK!jDoP6qTvD(%eCMgkY?hTq}j_E z#em~N9ZA&D#DP2pgJdAD;N`V-Qq?gS5li&;_oP__AtcPiTN9a=I#TA7hA{4Vgq>5? 
zq*-ywksGT=S{)2godws4L~$?5BUKLO@6)L6)VXe2!ja}UX44i%mejfUPSr^BTbW3+ z$zZ0;?Nbhtlyu?}KPG2cKK?tOdU+wmWPM}%O;YSpd5U^3K4|fazOQ&2#vIdM7~fu3T3+`$>m!{;=>#jw{30rY z&=;5;_?xzJv5EUS7PVWCV-aS-k!IyDh%}2Zk0Fjvn%zy2bS7P9?l-c6AJ8#B&7yrd zLhN7!S;q*spPUQ*PRK->r6dWLo>0fkC`95UkyFzNJmLVX4#AAzJIpj#zPdgX&LMV2 z$wI?~Pyn1kW4`0cpyEU5n0vg#JgHO#cK4f#Wj<1x0*8XQ9??#X?v}i9r)Dr1PULnH zgE)zFsbYnZCx<>Gof99tGu2Je*uNVaH?Ad7o{slFn2wJ=dLN&B@(JF1?>*G7Umy40 zeK(4eKzHxl1@BCmg1NJQ!L0AU$IP$3!cX6Ri{F0z8EaNA$9|16YfiJKXK1Zvl4ucN z!L6#X-TmhFz-|)i9eQby=XP%4G0jgF^yR;;B+oiVV$F0r_O8dD2i5^Q=41ELkFnu{ zky!C^TPzw>8_NgP!5Wg$^+T#--O!3yHLN<8jcSNF!yDp<{&n!vpt|^VSbfYL*%0$a zH^rRiTVT!$ElHZ2lQcKQ!WWui$qP-eT!i_>x>!pxx&DRP*z{r@YHoi;}{$gdU z7*-K$S=NR=RY-(uU_X1<>-br#`-}drTs|0 z``6>P7Jt{m?lCQ}ZA??F5@{Y$6$=NH$HJcFv1~|9eA2rzt~tZ4JaN%U1@K(8Qh29X z1x&493ZFHujGvoU!`J-%N%PY9q<#r}(WnXrl`f2nPRNT>c}41!bxb{M9#a<^#_&etr8-zStSXj}P;0uzlD-x3 zM-P&D%C$h-UKOyoPbHG-Y9!ZHFsDmN{7lk3yJH#r+P*Aicd3X4!|P*4*V1^W?t_@u zrygbxZi!)yO5?0N*{wd;-*h8hQxMOXO5dHp0K@$W^wo@Um_~vz?G5^;wT~>?U|OeQ z$Ktl=+NK3+rjG+51B35W{YYR|FaD*&f~ndU^K|(sUmt{2cGC7Wil7&LGv0dDw*H3J zf+W+}Tip>H*-40sM>p_IJ3FEF_ESk!jv8lXS6EyPd$d6(@J9EN5 z^utbbJa16Ok+6-^KI`OtI)36H86wznTa?9(sL>xx9?C}_PZ>B7NZ_=VzwH~LO|_RN zGO6DvU_GoOclFCs=LVi*dS@s4M9}`se<_zfRXv47A^v3bG2r|l{L-9mIGoOob@Sn2 zHi+tcJovgg(&_DxARe@`{!?~UUFh?)p(m`mxlUwaS|dt?R)XQyen-tc+MDm#F`Z9MV+j1p6IA&|=x0KzuXzOi&AeQlC97l(K^?2@TK);&vM2v+ zt7_AZR8l9=jj`f(+(+28t{zqf>*YVbGgaIv`aPNZZhdk2fwP%C)r;(@zDpmy(TXtX zei_y^)QOYb*Td<5BH?aqTp4kDr<@}Gj{LhiX_+~{Z=^cvQ@p->j0%x*oqij<3?3|p z@G4Qb%$hj^vu4f2hV`o*NhV>|)|2La_MsDvB$Ff)dq|jVOQO7Y*H#;q-oHoWn7{Xs zH18o{-b--*z+MsU%{aVwEeK_uznWOri=??PNi#|4*Ig^3$R(%Z z@e5AED>VvY{pglhIjBC?3?%xJe3QJd<24(H)W+^tnqlk9jj?)EEi4#R5p%Q>MK_Yw zp>;5-+FiIHmz&aX_6hkgxXe?S+N2C7)GLDb>X*h3O{mYQ53)O zQk;}M8&1!e4TUZ~8y__$k!@ERvs;zIoHpgLs8eMdX45(^L&CTvu9CJEy`z*`p zSrzk#H^RKN!)`#X5jJmGZ_W!lpalIGVan;#`|@OlV!AKQoIEFsNrr;_IA zhRtl{Ni&I!Bg+$PJC^zSekOdp9j>G@nR4sq(szE^YPADTn)P_tNMbbV=|6hq23mqV 
zx$hAY8MT|tqnTNO=a}A;=J1?+*YriwV#N6k`jQc6%4LMc!Rb`KF27Sdb)&qly-Hsm z`h6T2(PdtL16Ng7Qib z%IbA0f3{BMd2NJKa;u!INVDJwTLORF`L!=U zhHPs(VmX$l&sS;F(nmgzRB<22S$TO-aq=H3xeot@1*X%_tID&D3^+a>~g__ijb)`{t< zdqy}N6ShX$nCugxs7?6=ZaOqwITq$9a5(Mzg61(u<*Sx3o`E&Z-k2g9K0q2qS3 z3Q`6EblwND1fgT36J{At*U4ZBX$~FOKn42->l>{a-2@?n%FN((+Lzo0HDD)??hI^w zOm4k`d=RqFm&Spcj$y=^dSK9acgjQzAJiWgpMQ=KvpO}Zx=&z}*na-mN0>5of)VBq zKY9-zlNf*e@rU^E!}oda6I+Jv6CVHa(~t1Y7oXz$8DC=Nw_o9_&p#reo`Ib^Hsj!d z-OS5P#n1;-_GuJ3!d?3X3Hw+5goUqm z!`%M$FlT5zEFDq@%LZ1#ib0h~WGi9q;3`;80=j8%d8`{ol1DN6a z%Cy6(V#SD>SURdc7L96vr6keINL*JEYev^1(It6&t|m4;Ujv&-mW7QZ&f8zBj}?Q< zVzK4{53g;cd0n4sEUyk$_F>tSXF}8Bxc#JT=uqTtEEv@WD~B}1>cI`Lc2EPXA5b6b z`_{JAHa2La`IY9_@oG~nqs+6rmB8=a%HX$7!& z)Sx)tto}GMYCVPbY8A%h3J+sa$@}nel|mTw^9TDjHJH`Ome*E3Inc z!n`@n-|o2Mb`qfpm^ywErjayHok+4gMZT%0Bl(K!ol z5!BxQZ%DI~Cn?vpdq(>%K}~p>F{zpg6I&( zuDDJ~3``vnVR=E!il?N+aP5dteqWw?N!GYWY)jgCn9G){%YEWA2|(eXDcJAP9!0Y zEDPge(rj^FejU{*E7Ba>hj1^gG4H6uHi8(9GW*d-uhTeB{`Lq-vyEJOIRiaQU*T^9 z(`QARovxhDM3#*(zs@}4&O3q99nbfVg_Nl$k>39v(v7ass6TZVk)Q;!4dsd;BB3b@ zvbowpGTNMw=8!l$((Lkz`}N4~DfTygaZB6918U}5Y0!1150PX~m_58MoXs5~yVAEL ztG2%F{yk}?4C&=ZGI;*s@=TD4G+TPLLDvr52(W{XIcis_?ad(}&z|J5?y+o%jPfa; ze+>GNhr5470)GUgPm-f5Pk36L0pVtFTW=N7&B1UQ+exZob@&vcrE( zngy#sq*%W%sAKV@Ig`;+jHfa}!Yt>~9Yrz}Z;;a|cp98R9BIyoVJi*ryx!{domU1G zIev^3=N!Y7&#gI$^20s1lLwrUe{Mn};;**cxw|UOme)h?S`0C3~@by=p8b12)9lZbERDARG zXV|!5H4YuzW24LoS=OjCA;P>b!5)n|^O&6@Y2Kli|4627&E|dEv1{)(?A%MzY)i^* z!=HOLV9)RGk|;MK`K*rBgGkN>RKv~w65`}+E`1{sI`vQl8mq5b_0oF?_&4imQ%7}a@)$-@M?Q3A5fQMz9CliCs7_) z9~=6SoDQgsEyL?$-)pT{UTyr=wFG8$E{<867u>m`t*bnX|GJRk}Uty<|)j2<_Y}XsW`rBRT#qx--sWFw#Mg!TcOZJ`7xl6oNKhqg9znvk@_?m$u_rmdJ7j#xY;_$ev~Sx=VMg zPe_^_8Fg{rVn1P|D3Za+L0_W`o-BK$lV+QOz_e~|w!3jv2kJ+i91?vA{G<2^WBrY|MEX@WC>-J#n1=RUE(X_sb}ec zd8Nf>Pkz(ki4t9g}8(vXeA>grmn<Bw2bliX!;b4dGR@1efcFunyt=Orl8q7;1`_Emg zvF)2N_VJH(y~<+Uz)B>|m9b_h2{cLbiXo*)k}Hy=R>t~)B*z2FW7VKaSjFF~2A0K| z0i}6OX>1^I-Y~2x)(opba#+>QtsYp?ur9*-L8Y)^Xc=rAS)Qc18g{-`7kl1pgn3* 
zAcUMvH^J{6OJY|0qL@wc`4jhlB59u8xjg2PG>>my9cSgrj#F~wM9YWo z#IO<%VQ7iFF}%Y4c)s$(m{`3KKCe?2pVhB|;Z;lEf>ZRR#%#FptdlXmL1mKVO8C0T zlbF+`ELIGti8Un5BKHm&Ij)Fh11e!TZ#veGu8plPH^dfuA+V0!7%bs=KRrXE%-V`H zFXxSl<~9G^t_Ws6^Avt-TL`mS7sQWkpT=)JD&nOQx1s$5S75=^5h!unB`A9JdH8N% zJ1iVC1YOEM<)&zym=Cq;*2W7XpT|36$K%6^x}js+n!sR?!JrWY{IJ~#;c>s86?|L& zU*YmeEmvFL)?)$~1w#?!aWP21M3h;NbXbSZd_@TUDZ+aF9lX4utU^eZg>kP_*X*3a zPRvGv2phHV#n)r_p==xyHO6>Pntd8Cw@CAN<|WcRo&;tb^R~|2I#8xly;-)e zQ%syG_ur9bD<|+hAttxVD_u^)Kavi8KOw@tPh(!uIV;0#ljJLTHFYswDJtyLqbH%R z^XBp3ce>wO8p`NKPCZ;3$`X{r*E_2=Z3NfIvLnn>#!s+)Be^L=mvv))Ugyq7+-R`+ zRyV?{DL0x@#(htk(?*)jf0bXT%Rok}C++4(;oZ~~S8tE-X4MmHXK&`~k1IdfER#N5 zIS%p-)-`36Iz*&%BK0~U;V8yaYng_sy)zOeM`;XU->*k>(J-9jHy1Zkay3&XH!>+1&_-u(Zb_&2dakV6inw z<8Rc{Npr?)tjlXWp9OIaea6Gv)I)hH z<_%1Kor+-IR(Zl4#~jug&(&#TTDMr1^crCm+9yZ)SXkpJsiBB_zr_c5cSeKMz13M>%|SABpo065pMM zefzfBZ>_m(TM=bqFMsdbzYRwY?B-<$aB%l_ESUQnmM&g^BZu~rJg>#UgZoIh_h6^y zJ8RXA9b0g8`${Z-Z?MfvUQMFBVo(JV=t|tLfpx>GVf~2mSTkHxHL8#xR>E46>eT}( zkhJnRiSfGrrHGQ)!2Jy((nB;qmuYxxUH_sHir98dzam(p?O>)KRS`SK)WOcz8(?0K z(k!nQR*`V898nV+p07*7UX{eUIo@wm9k=JphBDV(fcbj)Zv+YRka}24(!73feXQ+Y zhlIH=0h(5b2)j{u|Ql$%5N=F+G&^STMa<{gRG6GkiUzTgfN6 zoxVf;8@!#3jB2E6ET%B#Yo4yor^@c)wMH@>aeg~`Ax!#;(gss}tZuRiW%*~Mnb$`M ziLKL7BBw^AlNX;*4wo?fY|KX$!?d>EtZmcxsUN0E!1|=X$(Kr+ z6S+McLE(N-@ANjEkYx|rBOuKQ(G@7S<;yzB_U0obUG@~0ZIZec(VR+q>fND~k!Ed0 zsFaWNpXRfUJGlC(ZBJ2NF=?h;%3tTDn@C5=walC6L&9vwtd!`deGFnd3uBH=)i8)K3zXfU`iFU@);n2U#t~;X zFI;|_+P=+xJeL(|Hs2sGl<$Wy!c17(;5O8Uu1O`$I+nnynp^b*6f0iIESOnA$bn%DflO?+E?80e9>ku1fi$OJ0k-HVj~EVi7?Bt1Xjk8W;%tGZX}<^FtQwxX*nt@H%X04*HeKL#=L27 zC8y3ebb(FFVY#_dZ(-bP>@Vo2|vxc0J3aN#*;qR^9%qe$T=(V}G& z3>etQre=Kc(#@}*I$Qb zjT+(L-rcss#$FQVqZ*Okv;^P1-WiiSm&BJnOJd1rlDLsouzEyotRGqv>xPuW>S6k- zou+P7B`Ra}z=}i#tQjEETn6j3)hP5}lIFqHNB}GG7>RNJVnk8H`hFzKeG8K~7saZ9 zB}l}}W9uumv2{#cEa=DlhSqkY)Fhf3pLdXU(ze6AS@b4{!r zQlI6KI1hBA&00lc*YmBhvTtqt+?quB*)sUGLpjXuR0cnHE=JN^jwHGimh`ELkK0tk zqG{un7~XdfKAJe09T3|O+o#re_4EIPFi84i 
z$3ocX{}+ULh2{PexD_5moE`jE4%11q#TW+e)Hrf%sB;bF`a75`=^cjetkMy5T!N2> ze#A#{&F%A?jl?E-rI#UiU5FTH#SojL7<7W_JdoQC6NwKW+I(GI}|6O_*=ku2TuKp!vwr^o)QsJL1edlqbvZw)g9@XV}QNC!XA@ z9Px@2IQhLSA<{{W`tKA(e&W-Gq&X{?o&tFL3F+r7J#FWG=AV#eufN2H+P|h2__!ma#T=*SkOWdz>fHI7&)1j7ykr}T35pR+Cz&K)CfOW9(kx;f z)-Tfwq_n;1FZQp@dN#e#cchQ-dY7L5$#TN--AyOYMX@yjW26XjM3y~lYM4#O;B_{_ z974xf|A;h)knTcTW(CEKR9h(yr`GKlq&aF3g#Rssk9aHtBCCQ0H@xmT7m;Rf=q#b2 z76|Gn|~4K!4nI>utzRLM2j_H&-tFFykBiMPj{o$5#BgXDfCtn~e`%d=^jNd@|}hbT*d0 zRv*iruZp#!>SFz{dRRBC0tqcioyf4}IcsW$k>>JldIrhznnC4BrZxIZay?K}Gb)ob zm%;i0r3@m_!rFetvATZ==204}hgT#iu4ymJE#!5pM9>G;hbO`nQbF}x+} z+<65aJT*H$Bxzmud~-*x2iId7644r zpE?%7&m^C}cPxbk-OJ(Ac2#igX(!stfcIZ|8EQZH0P5X+8_HaI2KpDd7hN8^2_?@f zfd3T8fp!mGk1u;P#=PNLeya(P${EoV4PC2|=Gh&JVosM5L^=G{r6PXqS`l-* zl*Yo2C0K_N7}w}=Ja9oCoSZGE5&t5$-GDOxxyS5#)BX42%}G9RsBpGa~L$USVt$zqg) zC(SkjXLU%8XI{TnFX{GJWyZP8!2VQnhrL_DU^QW8Pm{`{1v%%oXx*I9d`tky;>wz3_{E-~Ei6yFxoW#RRX zIJk58JpHPXy_8?;Z|jiKIb zv{}EkWgbDE;SJfba6HoN=XqyKRX|j!V<05W${-xp_DB|h*C?}loarX6C&o^o-ww{P zJoKBgNFHmXAw-0t0JBg;qj?8N5PE3jbpEd2ic417E(18o{N#D!;_ zWrX>JJo!kN-RGl6j~GrF{=)9L&-bN!>Tu%VafQo;T1?|wIT$Gvk3E`iX_F3Fs~Fyn6+AjU|+!_kz6xaQ#mSNtzI0|OM-*S8Z`e| zQ#w`-CdnRNnPt?(`WLhU2MPJ`8YJD-ZKQU2HxlgLmGM)LdMI>J0hGP00Ot3t%WE2w zG}osLby-Gztk8>qBbs5`^KCG{Yjw&`jF^m5(V=iuS1uf)~dul}!#F`-^bOl?#eZ`Ub;*J?b4ANn-G{2|S; zY)E60;aXTTqz+aLsfNWvM4H_wvnS13Do%vifn>Q~MJy&^UaS`!Nt!j%yr^G!EbLRx z<}|01W_t;+R|WjuS%kS1ekW=EouqkA=ZcuusSM_KEXMs3nAxQ~`j)&0HxC-v~&v!I6nnIK9{-dFD4UAkFcX zG0R?J)|PS10(TH|W3tE2J9Swr=BrLKQ<)Kb$LnF8kgJbJdi}$5_E=Z>+m^p+rw}&E zu5%_&s$8M|g|Guwe`7w!wAN9}2Vzi1I;MEe{A}qd`IE1&L4K6@Pz>`S^#$}_AtpO% zP{*z(mNqKJ{H(u8`oXYY{Cm_k_gb zSfts1aWDbTt6u4K(N=9M?YKYZ$Z#rQawGMUPjLG2=^UR3$ue&uA~@fX-UMDJ+eNVY z=>=1>hqH;>QWmqZbS68=Z*lvq#j>XOvm{Rb>U-60MFYjK1QV3 zeFi+0Fk5^wVl2Weu>NCM|A;hu;;feg!#w|ANV9#|h4q=1YY+;{{NeGPq<4AQ@d5Z($0RpsiDewoWne@<)_2%+t-&mVv0)*Ip%F zq2rFh+poNY_a;w3)7rIg&y6>tdEGi#IQKWSXx(466_-fxqHAyh5yv(b3~Yrl0cI&GwFWdAoJa^9jh1qf#1IW 
z)-d*^=kU}6_aSfg>^LD;F61Jq(r39(KkZEX=du6bmsvkEuY=gTa~t+!VGKS^EG7@9WfnKIj zW|HQ$S~X$_$uo(ihh8Go>KkhYSFn9e_gFoGi6O;|=F8);rQl(G$|i)}BoAyIFFA6l2ik8LX82a@G) znw7%$Ez0BDXUbwm>tgtc$9`#E48J^E9Pd6`9@n4iKBav|-n=+Ddp10H$vJqgYEiu3 zs3bmXQUYJLD2FdvmB-ILYhq6S8YIZI+!qiBR>8_4wJ3W%)|Z6YPtoAM<|m5?i}>o} zB^qJ2b(yJ~6-JtClSEfA((L9mSF$DNM40`{fWq9arQI@l-O6L0K5EjbBL3)D7W2FF zhJeI*R>v~EE4q?86u5$s~1%E>v-1S@MlF>ImieaSL zrzhN-(CiEd#RoTsEr|u`=97qg-`WBaVbKO)U4 zoAMHqM3^%@K4iUp$I0?6ErYh6v?ZScR{+o8aZk{K&MrWvvLijG!pFj__2AFNc{27N zYc$K*R_8Q@B2}(X_vTM&q&d6+jHmN?uzVw5My5zuNMKyv{u*6R-DLvp!tz7%<#d+@ zBg~F8yY+`DQy^Q2lqKiXFKIdiWpH+qTxq14>6FHO^-Sa`)ETc=-BaN78`dLThRjG! zljz0yj+4dfEYw?K6T2Dz2c$WP6OuEL(fJ9_lPDWuX20%M0ihmcKiV-)t8FYxf@6{9 z%rf-BImf@zry^e9?#u&zr<8_it zYm$pR%TJ)f>=!h9UO57}>>57L`1f*9|0A-H-;C*0RStLm(} zd(s@mR*O&OZ;X#dnzdzK(TKAn%*sdk$)^-UT+H@9a4cj+npGcr_d?))Cej>NEDMCB z*%M|@nmt)gA8)yi7352i_ zD_*A&0ULpd0<*i*i|*iF*Sr{qv9G>_=@Z7HUgb)-?dq%1rg0-|Ubh+p2lPRoK0U;l zvH!pxlH|kq^Up&@oHg3~*PjP*hy?h+{$1Fw<>RzeoPEh}2M+Jk9OivSnn8kVBwC}? zN4Wiy`Rw0^o$J?Q7qMo+0{rpQPx$iPchRzLU0ib38OT8bl{bg`Qrw9r7QkbVJ&s0= zo8XgAKgGdA2e6BTdH1euIJj#i=6ydAMeh3-3NUTW`)|gU7h7Q4a}}}a1(L((8)Dh$ zawL7Fv1+hZ!{By6X>LnlCHI#_q}Ko6^iH6)R% zM^?wG(KQXrM^wS`ku|Z9~4`_t- z!y2>vde}IsIW|B441VcQ5kHVf{@AtxzH3ny-!&(3Zc4)3q71%lRt!HjEsyW(7h;;? 
znB1let|MvAPO_|*=g!QZ2lch|T-{RmqD2XO)36Y}Z&VyJnv$S)tBA!TYGcW;s#rRR zq<>fqtQbg{HOf7h`+lT(V0D{bA@aMZcSRCoM~)ZvE03krjYgW+^X5RSXDrf4vqqvt zm;=)6z6e+qbGnN#mmzU3i$6M-!{QF*NSw>y7kwqLGts6PetxC|e(72fFV}e-H=J>z zEjOo;^Gh$f4C5F8zWU%peE$A4Or64jFoA)9K0Sp2+%;wsq}KdXoGuAA53Rq?zrRm=y7pkT|QX zn0r{PXDRJmixS-8|h6a%$_t)5NW2dL(*&=1EGfHAtjA7=SzhdDj>@q3aDDZUP|zdQ_J({ z?OFv|otr}AIT>USSBAZ%E`>|+dI9jYF_eeIdCaSr&Y)SPWC`4G*(K=N;aTk7x(N#x z%teb9jj?0LRubMLj_e*ehy%NJVEfv&*syRZ*8H&;s}?N9s)Z}C;t&2_ybK$cufXON z%dutoQfyef0PB`4!lspLv3=bp9N4-8*s~YARAWl_5ziO_CUq=7D8Nm@AMxk{sHLfSSUgCF!ae zRu8R;RV2!*NIZqrB#`9a2r0-5*XUqULm2FKohpFvT;IXKUhOkGIY>RH|kX1TE6fl#Z7s1*Qk4Hhr+8p6X zoeOv#KA@!|S$To75`JH!DS81$1kSw(r}I&)U_BjCi8?w9z+OHEGeSfy%eth&UuUZm zSPURMS!MaQiUnoS%Xu0#);8|6;(8=?cB}vRU{ATDH10{3ht8>EjOt_CsQj2Hr^3ZD z$}3nol06CD2}E$BPCI}nw&SQbTcefGzWYLJq-V+bI_1iud_5YDx0J=|;Pa3CEHy^! zx5^yLvh)#>3~ov2C7Wy`e-iOD(4KlJ$dggRI%Vb&`><^jSbr0PdC?ET`Z~LE8}&&k z&~eIP(?8M>`?kokMp?&;IC4t@?Y1OtgiFV3f_&XT4D}}&16f=rn-I^qJdp)mlL8$} zg}0p}zJ$r2$b4+LbZ*36*G2k^^+T{s=Pw@N^^#ll6giM?BBZ>)CAj*>`N!aKZSv#s)`4k(M@Y@2G|_!`;3^uzi^ddgB?l3R%4gJ4f+*xp@?Q4CGUvPM?@>KXn< z+`lmF>+7aQIgDjpgmEl){8(O3vh4U1V~@2zXHREi@87J8ZkXaHE(siI7Ff@eeQyiF zolEkRABB2K;Pe&QIEK?FRt&3zP0!cFrqNZgX^39-)7<59B*r4lWk{GE zj4%%@hvfs^oadE;s~R){>qxUB%EF4_g61w)$4ZiBEibovSWWv7$!e0y#U!8e^(pGE zC9$$=QOxaA1!b?j2#=qc9l!LdhYkH&VMX7XSkb>a)()zNHA5R=?Wh)*)wv?Rd8Qb? 
zCBAD_0zW)chJ?8!K5tSA-!>^r(p(td)Gde)YCVE?8WhHB4a(xqvro5|=q@<(G}I_~ zKRW&AHcYAV1b%3#=^f?p{WGPFh|lX+4U0&O7Y(V7r9-P>`QYlTTMgbAXg%rLL@g3% z66Zmzlf3|0#nx)p2Tqn~w7FM#Ea<6`Ww&;-i1eb~n!XW@G)JS%ZeFw2Zmx*MeamAG zb?{r45}4ho2!4L{N&M9IapDR5)aD7yYV#z1X{}UMae2@@qU+pM@?|g);NOK(2 zqM#oK|7fHj1&j$k01EzkjWla@3A-_7j4`O}NFFC2^+MYQWCT5g$_&d-5X$HEoK@%Jh>p`mSU(Yo@r3U@<4%-*BYD)} z>P>x(ZfAvbS?n60KlAavmP(qb3(7N&a)~qx@ra=OiJkPs@dC>=a9_s=OB3ap6~g{T z$V6}=9i*p=;m@bR`?sH#;Gh@ZCSb~V_X3;{w}Z5N-5h1fBN+lYA_5xP%H2Gut|6Rm z9Hc9as7jBnURF2yb|~{dL1?2yhY>7|ND9-1x{K&hj0?)lLwB3jc$}g{nr%5M!Utt0my*v(uMuM6k;jDTE78KN;;>G{v6ilh^x+=bj&qAAkH7hYsw;{NH9{ z$-FsO^~ZdyT1YHjfaQx8V#$I(uz22lES@t5i+}xv#P}!7`{5_d`Tl2ae<5imS^n*3 z{Px2)`2MpG@#^TIs8*p2F1z5om@MZ#F)yya?N(H+*BH-q>WwbF`eVYBso1&GeGp~m zwzb%?brrU6U1`|1Wfhh#oP&q%xd*4@&5f(_QWB3L)NCYJRlNgh%K8=tFV(=FDB zFh_HkjV$*sO_agX0cDBuB!m_1V;)+|*%4=wV}V3duup79q{I-&&a0t11?DD~tJEOXKUVm2v+WC!*Swr(@-SCRo?I0ao;@iq!)vVFSzAI;tVb zX>I(_x-`CTSq9%VD}^6hl*KoVHMOE7zT>fv>lML!wF=>bMkO)5c^OP;Qxz{ZuZ8=r zxET3GnomCkO-dC+=Ys#nxQhS5kF6`>*A7+i)3fC;yK6-(98?Pnht$9lVkvI|^c6&r zW{oTxaV9jA^!d7HM_?FEyZfqzC_`6L7I0J{AQ);s4|9z2mMZ zvMo@MWX_|`IHS(!IC(}V&p2bwilT_*tQZjiLCk>UoP$bI(ih1=LB&iE5KwZCCd^K} z_pZ0r-d){w&UZd_?tSn5v3`9{hpJt>c353qT^+79(k%aGT(KC#prt7$MqW&&B+x5? zB{FN%a%47-c zu(H)&_KWpjf_&1f&%aM+qAX_rhzNSprJN#OLLPCpq+7_A6U6z(p>}4rnWDmZer|>H zA_US92(s2C&Y*RQebm{{V_6032q;un(oZST&^8~mUlNTJJU{NwP*%JzMs9sXB4?~S z*Cp+acH!qm*HMP74UURLIYZnxUWeBc%M@%TMdI@Tsa?ySQeTRoGR%%?x6J#~r_%4CtHl`a@!QbG|g=4rvJ~8-{h5UE7z|eJq3d2eVXX55TvKPeTB-*bmPA1 zq&bO+3DUO2aU=hFA`oWtaUDx^XB;Xb&GI$bN3i=0F-gux@Zk*UONB|9m6K*o`6)K! 
z#;ySAGhatvPnxaGd89f2yi`xrg=CYROTM^Yz6z;4)T8w&+gwhc21CusT?lIiDpNFDDym zj!84mOXSS~6{68)$vj)@P)?eN2G?DSlaD$Kw>E2l1ADgN-~ax?)>(eB;VD-x{;;olnz}lCWVBM>)W5e>duw%^z zY+11aFU_8TzK=YN%g*~Vjyvkelq?_itrPLsq{g=aqQj&dag*-WxHub1O{m&=mcf zUxTLS{R+Q6<#_z!d#B=r!w3bu75FBJlG5ydbPn?*1w@|D{LBc z3yE_JY$ItFwg zz5OM?eA4V|J2&Rd33br3F_w3_8p}IgX(P*Tk}L}=Qqrsu=#Wcxyaf??N0$ zTc}>8D%#!C4l_%NFm3!en?p5IAMvq-NS_Fgh7*L~TF7VrH^}#MA@-k;^gNMdaU%pQ zgZw@(3B-vY-Iow%iwViKX~Ls_YKm1L%!MG?V%fhAfsp2cdW^_&mZY)~9-Zfpo;{G~ ztZS+yl~+$%Us_fS*7tuR%@u=x=m$@jE>}LJ+kDboNCrXEGt$1S0Jk}h?flRBZFEqN z*llWtY?ZL2OL;BXvK>N7nkh?2ypb*EBapQ~vMf->!ul-K9!JG?d%0O0$hQ?D z@%ey|yPY)a9QP@Q&JT7>JJbB{sS^_DH^KOqaf$?1j@zXWVn{wx*-cjInl~yb)P~%c z7IYl%&(UFuphq3g@6Uj&%uF{=+kE9PXE~0Mf@DoO4$s$)R3OaPz^GS20s|Ze-9ny{V*^~^G5&D=q$oOOC@45X}2w54i?b~Pc6IAiXA^|R_ z8%Sn6g$h>!%_q&_j!rL11mn&i673E4N1ZJDX@@C-$NdgT&~FN!2<%WJ9OeYPCN-$a zNHZPD){CYhYz{V+H-n0^@2O9pf?3?o;(u)=|1FIg+DP-I=l>DgHm}2f{nwZFscPWA zvG(m%xZ}=SutEScLCn7EHE((w>sGG7x~0pp>Xp~A;-#0de94Pg@%&<}TD%bJmMp@S zrLSVs@@05_)?ADoI0APxXpUc<{v*_?;>*SzcEkzz>A8Q#4b9u&o*sSi@W?Uf#bi2R z(PFIJyd58Y`XxU8;xp`jcNY%4w-xWcyOD%>D?UEB9bbI551)T_05i&_;3sE(566<2 zo_%;#JbBwy*f*vv_VlfXeI$QxcdLtyBy_unlIRYoPtr`nY@^BnBFj#gx7oVH%@Q;x zNHj^9w~)-ol{_A(s_=pKDAh)w>P$_i9bjek-<*xCgsNbilIv8{mz0b+P>Jx_I->>#^+4+IZo%EAev2oABZT z4Kb-rZM41kH#q;alW=OSYBujz(=tvt;s_j3qZ*E{Sp}^x{UaW`^3Qm(&iQz&TWjnb z+7_Gpx51WyT0gp_O&i&okY*9)lr-yOFNWsWJE$3NG@6*+_KP@^Fz@Qy06Y8C$M)X5 z$w)}^rrr%~ddB8n32D~cW)kKt{i2cP0V2|kvF4%cNt!RmN)ly5C;sk4^4#evtmtx` zk!DTLSkbKo#g6LVU9_&h@Do#(Fn6G^OFR-n81YJk}JRdLWs+*2*r{lnI{mj z)wYZ}7J0Wgqu7`P^+1|K0{j1uG#3i`JjarCCQnGHX<}tNL;FMdh3XfKFh~7S5t-e2 z%vPQ32OB9BaW1AjB?PxAai)C2{|lr!Aj=Cx1)Y2|1Y62Tb1rg7bEN0^sceoMF?2qW zJAX95$whA8>VGb|Dh$<~fdAvR6w;HH`?43)Odg(_v>~j&n=7~46X550Jqgl!eH+Tm zVnG3-BO5WBal3IJ=@)djZ*tVIi($DNh{vEEn`4-&QbufC4oQ&RMr()RT87O`J zbu8b!2fIG|J3jdILwxkfK9c2aIIw>+4(!{Ak3ZOf&kpXwjxBFt_KeBs*scw}ee&@( zMd9a1RzcsU7hwIUdr5RjU`d4cXvDK$6C--t2Wbsv5@yX`HnQ9>AmymIf2c%^L}ysXtO?yZNHIyXW2-SyD#=1b7< 
zoYV2MBWmJ!wy_4GkC|xsy+f*2L)EH>;Hbk7wU-~8U-(;ez3MzXQRf1@)8}^V9o3Gq z--a!NZo#JhEl8MK8)**HGE&k!u!T>{NJ#U(K_bkWI?@RHsH45q)gEG3U+RtId2^qJ zMw&PG*2pu7u|}J#aXcJ6s-vDPEaj22zdJXGi;<%?UY2--EWdNR|In4?HMGO`iIiO9OkEH?j|9jFb z#8Wkt9KMb;3x!5c(qjclGXt>Ikp(#+5ST!6M5ZNEe*O7KV@xFwk&I)PNU|d%&Hi;$ z#zVF_Bh5-&idU?MC?rdzq`5NEYO&0bk!H11iMWzrA0_QD5)~6>i>pyvEy+|?KN0ID zM%FIX!*i`4c`WFF{h?_YMwX|R5+crF#F?@yx#p2(J#r6a6@s_R$(@5XnEuY%T~IGy zgM8Z(@(HsL&dIqx)SIA!Jn=EfAJyf^2t+PvF6>h|h;_vOIqj)TKgyd0WUnZ5$g%vL zoL`~-2-!JK3^T%0(SDTk6$^r&**i}K&U2pNoFS0r=~Kp6AkE=sJU_k|VjUHNbo@+l zfr(=OMJ95dmmzB1cvQA@LM4YD+elsAyQadCk%TMBfUCXk4 z&bK3!6`yA@Lw?VG^p!LSBab|f@;PayjDa+Z=;n}S5oAYMk&fv%ZOc}b5mg@dt#j>Pd)dyi~zjd=nS@Y*{2fHl{ z>L@KkTZgZJ;=ij*)xyvun}Ixs&UNBkd9V)2Q;3sJXn;K;NP>2TJmVI0M``DK@J)ko zQcfXQT~uObC{zX!Mw$be_mO6T3J8QbCe1SfY35c_G=}x&wyc-L*X>Y~5m z>@&WDLu+Yml zvejR)<*9qIacFC77f6724QPyABz`*wG{p9S4M?2plcZ`|Ml!N&AYo2O^EQ!Y66OGr z<}Djprw#-v%(P&9AbNpsU=q*uT?!O+(yWEJ^ zA8Cxz2V3HY#~oq)dgV1&;K@-VFlqR3%%u;`WiZkR^K>QZsp zQHso;EBbpy6pjs69(}ZepBF$O(j3~MvO~0ggM>M-$_i# z2y;GZrYx4E9G82RY*ZDI{}LPX;Ftfiwr(%22?zqrMhmnXJr0^%I`#Bl!gpyrdnu z(7CSrP;an-LN?>aoiF4f*9HS(Unq3kMiLbpgor3>nt?5qm`9ika*D5m{6J*+8<6IJ zAfM}nknI=HpO2u5ENU*Vh;z6Z%KBC1Sh@09yD`wsV~FeA$E|!`?_$LAu9~Lm&DoEY1N_e@OZuMG|X7n#-(Ta_jZFbNxd78p`vjkgO{r&3bY} z(ky2XaaL!Bj?Ivp9JBwSGrdFQz)?kJ;Jm{jY0cgY72>->-YxLIFJ7(lDTk6;q_|9Y zme2C6o@ARtnklEgmR31wewrj%(=$Y#C)>!f5oan$q*;V{jtqq7-`2bd&OZHAoO;sn zsD1TiC@Y(WB}*3JnPo`RBj!<(FULZy$eze|_*i z{;_Wt4(?rrJv(2*t}U-($Ht}Du^4C9M>;o;aWn6n~ z8q%Buu_?9;ZecIIiM;J3anlD(c4&1B`+!N9lHnuFf{31fDq7o&FpFS{Xxg-fAtKHs z$C}rikmenFIZ!Xwk!)`#Y2Hi%xq(>Mqds;IxCNtc`YX<$ymihx3Gei5i4TZHErEx5#?%CMUrcg9M{AlY;V2uX8|Ls7G44ID>kk$JBHm(9gr*!B55{wThu0-P7+3( zNtSmdq*-4;+)KjjBh3x*?x02nC(ZRqnyEt)=52jQi2G=j3|}!rYdhPgw*}pNka%z7 zbz60lGT0+$D?^2TIEBXH_Ebn|BmOW4xOL;@_a<^uf)w3;r`Q78J zk1x9De2g777!!FjHkihB_xdmIRWN>ftWPAPv-`ciiqPv1}l&L zn)!LJ`wb)=cLBg7OzX>q5n($| zb%pv=UIP0a*^%AIB=()yL2M(jt@t^CDCZ-)KEVBe^3bk)pUx5g5Lm9+0JbEd5xFHGU#)K5RMyZwrLspggNLv(Mf_JyAZF45%;x2dX|1gltY=hH$*`% 
zu3wf(zEs)BxNIzr$N6OmIg1>yJc~EFp7Mw|hp8EEvyx3Vm%)f_>Yucs9MVi*Aj*q< zn}KykgxRi1NVAVL#}HZ0{3(Ytn|;zQ!wp)R98vh%9Gy8rZktMxUfvLEMRR)|pYKqzYC;5=9fa2{tNRMi59d8;CloN5*4^G3e{OzQ*3TjlZildIGiIYw| z65l%c7@T^>ck$zMe}xOKxe|BZ-vu)ky@1CC4acB~#hCWOO3Yie6>n_ciH!$8z`lQc ziI4GL_!R%b=l}gTeDTjuNSZ&uXCLpur|)jVhr8Bc@3z;med7ywXZ2!iS@#BdKl*@u z9q_1{N8r5Q{S}SsHNw4jwL{NtkK>u%4`B6(d$E~BcF*wUB#I=kLt7bn6KNK<572Z0 zz37)a${a}ZW{sW_HpM{e6c2RLyiLkQ6zZaj6=z@b!cE+k6w_;kG8`1X9KgX|+&Zb|O zeo2n2)zD2yRU^wb72^<}>sy7om;S3)tIlIJP_23m8;!o~r>9|P{fqHxClcPV?WqIJ zVb&;fD{N6aNc@d78~kNJUwW=ZI?}wG`r1n!?Hfp<9Fb-b=zaZpBcS=seH&v33G)u# z9Bl8)_V*)5mi1v2`#6d|#Vwr-BB>sz8 z0Qly}C(VMB#mb;q;DINGl}CRKa>mHtpPzsol1F9b1~|dL%0%C0kS%A@(mKxh?f5E) zC~HYJ!ANrg1=yG{3pTo-B>PZL659!W=^Vu22`-6346mS4nf;D51w)BC0=4q!D7$ z90>E*5oagdilx-Qw!to<`fUwuOT000&Y#!JYee8|8zHpmE0EfNB|z5qm898olq2hx za6JAkq_(BeGL53CT#?~qe|)andxmgMmiQBSf?g6C%W}YaR3GaQtS*&pHk4~)j;v49 zeohd}=5i$R=OH7_B6A{MK`+{7I!en{-sS&b1F>zy$m}GaEXT-fNpQj}2p4BYQANZ>U@xHdr3GQ9t3 zBk{EfxzBED9ZsAD+NqIdaw{Su&Dn9Ey20NOY4&Y4x}5rn>^>#UY)@*ViL8k&2e?gE zq8s)}`0!EOXPF-ovVHT-#6O4%R^LKmct`gC(jvScGApx=zyBrJYnPrG{3nXe)Ef;;mk=UyY5MR$$ZHZ(zgfSFmx- z8+dL`DbD%nkBv;7b;eof_DBzOd#o4H2R$DjfKfeq;H8n>v3X=$Y$GY&HLMvnk~nV{ zM3QZ~0Ey!O54(%t)q>%4*x3Bsp7tZs!m!OV@iQ14X~qYLu`1oCEn`X6fJ*nDt=6vhg|bVY#H$wmUV598Fw_nV^{qS7k~Ez zoKAAAb%{lmYiKmtP=lySQe4&d6V*wct5>guV~#!=CmnwjzH`FSIOiun#8F2cMG{@j zMx`$}^JI)}bP3ix)&l!SYidR|A3OXGusPZW-WX`anfq)N4eD#pK(=E*Lv9-&j5ce# zx8Kc1n$^yoy`41gB5B^$kGi041tZS-EH|O0{xm&fTmPG}xmR6m?pYUWAH5FCx?GD_ zNtj=Hq&}X1yeSs;Y>7ENTVYo3+c0ayqqyo%zqWq9OmWE(9uEaQ zj|o7rM}eE${JK*=Krg(ifpDEmbv>&=byhil9IFOl(_Ag2-|}d|r6k zJ)QOzliaW>f_}$!XY=q`mR_t)Nw{AtvYf6{PK5FTDz;b1Jtx2h6%}I{w}SpF^hFN* zxPfJ6ZHVCo|dZfObXRp@Y$eyh~3bx0gr;K4*ZfwJG8F7qB%lyX3 z^lE()p~}=r<7pI0;#G3fiOGfe4klDannPo3l-WqL431?)PNJk?M48(%Ve&*0fr&H< z3dSH;phH zSifO4-g##=)~#HL4J%jU)kV*v-JQ4N@MKkmYp=N;0|pGmLl5^r_wIwyz1v_6?Kc{e z`}D&4u@Bm(uJ;UXh#f;o6g97w1bFknme@G34GCOJTYK0@GvTCJqsva3wbk^3twxv~ zMwSU%dTy|mi6c?eC^HG>4ie0rtYc^2hDIj$kyyU-aBZx6xCwC!?*7%;IHy*1+;Ywj 
zaQ!)_>0Vk=*W=+>xG6*a5ZM72XS{h}7Cb6evO9zUciY8_Gq#~gVW&OGx6 zIQQJ&;@9W=4uAdQ?{URne}|5@Hb#&8+v7Xm`j+WLpPv5H87E>wtE;f7XKU;kew!2I zWVGBy%maxgv`(`Kvp#g`weTLZlO*v(kfy)owXZiSisZ%0}G zJ5kdAZp<9p1NE=E$l7$>#TQ`Wu>P1jb_nfqqLXIYxNJIn5ipR?6!eE2-}wrpK_$=M zV!|8{h*o-y0+0fc#RK=*3`tpe@`czC_6MRLC&4sH>3Rn7uOrR*Bh8k8LKzXlaV3o+ zUN@bDrHJ?{(ro3DF#Gzj{H=sS88kAbb}RmAPIVe*h_s#rsym%Z7QlEU(wr~j*C8Zb z*FVcG2x5486P7ml4i-| z|HHmmR(sKjIw~ieFy|v9&Z+IOuVnW@ekWI1|I2@J$XSkjK|d9tb6x&;uC0*ORTxh0 zoX}Q;^1HwMm%E5SMP#<@T!1p!bWkC_p8r)R+vC%x*tSR>KVNmSpJQG@NJoC0^76WA z#MA`(Z7=R7$KAFGi%D+hLM&$t_X&?>!9Kh%^gFLlBKTu|!RyHC7uX)jq_{-*O`7`4 z%MWD-&SEPHJ;OX;AW2R9(Z7_c>*MPZ`O>(G|njP+&$@TOxi#Mt> zE;m8w_YB$o{BtF9N}8vTG*49mQ!=w8ml=`f3Tsp)g#ts;Sx%M-D^v}kfpMFXX6;ku z>@0hf$qw^J2x_cWw+N6?NHMI_I$r|ox0Fa32w%?51B($}mJ=RXIM_jyO^M7%bOO~= zBBRy!;Hh|xXp2DWx7OYB`HF5>GRzseZW7DlR&$>3Z-0+H{y*usV@Zt5v3=8ey!GZ% zYaKR(0XNtQp@vjO|It;LRwtFU?9Dr|UrE!M4Bi#2ODW8K=F=+?DAF1h$h z62odZ_UPmA$U~1}@W8?7*}V_?_8NlTJqDp~k0E$w$S}Myx;r)wzYDubnD-8Dirpl% z`of<`^M*mU*voMC!e4(9IUdt=4b5-Xi)}WinZ$Rqre!N^&6x1rB~vC_H8kJ z{3t9aos5V0Tcp~iYw)^1eD6dQ-=j~3-$5B$W4lPck!BzH4IgdMR}PKv3)Imr{@+95 zyoV%u7j?CRdf7$NylX&1>}DJH^E^%2*xjcA(U8QMgxSDrNtkV(?0z*njulbQx-~JcK={*HESuPMpI_F23BH5J*}e%}gLF!x90@PY8&h&zPo!nF!?8;)~=|`&EX@ zViNV_nm}bLQAu}}oKy#YCpz#rleNQM`eSU+#}xcLKi7k*j)$12c;m1UdBq#aljzRl zipQ-$oU9TKfswK$v2neE~+!5;E*bi!$Uu*D8VX9KC$U-{sslWBN4OxS)NS z3(0a)B#Do8S%MX@NE0r9TOab6^#kiADe%uTvz_dRpp#(BF~YU^#3Nl!OzpOMiD;{1 z*?+Y=+DF(l3=w9+Mw)qpG1CaNatpUS$L8h>tRv*H;79%?TgjjFEBhhLeRWbMIVHF1 z4#?z-e_K23ct-LA(a+aMrmMtO5+8D+oQ%2}0&&(kBD8{^n+w@dV%v#bX^KWhasy$` zNOQ1D{~SAIw5=b@Y}Lz3M{x~)j<2CUnPfSG?NgoHrnWCXQW@Hp_?_vSF~q_D+)kSm zVifn8?>S@}+3#d)eUl*B=XI!T)e|9=L;dLR{!%$!mSUW97upDQRAz)(a31EQIrMP` zlZAEKh%B!)JLZ40GZANA8_2hlX2m>~C1R^77aD2S$TAU+G#hCyDyBV)ER%E-Wzz-s z+j9DGd67e-(*Buxhkncd&2D9z3DV#6^)kv5<{w*IOh4JyR{&Alc%6Ku5NXzL(}~J( zq&|`QgX_rpM!HJs@MG-fr2o7hw2$Wr^uH?4JUt#$CQrmP5=tY@Jkjs+GDi*_lnV+F 
zI@=1?bu=syW+Tnn9hN&_)UdQnmd|rUUj5OXBiw!1g3>%52(~^WY`A!cFt|@&j2$)#PmCCYiDRC^l+mLwYv|)xJ?24d8+MybGkBMzPao5T`HA-w#H8f3wq^=*dr1Mfyjy9T)7tdsGbYE_Kq54GI7M=N8tOX zeGljT{&%?k>MPLn=DKLzs6HBAcO6>XbTfK%c@XVyX@eo%9>;6X&BwE|O0jTOIc~qD zC5}4!P*khwqt?GU`B==nk2ea#NSbZRg^!SjPl7u{_Ju}9i1&&V4`_k|B)a=~e7AHG zVJAuU-oYf)yph=5o7b?NHa(+n6T(NCZ9X#*NVAc2mbHz9e3QKl$n%NaB;q>;Hl!|_ zV(pOTc(GqI%>dbCbzyY5|xW2))E%T1MHeC&VCh)P$zO4;SW%m z3A}u-BkOzDf$Tygc?RK60W#Pqv?anMj`P3f3mcR~4Lpxqt(iZQ=W38*CJFOQCf=FT zG$I<4JJ!c!SH!Z4SVn%X&~cH?^6<#{X_9Udg5ty$Ws~0SsGSkKPYJb+Z47G|hf%+5 zbh{A3xqfXpzihfdvP|+JlI-vo(CsC35`#=Tc`oL&}+N_YVEn{)z}sPzoU&&u^c_ zA|hUbQJ6LUq^9``bd8#ZHsk~3)x{VwBNslAjDiTyUs}SWCri4DQR}23kugr zk}FwG&IToSIdP*xVG6K;G+VHhAl6_tCTnE)tp@6&BY`x>0Z1rgun0*qo=TAh=TtV` zt>^xCupCoFAk0Al`EX;BBkNA2c*1y;jei3DAMA`@pLK?fg5GsUYpmb67O%Xv1g{b= zlRUrl(tJF(Xa?Tdumb=6-=E>%|Mdkv|NLX(bA0jn7x>%feSV&%&-u=ce%c zXAQKw>uxNZGZ)XznT?kgF2qaEK8sftEyjzBUf}QN@LI`4EF0Sm?+m>gdxqXhQrO%+ z3%yHonfsIIk>qYCwrGV6?$Zl@TlEFI1o}#!KJ>AZ#BaBzE9he;BzW70l1!8AZPy%T zLL;@d>{_3u*f!u+>=||sHuk>*BO6_Wzn*jizC~RfQ?)7%t5%J~SVY-JnMIPTasRQ$ z9FKE;`D@g+;{V8=TW1ncG=Gx0I!}m@*1&1D4(?*(qcJyJG(TU`B*qt_a+sSK2oHburgn18%tUf8e zZ*W7Kc5#4&R-}0!39?=U)X2QO+{mp+v%V6zo4PU5tP$pvG&iL_rB{}pya*Up;}A5` zymc^dN?5)VaEC^?^-1rg16yHX|JzVLpdCsE-iKLz@5R!7w_#$_3-J5Hs^IZf7i0F5 z4`V)W^cFBs%_*VXD|l%gO$!$8cWE&B2FUT(JY?r7*d*uYUz7N_V7|$H`F`jlV;NMQ zkE$?1SB7wXd|joa*}#1X zVYcaFnx2u6WhW6HCxc${>kIKXg~%i2$|Vnu{A<4o>R;r233~AC}Nkz6f%ejZh9J z5+cpKjwH;UF61D#wG3&WvV3n_CcBkyzqKud?A#dNj5G)0>^b%8NOR~5O&QY7L-37n z0+mx?pOKHypBd7T-aNSew9VIRrrQcgeLmQ`!()K1%OlNfv(;zz$Lmwu*YC{Mb9{n2 zjF*56$S2D&Y&sHU2>&H(xQVxy5rs*U3}L^{$=AtOA?P4nUkLeSnw`t<(wmFqH)SkD zn*HffZoLkbliRO_NwZJY@vnK3tZFov_UXi#=R~Br%t&)$!$z9v?+J0vUI5hfg02_B zf67{bD8~qd+1h2X)$KpD$(|CZZ{-i{bDiTkqjLCph3#C(beus0tHDkKKNNy*8)=@( zAnt@Y7$i?h1kOlv)~Miw6-S_0DG`XX9!2_j-oVgEOEeY?*(rGeHXlZuEwCssd!p9p zMmoYl*}^{i9l@q>ust>{!=T?8{8*w8LJNUUFQTDOcmgw@9E~XhdZXpFm*bdfRq)+Y zPeJLN5-eTyCZ2zB36{L{JYHHd8!tRxhQ*7@uy)O>_}dpB;J^O$IsWzKXZYu5ALHMj 
ze+2yF6a4oV2XOG+HF#(BT)gq(R4jdd3YIONhBuy@jyHJi(ii4q`#W!A!OS`M?KyuS z!K{WK{_tnG{L-s&^x?kBP~5%=whX=vn@MQX7Z^#1brYbu%0`w+l=ls4fCGaY;9cSXiS&Mw+&y+4 zuM=TT=6q`u-EKNaltr3%^=X8ig61@fH238-(Tjk#_A_rbM4CmuHNvbLi~t+uWe~Xi-2=g>12K}Z_#mQ46&jk~Z z0-d)hmO*k>B+LTqWnD2z6>-&9?R;KtERV~f^5QzP|8cO@z7S9=CbbM6%M856ZZrsc z0;lthkg86_L6#NkK(N4^1Z&lo)KPhT4u7PZToC7QO~mBj z^_r(i6zcy%b{Ff`a#}=&L-`r>KZf?nMiV6Gi8Pm!Fo*W#6KDG>t?XKP%i#LU{yVJQ zw0rB91nDR{kHu4CDub27wgo+g7-C4|6;wwGvxi(?as6qVuh&es6_9W9(G8=Ql~0;O zeY{rvZ3w(1MmWy8skcI;S%`^pWe|a$`ZNhLNvzH5xA;*Jp-r()z7FAB+c|Mo7-MYe=i|?wHqh7tc&&<<~FBuoD=(<$#XQd zDy)HQ#Bm(=KHJ(hNbLN{Q$Fs__psJT>L`Ky?K36aZpTPrS z=CpcYB0snqEVw9`I`n&{wk$Ir`J}m=PN^6A(#iA@>}5h3Vk-YLn94vmxtIn%@d?a& zYBXk#B&oamR$O}SkL^Puf4%TLJoo%_cKY#lG|Nd+@-rfEN)-5A3UNQ+UE*OoM7CeDh7L3PB^Cn@*yeU}q%CkJS z7?0fF1>ZXETR7wp63rTi;+Vsaz)8m)k6-@yr?~o(%h347dbs~NC%i<4;=$Kbp_U4W~u zxC%c!^K|_AcfUmAo9m!Olg4uK^cie%qPdgo5?(c|GPCd!$`+lvOn0jv` zY#ekO*7qh!)qHFbXAx$RRjsv5vb=9leUjvx@$R5{B+d0mp0zf!M&x;omd;BEvwiS| z*u{NDlzTPECCz=fKbfMjE1A=*kEQGxLgK8^W|pB3rD&8}bHQysIAJ5#16yI^;M?)) z;5$%0@Gg`MxeG7%u8%EUufp1UF2X&h*TRT;zrkzN`;w9gSWq?%vxpfbMNGgV%^sX` z0uT`g%rwYE0Xdu(t~1|D5RRn*CPI2%;-|qcvn3oABNQ)GkCLaza@bC1V$e1>c~(Xc zJT;TmF>T^6%rX3BT|^Wer+t1!(*FY=jB+LrVk>- zNXi<@^Q5TX+L!FNYb&B+eFBdK*&WMCA95Y+90^qibW*oYm(fiRaK3D?OVTdpum1UuHmlnbLP5(yV=@ zjHTb4G)Mg?PDY*S#y!ML_thdp$z&1c0N$`FzZh}mc?nt0NOR(=w3RT$Djz9f7R)z< z+!U>?a`Lyq;s0>8rfg0`$ACM$k#_Ma$>0IN&6h>yyi$wdl66ziN^-% zLDCUBQ%-P@5^{vef~uxM*c{=sgQ#2^q1IfLj5HeoX9sBgN_9jCu7(1gA?WVro04XK zdtCFNpUxne2oYveQjnj@n92b6bdelq9A-~?3Jb@L!K5A!qkV(x@TZ@gVN0O>{@3SX z^ysm8;rW+In4ib<&o08V^X6mW+&P##qYN{Pr{U#AbFl8s#aQu5F`k_{2=mK&V?p^K z%qtzj-y=wbp1_j%MR#4`#((iu@*ZufZG(7hQxbwnu@Nk{;(XHM^7~ZB1 zrgUk6(Rb9vBXusoo#&s62Iu|+SO4gA{O*+FaQb1@ZNBpHhaHZ(*VVzGfkQBE;xiaC zVInRdxmP=W|A)Wef%_ju{Tpw@Z+`w0+}f-GdUkynci(Xv+P1w1{re9hdG3kF9_faz z4?l_rI(0^ed)i{ukfB($>TWB%VkqP)G=%_PbV3zBB7o+09Fq1c0F)R6qHP-VfrK`L&}i^f4_tGb1NbQ!E9fUB_+vuQ2jprfj(Vm`TGnk zJ3p8TL7rSlhk-bId{Y?}&XbN<#y6c8S>gH=JQm_sQl^1AC{#|2P(~Pmtc)~Uxe2^~ 
zLu}7-f=2qYQD#rTO2)B#c3xsL;kOfJZ8IWi@jtO^hw?>ce?oYSYsp+lBtnC$a*vRy?a%tg;cggKWq`!o&jix??k zwlZCJLQ0xdK5ajqwpb`9288nBTxwuBp>JY@{tLueaGg03i4lCOAaaPGkWZTBpB^hz zPF61W%U3Fejt9bA%=0VuZ;&}(CM|(GmA*2CJ|32%GXkBEW)bQ9YXb7?i1m=lZRh$n zCdc^*$NZ&8-)GlX(3$J%KPSzJ&xTP`!s4d5Q6SCFl(3A_1pF=1Oq&cVVPrSBiA~cs zvcVF!`^;|=JK^zS`bYf4CirLmxiLbU)W)DoYfCuCa*fK(x5=-@e@2?QFVbnKC-p_5 zihX$}Sv^C>7!zh8cB1^?kewF-s^)-9;<03f3L3Tt0R@IIwIhQ&t#?j_0*-Ca`Ib5mGS0+8gYNff}e(1ud0#dL#iK!DkR4>t5w6VfAJGsbNQwC z{`bC%V-7!xq`8(&MW|8r5FA#kCXPPxFq`&Je%*t=KuZm6PUyO$ZTYP06UAk>(-wu~&1nHRsz%azJCl4v}VU`!>Rk-t~zFL`Ir- z^=XFPeSHOuJp)=|H?faIeK+geP13wiq(J|0C!^1$Kf$tbz3^iBGnhY%9t=R;*>?X6sI@n}yRRO4$iNQX|Ab5ZeJk+e*fLOO61x;BHy z-dHX}fpUa`Nr3R=>A%DN_+}vx6#dR(yr913b;zo~pb@ax4s*yWA-OVn99r$f`XkXp+E&&T5R=0Ux$VfHeaZejSv^;YWaN?N zJmPGmnf3bgCbcnx`Xeoixh^FAP`{C8LS=;S3}oLM5V9N?epg1Cm8&vcCgrPt6Syq| zJC*)|t%tAaIoS<19f)8Gaf=Aqw;E_iyusDdbA=!x=}RN}yH@$g9c3;iD!C~lf^0F; z(Pp+Sj7o=n@p)E;bP)M!u8kzNl|nko>aC!U7}P;-Tx8p%r~GqF7S@~SMx-*3=R%~} zWH6u2?MIPjkCXC|gfu53%>n)uKhG_(pM@J#O_xn$Fm)E!DSy##Jq1W_xbu22Yq^e3U)78xfRNBeTUxx@p*-jdwxFh&zD||G`kVfF!CCh8<4b*7xE&TZjp0t z2&N*<6M61LDpw=B|5wtia{@$|eZsj&vjqhr4G{VNX&CvW*6BA((fNPH!=3KKz&^e4=z|a7ic2rUZ+`V_{ONap#7QT73r8M)1ipRp z$vB1h{%NP-CqFzJzxlpw9Xohpn zJ_~oWXo?Yo`(psxBBI={S3i>OM!5W<3sL*>E6}BVd(0~@!>Tu&Ft2$1RlNElZ$#$K z#M~Lhm_MToFV35f*Is-MD_?&J?Qd_xGMh8lO~QSh+TqwEkH9I_4na|;rdZjl3Eq6H z9(HJYhDMM@nDz4Cpaw>o50E?`7(|df?;qTN1Xoisnv*0qwU;4d(k#Mkq*<$kq@=l_ zlV+{coRH?-{aRoz3G<%*Em>}B66U7Z!*cfZZ;ag}%G-uC!)8KTP0pW`1o?70X)Zh>91~^> z*nuQh0IyYjl|W@>?NxGN`I1{mVuSw=qvB2)1Dsq1gmR?A5aeUTx=re1O!6B8wafqK zw>4kJir{hTJ72cA9e!h?q|E)HU4*U`F^yLm$|cQNd4V(wxrEu}i{E#NVP)~WkUL_V zaj*|lgndb#p=`nC?8|wI=iG`QYH2!O!YY*KWrlhLCmFGD1c=V+^a$?Yhaob6NC|2DZ@y`{k@$rXy8apewx=8m0{kjSr|KV7+N)NiXWeKHm|9bkmbWrwd!H~U4y8O6Hhn= z7hQBI+TC+62K4HKp}qTJNZ)=WoP*G*{k^!5MEa78FG8~>jnSUxw7u(Y+|lMX+;M9g z+{taHdpqF%`#Pap*T*nv!c&-CR*Yxo%)sJ#Wq5AZbj%}ZFMVb#CXN}35d(Xp*CSof z?$%bQcgWjX;`(gO75xDd2JFQ(;TzV0DcYhRr{oSu|I*IwfUXNp7U)I;X 
zI|ldbkG@@d;MS(iQ17~HFr;@+yz#>GSVOYB^0ik;ieIs58A}!|#B=jzkvNy*rNs-e zcI66e-@Fc6-dT+yeR=>|L8ZQ7ILn(n=}AnSI2M;(@E4q3y$WVN)CwzmHo~gMZ-&0~ zmrt5CJ!2n<^M3x{H@Fe@=)){}@vnb#Z0$=@n@rCL#MwzR33J~DHa%l|ulgj*$w+fb znwyazw;(}oX?TC=tu`91HJ$hOYX~jNOK(A98Q|E|5=;9 zIca8s%PT)dT&5vODse1Ji9#XL9G@G?V64f4d_FS$hPw4QI!+d0e?V4lWpJ7EWy{fV z0l56sk@}78_Tod-3}P{@9%LMExN>2H9dA7$RL}Wvec?b+;{UO)j@r z*VX@Sq`CO%2?_L@m_)nO7yTx&d;@i$DP2aIwdS(&bUtAgST_4D5a-HnK4ZjjAPksQ zI8Q{7$3^DT_avx?6lq^cpYe#Oehco;DkE$w=p+T(k*A+&)J^_*LFO<$BgR*ezEsv= zKOugaU1a`|wLxtn5?@AR6u9u)*U6TC^WvplAM_`v-$D$OJoM2* zq**aR^(W9fbNOUAv^nTXD5#49h|kMMdS2v%RkC23*Ej`NZi55q10j9~LBY(C1YIkl ztm`wvY~X&8W*-IqdeU56~%3WMBK)~C^BDX56$Ph~wtBqtWICyhqYs6KdlP*+SCsEWo9bMP z6OTI*M;uliha6JXNaLZ09*&bv`W}Au%Rk`ai>}1wmtKR@zI%op)5kpY^}9o=ia1xd zQRY*>b2_fM`Z_$^WiPhODMp(kOFtK!@lSt1ty(qFrBi#fZraF}>HGPQ z&c=`d1JL8~$MN`M-O#nmgXna}-5AlQ51uC>-mq#J)-8JjZ!cepx88UaOP9QWR~J8v z7v|65@!5EN$s%lAw+efAZ^yg)w_)Gz&6YeTQKkt`jKH+16VS48ef;q7swjV`Iac>+ zg10p~JVdK!_`1@&^%-n?39tc4EQzf4XUk4(^s49cs!5u6i8SjIN+PL(TT6pU52dRfyy!jDno{d@MB+Vt$F{{Lrf3WdN zojgJ84m`5SGl!m1GkNUh7y_CyIdZdXj0 zt&TjR93!o-Kshl&tPApIfl=`+hM!ju)RljHfh0mV-4(!Nj4NS8%M-OHiClQ_&icX; zlb;N(k4oEJ2|}E=zNAc{ysszC73C6gNOROTreAJDUE#(`$OvQ%?N4k;36eU?20uia z9bZS99lW0BX=K^He3lUA3Z$8NN5olA89nZYBrW9GN!;$VO|iW4DJM|wFZuFiZ7tPH z@+m3x{;+;wouQ6UXYf5EDm*VBt3NBx!Ru0jl*t^*iQAtdl~Z~P<>t$m^r_jY^b-j4 z|AjOs#F_c0B57v7kT_3INwdJRXu}zNzxx!=PxSUx2r|YoL3|*NR{`NGn&uyo?j1p= z&SRV`i|Dxxgu24Ef=*I=bJ8p_pHG+tAMMTBlfmOkdLu}=Lwq*yoXr0ddrSIW$oDI^ zEw-I?g=saqvC)g(xtK!k5J@Tgs;tr|_>==8k-qFixb%i%Fr~ zp?`z^h4{Qe2;kdc|EnS?ZlH2}+}mN4*@k~I4Y0wEB56*QH^-YQNyDIR5>%Iy>$n?eT= zX`W1I^15XAMYKgM1ztD(DH`-}OrJCa)5i}-*`#rp$?~R-8%N@L3x59Nvv9)kN1|3u zA9*f>s#=CE!I4KEgCG9z$7tTXCHnR0hp7{v#!R+V^U=yCQg0I}=Tj3gb?g%iAmcG* z{20nL9#bccMbEAe;_?grg0sGR8qWFA&+*H%&&ChW_zr%2*7tGN55AAnPx~IuJmWNc z=fo3mRE-)qx<++;`?zEA^Rs`1%P+YMZEkIa?%f{4a1!2;Lxy0)z=0SuWGKdu9F3<& zjK&i~N8qX9B+C3hY2*l!cM^FLEMQQ1>*W`)>ea<~>-85% znqS4*rLSSdOE2QJMbF~pXBXhb1#_|d2<1TK!^hbCPC~Q~R(;eUfFZo}m$Dl2@&0VP8fh 
zS?=EqTS%5S8)k6VLZ(g+HE96P;T&#-e$1NSZa(XC`Kpm613XVP=UZyV&6i28cXg_$s8=7-8_= z70zSO%1N*}N!9~u7L>V(jD`srVC(fSK*-r^XG(u8yWO~m>ZjuRLU*zZ@X?GgC zlNcTJ5dGFM$!%X4OUnC3Bh8k4C8xG-Gbw3SStgJ7h3S|7LpiznoJ9LcNMzAOlos)t zm^2HF$@!#N(-0~n&0+pr%+&;Li#Tw%1C?bS%Ly+AC|EQBY+ERQypLH}7V+vtAv^{|*gh#1>fJ0qk-IYAh3 zOP8e$`mvZOG8g!a2yh-~))S_Q6xql!iL!2Rgr|k632BzyvV2|>o`l%VFZCwB_Si%F zGx#)-^2sGUCKM*kiW3=W&g|Qu`f}Ru$T_DHRJcy*k`+0)CbfvYBHNqU0bt^Q|>`T@~q?wMZyZ)kLb~4FwfRkok zWAir|^mx91Eh=}U?JGfrc;HpXFff`@*xB4pqeJSu^din-w7H0#QB2Zoi)XXLib)mL0*Qz?G`+u!1xbAE-N z{N%?t?R%%<`9bg7Y^DG+R4BpYr zpdM$@aLXo-=k{qVo-+f_mX~2v{{iUQsS8H*>x&+b+>b6D??KP*kD_;v?&$vLBY33C z{kZGaThX#n6I_4gwfNatKg01y9gibx)x;?$9*5JueG<<6?y30Y56{B6KRO#1{O-4S zplw?`NiscQ_;3>Dk(fAYB#HDeOdLH5qlXU0_%UNhm`C8RfBrKLuUQL+Reap)CsnP2F>S8Hra^76fuwiaU=rZugCa(n2Q|cATdTR5EeW?x>reOdX&GDk zHo;~>UpCZd#zmTU^w$V=GZO9Q*rVwbT0X8H$#P1X`Cr628Ew|wW=+q~Nb_5rufjX` zUx_XEUyjY4FTMVYx$F5}E_u0=s+F$Nop_;r#)3;*FvA z;6v{1d zJ%d(Gf~}BbJ-8ToGBCg}J~)ZWl7b;2%ON3zq+tG;1b-U=3T+{rm}#E}dy(-_et`dH zS=VR6MnxWu$c`2yG-%(wrWT<@EN4 zK8gFl+p6nSm+LL)P+L#Po|H?A1pU@I83Jk6JYT&mr(~-+&a%buiE4+FW-aX|XdPMY z_m{<;EYBpg)pB+*VYE;BCQzRR`!~YDwxxY0=s(-)gGFj%Y+G?ZQl}AVcKeA5GuxZk zl+FpdcCdVwn~ofs42c}lBkMOE68>VA%3_)JlvjPq|0+l4@*0zcu({!EqmyLjV#}%0 zfwtk8b-HfQbjxyG{N#9t}5`Z17Y1NS*;CKA7MyA|YF{YI20O?RZEnSCceQXRA-Bh4&BFY{{IV4J^hy5;YbG}AwmQRXru%_7Xw zZNl|u^9VaxJ8PRXtl9TY&&R}QG7ELJAfTQ;*dv!p@G+Tp1djJAUSn(+H8|VwS5>)YB}sMZ|hGWmr&Bg6GO-U>*ro8H483iBDqcq;YnGqZ=XJ z#Asf*UJ{vCRD@|`pFr;iAHvlaU5sCxeKt-$_E;mf)vH#urR`4t&Uf(W{r6+)xbb*; z%oCXWXYB+>swPua#IP!?Yal#45;kUmz z7uBmBf?C!6Q`x^c=NA~@RtCE}ox16R$o$AIn}@gypX-#_LO-#Vao? 
z#F9nx@cgrLuyWZe*tdHNKK+P9`QUzh`oVsD@xi-Bm_H)^_34LrW#Iz0r3?Bz@(7wY zY=9%GSH2U=mtlyXHQVH17(d%?6&QrRVyyE^hTQ;Cn-EG19!g6G=0P z^2UxAV{Q8ju=3vXv8CsYB*}USkVJcUYwQ@_65EHhz)t?&D&pLa^$ocdZ;rYTSDtew zjy>!c4CprubLY-PaT)!pY$|4!%w*un$*fvR8ar}@B zUmNTagcUdn9gE>*6>e|bt~6*`Oc6q`_kgdd04cc%UB>hn>&&1yq$K9;En!>^Umc7Q;=5qvR{f;p67_vPf}RDFu|l6FU=pdYFKJWU z2Zg2>xGiZbUHiTi>;v7@#qH1SAGbrz8}^SaBw#MF7sUyY<^Utl(~51N!#~R42;~*h zfe@Y!IB9eE(~t}m`kvQiZBe^KZnce(wKdj@4Mtg=s@wBSKpDwMc*@{Qr~2Sjg-75K zlvnhEpnnG;(YduG6(J>)(J`x=ZOyeq-=7{&;ku0HELQ;Alf|Ay-)xWGjf%fW z?4bS<9wWGZTpvNcO2Hg$gOSOAWV(`BXrrB%*qD)I)Zhe+VOskVw;}C zJy}=Z2sd5t1oDyel z(~)NHybxS5r>@`45E+NmS@Jo*UQ-FZ8XIkX1uy6iVtKj3a`)bxtM&9Gw-3A3g`3?bPa(gYiO z*TeQf&9HS~b8I0=-PWI^xmROs>Cph2d+Y0mny!%%W_@~l@6hJhJ%nZHg}(kVX%=Dj zdC)uhk_ZzTF&1go^o(~pU4gBgufV2`ml$b&v)x~?@v+(@;Z3oNW$qZpGD(_+og+w~ zN8FAbgIi+DpypUJwi6z0c>@lm4r^b1J!a3Ig_2UDsU2neED4&vL*1?yM<$bEs?L`!+VqV|NOPE)k-?AW+qfm>H0Ear zdePEkDQQlKjKk#$?R42P|L`_fPMU*G1SNbC9jalk?O-1zz7YFpKpb~8B5R8YL^(Ru6OX6M8K(Z3^;3>M3cv;>zk)Oiv8{z^5ygb> z%gVC4*!SOrH0Q)QL&nAIe-is*n0==DlTE7M9btqy17%C}5Zk(ejS3;W2UfJd}P9!MQ7x6xCTlk;5KQ2dYs4`{ZL`)@do+dKPPI6=fc}j$^ zPsr)m$XGSaz=(5_{BjIXkYQ5kv7rUhtQYV!(i}*$<~yfJn#u}8{oyg)*TEhGq{hH{ z1nv`pQXDc)wS$Ii0ZAP|C7QuGj8GF=6n!$=JcZ|LJ?F4Ex*n06QrX60IezlZdV6%v z{XswWn2l{njfnqa(yUQ~QU;7t8lFbo!eb&isU-ro%a$2pkQeL)8IAUkn20EgJZr>x zGJlJtSWsgBma@;Z{N5}QdzG_f;X(`}aXzY6Eu3`Rap>Q(r|E2(NHU>$W?EK8O9SaK z@Z|C1FuS}Ay?Z>46Hhqa){L%JwJMG|@<^O@#`jU9dNmw*_+j|{Z+?Tu^>4<|f&DR$ zBzaDG3Fgiy#oTgz=%pOb%@%4;5a%6QvsqsTB(&t*rmi>P?ao(XOXth6q2onZ*ZxARxaY5U>;B6~yc=Ne z@aB|Dq`3uljckP-BU@wZ$lI`GSZiz^(hBQFw8e~G?eNo+kH*QzpMcS$#$fL3xtLi} z#v8Lz_II(_v;}C{F>TU8r&h8~U@v;f2NLA_hVWIS*&pE&6>eS=e+>4Uz=A4mw-7vG z2EsgngjuAS{xFfpV?t+yPd9=U<3=2~LV){E5&OLUhw@^bq(~BnlZGTF@S2LGnK8x4 zgqI)JWm6}Tsc@3n?08RFT9w3eIUZTV89%kxR}E+#jyZz;+0%NAT%QY#-ZT;BOH& zpL#?+u`g#(w`qQ+ZY-win8l@_59-16ldFqV9~D5E6%U!Wr%h%|)b9jwyNalX(EmQb zz-vucu8&Zj)k8Tm=)dC|lVbe`A7Z=xMoC+@fk2eAl{5smNw>pbgKm43 zk!JUu41s&aq{C_VLKB${=} 
zP<4I&yrB>%g?!TN>sAUpPS_{R*=8foN)XhGk!1tRlf1lppQf)nK0V&Zv$p;wE`t(} z+cHFyPYFYPFuS+}V}}pNNyi?GGr#ven|>jE6(z}4^(ml9HqC+5yg@x}m__2;xKRTe zO|DhFhJ9>BQ#|zl>MO6n3(qaY;)M$^X9h`L@iZgNb4Zvq+B~nU7z@fvv0$c1^Gxn5 z$Fp;1Vbj}d@ac!|;)8;S9;t&Z4_<}0JCHPYycjF*y%5XWUx3X$ZXjva^bD78=SZGEswFm!z7-orkZ=#X z1#b{E#{K(s@b_L=Z=!Dpb(iSb$rwkB6VLI*6NHe$7Nt%mj z-|=xng&30{DYLSEi1nC{Ad3@bZ%dVDCQQWSi6q4Ah!|68urVgvHtA^HCB>D6;7C~F z`-Y@BHa7df2+4AQC2KP?NPhz(Frkm3U$TvpCO*se{KbRj#Hz~2I|IcD!4TxWt~P!tOI3qA%L*HD3Iz7 z)*O@N`LbJXK+;ug0QsYRHf2R^k*sn$l4XrFPbGn#D$=a|Jnn%{Z?Gy*P9cFU$HXU| zzzIhli68&q2PtXR-LnNw)};|`jqcAVnP%n8EG@Bqy7bbEe2T^))o^IF>NteIk2&f{ z^zGRL^JmXuo2O&mtQnX^lC1TjMVv*NwbrnGu!LlJHp^N#do~I4T73S|`}p|%1H^t} zAKu%$9ea0e#s}~1!e<}9>r*v85t-hDe|+*Gk8Q``0lhHciIL{W-#_hC{OGtEnDbBz zY#DYdNofo08qtCTxe2z8XpZ&6THxibH)3Vqrr0v%R=m@rK2~?R8XHKOKOAxk_V#Or zRh_OPdA`}c4!CoW)>I}57I7A7?k~a|k!F1*P>6_g_^kGJEl1a@A$E7Ki?tmt!I};i zV-?BsvUY#P+mBp}-GiE9H%a|=lIHCq^ptPo$X3`mstq;{y#<>Fw8nCD%EC8_WA(5I*sgKQHXb%Gao4*J=-$zDrQOIWkVb)Dg zjQ=fZb{|TTSrRv#H1j+{3DaT&ZQq`*P|g@S7J{C&I|*np!ZEd}&~G~@sVmfH@k5}! 
zJNTPEDA)^C5yDG01xYhwn2!o(a-?z*sxP!P2izaXvirN@gc4Tvn~0kY%oD}1X+U3WZ-_q zNtUZ~3zKGzDrY0ip^vlU1mkkJ&SIaS+z`nZ+8gU8ANeubprjt@C$#J9pmTHdM;{NE z#&*W=d7}a0Jj$e-6ph{|FHPk@dNzM|o63lblV-iR@1!~Ili*8P&dw#xu7}h%1j-S& zKR|gZgZ|6sGrzPYDwE~sD)#$Vfcd&I(j4G&27k-?FP1&@Um6!_8?i17$uZLj^(cgX zh>2eeuSZ)Sjdgr;(kz6yQIxhNSN_8G!0XtbDQOPv3NTv@@g_U35Uec1V=}Kzph1H; zzBr$n$g(rLH$8?rX)~GK6(vYWb7D7Pnn@tUPL>7g#7MLDXR_IoA?hIXUx3M)^ojJ8 ztJ@6u`V6pgOn&csomJpdrh%Cn=&7t7bDH118l8)yl3PGWzgx`rq z4k%+|0Lv%M{%F9JZeShuqMUVFlJt13Q0bJ7Jko5Bn;9Hcp3CVn2hvq3 z5WGa1r!eswQRa09{x-tQvJ}KL7ur6SLc@6?uG+#&IumcCj+EhWt#TFvXwCAl$2u3@ZmV&sH5<+AO6rjV?DFD znB}-WXHrj^_94=&x!5yHi%?!X9oJlOg%Rc>4y}dihaBSbmrpzagZlNs!nw0iUNW7; zc_x3)!s3PVuz0~-JkRZd**+aZU;3Ljvm7r!|2(#B+DMZ8E)KrCFU9-&c4GgoE%@}{ zethxCdp6Si@kb=gB+Z|ExEFsr_yOK~cMoQiPsQsmEyk-azKCD^{AW0pa*S$o9kz_T z6MINPcan_m9NCPdxf#}y{67E44Org0DK?Olu6no**4%$J)<0Am?+>k=2y9PAGo`f{-3}iWmNVC>>*8FCY@9mo7+@~qt9ncgTS^irc zFT?7Nmtw`e7hrYg%dx#reeBbG=Apc1IOQX8-a4{1$d8m%)@Yi9(~yA%1+- zNS|Q7QXpuC{dOK>kB!(Q<=Gr%p5s$kv}N52jGQ`I-mrnBfTrnr7?+HJLdi;2BiXWYUC@IDBaHkfaQSte+l|L~`J^M*R3QX? 
zW?+2@Y>FVuE=-y|9;BntDvy4eLzo5WSGiR-Wz#YOs4te|dQ$)JT!Y#YbfT>wdz0-6 z`cJv@^cCAi+`fu0FLIl3h15P1+pXB|Uj?^4d%9}5Pcf#D?KlC=?axg9V4or18rhQ` z)o-Rd^>Jn^=7ZFkO_ejfsGI`&Fu0E!D1%Lj{l=u3a+04F6N)7ol+*E?cBp)_1%dw) zglP^wTFUmsu>F+VK)oBn{}?QrP@DB$RH=b_AUwXP@7UMWdmzo>iM_o9Y<}X`J8AaO z<(M>^ULv^zad!T~_7a+!;iQ>ul8+?_Wn1}Hhr@D6qJQZqSGPWRBZU94J}st7cG_)- zO$lk9kdS6g%?PB~w}CKc(0`3K=T?w92N4o4v3S5V4chkG}4*CJ5L>&TfUKU zDT3u{8-ie^FFk4WASTWHA4s$2E2X2&>|k}a5#}(`OjzJ&z0-Kk6#kx~Ew7nM+5F=u zf>zy--1-!^@1GWoERnAkCVp;g63z z0Esx;2sw>5$gTitt1GDOiN3=043Xwv{p2UspR-CzNS2Fi-m~Th&MpyQnqs5HD&vw1 zFR&>CHLKS|^=j2{((%XQuYdYIrc4}%y*syI?Wz@6_S#E$<;BIsA}oDn305zE9c$iv z1KT#P!`>ZRh|SouV=LaBpM2nRnYGp2=7S&c+=ILE zDgS@*;rsaD!+m)9m1pthvL&qJWn6sG#i&KO?!WHOSU>D864I77Z+Q2}CfH8Wyk=ly zEPCiVyzzJwyw$TAmUpg=HTPYOw;#F&9}H`akB8rejgQpDvi4Wvoo)@VzHbw3B5~f_ zzcDuTu7_R8I?Wkb&VKLYxjVFEUGK&O>m%viHLxZ24{3$%y&B@JPFG@8`^&Jh{iS%b z{l(bS?FQ@}LNZQ5uF>YL8g(X#-b9kVeaLOR?sjbI(Fm)D-hrOCT#duouG?GQhWYas zVG8?0tGmb+)SpU{BFs~1KhtP8dO5DKcfwHq16KCdrow`4Zb%rX-)TO=fL&}0fR(PE#&dU zS-0PyE9Jf_&rBRo!mN4CKDXIk3M7G!3rRuajvwU(K~m=kPGFE?6|l8U6q+EAX2A@J z#wlev5!J|<8-R6s0(;~{L(rofdzp}sLJE;)AEkC=gdii#Mw$}^`-n+WkAh)H;@v$vuzkyV?BBBu zAMW3c4-f3d{$1Nhl;6f%E0*Hz)ywe3$M51xjU1Ese)Rr+p1T9@k|^)rt?f=6e0MK) zZQY0iySL+0lH!B!?#34%y=PN1KKyVuK03GypT18rO``qj=Lhln8;kM!%OuXPzm6Mj ztY@+{{KMH;HSkW7&(_#CvK4j@YeeGQ6l?m`$D%IR;JDswydhq@=Q6zcU|kaH##qy*3D))^f$n)Tc9A^qioOuIqhEuRIPdK1Bh4Fn z@LZO$y}v$M(i*!d!|uVYvF_nIB+HlKt&UgV&32by?fq9{d!L5bJFFGSeG6!R>7%+_zyz|JIcG7R>E&8-Dez!%*k)OEGKKEKDn*eI@;9q?!G!Cm?||>#Ln5 z(-Tb7<1xL=N#aA>Hqd_Mlld>>hI#*5<2JRc6i%4qQRaNoETle^ASO*BWghrFri3|f z*-OJo0+ny%A;b?0c1ghVGCT$a1YaqH^gP8K26&4*8X;l%Ar6FT0wKBQx4}lPn5>E> z@>oK8o%kxbdwhryPhrbPK50&2^6(h@I89O}+czN1ysnJ@0|Clz(p`sfLi49+(xvLB z?5x|`pyNEx5@OOX4qltsN(RL!f$~=9b0y1wLZmrsPgv799~tSCjFMHzbSTms$__cl z2ry%Aem)5BW4UBGAiI&w>MaE6OLZ0^Ai;K&2eb2YZ8ZI#>MDb^#kToy{e?0M6IUS| zcUf{|BVjJiA8@kq0#D)~rr(w2hyC{%ZRJb~mjxgDu)Y{ZuTlabr-u8m%Pw;MY8 zgx!pC8~1l*$Vg8vX$~v&16B6nzW3fhKVI+MH}Bh-%g9NMyc&2Cot 
zU*%X^*%s;1^p)u>^LH)RCQk9qNONU#nq%02CjC|0Ks3J@Y1VOVDJShB+=Nvm&Vt*9 z5AJcxwq~1vr~zrvP7sr3 z1JBWiK9u;UvMiARyHhg7XB`pd2ih903BH~*hmo{+L`}h($4$|hLS+PTeGWGczpGK_ zxIqP@noigYjQNm($w0WZI_*(}5*c;E&bLuvBg_>^v+my9sMtwrC$9{G2=9y+hT5oz53G>{NQp_#& z7x=XDMV)J}Hd1<6Eq(sG3jT85A4zm~tYL+C>#2YJqi#C!X9rqKWGJm-y<7n6W*BWd1m`0#*U{^KOy1Djv1Nc>RBVIwecE8v z;10O`!e8TpU;flq8?j{>X|oD$3L<)3C107UC5uRyr%$IX6^S%Y;JM>z+Y{*{S~Vpi zK#5J+4K3|aK|dwUDM5~*8#LN%M3@VaW(%YWXc1CE!20CpPSWH{IJlNZAiVN95X|CBU) z5}+QF{*6gdnDEV!PZo@}a(w4Ac=3 zmzjg8&3;qD-(NwRXD~PEg+P(!nY1V6?pSZ3T$fwF-R>N@ZA&2?pM(Bcfi%m;vvXX2_c!@yM#d6Qj_^D4LG^1UuMl3$cPOVP zP`^lL$$1W5Z#v9$mM+ysd;aDl&6LTWQc-Tk4d18ZSsVR*GcV&mAzwDP#Efu z?I$M74&NW*N@lY`Z`zVHPn|e{WSO1DJ2fNA>;yq?w(HKXG=Y9wg;bOfgfPbn1ru2y ziiD=}zM3@C0C=sFW*<2-(i{aztBX)sF=-Cd4HP6p@Qq2cmQg96R-7P^W?#tL-+-0E z6NO5Sq`t8Ags-|;7L7@dw2W-%dXbm}BFgCpTKL=ENKKN1mh}nqc3dZtn|c*#o;sfO zXQWvjrBT2@nst6kn28xQ7!l-@G$-W7{_~jDaGt^PwZ^lM>Yu+gmofB(UI=_@)F_;G z$|?BnNhjG$f%9k0v{B~yrR7GPwX#LWc6X!ZA%_svP`zq3ocyhC*_`u#{_S%T-0k@E zpa|`5!@>7&vQtq-zTyD@V#9)uy-pycy}ju zZQp^&;TLbu01qs%7|`*B$(5kG%x=`Mdk@*?|N2;NSr)dY-bDJ%eXw z&c=NmAH-ob55t*?PS<-l-0jK2R4+Nt&0FM6c-881HCFxdAPl9BXBb{*AD^A3?$_ z;;enU2R6rRB*V|&bs;wNY=SL=Zowv!=3OMsI|tr^wf83^z2h}ToZs$J+op8v8bZ=M zn521VBWxMm1RDlw`MLIZ?w;Cstz%v8?}YJp*2N8f{wYc(J&6UyGiZOZE!r&mS}#V( zXQt71M3{?;M4Cy8M4G3MqdkqMkHn;zKAPBoKLzqY7<{5K(wq@x4;uQN1rq+|z7PXK zuo3)2JMv-qc?hiXwP5E?2GU2hNxX;=Os~39~;&_I8E1m;=cyB=TAkC;U^vEJLH-l*QMMcbP45 zS+aIme5I^u(n<-lUYv>c^W1pU&~82ygV{FeHkUL{V2qbclsiqzY(MuYHU!e_L{wYJ z707y!L&-`q=k~cn`U{E2k1N@R_*{rI$4>$6oScLn>QG<04aI~rlQpv;OO(=O(*KHo z={XT>4PGL)IiIuXc2^OUM|CEC;C`VkuSsQ2`qApvHt5P_N$r*Ohc=|3?Rk5+Bh4&dTemw$kU2wWQ-HTC z2Zh>3WCT=5`hog4USR&k-}>MYNwfY}=I=4=hEM*<8#PVK5NXys6Y;g#^=gaztC2AF zm9}Xd$b*wjmsJqQiEX7^eL?@#Ryx$aaIS;;HXR0Ci7=Bd@O+b910I56OhAs@?w=ZI zmR=M)(x+Q$cY0n%n6->uK4Jdfkmd~fpUBVtN=dWy!}g}_vh!l-SYi(bKUW`T@o8iU zX?|Lyx!81KUm4^%fu|XgL&eSFz6p>oq#Q4`8P{7N%;CS~EA~NV zuR&+pI;v=u1vj#6-9)_*r>)lZjbWA1*v#PgJ1E1IE`v%4v%$%p2`%Sl2M6SnW=+MI 
z#_I!No=TzZZb^YLsl)0aM3hxt*ak8gNOKNh=KpdA<{49q6Ob5)z#3^5sii?McmxMA z$4T1f$mA4Jw!3i#899lO=Rm*%S&pE6cARzivOa#DbtLP_NHg`Gl4f<3f+NpKf}+k3 zZ4~lJvyhTz?dS2C)P+4}Q#nK+%K_>G&GDNsb}Y_5;|v^GvnGc2?~nO2NkmIZFt4l( z3un#3Gvl7Z8Q(n>)v8p%;XKDjng4{Xo7UrRpB=AGd>_OKd={{zyBWglUUDRP{KYQkA>wk(XY=S9CysI_)d)~=vDi-SpH~z8xj6+ zcw@XftP%DMZ%(p&6_(t0osrRH_lh)Mi?_R8hlAtW;Db@OV*lXQSovUGy!znHc%xTK zEbHGA%X>G*27S0>P^)C5S;RRZ%>g9Mdj|5{&evexEq}u6ovy`Z67bD~+hEJk+p%lJ zU0CzT&3Kcfd3~qrv9|p+HtM{&`_0%nxCOQiYKqN6>SN2WX4o+J7Hl2W9xpxE2oL@8 z1gv=U7R>L`0-ft#fGHyf;MwAGyYUP5T|{S2-zc6=|1{D}JJN`=k>&}sBa!AL;nJq9 z|LME_D3RqT&}i*vmRpE4iwp!}U_^mi?F;^H`xS7L7!W5&!A1$qheR$tXjw2KvJtee zC(R+K<->zu79(=v1^pm~0`PQV8h=k^+ie;ZWp_k3AcT$7D1k8!`TR&9iHN6X#H2Y+ zv=&Q33{lKXWOdR^y-cFs?b;-XP^MVkm^Ax-KgZ$ZwaAiLw%>@SVvXXP;$9Lj6pN)> z@BfT6>m|&XG;35(B*kL1?8EvbKz*#3S)A4NCeUsj*BEwdrnORxdA56Z~j;RYfgBg_?TFW6l^?6~wPUr0!^gZopN zGn+FK8+=VNNtQ&WFoJGvN%a`%o%zAQbHdH94HP|1{SbxZ6N3EO+O!K1WFyV2UwTyh zpw88Yc9UhfiMFEi6Qs77r^ig6I+h);0O~08QAK1oRbkqg=ERy$kt_#@K*xk8_Ngc>eUCzo#Gj*mXalFR$BmHSV z&(qC(4s=~^e<@aG?_LCui-nQq9Ps9Xb!Vf^4x1*T`Xae(ii{!d8;A8tV#~n}a&_!T zZG<`uwkqAYPrJ_xRRYsCM8w$;f~dh8oF=|V5fUgTASQuBzw?DH1kDwm!Vb_2b^)5( ztcl-hKtPz&ta(D|s4tU5fF+M45M*CDQs78H2Z_nNWDD}S^0|%bELL6Cb_F3_qfXI? 
zv5y)n*r+b6KOxV0*-k-8fj_*2;`78}urt}dEGa2KGALS*R2v!aXf$4*kX@_t!q)-q z3xP(Qok$00p0k#s^ARGpF9{4Dz}ZIjt48S6=fC;IFYM*M2k*NN&&{2S+4>xKsYvWX z+}66a9Xs;SS~#@&Avpf%Bhb0yJ^1LoJ^1@)AK>$k4&Wc3AH=8c@3j%)_xJ69R=yDN z)$|I@WB#1PTEFe3z7O`=_TalD!7M}9=<9%s=g-96Z5wcq|FwPdwHNW~i_c=)#!Sgax;rkF}39 zz^*}eVExeBu>Og*Hfp`JQyr}Ba052BuY)xt&TldKPC&Ax1qy@GP zycL_rbU>?fPDh8oo`YAr-ig5tF2R$Jw8g?w-uM;K&WadJNN_dcFVbuazZbDz_4w65 zyVZ+;vMWopOic8%;LGNBvULR?!fx=U(}r1|)db;soH(;l4sBJJ`?3==C73u#WbplpNz_G%OI9B1O_M?9=bjLhtQ3s(?m>9}V2TlE zi#LLZGn1-FZ|*b;1>2-720G#@J3$kt>(O`iwF==?Vxs^zs6{ zzx;O=`*Uq45MCjDk&5>xnTrO z+x+?D;ifku%`AsDSEf}`?Aicb@5uUzZAf(DeUS7~ViykgGeNn=o}x;gU@sAJ{WpTi zFHrZ%zK}x=lsUltN&{lF!NmV5?; zZc@3mYZ5&LIr9Ho{Ge?4ZA{}xj8u0Hjp(OOp!0krAK8sYvZZ}gl%){Z-Vkrn8+OVP zLAIoqb^vBUnZ6D7n|;oDY!J@)MhtO)0RB(%L>P<=Q-CzrINYck5vGmX*Z5L>@Laab zZAavnWcHb22lu%iBb~}da-g=m?Ry-Lkw@5LP~T403C6+jg=y(H_G@pe`)=^Jps!#f z!A{J+D3hi~3T1&$;D2pH(B?UX z4Z)%06i9Qr-Z7yhmfk%j$bFMrP#qckj#)`vgqetfn8O<%-+hGg{O&k%cEacQxdBxq z%^5)tBhDht3QiSCv(9z0A^E%m0z&6_XQ^W|!kn}txi*kyefi43U@g)d2XM`Mwj{;& znvRr+sbA|WwTG}ru{?I=#TVNMa{U`_z#@`n5oWEmtU&b3pZ*-RsvUx3k30g0*Q|zX zueuC#XO-Z~Pu|BrzIY%1{PLjv{)i;_AR)3W()_{xoko@gkzPB8g!hY&-?Q_aK<~qn zMe~d>zpHC_?S~}Yo7TRGWiKzb`OUjYpg$uqH^NNf{2|NuoMpYubL>R~^%whd!V_cg z+h6?>HQ0^|zW*&OyuTi{cB_Z|B)z)^-HO%S8)NBxwXx!!Yq9*EI#|)ZE|zz=2J3s) z#rtDh;^UDma9~Jttms-FFZO7K=lkD|mj>K|H+t76X>N!;1Datc$?}erG&eEAyob;- zb6bdY-5cP=wwGbytruY7Ef?^AQ*0d54$DX0hE*f)z-wLVVNHj+*x2DFyrVhQooi!# z_j>l(?k$6E#@mDHW5dA4*xaiTc8u$cqK8`IcSjt8qTB1^$tG8#d!s8cf7(PF{S|30 zDRyWHx?&OLBKC1YE=oz9?XH-<;>lVz?0qko-^u1_PZn@QylizDmYon4BQQyl4Stpr zj3Rtv(#)Wo1<`x!+SyiCR;IW8FmcAKCQ3%e35}YRi1z}!8Oc&@| z{?ab%pgq|Ljl}@gLDDS2W)=>=ZN z>!jKHMdVndH73nI+A=MXJ?lRs$$7-tZr~FghLJyyYfhRaOZrAWk&*YvK>R&;Gd1Cb6p=50KLb=m9l4dK^f%T2c z9Jep-w}5=oEI4WQToU4~P>?hWlw}5CBighhpX!+OdjeZ3P_}7#M%VatvJ2(A@HEB< zwQMdqH&c#s2px{>I>JHeA%(gpELNQd6zG=Lr7yuftcduO6P&x#k$Qu zFGDO>`2BSVq}jptC%&3%Pi{B%L_)S2z&)1UMb^uw*5@Y`S~>&{4X7~)Tea~N$-mne!zvp2#U>UXnLo}4lbYyyRh 
zNV6-YurVk(g;g^GoHX+|g^o$Hl&)(+nK4vXY9tAAf|wM^GKG9YHd!b~sK?=TRYsb_ zi#H<8VWc@MaTk+jC%nF1Xdu8QUm}Ym>L`<4bqU!ufixR&{tD78*jlp+nr1|D0Jf8D z2@W=G!US8FxlM}}HkzyvXDv6Tk!F3-Pai5d>afFc6f2hfCaNlZFE^9 z)dxwm5A58Gx0bzv*Is-UtGLz3wT(yGYp%S)+VSIK z4#%WBuEQo0<=x%%rN5i-!kw36>AlxsMceC%8;m%wCW+qEt3KWx)dHW4ZiRh=o8k3G zc`ixw;{LZ`N&i+@+D9)HHnK1M?dsnYJ4tYNku>if(A56#=ug5-tnbs9Xo}Z5U5iDx zU4$3zz5=DK&nE$IjFnH`ixng9#v5Imk|;O8hK@HAH(@P_^SaJ;uziF0FY z?Aria`!vKl*1dPu5Zrn3Ik@$l(=ooqb$Ixi^Dv?JBY1XZDawjzXC)*>Mw}Vcis@%U zLO6Ung#sFFNdZs+BMP$qh*)+{KbDUg@#2>H%poXw{u_dh0#NK@LL7|p5&Yczmi^_B zFNaY=o@dEO_L})*>X#8hfb+T1I<5fgfz+-^%FoJFK=ot!q}k!5*%B}H6-cv9tsnwv zo)!XYOq!`b`KU&kLOn4|{-_Q1X13y~NV7(qr62A$a+JvJatKNcDo2D`dZCQ$r$Cxb zcK(h>S#k+8Z6oeKm2EN2NBQjCm54A0NGA1Vh5lt(`3UU{|BWygAk9J`z%dC9cBbR; zbRg|ZkLO%R*&?{@PvUr+k2LSl zrqU@O?Eml~kDM~CT-j2R7qn&l^4~yMonW}lIgt=KN%}V3pCN67`GM;t9YisE%6zQi z^`YDdW3NxdXE~IJ^lq?nDkIIb?f(U74ue7gaeamMhd7cB81i>|O->v0 zkz1a__2)6c_oezz5TBpc^B=%-H_IwOn8TAHA5rF(at7N@!Tq^>gV(ShbVC*hv#n-i z<){qFEy7G&V~i<8nuQ$FOq+DlEc;{(G8;|$D@Lk|STFKV=`iRq*kQ2AaGvx}o$$B& zx9ijQC+8&fc>9FR-(tP#x?l&P|5c8Y=2%x+%F_9l+G?a(^X%-xuO!W?L;{9}$S}Cy zoRJmf1T+e&Z4_9uMi)kwEGohi7!8?g~qNHU7MVfne>t@T&op|)o7%^xN=FXgfMGNNRuG`w!m;Uq;pyo7pefR;| z#soYwek}0sFY&j}-m?`m-rc>;MwSJUW)WQxW07VNWg(0@i%kFR(+{xX^(9z1rwn^{ zY{C~Bjn?QhiSfp@Z{m$5&l$FE(5JQc<9%CQL!-=^6MX>h?carG=gl^U)rVAyi|C+4 zvt~_DopPR7tqQu|cp={DaSJwdse@%Cuy5Rb4TVZC!<)Y4I z`VRHDt&7cF>f^1>*W>N3^+?K_VN>tASkt2cb{6-;`Z?qAP~A(=zs|*Y^y1&5$1U|S zPwQrvmyuwIG?T292u_?!6XIc`(IOnOt*2>cL@AS!rUL|Dwy)78_MiGFeKw3B81nqq zZAiA{wi|3J{0?R3BYfe~{E7CR1<*j619*O(PYb~hgP-~+b3iCNx+bY3v@5j3NplFo z8EJMRs#wGF64~_INi)G98VIikSJk7~=1HaE=v`K)f#+xPQnqwN*MoYd57jG37KiPp z9F|l}P9+L%B|HBsp@}dnPT1o~-uU`gBiZLBm&!;;0AVX_| zwkILRG1SKRe;WHpa0}t9NOM>Z+|EmmnJwzL>^a$=n`eZ48*+WcqwdMLw}{o{YuD`aztVLs+nleIB z5pI{d-rFWS3fmk68_j|0R5`i(-F~S**-}(Su=$8x?>#O`+hZ8?uXMOAN z{Va!N=}DCNqqfwMt@xA5nO`64iR27?A@Gdg$Ehu{UBMn5ZjUzQM>Zpw8Cx7$>a-|9 zMw;y=)@)pLIA75e6~$Jc(jglS@)bg^&4yU+_E+G1${qS7#Ank(YEN081z6j7PVlvm 
zlVTmlF+n%V*(kHeh*+0G6@9eCjVK#wcH->wo6|3L@f;<;tdXpM3JaYS2QdeQ%yazz z1hF9rwste?^M&1WQRw3<@%W<;V%Wew0RCk!^zGTc zAsuNJS=RInjSL6E9LTe_8fE_H7a!TG81rY9VDmexZQsYd{xfd(ZQp3qGK9@MM`dab z^amQnCSev?{^*1K*t+o@%qTBLsg1B|q?JLIdT8J7UL1A!;iyS@>z(rhEP1dgmbbeK z%kQ`htM0iTEAF}h%kQd#<#%6a(=;|cS{Ls;dIL6gtBd!BG&jQf(jzxwVfQ9j(5nTW z@7)9|`ZUDWflWxPW6~@%$5vv~pq5zMubJVU{w?w5BR65?12^K}Q=ReQw60h&><-N7 z-T=?_ZHbqAw!~`>G{TzBjj^Fa18iu2GdA2?7aKa(!&@Ee;+4DfVU?S(so%|5^Ee6h zxJR&YaShn-CehlW!D8mfiFo-M%h%}e!BP4-*sQ=m5PMX>O zMw*EvXc%bUmc(gu@?ix+Td%XAPm#FL<}+mW6znT4HxITxa!hQkJ_WS`gonp00aOw< zV&4>ewnC zFdJA8Npc{|F<}mUDp-snf-dusPniAVIQ&nNn~!fwng!XS^sjSd)0#S@X&t3SL_BwY zI^|aq53uXhU($C%n%N$=lZ+rqpK)759;*QB3)T~D{qt(?S$^C4TOo;?O+=jM~<>66&5i5g|*xrIn`;^QVK>y`agHqx9AJC|Es z@9`qog#R|VRJNV3zS1?`cQ*Ajh1+{5@9Pl!q!2W~=bVOZAaa%pq|VL%66n9m3v#Bi zs=ug@)W#A7zL1Z=g@WzJh-H>u^|VUwOlVo%6!gbHn$354URJi?F)=JtSCIh6c9jcj zlgW`>leDcO+-`)}&T`Mmps$;TC#^PE6zMQVXnWv_aSZhK`F|tL8ch*_%1E=msH0_x z6l^IBVHHLV^bZ5bs08n*j5ISaC?F=rWu1^JtuVy!Z3;nxkZ3|O(`TveOMA)x9O%60 zT7&8|1$bxLGH#MNI>)X{6qb=@IWhO~e_W=5lf8Us@bWa$9Nq%AH`bLj(&RxT$wS1s zTtqp75|Q8eIV78b?a3v~9z3nn)OoDS>QQns;!Kjv^7BcveelHEpCLF$i4r8+sI|FO zMz@C^!cnzq;iqSvg$YkSf#>GWL%TcgM6E-r*(h`CW=-+d@;7i}?W-`eWGe7qf5WbA zZ)4An4bW$E@Kl{)|mgmhZ#@dx{*gku)kK6b6Y{TlMFJr~4 zFJSG8*KB^XMx<5VL7pSh{O&%j|GXcoS1cuArd=_SYCf;M3`gBR_UNNH@%R&LS-hVd zQwx*txE`;xy$Z`nnwPe@0&m`ZgH6v^cGp!{)8%@sf2~byM?0gmUdF(Cy8sYf| zuEYFC8(=Pp^Rrz^fcrGS&cS*Kkc2sqW@3j(^S~C^K&swZe*rNyNK1!k%Gw zVbjptv0^|gY?{~^uMfEmOL{cJ;!ZbWRhLG1r&9yG)BYx`?@$+SlQ6IC+z4;nT?Z>V z*TKfYjq%2V*J9b=_IRYt%{Z=R6K#E%AFjS(T+%-jWClq zYoCF7u@^zr@6kxJeK;T@NJf%q?|Sh;!Nmz<;!9Bw4M{X6&Gw>61$a=(M-l7y`D$4( ztc*1KBjhY@q^)*mOZ&H5N{*jA`xk@ryd0E3ovqJ{X#w2akWY0AR*0^S8tbV+49fwz5t05%pD)!}8&N!E+vtA= zBhqZ`Ov*~i^S))CR5^qb=E=6|(M;OYO!lS7v#)Bwwwu2(?;4ymYwLa(a(wEcE2?bqa2*{p}ha!E67nxuL9 zB({&Ssz~b*C!|>~&*u2I)j55O=zEF%g)IugMAxpJRgPY zaGeCCubVBnu+kPRd`w+NV9$`#+t5!9#kLgQ2NNXTYU{P$A<~Cbt3S9 z7%Dq$gW9C_kuYm1-Hb42q&dCerOa9RLOyAZ%4OY|O%*~oFVUabV?JavjAOxeB3r9? 
zZUU7Ra#5nYe8?_sI#!~iSa12rNHYyt%e)1`Y^2%dzqvqKC(;S0J7a28Lef#$XtTi1 zNenAjafOlJd1{D8WkRBv%7z5Q2`EOYfVf=B>umo!*u!(5e{P=Xb!KUiz5G^IT5KyHj2bx%zdrX| z)T~(xCseC~_Lu$=OFGxZ%eP&Mm3Lq3gt_fCSasjESl{httbVKkmUgLwH#%I24c%_U zralev0!j1i`)@=!F~7@oSk?1p>>k>JB)KVeYqZ%2b5kSD+Xl77JN=qsO&=%CZ})GB zm5LH+F;$Nd$7Dm3oN_;W~}LSGuDwXuOnf8>%JSY zvLj`=rykZn-U6SMKa5u&sEwyuUWNx-*Tb<#ABOMp=H>Q3{1)x5zXTJ955)XAGmS{h zDl0a!tdZuFFcVt4+xnjlu7q1^^WbQ|Oc3h#U;_sEHR0==CbpmYk+!@pNz8$83YlMI zh;Kr3p`;|@?6JU;2FqvQ6H(5M8}!-4&Vyf6sHb2ZF;t(ByWi*jDQSduXtXXN%{pIY z%jYSZE#)T}4Gg@rF`sA55(y`Wb5Ah}+YNP7X?^GWl=H)~2 z(8w>ZA&fNhe@vJSDM|M8((CB&F=;L>V*94C9aE{x04B+pG+Ta436hg$LDLe8xlQ{k zAptR5f4Pa-@*MYxH0KlMu#Jhc2229M(6xff3H@v3uzy9GXHb{=xX4WDpZ)3UC)1}1 z-S}#`F+JtfI=3ZSwlKBjTw8ZP5m}BPUyj;Q2pQ4P_3{gqE4yVZrmz|bP!&i zjN6lOx(rp^sc1axG)+y=!q{O+m>4FoH9XgNa*%YG1sy6gHmt9LG5o0WydgeHP#6k| zO7az9BR7$Pe}V;>*3B9lKL6-FTMBOPu8sC`-rilCv2pz>TTfXeIiEP| zw+Qu%&&{>9mt(@Lk>};FJa0?PZFp-r-d_2JjW%n3b2!IdG9=;NPr|(7^_NJzXOJ*Y z!?W{eW8SQCgRMVWJQZ!*-i<>^@{g%m1?QiBBueh7gC%XR#qzebvHYIeSaI)_So`pG zSkt{e-ss*8i#uG8SKD8W)eqIiy6*L`xKnMEwYwTm-*O>l-*Xw>?%e>phiHVkxqZN7 z*T5!Dn)^4yw*Jkrp>K1n>D2^lNt`vWdClYXv2$Q6671I4)~`9%_h^U}B+^SCZ-_NR zZo}r0B+d^vAaTEmg!v|{>`({G@2i71JJ-kSZAqfLx5U2}55(RH9ns}4-$%PTSMo;S zmv%FA<=JPW*Il=wWWp1eQ(kKGxs5cluXO`qWSN**thw%*0z}(T;G+GAScL?UZ_7eT zBuL@{dV#tQ?+0yUK!l0m7UIotGa-D+J5Dy5do!&^AHGb<^O! 
zkl!AU3oajj(|>F!zXZjE5=B^2G9dNG0*HM&){SIwg39ue8yb5GFp+X;89EVH%IKhu zc#Vir){hx!4t?WE%I6VUeQF1Fr8SvLCXY|R-$t5A{t_ATQGqlwS ziP(oSU&^j6JU%q|{qoUp`oKq=*)<6JtO$5EiNY7&#b^}eD4St^w zn>(Jg-*iKnL!XPorm>3%F^Z&FN}<4PxH><4p{9rdR>+3JDX1wTHX!}ZN2aiVpoBul zQc!3c$>!D@lo_={iGpCmrcoi&D91+`DpT!IG8{R0FuwDxZ{ZKW`8DRuD93FrTjQ+LPRF>>qjBvO zmtyRQq4@CKJ@y5^_x5kYryuRd@;8=X&8j!><)~w!+)1mtpntSB(IR4C|Oi zo|lp^i!_Tc@7cQEMw>UTS%EK-Yc=h|$n$%1wQc=xVI#(*jfP zydHD!y#cRxse`4Rufkhh>ta!dt5MYEVhpSOGc0U-Id%?ei9Lfvl1Y{aHpMRf-l^Yx zxg{yy(uc&jcQb6@_U&Gc@pjLKB+E^)wQpl==k}e)>S6W6H`$VR%X>D#<`HeN`mv^X zv;B=E&GqmmN%I@`T~Ba(Z(XeF+7h44>w|wT=#LRM{SkG4b|$XB`YP0_Q5~lpbtE3V ztu+bN1k5H`F6T{xNVAb;5@;vQBse58vl7y*X#@UvmVGQ+XFuoKw0up(k|ctJDPAeU zmPn(Y#6it$k#_09H3@9r2r9?pKo;0D!W=#jVfHG2NCL7E4W63@!Qj{O^Aw6z!PnWg zlq`qe+6sO=jI>*_5c$xS@Wnosm3^P)}eh}I4{&n~qZGo~W|=s`v%(fPB(yFB+3D&!SD07v+-W0FmX;-q7Fbv-l>+l8zEiN`4)RQ!!DF z5uc~){92Y%Og*Gq${dkqP0^4qlxYof={!mJ{*eXgF4(<8@z6*!+hEh_5@;%lwz9=A zWzU`vFaZljb|cc9#(WXwJZQhKn@@Qod-l2O%Bhp6JC-LuNPMAOxkEBhHk+p&NR)Dd z*)46w>_<0L-p|(8^aqt`q}61WuTN&4RE&3D&xQ05d?$hVlz#g;mX8%Lv@b>MfBE=^ zq?zT4G;0cy-j%YCS)`;{b<;;^@A`O<&JVU_usEQt`Zxi9o9`At&b5Tg z5%!2n2JfGD4y4)E#7szHfHyel^aQ~3|!lOet~AK{$PAQ^Nn*lBVscIH&d8Gmj& zCd|Q*GDT+BhO#V~QuhBXX%2V2y33X9DQV6LpswUVVqN)JCxig%iNy^fAb+GeXGtse zRUpkJ49Zyo$&ySU&2}dn=~hUSWYPweDZ;FF%`Gp-1NYtM^O##S$K>%7alXP5HY=df+Z7*upzVLiZ8Z(llnM9_P zq^6X_B@kv!(IBxg(oCXbq}lwMHgCS2!R=bU7W{!XqO^&ek!IPs0w)oCJ0s2V@m#z5 zDx|h+_8SP5L%tCF!V{F+Zh+1YfifT@DMyl=>FXI$77{im2E=wvEJ9zL4gD;6tWc zjSdIGkUP>`XsVoG%YYI-x0wlFIo00(p)4f)8a*#Ap)T1rTS`*vu~YZOlta@3ed#>e z5cNBWWLq%u;8SiicU}2fgI%5zU3Qz&nD}}IFRKC)U-I8USNT2{-~_qi|8GK?rGJrr z-zMPR|f&Rlbr@TUXB`gjy7Ws?f z>Kothy72nlkJvYCOJUM1cs)UPPMU*Gf{lpiQda8=-)83*f{tf8OZCfg0 OKdiQ zc#qbxH=aj56e7)W%#++e4hl4gMh&milhnMl{dj2oC4T>8Zr zGeqJU{nqi!P`^$&IvzJV*TIF1_-ds2t4Om!|IPVxuN9my3sfdME(Pngmqy}!0y|FI zAX{jU0;ukI6?~Ow>}3ubj)BQkb4!iDQsGVxvW^Ij6gnt#Bt2ST^3aPQ3Bq&^b5tei zK$;ntH7%i-LDsiC#*Ve}5}2bU@+xuXkmdsYpt8?D@6R}*RxJ$aI{**e*9qT0^*cE2 
z+b3aApPnSL@8SIeJF)uBS21^1F*dxt5?_Ayp^pfYPzR!{ZJ45=kDCM{EyAo3=VhhW$G>;UoUvuzHz&+~l7u>vNtLMwvesao)QXuPvU3*Pow{`Q=kFf5vnyd*yko zB*|Vpe>M{<`-e8FWIM1|Zyb5}kvObIO`LUjHH>Xh8!vTigjesq4of>+iVb}lVrGXM z@e7jbG0m>T)?s(sT;#P~Z^n`i*W#J_zr*uwuf^8hB+9+CMsrh=rS^dvC_NcJ;8TeLXC@?*=UEbS0K`ypHwXfGv|c;a`geW5*K@qyFh9;=*%&hHL6v zgF|W_g72Pi0=hoX*_?M)sYcXjV{EgNXWC3cnr8>n%tReVAAE$uZFYJ}1TtUncN?5A zdvXrGods?!wdhGqzFjdH2^H)qf)i!8_guovU`1l!qn26FGCPg}Wk3kNxlll4n+zhA zY*$L41rL}a9I}5Ui9nhIan@9pu+F;3vdBwDoCUT~I;FhfV=(TIJV${QVTq9Y=CV@;rl>$^A1)o@es>8H7l)r3iB=PQJvq>z1;#x?b);$9xI#N>kv0Xi103FO(ID6y>c#XrUU@t8(TeTH~?JpYS`>Y5^w z?NSox5+=G5maiP8_)kAzn`H|&(k#NvJ`XQ&3;M0L>38r6UwTjkR8u=hPNhfMT=wyU zFkqs0Oq?{&mOn}My<*=X}L_KTBd-8g8QAtn(S!W`|l@?;x~Ax4@7_C0m)%bc+vDVNV<7ATW!S+-=v&fo0`BavofmM3Wo`cJ=$ z{jCt<*raVnnhO(Wk9S_au20`ZGV+{Tc52@VLX3*{g*a9LR!0_Jg~~}YVX=i}tUGnoFN%n-u>n9wpHJ1is)yrfVC3Nl3Gxwq>N5aU_Wa4vz(nBz8x! z!I1QeBW`cl7r}fd(XR+|63Yb*Mg&{4oE8W(A>W4@;68s;=mr`?4G+Wh6~N=6 zF}cakkr7{wHtUg-2LvSuzavhN$m@@O*cK|$gM*-Q!v9b%x0H1%w^Lc}6#W)hM*5Kz z1&jm=1m58;vyJLRiC(J~*lHP(!X50GgdEuU3Di-Vd+Jcq@*ojEnh7g(1cM59zWx<3 zcCr#of{=`~-;CT0H!*nAG5@pu!5KU#s9YXn+vb#)*qY2I9eXr>f9|=sq4u>n=7_^_ zc+DC%-}%AL9nI+H&nd&E^{esEzkP&{50WH*xQEbIgjv7$;M0RnoQ+VE20`wr!W!k%O(=<)ZcIH{ zG4_xQ8Mwblqk$wjTD?L3kOq76Ro&1O)8E+-+!vl4*(gqul=Q!NJlj5jWk1RGK1m&r zG|w0((##k#jsaTR2~Q^@&eVwQ(j+2Ces1lhKy6v}OdCmM8(8KvmaW*K ztwY3#b<-YnBR98v2IkM4iTSf;VSzAn1{Thq!EHJB5jsZbeFY)Bt1xF)8RpOBwFaIu zZzkpm{5^*;VD`*X%p%ImNNCt+`ryyZa_%E$mX=}Wj9DyW4i?OvheZn(VDZBFSiE2! 
zo|`v^_0C}2U_G-bj~!o#7oL5Nc$P8cIYT*pYSOr;cr#wg^XF2g`SiQFl$9uB8_LV@ z-0U(em{EdxWyB1%a}Eh}33WyCOk18U(#)-NCNgbAoBzvb({^KHxq?L9ZVX76N@zE{ zAucb~4KZ(ysplC}ib;^mF@1a)rao1QspCs9O&jF>bB=g>%$B_vA#;oIBp*uIV= zJ6oM#1|b7bL#V`j4(Tt)ODvU(yTiO z?qgY*(I-%OEMJBq7%8^vHA3LYQ%N+GYf5E#BFZAnQz)ZDOWai?&DoukkT^mxqzDxV zvpF%7sX4MkBgnp-TrPYLQ-(m2^Z#2J3dADLsy8Cda;)%zXA0`a$&vcajavx8MS)w7 zNBj4B91U-}0WF#|!f$@{OVm81DvqdC!=`Q2B6+TT)#X_C)^hy&AD`mj`@8VTM|<$` z2fGvEoQp8R91uwJ%B3%3*Vgse^!6%j-?YxY5~vYr9sdW(^PcS+Z7#G3_p@`EAj_sf zUkTJ!|6h4=0am>60$zJ*AyzG4!X!5vQzwqb{PL$Uct9`II`jzCIHV@NcUTRKyYpr& zeWVeVKTroN``(2{=bnk9*_L0Na5NV8?ubqO@3AikzVmQhym9YUc)L?wZ12`6A<0eb zOMrX&H^Xj1|F;fqNFv<;d-^scX>NioJsM%7<~H|eij9x)_rvw^_Cq(3d^f<-duwAw zr`p)j`!=k9xCP$4?zrxSHeHy9(;mrWN_Q_L?b^8*JRiFbu3%wCqXU*x`41RJw|^m<<^!+3r5V<~BfK+;x}e!PR%kQfNQ7QGTZ%QwPIGD6rVU-hI+i9xb_@^+9W!QH7I1;Lf zjL!`6BFohOf>IJVkuSzS#c{itn8dR9Uj%Lz?P50LhMq{wqwOpxCvhbKoI}z%mv%Lm zHYLpCzWF>hudIv&u^i8m*gi{Q_#6r6f*EC4Go#S|Ru6qgk_Nxap=F@t41xhu@rg4lcRyLfmlOb!gV4Deh=<2io0zFJ*iX9qzgv zH(!4hYF~B{u4XxnZmNgJAL&NoF%1jKNcx#@XEUbi24e>8$P%z_#ANf#9X4lDd4Xiv z=8S7pyrc{>N=5#8ZrL;RZN}wkrR?KbB+aui?MZH*nt`J6wAqQYKi%xu4UcYs2==jl z`|^dfH`=T)#0RQw!L_t(X|HDhlBFs*j1=gRC=0Gkrq8~`JJylbmh%gg2mptM^ z29t|+5y43_e^d89=U1dTdK%7t3%o`BWAd=gYGY`xwb^2CWuR=VU%yjb1Skh2NJ(=7 zyFoOrm*s^FY?t_COq%U{f#pU09&%(%nuQ9a*&HEp3MYm>H_ErR>@BVgg- zZp{fZw<%nRe5IJ;Oes=kMr@tjdcw|-5!(z(d|}v~|Gx=o7FnkA223XcX%0pbNb^*- zX)2AuG1-=a@#SFL0KKyUVHQ|dqK}a1^N~J&G{PJn@i?3``zTI~e9~M9fi(N$WH}M* zQ-UYlxY#!KgZC@joj~pNfR!0!_$|Ub@yRjh)$LIX>e~x_dOVIZzI&>D;G}xhsyO=a z!|~{YU9fB0X8iq&k4czGlr_pse436ne>G{=$g!qw?A-E>k>+))mKrhEvUGxxW)f%- zY0Y`wws8&hJb|)J#xuzg@t>=@Xbq`8HYX8zwcun|dfL+tI_7<>9P$JRc28L$O5 z_GyWC9&L!X?!OVsJ6(@u9j?anpLMOJl6Q|0b-`dd=iZrWjN=_POwnycZxtKCBV7hO~*XXy?HJ6vkoVqI)382oVOdMzP>NWUKN|IDw z%9|Jx%USvmO<9SJTH1Vbn@&R##b7*_`<^DDoHphulH^H7mP?)*kEvtEV8ZZW7&~YH zMi1_bu|o%vY!1X|;)#(%G5(2BnDEqSj2k-=PxAk$Ata##`eMwW{+RN_7#rObS)TmV zSWF)OBqmRIl72lJE_Hc&KwnJkY5< zZ_L`EBS~rdyGau7ybbrw7V8lv9ywb8U;J=}3i3$$+52>%anZvkCJ 
zvc2&F0Rl-NxHGu>3^TwC?(P=cA-KD1JP=$HEbfB4yGvYfhjF|2e(&2=r_VVFT=~EC z);nu`tNXN6S67$(YWMD{Ap`orsedo@?cE(+J9k8z)-4rqRw(-wiWe?|q6G_~uwwx# zKRbR7D3s401s&{Bz}_B?tXJ~qwS_%9({PjM5%oFkk!>XQEL+CNDNhd8#@&xlt6gw>u;Vj3IY@{RD z=?UjK1nHoIgk?W!cLWuR;5~#BU*bjEYZ>lS$ZZ#(MFpMSYS#xP|iOI}E>aB!T zT`w}AS*Jh4k+=(gaGx5b$?7;g!UofkamfnJB6CMCi~iq0vl)Qd78uVmv*bS6u315| z8J?|<6E6k1jx{tJ;4GuyWP;`cEFcGYQGa>0A{S?Zt1o)Fu4gOa!MxE3UR+z6b>KR4 z$g11*WbjAD`{;rQPQYz1UFVHNNhq5qRL$jJ`NUmebBn(U9 z9Y7htEb9T%TVkT=`;m-j%7n_&uBHE*(!`{!(5%F;tYO(uCTQj{`a#KqW1+Chh+mQ= z9HucJs`d3$T$vWu>1m{|$Ve;eOex?c+hl}{_3kbzQ_Y?>l??_P4x1dRQRvyVGyeYL zSN!?wPxwIamC-YP`Y6liUj$}RR%jMrmX87jo@E4$mrs(_b^&F9W|=imjuD?{8KzFg zbrxSe$*l4}HX;be;v%^H7_MD9sVvR!UZttYRw6_86BM@M=8bEJId%-iN|#cNYZZsw z*f^>iPP@#){&}O(vU+J0uw?_6D;qj<{oI=*7TmZHl?cq=Q|ARk0=-Xb!G>Cxg3D_ zBm3ag`dbtvNJzt6q(FZ7c)IB!=U90PFo&D`P7N!w$YN<`evrILg|W<-%N_HDR90e< zGV;d%3eBqAOZidyJa1Xc`ipO)92t@dPb_c7TP>N~=ksgxKC+x=EsN#As6b5>qiWch z|5YcM<(c)FeDEj@hJpoUZDxAyXQQj8#%H>*JV_^4KFA6Za8}l4rUx4)DO=)xCBCHM zsu07*I$+yw_-@(+|J^&4wOp*VlFoZ~@4%)F-f(wWhn35gV$uA0m_2 zH)05!M-0Vir-7I>d^pBB4Z^VAz0s>fJGA+}F`9f`7Y*yxM7`Qo@J;<1_^x3cH2MB( zv~K-9+O=zr=FJ{IAqU>LZWOrQIIN>EjK&KwkVP<4~ploN6Fj{D4HWT92qY= zo+sfMVdkL<W`A{lfPS|swx(A6r>yX^6OY-I=xZCB1 zecs$EjADfx(5GiRY}@3CxX3W|y@li}8R15~ndG|lhmP`HLL|#`Bq2L8k{zo^Hr5e5 zo&X&l9L};4&3qS4P>vul6Cw%CAp*^8)52Kp1e&G2Hl)iqx3hezw$}{I(k3(Cn`A(< z1}c_cmKU)m|Ig4&2`nGnyGO~Nl5Ihh0nJKQ(5%3jGJ`WE&@9&G0QDwWLvx6HLm}4Y zFF>RVIIXS+{PJH6KXZ zZZMEum+%c(HuUA|eZEg7_R`;4FzzaRk#*gu@5y|R0*_{nlJ6pz-jw_oL!vwl4N!%tkReo z64+%SrZNMwmM%=DjL7(XXI^%1n6vU)?iA0>#$wEdI;_%L2|K83$Sb z`TVIP%Gw-BfDs*y4ObJS9E*!2EFZNUAjnka!J_PBmLy)m<7&6?(;PJXqh<0v+c+c^; zzicqB&FYB@GrHp3JI}i;EJ78+ZCU|pj9quh4FmwNH8Def~U&3-9+?d)EH>dT$ z?U~#+Q)UM2fsY3k0vC7Tm(#oO``tsh7`zoF*f_WS;d|6242i|>P(&D?pY(p2kHjyM zz-&OX&KG>XCM$&GO*N#IfT_96G5<1eC_i9R1@gZ_v$!JYPP52T@#s7zGWN0jvOH*T z=5|$)TO_Z@0KGs$zrO7^)6{aCwaq3rZ=8BQ`qRYM18}d;X z*I6H!p_vLJID`|L!)3|xCGiNCH#h8r1P3rbF)ahtby**>@{A?m9E%7=G}Cjdn+sOW 
zor{H2reNlnF>oF-6a%_;MbodpLDjP5P_|fc6e~~&1sxpWXqS&rYR7V9qrg#MG%w3l z(fs)duq9Bca1j)C$Y<`Leo;Ipn+?wu>!)pY!dec(T2AB=a1+?gZ38>8)Dl3YvnyrS z06_(~JYSV(9#g>1PFY0@=CwzD+dQz(nVYbji`Tay4CYYh$u-sa0?a&DAW{A+kXBY$ zUW@mTH+OE>riU_*}Vi@!VW+ z$eR}hnFd7)79waCM#&<@@KuRYg!^)+QocOu)vAS--~WKlZQ7!9+t%pNx;a`mYl5bY zzQy<7HRSR;v}n`>ZJV}0hn8*7u~l1iYu^R^I(COsn=TmCrW*#e?t#Irdt-2$z8Kza z0EV|8fDs-0VMM1s=+~kn+JDmo-&C)LdR439+v+vYymlS5s#OmytJOxciZ$?U=?bV- zs01q76+o%n_9&D)FN)@KL=ir(Vq6x_??_nZJrTfl`Y`PY{@MAgs#mLog|lX|GZmsn zCsG|;qkKxePxy%p3PdzJTQZbnR3w*C0f>$8=kWo`%_cS=0x<`o5a}0c%WN12TXcoU`sn@NirZ$YI)-ppg<0tNAF_Zm%ZxQ2>1*mYo@O4C&MoVGiwMp8_U^|1ee5Ml zgX8a~8V74=4&%+J@j{u9Quw6jkU3KN=BzWb%FvP_6f|3u37U-r<4sBd&zyMNCX$E8 z^Na^+NWLV_3e74GvMz~fhGyOq!%-Q<1kKhy%@UR>wDO>{iWyR843T~~T1*pOL-`7t zq1o~PRi-sPXk8$rAPC&bD_xNR%~l0SJs^2dS_U*r+zc82Ezlz2>Oy8+FeMH;gM1Fm zM#ivCGi%9yS&GFnh8299mMzQ;cD2eClv~E1zx_y1eua;O-=99d)IQDc{&)8b1DFjr z4Ou=(d919?|NP@8<(479eCBwZnhZt2S**(f&4;5ya9oC+iwh@AAHmJ5XYuM;5`O#X zH9ov~g40K$5y`Z?c={ynKfZ%o53XbWf~jawxio?|x?%s?CAbkAf`sd*(CC}W%F%|znavy{ARfCYeb_AY?wb^aB8cHtJjnM2eOl?p1 zn1FlhMq|&whIr)Zj7O`6eLeNqBi30q8_+GpuRPu zmxATnGy34>biy+K-dG}j2Jk8 z5YZYjGPD*8xIA|?>VOHHjmlR;GoQ2M4Jq)1X31Y+rt+;n9+D20u2AIp8~G(`-muPh zTCz>@kR?BfEcr~AEs^o|Nn|-z_KO$>m*z=$R34WG)A~C4I8`y4D$LI%_h3LX#rrUT zIbbgv2g;xIjL~qYNfS)Io#6)rn%QWvK2Wo=NjkEj7T1k1me;7T5H(uKLBE~wT(b;Q z#*9GwW{pv~YzdSom=A>sppH3gkuQ5LIOeiL*&-!TzC>AqX#wQV>qux8AS{8>CCZ_E zxk{*5r3xxlsfsF9YoJ!m`lwg;Ykc!fV|@4R_h|TS6V(6aJJhb<05$5>L-o3KQMqPK zRH#-B81MCT^wzjr}QwJ1qD1btag;2C`F_b7?5+whERflC>iT?C7x=%1*+NK|?WO_$Z7SHJ01PWAdcwm^OVD zX3Uz0Idd1W{4L=*%dvRza)z-G^XD(bqD4!TmMmI|Wy@Az_3CwSbMu6k=LT$8zX_W+ zZpD_3+puNRR&3w8lh2ILBRE2RTq&+Dkzt_}0W>@U(czI)H0y|1#6=vU;s~3Ea40km zM}uQl7=&#Lfv<-jHZ0!+*F|1fxyTjEm%8%4)?((&#c+0>4d)57n0C%+ z-?j%zm9C5&HhEcZW>eDD#|*Rw5rvGM}-o zi;WD#Ap-NE&>$QR4nf?(aK!A7K-9hng!{3h?8i1l@-Cq{ocY@r&s!O}*!BrD8|!A* zN+l^{%;!q1Cz$W8p;`O>i=|onFK2~jqdn1Oi{;JF1kM7=*2)CUtVkNZs0E}GKji)CC({IQLAnd8cBU%s9`?z3&It7NH} 
zi`SGo)GD2gd()#M8Ne)q9*8T9q$#MfWZ00RP3}243etuevBf~Sd3;zdbR^Qir2F(Y= z(!2-z_xmbnR+E}gdR~0vbq{Zt7Y%2ImSv`!q|}8}q)j&`5jP%S7HGDBbLMbF`dXI% z3Y=w08L(`~+S2@4W|sz(asRiLX656U8Jg7tH+|$|NGwd+hmkU6e31dR^7XSc)R`J? zfoIdhQ}@qsH6R<%Y^;l~Wksd00WSNJh>$b0UTf%|0B84#KfRVK{Ll9Oq6Q zR)DR;dZ}g_Jj>$`v!S_q_9V`pIgI4gdpLeP0)4x-!EUd0a9=P3k-nSo^Q(AFozP#s zBCcJg7>@cZ#*+i9k-ljLZY>#td&`F+ZQUrmbRC7X<^A!@Wdt%@h9kpun1W^*zxgR4 zI-Rhb>NyG5SB=Dmp0)7EdomIU-uD+c;lh+|xH7LN*7j?TyiC8+`Rw2~wHIzL>xZk; zI^o959=JZEr}AkQf9C4~(KCDF7NPmpTy9_36Q2SX;rEk1`01)Y0=?&=XzpyNT)G&# zwr!<5OlDuc+ zXRh`}qt5q<#(#WwbAX zo)cr}1fnC4V&8#Kc&*c*rHsX%nL|F7i1caZ#mOWuuyU7oWH~V4P{!n-v zK8@`cZXod1BgEW_$Nt!JSmnM89lH#HW5LoYk7T#WiBexxM589n(5+`*Oq@6kt5&Xo z-|ju?qh1+#BT9XU8HAWne?$czAW#P)g5_BzHjxotg7%8#H3Y!|&HD(7?D)xuDHdp! zHe1UA%=-!DtRq-P^@rvHp#sbrnzfq-%N_H+23wYYX*;A0acRihmIx55Zv_nbS-~>b zS$>SRMcy21Xtol=)xOpmj-=e{>uP9L5N9g?%56&C0vXUOb$|(&4QSN9%$cFt+C@#( zgQj-PXzPsn#JavSWbCu9`^+8wOxKpB)T_J?1DIL=s_!2}vQ74zB(NoK7BX3yB~DiH zGvaI9TUM##3??zbvh_X{rs69~)p6V>>B_VgSUzZy`hJV`ssYW?X_5|@aFKemFZDFj zHLEm8OB!fsRv$ZadsZ38Nc*6<#!5!~%;G&~lD-G)^j9+2#w3He4DPY!E|KvZb^9R< zW#JyPtN~iuL96i0&rLl$#!~?^w+l2M5MLyMvwZz(OrI@{sPPI{GPruJBMGY>gpt`r zyZ{TOIpQjzbmXlY#LvYWlZ+`hWeUgg!J%AF1G5F1jSQ|q_dh|iN>CFpi@xO3ESbXq za1~EJ=&aDJ$6+7H1hn zBmTj4M27B1OoTsyIYNQ+)r+T;b=rV$foFl{qv3?+(?@aga5!#UJB3%z6R>9aTudK7 z42x%sL(uN^z%S3>zuO&!^4X$v{ygwsH5E^S+>p3&2JWsIi(4y4;K9lvNLoG+&sGh> zQy2LtaHO8>#Df6sHWE)DgA&MX=Px31N3&ubDAUB)48&3Ig$v|nb7Y+N%;2o!Ao*M> zHm%Qr*|0P-@3TP{X!d7g5a8!eX`e}dDf?WCrCBU2s&Nqu8;_M)*5X3L5E~K--%ab` zJbWO&u3Z&*ZF8FeId{&yEEiSqb%SQ8QnL}N)NP6%I`l@{o=)g7cnk)Po{VXWR%6S) z5bO$!!p?vw?4Y&ppmF^o1w2e)C@VY~F{>TlQh+&H(J$69iv^wk(x)%Qk=PCX8>}#$z|^AaHJmhsP$k zy82+vS}&|z?Lpx7#A-tK>Q(L(*H?OC?V1g6bKMMYuO0C5*-6;k0qajYW_<51jjLhlJ2IZQY{eirA?U7;@H%H?ag zdHpu-+`flcsz3QnE9fa9mm;t=8a-1#dCxR)$l zg++^3z}3wgo?aWUZTl_+5#Em+JxS=+Ab#T58AA9OoH}`d9p=k8bLI*nkDS4-gArKI zwDj0~5Z*i3G4Vf)ec`7O5OW@(M=v6Z&>V660>V#TL)eKM2stLp>j*!^^%Iw|FZKjB z`Uhdv)-71H!5vFCxns+&T?h<|LF~<2@PG6G+aKS9*Ml4Iyn6+kZe7Nnn^$otIT?o@ 
z#$#h(Fh)$CjIXLyN3ML1$eq6+^O1JrkY$O|rP;~q3s1Lo2-vd=hgnvSgs{UF8i1&9 zmNx>l_}3p`y)Nz2K6YGoZ{CGqUzTy^X@O>K*s?&gvNoIAl5mzy)n>8$Xjs$WoEe%; z;LLKM$^s$WXd8_BK-v+*dTcG(CO*(v+ZuB{Vl2(jtX)Uc8)n59($2GN6YgXNKpD|N ze`^rTj)0Y=S?Up~W6aQM(SNcutGbGk`a{=IT1Nbg_-2Z;)p6=rW8DBVRsTs{D==k} z)WcH$nk>!QLaEzkkzVJ1X=l}Q;5ucgUv(R-;%aqo)>8YexI}G}PL=#83#ezybdryg zBltYU&?zo4!c7J=AK-n+=wO+kS?WQ`?4lv{G{ZA2$Fek_S*0U+MV>=eXto@ukCC?F zKSML0(Pv`1m|4e43`^MN{|e2T^DNIhlZZ%Pe;CZNf%BDl;1&XTCSj)rFIn>C=B z1(}5}^FB&$o>I^(fGCC93Yrb~3^Onr8N>>hg9BNa5FRBn%8+j6g26;HLo+jgn#$Ak zaQ>Z)ho}pLEG_yHG^^L5Ew}fR#{_AvCONOC7gc2y0N=%8{RK>uGjn7&4 ztW0ph{SvOYWEjJj#YIG7gQq77I65d$Xx_XjK7M$O-+p;d7}mbb0?c26W&@bbBFtKY zvNbS^b=X*!5C6m!Lynh^1I5}b+uprQS0DPFIu@nC`Q*_^96u6{NB6E1mS3qb#5ekR zMiP$2hT}wZC{D*k;PfHtNEE?38CTCA#r&D$F@3^d9N6iBfBu+(`ak{}VLt=Q%P2oGeusy3SB}U3C!UdQ#_zOu%e{W*IA1^0xYr$Gk20Qjf6PP2gWp@}h?+C`mZT|4tO3))Hd;0jo zo7%W}Kelc=fNfh3VEeX%*s()v*Up0kS^^cXw{Jhi>+L2$Z`-;T8#nBNxA!)i49)zL8J>- zE?<|CZCaUVyo-a_Qr^Vr9FWJkC! z{EzL&sXH-v@ZuaYe!7lVAL8-!O$sh1Tu0P{iwIA=fymU`h)lYUum|T6dG8!TZ=S}X z$2V||b=l3NMEFNVVbYvA7&l`ECe5CMVPnRjUHkSZUbKk*aL~aH4J%hb_ePB{p=S>) zoj4Zmi)Uer+j8viS&J>}*J9=3x#-=cGp3AjX1*0mGs_{{G;w7#pjla%Ssvs~l7eQI zO*JO8v@tA0mbn0UzDde&SY(1`aifqrK*qN<>H~qwtiWtYw=JK;XI7b^ne~mq4rZ$} zVNF~`WGFN};Rc~uEX`b3u2QTAgcqzK(vX_3zy!!yf!V0Pq<*rFi*#>%c#?{N_7}mFUxa{ zU|lA_9L02tVp_`E2fv1A1sh4 z&>X=V5ovH{qJ9yY1(v1fq@me5qpOTB5FpkwW7sLNG^?Jo?C1Tc!YK)BYP1F9nHe(! 
zvxep{ljT(v3`)CF7|^WKCTrmm84pclJV?VLY%Sv<7#Ye)Q=@nOAH$P^OPRr&;TZ6( zIt)^onO>opD>z9jdA4#5Q>K&fB|N!LJqklG4n;;|!JN4Y7>cn{moaHyzI=jTetfHd z*%)H(-$C<-HyI{nhGv0dk!%y|u&lp-^+Z{lZ(qBBYZp(eb=hxNoqrUo_J?Q6+I%!F z6vq#T;JWo$BkNGUihj*?aCL$0gk3`~RYy^%+hZCNo@ig%Pe*f_$qQduL?#xll z13vi2rzd#M2CY-eW(tgE59x@NyhW zgxE9>F5O2Xecd>uteuD?uUQD4*bU3uRL1#b!;t1VmEb%D*FEQBSnIFV7yY(*^1`k_ zA@x=M@~%yBW8M&)pV5=>+zq#W-EnW#K)efIhn0hx zqjI5~s&TDby$U9d9E?VFYoJq$X7Jsz5z)bV%3fV@>q=BtsIt`SVhM@hbBqXNUNgC8 z=tfJ*i=_-%=Q$&tD8#TChOjlNjzvm46^uglyvVwuYZtaRIi zH9XFZ$9ZkshYgz#!eL~>WW1R zS7H9VWtcO25vEO@gYo01VCaZ(a2hfS{rU|-w{CsWp+k4HZPOWTsMf7Jp%vAlbw~Wr zq8+|((H7q{X@+_Yo8X(qE%0rV*7&Y*Ycy=s5)Hon0aa_%Me)*QP=Mv%!Lb1B?D8YO zV__64UJ3;Z6=jFI5F8xxaeH3o?_9{4D~I}EP}0qi`smOm8``yQ&a`cVvLy?nR56B8 zAU}#yMI7_PF`qrd$*WYfXdzUqRvF)Z`!(a<9!^dJ(Y<>&^ytx@>rT9GKh7KOg~fhQ~C;pwS)Cs>^ri)UPduj_!{cQ^#WAj0spmEtt;rc~h}?<$TOuFbj>o zZ;UF{s-ae`TI_Unz^GxPFn`8u%pf@bP_qWg*ycjTytz?6XAYFho*iYGZsn*Nxopso z>C(JtL3FKA9!<&?LyclZuww2!*6}PSrqMGDXqL7(j2%hiO;R|^rnFU3UUa$9?S=qK zINN43ILin!0$5_1G(of3LTqYx#K2+=%GMGXmUcF49rBr2-^iOS1Dd6+QXeXkDfG1^ z4iXoU6*S9I)dP&%7hqDxPgfI=nA${ZNu6ZegAosFNY1KE@wJk5eJJ5702VH=0%jSq z)eO!2AIADN6F4)D@)4-AGRwV~;*~Y*FW;kFQ|k20lDb|#gp?VC6);m#%o{Q6R7F$a ztn;KZB5&DbNWTND!w>S<04}9ImbzCg(&8I$K(nfsrM_m^fs`yIjiugK@ialR@gblY zs?~nRN!U$sjn0z>*XZxpxLx)uX;{uIQ`(r)!z^J+;T4k%eld9SKS6U=w+!=hW!yyO zFs!7@i$&7uGz^^CWX7oA{W5Emk(tHPX$8%S)CHQEAf|>*Lo)+59j_86D>O?3DbTFI zSvQ)WOT)7@G|Olm+C@Wp&b$t#3Z-$Q)PII%0c9&_=CQiBp=I2IYOGBlU`$#dU(9P@ zW+4q?!IeShzW~kZep%p+M&A;?rIHa@KZGwSQ%}s4p`-%*9GZ2wM);C3j7q_PW@C61 zS(gl|di6%7W~$iC&@9i^{H){)T6uoDP(}ua;CS3&bZ*xkdGh3ev-1SFx~;*Jj1>G# zp#2gw|7e;4@N;O+z^69?%o>&r3FI2!Y%IkMLq^dMtFi3Ack{CP|Kq#oD$Mt<({c0a z8ReRB=f-)SC;rmf?T(H!=b@AZ*SEbSRn)!P9!-$(C7o>MWJsKI_ zqX@;Lkl{HN&pjq0!*vRwc{YN^cg5ls6>xM;KRorG$zx_=dG}_hQo0a|mn??ddF@c^ z+eQk;2G*{Gn+wL^JmFZzalSjPBW}&;tUdyij{@(`>W_PK`r@997rkZ}ZhB6{(C^Bi zdI=44_H0N8cJG8)6UU-s^QKt8ZVh6^_fej`s=QednLt?svyq=QG)F4GW@%K7%FKXf z<}Xtov(9H0>9S&R&ydPQ@{cJWsY=TzQ!*KWNe4oh-w43|rn*2`k26cA*VP{+u})CX 
z9Bcw-IiA~C#`HBA50z)-nv8c=XeKZp+^5G&HbXPv*czHe{=N){*E$pyh5b8sV^H6| zDBz&qs^rRPgRjbzL8C^k(4or^^cXY+gD0` z*ACShw?~a89Z>C?HcAz0evj&Po1<#&?@_*T1C%LO8(&qdiwc$NQ*}|cQcYB-QWur0 zeT^D*8l&FV&G2ob)@am(F#AJ$G;h`sO}}r8#^1HT*A0I_-Fn}ldiDCKRIxhBma2eK z#mk~Z(NZW>pa}9gI3jOed)Va6g>3RhBYRHXhyDQVb8)+kEwbgxEE@uGZihm!%U=Wq zie@f(XmE8arAn=Uf`l9477=R?w{8db6GT94J|TkSA0fkwu?Fl-3s^O3BiG8x~6A zzl5c%^#Z=EU-d9#7GM^Ihj70Mnq|8gn#CuUaaB{aGyN49n!+{0Ru)=`^Jn*`l@*vp zW@u*ps=+yk&>W1oy}=5cBdwtM0J-NN>uBYc!8j;r<~D)m2-cGV&0_JlhGu0*l=P)k z-7n{mF;u!Tu34E*>VLU6{ah`KsKq?WG}R+%NIuk{9KpO8;U8l9Z%P}6pGmCuzYNXR z?T%HXJWxqk>XA3j3hwF z1d>cE1xOyU5-%VLqatjQK$~Q2_3&jWiEBs_`E$uR>Q$10N8SYEpumq|>!Ih21Q3fc zdk)-|^+N@-SRjL#Kq5&f6@P}KUNuQ#GLA}C@sbRwKKx_G(7sU`mLs@Sl2_kmXqMiF zB%XxBbYUjamUs=%Y6?n9UMEQgQ?Dr`UH*;C`W5!`d4~`0pHhV44;lE#rHs+6CO1*jXA3Y3D5q<2U0~OkT3b)4t)W>US==gQ z-LOInC>ul2$#wx|u_BA5S;G1C$5(ik{ur0e9Yt!=T@{v`Bga3!cLT={MdIx7LpXo( zFb+io<5XM>P9KiLz3Z2G+)MoV+iRRV8^tj0BK^@d{LT36+2p0hYyPHuAzazN2@isH zqEFpQ%F^RExd&3V%tVsg7=rTzy!0N2r|v`XWc?UC**FE~3CMfC|p2Uocna^j2)Xcs1JtJ zgeGET=2DH>to-~4%(CQv8AeU4(UGhGlm&|AP0bjmDl#eeQkI#wm`98@3%w}t|vN%gPx}K0q zQd}~WrCESkTr#Kt0<%b5&HVQgocGEQSo^TwcP|?Z`3Nu&``Iw;^J6)ZbYVKpo-!G= zt5jBSlb`Tet9C;)ZrUDQdk@EeVbd{U{8CJq;R@$D9_Tl69=>bY2W2ZXfJ5FAu*uF& zO!mBlPCI1FX^ZT+@*ulSZe-8xfb4b!k=-^w;g4XM-3GY`ZrO73xIA{qk;fLcJl25_ z=V+e|h4N)X(frv^f+|zM1{I3tLDiCWs9riJs+Gw>z|2b^%!7RP4k+MQ1jS30MDfza zP_URI0k4}-+2V>>j30OX3EEY}~iY3#B^Z!_^oj(CZpJ2VmRq?pWWq9lW}?gh%J*aP80xYg>Ph70nxAdF!TF z+qFG>`gBLgs3ABzZ31r0n}M@4Cu3%VTKL{R55{-xjQ!iU;Q;IP024H8aMtanhGw>( zVsQ>&`DS@hLrqGXC1sy&o#lf-fl;=1`&pg^S_PV={Ss(W?Tx9;F|{R`;h5JG2$m)P z%P3nS8M#3jyVUyUGJKkqA2!34H_B?LM_yZvWaG#7){pg-{|E*+(tt=3wr8z-?l<)%UK;;f;Y8w^zaH)pKRNJKPVpwb^wOfX+Yr}(? 
zhNh*Azimunqb9P+0?o=YO~z0kE=pZyq^rmrzP_)luxu8urM|TaVtvSU4bCQBVVx|K zj6^V<#M&Ig`co9nG!;Mb0B$?LeFym55As;U(ky_iK8BVK3+qM&&hq>jPZ>c>EX{$u zhGA{i)?~&-iR<#8GJ~_a4?b&wQDI2MlG46Ne&O@y^Oo>czLWgN_9Wb&&pIH4>)ggL zgh?_zLsjqdIfx`oX#>+uyukfsm4&ROQcoCRTtw$WJ*a>Z$J+j!kaS7NNkC$ zG=%zCeQn9emTQ}515`^RLH-9x;b3JV3SiiVEK^VFL=s8HHimRb$1e$e6K_4R-L_aU{!&F~&nhK^# zDy=2sm@<#$nd#>y*A$>Pe5=IDy=#+?>Zt77z7-?;_fxZfb?e$uS(;X@UW$)D5_CTj zfkBYWWbk`cHvr2`?k3vSKoft$0s;no~c58Rp68}}9r z!XuXnSkk&Yx>V1PYQ-IqJzFkh&ygFYN|!|ay0tNU5F6udThzn|s`8Y#BV3A$O(ZL! zhya=VgcTF>a0DyZD1m0?-AERE{SARwdjy=dRFA zep#2g!cW1nnwd{NZuS+ov;8>83R6D*+eg3%VCNwyBp7R5*PvOm#*lG`ZFA;AgBo?w zrF|a^=7^?{}(P(5{`u&Q8ZUBOd-f^T|F6Vmd?k@)$_1qu`^s2Ou!De@d(>J8pi^M z;#ANeTnHVBt6|Q#65@oQTBwvX@ z>h&1hI~#O^z;sXdwCy!zP^vYU*5sL-f;Qs z7Cy$G#mB@8_$BoUeoncJUsA5%x3p{c^T|#8^^|&c6@NUrz+*2_7rE~=eoQ)!_mAT6 z@!?_WC_X(pjvwPs;^X6U_!NHuKP6tm&+&{i|NoYJk>_7#SXc0mmsjzxSJ#2pH}J1_ zSMc}iGx+n_3HldJLW z*fOQ}$Cl!kGp@k3t@!6fAN+O71Am?L#P3Jl@W(NC{B^>U+r06|nGN{!tPlQqZZmLh z8~zdJjt{$L;f>Eky!Cd*JC8|t={gx{Yo;J!^-MfiJr_5Z&B67>vvGOhG@P9?0f(jx z!@iNd;XR-$2Gp#K78Sn2${EvDhj2duz@HsLHR1vDcrgDPZ;QnMrHoK4FBZwS5Bu1b znc6XyS81O_hHI1|G5Q$ok3WM6%NnjQk&@64KK)$#RgsV=whAIU*MmuN)&X#sn$K6`GJrpirhub9e zJp`FUWLPo-ShF zYnhFBW=1mor-FFhV3R^*xKRboCYjSjx9QPLvlszWLU1s-B;a5$4ju?1GzXcXSxxgV z=LoAXtwT*?f6G|j!es$W*8qkcz!&vcxY z$e5HFnk6CvbowH?;S(`&nE+a{E;lR*YFwCwLqW6U8ah##0r+55Psa?+nXjuGLVdl= zAo+iUW_e&HSwpjV3Q9^9IA(0=g&WXp4bPtgGw+Mi8DAchP6uOPc3C%|`QJLWo20|A z0JCwg+H$R-S+?o)=CP`0Blr>pq1N^A0$@1Z! 
z0nLV$S-We<@+Thm3&Rj#7SJ`^G6a$ZmJO@30n%!l$*tr4n{*Y9gz@G@3f{d+RTgMD zPW+x_qzxH;LtH#g9g4(}$PgTjwLtTmm+AQF;|u)y(@XsI%WEXwy^OaRkMZF}D%vz} zh%!a;Vtn^bD4E9=waXMm`)cK|vU4N6+`Rx98z$hr_auDqo`yH;C*!#%A$sc^>>1D; zUF@@AdcA5`+5B59Y*H5^8&pIo!c@aL)v#fU7p6=cg9S6EqENv?$Y$q&=AAmhcQd!Q z{sw2}55w)beQzBY+`E3c!@`~IB zRjXA+r}iB%dE9s$ARI=9N(C;K4dywi%vr$+Fbh1J|MM9OG)FQ2MlcWQSEKt?Ue>b& zQp|6kL9@9`TTAkorA+I*tdFyV$={FIe>KVf+v!6GCm47+!3#UQ6XC|XoTFc3F@T~a0s8p#6YF4j?W_2o~u1z+q?OG2PJ;vkaZcp3~+kxk2_u|71 zg4yE@z?0R$iy6SP(Oe%6qzwg<2LTEFfcWk}Qa6I=00QJZ;B6T2Qv&eod*B!9(=(8a zfp^^Y;SqJ4|F7}5OTeqM4EF^7_4F|Q`6NawgG!IZzcLQ-ddGOo8J>HQAQ^|hQ;!fz zFHjeOH9xQ(~7881$6#G8}e1Xnk_JhTqaW7gsmw|zK8onifSb}K%d-lFu8 z`gE2$=YtO?z3}#^2VU~{r=iP{=D!H>duHO{&MAmzXCz_QWTg8|$CEwNkj`x>+a@Au z^LVvP+wP2{En|@^+B%m1C*bL>Nl2wqc2D7w(7R(2GI(Cv_DM+FGKuFp2|?oD-v@)acj$#9Qi<&=*b)u-pu7cSr$j!^ z)_(827sf}fY2igf^560dr5^t>G*ec`Sfb4w%6v?m zOpTy|S4u&vA$=1f4R=y-GfUvn(s&vdw+fdL(iwpdpU3@IXlCZN0J}wIApReu!eBv= z!XOVbknu2}S@$9tM^ncB5;TjfVcC#ChJKaJ`&H85Y)D>lY4P5y3a$0SQR$(!3m{wH zpZwR`3~TfM0?kGt73;Hcy)Q|wE6^;3OTbw=2BF-3GA<6$?7_~RIvMq=Re__O9cpnI z5f*??A7A1Vq4lR9pX1fbBs@)jq-FyA@xx0!ZnFYpfm;hO8`fd@|I4RWgz2Yhc)9nl z`2XW;1;hf+20$C&EVBIPehJU)H?9AFcrALWJ{}Y+vjB7Q2+o{1 zgd11S@to%b=GV%d8JF9^|Zcg#l2lpbhjmkr$tJ77W6x>(w#0UNg(STk!P z&NEJ_uhQTjv>)reyiuD#CH|(PCr-dW{`>{=26n;Lp&f8%$#C47(OnHQcVkvB+?w4( zjnjN%t`p*2r^2gS1N5w5gFfGWh2nW+d|>fa&4G#)Dxpu0URb_h5u$>E)d!pr;-W!? z^7-kCofW20!SngbYeeN{r1G&?mj#vujG1?s2b4^Wp0#w|m3*3|ax;}zOJil(Zvkd~ ztjc%Hw>tmHf1U3&G*hZUWte+|KZ9nq+%LY(JV)vzD`?iQSXmaN!e&{K5Bp?XW*L_2 zpx;6GOT`=%1bP6IH%S1)v_OQ5M-9kptHjXE{O@3<10 zeNzh)JA8wIg|cB!+oFgbSOqboIw5XyKOCOpgd-eN1Ef$!$07ULyhs8|%!?^I zW*}|nbY%F=MT+knB>T=n`hhuk5jYPo17$fI&koE$3L!6Ht20svc^Q7Qkg{h6l6Fr= z`rf(7*gFqT4=%)${R@!3Z$47@%|$A=r|zAj)|0tEe$Q;&-!=^ocTDGbvz1bO<(hNV zdNTK?`OQNbk4@vbX*@U0kH^S$c1=@CrV@8dL;SX>NT8CpPFA3qw0Qy^6EttT4aTi? 
z1998Mi5kFl%Cx-aHUJMj2%8&*B4Pb7-1iuSd+vjA$9)iPyUOtcafi!$ZiDc^W3W2* zzFU8F-hGB~cWqxhSVwTB;`u+>-3cikPRQ^aLJh-H&*8}M7=~2Wp-6EVrnOEhdEE#k zuN|(Ik6nl1fjh6ib`b8a8i)sLobYhXAUs|>1dm;Y;vx0GbvPcmkL2}vPisf=xFLA3 zVgM3X^hM&*-bh~B7bz=v+>#NvzF-WlE}o3*%Vy%{ve~$}a2k%xb;g0|Be7;^FSM>! zUinG)Y}+2ufnkXB4@bDTEwMwWfZ3P8?8h>(*O#D4?e}H5_9grnqiGn>yjPWfWdu^N z#CAZWI;|ophb*TCOvrz!2Uu36oGMUO^@f&uOU2{l7}c&=$sC4hDQ<#FsvWihQ>jB# z-N5Y<7QtACmlI%?j;Ch6_g_x3q7%TkBO zV;KkI8CgF_bK#`a)X7%Y%qrvB^6bUZto*il&(T34*y8DlE^S(&T=8PaOTg&fu|3Y8 zJceICy~WQTUn$G-i>C=xB7XUi;F}4WH85L4vmVd+IYBuC=}C{2rCESkK-mCf1Dp+L zHvSvyMp#DJVtE!2mdQ{Am}R7m9|_rFZ9Wneg7bvs#D}+W@$7Njy>(f|@h8SzEZu+p z@&=D@U&O6TCxAbHV}%!l26bzoTl>~1md_rIYgR#z+Lhqer4b&vOvLjoGx2)IJiOSw z1TXxSayc7Mcg?}+;)Xh5@w- zq35^dk4$VEQdNT^Z7GRNrkC>kC_*k55Ek}BFTd~AL_g#c~CcaIzMHGX3Kvg-?DtGaOA(v z_Zph_ni^|?=Fct7YRLvvonwOLpaU}6k9NzD?ed{Vct9|YM8@K1%wg&XVj~Z+!3)Kz zm21(WWoy{x7E8VU5UY00T3EhfHP)_kN6+qkP_t4^6ts0vZWjV4MYCr^Ek}EFtXLZ3 znl-{or{3@yKNM?6^udze?Xj?9bIkA13bVU*M2FhdQMXb>LUSF|t5YAzLne4^?UL%pdVFWS=su>%G;Mqp*_hOiy19_Z9DINom=IMlVuR#oR5IdMd)Owod z0HnJ2XMFo`*^hA;fOH<4PN2l>!SYc5~7F-&fkvDu%2(5zoE1hFUQQ5BY2oSM(t)_d(J!g7eD3xVMyLebE>K^Au_ZZZ4mR zD=TK<_|j?cpXZF#V+Nv8<*!t}ZPKtY%Xu`8M;}9!e<(r(nhDH-dsrR>nyHX|dsuEo z1n2#J8k$8ct73T;tFvKg*2BF?*^`c|NY>eQP|{|K+lL-nR;0f(Qr|(aUDnduEOmrM znc7|xq-anf;0v&tpw+s)*B@)iIc%q;EtEQeOd;O~iHn((bv1&6VirmOl5L-~;{xOQ zgL#XDAF^)K@#B*9mUUaIhfxz2koJ?;VO=R>AWOYz2I4IGA~cI}g0QTH|5WD-bW^N@ zMXD~AxU*g~+TpCqnwFZ;jQizzR#HsE`;l+F{CPhCav$6tCD2SsheJNJ4rHC|&$QDb zG;?V{v-%iRpjnOk&A3Z?^Lk%~=0HR;yodw5XG*|X4GHKU%yeY^E$1;#0erRx`CJe3 zxk}t*umPPPc#kGtGTH(CJeZG6&%%&4xPHz|+Ay1Wn~_%5dCq`j8Gclh6`YM@4Vl9; zU!Q6J1!c3ouyKOLDwH|%>zmXtXAR04@~oj*!>jF+63CBNw`4RDVZlXt*wrJP3DT)^@gt`^Vp?j^$@af+kmllt~ z9hb>S*tQU<2i@=_)CVskHXtL!8;93S#-Q58(Df?^46IcG8zv9L`$repA$W?67s*I{ zmW0QNNw|37I-JK&M*Ui~5g&ga|HSWj_T(`JwQh*Dy_@3N(h;~K6Pgf~AI$E8J2SiB z0f9Vj#sKsvV+*HmzQTZ(_1J;QhV0qJN}3%-3Khr1vCcTK%NIu@qY=$YJ35S&SQslw z1LKvwzR8k)soEOmy|32I184b9pgk>xfp2>W;Y!e!-Z%$z(OBL)vg 
zkIvoD=<9}XG>wX3Yhwe4TzS!f9QQtCd5Ys$ZdY?W(9% zs~T$5D2?82t6)Lr(%9Y622YpNK_bC9$!#L;y3faL&xN?-ITv@`=HTAiS$ME!Dju&H zkA&3(v9*1XN*I3P-W@Nzd*H>!et5c-pt^k|o^2((ZX{f89EfCZf|pW1#JlDBq- z@51Gp{)F$rTn^-t#}Sg#yoVEF1%!tp&5P@V;8cQfvIno{H4KRa$`tRBNF*?(tRJP6 z%>T)6H5TF*9)!P5)J)7&6*w!%8h5 z%BkbI-2i5Pa?*aL;Q^+ZfV04E6ps^VHY~a_hO$4;)A}5mBlZ)1_Y#WtGOm)|JkA=L zWdsc|hKPHNKyxIIw+3emGzTcYl6Z&?GJmK+8w}22T+|>5rZzx#UX15rJTJXX!>dk% z%#yan^4~hWtn-~UF#kJfHs?*Q8|kgDn;DvgtBmz*(tuh^Ud3c;G#Qw=2dW|@5fhNJ z0J8;}jRuhyW;puwuMw{0CUsB2)TpWlClNIg&1e{Prnh_ysIRRPM<=T4U78YHz2Y&g z8`9?*@nr@u3JEitWH^!PU78X|4d2GNi4{)8*^~zIwQtr6Mta~DXto?{4H;j8<}W>H zm8nf}w4|-xrqffWsk(pOr}0qD@wR%PW@ye7zLs%a^=w!G<-buNr58MP>^L;4TOZ%m zsgJLUmO#6v&9HRNY;5*)$Bpx6@!-}~Whpj~d?Bz*c-5BXXKGkDi-q}_f>iNemeDQJ z5+5ii7Kk>^w_aw2WeY${*xGV!NLia-Kf$f57jX8(AvFcJSg&Q|jM&fvNKd?vfBa4$ zCR9IuaEs9VPFtF(-+p?Hr)dxI=9 z@>wcg$;Xaw-r?Sz#~3|y44g-dWO}^A-~akUS(^{;-h^tl*$_IjKdvtuhTF5cB7Sx! zJebh|k5`Vs&Ot5FqF^?RY*iP|13JMjcTQx>uH7>VIu^m=xeIY5JQ`=>jvzK9Boj0% za1Ld~6ik^c&DsTv&z<>4mq%R&Wf(Z+hc6YSRX#Rke!Xf{2Blm{Lzj6NA*l#;na(QR z**5aB@g_#X7x0$y#PC>_6f}#KS@M4tXx7)&^@DcL;CU?L()hUR49sRE$YL&PiyQQ-VMiRkHMJ* z;}A1tFt!cpf_2?mV0qi`u)ISPtmwwF(4!fa^lXl$JzHUM-;NmBzA5TdDuLQH%Ar=x zN~l@08fw-miNReeV?q0(h#g)4PgmB!{UwcY&uc2Ku3v^L>lZO!O~ozGX}IG)8~4`D z!2LCo@Mz6wB&;Pcx@ya8x~CH|ys1rnsexQ4%z2I`)Q!`V?YKG-;QAqn$3I#<5Dy7u zk5>*Qhz&vF%E4R?LJDCn-Ib8)DpSAgeOBfU&Oy{-?{!iup6e^tnnc*>7 zT}zZsiRD=VH(@w~66|5uumMB*rO`=ZfM9>51zGBT2*# ziY!wJ&}ock8q+UL!Y8CB^VnprCo^pm*NsB_>LEy2)gMVK`yg>ycO`+UmGl1tQ|6ams0Kq~g!_rW} zwpW0H>t=Tb9wX2lByJ#V0|kDi{A-Ie+ZCxVgcB(FFS7*-Hv}r5Ykiz@BcTMi1NZFW zeqPg0pkJ)zTrx~?yHc|Z3it@nu$>l2_UASW-11t|&QSphy0s*p#?p|52lPH^LlwxG zctF;TaOFBOLPc1xhGv212pNsY8kVhqStbKB1G6ky=PKibDXnEGZ`=*owFY3dZnfQz zw98W0s-gb4ZX9nu$F!7mk@S%c#(ojgQXo}Bvsi<*5l~J1!h4b#`~tWhzG8o5brwnNRcM^y;fY}j*AHFWwI?(;K;wE}qfFsqm_A_w#*Y|=$zw;s$K3^2&Yi?xzkW2quE4AS^D`}i ztb70{AM1&QS@z51Cf6^WQ4(vjSbsm7#%;E?Dho(kuS@u{Ujg+S4ao*De|W90@#aMu 
zuAV=IOJ|ScH6c|dMUhzqkH!S!MMfh2{_T@m#y_~tJRckpmx>xCt>PWb(`+}kU>Wn4ZoZXaJg!(YGufrs}Wqethi@Y}K#_}Aa~{jcAU z&U=f9JcxGRRmY%)<#1)4Gj7fAgZQ~!@n}{@JX|pZi<(zM^CCGhqU|?W;5-Zk^VuU? z&K!hh8|1flgxeZdTsnRlx6fa~v4}`^7EIRWP*#4*x0#i!RQ^;rD^68$Gk>U$ZX^$= zd?@8mDmsB@mQht^GfR!(EETaV4Y5orXlA)kQ{wVCvC<9Q77m;oE4hIa>aZs zzKn!sH6tL8`R~xoGRSqA3OtIPgpTc6D~ob|c6tgq*rQZ|{3vao2epb6K(mTv(Ch13 z7}2Z=hJ4ombGx*_jtK+cKd~>uoqHpaouY{0U2uT#yt%{o@MzNrD>^p9B5E1`uk6+w zD|)uXvR*B*qE9O<@6`s&2Xuu~({E9;TyfN_S`Ia;S4ExLbx@~fNsR7R2}|4N$A!s7 z@oaT<++W@dcQ;JKwGB&gZT$k=@+L5QPQhLGDY)l45f5EP;nBK5NO0**Kq8hF zNcbCs^bK6D7gq)XDS=l_-LBjO2!k&Dk?b-632TPn@v2c2p>EY+YLHSAAv{%JjsT~u zrUJ->MSHzV0_7A!Ws10K5Gqq;oN_AFYov1D5STVx zIz)1gSh6Lq0-(y$EFem$rC67JxP2o5m3k)P|K|#z1*!?hgzywMg0O%tVO1>5$&7o7 z04TQ!5I@@_mTX?jo5zWJh=;g{IH~kX<2I4(6E~4G&q3U;-BHA1ElL&GXB@Tk`V%j9 zNY;+=WV+QJdIx@0`Atel9;E)xmOlW=Lx6dYSR z89SYaqf?D4%D=vVLq5!3un3n}2OU3t5|N={*iUHo-LW0MJGXJU6?=DY$9~_PIN&GX zER+1mnC04ntJ*`hi?S|D8EHnepYXbWkF5Ku@b>TVC4l>3-|pQ?{(JoxCin9gv26Q` zb=!|i8L5WrG6lW(R{QVfI>A`h0~v-|Dt~JN-S!ZwW)ag$&XII7OWLih z+jJXaFscE=)}Z{KwuzKc(iEpJv6Z1L&}?e>2y9x6unCw29J9Jh7$2z_ez!VN^o7-! 
ziiAHSOhcbHsFFz}W9h^*63|Gv%=*vJoK+TR7K=DDH19{IlQgE`R=pnyQ^Mf&KeMu% zIt;;*aiyYBufW(~O9jMj$~wmV#_`6zXO;%vFFj{qWv0$3si?I8}P< zf)K{u=IW)(F>>%Qv}@TK-`1&*vPDXu^$*Q3f7Wyib?S%N(2^LygTg1#72yC9mD$c=tK)WzjPVW?HLf`Y3R(62)qjOp4Q^ZRte<_ROQnH`%=gL`7zke)a= zp+9yHX@^ZczQfjT-(pLbZ?K_5L%6qV1eXrYu)1?=EG8%~?%o_Ld$hu;o^7zQH`S*N zR>`tQ8>|`76W!}nMfG9@QL{3kxmpd>tx*@>)+~W(J*r@B>)eQ6P!2Cv)nq5A6&`FL zBzw=pU5|OV=P?)e-Dlx}+Y~%-9gWAXPKbBui3Hbf1hDReuReq?CyFpeI2AVw`Cv_a zl*OMmWgTHoK2CEPh(v+rwN7}ndMF->YsR{Pc(8UL9%maU*!Tcio%givwG(5@K< zG;^JR{A8nu`!*3&Hx5I_CIUT`wu!*$!+ToKco1|GJQ;855##@u_b-y|soqWmZ1I~G zKkB}OVo^WBHs#fy8mRWAdx?KF0b1Nr1e8rmBgm%lUSxJpiKn3_o@4OTi_pwvh6g(p z#tfk9d;+{cH(@?W(wu24uxu!a(45RphjJs~@ySdtL#fIt&$JWJUc*k#s{Tk_H9)y^ zJXkUa*X9hs)%nA5ZP8d^Er`y49$JLOHS_PtNS0WdL+rfA zgoUYfk*vo>L?~;juz^1T`rvLt^X^@WC$wL)04CcpY5Ps>m-rDYFbpCe$oCl`j2EvH z78Zo)s8GZ*-qF0axSYuc&=MCl1SZdyVK9x4_(Ej{RPLAYla+g#e4t0^kEU5is{0LA zZK6n7m?^10CGH|smrB~0jyFE8Hoh5&V!R|hCB64a$Af?_^_0M~w8PqnrR!ROY5`bD zN4bt%Q-{U+koPRWuB^wb6ZaG7{dVnCod#*IjqAxXRe;YrQ(#{@76SjG{bUmXXE|To zE#vt7WT_xu+J&%C1+xY;2S{D)&-k05Sxs_6<`8H$W;~R^5=0uBr5({OtKz<)-AfpM zv08JV*`JyBE8mR?H2cd3+X{$6GhGxxQ)@_U87em5mCg!we8cUm^Wo$Q&6!zID`#&o2tq1m`GeY4^MAYoWH zrdb=*%rK}6P-a;Y&$tOg*89G+Uqd*T`9Xs z%PS=X$=1*;k~9#|HSSyP!4e+{Lu90F)`x6JKWrl%wEi13TVF?H+_SnaAFSGLmBExG zKFri|-^QybDM%|8FTphDN$}mi16wz2#5Z*sppZjBIQ8j|xihDsf1mDfS+iWZVtgRL zzJEh#RaRf|F&0=Bmkd27^G^g&0cM#@L`KPwS^7jWW1m2?0Q1iuWSVR}i=P3NLFxe0%n0{nMV8KnPciYa=ci;1)wjUJ;L=5DqM-ni;NV;O+EyCgCF_- zC(*~3`0*2=TqZpsNI#9g4PU>_81Fm=4z_t2#Ie!K*ha(XbasX?V%|p`)MQ~tBf81U^61Nxh!_nzI(5-Y1G%TDQGo8BO zr<6xnHg6`f*-EPiz2 zt43&MK38A88{jPYhLxOjTn$jx`IhUdqRUe5sbE%uW?)v5&hJt$NP{Hq88Y@PD|vPnnl#pv$k^nxK^~@2ozf*S zyu}Y#+N~2-v~2+oc8EONeh-hfjo{g-FCgnLyR^j0?j5j% zTHd=o*7RwQbv@f*ZTGfV)1xic_H4`lT<3q6{yovQLTOYkV2@gr%Ar=3DyUnr8d}vU ziG@8Xz^7dpQ(80IRo)-bMVM@2_Cr2BiK&C<8`BuK=4Xf*B?pi z`Voc+gRX;3?h=Gku>gAvAdK}yl1DEjxe2h1q{a}KhbyaRJkNPdaDG65zVF&!jTG^~ zbs!$O4Z+X{JYhOnz)=8_=ZU3O zATm{~!3ro%K+65;TxKx5C*ERhCLD8_K`GY^AJK3;+bGuJ!PF2!Gr^Wif#&3mgmwaP 
zJi#}f(EEsb=ru^`5dk#fLS^h>0D-b^r5)qzI6G}kjJHqpETo=E|APL zOmio^x(}xa!!i{b=D>3doUrE9ci}E^lT-Z5D<}(wC1v#0| zOIgNwoU+mr!qemf#q~UY1L1tVq!*8IaYFJMc79e5LHgR!xI5noS7#BL=M2I1`J-`d z$v9kF&AMXEWZWV&pCdHyoiqepYgK@wK(lR5RH#%DJ$v>+_inw=x@8+Qs8=6VD_3Es ztvc%0tAp>p{RT~cXu^H1(6N07v~SxEZCkfPn^tYm?1yG({9Pk7{H6iAbnb+~g9gHB zK!1!FE>n>YVqM=41Nsf%`Y?)`3;3LkGzwH!TSIA%_ps=$85lEq5P7B9exWs7+1 zDuz81vt~?YSQ9XL(l|_X9*Z%fMq=2oK^QiC5C#t!fFXk!uOS05cFYJ&nLGj0Cp%;2 zl!=%%Z8BzXImLM#MhzPbr+zYpdKdKV*$u-755$}qQ+d6G>{u+ts%5QtX!_{aNd;Ruc zpKyVE1Q;5mIulW0!H6T=3#1D)3m{8-t1Q`E7XTL!mT=`fae_o z!0Q>?Vxl8)I5q}{qM}q70;$rbD{C{EB zmw<2{tD!kUpqZQzMX)zqOw=f2jFSP)>H|`NN5vVMBSd0^Q$TM5W-UUqzb~&teo{Op z?TIiYd4+9;!6pVnWtHAX$lWi{ZEAZ|8^C8GZI_9mEFATDSk|DNb=#MqS#gtuZ(=HI zU^du_&@9#`CWe^V#YAtC0nHYFUWvFNeFKuA^q9$}NUEFCQZogrFf5I!`Y%fb%>NrS z8_zOdtGNfH|EYYkc_kT*0wS zT=MAdb>(j?VEf_qGyMAL9rwRd%jC!RkeYB`fvo|@vJ|%sQ!@F1TC*vy9cgy)=)&5C7Vk9(w6bm;Y`~%_s)2CN> zM_~T#mk)TE8joWqVz6cVW>hF!R{0M$s8R|MyF78wdj;xMD~oNL-EiyLX`DEE2&YdR z$C;ytarM+u+_`WP&ypW1w~r6J|MxQN9y=C4e|U?RuT!wq*AqMaHzVyyDxRf3$JO(f zarWdf#Ky3~Wn~y18HokcCZlJKGB~q%46ZL3g0M;5(6UetG%A?~TNcm6&&iMBx5*o| zYFCAWqXX>i>@aM=KG3a?h*!kS&f;S6#9Q1g2%ub=MS#p8IE#h(2|@WO;rXc#0eb_XR7A)WcZp;IW|GGs zMzD`jOD%Qf#;xu}RM$|iOo zd>FQ*llVWg)AMBAc-$f|U!K+%mu3va^|>Q(ZNX?P2QHHEuO-)<{YsOQazg8A|ypIvU)<<5aTxp;lk zd6Jd|^E;wMks>HtvLq^(FOOQ)s-bH6iYQsQC`uJ8hAI^*pjy=`s9wD)YSyfd+O=z; zZrwU)(4aoP{<=Qu)TxE~_3Prh@4i9f#^14H(iojOv_Y>PUC_H{SF~csy-TNd7}&ob zmmSfvIXfOvfgfQ(@CaZ1D7;+~MQljt!KLhX*!#dSds+&G2XYE^i-?#YN-z;aKbu_Z@DN z4*&yLZw3WOT^z=`SD-l%5#)|Y*4ft3tVb8q>>}-m%s3nzNX7^-M6M9fr}Q8T1~7{V z@Bw5d10M@i^1f}&}?R%&%((pGdLRzNC_XQ^Hn>; zIEu@z36@0$TV)edyD_Dvd0I_MTbmgnf?=?NW&trBapmJEOA~mih?urnEX@Wq3p8qJ zXx0s?c3aTWCUGM{EYc0S(XcZ->9Lt2FZz3C%KeP2gUEX7+>#&i;I0>JaaF)WrKNsFBKL;+Kza3Bxb(+m8aa57aip5BZ%yv5dt`P!*Y%|jrN`E z7m<>12QtpH0&|nqIsX0)ynK?xFyAOxm-FQK*Uys~57Ttq0^T29Gmc#6Qi;$kV@`iy z`0qcx#V@~nRB<^^_&sto1{21NfPK#F__}5Vg!yem?EdW-+`F@yEalAc!#G7)J{Efj zC*ls{^x-48bn+x#rKjS@H!^df_<0k|sb7D2kDIs8BPh}jXRaK_%Qr6w`iZ!9fzW*H 
zD2~KMAv!jO;2e$+!t?lE9WnO1D!Ab_51R(HMY96g(XMg{oZP3QlLpA= zXs;^#VvdfOJ823wtX_*P>)deu5W@_OKwMA=p*a{)EYs11Q>o}8`K-;*EU+wMK2g9b zuQL^hYIs$7m&eKEJqntQe5NF2)B?@nECX5$i{(sx7)$9H#RQt=wVi-6%fFHV&U%}2 z7h!o;7H8&f>8yuP@^PY`LR(;21GB0m8P8B9{YqEj8!bM9EJL+w)Kck^+tyCa@~6hN ztzI4LI(1Ue?A82xxVLHw*EY@I$~^7Tp*7aEZvziDNFMFm!K+<6Z0yh;TiCE|CP1(6 z+z#HI+G0H$D{nS}9-Rrz>_ECu>$Gv!I-e< zMtCL^Cc23$fDB(YRP9Tk;#~;ETt0H`i^uLg5bxeoL-SgFT*4ZHB>_57K-*=QK9@>z zB`k~eTKt140p>Kf(MWe6O_-wyhvJj0t;m%6V2%(iu&u%(C>y@J@{yakB#1?v`{g4$ z0a0bu6$y|EBoi8CneHv{%yofiA1(>b85;?g;-;Z|nYF|sE!Jv*TLtBA(s?jRf&U=H zyADwIo8%_e=7Bs;zhTKz2g0u3K9>n;Huk&ESunbkFmF&qh z_tHuwc&B*_cnd@u;4Gbt5h^VOn$uP@j8zPa>7BNYVp^m#9mKl*gwQQk_Gj{uJ?tLqU4cevLrg9kAu>}U>tFpLCfS1f^tW4tRo2OBG!xd* zGQ*;Pv#cAVn#dbXfm6+l7HBq)uJIW-o1mHPjb<#}4t;5?W^3M&bLAuUtTKnoHp@JA zIHmUUJn@jw!>F?TS-E5gF#83--&ep{Tb%s~ zzWy@AjheB>Gypdf(&BPr>5Y6@0^Y{4k|y%d4C}DG8j*}>q^*9fp&-E!<1Pb*3qa^j zgaO8SGy_AH?IQgQlWEdIki^O<{?^9p^ zZePBHv_}sR9UOpN+qa>0%jS@2TmSg=M*{Ox<)R@nKFrg4LKr1Pe@MrZj0gDi;TfJ~ zBp@z26jv{tQDZ!dkMWBqi3I5|+`e|6U~Ky6&oo80mRPLy?1Az>E$O}*MlL1(9)1<~ ziuW(n_IIz+xbL32wtUcMz_zkh^O!Hm7^_*97QMpnzkE;~fh(6!v*L|qoP#lY`UI?8 zG8e~ULUBAQ6k9yk;rxlCIChA@OdXDiB}B*Jcw8J#9gaiN!@FuozK=3#3-3wXN5sk; zcPJD`k3=Kk(L;77Ua&Kfjt6&dc^F8ohj#gwYd*qCg=BIM~_2hLvZn%GI%Q!BQ-ry9oX}{ctjh@edMU zK8RRWa8d02M^YM=DO29lvv3hQP3y+j^}(#%G(a19N&f3?vd<`A8k+YZg5@BRiekqy zie)NtFT-QUR3x2lsgOegI4h_)p2xSGV zW;$aYSNFkXOlSahZrg~jN|jO#N)CYu8ynT&b*NGmYdUqtx|S{A+Uf_mGcS9zY7WoV z&9J^rb8KkW0zMsDVpGRf*wmpFHgdVKV{2^e)CS&NTEVMRORR6-67JlmV3{E8*0DWY zyLG_~hEd1f78UuNsu!|Dt&)Y%sHh{x)+q|tA9CZWb8)F%2_0CDblG}D<3}&Bmfg`3C>FDL%{w@DPCG?{Afb5f_mPweC(J? z@D_`(jHvO1VEjbjUgFDb$?gMq-F`@t^Tc;rEY1XYu?~xS#uE>LU8c)Ag7;dc$0{Db zlGj-)<4F^usSJjrhPsmxJ_yJ!HjTo&ZDa6y^Dw;LHX1LtjAhs(3C|Oeym|~SOze)! 
zlltI-b01uuIuMs;4#wpL!*OHjSX^2(24@yc#DS><&$`u7koh@Jc0Iw7j1H4GS1#nw zR|t;ziy)tULFBW`k9@YQ8*K8zK6@S%aVUhoz562|I1HC=-p0A}*I2(@#j)d8aO~I> zoH%*~r;c7_=l>#(#h%BZ=+ih9c>+fwj^l9Tal}TQKuqKjgoUzx3W-5jcr3yq4k0q; zD5BzyBj(5{#8R>z7IPTEk+C=s7=~@T_F~hfoj7&!91;_faR2^2+_`fTH*Q_UZPsOX z@88Cq`?qlS;Vs;_a|w?Ut|8;uT|9Yu6E9xf!}AyS@$~saJb#^pS1*(Cp7rRbcWL

)H^QfUoL48kUowtZjV0#qGIZnu z8-Csu1m$`YPYI9r^+@h#UiT{B5(l@$RXeP9;09i6X6EpTcB*=y85TiV_b_woiH{zd zp~2Yzv*~HR$otFij2b@U8dm;d?=q%L=4Y;}Ye21MQInhWMR(9D^lsS(Ge%EER>3x; z74Jl9!6rl}K-X?v5F8qg7@0AHkQ_xoj&f!oHZB7;0<@DL zo#@U$VoW^dj~86TY+C!t;O%HFT`t^>Z&A+kQ#=$Q<#@bZLqJ}WhSzH{_`fKX zzY(M>TzFgN#Je&(VcUiui=y!RMIowk&z?FO5wm6^eEJl8>?o3ro}&;3=t4ZkXg z#m~hN__3JqOd$DI#yFQUj1mXF&9mXBe1=Ild0m-=lWF1T>)#r^J9I|J)*70J_3n+$ zWkr}gek6K!=!nT(TcEx&5`S8=0rfj_F>`otHO<+Qc{3qnm@l)@sfK*fF0asMuCmf^ zxF8=uiC;;X#NF95VmM~2U|IZ3`9IGU-(oo@WxJ^;P1)e} zN5;VnF|-ajG#wcNIl8A=_Yf*E4UOtdRW9nf!qf|Tj(B}c7HaVh@iYH3JsJ||kA#0& zxa;b9-h;YE-H=7NSRP!iJ%{4LYz?7Q^0K+OS($}9X~DSZnva`_i*YY2jGz~Rud~8& zKZkIa z;kcY0hl7bujOp23-FG0NIn17m6@=!c_DTfW%MlTqO+Y3nN7xV&Wmof+hsW9xZF3Ns z$ML$BXxXNXy4Ru!6R|!s2YE3zBu^WMJZ#;Kc1B7j70NurZK|&r@SDfQ^a-Lh}om z{erOlCd;m2nQ;8ZqgOIbUAp84CHca}#7mwd3#F(P26?{+xzW6L452)h_aQt}kJ)gP zd759)A719m2Z0Q?z=0P$|9Md&&a=@GJ#qjBwr-)I`P{)HkiYLhdajT`maZe#`t zZ9gKifY^_R4W#o>cO-eXs$LP%rlbMok*s(rOU27a>}tt*)*shco|oiiqf^_q`0U7G z+`oMbH!fY$x}w2ZwvX<8jq|6^;H#6Lt9rUd>OR&nV(V9A2of8Idgct;yzqelt@x~p zBjZXizC@mrIi>YhB*%>CS02@qHRVl7J+GJjI;nSL-{X_k;kJ4yY!7d+-ZBu|RJyS# z45H7yDq98R;PjC=);K(fHHu6*BzX8C}_3k;M%pXzO9<|r|9^|)phh`hH7R|*k{JoiO9 zF;b}Ma6hEq^t+NJTpy&!!(_ah$bhm{lKIxc@HfeOuV!S0W|2HNS?SO`M1fK@1cvd} z(Co`USti_9X?S2y@>f+VT0Z9@czM#VD+HfRuZdN<^<|YdE-^;w@i0cL<0yuz^Fxbi zX%?DkE6p6Y%EYlbHFtg^NgA!s_7`LYW*;<*JojUGdIk)Jr^1!|^W>uxQ0g}k&sAfK zWy?4cNcDJYfo4zMH;Sj^fr$6z*4)qgJetJQ*W#5zk#X_f*D7A;0cG=sK}|(xfo2)U z+!UJC0zkZHwenjQh_rW^U|z;C*VZ=BF9`+wzvlB}SUG(XdbDhfh0~WHmrxp)y%o-^ zjYulmhNQwR{C_n<2#T|phGNvDnQBNzyG}jOu1g=xUml9+_+%^%4ng0+gW#`*G;~Be zLUu;+W*n(`fOVf=$H7Z4F?O+o;7kC`EQLL@0E4(mpy7T|( zuxG77O6kW)DBPi@nR61DWB7jrfjO*lKf+fZ)T%n96hTE+>_JrdPQ;dPN7}YS$k}@W z+53(n=OAJE=zbKQ*oU&uj`RO7k$>_&N-lhl>`(7w#rBz`Om$L|$};@732_#$c!VkeA7 z%#t8DX3s=@Ng}>qmw;~zqw#ZL9DY?ua4reQ&%Ex3;uyj*?^o)?cO`N7fw25L1#Y~N zMH~p~dz>rKk@?X^KGtO8M)GFCZS`uzZW9`ftUACw)Maoa{A!&rNJmY@T8ElMSi4U1xT9LJUGt!+OJhzD&ROgJy zTqlt18T+Ut`x>70s<8sgMgq+SnCU-KhBM6Op!vOe!2&x1(*i`2PJw4#zOu{}SUy*$ 
zbe+0!zFtFf?HOFZbRKJ0m!MS}Jyark`b69)O2dPcAl!?ek6S6ra62m!_X*AS3C+?& z`Z(K#rKJX7PEBNZ$^<^5&OW*Ha$fUtfqKL&RQ&0iPD zFb%>qx6eGlEb#n{|G#7$R4;XLDvmfpF{@VxbZ^W2>fQ<8K79)O(|^Om8<$XX`ZGea z*o-WFeU>&vyBE9h*pIN7^wQR`fu!G9dgSepvDnZ#!YXC{H+Ci% zgH5puXIAdPcqNq}0~b!ydmjydlsmX_9)rsT!R} z2}kKpM@a+yn*QzOEXGNXt}qQ0VHsX0<&MV5!Mb-$OA0pweQM?mj|?z3f@Y5lJV-CIB+gp^dctq2 zU;51|Ur)1^6sWrQv3Z$visT+vNHoUToaA@weVWQVMDN37-g%*!58e!kkvLo9BhW1Q zXq|)D3uYc?YAil4m2O`i8Z`xG(fiQsiBl6P1GVIZ7SqCbdE?`e20GbFPdE*-$?sNK z)AU~YJ|w17oTs^bZYnigMk+!U2Bt!ko+P=Csd%+`zT}+(SC518e;=?iKNYmSKhC6Q)VR=1l8LMH-tb`-80*;I_ z*i(uS6`zCgvzKGe(n!R(vJn{N!u(}H=rdp_TDI~>`*xkt#=kpOMJ8jDFk~x*a-n=z@{sXA+jTAhm24l8Uz@ zp=1|agyz`N-GtK*}o;R4Q;2Ph5C5#{^fTzvph`I|5=HXX_9cOz%dQDp5sjO;^) zkb8vCeCz%iAcd{}v^6Z%}pa0ZI=xAZ_~@xVC3@(8>s zkHx!k0&ztG-mQ)&B#ZRW3>lX3Zgn!gsdVGJ3LAc`h{VsOVfeW?7{6Z>ir>i(#A~Wx zQa{*cPC?9~Whe?-iU(^m@lBZ>-xHdDDzM>qitVbW`B%kJYBy z+=z&bK}2+{f@YgD4%4R1LhDxT(8|wW&A>5Z*l<*)q#-3R5Jih-;j_>=xMZ7-&!cB! z9wD=8`3!uWD~q*+;Q^uf9%27ZeiUxzN8oNw6mBPnFNzsEH)SN-{~aFw7bfO)Cuh{zhn~1e#@i0b^@2 zWvXnY9A8^e0U0|juq@xINKdnT1h{Ya$5>lgfeJR(jV4Z|aqSPRT~3qTWoGyj`?^UB>6s^)j~+KzEH z(wXi%r#YGXhy-M@`J8D7jm8CwXb&HfHH_aR zlg{Sy`8|1^NlTo|IE;}Q6WM4yqJ%GL^fzoy4q^?DmE@Rkjj(60}A zwk71Oh(>nFHpJ&`BrvZdJa2?6cfIQMwP#f!GNlyJsiHDgq`3%l7GPC)DwYH~Fl#}e z8Ux*?ZAbX~b-}O^ld(9^j^(xtWNkTw&#ym6&br-b-D5D?_ZfmV-Fl#9y8v|LwtdIW z=-8weD-XfTieS7ZG|RMd-;}%XZMlqDPQaTr0?qQ#Uy1_eZ!28{<{123 z9**BD55lj@R^itrEAYp~EAht_VK@~r8R0{^z_ola=8YJEqQx^9ej?uGg%gNn3^T#G zNJQY~{{qcFmU?E~D06Z@PR+vcyf_(8HWVOn_GkrCE&W=fMavdw)4ml#!jR82DNyS3`emX z*LokCbuTl4=RCn#%B$M1aO=uB9NxbJ-MZ`P{T2=%jJmvJ+?Sa%l9%CLdJt|A0`C!; zzs@E!=Mnq}jE^%3&lypKP(m}oQK0#Lei}ZGUWGobTBzRUvZV`fDaDC;XBg^}qEt`w zrNjtaNs2*rb~3^y=x5!n?=S?#W+FJQ466(@+X@it$b{WSe-LPnibW)$*&gS@ym^Zh zG`DQyuX>uN6Pn8sl8{PhE?Y7eCxd6=O3YN8bfG9D!Te+$Klh506*izdb*SXFHN`odA2cBndY~rRi#{c3nvt#&+$bp!o$Gn9pR! 
z5axw^5Gd(?!Z05bnjfcA1oOvP1ZWw`BEv8WwZ1NhAuu!l2-_n0i1HEP`JsFeSQv>% z^p8h{@=;(YUU2_8kH08(<92=$++zl-)h^o*n&Xn)_{V?#XZ+1y{RM61oEn-TZ!%>D z4Z~bs_9BNh5I{vTKG+J)-ui%QYCl$+(hGE*COqo~rU0|p__g|aLVG>yhSS)&brW(j z(vawKAR*3y!rWYZ{^@79THio>*A2%;)3aF_KyizXuYqQxtF=~WmhBtrc%|m!sqL8;-)M9NsqG`1MO9#cuJlayA*li?#VBom9rl1ybau6uX>uL$2lsc3{eE;(1bz+#uXsgo{we0$ym6;f$>upqkSiV=1yqU zsy(7&;<4?M^x*|*p1Y!)F%g#cHl5F4<17P;ln68z88g`9Y<01O=Mkqiu^0DPiRF? zT5|%+Q!3DL;7AM{HVRSE4mjhJ5$$v%S`E=iR?BEd%VH-{Za4|fVQg#!FIhGP7*tq|XD+(BU8haWd=B{V1E zNl7SPmWSeX8KJp6hR`fSFOu}k7;D`6LqM51qr#2vD{T0=A_Bjz2*RIMuEL*JuE1YZ zEXSXhFDFEYB6n0L1dkhvumy84tbIFF%%6aFrA~ZL!1_Ly;9M*p{)OWQfoFol&qcBL ziO~FgxyXqh%H4QJXnt0nj$2jPSU#k;nhUmNYd^GXNpL2Bb?w>?gZS*G^z4b59b4dw zf))7T@=o;c+8%8@z0u)8D`|Jtv}xKb{jw=E*GsP&p;>xjKhz)>KuN-U+m_^M&v+5~l+|=7DmU~L{UHR^z{;WmnmAd>U`)7E* zbWnMqSq;yqB{bLK26a=wnR#`k?n_+1bP8X6eu#}vJyqQF0ex^bJAr^4g}W&$a5psw zw+Vv+&G&KynhAE<(Rd__gJjB(jR-u?wd08l&&W%{5qCI-wr_)W3@dlROw`Gk<~SM4 z9EmysW`gq-nN1@<9qw6ERZnq;?gOw=mdsYrT&0I+*b8ANG&^bkG10Mzj)|qO+u?|d z$1+*fvvqs4pp6JL&m7Lin!|w^Gv|_5dm62SymW)#B*ek6$Q-& zk+@H2zF*8R3d8UsKLl^`BJqajzAj3}dFI#r9-YyRz}&G@I~+Q8RE=l;{U3j#ZOM3J zsYhue3OH$dURXAwZHRbWy@{}N-iZBs$HA*QNpB4>i^P^>IH9a?D|N8=fvLx@)YTE1 z&!Fb~Y1YZ7uy6NHWTmAbDl7z!m>8@nufV}Q`*8j%hErWn$k#Gy+-z+rC z&=4M%^vQXOSv0rksaGXE^ywzh+|+NaQ!y(4rjL33)gtw+mGF&4K3~??NjwxZQ`eZ^ zjiH%wl`vQ@>)A)7t|o^t|0Iv3fzhI#ux#|3pjjYUVA)6wIOj0~hz6WhubfoyrZBTW zvl1gB86lZd%ZQ2$C|lJWFdG^8v&x&nmVGOnm@%vGU&r4WcQbt9mt?)p6pU6`fmtNS z%9Av8t*O=JEF~4!}%pC`)iWF%}YC@%M&9YUMA)gn|c@LV!(U^wjDt4c&w&{tx=lT7T-V0a$w}xqf<_3`r z*`Q>(Y_kGjbI@EvFsNpMD2-N0r}(6NAXqKee)$#3k`pm7paVMlbwpTXJkm;cz?ruR z2}PUX&R++6W(6|JHluvYeq@wxfi1NJAbIj4cNK=DE1ux61i)(qR)u&=+tL0x(^(NmhC&CWm|u=>(CxD#azqQEf5gU4xPJn zBOv%=*5VLk5hfGMwkv3s-ex!DEZvD%DynRk0_W)RPY^>`jv_R>s}94Fvl;EX4#$i| z5y;#82@*H&LgMbdNciLcQuZA}?xCY7IB@`ZpB+Q-`Ku_s@&cuoULpPPWu$C6h4l4b zA+hQ>mc$jJQ_ta;IB6DKu4Fjfx~DlthG;kw5fSS`tb%4298n1fUJ`*7GZ&&)>(-do ztt*b@W#Lq5K2?AV6@|FEItzbwZX^D_x&Z%Bn}&b7T82N^U5FpnuElqio2k`!QIvpZ 
zMUi+_7DZ@|!dsc2xgu6gGxw^>jaP)`w^e$m#105 zzYR3&X~Hbf%<|AwzN!vj+@zc^-}RV|BlQ8XXAXGz213o&qky_OXs*FE>iW4F+$1#L z5@@brSyS^F^(lSyGt8W=d+o>f?unCWF5D*c-AP%AJE_6Aof3twGs6`$KOk&A%BBdP z4+NTJxoBA+Buk(<5nrXnVr`Azo7Kb`#7%sUZ(2x{~%PEn#loW-l1?k9K zsHdIt@6;R1B9av}2RbSd?5IG5y#Nj;q1h1+8-dvtYeNj>q<;hlM!?^{6B|@*6?6+U z=h+;{2o6TovU&I-XeKU&j7PO?Hllh3z&&yx9u~@qh7lU}Gh%T!I|4Ve!*Dx`(3}*G zKRQ;0zrMT+*UDmWqsWchB`LU7=7HvlOx#$Lhv|KLA!*`pJSQ|imORJ`XIP>L)qr@& zupSbc6$}d;OHVQz7rs@K zi#^+S;KHdhJpU5?K@Bc!o%RZ}tIG);u;6X#QZKPs#mdtBp<{Gx7GyQ=u4#^4`m(s%5ahGX)d{g-}pB{JPxS5E8{N8hMAvQ|(6d)J z?)O4K$DVK`7a(KZUP7}B&mc6HZbJgME&{!yOu%^;qD$q&z&(g9+Xq|KK_qWHhS@=h zXyM-%kx7Nf-na+xo0ygzd*Rx{-}fCs#{MJ7JbDm0Cr+U1+;yz4dyd=_w~(~`JTf<( zM%tRss3VxihCtUYA7R0q<#5HPz)9KNDTsB+lyeD)vd1Gj#tnxp0rrSE1T7B6$UZ~R zg_UQ_)X_Lyo`bIn^KrVg9A`>%aCd74{?oPX_z&mv@gHk5@y|EQ@Mn9o@Ox_u@ndNT zE8$GM$ce+td>h`BIMkveuPX@9tD`78UR1>?$$ZVSu!;0C|K93I{L$)Q{KeW;`0F)G z@$c6z!QZc6j=x$ThMlwfVby@HNDg0x(Y?BJzZ>r4#NZnO$~4 zG=1Jv=2_1*3k!M6PD_3BA!$euFgCEPA-Sofv8~1X^S-Lwr>y=fK5q5@#?ocB=^a#n zaIRWI^MzV9G(+aCzRCY@N}OsxWhH)?>G%RcA*)ng4rgQRaFQK2a%{Mr7Kq!aA%tgv z<_O#)%-zcgBOHd~k$_@WB)*p3RzkDPrtx)_3+K~gF{5t})dL(db_mX9XlT9?7eQE# zM168NuBFG|YHl*hf)*oy=e6|fgn6r+2z8Ycnkxv+Wr(ol!R|q8V?hCIX+Y`*Bjf4el%~DRA{kpjpcan-QwlR&dtU`{&G=wnuBHs4{%io=bdSzZko z$^WXSnGsRYzzly+3`|1!{e8u=irQ6fEX_DgDX zK4s0=#?tpOJq9BEzbTY`=c*~eJg{d~(|Z{i<7_r>EK&yC^x=BsGeCtSrn> z4RIi|wN@0l8n;DYzQ{uM{Mqw3ck&df zkAH^LuyAy4(GuM{^@1(A5Xt3x2+cc?NJy1gGg1qxP`+UY3d+`?eDy|b-nIvuw(dr( zBL%A>;<1uiyeb+!dk;~wUko1d5q5n1G4>rfgagNp;^Tt{vG4dX9QooaMB3xfE}$De z`e?LT0A={_p%^^4F9r?ngWjyr{Q^3o|KQQcDcg$Fs!xzqz6;5WM{>yy^+BNOZ6-9^ zN_Qeg&8D#jj`Dp-s5*f34M)&*=nVAvXc|%~wj+K09wZQ)-P;bpz3VU%KRJRFf^+KO zBgj5_66GgqQF7!8GIm`=#+Eb4UH>`qH++it{4E$Yc?o)U8jKZ$VOL@b9PT6na}r_* z&XJBJMA;G%9V?4GxZox@FPyOeom;k5Q_bZpn~DqN={S?0k8`DKajrZUFAwD5|GB;y ze}BFR|GA+6|8%Pye|sPuf3P|QKNY0nyX-`~%W>ju9#!bV>ylW!ERVp8)!|e$o>e&r z&9caeQ>_U2Q$-AZT^WHtsS3tl5}N;(u>9}WFT;P>umax|tw74q_K2G?9@$ah80Fs@ zn^#W5lL80c5h#A7ejt>*Er`PF;#j;bw(}ZUF57|cie=h5p0`?{+0FfUe7i1}u#$rE 
zx#O_BS8L1*Xn_UoTVqM*c39do01`dZt#U;l`abfjDK5bCXDbpJ*>+UjwdbR z+gAT?B$gMZ+?Jsl3YuAgOK-D4^L3U1*9p-#S;pLuGO6|i&YwQO_eM~W^(hg!mKKd0`N`N68K@e7E!woh?4?l%jVn?!Xaw7f5n<1P z-JVQnb`qGQwQRA7bH>3D<3v|Bh+DLjx!YS{>hR$xaJ!Ke5stMh7ZaN2;!5a5)JM-j zeD97ZpFI)Ztxd(7{1`UgocK160G$`B7Y@ma!L7tF{NBDY{K=UOxFXZdl_ueCNeXV3 zCF4$ICZRbKSJ&iVM*rT(m_C-!9M3$6RK3d&G6>EYVT9&zivJUur3d*j8y!!mr-bAe zIgx5(s&S-BI7N{rOnX$k61AeP$S8}|Gblt_@xixmL zjuUdF$5!f8v3Xh0M8+2%J+u#38VHE2Ph>H!%LHHP^pWoe#Me|^Cq1$9Rz=Mz&1=1) zjZ&tUxhXVjAmw?UxcPjVzToi(@7q&L{byu7-|LSbNVmx9(?)WQ3Pa@s{mKLQS5^Fc z@a%(T##b0o)#FWlRz3vu0J3C&Of_XBU(ib=1o&jGOg$7sAW-&20t;HE2idB6-cVQc za&sS0G=d8OSP=^(Z2(e0{WkJ}TT_U$X0n#PPgBW&_NLG*GT>z0mxt7x{&zrgb82qL zg;8_qY-(sqysy<^M9LABWB#^k49(3haAM@SmshHy)4y5mM4(ybH8#soUp#w;TX}V) z|5uBN2y8M9TEs_gJvm|F4M+!+GNZ8 zGIp(;O~<2Y+?$g%-i;-3(mtfb%2*1~+8SNBS;1N2-V~bE5Df#(0>_j<^7}o_+|nm5 zUaVCGWsUSQUpS3(r_bWT=ciD6;-p%DpaTp2{{25feBK(QRPH65?m`OTRA$S_EMAM! zs!b>=t3pY|TCCr+9UC_9LPU%kt0LpDN@mVj5{wR=WNz2i2nh|xiO)VGG#|z%hYw-T z!9zIy`4`A7DnkIzkyXYfPMC?|AC1EBp+hiqNI&!+&;wn1bW$G%PMEe31#9*YeD@%^ z^kXC!??4JA+jv5A9D&(hx=TTGZ0TM?^ga}A`4o=q4QSD27-odTBV)q>B(K|t#PtV| zu!+FD?FbS-K1y)r{@x=<-}@N~J~@rN9jB4GRX#9R<82-%~Li73{{Pl*F_#1-r-)#=Tjnst*?%xr` zkzq($JRht2c0of%B%TwX-e$+(yR0bQD-v%C7#<<{Rk01;uyAZw z^=iLguUX)=r>ljFDS!2pXk%4A4rn$*^qF;vQ#&wAT z6Hk5fuZ3m>&MF?h@Jw2o_jQp69w;e4rCd?-G*eetu3Tf7*Ldu@NXmiQ<2duhUY41k zBO^N#t^C`gds{zjjOOz#jK>W^^F2c0-Q+0TP6|`K&UaHo@sN6)Ca@TZdpR)#L_%{a zp*bZMHw)abO&XzwO^oasfSpmxaW*Olmu;cAoE(iy>5;ga8iCt{<~;=I-mO{^e%oOB zf*|$5Z=kIh!L|ZK+A;~viG*ewfjN@EOxdI1igUv4aG|Gsmmt^jJ`+Fs2*t_C$cl=_ znpI2jW#~Ly3Y~=7@L6zlZHsjaXW;vF8F*I^i*E?c-{r^Sb)gL}*_eBl@4y|Ir+IHN z{`kwaxRxJ@`z49ETbzX3w8^^!<_A?-xV1hXa|ZQ8-rPxq<^()cW0)fe%>vHS>nuI+ z8k!&FM&W5L{YiR@b8S=<9_2;j>wG)zu~GRof&D>F1RiIG;CW^!VL1X%Gn@o?-mf?t zUlN{^2>RXIv_S7cJ<)&caC93m2&2bLK~>p${P%zU@A!BB@OQX$S(c?{gGiv+Q|GFA zkHzMs7ngO6eiJWtNb&hMwoj4#=Bu+*-Q;1nrpF{qGyFleH?Lm7$>YaRUR;RF^{fNV zogxd${pc&SuLds!SL15Emib>x_O%Q=U#SymXqGvf)%1%BnkD`e{Xu*{O~fGnzb)l0GBj+g5oerJ(=&pxF!`u|hNBOQ}vX6-Qp9;>dVNe8?}N7S>jS 
zvp+(+k~&adBA4RmHB{YHQl7ww{X!)%fUPo-z_(`zRJ&D=_h&GfuBRdZlz zRdZ=>P9~3jTbW8(4S}F!l|C~KqOPcQg{JaT{KI4a|Eth!l`sB`j|#`&_D@SllB38w?up%NBv*#~`pI--b z?9c`4)@{IvlV4!((WBUN=m-uT{S*fd9mA+G6ES$mNGx9(hN+Y1Vf2Uz82!<3)zds+ zU=MWa7JwFQ+hK8_4Mpqs>o}C|Mr!dcq$zNgsps^#W)01|5nHw&uG0P3u;UaK1}4D2 z-#Ek;Y(o0xBS>0%2r281AZ5c*q-;KhpT z?b#Ja5<^j+7mv$%`KT_a#HF<*_}!DM@%QHl%;)p)|6VS`KQ>h2|JqxKpNo?5F3XK~ z*$#Y@LwFUi&5I*A#NuU11fG|%k}q=+oE-${7`&9Zo6Do|Lq!CBry>}Cye0^LzA+eo zu^|Y5wJ8Yyen&KREgONb38V0FK>;F04n^hc(Rf@Chv&Htyv}vv9q;olCkn6fop@2? zCZO>D;y8R;?8e(tH(r&;6W9swo{s^4ERE;y33ywPjPKT_<5{T#SCUuZ@Y1PRJ9!A! zkMEAJmiNFP?4F62pT=TLzYgfyzAd7|LRcBoR_R+(p_wvEwpPH^bE?bM_@YnNa?x8p z%m1Em)H&Q&|I=@+uxw=NB5%DQ=c~$<{;kSGU4Lk3);)v237SRHxnhl7isA(yKg2bd7PVsX2zzpYOqiuTCJZun=w9bwHQaZBQ1x9GBRzx{)44Xb#2g zL_%(2xSG%SZgMCdrG(*$eDszTi+eIgH`9*CDK^|sB3$$NXD*no8Z?7D_z{eQaV61) zo31FraxAW7M&oK)1a9Rd;BaCL2KvitpDi(E_Hslx^ATh#L~v{_p*bC~HWz(EhGv8l znj>I$M#JrLATiF3zJ2?vk3u%)q0y zIk>y27_$cSLjF8Lb7=w|$@FqEV}|5|g642M$_{4(A`(vs$0{p>h`T=AH^coptEc!mG= zU;i2Z{a^kCFWx>!LjxN@wS1?^|{2_UP)0+kz536HnXy&=xN<0M; zt*~5u=FI7g0bo}T79g0qY-KZOfw zcm@ks`s&5Q$1!8z5Oi$O3X7HoBX_NUDj~IE50VNu!IfEx%%Un3R;))+<$6?Z*nzC# zwFo6V207BOg1{IQ6^og37NT`4fAs4+5C;w%!QtbdVb_5}*mv{ zE?kLOGnZoel!cfyW*SBf8;K!<`=L+YE@;=$4=w!z5R;ILq78)G(mhBi*+XFd80kg3 zkW6S!Bs{xHcEU!*@OW(LJ|wR`g00)X!0?d^@X@rD$lP)Si5m_hzUna2*M5r3wI`6d z;W*N^9!1j5gGk!=tdm6%H(-9e+23t%j;$jowim_vG&mIbz!-fyUsWbveRsybN=b$FHgq3;~ ze*f5N{JWFs_#brz_~$F-_^&l<@u%C03Cn4Cm6?b)IdOQ)3j3RULUTT$xk#Wn8ZS!i zcv_N(CuI&iB{07rM87GC#CN4(_*GdD{$O5+)-VmCH5Eg824KXH0r+^! 
zMqFowTzmEu%So|&8GotY=xBeW?b1HA#7BQiXr>yMwHhjX(_J-|t_!4IF!h04L*Mq6 znZ8b!=73q$6nr%NnCo~hNOv4>$FQ{ugYgndh`<7x<0~vv3|AA=HT0`^DKD|+tnu=Vz z9qU2@@kQt&)C5h$*`R5NXx9RTGsfX{RVrQ)n%@*V@lAm&0q?}CG8>+k5SlZ>@$J?O z{PBr$d|ecS`vorCFHRyfQ&kywvMv|5*5zSL=Ky3ftY;T=Q2ElX_jTipJmy(Ph~{nDI0`OQ&-|iK_sdP&#PC=z>>i|Rb!)L z+ct=qzW{s7*JIa)UFbVxEanEr;Nk0^)T|kQ{QG}MJD2&GFW`C&!I}0ayd|sG`RvBW zOopYJWfo&2rhccq{Nw>vwdMc%Z4OzEzS5_QzACDhj};kDHgKB)v&i%^%Xy;apxL`1nx(F^@sO7Ell^hcW4Ip&9aE9r&-`Y zB!!5&xz)*~5U62f^i~bE@JK&cNe~mptw=yq#*C;BC3wCpm~ojEwO)>q_mX=`Mlhp& z;4E>GOp;8~87D1$U5FV;TyzHO9!@E6bbR$Sy5N$6O}-p8Tm1m!cZnzOBM+8gSXYJ7 zAB1^i{^wRQQ!m%rRLpRZ6z1H@tP9q$u|7$BB|mjsy^{YKzE_@f$b(ncP|66TU}g9Q z&a6`7$>si@dwC5^^3Y?%>O%uhc)s76HvTU%`PLu}4B5(a_@w{04%=uh-5(V1Z^oSxNaeFs&LVNN zi^|z*nRf=91)9y*`?{iLS|y$Gd?d|uDh&y za0GZMufIur>F5d`6v*rNs4-}sa8#q@EbYJmG$r@0VLYoD=NhK3hPEK8r9IU0xy!;t zH7vyBoEoN6dYfy`e1%&)etg?@4DHkvJp;PI>CQy{x_w9@tY%j2LqhI4xYCP|UAzXR zYqp|f?M|fTuSJ+M3lZ)d1lv=vBGQ3X5jISlwLmRQ(zQzutXs1YpMP-*hmIb{(G#De zv|`8U{rYuBw{EPUTeUzJo|l%l4%ro-Afafd zS`jc^#C_>)PA=YwxI#iR|F;$If{nlBZaRj-q8;$>I27|E(vZLVE4bDiLejbuNFg+* zt*16}pOBog{REN-%*oq7N9MNAkg@SFGS_^9yt3^mEnuUdpc2z3O+_pJHW)E^9O6=B zNJb(eUGa#Fi$|0#88K1mh>ggEBQ^tZvaEJk1O{~Ju9nbEoi+y5xrw+`kb>HxRMeDY z;pL7>{Ncf3{LS$s{Ew?ez^x7VPp8)7_bUtWHa!im(+SPl1Z7s{Zwl>rUFg6|Lc~iM z*GzDDQQ{^nvyxZ6&QW+-7K%4ztMFafN(Ih;!u`Ll494HABOE0!K;pRm*j-VI%BWBj zPZ@zH`Gk;M0ykm#d0ryH*^O5OuNQ>#XGH|$G8v8$Pe@L{D?-QfQo^(h+bD^{_oZ(9 zz;nJYipTebN%*NKn{b_^S0UteKUSpUhw@xpPKw}tyI|_z4(Q#p3nGJ8q3+Bn`V1>- z`h)T%%Gk6i6AeHrn^!$yzVcPJ?@cph{+BYXQTaySr0uI=W=eWr)mNS}&RgE8-%b6Z z>kn0j$X4o)TAwbKP0&gb;;=Vf&x8p-_ zCsoEX%iP2SKT4)nyGLleof3&#d^R@;)i-&3Q*F1zY! 
z?$@V3T6m@#9MH2Ta@_GqkBr9V*l2tau@vHzB>cf~LUU;p9x8I;47~pN4;3{3`0WoYTfd}j z2($?gk$sf7FV9$QJ)FUP#Ze;KvIa=czWQHpz4f<;o37F(uZZfzK50KPoT8rl1ZH{L zQjn8{!TtN8Z_jQ>O?0bSA>{4q)%rTmHB#Tx#t4`K$|5x@2EkZ{awsWarhUp*-Gf_i zkF>-^`<$2EJpL(-02zItwoI=AQt=1l2efb5>U2wb<$A+vl-HlN4CiS7(RI1;G5u|X z_BE-81)7C5FL7(2S%I_sFTY8-BIN?Fk-Gdc>rS;*fY0~}G@Bcn%+{dVI8sI`Fi#?U z&|J&NXw|cD&^?k;=twvMY?AR3k+Uj7n&>q+>t&e<%`8kB7;%-!Dx;-^A{pv%LP$w$w_Y%EJ!M3%737+VEzs+Uo|IOV>9LyN5UR%tV7rZMa3YfSz$mT7TG z9!OrBm9R84Gj2TIv^6JEOKb8R-6KNBVcgXl3Id|c^Q)G=6qpnMTIWj(>U;S8@KJSzy7`_ewulJ&HKTG~Qw z9gp#UWw$&}p5eu_G9TqB{{JQJu@Kx`REQ2OT4U_UDJWU}F*3^cAf@sU(ku5PKBo$a zS;Z)<*o30h+mMp8j<8&SaAz(e9GM7>jaOruSA;t-e#&gLm8qs$w8en_gRx`VPMke+ z9!HKG!`N{XF?7UeELk3c88eq*%Cv=;GM&&oeKrOS>W|Ky{m{Ohyb5lCk>jT$uXG1e z3%4V+l%QO;1L?)vkwIWiBQPfsmfb}k!%?&gaizNuTS%BE2nPixqGkI52uvzPCV|*h zbr^~3KBbN$VZ(96Z#sd5O`jrp(@Df{ISKdHlSt+NlFq{NPf%J!XwF}c!t4r67(Weu z0i7{%_8iy~Gt}yTp)MD~s3=um>)HJ+Ji^sbh7hdPt@ruy! zvWVbAaCj*qG`}pd;dM#0>Un-r9)qXlA%y6agp3sg*+BeB!AksP;d1<0WfYFBn25Zk zvvGJ`C9-FY#it>2)NAioInf%D^AhkZH;LdJj~9hb-iIJfSbo8{zATT&O9Jyt!tzTV ze_iat+mZx)LlF74B$@D>f**?*U;h8LG6`=O?$3oO_|=+H9FMo7zh4Wq4`_p-!vrP3hHEsGW zmXSApXDc@CLGHXF zKs$Yq&U3SE_&U?BJ{r8u^DgAXV`h&|YW(w*8H*6?$i)(YW?MG>K7r621-CsIE_)Ch z&T!b>vGfHelHwE5uX{iGvWDjF9Xevg%z1Dx55|soH;#ub!6gFonN^b!(z886h7ZPz zstmj;ASe@(Ul&EFcsJfH?o8PreedCkV*}i>U&kM+2r(em! 
zB{KVkEMg*1dzH|9;mlVolfu!vdl&TW)eUZE49&g{| zM;lgQvObq#7Y3LWGz&0me~^y^wFJ~PI4j=p`kBW+l>H0iG?qH<@da=CXybC+8-@aK za^mFMKzhVyJ7pA(U3&M}=H2{<SuPSpiYS%X5v~OQp?|2danJBWv1?tZCO3iVvE#VN2`H2g^p%TO$>cslcqzZ2fRl zK{G+n2hD^x3+Oe4W-C0a)dLx(sA+nClgxEi={z#ekonH|H-%=MK3~3n2%431d(tR9 z&*~aa-st;Tp;@OzK78^)GtX%P%SHy8jpM)fHbU;F(%IxX>1p;rvy63?IdHAeti^r$ zq>SGd{}#Z%DCuJ%dEu+ial5V-sj+snYvGTOASX)J93ZIfMS9f{q*i>QJ_1ZHSdGHU zO$6q0#Kh+zJT4cZ_AG?i(-7i;W?4deB?ZtUa%aKXDr6VX$x2pFTl)M^U=3&U-N8Ou}$_>2{@z;;qOi z*^ac5T}UY0iTD!6t#A+GO7_5BxC5ozPh#jtGtsH%XxIuiA!*Z5xYr#;;@V?`=3{Ve zIEuK9$KV!tUVjYEjRfg!pCPVtFXHpIql9TKE8K#zoVCbE$ic{AW6`_sKr9IiMyxv( z5zZ8Z5SWAGTnM)(BQ`PJ z@u$Z(;deK=@$Zf%5SXj*PqmxzM;l7;ZAJ#(q$J~Q2BA4O4zF2pD`>WWv-GwJG`}f| zCdA564OaF9hi3%@i`;O0m%SXn%9)2B^XK7V{t6V$9)c|?4s6d%M&`s}xJ6j_w!nec zIWY>Rp5?g-&4~mo0&)=nqSS%sWdvwK^9$K}p!tP>c2S&yFd5JMO;Hj7o#0GBc~_c* zS7izKz9bPp6{h0bb!Et1I8#l%)t2xwclHdN{qhS{R+zHDYs<86?LS(^->gz^@x0Q0 zCz9(;+4^4j)~F6K(oie2cGR+-sV5i!7!GMqj=g+9G=Re9XV0Bn;D5)>EXCZc)pn;io{Mr^MH1G)`x^y zlW@0)keOw}9m3=t!t(u02X5s$38G0@IIy>Z=1GL+AX^@mMCD_7Og2KI6A&9iXtsqQ z-Vp?sD-2F|4BQEENRCf}Ky%9$te&$+moCJo(IGexx*YrMk;q)KP}%#)(W7zf%TxFd z|M36dzxT7ZRay_y$QqeiUAA|b!L0(omzC3xHarpu_uhpp;G|GzdFo5tMymEzS>@&k^5HKcW z)-!$$v~dAq1Njs1#fN>@dG$ePmTSb1to5Sx zKBnH}bu3#n$2Em!vZ+8b%M&@@H++PCOdphCie{M9Zw1XpR%kY3h1EDq1fn|{?s0}oW%mY#Pmj`FL{|6;TV&dCKzBDwI^j>;lnX0*XTD{`8LZ^w7 znLWeY&!q8R2hCPs7J1^#=iF5CyA_)CP!>;qF+QeJQ}D;Fx1#jK*P4d^9njn~Kg@Mj zso^b3vhRahk^HaaIgfe&8=;wL_Ez3w;0JDK+foiqVBI8^`AM=8Ow1u!t6>mX0!8x^*&|I_&?&7_0hzfQi zea#VMG2R_J4aLBbvyifmAiVJqThsc3TbPI^TMO~W2Uin%L-6kp#RFH@;y-@64!cmn&oY(8rMwU^@(w0TpJDN^GL;?zo~C)=*xO> z69df}n6I)Nlvl}@>Q3SOc|tSGrZwv~pj|+K?z#2%!>WlRQ4zKTpJzEyUyy*u`FtL1 zbiLxUmFM-0jimcB?V8LxoDr#pZrn)@$F0OreCmk6_%8BBs0E^?jK!@2f@i9HH0Z$H zTo>+T+Eq`p3@-}wtVB3|>O!m}EH94C#j>azghVDGHaZ6Ju^~uw1jFqLB{WCC9q&Y< zD*?T`2yC}T+qP}dn$SFq{*)NyM3pNJ2clM?CS*2hV;8|SqCX}R@~-lp@7Oqd!N%gN z!f5(IEMDi?xKI0}9p1}kgRwFRkBegPATJh=SJm0X=Pg5V^!Q)eg6H=S$y#?CJ2%j5dYNU}>^s-5s`G%7oqgJ(q%ujkPR%syx~g$EzBBu1dQ 
zPFgAg%|?uzh3*YBt1+|+vaK0m;>5@*XckH4$;0vWGV6T{(5Xz4jFF6#u(X<{rK#i` z8P9D#gjD0adA-qZ3(X3kn6BR_Z+xu}ROC74sT{0o1k8jQC2vndrE+n?SQlH{)BwXKjUMS;kZ*n3wsPFY^D3Om8)1 zkB&r_fKFJlFc5_$+mK!Q39?onM)unMuxFGbIgh}cUk*o7ChUafC|4#T-dQLwXJ_=j0MR((qkitzfV)QN|c3Ak65f?LIfgojc*O;5qI zgeAC{Fdm<}$0B#x2vpj`uszw0gt0^LY2<#HJ}?IM_KXqJ0QPqP5BfoAE+{7&GxG?7Ziw*|@gX?+ED+Je!gtt_wI0v`?N zkCO+E;`-(5YP`M73SnjoP-Tgy9#LZ|Ju*J$^*v8Htjc0fS)$4l`kN`I8rLI~s@S!B zl`+dT19#_y@E2>xEtcGZKf!PPmR$%s)fpSlV`5rXWHyev(Eqzgyif@YSm zP9;a-K}jlJtElQ zk+9FkVF3YIpgEV&%tqdGHUMPWIT^40C_4g=*!X+Gc)rMu$5Z;sQ~rKAB@9W!yJ7Fr z$;w`z$_IaHQIcpj6s55!)7P;vD(C0ec|GsLM#j^;cvPp_kTz);W_D_c*#o-3wPXe= zV?xzT9~&0W$H|y*T#Sptxmbd8^a_0H3`J4kGWF4~lyCKQ*YR)v>@V?GfBEOQd-IyM zMd1mlHLf#?%cS+x=vIi9Y0EG&7V#OZ?6gKaX!1nR?OGW120*7bR?Sjqy*bOxV8g4KJErYF`-0U zMN2@~CvPD}`QVKAk;tfXOeSclmW#KL^YtuWVS%Q{m&r1wdcP6Wc```$ytdGP-nG)p+r`>WF~ zGBE4~XKDCQvf7XpnrjHV2AVZ6H|_biG+bqv6bsur0p_!{N~WjT+^;b-GagFvKkp@C z9IC1FQgGDQV*kF~=+asDI4xbW3Pt78qq|*wXcv)Kj1|$zh>XvGBRLy(LUU|F1|pql z2)Cyq%$|Y}LUT~G3(Ldogytx8>pK)JS`wIB`>RGzuA84#T;eEKHjWN$!DdKp$N4MSf-bKt6Qghj<6G{TM0a5qA@9~|wZ z5(v>5gyu{*V-k=M7KYhE4ju3%$9Odm4B3hX6a=XP?kzqwi=>& z9(~a(lNL(iuaC<)41?!NFY^_qO?@B2My+~{TzyJG^TnDk&~T*&>o#qGUwio|uQl4X zrf6&alnf8(PEG3E9SSdqtPnvEBQ1Y&~W!z?@Qq(!P>8g&^i*d~ls zem|j4XVm8<;c<#XEq)?X(#g`*_cI92#p!TQo2bqkJ!v+Ug(hQRP&$?dXCN@hg{Y87 zI3q%k z-r@It`aQgS@{l%jR(~v~09k<1V>g}#l^2>RfoSu)v8`W{w(DBe%WTFN>z?K-IQ7{H zq$b2+zXE2~Y4oAh<;55}s4x2U?gmFp1Wtec8SdP=jcUf@BI8!gI^_cE5dFVQQA+z{ zouN2S{E`hwEsuXSlK36Z@%pazOXHuqZWQ^TSvL@PjW^5-RxJ{_#v(NfhBrgB)W?9_`Z7SrMTzJ83O9W(YcF$CH3d77Tx%^Te_8LRS)}qv z9y%X@DYO+xd4No0jNxAe&3qs}<1y0E>wl!&DMN2HXv#W%~nZX>LEIfB^8JH(%g4k3{>Y;<1x4lLsba8pHy-tosg7FMNS!4eM4(x)nIHAkZHez)aJQ5=a%_B!m!k~{PVA6~wm_BbM zrp#J`sWTR0>Xf;dIDRGu3>XQ&fL@q5XBm=<)*xr?4!HBzVDOk3j9YtFz8x@N2;n*0 ziKN^rWRz}0O3^kX=WQcMe~hw<{TMWG8t>f$t8AIb+4Ts>~d_sY8{<1_%Dl z&I0`P&NcWkzX0E472-`+HY@)$yvk0-%WML2u1p6`30d_=45_ZN~kDy;B^t@f#r8bMJ}bc@?jv)li4(WB%FP-u8j8! 
zS3^@;wrYW{J-T4uzJ0iLFK#`ARLS)~u#q>0aA*?E=uFTX*#6 z*a;n5w?R8XsR8JLty>ZRd!S(6OdO*v+$v1Ri#7RpT#|-c*)mO?4VP$Bg{u}T@E*{i zH9pU95|~}MmC4`p?YK!0y_*?@uS+wKJez*S^F~jYiKRhqELahb#VZo9a)lisD??!q z4Msw22-4h@{iIMl$qdGe+z9>_jdzUWo6I1* zO%237$5O1D)F0=9X5ps-HaOWhl-Xk5$eS8ALSGgo<3&Xl9+qd~L|ho+hWEvyZfy`e zpc~5P&cq3~6UUMr*zF3z_L!yExO^(ka{p>VBpU32XmAGMtbGN}B!pp8*b)qEr&j<> z%__pZJ5O=5;Tmq#*Qn{+WX6oEVh2(W(pJ>$Evy?ZvE06Lz7FyM+7*`NQs?@BQrmA+ zd2NpSR+bT)l^HN(HjHb8=<}z(Bou$@>ztud;@QYwu>`xeZesnaSLQu`TIv89b6w|E z^#yJjmy5LDT8WF)_rh|@H)N~*%f}PCvD&OGaki>yTAH$*I!0*L;rKdO4KteKQchU! zuj=OyK(nZU(!i|C6D5!TsUcJn9}~W$jn7X(vq|i4hGt0&GpY$Zo0hi;R7v((5L)G% z^}>k9l+3+!2gh_cF^?(!n@-q+4ztV^oz4 zpF6io%)6%0EU!={FO?yAWTtLX&p_q5HheKo0Zav%7HHPf&G39v$x$ZGR#>#cvRXCp z{{%EMA1_K?(ot%d-v*dv+BGXQtA>lrPVrkpvszez`K{AzpjpE6L9-rr>;YLxvwCBo zpQDjZhor%Tqr*~p=&^4t0cHtT!uLtWP}j&kByC=3W`2kmw;I}@OhI>1wgS>JB!t&o zWFze2nbWv_sSdk$Y)0QcJyZoCmE5fPOORW#7P0P3HBNbPa1<5>MiQRGuqwiVFoJTl zGZm4PK=aB-Cl!b33s%CvQ%|&zUS?L3ojP_VG!Mt<(GxIo%w!B3GYu1`FD5WA!;~3| zF@5?%%$znK6UR*_pne3uF8wes#DR1Iv#WFylB;$gqiQ<>39UW*3}f8<8FokX9WtJA zu^}$40+~fyv1Y?TWM!;I$94k{&~Y%_x$BWmAhr>j<2IduYr}E4HXes_^HJEh9D#G| zQMfiAL(Eq0?>vdvHAk^LVFSY5>k#WILr|~_y?PHpzX8J$9O*_-R1yM1;}Jngy?vJ#mcU25Z&{hBw=6_^}*4wb?t_uOJ?BvqBOiqx8rSQES_b{ zSWuf<#6k5k7Z9YSr@73A7c#wH;F9v*`FOP5hw-GG}Hui+Xiv>#BfSL7Kg8~BnF8e&Lr0N-~@4YYm zZHG_twyyD*W}Zi1R#R;?k}f07+_23;vu9|d{zm94mPcRW+Ld|~7Z^4;{Mxi5JhxKt+_z;5Ola?q=n+G(HFPOzauad8Gz+(j z)6kHgiXCxL=;!ajPZb?rwfm3GUcvzH^lDDekjjct0^1PcnjN z$Dw$Y6+ycW$F~{5c%85eUptqfYGOZ}3Z8@SiWBi7--#DFYzWFqn`PN}Qkse5jvz!2 z8-SS|TVZ-9KP1kafTPK7)FrrZ+8K#s@e$Y>z7RVDXQRd)j@yYbxNKjAtL`AwyO*Il zZYjP>i-m2U@)VV9R?%an+5Uwm>_f^lS};XF{{Q zrO-CT-;GT-BlDY=V<-)@+^0`nAv8-puRoUK{XJ}V_lAsfK8KUX4&(Tt12}c^B(`CGytK;n zqL_TJegSVys~RiBba^W|PkfE6X>BS4%>P1mUa%6E{9`lViHXrI4>1V8mV)$U$$b2N|%Rku~>HJjpF!}9?w;uK( z>291JBWpV4wc7=Tb3xAMe)So-Up4Q4UiD5~yHtye^klSa?}t`x+OUFbg-KK9A|t01 z?vxxg{9iSYmIg&3lrR}#OG1b(5rHxBSWcLnI(I4Bb?J$g(i_&Y6KyF zp=Fy67&UGR!eW@d$YiVtbt5#)g{UwaqQmWo4Cj7CGVEdT$c%`EWBhQ0bZUW&K^<^D 
zEeLh1-MCPZgZj!MJlvFr@AszT&5i{8yt@Q{`tdde%im^Z;@jLzyvrp}6OP{ylx6me zH+dqr`u}xaB3|Yv;bnpJG!vxDQ}Jd^GM=xt;eJI7Zk4-mxik%*raG`LY$?`72V#fI zj_g@;5fR{roAKfJk=MUXjm4Wxg0es}VfaabLyZ%ad2wG93pi7S(P}K|D}uDlTJVlg zEh`hslHT7EK)x-cik$eS#7Vf}amMGn%0le1hoOgGOSJWCfj&LD;lSQ~xN_|}E8k1F za_*8Bnx&^%DtM99CH1VJMDj7JlwD0f3jEE`tgmZSmRrhtU)d%58k()cF5W9AJ>{Cz zDdHch@!F)WG160SYBl$}$b`{YOefQ3D)9H9nH9eJ4oK$9mDvnxPBX02YHVs=b`EX4 zrTV~j;DDi+K6Msm&6tOIvle3Zv;~+nX$}St9)nJudhl5gp4$+f3C^;t`+!z0u&`%m zV!_-6SiWchB7#;R*=i_usv)MCbI1L!WM*=CCPZ~f#w&4<`+3Kq(VdU!_+7| zN)E--Oasl~_$G}?55cR%Rk%X{FPbz6M?)9kNl_}pq1~^}$CENPD&r!NF=jBPvr#v% zTL+}in}RC(%WhXJ_S++HAaEWw&KZs3$%C+c$rRMYh2dU8ENSTPpsyol)DxaSk_R0UXJxTq*GMsh3hGs22JVR{7 zl=H^^8cWr^7Q6O}ek8W5-U5padwfB*!gkjhYH|A-q4?Y>9NxDJCl2q&*)P6eJZt!y zjJv)Zor!=nG6+Kz}XAT5+M=K^AvDK>OKr#|0eQfp4{(4GUr$`U$K5g(!{h^oAii0&}>bsf@Yp~PV$*?z96sZXiqY8 zh%_u@p_ebe`U0cIjAmuqUJa$_)oUmwPMVE)LbEF^4@(J(i-KdYU}XdrE)BurWuaIW z6r*~WR}q?*gvMg#!sUeK?r71f4O+Esr7Bwaa(=|9v6wh@4#rNNi%GMWV*)jK?s80@ zvjjtjj^e%gq33`h=<(4w#FcD7;<`@=&AZ{QBs`bxg0o}`5(v*3t9K$He=TM$i$>Sp z@^MmEDggc+dJ`1-aK9ZUOkRqTZ6}bp@hD>CM} zvN9m=^-7?*APUb5qZocPUMpaZB{16=FN(k{`kvQ(PiX$0(EMGogYmH8TLS#EHAM&? zGYoABXRZ8OGJir@A+r)@y-`HfZPwbm-OThGyhP;cadVZ94|9)57p1 zDFoLE`OZ=OQL%guUhdd{2NlIQMp#apHX4)Lx5fklbL{Z`Y&1lpIx`j3*=hJZH4c?a zrz2^6FBDGbi(|{CW`}VSzmL7ww zSxigI%B<%!REj`S=W)lt#|O=xZe&WwWvn12bH zef6b>jhJrZuUbBz(ckJ=J6DZQO4yPYnw8AkR})sV;<*Nz^dhQ!&Hh?S>Uz>fS_X-`ADYCUbC?=@U17hNMPGiY&Ij>3K)J(>T)IH?E zE5j1WKCe@7EOTZyhGioI&63eN(*>X#8(33W)A5Tv&ED3HMV7F=`mLbZ@{rzp$Sgbw zS-#i!2A<_rJH>}85`Y${Q7h(oLCf?^ilOLxNj$81`@ag!3J9sD*O|(~XP`_U@829W z>((6eh-pwzt3a=@ehHd2@HPhH=EZn$EAVJt7)6Dlw(=a9XX;$$v4~FS6`!XX>&rA) z7dP=ff3KJ>Z=QL?_toVPkXj! 
zcwdQ&R=r5fQrw{Qc3k zQx}YwF&{})JCLw;FA`SoM`Fc(#8bA_dl6Hz9k%?9NG#rh?8=XkS-OqDo`NAGXP|9+ z8CoKXqO`)eDNB*P_5cz$9YyTMV{mN#OhL1A(`Sg?avaezY-8&QLi2GrwjLlZb`x4;$^g(cB1Z781R038+#jAO!BO+{s=2%1%praxa5g(I)#HC9S zIjAd~JzL=5s?oT;+K#&Q@u=CFjBDHD@%-Z?{Jbv@zw^lk{O<0L@jNdNKeEF9IV&DN zG#lQg5I8dIctH5LPgs6L2!25*el251 z3C#k|uc+4r(RiEBION;#1Ht)60`re~aoi^~Gd|xHQpI+{y$#>2&cH!iAU+~2w{Fz} z{f7*|!2|o%M@e-o&#PIkHB?_F{IcAjpQ$pK^@N!N*$d4g`jRR~ER~!<<%ghIl}Rk$ zO!@7t&%9;5r~Xm$)GbP$jX__I|Ek7@WIG&9V96*O1Nm;AEG z!u!yydMKG6BC|@Hl<8mJzJ-kRObyL`{unrT5GGBUOlY2sxzpxh{?vt7IBfwIPM?o? zgy@;mW?{_u3G}`GXy2hd1`ZsA+0$oY^5}8s(Y~V^BGIi)OY~{mk?_|B0sd_eF>5k5 zL$Knhj^$Xf*`tp@(rjCyOY?O|ffJJi`VabA}Sh{#IRxBf|&zpsb`F3n{c4*xJZP|cs!Lp!L>(*%BzLWYmcwm<2TR49CgD7>w}@W$VzoB`ONC30voJ?!uR9 zuK69>!W6&nZK!eT+X45XuI-l#b$+FuGGrMe==K!NIEHb z-1Fu}i#{ML0(yKxc<&l@qoD@-cCJTU^hy-xBw^e73T$0ph2sYgsq2(o(kE(p9|3z-OG0;q=uuS6cJ~Zq7=7zWWn1{bBev&i^vl)KV(!MEa6}Ix!cj`KUWvduJCCMKx z`Cm;POaBqc67{OyZz>Nj)QeA_WBp9OR%6X2oh*+UJkWfZ`SoFFCP2!v+9s2=OvZ3u z3MEqjdB&mXEdiF9eTiZC09v;4yF@|~&udlTq+8ETr#_Sa^+$smMAeuuKC~-@;Vad@ z37X|x$#{7HA_+r36xE~cmE2#_pd~BW_@LQrwNwj7J+LNHVK3FM{`mj~~m!CCT~=bz(m4Oeeq{pRiXX!uwxn7`D4=QMOR_#MvF|el=ne zR%2O^8?F30VcevNh_VxyBkXEE+EtMP%}#`~aS$EhL=^KPiXa^y6$i(hSqSYD0C(>e zIBlPShil?+b!`H!Z%xGA9dUTQJCVR#j6d4H1K+J(kEac+#! 
zSlkH@!|vJR5#Ff{&Nu?`LrJn4mhmhl2G6pbgbxQE5`v%R$ElS8U(49hLV|OlOz{~jo+oitz0Spncu<;(u;KmCkpR)IT^l$YPWAEMrHia^ zSrJN)RShK{7Kx3jo;Q}k^6{M#>kpBPE0$Nx$`6(A(g&4&3p9IVpqYLw3k&HtmxN1R ztC#l`d4_;|5Sl&5q~1~Wkd&3;!y26R9PHvdru>qv-j}kkDL6~rBlVV4yaF9YdI2y4 z&5|yk)GQq0*G!uNb*9+|%>*6B=_>1^YfO(w>Ly9sgS&T-o|>j$t##Wr=+vb%1`ip6 zNo?rOAt=wCI$r_v!l`pGYr-_lm^2O3Cr=|-PeK0y1JSO12YfVQ6lTv`#K!St^cmO( zox1wNkG|TXHNm!RYc*Am0qwZ>RByvB9!h+b^e%5S}UbjQr8(=nSsE=$Z$ z7&jh6KN^bOeR`{DGTB7A4|a znW8b9jX`b;XN^OD{}$-N#y}4?;$*z`6owl)dIU-q&&MaW2<)?kV&n2TD4g^Wb}pEN zn@O@lW)ki&+SII_K z$v{_Y>S-h7A+dFVWyNGXkM^$`gWQ*I2gJsu&bWD{0Y?sg%m(caTsZqV&VKnRkBj}w z>VYy%Fz-jz($?z8elq62aZfYjBD}`mu6T5rX>H7PV%Pe9veg`<(`2OiNz!JZS++i& z)4nRFp=T>4r#4b&wbqv{$6&Gl;9DznU}dT^e?%D|V*&nc1*&#b_#UPDQy ztAayX`79imF;`{$8w*_-RNokwb=bbb(DRU(NhXu6^4?3O!6IgmHLS*AzOSah+~}Bk zm_DOuPI6x3|1|`f|E)l1?Ird0mKuok$Uw8m8Yi9CbQVdACq3SHdd_Jso!TKN^S^8v z)-@Wkykhb~q8P1-vlu4xzM8toXD3@VI|H}g>6v(*fo74f()08jQT68teN>$X9+d>< zq!*TOX@(i-{4qVTzU!@OI*rUPiqB0IU=l~}8;RYhkCCWG;B3i5lOL)l-eY5`kl^(S z)YUW9@(z>m!m>!GPc?ufy#!?~7P?aKQaYUy7VTInZ0YHeB|qupIv)wwXHWCDQxx}k zPf3p~AYql9TThwB)&=J68C9T2J}@r){{jo#3zx1E2JM(Md^}vikw}aT$EwA1FrMHz zZ1h<4?l%~nI`<(|PsIEMK?n?XV9Cm8%vcbJSp?@9i-RzgfIV#DG_()shF+a}p>x{+ zwDo%=eDYwF)G2<{}VIZc=3dGd8p_sNH6eB0hLc5OL(5^!lv}n^FGnNFQVB>7d+r9r7jHvC(GEB>w;^NA z31n_Rg@i3%Ady;?z7b;=yD&E@2aXDcv-wj*6QD!49D!}e2{_gq#42|MBAkU7KV>1> zwC{vPD^?)V?o?CFg@qCh3C_V$Hg2W2ISCQ`KU!wZ;IWv=6A;v;4KfC_$JNy3c)FTU zyE-0^Hzwf8=6HO!I~%{=TY-PGdm}6OEWDBx0CQ6DrXZeRY{R>vaKdsJ-j#>qE%l}% z46n;03CA&N6+l_aS}$3)N5D0( zGaeS{IhCK~IPiqP@i;3EPcmJ2MnHH;V1JQIa4sMy$l?(MVu9sv@?>krYk}kf2Yx0X z|B%Du#dbVbJ>hZqDK{S9ROVxM^h)$6ocOnGiGhO#;pDL|aJ&8%ZL5LyT+1?{me4HI zTFKiSHXM}%Fj-$nr-$0I!c!8!tfp;B$AMIQ;`6j&Wvkkj)p$e?Y^pMzes_%(yXY!q zrr>I}e4|VUkN)t`cU;OVT{cSvEpPs`KbbdWa(_Jxi@s=qW+SP1%`7z{>8+EGi|V;= z>LagMc4`RFX;bwcuQ!ra0I%`d>x`G^2J2U;FYn*JiR5^fhGtn|udN>fI&?zMo_#Tb zjon$(7ho=-dH%F{m^o=CW=@%jxijZr-rRZk=%bGiARptk^GDb2-O;yyfAsF#i~F77 z-`*cBTeYH}wm}<)*@n>DmT=pKYAgFK*!UGlY~2EF+qFfvuHDePTW@sj&`pi4Rv-QP 
z2cTU*NBFVf>&J$&^kNU`*a1N^rXiB$h&^yVCJybRhBS0!1G;DD4oH|g9j9H9I3K$l zUnPbhuzzMks(DnQMBV2H9z>zWy$!KjW*G{Jb`8> zuCOr}KCU0ScaVFuMPJ5eZqI>8o;Dkm%a>!#()q}rF$QtNx+8sjKOBf!h}#7&Jd(M} z)1&b?Ba&O*GbIX-Q=@U8`(M+q9wx5BL)R)?A>e`dR)3ZSK6pdccz~=ZJXt|%vWxp`O;b1h=~9DAWqk<$~Kh!dF+E*(N*qyd5U_U zXT(;-2Xrs-)dtzp$9T>K)-j)d^%;(zIEGVSeTiM$b|5=F1BdqQSN^AZnrUaMAuABg z>#fkN>OR`#Rob)~v(4=_ZiV5zlLdI{Ns+N}tMqRS&nsQ0HkB`o#@8E}JQZJ6TNRf4 zuYdP4)d!@wiqA{lFw5$=dQ3R;Qu34O7yZvev-e?$47|#?+PIJ)QIAeW6UPJ0kp(xgAa@uZLT zsDasMPsWy%tx3MnNmaf`-g)d<+n?te$piCDy}&Bqtl`;wz-ORYW=S#7Tzl~p8zXE) z5SsNfmiQ9_Ji2^Aw!9zHe2MvV>AYFx$dWdT)O3F`TZW1Q7>7$|Sm?395lCsc zb{z+g9L2~!1CX|0IqJ&RpgJcHTLM=ha`ZTi>)HbYI(A0)Ze7u(XKyw3c-+KUSiB?* zi=V2JPAPVD_ zIWRdg6OkJZA&S2_HlIK&uM3VVLU?QjL1GYk_8ExqXdA*~;t0u!1m{!&a{_{690-ew z=WnU}Egdno6u2T|v1;T;Skb8^ipTWC-GV4Qtgz#Ll^c&XBq(tHd0#R9a9QM82|PdJR$%%W z!+1tue#S6g$)ALTv->s-ES70CO$EI14mOT4lkE;=FA1?%4yqdi5k2c0vb2b36YI=)y*7 zzrI5;cEkkCm^=%!rq034sk5+T(NfHqHUr%W%3bNdJ$i84wJUABD>`@WLci;TPF=d8 zQ>Sj|+NCFfxEBK2cR{;$ozS*zJIEU&KQ$J)E&ST~p?&*y=tLOp%tmE~z?1C^-$QBO;L#8NvqnUFYfhPRx|C_`)55voXtY zE-?}jLwmwDaWHPtW*^e#9u^W_^C*I{^fEumj#f)uzsrxs_cEToh&C#3VF)HlIlZWC{`wCpnw5z4%pJvg& z(qr+E(EKDV8qZSgc$j9#&5S7A$&SR`#6UcB1`(p;ur*{UddS#)rn4Oz5M`x>s{Foq z`b(Cxr?qX;ZVl7Os&v}6Sk}^Z1eyhsrC!qC{AhWhnYN&8$737TGW`mFzsBpDLbG|p zuWeuV-s*KNcu$E7f0z2?!kN!_o=ji+1ybEkESfbFJ2q}qwktzoWk^PST`j>`H)?&p zscl=*p_xs8sCt#@6Um3Gs>6xDH|<0;^`NfHJn542cu&36vifVI*|{uXy)XG)K-oy2 zZ@s6;)O}V7M+vV8w+J|EXl8z^{M2e5npJ`01LWpp3Lz~ilr+HV6_i9gW(8&|G)o3B zA!;m}JYW?u8j?g#V7WPHrg0f)_QA4|E)-4XnvD0tibYmveh-{k5H(JhFU}v5d5BH* z{)23+rsrG3liwRXTpGAlrtp35IAsmrgsBQ#8o7aHbI<1DVU-zzVPySHIfGnJr!_M7 zHt`XoQ&XAI2vu4bXLC(+*Gr3Bedt5*lOEdEQD_dhv44-M!JxpW$9&{#_P$ z{v~Lptk7J?eLW4JfD6Iq$`xF>cL($5%|yhENw~LVBi^ho$J5jlJWWeQeL^BOELn{3 z;X^T|XJ_>5-Vt3`A$9KB9RmlAz|`rBuy9#87A}v(oP|MZ9_0W+a}O$T+I+a?FT={w z6EL__7d2F4(C{&sxi|>Z7X)G2ypxC{|yCEQe;SU%NS85@O)_;P$jr);C zcurpXF_JgzLCThcNZ523@r34tjmHSnhvBZ=3uo!aaFp$VvtmEustzJ~({W_({t7t< z&Y3;A|fFlTlRec|9)f8cJO4Z$X*NkhGVd;K8T2n^@vI=!lGr7goRF+ 
zGG!(rV_fRPztEUu1VyAEFghNAvGRYSg6nX5CSsff=g>&Z8`uxaJGaK#*~9R4xdV5~ zZMeHGo^s>yrX>9C{&IZ3AsOQzu@_FpT0byeXv!%c7T+5qMD% zrJ(s)nGMeg&CdzVvb?ZBvs&3K+m5H1HoQoU#mfX%^iC%(Iuo!fECR8E`r>>dA-5 zBEF*~{wAVdF-=XShGsDwsf+Y5FA>9dPv#AhzK=>9)9Qt0mOoM_s(RTYnZoQk!CA_f zQ=fl|y?Z{!f&Kds92|_ngNI_^z@Y@)u4vzmjavHMfIdSocH{(1qYW>Zw+Mp=4klQ) zQ$55zy7!=ZV)(ETwDB33Jb5NH3)AIe!dZ(jd*&j{m_8RHhK)t{uD#K*Bf+{;XEk+X zk4|0DvvW6eLXMC@_ea5B}6Q;G5T)De$uY-r}i zL}O^*erVm&pK z3Mr%d<7nh^Tuyi3N=huQrNrP)Y78DEN8)j6lv)AtK?WP48BW~Gkh!d*aXlj(_tPTq zwL1(qQd~HY;J{F(S9+QI_U?s)`}gSb``l^T?^#v1sbM~psuQFhQO{ZcmNsGm;QIW;}s8rFLcmk-c@@-N96wl#iMXg04(ekm?S-U8brGwqVO zPEF+^@uM{K@_2Lc=DxX)JWFruvq^Y9xFiINJRto3q90A6S&UtuZ|-fR!gvM}cI5s;uJYfoE}a^>L)+ zfwiaE0?qQlLfr+%%hKEI?LC(Qi|6^~oormdisYDqW|L1cKE8(jRCD<{9=v{o?7|#O zA3G4&_pQgPt;KkrWyg1{I9?`1;eL`04en?hj}F4-&}E35H3?&Tc0td8cC0wtp+kqR z7(8S&X3buT6$H*c{YId5Kv(qW+6(bZL$D^qhJty^uPnfhBu%_&C< z|DI?+Xgn6i<-%6BACdXH5S3O5M{+(!j+}`0{vENHupb?rKyXe$P?StLN7)jw$`+3h zTMEJm%;An~M7uH(8W@6UeR^V9H$QA$HW9bV9Jsxj+7ORhYh8G~BNxB-NjaVurQk_Q zJl?V~@;WsMuQO!TKLOSVyebaItCBFhE(<4+hT{dH`FRB)R;|+Kz%xSgGeWZrj}T~n zk;g5e`DGd*I)MP~;Qeeae36ohl-V0skkg z%ZGqZ3j~@8*97Gk3YujFwOG7i_}}F^6*RvqvNL|tt0yZ1#xtJDsLyj@X1}iJNMP>N zt~JurQ*qAQctW?SdnnmWrRk3g7qm;b17Jv|V{tz^q#;Vj$ zdMxF?2AUZ@uaS-$GX%>k@m*DhXepm<1kJi(uHzy8*F=&QPaX6gD4UOqEguxXZI=&tT4+IhZ(c62ilyF>%5))zjSGzbo3d3P4Bt;DCNZ zFr0oD(5?gg+W4b0VYh21f^)Z?gyK~|!AIS@!LLOt__gYQ0Ke|&MITF;Hx*wdhvHQ1a-1PFCrloJ<$b&0 zN`8|108oJWaW=s@D+c#-WAPv-hISmz|3hi}p?H?d{UX|ZNg7VE5t_VU2}TbZiq1WH zphKq)=*GAW3}}H>qx)e~_#&K6io_K*Ft4&)yO|t|TglOc<|u-4H121}LN2km%d+pT zEF+%D24+Sq?(z4pQ=@T>=YJBh3X2E#M>l_&lCuTvd4?=eAO1!&!8P_g+$=^>YK9Y~&^~@O!^i3JkL71jLX^HJIo>Gr9J@VE=*MIWH z$3U}B@7H}INt@(>_7hWQ{zftHnqvqZrfeJMo%NoL&wXE#&xT!0PqTq$;ly8tX3qm? 
zPNooQD)WG3-xOLRHO|u%R+KlxO`+9H#`wNm7TB^vb7LSk@`i8CNb|5fG7nIN&qA_s zZ}a~?Xcn1=BWmvd<|My0e&7_Hs!{KY*J$N6I)4{1RSxJ2*BU=-;NEr;SqIJ`_uzRHAFoFF@v~Tc@Fc2s9YDA(0Uf&bMf+|8 z;n!;zmd544QSu3*a<;*qwHgu5MD*=900VmW$CA0r5V9f`5#fmliAX?bY!X(+#AAg$ z3BmRZggSB%>dZnU)3jeSjrAWZJAQrE9{2hV%8=mu;_mzyZ%`}kRkM7($ny3g%YFH%x&BNw8^pezW&_M3Df7&N4{CdszRYu*gJxB( z(0&y}3QT&+bl;m%Dc5zKVJ#OML$k-Xm4EB{M`ZPHB{tL>0kda#hAP)Q@og-LCwW)b$#Z1R`74(juxZlBCp>->n54j!u+6KVCZF_=kC*_mfx^zdM?tSqQp?K!Bd04tAj19yP%$~6X z<3>*3+BO??Igxmr6-65sV2;Io?%zod!vmQqBin|rixN=J2HZ9W z%cF^7FuF@;bZpxit^8V|eL!0b>l%QRS>v$Z8H}?j(P&5(SdPMt_;B3j|F;Rn*U}W9mV#-xzt)s?%nzqq+6esiy8UVKpo7J+*vc`oi@||7Pyp z^tz_<@{~!V1(<&sni*hIU=o>Q*1|`N55^Qg5;%{UR*wq9TqBY7M8=x&&1MXxz**f} z!Vze$V>kjc)_XTr<4loE6fp2e{-=Ch|B_y#_kmf|+&zq}_p++FhhbIIbIrbf=<<*) z51r|k7M}rT8HDcxX8~!?L$-utjicnhc^2k(IoB9jb8(XEtj|w9L&n1k&2p{Bkwt14 zgva5nj^7lRRTwJo-rEEt7vf$ZV`ez}y6s8;j>i z{7j|lac(ORi_Ew+-NJ7CTO)bWsBKc>TI1Toh522LuMT~Na#t#Lxm@^m?`AwNb>ZhK8$l@u zPcxR{VcJUEO$frR_z+xkg`(aWidshm&O2i9xx<0YAt8vFI3B~hbWpG8<%1#rcI^qJ zozb~tJESaHic|5KI2jm=Ju5?zF>f9g{eQImcYhSu+UAP`$^n5u0s$f;a!wM;Ifqux zISWK|wsFo8Y>WdrDGLz|28^@4_w(%i%seymoB5qra9+lBu6wOoUDd6|_MSQCkLy$2 z)m1Ciz3%&4t5>aB7l-8xZh;Iu!GU264F&o0G`nZXF2vjDTgq7b@xEehKYq2bUwsOmY6rv1+#v#>u~$!R(x~3761LCr}68t4m@rrbapi3Z|%+at)m{lcFP;j1e(k6i>#4+s2sl@ ztspcLVxOv{YVhN+I(1!sI?Sz_&rHbW@jpGbW0)IRtqsuHW z^ZkvI*Jl5N!2FLReYna$!-Dbr;2}JixWynmrhL`V$W>h7ji-U;(M!Y1 z-rQ@6({2?o)BX)Gi_D4wvd=w8ApVMS+c!V!IhFkEfM)#`HueYlcc)Jj$%9PBhpG`k z_HFO@BmGsD1Jl#IDCswHqp8H-rSGWUq&#+CG?M8V>Y+gbh_@g3GiWv=Z)%RVlt=os zl*f`ljhO}}(EQEqn`murhQFUbii=C(Dz8Fbektewz9s+ML@I@a zgy~Uc_OZp$OB6J(ToQ++3*)e0{xU>H#9(&lJT*cc&b}BD5{}@IP_@Roe}F#%f&$@B zXbzyFc*7gBaEY2y6&M&mXr^cj8F8y|;i;pzeDo+@XsE*86^juui+({K6!o8h<`A(Y)H`>aWRwZur@jIdUaVPT=FzUxaC4k#jj}J& zt_1E!6$6bizcJeJDD6%94D)cao6QY+1ej$Ld!@W>-q^OTn~u>smT{|&`3umj@^I!U zX_z#1x}FT+=u?#UdCu=5Sq{)$Kk*xEGbPKfJYi0=0F#XD^0R^^4NM}l&(i@hL2GON z=916mSe1r#jw-0gDwGz_@zpq%bly@1S^i9*SuUb8e^2Sk>#e>prZH`q#**WGfwEKH 
z=^N>MSbiTD;*aD!)*Nqt))!r0Y%TKujQ1$BarmZFI@58M(|HX{e%@{2T`uQ&?(^Ud3P8s8#HaHhji3d zuCWH>CjsdVCAo*aDJk%xON#p;c6lXZm{tu4ezRUw95g}7E$f{RsEI7xs$Qjmd+ z^~Tlx{bb`t|u5Y{K<;I&hiu|2B_!ArIxr(qC{ zPraoc5^Q+oQ*^#}9!Ec$#M7s)_cAZ&I7(vve$lv#$N^m62-xlou>iPF+~-pL0;W1^Or|>gE&lEH_K-lV3uhZ&e2*oG}E^AzipPka@e=r zAi76j9nh?%5?cG5K2c<^&cQr=1ZmZGO`oPe(LRlR((*TUAJ($_r7iWWgjGHPUE3#U zIl)=J3!p1MC_Y2JJIf-+)30cWFEHt_?y`<7<95QH-3o-uT@|RPs6k-7DO&#It1kqnYuy94vj!q zXt)~14)FI^4*?1^%lzdq-rTOBDsLqK0RQw!L_t*Bum!sb<-2zzsOcL)1Yjvg$Sgms zTd)ux96F4y{Cq5#t=EBGFmo1G&z+5ul%43QEy0$Rv6vegf^{*`c)Ezt?8?Q}!ZiAl zd>q=h1=&#%xHZs%pJYlA;q|+I-rUiSZ&TkKYQ_k+@6{Bcd-rziiJFH+LH?LeU$bGh zKdQGa#d{qtbY|?rx}aHT+OU>4l`Z(LyAk)=E2uJjL)~mG#huOy!ZG^+`>Wh|KI~;b z>|%cH#dy>$$5t@kR-A9CKmZ)hYohUtMvhG$_+~ zP-e*yPe8HdY(Q^$r|V13<4szBRnBw0^+(Vw4&F@BFaYg6J;Ry4DXY8vB@O=mr)^=K zC-CKQ{rH<%g2`7pMyBJ7Ovm<^k|BogeVtXOveOiYV}Q=gT{be)Hk6o$KHgVnv1Q8a zOU|}ZpB>WQx!*Vek%uF&(=m=qWFN!h`Q6mZmR&Y^08qoSHOefEs%s(Fg_6+>X-_GSj7%gX>{UVF`d6Rh|9)@}_mAzzEyD4)Ejf79 znT-b>1m@;kJZLTPXL__CrHUo}?ZquMeY&d)^V zhE-S=9)u{S-?L{shMQY(rLY8F6PQ0P%*Sg5c{r4ug6hp%k+5VbmWG5Qd}e?eb(YmL z{Db*@&H`-Tn})`&!{~qh9X$2cNgRFs9kd*J2DJlEk%})_9^zCxQb(+O%Rx;@C>2(xs#))JMbDp`L!T!W&jCgjz1pty#>T-A;8x-Mkp z7GwFmNbH&$g4gnsFwt9yiH<_tJyMJ7PZ4%s-;bZ4>B8f#8vNE%k6#GOzjjvRmo9lx z1EG~c`{zE2aPZ6i8iKPdDMz4I5B?F94-%ROsRKNBfWPmrP%k2Re2_OY2MaOOosTnh z1vp#Zh%XvjQ6IMoFJ~p-`y(CdujYrnH42*V_toGoq4^sE@;$=x!$Xz$-azvK<|lKc z3C+LtSK$Yi;nCrGJU-fhpN=%*_ruNj+h7ZRKiY?pzGm!-o`WC)b4)}C-gxR5ew@6C z2g8%Nc~$<>l^{%8XFrmEQDiE{5Ph^jv+A#uz_OXk>4at@<DC396J4U?u@l8@U**peG>v(~!z zZ)(b#nyRH`Mw(@NYjO-hhJF@JxO& zjyKhyhrS~=YBs_Mx^eU8;i=pVoOk8n>!M_ws?NdTj6En^G9Nbyy1#bGw4xe3=x@ZG z!48Zb>%z(QQq=ETi;YpiSV8++$~?Bun}fPt>v5_+AKxBn!naSg;z-J7Yzv!(+BM7Z zu&)J=`xI0hffqEXS4b3vbYlE`^S4-lMOyvm7N)9LGly@cF-wlvhk_OXn zInx!@qiHe{D3bH+b3LVD*~m1QZzuPpG<-Lzk@GlHkH0GCxxS>KSuV6gth*U${#T$` zB;^(0RS+cx&)=PGVA{dgxz04amwQovN76ZnoPQ;W%zooMOy25T)uEV|xz-X4EIFNn zSw>I2_0T}`Wev;{By5>$sym4NguTdeG*V~PMs{9KiH+%b^^S5)ll-R3Z|qM`m5}z( 
zwx%JKzq1UwJ>_@P-m>+U*r%~kBRPk1meB*6WgpYCep6*<+00F=h6BB|U#>!kX4=!C zS>KpQp0Z8dy+S0~Z=A%yGjHMO(HHRCvFDJJQ-t8q`Pj859c5(= zXsYi*Lrps>%j!^4Sc#I7YC>}}3hFvhTGNfP>TXolc3@w85?0Kejf^Fc_^c+6z+8%{ z);xk~HEthk#?LPf;QIs3YA*2a49>sF@>UGmKQc&5pqAyXej-f&+*^xZ`l;lW*$WtY0JUc>LPsA)P`5&O@&Jq<60|0d4B`G zmHD6wnj7$7pjItU_bs9MJA$$T=D|t=IANLleU?weTbPg1L1x%kdd$rx2|5mcMQO{cq1q;q#7L=Cot=gDgj{Wt7^1a zVANWZR;^F_{~|QIr)!x0;o-ZT&@3^-^jF zSzc#;&N^slW}5VEe+0}TS)D=Fjg^PaA@d zL`_vawr$%9f4@L1U%CQ4UEL@uD8#(E^9jZgh+VuKn>K94ve=dCS%RQ|U_vi%Rzqdf zIFvWAfe7&9jjSvu#~a$UGF>UF5XotUYK`{H^b#Z{ zWdCA+`uSi3UQFMH&0#Z8zIGYz^)=$#{s!FcuE1>q^BvyQe$%TT_Ip4me@Okpe)gDs z@7w(?cyOo(-y9vlhqYzcIyVf#elwAjx)1N2_z)A5leFQhm^w!Qr9GRNC)Ze3V34I( zZD_WZVU>Nn&hCF}l|2~$Z|sBLRov22$MJVx(Cj`(`i6>AR!k!R>VM10Zo{(dob&;n z!~P(=rw6NO=dw(&S}J%9lVb$vG4h`L&N7RQ+Wkh-*6AAQ?`9wQjQOdp>Q~YZPTM#2 zlRE48rDK@L0JAQiUNwVdQ__Q>_Q0rJ9#tl*{<<$}zvAp~dcV$FjyLH#XB146u&@jw zTWl;Z0KECf(5&AwW9AX72k_KLuQxQa&{l;-`Y&vDc$pvORWTz|K`m<^({M^bvvC^M zaWZ-=(5#$|+Rv>cEuFSpDBEf22J_|M>~p+j*KN9-$C)&oX?mycE$8{(>526D&I?Wh z5Q*WbfZ;9K&*P=D_{c*f9azJY{@s+(l+*jWTRd0fot{9LbW|rC%ekggJLhJpPR--Z zx$5;^;1K&}C1PiU6w=LyZWeMr8#&GgF;jC6XcT+Gv%m;Q@WZ`EJ3?5wiP zl)6!~%4SRUu|1SCl6uH>(=v9Z<}VTuqi(+ZLfkQ z^Abs&JjzPB!P)H7pLO4JIvPtO6w7R`y%URoZ}t-lx(3+&eh()+s#w(Rn<3Y8Zz< zxr}3{hVcB?V|ew_4LtkR7>*K{pZ;PT$4-x`?K7t*ap1ix*z4*=M(qJKA9@W(Uq6ZC z?|*@3UjGCyy!J6(e(E{AJa`CO)@;UtsFlddu0m~nADUbG(A3n4`i53i5JE~T>rqtQ zjKaDOlvQ=2s1fNxL&E6&qiy)^NISmoZ^GXR z&c88O|JEnaTfu$h_`X-*xq;A3DCPE--g<&_om#v3=lvR(f9~gR)NlQb_`R{&7o5wxABUhsH3%pe=v#-nhj6Li@iaeT3Udf~WwqkvBBUTxLC=*+~1R zKZa)cEc>oJ8JgYcnSQMMtP*{Ro>$G7BU3{xX=s+dY{wKku1J5Mt`RA_lD*F?b7@AR zjV2XH*ivy&^?CX}Z)g^f7C*>izP)o3`@6f;=;x~CE6~!^g4)_fR8}{lth^4@wAI3b zVk};?1d($hQCCxoPd@q>rNt$L=6RS+xSbm@AKSLu1P1!) 
z`OB-8twUK+HS)7v$Vx9qPDUj%Qd~$+lQ)8vV%gel2#lBq|G;3(4hh2ifB`=tcl#%OTWUP+C!JOSV*C_wCbm8}7?drzw-}*}Nj{#Y~ybdpA z@50W=KoqW6h+8r>NuZg1?M{CcKiA?O`@{VnSy84QH(E>CR|(9An{c(i4evKpqB(vC zVgvlKiazMYgGX^|bc)P$9hc9^I?S}SD;H=Zm+519O_g2A+T#ReZimb@r3Akz>jiWI&gwPW0=ojMYOsiQE=vT< zASq#YN*ymO$n(YiRJ^9$@*FeY*Atp$%YLGO*&(K>X%rn3`Z_8e)J zN;n`=Raz>~MzH36j{jw7W+mjpcuDOu&O;q14Q`-WzVmk_M>_7boeikv>D=t|yvt_l zTDag+#S#^pG;X$ zX|X(RFs4)*R;L!JhX5%xvT2>GI-V*kf9Lhp%kD@(&{BZQbA3UxzJ3~(^&P51svf$7 zs5E@YnU9iV^ww>&W_1i-Qg&(DL9UzFqNgUWN*|#^aytlKA1RYuN2$Xlxt{Xfmg>|j zpU?S@N>X;G>~f3jyK|@Lcmy7-Yvl`?&C~y~_O&d)e`9hIef|AvzG2;tH8|Uti}M9L zak*$OP8TPjc;RfUV}L4L8H?k&xwu4V9Q8)x!U@O5q)J|{GPnwgD{Gc)jJdJ>+=NJCx1 zUc}B0hd=WT4Dus5`(dX4EX)iHMd-XGh+Vf6d$TK0xBmqke)}|@`|KK?B_Ka{dIU#L zUcoVbKX&RGp8j$S$G#ZFF>Vj@wb*jIlTgYTWk`xkEF^-nM1wU55W2cKTV zkz=pnP)9c!^9!&vatU^BO+i^%I~tk~ps}@&+fFpJbfBiO73FoUC?PPr8hhX(G}qPj zprp74YnQLYn%TkVWpE$wt;J}47VdWD+$mdfm!BuQor`M5SSYY%!Fpbuq-$C{XiKW^^{VE)W^VsB*oMbX_u=W1R7CpCz#Lwua^CRVow$g5*RSFx`;GX|VFu}|4A`TL zB{Jvp=0*Be0;o*U5I`1a_5f%07bh^AzNGq-<@1d1l7Dp+IIs_SKr>;$8=8#|QZup=#VHZWZ&MOmx<4+vCZWn~kLdxcq$ltjJ zt73u?6F38#!b6e2ay9x=((zPU5^fx5!+$<|0RJ42*Uy*ZcX=_;U@?B_E5i?mTku9v zDxo`EF-5ZgtgRihXM8P$NF9$wJ!F1z6-aOFfs7 znVg8rr_bQK@q4(-oASx?ql|T8OL9FfDmzqmOS>{LTD|#FMB6gb(~~5w({?02S@!HL zv2_`xRx52-vDF)z?SD44lzofsFkVUjn;IR(jWPDUQ32;MO!B*atv2miz*ikd_TyHq z|4m?)7wO2ZW7|9WA}zJALO|VO4hPIFD=1>*4IMs?%ES>l|Sg zncoe#sS#&>7XvpBMJR_rDd!-dtm>zY-P|;oaysi}rssHECw@1-8|gNZ_F}s@BsMxs zJ2TQytdBFGte{kNYC1!H7dur;=`dY+;7&hyCpVD-%koc*wsXDJ?WEepK1U0PM zTiRVqwUxDXwwqdpZ<=)bsrs`#QYHfnT6!M3TKR;Z^(q>&s)xP~lI|EmS-qnC(gjQB zaQns-4jz0;t$a|hG6tVjWZ*(cA}$r|!}+pobgo;4js5{@G*KSxb8U>n8zp)8vf719 z0?1{0PV-24K8EBW9vOLVaN%521%_J)jZM`UX%c`g!$^H8Mrw+9P7W^QXW*;sbV74F zPGqFx!>lyCmXVBOY02m%K<|i+Rr4SNXU&8^1J^7+Klu3vW0rp?W(I^}Mo>6HBNt)) z_C&&K501Td5^sMog1661;)TzLaOC7gLh}%go*u#>g7f~>GR`gZrP8B*-Ho>tFa_zIhHNmfV8wyR91JPsbvt&t^3i~(o1merg~7@ zD07&5QBi*Y& ze<29}I#35S{bIkog^!XKi}cEL4MHoy_E!e;-+Suudv85qxe<>CoA5os^~Zi0g|5SI 
z4ER3_{0`ROK6R_D0@qR#F`1f=_fqn(a!v@|t1Tv^H{fws5miO7mKT}u90L0HgzX;= zXkgZo&jfdQjW(fKMwuTTsa2zOzcP(q_t)bW0`xCWbz%HqH_}%vRe!(75`eyb{W!kA zd4mC0`UU%f1WOfcbpTcULqRh8k9@w&_;QKI5!ijjjr@%_8+${u?*bjeUA8A z?-6E^+lMPZZuuh-%kN7nCh&KsZ#H$68{jG4lt|3jvweqqq`AJHpiI@&G@!Y;4b@du zShQdPBEln3RaJ{mKm7tv9eWPxX@usv^D%EuBo;?UW7~%HNKS}HQepzOZrO;?&>#c_ z`Xe|>SQ-(D*vM$?-ns+%S-F@^|FU4#EcEAOS z3C;gTTmJ1pD?T86Z<`;Clx1@#naY}OT@CUw21OW6VKM8!9L_x{I0JrHWy z#)!ly0Zkrb_Xn{tV{ck=p7bBN`BJfg5NcpgYeeZf!B6(fiV$*~;t&F=j6U1(RL54; zk61n#IhPgD>}6T`oqTAH*V8k+#57Fb^^DDur^R><2{_9?L|W{x3Y>L0w1hv6Bz^l_ zPiWRM{Gj8k?w@ju;RbgemJIv)-Y;9;RE!N7XlC%Y8qz75FfMER>WBZ>P;59=Scgm_ z*iJ`FF1q@g!Yu&_ahf6x7qVYnWI0|nqEl9bs64~~_wpExY=j1<@80EYHMSGA z9I4}u{QpyERu@Xrk_$3js$JaD7fWQP?Rnv~^m)#^Bhizf+4HVV_*Gr@xhEU=@wwa7HtL4_L@;af}StrkSw62rY zz9KhZQcnVB<%}#zaB}}P%(`16xq1~0LNkw%*Mb^o*3QI0v%aykp_%2e>Mm_zpxG?( zrsif_Z7xsZn>ogwZ6|HzaK3KYbrgG%Q8aI87GPH5XK!eh^rakQGHS1Y`8+?LV;~sE z@fTi2*vwhj6*U)cm8RoVaSF~AiPG^!MIoNePDAdx)mR)9pyobro)e0;-P`a^aURZA zxp1MhfWTac(V7wrGmu^8_9~(InoP@Rs=+lv^AMqVxQ@S97vfS$7S88o;7daDDMIrn zndx{xGYzj2nx9Tj!O@~3WNqG}f@J{962IS>=zSw>dTur{KZWSoE%5q(f3ev>}|Y$^)@~k zzJ+VIAED>KQG|rbGF*|E9kBol7q7wA?J3AFY(P~_521NK+S&x1`_adJm}O<5SD2g@BFnMQIb(OSv8}sAQ#^;sN9wp zPO(2ssE4*iR-luu*r3?9j4*R+R>qiA-zDg4dsTCL*%!s1DSyH9)aab}3FQOXf5b;w z&@BJdXgk+_#`v1QDxH^3&*}TLv>(!((i_+q`iz8XKF?)YQyvGuk^kkx#gs9Ug}0s3;si z{t7<)=mfg^_9G!N74zmrsTZ5bLxelGY(jcUG7=N^VfCsN2oDR<&`e;KW#kscEJ0zu zi_lzy-kxKqtzz2g6(}vJN0qAum98dKl-D6EGY<=StUNF%bEwq=h5RhfC`5*YU}NlZ zY*`UUkWRx>hmWDIt`;EyeuxPQ!n1Ytc$Ml-Ovd(@XaxApWM7|!Ws4ReYVKU#Fwem9 zh!Aw{+J-li5^$O~>F-x$<84<4zG46P_oJ=&$Du0x{a_jXanOZ-@+S0e11nbCjSI03yU{)fV*i#TZWn$ShS@t3E*$lE6>3GZQ%j>RUlw5yaV`mKX{x0XJ_-w_7 z>CmjD*^c?~cQs(CAXmUw%R8>hVApllLt=_v1&#IPeots--aaJE=y5E+yC1ss(qD&W zE7m*CHQZzOS?O=4&)WM`JMmmqCX)x{faW~QBy zHOg#3v&=CvFk%|cDOG-fPYaq&gF5BBIJTr*x*dF`%X&N8NHi^NbvNdnE*&WAVb1YR z*Fdw#z@T>cM)tY-e5a(m(vjt}m7d$tQ(n+4`{b`kInJc#{yWr?X=lPq&p0P(}utM`&jfoZP=T{tDI5d`aEd={FrQ 
zSTS9>k+gwX%Ruy#pn1fCW@npAoxR&npDTT4x&t7cw((xS$ZV=~tmStm{NseJA0$17&5kShs2|VtGSRym2MIZmq&dLn*E|7UQR<+VS}5Hat2= zaF$VD0c1*mStKL7YTGX(%r*GM(%-sk@NW$6zxUMfH-Tmi-ap9d8H9wN1{(0get}y; zbAJuKPTzx}gk+q}EI@h8B6O$h!Hr|xc-T`(Xs*Je?kar8d>#!{ungt+;ed;9>>?zW z;b%g|&j$(62MNyys|n9F_<=C~Jt6tKLyh?HNDCgx15(co;H#cSBrT0mFEd%Ya1qXZ zauRpP#%WWuQ~6hr;4J?Nst1-S^}wBFyXxUo_DKcKO7hvVRm%7?)lZdw5NMXUpvGUw zemmaCI>=5t|5xP!%p&EBj12th0Q@9qRyV{-@>!n45J^8beO|WwT?Pjv537D6Z%}qh z;(}@mt8DUF_D!)J#^n{w)YRB0687v;{UkLx1?}zaw9{7FXb1ZH2N1t^FM@-Ev0~*4 zoH>6EZ@&93Y8o1_dHW8`k6wUiYVMr5*u7&Xp*amH$!S=`YZ~Auz#NS5&~O597frH|+DmR&1wck7R#S;zJ# zL9=&znEq<-liyA3l=R$rP|mjTC4Gz*p_w~GWA;vwDfo3JZieV(9+<}np1XjQaWsI=(9@yWDK?0t?Pvf4%|_br%t$k#nV;3#wv;kH_Z^#eIsQ5{D{!VvgF59bi&H+^ zL8WQ`W-gNRV%hSpH_v0;)!a6#^OzA~Z@EEJ@)D;ja^&ljIY)n&YpC}*;Y~Dr<$?z| z%RY0iI}OU+v8{IXZd+&GMpAe4*?FwD(B1|Oj}9d z@gWnOJoig*vY?p|EhEj|(Ch?eU(hV=B6GV{kiE!&>7JUQW$LW^iEXE*eML^^D>Ch% z+JcVeNzkn10nIvHRVFEq1DfTP+cJOc=Ex8}c>OgbZ`+Dh;i0JAwGkgy=i+QtHqMqN z<5D%Z)%iHnR)-U9?buJK-L+()n#(GqjpgfC(cqc0zZ)9fRxr}Udre>hKtOA=>$El#T zn$X<0|0sIi9MyQ=z9Kbe0cU6vR#$%3k~CWA(%Zk8e4b7BPG26SveIbEN(kEX_6f^#q0+xq#v1Ldv;lvXyPxTX>1HO`scn{{6_uWQ>}pcADuP$ zdv`s5Ys7=WCOkactVW!F7!(m`4>aIM?whR5!HFGlm`KmT8;Pk%i;Bbv-oV^G*oud} z%%`&w-x10m3{>IqfePkd#=Hs30?da>v<{ZK!TE}#&X|M&~9d5$6hwAa*SSP-2 zuSM><2 zkbTJ=8{}9@J}V$%JB$;;)mqey*{Mm1@b{aEyj=F3t}fbXGrBvwaPYt&wr$%6f8JQ` z+_ei=dG7JoUqMM#8CJxt!rbV2m=ie{bLY;(-re!Y&CbV;ZM)TL<7KUMfnM1zido7& zm#;>WTbuh)U)zn$v=ZcGmLNSX58JoxMeLH52)ABL6fq}~&@BH5`C~zJ43d*lkdP3s zM!jEt{&`$JdzSVvjQsq3_SG5i3kX0U0o`xLOe~LDgzWTOGIoRga6f2ga5*t&Bp`H zc)K(mdlrQtZ{rG_?d`=IWiB*q-Hbh~|01@VKzbm{5#%rbYR!V5Oa&vL`^oFUX9Zz! 
z@F*syZYyXWCWqJ~&9r9)&0;UpeYVIOn2j{ISg}rRwLRL<>=4s8G3`k*qs?v`k=X(c z_;W&_NRIg2!(!>!^#i96PQCp6nHP!aJMOVfdw5)fv-My4G9Iy6gPp!oTZ1~83O z`?$-`GFf@}Y9DXe#|ktPmW{5D5(tzW1-F8*4b4sictRoR6vnJOU=0H-s0TEU^6pID zFk^$WuOWN2YJoTwm5(*r%;V+t!S;8j@tOR2j^+GhSuF!29%Zzo8rm|-e{G~$&ea!7 zmro?$O-GTm%taRYNSg*RElmrWHF#;MhgRHe?v$>x=`c>2{5<#T@|p7tylK&~6QD%C zpjpmW4=eCo)v2vEb+)anSmDvheaPGqic^>_NsC(xnhk`?zW~m*o20?3t+O*)HY&$U zd#ie?Yhzz)3AVZ%vh@sF9^V~Oosyl|J)hZvX76>(ZD`izaJSJVww)DFMH*En!=&d9^1R*x%p$oNkOx0*jaQQh6cHS2y5veG6VHEky}~SM2OyHTN%d`C^=E zZ^G4_MwBDwFI1zsZ9T_O*EERg`hzI18boeEGxqGu!>YKgm@|I? z!I{A9A4G5t;WnJ-hG0SLM#Se;qOhqCMa_NKN+{lwU4*EmD;co;6?6xO%)|WXRfOgY zq-U0+pr{V6icZ48e$>?tC~&T??LuQ+C)EyDNeznfD^bjCaYZf4sKkBASR64239*at zOm;f1wbbHjX$~eU3-HVSHvD*~7550svdo*TTJe~mEYlypG2c41ipS#t7geg>BKXr_sn)>~O1_s9lqmscmiMRqE%^RO z6COX)iYvX7e4!(b~%YlmnJZLb%Hh|OU#X9n9w{T&^$bfVfNJ# z260(Gc8sxQj5eozyg;(2q#vq(43%`>wPp5eAMXZc3!2qbkIT|$q#v68qxI3BlVCY&zEJeE}r=k z%V=}{?6ZXhd1!5E<_%t>T2`*BqYEpRtwJy%c+=+1Xzyx6ePa!F?%j@g(Q`0&{v6Dn z6M?9x1=zY}2exe5j)?F`_{|JdOVJ51i`K@iCy*DRrn(6wMb+53BN>K~RzY+hVpElcGgQ`hq|`Fv>n5u*x`L zVyiEi*sJF>+sArW;{fm!64u9mN~#raEn>zfG!OVii-G}3i8Mqy;~GqQh^@>=agnV=x^Y^Omp z1BaTQ&Gyuz*eoBV9@4R7KCAXtF#6|`uX3!3$kjNZ_!%c1MSw8icm0o4w|6w4#h zz|6j)wgSyI4v+}nZC`YK$nt-wTG1;*>kF;Z8GA(_iuo`bW+Y4}Q(n=8t~r-fN~JD0GWlYv)r zvhb3OG#3{lk%2K{<}5_bk5bbn1e`bRNkVXaigJ#AS$E74Y+rp8uzJ(lmPQX!t=v}dQP%{^N;&$@oT>8Paih_gIA`0c?a+&$8Q`vijrgywGr#1FUP7UBBT-c2~acLy#Ol%RI~I&@_0 zQNKSXG(Qq(?rTzOGd~!R5odX`S{c44ILj-zf80-KKETfhOvn)6>1e9Q?eQ^!#4s*jp2E=ONeo{hFkcLbkRW+DDHyU%o=3ygmL^p0+hh@taA#z?ALc;?PF(;I; z8wz=YUlidmI3P@ouF5=QSusPVXULk(^XARRqD2Jc`7xLiF(3YZ>{Gnil!pVuXU`?@ z${Rf8<}(;GXAwdJ{MoNUnD%VME?J81&K_L4$hICC!=0PA@X@>PU=H)0Ngw0S{vAL- z-$G!{E2>3f%K^Od%7^&ki>sKr`55nf^fh+v&cN!$>#%v*TC7{W96PqIL~CvW-pNYE z`LcYRt;xejMf>pJSQ~zSsvCbl(2gH_Tky@H9(>l-0N38Fhz$)wxa3E>l~;I&2F)TE z6YL52VJ!b_c^S*xd04tI2C>mm2qiern&HnIlGXU^(=Tbm6S#E28=AGR*YQ%vWp5Vn 
zjM=t)L9-c*^OVGH+JmpyE@C`#O8&{w(<2xM2(MZKidyo#ntHXoV-|PjUd^3OXjbJ@uq({broqtan{+p{PuoR031Q%zFyn7?4Uqh(a>yyhjH}o>p+R>i8!F1*aq!ifudzn#ERRJycU?eHYDg3phXFFC+m(1C!2n6Zp1mR@c|Gg}&ze zPMNZqI(pYxj`4q5f&QjO_Gega!CvJ_A}p zY%E?XEyCCJl{oJz#FfetoG&iG*F^>Rw9JK9N=vc7APezJ7h+fB9DLraAO4%FtHNY$ zIVK3rqqW7jTAquGrI|`+i!<;=VJ1%Syti`GaXdE@$MdrAVqP|$D#}OF%H@dkXD|yP zFw1;pN~Tc6tlEIBdoz()*nrI9R%8^kAgiz)nFVdgE@($?aXZrU>Jgt>immY(Sh;CC z7A{|n*-?uT7(oCJC0v9>A}DMw0)xXDv;)+`e;d~ACNNi^w7dt^b-e`UZq(Ecpw0uD zd(cp;p}D5A3037ysBsaXOPWwtQinJO^f+m$TRHx-y@~*>q51xyCfp%3-z5y+ zKh#EGZo^$_m|%Wl*G62;NW=TtdB|I~1TR(OG3_RN-{r#N?rQZWwTDdqVP6Hl6REYK zUHD;upe*n#BhUQ&!vOcmDjI{egyshPbg+@%8}RtWehlU%sDBj%n741SrsiE< zVL!Vjzhq=hGOxb=?`yZ#A4^fHK zK8N(**jjR#+Sa_B5SgwT)p z1sS6kDBjJ z$#GIvlc#g^*cT`}`V&7EQn|G+ROE zuRya%udG5tw62Y(SY`((DrlBESkUYZ&eP%0ZYv{sfr7Nvw1+d<)*>mBvyP@cMN%f6 zHn#%J{Ok*uozN`jxNXa{Ed%Hff!W9gX9=Xz&bHJ;O$wT64^|zdPTD4P9;)7MVAfJk zLOYy{nabg8R}IaqpMqvxCz;PYMW-N+<0dtA?IP~o7(+)}GXfb%Bcoyv?jMZZOJZ>} zCkrR*D)3pO3!l}M;#8FjAD5Kiy~0wwRZ@nhax#$@y8zE*CSts+6;m};xL)JJM2)<1 zy9h%r2Dg$-Tq?@IIT>jtG=Gwxg*S83@eLC+^)!JzW6df=`OQRBcnAZx%rl%x zP_u%ze;DS)tU&DAP1vwA5%FmyNXf22dTui^^E(L5gy_OHc2fe) zZR%mD>Pl|QxGiZ$NntIPFI<7Gi(=7}oP=|&wHR)7VYoRTH+xHP?_dS)4OVDic1tf4 z_bq{0KFgF1c>~{X1lxb>t-?Rs%kZBHnk(@SLbHtc-Xk>MKHQ3L4tFq30yDw+Hlg|J z(sX>hXDcojmEiEcB;>{}!kMmmf_WXj?=|dr5MOHyrx@a-peeHE+J6CD5S80rxtf85( zJT!u94B`UJLp)AHGd0H8F-m(MzQ8_BDR35FWBD7^|Ga%2`;zz=N{$m?l==AD4~b8q zy%S=tGp1a>BCFZhE85ty`-2*twtSfBuYW~m4SH{Rz%u=)Q_7bLSn+%lZ=5eoGV%uI zKZ0hN%PcRTkXQR&I`IuyIWs zwyuvu;@%y|$xNc%W}~vK7@4W51ZP>{LZ&MCyJ1;A`^}U&z5#^HAO!NpH7s-vqN5gJ z$>QZ$yM8?uFI|FwKv~sBj+=!*-fXX5vl@L}ow#!00&ZTvf$Ie3sqqPXd;2clf9q|8 z1`)QIerQk#0s?}tW$P~NAAA~3tpixRY$HNvFJOIwkyl)WtGDl?s-qXNtJY$3+(v9# zvl4qYtwK}E9=w~Kg|G8-aiNNSh<;`K*?!!6^Lc#Q-hh(06e$OBLX7#+W<_(?hUiA)?ZQuZn0XZtuwi}_ig>?yX7HHz$^CqlFQ zE`Vy=xUZ3BGYIKR#&4Lm(lK3M6Zu`rZ0GA$E=$fnru&_6k3L^CL$66o2c$-qH7x51 z)>cO1T;%D{OY-2Ij4;#jS)lyK(9Cnin3R~Adcu?q=`l6K0%qo+D=P(~R6$LJjRVz2 
zq`;YJ+2THd0f!_#UvFD-8mcL;a}@_Y4Z${ex2*Db%N*-VDt~7hKBPLBRYzyK0?qQ~ zERoac8vt~U`0Gw7nr`Gu;+Rc4D&vtmUavN=^nV1++;70yE!pQo2AZAMMx^F7OI_sp zNqtqFSY9i*{3$f6bfld`Jl>L|r)|gc8cm00X&-%E)x#7_PeZDeKKJRZuTf^XR$_0w zhSIiP?fd7@EF=6P!h`%n;{%$nOTNrQN$SmZF%tMyHmT|$_0{FK+t+FfH#F=03YzI0 zWZ5ctsnHF#?-ciaGc|!1UwjVX^X4HWG76FN7b1eeW?M`Qvg1~vc>6jurS8Vzf*ib7 z=E6HAm3X7D3@;WGqIchJ9N4!DSKFH~F3?P{o**!fl^0;RGzZs8vT%jL?P6gD&J^b0 zL|!Id&q~Mf>`VpD&*x_0`H~`3Y}t&JvJBNM2Dh0r)ciwP+AL(29|8!@elq`%zX!={ zek0~1a={9$jN5~4yYsOpsT`@fjmUR(qqw#oWp(>eUfqwpqE_rpc47V23~bt&iKL8T zRM&N&rL`Aj<&EkU*tO05sBP#$Lz65iM`*4;h#4v$+Dh@?S_GW=SwV9>p}AQ>^1UMh&F%Q+UABi$bH&ZwJUOHRMwj1$iN>NiUHDR*p>KB}dGSw@!4JZ2F;bDul}XhSp8 zp>Ni}BVrxdZn8D24;bHMpjmE+oxxm5exu)#n_U^TUcGX;0=ymDw-cJ1(Oln%=DJ2~ zh+B`@K@rGK&B0R#kD;-;4&^1qs3<8!Sy3KJigMA`(tx9f_G9DvHR>M(nT`<@sNVt@ z6d(@*h9Nv`4k2<8R#=p!8th%W237Gp@dj_&KdY|7h29Q)+*XfQ8Y)q~lfEN35c8Qw zIDNz{YGzOnZ?;3|H+Vz98*sVtTee_6a`z>mC@D#)AcZ%ZB0_U^b}FJ4%vI&AsjkQM z>o;lRyg{TL4N+pBYW_U!Rcuh-@X;o`Vz6yL+Mb;DrNvlIn{-0+n1G?OT^pLkW^GOo z8!Hfw8Qn0z-PVz~v7y}?xWxxv-a zA3?LbTq3rMlKJkdywW!uZN=}>?xwtUo?hjmBu_cV$vkr1d_XfD6&;!`thp%q0*lic zm%kz|@uJXqY1t#rwmczWjQebOVH%et4Os}B5^z?)EMkKws4*4Oh4v`8`i;h6Ndvf3 zCLb;1AhlR0OOlQ;R8vm%nT?{MSs$azp&LlaDv!6!vA*PNWM9%?<9TsRy57)iB=s?M zw4vE*6U&f1jNURFI}a(d1RrSw`&t=!jEV^K{zYik4nqU62Q*U#l%3M^qV4PAygt${ z&U(okh~xz*^7_svL9;s$s&r^)$|iI>DY4yjdS2JaTt}8kNv?`1R@tkVQ%Cc%#k^YGV;VgGTUz! 
zmhk(om?bFRx)UAAspw5lLtknlI=61b8<{DXY;9B{&DSf7FkYUI(Xw2O5S)jKvvG~V z`eJcDPUdFgjf^xkm-!WbevttETuv4aWn`dw^Co02i^ZO(IoL5T0&ByA5zTz(5|{;` zWo}{s19iYm2F4lGtRMsh&qZYP3ar@_kHpM!6jpYjLLMA!+>f%F9^{m?A+4YZStYF~ zuIff*Lq8f?4xpZpTv6SH`j&&JYak#uiUv?eV6G)}%WJgj>v{>#UHsmKnwm~jS9hYm zsuRsM9Y{*bK&-4|yK@(B9`Z2S*@P?A1sH42S3vx5P#(aO5oH^gy`lM`yoryn`fG1F z{?=KBf3_CmKU+%FX!Czq(ERP;7Th^R7#3(A>>!MHVW_nN@1<-}4^_TdQjNm(n{hNf z84nJ$;XzL&zTIDe2ZQDKZh-Jih*r=nAWePWTZZrZN~sd6SbbLDEUzDCd1O_OhsT=n zMMovJ^BTyDOg61pjmux2!R_(uxO(9#h8fte(SEMd{)TC5!|WUC?PuI~jrMhoaYdl{ z>L_7ZK{LNw&^$?Ko+LCYfK`3Z_;862GRUI)v<$@j>(DGRH?<<^zn;J>GJWQ+$ix=i zpV;p#slHC1X?&=b_5~*HNQ{wDMS&uDlbU*)+K{X=GRn{NTMXJxU{=uV1_)a^$kV5> zy=6JE8x!N$xqX`gyk)UVc|%u=juzh3xvH@!DhAP!3(?xpj{UuZ=3B`F7GrFE#SZbN=v z8U4b1b&gEm*u8THKKbB%g7GbUd+Rp+$T&5D8wBR5QGOn!Z(&)tvR}(NGWU5V&zs5q z8_4|Tv#i^9>_IH;Yo@?6f3gXQU#Zz9?$GENeLfhYlXb#Kd)6rVS49+6~L~mpC9kRN@(J(0fGENNmPz zeM_#R1DI(CauX#>Hj4duP$mzz1GDtMJ<{xKSZ`UQ^=>)SqjWx=X_#aG z44P%M*$I}aO=vvMyGxyqTuWy=N`2*AZmsL2B=hkU#97d+Mn386{{?6^z-EG)Q_k~M z8Z3tmbDq~mL$mEb1e)cIf$qCp0cWcX#YUy0YhV^hTGCEpW4gVZ(ru;NUZtVh+gUcp z^=DctO?UfS`6$U}yB<|_XwwP?n7>HB z@y`>QW#q|O2j>(iPiW?EUeK(ToTDSTNoO~Cjlpf=Hcox|1(q|Y%$*lb2%U{tK{K&_ z^JWy4Rbc6ol?e6=Mi4<&MtucdBZGsmcHUg%tzLd}%VbYaP z}8DJ#UMxmgODj}w|jFQ%p9MFR7SIoWtNJ5x>hIFgx$K|=Wc>@+kd z#iMf1PNc72kM;8xV?Kkc8iA$+pk=gc1_QBwNEid<5(e6>*cneaV1SpW@pmT|U{8h% z8O6=Wb#oHPOh&$~rJQ}FOqkdT%xsD)PtOqLbTXz}$-d>FVXem%+X>HI`1U|29vte%h3Z0l zDA3Fsl4nb+P_%hF-m56Zcm2(H)UBt5+#lrk-V!w}L#vF?>{8JDg8*}HF}DKEB7Ps> zZ~H6p_+TBb6ZVT%EkdMf=UK>&PsI(|icFD{sTdx1O@W;~nw^%_0;0 z#ovE(>n8GZaundnOGI3y<>>DjKtWCc!h*uEW#e{!A4FGsFSv}RizQ+AIfy*ATlBvixw=$%4Hj{dE;Ir?9D<(S_#TZ8j+PzhD9+e z+1LCLF=wt?8~Vn?qylDv<6D%#*bNzFCNNKp5ST}KO-F{2kgykv*;f`USwc7tz)V?h zo!~5MMtAr2p>J>yMa88oUp_9;7XJC4{|f^LpTVj%o3L^1R%~9i0o&HB!zTKg1-y}y z7o!LWiysPRd4gxk)Q?~SeH2#4t;g0~d$D8BUMyOyU%2AOV`N?GojbQ9mB5^ml7Ni# zBxGhJBZs~yJ1q^xxdm9j8v=psciw&*!f17#9TWD%M>~F zvJqZ$6St;IuDe5OeRI9ex!Zn)1^xmw4_nYI`wa{Dk{cYUDH$l?IqKndiDlL?QZ_d% 
z8_7QQ0lVyS9K~~OXlB`mMeM@@y~bV*oc;kco2kLJopYbN+{{CyK-tJ@XGfV$p6+(C z`nNgG&eJN7C3Oz-cCw0+^m|8rXN=I1$rH@ulsReKWTGI1g%L<);5J|^f0KEREE{On za*u-YIC<-hNc}}F;9`EGF{wPPt*Mlj_1R${x=gxUCVf4hm**;{q<_=mxt4=c%XBe+ z?c}6RJdTb{udboDDos@mBWZm5f_Rk6QKo5)t0ZNW@|*gYt$mT54pj1U*F(zfK5n|y zUwG_5qjdiAf){-^N~g(mWKNPXc9GN2WqO0Wbx#ADb(6q$Q1HgqcqNP|SvE`i(4WPr@fb=uFiYw?+eWUBshdcbQSVbzG^}f{mfEqTzf0a$ z{w6)_XVcIa;o|>`0I8yqf-LTat#bPOIz!9S75;FDG&U~vBVMwIL72*nDN`_#jMHMPB{ zt?Ng1O)sjedr(u|kE+@}RMd44Dz;+x%9ZFzO~CoqT8uZ9W1^-2j|jWp^%00=)e2c| zP9S-IH69Mg$Z{RO*IM9QjYkK|@pFF}{;jJJ|FtC#|Ff|G|FfwG|E;wY|K44NUk2ng z+HHj84j$WuhlB07x4#}=yOMA!b1R`a9fQ2dXxzOA=b9VvxTgU>^pxVkKrz1UFUEJ> z1m~U#?kmTS{4Dy3fc^{hD?k6zUBcgr)pB$X_E)F}74IHv$C2y=EaJ^b)Qp)}96B3^ zyZUi={3dOcftrDRh(Y}t?dcl*qJXlzgv0>zRobVXo^ch!BLd9yKN5FFB<=__pQ8kv zFX?`)`jSZcu>iD)eMS0>)lZ#1*FdxGhw^WX+{hZ>a6+@`ua>{yxmxTCR=@sB;`ver z@rCjbuLfg*Ww}`vX`iQ$p-)i#j^}Z!eY}Q7#KbJbv13PZ z^X5%VOKEX;{o zh>rdPc;)R6@cMfn`Oo@Z`jjP(~*^)g@W87%$+?Ck#nPP>hseWBm58ZI*q8fD%VZyhQ6NmJu26Mur^8< zleMSimDDnzGsU`2NqiC?&2!XpaBAR3BynADCj>M_64T`-%8KpkH|3MH^zv|8Vjq&0 ztbH!|jgawQ zx3q)W+BU1FUoaneK#(?~rR_rv{!k{SsXnScON`RLsaS92C2Zq-nTq$zRYa=ns-N<^ z)$hFPFA#6`skUYPoRTSHnlZFym}P$yEK5d~AxI`V$TAUuPC~P~n-(CjWW$j!XqL}T zc=9~gfQgZ*kcZeXz97{GW-C4AxU_TwY0y%z<#4*XK1|Dbyj52lVvR8yd6#K=c|7EG zfo$oHqVtgQnKGKXNE!5T(jjGp)pn{P4b29al~e~~-FTh~G(65ojMzZ)2n|VIS3S(K znGsI+6afOVk-U6Et*oF{SCDH)kdqq>1-`ny&C5;9sHNOY=^HAiJ$TA}o#cGg*7o(W z#q?>Az9heE=b;_1)X%)3NR2e}80lcL*uMcor*xp4F8zRk&il{gyq4;&)1=`@Up|{r zCnqqA+->E)fnz)>|a?ch*-+m&ccMpRw&! 
zwI`(>EaG<=IlnPFj&on0#j2HSF=tK`=FE*`pjmaXKRdlLY5GH3a9f0(?_hfT^-9OjP9HYDFGS z6{O>>%p?MH3SJ^Gzf52j9nVO`%hd6VbiA0EftRwf@N!lTUZ>v9%Entcxp<|Z5U*0l z^YZX?VIguiZ$}8rFw;+7x9yJ@0&;t13OZ8uAZPt5?2cWCxVd3i80e=)r{!e^@^H|C z1&gstR^`~1fUN8il(_0qUEhwH#!gf>bfLPj2bF4sxkoKSS6e-RIs$WTWgqwVqp-3C zE0?Z9_Qnl(z917L-Suis>nZ9xLi3|u0x}_4ub3g9tMOewVOotm6PgLk_YYR$(E%5J z=_|p%cjV!JH0R*|YRJR?U!b{7pt%yi_1ELa1Fby29ghY(@$g_PZW4e`=5NFK(gb{5 zoQ$5NU3ext8`pa}@m*()g5XEJg?QLoipSmM1nCMqme0LK6al$cMwv_S6F>jVeLoSd zANQBw(f&#`rR3^B1L7AiP;ZJ{=MZSLlD4WxwuWglL$oIY%UV|zFkht) zx+Y?L8DgAq57Y=G4+%7%wYJt24NBXp=_^_S&F(35JjVE39o!tyOh2KgOnF1I)%UGD zY-pCU`10AlrSq29A^p~lC93bbp_yf4Ii()<%{*oGId@tPA1%Z z?AVb=U`{10r?K2gShIS)T6;RsFNAek!W)GYq@`r2b=4CI)QS8qijS8YgAAmmW+6Mf z01*-Mv6!%Z;R3Jeh&%{9Nqvi6iz$hV5)*c@j8NtXX`vSLPl)#vb+TLJ2Z-}4f zu^M#Q77Ee?imgE}71PF)WORArniHCN4$oCUE5BLsT}iGx(^AmPGK>;HWrY%v8s!x> za31fF4UCk6Sz88jwLERlTju#Ur##M={H)I!nAsMi0?jM~+t7w)S-zF=oON{rvnBgG z`>MovDHZb2sGKl64%_3)cW{etW=2m5-3ZxU)PKJrmVaeaa zYUJ1fKsFe8LbDdZ%n4*(0A(uStcau%%J4)vF3?|*yAG<{);?Z1Tl|e>a8Floih)4pLYcPnCFPW@ zv0tdZSOBoEK)@>k@YhkW)8&9YB^7aP>lft=1lL9=DM zCh-0NG|Ti4cPF;;HFp^!z8=4#2QC;1Pg2F0<&5mLq3&5Pv5ajRJj$@hG__VMTrwF^J z(vmS+U548&RT!(t$5d6J8fhLaFThs?S$I1mg>amPmk7u&F$f>0UP({IYt-x9_j-CJ zUT1o*XXoN|!txur6hZlwtUSDySAZAz{a|(uwk=$&Ubx{O5USQ!jR^@wbILxvRFaRk zN_cER4xZ1;L|eh(1ujSx?Q-x=K4!qtLRSzmE`>#|X&d>H$DK(tLgR zBJIx(PTE#I`;3l1+W&h%Go?n4RbOBrR5H+PzKi%f{h-c6@)BL*`BD};2K~7tZ;2mT z`i9r^Vf(Y{Us5is-zmvHM_R_usZmOn!6TM9WT$VX&3&q!thgxs`PSqF`)HYhNqN9- zDeZCT(&fnHP3FOa2Qkop0Q>un;LyPH=x#rVGI`}S`_@%M(BMcch+K-; z1uL;&{t_&XS%$T7o3LkZGBPuZuwlb4rW=P{I}$LTfEyMThUcDp4mWSy#IX1w0b=@V z``=*O*C@$IEw?HT@mgMGe3`mFi9+7^%5rw0v*)3%xf?IP`7vI6{RG~6?<<_UI7N*k zC#M)&w(LOc;$`X~#rcu*v0?otBoTb`a`RDKQbv^{zn~2J5;L)VcOs!Vl>og*z3ni- zPnN)&gUuVaBQqnH;GB+q2`NZU;*G<;RDKqvD9OllMrJN%M?|VN946R_MrEX&pe5H# zfiorhMn~BPM#m}sZb9=6)^$pJDQ(UdG> z_sM9p=t;nAKv8s^+O z*_n=!uZA}J^xceH5bZ2<-e$T8(-p`t4#l0m1zUvXNs)1y{7syfy1-Ur+Va(iUO?qA z4xgHqrj}{1Ka%@+kMum#(b9EOlJZ%O*T|GvZ5`#8Jj7X<{LD3RUMF>}C;>I+H8aNx 
z9LsM8n2o&OcIPdVhG;r8Tj1(d4=wpk-^7S>k-JjW;l&||xZerF>YAzab^S#$qU>}W z-ZIA-nfCIQU4H%D9SAA0E!lcQvw1zX3H){dH#RBI%;Qz?bp%BQ6gMdQke#n7gHGR5 zx=h~ku7|3()z6f37Y9KII`hTXSQobeVIgx=fPDU`=kUy-BRJU6gQIPI=x^#mQ*8_C zYuk{Tl0)d6g8=^!1pE0TVa*y0GFX37R)y0A1^7HSliN%T)tBOCYZWF4%~KTw=dvPv znL}unDH$@KIZL3K08PD;mV(#0|IO5N{+@w%(zEcEj5O!w;FbIw!g4O&&dygL{UY;d zN{B~{Jn`=rh?%njS?<|b5*miqlq9@US&9#H)A4b3Dn86h$A`tac(;t_QZJO`;!t5G z%66_t;>v|s9~Fv4p#l08(S9>AHzWkHF^jNc`%a{1;h)+tx zikN5=@7#!Y8cK1dsRUPR3-C=x72&uZ4++Q*`m6M-v-_&u`c9^3xS_cU4-S;85#N97 zEX04d=i`4i7vq1l6yv|P6Pi2A@H@}>QAV2wJ9umx?(A>IDOVc4sLa6E4aImNeGd-r z*@mH(Mm+9qv!JNEdTkj0e3%xOafyJXpZ86yTS>V*I_g z0zdY!%!k`hx*}S=%p@ix7|-+%;NgusxH&pSzsmqUG(iX+Q=`oz>`U^{?g;zy2yX_} z>KW8E+MGbM!1S=#xANlz)!_@Yv-1Sz3)Cgro&+Wf29>Rf{o3}dr)OCGQS}+^BXvO2 zebDqN^I{||M<18iX4$$vpEssb7WKRHoF|ECxzCq+NPm_1;{<1`4>Ns_w-s74aZUYo z>?O6|s+T(7`tB=jskfRQz%&&^5kO9zJV~gGRp2bI&6bho4&LM*I&>I^53=tbIEKT6 zFQBV^5c#>K$Vkm30B=WHYAUvF*@@7Qd04t6j`qG%z361+iZuk~Em*gH3l=V1h7~K; z6T%Czb7#CtKPfR0moKuMV+2O}<4FSKgn*mFR~cWUd@GO9|D3YDWrUR1O`usGT%5@p zZvTKV_N`{T_T~w^^vWlA^X;$k+UuX=sbjCHDIu#@t;e!utFTXjasf(9TqqG(E-oW@ zmmxpD3^~~)g!~ljCiw1+Ps04D1?t5sflNDg(JIzCAIZsCNJ`375AP-=XQ)*>1e6o@ zrKpkVq-4VO&fN$J4Oh^7MFu^_Ce+QL)KlJQC?m>aBLojM`0@7w`hRMW0vzqTn2WWz??W}Yskj@}Lq@h?oi)?5%e^vQ{C)0_X1vP~- z(q)m0pk+g|+`;OH5p}xG5jrOju>km#%I_`t-Q11pzjN5xy(H5MWXgv3 zwG%lzrZ4$AM|U!5Yu)Z@KDa8o)vlBQWp7~imH_~C&T z=VP+bg`0JyxKmq>aaRexAT+-t^OdF;r%7p+!^x~!1vUNKa1O=gD`*yrqREQ4<%O~

c{*R92>`LofGv>hL}l;L!BE=C%Q@SvxbU`omKj9z(l zHDQ_C?|ZA&Di)8dInLi477>~c%KFG<_|Bytg{9uOVV(ru^8`^Wut$`I=qn*kNX|X__e1QKX((-dzpWCExzw$z69^b-39o* zs~A7Gm*baq7k=t;F+CU4<8KF;H{t%Dy$$%~P&YnrE=OFDzj|eMKCh=Nv2|zS2JK^# z!TGw<=&-ypdlbX8BN=TTy-H{jpuR$orX4DXrkyFfQ~vD|?MJmlplNi9h zCNI?T24?xLZidBHrEjX(prwMT)gKKsi?m;{{flQn6yGXkG0-f>*s$_s@i=w9u7{B~ zI2-@%gl4szp6qj%fql?RYMtmhYo%>~KmTE78g7{bE_vs6Bc@t@%IWMmO&pr17 zo<8<8jvamqPaS;@M-D!Z=7t^=9*^Yk0Gv=ULAnG%w>#AN}n3ByC)_tS@0% zi=gZl`vTiSfLf*=OoL_z;2LN)(-WNFZ0yXAQDV2^lbN@Qfkt+EqH!4oXPML*$?kKf zOXq3iNK4?H*JYX%wCm4O4r71DHa+a!+HYj@oP73{?6=d_H@-gP+|SQ4h45d6W@l{w zb4g6pZLV3^eCBTyq1h@lB^j#9Ix}HGCD5v1E&Ej=*g!M{ae~rWJpsuH%{DNJdCzksFX{WNAA>RbZ3YXcy`04)X9L7htsy}_ z^-=lJi3v215eDQ2PRjJziIZ5nWg@4fsEjmvfNKYK4eX(+(gH944Qmbt{Wc-%#B?IGxPSK@I`B|j6M3C<4*r!v3! z?%`V8K3ao2N2~Ge5gs!rU|fp7b(a#HUHC8UW%w^00?j3aX%~KxxzT+xGTn+xO=bAJ zA{Spbl;Zi^WE|MJ5f@$g_^G1~zY&uE*58012*5vdH{!?6TKpi;+?9{VohA6Gy$Zjy zR^ZpJO0_2QcL%HRGXee|JmeT&9mA3QRCOb=aCQVfeC;jVy?&FnG0y&Woi?MP zd2D2o;4GufGChNRme4$YWgPOF;W65wehHQQWiS0m`!#{)iIEHR^V|}cO%OHy&C{MO zJ2ti|zq4&lXI{Vy&%KPJhn_=Kc?0rt zijkUy)W@bC!a<`t`XzPooNV9xBh$YNi; zeDQ)F-Mq$Y&32auoAtks^!Mud81ehMO;j5b)Z_+A#aS5{<#oGz^9HJ1pR5t>DbDcJ<(EW{_KAt@!3dGEr! 
zc~NRQnoJpa?X_3&@dqDbYJxY3gi(>$x`JiiEGpY~i+zF6JnuanHtGECJZ3s9 zi|qbur2DU@)Pwt8@k{NO*z6-6U(jqb)Bk~(r;3%_&KPGVl95czwlmV1xFLa(!a7G+ zm2MDo?0PXn`KdWK5hDw8-Y8mUQNtt{boq_&)85UG54 z5zJf|XTzI1%Q5Z?>Rn!OU~(Mumhw1(Q30gK+MS-G5~|G3c6EkB3UUbX*H#y#!mw{#pLN+)%Aeqj= zNd9v4b`%23Q*;gnn$2jl&rWX1eZFI#3542-IQRRqMNeBb&}^jo2VvGs%P{l84d9y2 z?wm?uE}7Gg|ZXLoApymPZ9>MU6dD24B_-= zr?7g}TKLThLUK|DzkiLhU!2GAg<-t^(%Tql-;buoE;UtQ<(duf^AABt_*~5J_s5DA zgv+?~DA|*Uw`*&0y1EiqYHJA0mAF$?hC9_|n5Zen`SLuRD#^tsMY(v7fc$O_p*WMk zoSlgy1nKUSMD*-Sz%w$hnXvqJRwki31FsU8U&$mqr)A;k{Cp&>UV$*b83+prM0`>b z2A_Qf2^lGB393CSmf>hoA>J&_!v|$~c$e_}QE@I#GT)E$G6>391m!$@TAG88OS14u zSvEc{%fd&cS@^g-2OpN_;{D13ykA|6_XyIjmlxrg{7m#FC!luA1{B6EM|^Z7O5&E_ zbyp@nug%4oiges;l~LUq{7i8Ep|cv_3!DltQxE$paesdm?i{MYtz&h#`BW`#QFo73 z;lZH_d@m0K_Pg-!T^0C`PFbV54F4cR|E;&25L|&r2O4l|eubPIy-oN@pqcRgLsvPWxd`_u8R`C^yPW0IFERO%@c#EB zjrfm&Cj7^N9{fmXF5k9JK~r{OlA7u^H8#aQIZ0h-aGtV8n#~lAVcxh)pC()=U><{B zmw$v3yOe&fefh-jC7w_JCNI`<1XD8@pnQVczO6t>x%&$3)%a)qx32D6P9N_+)_sol z;U<>Y5??Cu&UquReZzF|H!BT0COz4otNGUS$;OYE{pNgkI`W%J@2^SHl61|SW~rZm z(}gogPy8|fvBm*A!6pHu5Yt5u!YHfmL% zOT2dKc^9^U@qr@oXUezJ7U+jHG;7$qH93jAtZe2p3lX!UFn@k5Hf@T>wjJ?UziBI0 ztX#|e^D%c$H1_OHL}^JSA-SC5wx|k)#ns$bkMgo+)+HbN5^}I>7jL3h#9{HW6?Jh*=k_wU?Q zD}o4+s)y`G#xXvsL3u=MU&(;8{H|k@fo6FS(D(EVd(g*#tjKL=a!nl^BAjL z6vH)M?eeFxV(K5snVRPFkP%)lpz1H}TQ2AYjzpZQ&mb;C02se-*qPx92|kus<}czwLn zG|+3OY4YarZfF)UEsyV#jx9ZX$P&wB+FxEgBQ`d4fzZtIe0BN@tX#fIjWp*Ml;Z2J zF5tr1tGw1@7(O?Q=bwHVwM`u;t!}`A#mh01#|4KHn*9P0yJ`)#FyL)jz6Q<7N%*9` z4(Dp?aHFoCz+8pfHRTv0G=HW5H5(rkXXE_>!fAdU-YUq$vsA&x4Tuf!$119E+ZHvt zEI|Ew2EjQy7cXUG<9Tk|Gt#gqA_QUK!B`y^hxU#xy!6H!NX$;hEdLo;I(H5_3ES^f zl;FMMEWBSzXfDaY3F@=bT$Uk+z)V;!P9rp@;UvNLvogYODYt~^&v^W&)QJ+7RaBOX z6BUIxQC*6YHDx&2SdEVxtMEl@4bC)`;?u$eoOh+*TSD^pgytV)T0{qdxwC?hO5mlw zB{bh1tj6u5Y?Ei|aGjcZx)!&es>b~zRd_VW+{`c-O{4UTuP=kB@ z1Z_g|nYu!psV&6mid?*sn}T<9lX0)11i!Zuu$kYly>ZR!;NG(buyaAAdIj-|M~>pw#0}caC~l11 zWM8^YTbW?L<929*8l(M;(4J(ZjD3;EX;`K`E3wZJfR*nNJ5$p$1e)cu?qh0=V%nkG 
zC%EmKeTRNerfstVnbMGxU4(@AR76L|>XF>Vs}LMAM}e}OclhX0^?Ccw-N?wu!S-#t z=@V9?%2lD_toT&*%4fN@3YJ;N3oH*c&d(FvC#!GC2()_H#i&dlAn1*WzaHXEm`o`c z!LA)URX#G>9X5Lbf6oT;CbFq2zc9y$bmA$0662*k&;*wgb4&|<9 z6cyAG>a($TZx-U#?ZlF0aoDnB4^lI8keO46jO=1$W|k^tWac9+BNtg&`Pi|2FXlxq zz`C{T@$}JSm>e6!-5WR5OJmg1aV+!ov2iv1L`_AJxy@=GGhvwkZ6y0f2+Za|Kv}+1 zL-V!&7ob@KCP;vb53 zuK|9#>^LjjDg8}L_eU+mC+53nyZpIS9*(qq$(f&#GfyKqc7~eop&XDDgbAoh$d7W6 zoaB&K=M^~}g^@QTX;7M808}I3GV%I&O7{7d{bakAxdtP&5$R_`=g%|EAzb+C3|7mV zxn>5S)K!JEX8;XB^1d(Q!ue|iXQp#?3ZH!PCEEH9BF|NghH70M0FBKA28 zn$`0ggys%ES8pPNp{49R<&eU~6pN8D1sQIX= zsYPdJH{rJvZEf9XZt6sBO%rOW8&OqRhoV9k3i4gpxM3&e&s&0JOXCn3DNEM*V_{4z zjz0Aa^%NE?iov$+JF%DjdBeIb*tLBRzWnSorbcDnI?GGHG%9nK)szgD@dC?uk=N@I z%cripToZvgJ?~kdd1_qXc^J2ET*tXHUt?+PQuQEUX!tyWGNCy%3Vy+12%j?-Yu9Z= zR!%-jODj-TTIGi3l43oFStM^Llm`XV(+jbAYZ9Ustj3n@iAc>TKvr&v+GggI5tv;B zd@g$ov;slE|I^s(RGzKvVsqspdkjtt?}sQy=TQsxR1p2yY0eg=@o zFwEZslx3+oSvf;(B|b5Zh&@j6`rn{!{t-0Gu?A@ESRnf-W#ep1H#D;^_<&{=q?g#$0&OnCD-Sf^#cvUvlOpviq=SzUmd}OlyW&S->=kZdfM9O8Dxqz9pdQnUTmq zlKJi|n2}v@+gaMu#~6oVfJ*`MMS*4+an>-Wr)OBu%rcpaU?5kEb+jbu>%1Kon?@c6Y?Z%$}dFPPxDW-_`{(hgEiZ*Vj+4@WxN#I%8t%EyOH0CCE^merPgx7wQL zNLy%NHm{r2=jny^P5au{%K)=)Xy$%OK>6CGi%J@rFL>3_z_Bf79{&n7tLYiyBuu9j zN$2B!vwpSy%TFFI(VhP~(^C$M>8roUEGbyEB-e4m?VyZwUr^UUI=lP2(hmMNpxNv* z$EbOiye3XymSFEJgLW#Eq-~w=lwPM<{_>@GjFI}j@m_f$-YO}=JEbLf zt)dL|De;I22|z?p5LPZ;hTY3oV<0mJ#|gx*q@?1Nj4V7WDlAfSnPVfv5IQdmtG8@K zX>mE;d+j}(`|3Punwu~y*dMd}W@6dgxp3{=hW@-v93-G0DuJ_=K=}vAGfx?X{R} zuf(;wJPg-l;SPcHQBM^f5t_fF9ur!>Cp3RYXnxdLj(goM+!?IG&8K-Dp?QM9eEpeP z+&)%|dxvZBV35GQzY0J1mgDyx7s1(u-*~=^7OVNphud+XsSM{DigBf}7#AyZ@kLn% z#u^LoV@EN5YR@MqyYM5Am$}Wqbe5?VGrsLB#RJ0j_XO!5d+P}H4Y)thii!RfeBM-n z*LYKKAT1uH8`fY&Sg4xb(%;sK2RH91ySOz%4h#GQa*upY4;u4{=W##l8&cuw8ZoE za-I(T1z7c#@(oPO+mE{YltaeP8F~M%#d8gOKcOtfC#kygSb4dLw1u2|o6uZaRjHtP z!-fs$=;%aOcQ4x7y9mzRXljy|^|YY6svZ^PHR@qHc@4I#ma%BzDlA&G0s;P^>@Pt` zBT#m*pSE>%VZ)ZqSi5#THg4R8RV&tFfB!*TACon@#h+hPOJm9lv1CfdxSEn7Tb9uc z&4k0NW4uP=^yLE1gu5}BdpnFfw{PNu58hXA=L;evM@BEk!lksmwc8P&oQ+~vEy}AJ 
zQC`u2Qo?a@Q599`mVk3HRVr`6EG$QSdj8ob@O}MvjXrBY1P0XSe7@V-k@%e4&x?4_ZH*K4F%15P)5C`n!r2EdJJ*j zHA3VNj~k}_$$HI0j34rj1+fe9%LkK>3j*w zpGi%n^&#(kM4G2)*A9t)aI(n^j|QQnkxWIVcUtMlPjD=fxK+6)$yXum3J5^yN$}nd zMmrf+K!Kce)M1QVqUeNm;Ai@hzr)d(3*iI{?NFry8$)qkG@d7*Y%RYxrruWNa8i<% zz$?=c=PZA{$X$&2-AJv8EM;{7qlQTtr8S^pJ5giEZdj6Zofq6ueiJ~nXwWipL+K~f zO;FJJtM;{^*<3dnt@cbqUjx;Cs(o2T1I%JDR(Wkn9WJ|p`P!umxJF>UCa>TA6Ik|? zy~dj#UAc`(zm)wG=`uj=-D%jeBLVByZA4sj3<@`H!pklf z&USU;RBa7Dswl-rRTX%*q5`kG%22s|GZqH>A#m1AELa$W^>Ld}uqO$}@=NeaW-eY$ zO~dhwOgx*HkAdtQBrJ}>oDe@mE{VqSjT_KTFu3^XSqz`MhJ#0sB69Hp%nI;FAnOn@ zb0%U!L$N443`=H*V|DaAY~t@bmoCEY#S5{IX=SZg2G@qQXiwUWV@26`nK1np!TPsKCLLk2Sr)G!&GxVzUeB%LxS=n0_r1f zA9hvZA%Xe3u5x_WQO@5>aqB<@Zah_kiD$H~^Rqkj>Bs3FVIRcdG^{QfSZ%!KCe zncjl~O&Dq~=S@c*F1NUFsWu;1s8A}ekC{OZ_6Y&!A^IeXo4b^{mtl0RUL$mmIE$KttELbFU)YHRC7 zV`CR;1)6Ib2%)v8tgJ_JQZ^PuEl2F)HHeH@$aDR%bm=P8H@Bg%vQ~T z1@s8dw{K42<>N0wR{DsXzX0Xc4a%o9cOAsQp%>B8aR?=4O@!k{6z13Prnnx(h1C{3 z>nSEO@?2J0iPV$~tXQ!Di(@w+BeQ~#T#c+;DzB2;a^x4*ps1uC+qUh+f(7%jzpn>( zZcO6-ZCMTGiUQ_4gymao-`o6to9%s@Hg`kdR{XLB&9Ywe5IrBHreZpn|{as1fINMX{lG;8Anr|@u8{98| zZu*8u$1br~oxYaD83WB$?6Kq=jn;5)#P3oM1?Qf6A~?JAbmnLBl;5QO`drXhlX-m?9ImR=^2b&&dxj!lPVccRQpL6?G2x%-vRga>YYiv<%H zVuw+$wN_hxr&RD}r3LcTJTZROQv`UPs*rrv4WjST&3C)8^nL?!@*to{?^kyqEUSSV zaW3YfspX3#56PF35eRv+oPI)8*Ht}z^#-wJZEnXW9Orv43({8(2#k5$Bd(w-6w zl-L$L#{3oOHUhbZM$%6F-Tj-Cm2D$;S87@bukWx-f#Fs`Gt(Slo&x*wRJPQ`Im)a@ znkj4Aj4$axpv&X_%ruofsCw!8nRZZ1N~w0$ZK39ru{|U>h*X=%e(P8@pN{9N2T`a= z%01FdL)X(X*rt+~q$}6TvN_#pP2P4H)DycbubL*qt+64sUw*fanN-(-$0OH2*P3mN2GYs8tFAn>}6b#EmNr({2` zyZI~CT~6+!jQtFqmlsWp;OcpS<~1q^H#D{3DxvwyvzKu0;xIn_^gKFykD#^Z0JiN( zP_L{G3ywh0tS~ilwR=w*;uEvf$j{z{Ozhm9jCHHmW9htk*cd$@WjnT_bp2X1?cRl_ zii>fKAf3H-En#w|qt7p#R8iD`Bi4)k95RZu2a}W_6hUoAR%q5Tq`_Dvhz$^m( zEX)oF!2IA)!t`t`;5Iri1QD}lVIhB8GdmdTBg3(4aTE$R#-VoCR&*xr!LjUgyi%Nx z*NXD+Nwo_XTI(^@--_G)&6sE^#?9seJnE^yLy@eK(Mw>KQD%bg_X5p?=WjYoaI4RS zTZH7Pr|U5ObPXm2nq{P!&@3-Oc`WNX_f_H-rtup=`}dAYJnE2T>00q_UIIR^C9t=Z 
z;YwW&hHEo1QIm!HEyeh*r38=LT|BlH-?ca5aa$AacQ!D;Mob>;!07%STsw9Y?{{}& z_o78=Il0AiBe8b*3hdsoogh+({RFO$-uZw4F@<~MH*xR!1aGF!;wJmkjUj>$!Ffui zX8=7U!<{^UWFhNws7}~VR#2Nh}Dv^d}34ZRr zsQQ6O{3QDl_vxv4a#JjQNZYb3cPBnVgEI5>k^r-;I;0>~0FdR8H^xqvwcicRET@!J zNqi~uV;>c%wAEJoPqW`0(S&WP54#>kg|(Ilzy#;g5?OkV;9TOusx@(lj)}!?`kb76!ZM*budso@T!mbM zc3DLe)^FO07#?$Qupj5X`W*LeO)2QRDI?3$o{Tj&*w(km61RA*MN^i2N$ik;nPGzS zHEypmu3V+RHZSeDCg99t<(~|RcM>Cn6Q+2ayz+d?x=~Og%>vKT229625Ts&|#3@T2 z@XI!JY|S8&UTaxviuIThsPzI^ImeE1dZ0?6RjqR40kD#@;(GJz45xrk8;4W zQ)Y0>0HGODHTHkmf>IQ3m8Gq+$v z!>yLiOXs5`j)P7_f0wN^oPlL0FiSe(xU_@uB{e@;HHsW3^)rT}>#8pt>mZwf1tn^i$7A(7=neeaw z3YEOgotdexQ~Gy(_rx+zlLRsWW~a2{*G^rJK#PXx9BpX6q<;6VuaWzDT9UGu`MPTK zRirwCZa*VaZ{2bA9i#isQO@D{0+!k?L~>1OV{8w#oEy&*XjV3DwV(64nrrJJwu8#s zTpuYT)7GO`EQ={Gi!=uR7KtXBzIZU1Z<7KtPbcQ~6zvlfEssA=mucMOg-n0pa2WtS86@1O%bIqZeO% zeF>kRxrDDSj^gMGZ=j~78!er^Si4~}p*c_m-H5O#Y}&9FyLY8wAE9|~0=My*NJ+}U zuHA{47Zr^N!s~p3?p(i_hz$+KrkI7;v}B2b)!6qoox~7A!*I z?s!z@XP~Aa9eK%nk)N^;xrqtLP1uM0eJRLENJi?Oeb~Ki7gj81o}r;w5Eh1**pUH+SNAZ4;iUYQ|t$BRcXcaD@Bbec?FHocIh^&&k@!Lkg1bO^o9%O}AvA}k9q%SbZ=u>6a6o&C^hZ|;9G+!CL{{g#ihSDw(LyaJ`}sbmg$ z_MJbF)j!40`1-5$Ng89Gj3I& zyHHzAXfAI?O?3+j3(D0z=EaLvsFf*Z%!Q!P? z-oew)zlo#IzJacuW5~^^P!ISO5|Ty5#dRzb;kk(5OcfW`5S(if&psTzCQKQt?MX<*jF|!G=;&bEU&Pm+pTr%u+b!O3$j!%1f}f1ks*z!~ z|BY)GF?IDkZT?9bsX(W&T-qD*cv4_6awbHfZ6ED z`RlYj>qs$O^}EB)J^B{gRP&3LI+yiu*3~Wd>$&xJciyanuD_NuEoa)gU8K+IJk>z3 zQI|r5&;c8V_ie^BryGZa1qAPIxr!V%kfe zZ|7q!ut;5SFGa;vtQ~rP^LZ`Q%SQ;z(r3)M=6r8xHf?02 zmxHp(Cvy)8$L259an;vso0We@E}uV-J-ha(k(v6&Hhg&EG(P?E0zUry9I6Of^=&<9 zXzj$pMN1Vl2l|I1cF}5V+p-Tkccfr%e7b_>g!l~g!hyt;3`E7mATY=uk+VY)6&8$z zgy#8y0SKQt6Y~hEp~2x;v}7gX)@{b>xQ$r4Y&8}wT80JD3lJO>h?xwo!8}%0bO@R? 
z6JcTD2nh>CPEIz?e0`esdXWL~0@-wlr=pzS%kpzk#I({A_aQ4W37LtB*t>H(_H5sR_-$LUk-)xb$s+7n z9E1493z51!79|^2qiW*{bZlIXlLg86{y;Mxch{=v84r4EaIde9;9Nt1uEw{$wNwM{ z^)%p4cQx*E|K0s{_~uX@?jEUC5Pk1JCB7Xf$D>{se(EU453Qy6uDuM`J4!Hov=Ps` z6495r6=(bEak0JxH|vY>U2`dZ?rmcpUAWj_$VGs5t7;0^+Qb( zXh(UBp35ws6*OD^-PpX?nGMadwfv&;E%Fec4a^3bjhuZ!*}l84*f#H$qhD%i|89Jn z@ne4^Q#Q9)Mk$l$GNF3kG{3>~o&L>EhkZ`^8M3m9k(XDB=mm?}$HLIn)rs-3VO}@FJ$-~4 zMDy~4wl8E`NQ|~(lEhr|cfSE=@y#+1yC5e=mD^QbgLgmp46na^0>|I@7*#c$YE5MY z%x*~5pj=cWBg_KP8k*zQZox|Wi^P;Hq-B*NJHHx*C5fT_&pd> z%W9s*#WP>v*5nv&^QQ3Dm`rbw_7rHAHonZ85(hNPb(ibH>nHyXT%(-{FzZ*u>+9)Z zPqaIUIWkhKUZ>5tt45kFXjYQx8D7wAOUEm3F8HhRKG(!RZKDR9Mbek;7^%ei%Q@cA zJjH$H-;F<#IbQw&QF$x-waZ|n>S@a7W$SkT^6XpgbFK4y)m`LF(>sm7EGsVq%Obe} z@(Imyg5B|LJqenPQ*cWD8rBa17%_brg^&(JgHzKplw+W{ot7UB)ai`8p;?2HE~~S# zed*7jSzQFVz-B(P*ZJBx8-VhHW-m>v2POHMwz3^NCFfXqIE}}A*3(9$1L?A=Jl*Xs zpQSCG&}^=q)N{HsH{k5KkH;&qta2S3l6F&{nU;Iy5=uszhX`Fp!zwsxdDr_%Qa0dd zH*ZOu&9n;jx2x6uQf~!=mspoe`sPW^Z@)Cc&$Kbxh92GUA*-!Lx_(}=+H3l?`s>gv z@&;y+nVabe0b26ifU(hEfo74OreV_X%u_=%+rtB#d7S*FMuRjkPY{Mg0?uR1OO{6z zhjH=DSriu(V>!XKxU>o%efl*%{pu2)dg(0`RW_h&;4tc%+AwGCd;|uBAUre@>(*?+ zmQ8!HYi9~}?MlY(J;{iVPs5)06eOi(A$H|T1cn44G&B&welrokU>WM?hxri^hzO6s zvgK>AX8m@oVh~)iJPt9jD=~lJ5`@j33xEGW`1|=IVAd>z1rpL{`Kz}N7Umb=tIt2f z_3;s0B`jaLaE3N|Uipy8(P31SmZ@VG&xu5BMmCZdY{L|mp{Ud(-(2`EN$WIi}cl& z?~=bVCEzje++f??85_kngywIrkKqRG><0U!1P%%E`mfzF+6uSqBigr%y@}nipYtYd z8Z^^R6*Ln_MEd0-8k99Wi{JMR&0_1yKRQLfV5T)`v1}}l_&Iw(;LpL?ql|8ePh(l9 zfwP8Mx1Z$swqK)|j>JlVMrjB0kB{s#X)9=Ey$mpGXx29qstuT~(_g82TX8}DmKG=x zV4jkj7=nuIPf6Ai#o3bYJ zoQNp(is=pOx1!utLzpf^cDAfEAuj^Sg+KfDvSrJ0;**as#(bn5)O0u2Q~n_{|B_Aj zJ#16@N*!y(W`s5L6b%7q^>_E!7045DG{5_hj?4VaW8MABeeU$OaLOYM)xW39 zr0Fd+s3$RAVm;f&3C%K1dwdd%^P;j z^b9?!Xws33%sihUU+EZwF&98d9nWLsVL-1mG~CcYd9KJnv&Z?A$-|=_l>BDSRp(n5 zK<4MF(QW48yx8`|=V$3Sa-5ZhXWPq#)qk;=b`|4PPJCLujpR7beeUwAbEZoN3I+`| zqAXA)x+ld{qlPeHpJrp!sz-&RY+RwI8 z%foo3>%68uq1kFLJ3pg;4Vp#M!C8*u^<;1~b2Z&|#jO(#%y-|=%(N_ER?y7v?&Cf3 
z6S2;!O=K(5(?PMK0&>OYsu2(41Doew{aJiui1nJOI9Lcehfk)=3yoS_bmS)2Jk=x z5SV4&W`Msxs>{oHEw$}lJ%5JRneD(}H_U)|?czC1jg8>=3(sL$>=Fdc^hfxNnMhi_ z21i_lc&)A!uU6#XrQ#Gkl^c(ay&F-md;#{&3&Z;GU@Q(MXom!2-I66p-4F<26?ndUkEXLBiBKwGDW;r3a^v z9mn`bU*Z0R5!}B#irc*5oKk*T`iBNh8KJ*EMvdwvx9$?0?@x?E9+0}pV+5M@U%E0% z#T#}J`-dJ?G`2>6=o6YX7^+d`p$qi28kn8XEONen+Vm9z&9+P*cb3Njc^OFHZ*FjA z8cs-_CYDjnLuMI8Qdad@{2Kd(8vSKiJf@b=)|7a@@o#RQVDJd)4q-yKSxBak$m zBh4PrY=OD5V~@JB{#IO&d71`HWG=2i-|0_JVAYCc2ng^;Mn(qO+S<|D(m_zJLv~g^ z@^Xt&T0+Pyt5#rKTvU#@H5(B*H(HG{N6uZKR-ce{o6}PBkd{$|?3`k3+rC?YczSvo zE)q1QpGo;-3Wj>P+Fe&~iLa6CuH&tN=E>o4@`gMRI7*-!!{zfAuyXkdL`KfT3opKc zFTOg5lc&$(gA-pOJGTrO>4gfCWyK4Dxddcc zySafk`HgT9m`lp)v3x}wX7i>zH$Mwkubsolh_o^N9^3RbZw9BN4Qb=@Af2?Syo}^J zug4_oIZ0Tayi91Of0O|R`L{sqWSI3DB7+D(tLcf9`X|A%H#15p|J)e1GOT5wS=LsT z-}D3SPmsFaj5!wkx^2J9Z+W2(otduasOHCGkT#Ewj(0ZDeA+npI#ih!XvtTrUr3b{5V+ zkdaE<>JUy?5cwQgW2bRKvy4DE3#R074Bo%#3On~1Xf|_r%|-WYKyNu=$fTnUMhwNi zc=~uL&va*{K!)dfK$GO_JfAXoI4_3z&7<)xJqen{(47Y0-Y@xjwY@w*W}sP*-m(m; z&JJ1S(wcr9+~xH;Puju`XxgFc5og|66PAsclJ zSZvMu{$yyLuIYJ+SZ4*zBHD+Qzj`~Qj4TT@Gbl`4Ij@3)>>K00F$w4wFX8INE2yh! 
zR4+c*zAG7TeefAxeB%RDHTR&Yy&ne-Ka1?VBFyp&L}+LPR*jcD-4c&& z+Y$)ONeYco_+2mG`DslE2jt>x9mbR z0eo)MB81P4LLk%j4-SK0Pzd}20^sNGhd_dJTT3%;jEUbOgbXco0h2SjfHp@#--YrbWYuSl-Dscx|cC1Iv z%Ej11cwWTgqj=t;kWg%5fX_%sKv8A}N^`SNl#z;t+*BMTq`pv=j!Ods-otIUf3y?d z9qXdHwVvkZr#tYN`hoiKXa{~eLLKeGgTo!TexMa!cT^MNv#>ue9?fZ6P?fL-#aovn zb43&qqQbFrUNH77nS)L9La-|`43(>5acI|OyqT4NQ#^09Di3#S3vr*IeZ8Rs6V2r~ zQILXHvl8(^V>LcMdIUo!Pv9o|)VC8;_=b>qi*_yWqXIO8_&7m%g1|g!%~8C`n~vKA z=lkQ_G8oD-T>5wdDuakxxrX1h1U!x1(I0w2v+@t(M-(vArx2R;*7h+*(jPTAYui?R zP6E5Jd5^v=zT5z_jCyNm*1q2MKgQ4bl7=6rG}wB>wE)pD%O)#-sIn^Gq5X#4uUTG^ zJP0RqdnGRv?7;-h1!8_ee;{JLC}F7E}RcC_Gg_ zQP`@&BVpwLVajy(>Rzi?FT35j=k77i`F_UzBlFJrL`G)5uLRv|?R`GXkv=lwiO0-{ zjErE}OrNP+_9JPa+ak>7gQ<@;15}`!aRMCYS|M>Z((Fie9P#IM-G259Y%&?rh1Sl+ z;W&isAkiiS|E>_cPxt>vNOK%Um=(ha>8;aeXCp1%m!#t@5#68gyv`@x>utlsotI4o zbr$HN=Rk4$57Q?65Zw5L0v@UhBh9|j;u%vyo(LJ}K%_7-jJR`%j3g0JPczdaEIo5N zEJOfv420`>W5o(SZ7QvIn3f(3tiaz$gGwHrp$bVBpoeqDd|xP-xZtAldPk-ETT4d{ zJ5HKB!u%$KY3h1N{sIMz!&+j2Ld20=`3BzG+IvKl6Uq|=mZ&_0FC(wJ9vSzZ=)92h zjKKx71O&2l`XMlg7;#QgzF2uahIk(Ab4U!njP6*()xnRC*NS5z%OcHC@+N=Il2(d+ri4GIDHTh2H%Jp-1n2=-j0niE?Kmh3{#P_Joap+>(iRZ(gz7 zv{xHu67`%nsaJ^ds((uIET>7m-LP&oy7weO=-vfw+qbneI<;+s`NId}X!=InC@#Q_ zoOE1XzZ4hOE+BbcfG=0i$KH8UQ9OAxRu1Zisa-l_7@z6SXNPv`gmJxkVa~`ASUh(1w8LD7)z*gs(ej?bKm(<>L@{Kl0yzit`o zD+}y;y$dGd*1js#Up$N7vwr`M zATcueh(^`;J6Or1@6{?e`3b`gASdWjbd>xsD{V z4t2B-jrEj|_oUgACc*g%)&=V;8lF+Z=c#k7tK_(#I+Hzm+cn!uAk8ih4)SRcX?BF! 
zHJ-gMNQU=e9wN$ah7CdG;xXfjC(3cyf-G?aenU3rhbtHur!?%vVkQwM&GDXQPnq1}$L9_HoC*4Q}bWhBlk*?3*PdOcFpx>*DD)Ym7_{IZ_Dp8kPkvYBO8 zne3};yv&68w6L*6P(L)Yk(*9aKasknw_i8B!qxAtlI(Y51A0Dw`0*}o-+PEByms}* zPguh|ZQPKJjPzV1%Int&BF&q*)wN8+!ud;CzO}Z>-^x`Rv0&i}`@%J1bLdMH#Oot$@{y8M%sZRM)uPe%csr-2N7lm?^|7urOL5$6vP zWsk7qM?^W$<^L7Z?4OH6oX&BQL|gG?_2sqe_Kf34sQO&R164k;F-}j-@O zKTxJ5xanLH^AMMzhdE-Cr${Sx)ew+oqJhM@(cnmPgY?4k&`LhyvXo4c%S+iRAHMz~ z=$&*vNqI|#*ZtNIC(M4DH$l%?`7J*mhihFYAt;;Dkn>K=hxTdLL`0esgMof<(bzrp zHaFX6NY?XX2J#x&=}YV7>C~k+hK-txLr1?x)t*nVjo5ea7!DpfigDv7lgM^Nr_QP9 z-n|zF_8*2(BPL_Yq&b*9OFhjCF?aStEMj2ON@8P1j78sG{V`?obR0eYCB8gy8lQgl zB|ba)6)G!sW9Z;f=+nC|di3aJ!!VR~yY}tSp<@SVX<(7vSu>{M+2e=Kw$%$n@-MO+ zLdUF6eS%ew(%V-J*qpI}1fU}Z4I6}%)J`PM?MX6Ruk+xvG!)I7jqkP=;4X>it=x6E zmc9g+*UiU;)$?#_`CNRqXcqR)nToBG$6>>e{+OMbg3)@G(29kc1A6+X;aItpLu9}JK z8|UId-cmFauEDQ`>G(roCVnm5g66UeG?lKy`-=5=U%UdpFIkRX3YOrPlGS)wxC-~P z7vaxm_Ts_r9NgJof+r_GLF47Kczx?8{*LzlcO)yGFux~B5LwXtt1kn>tgZDlH`Up= z=ATKJHGl0p>YI8P<-6n;9f{P6q-;0`aHLs!=irHx>v5KkP#o~2nb3X`tX_Gaw@2x> z`?AOVB)xY~>~h4}k!3+XG4d@Rg7;f~ct$kUl=cOcGqOQH6ev!b!`Kk`0Z$$^lgMkP zYql8Jz9-FYdO1g&MV9?mjX?87F<+J7w?qa4LZEth_UJxp@83mk=4P{v*>mP#_wK#e zzGD}*Z{LNovPxuSR&2=lOLc&TC1U*J1Vg z49r`wlxeoL9}92)aKqJ0FpNu6Tv{3BA6d7)%!116rWSN{Z$o!PoT+OzY=&}C3~zqf zjB{tsBQ-UR`P+aycOT;3gBsL4d4X%!f5fJZTWs+a>tQz1oJn%O*$DpHHJfaR$r=*n zzI_HEEv<(&-ZcYA+jbpo71IvwQ_-z!Z}jaw5TDb>{M`JG^3$v@Zv3s0=0;DNo2>Ep zhBjfdFR5JSD;?xF{d9UBHmxGdYjv*sr32E{fb1Hl3JCK{zAxUh975+w#-eB9dhO0f z_wpV8J4my1Sa#~|DfT`C&*}a+S8<@u>AXmC68!XT_VXXp*B8#P^kL9dTQJUy4DyVpoF-)W5|ZG*B~!$9Tqq}eK$1&AjUC(NESClAl? 
z%A=KlJR$Js?7m=YM2ndb>g$-E_5LV>R0^+bL7E}>jCyOMbIcEbbz{onX}AiBDm%<`YxyR;0foKHBFz+dyhY|(Vy!4F&-uK>^B_2x z1PI<0^)&l(#FQlt3qVT$e~~m3E~t5V@;J~BLFpuxrzI4W!k!!6@1HfDiObQ$D0b~Xg1!5W;K0F8F>=&62B>!E)G3wdLc%)`qeo80 zv?=p2XZ9k@o#U`@-cn2&HwA-w55fQjxy>2bMwkyD{Q_T~{su=r{R%UtEky4g{g9U0 z1s&UWU_fuj07zm>(wx%K5#}KS2jIc&TMPyyYeBDA99;{Ho&iP{)%9%Ckog zaOc(y%$+sE>|@pHHQ2ZRpe<-3R901^u&4x?nHnFw1}llhiOvxzC((Uas50zjScjA`r3CS>93wQk|4h% zVQvU}nnjp(?v2eq?ufIm#{|DF1bQumombS6upC5gW@n?G z2)|Kp`A6XI^F-T|$Yfoz;}-8_o_OAKl?aTR!SjC&Ex+VOww&;;R-R2j?76s4_>Yig z#}gAtv-d?Iu} z1!U$M>gXtJ3>yn-0%3&`)~ZW}GR^!%N-KiFmTD`c*$eQ)aQS!8GhS%H^Ogq75CmWU zTvQ>1$f&PqAt7%LQQldm7-4RWG^;h!Drx35_g=j}%7YLRW)HqI>S@+Z`x_l<*zeV~Vfg>vdkGb`7R6bgiDG8o5E69fO*Rz4Xc>M`UbD+TSpD{yyu(FFh zJNU7c$)wUi8L0K^(~Ha2!2BrxahbUH(ZLx}PrPRNOgiV!$L&G;4%*jsL1l2|_*j6C z!(e)r2iFmP=(ayfs|Y8?LT zD;zw09QzM`hJAYvqk7wJ3>!MqNN2~CPUw`<#Xf~j7(X2|rfaU|C5|vJSZ3o6hYuNz zzCHS5!kEbn?1%Bi*JttB*QfB+*>4!A*OM&wwlp*ZLt~FSbm%}L-U%Ip9_F#5M&PF# zS8YyOTWBIqFo*>GULM)<=#rX+EI)*)%ke4EeG!^v+$-e6HOJFs3){CU)|Ph)Ml^8 zgRIqfkh=yyWiG{?tYti2g{L{|&{VhyZ?>l6cU2kq)AmjH2jU;MuOrsu&($08 zXP*D_&U7LZ|F|;?|6IKV|GaH8{-t^o{*9#h-$|hVW!onFWj9Ilz76=@j^+3RiSw@| zi}6d52=oTjmS*8k7f<2I@q;+KeJftwxrw^FKj78l8oVY+drLj|nR@pt8=Jq?>vQxA zk_GxFA6Szh;;fk&bdR8~W&YdyrJMzkKYgj79(mtjx)}Of2T!nDpXO5O2L2#P zpRETtruz|GejHr7@-Ojc<9?vES&u@Z?az6iDZ~SDy~t|hLkvvc`=vxU(yX#5hPof9 z6A{MPzz0fhPJf+Vd-#|XmzC=5DM|CS%a<{7*l^lNTWsE(g*|)sWBZPssIJ~=qXhkhxVFdc9o3eH z`r3!gU(i4e?9BWL%c!xYnM`|uIoz4;kix0a(b$?fhv z2k@xo8EPN1!CvA@L~1dYf4!Mw&&G!~LL>((a?~x__FSO?iUo({pKRD2Il$FmAKH)macH z&VD}`o-BLBiKGzMtrpxS$WKI;Js3yy>_-sUp+oc@UQd#jdzZ`42N66Mm%D+w7w2iM zK1_!%nrUR%Mj5PRY%~<-t3u5PWAsPjhgeQz;XZKMAL>Tju>mErZv?XZYhE{s~*MN*UDpVaA-LSiEd4 z#!i^(=1?9z2_uJ(MVGX0B(|LyfK#ledC1^Vm^O7TX3tU&^AgOPw-|F5EXBmhGcmCL za181{i~(>nKL7F*j-NV7;Qu^Qj{knR82`E_3xChN=&Ry=#= z=J{nHXpJ=MSf3+(z)-M|&obXam_Mhl4B_Ji_Z?|=#3~AwQLu8oj)MFEpP>(lpkt-) zbU3nK+Ih^b9e7WW7lU=k1mJPp7Q%FxhGL$@J-_97>n#qTee)GeTRz%W9Axmc709&D za|z({kxz0^ypqpQr-qNCEeP{t=2O0b*FxVBkY>RXXCKVHtW`D59&s)7U8O#A8RC 
zyL9PB@;M1BSFOW}6)TaOlY<*qzGJ=CCXi+T0YU!09~yfNy^F8f(|AL*Kpw@Y&HX@tCn)#JR5i6)s%3jEqfrM1DY;bFpqs7Lmzk zGD70Kc1=3-Ns`ZV>(-`|Nb`94YV_&T*Zjt?p+j*0?wxQh=ASi&nIzT`W>1>k(2TeB z`Wa9wU22GiNV9A7#PvvUe1Uq+vU|PqI_2ZO>6M5w?WRef4K;bA9d8lVwL1=?Xh~C5d z0{JDBkI#M-!E+IR3lP|Z@wVU{tv&C`r#kc;!N*kfJS$!0GfE>4Js&5{Za5D~v(LQG zm^X0yGmT#Y)F&kyp260Y9G>25^B79Wv>(7e zcbEfn&c`i=!Fxn75=gUJvz|DMAkz`iVMm0S$H6udxHIUerQslAA(+GR+rVi#1eLCB zTmb@va$%mG;kd0Wp-BhtoUe1T(Ke<$yf>^1dq@1;4(^A^C~qPN2|djMpAD78;EoNF zDesLOJM(U`4PeIO5WOqd#$*yf4&FifVe$ye^I?cwd(v#5CslvaGdfcXA{I!bKg^4^ zOxt>MSeAs|W`7OS_~RI4&8bVK>Lne&^{#%ZQGGxwk5G` zr{3i>3%qUHc0l)Ty=_IYd2>mY36W;anlWSc0t_25-g<<_j+ul#dk*1?6K8Si!X@OF zSE6$dH>F(X&I-(QgXEarl>O!`b=$KiSppKk> zqWr0w8lLg(nbSs^d-v#x(WA#;>dYBv*P$JH_Uehe>|B%;Rbbw%CFt2>Am7`;i1RF7 zyI5L;C%enhRGp7sw`b$;cWl9*wq@XtRU7g5+c%ONr;{{q#^3KEStkCpJq!P&Rs8l8 z;4eG#Nq!6QuO!p|vNs$5bZ`s)a*)J$KhNziCJ`p~7LhO)y6xT){1f;8h5P@quL%FZ z=YHF>1;2`1A1J~9eRMniuTRSH-*)8T&s#I_Hh%-&?ybOQYZha8`!-0QGXu}BUqS7S zt0XXNEYTO*Cx6P!zHHGKYN3WVb+sf1Bs&ZQ3J5;9#tE}vU&ch7xvyawKIlr$l6#yq z`?#S7u!h9A&W9iv$=0Nc@>K@vEp64#Q6GJPb)Y}CbJSl#dToTn8plrGL)+$g1^AX} z5WzbIZy$cVsm=c$gPr4hTYzPC#F$%_-4OV0t&AA$%hyw8v{Q$`PpEv%hm#Fsk%)2( zEr&I_co`?o7CR|N1Iy)pY^Ck8J`~%Y*VbS!$*~Axmo90@%P%m(Tv1s~!d#6?+D=({ z1&WGFQCLuHLo{q?M!OD1n$y_m6;e`U)9noEu^upB5H_sehz82$`QsX!_q>77Pqwz) zxbmWWu#DetX)~=@>b33{IW7fF}WIt|w{! 
z;`rA#Q^w}>0_$boxIxq5Wnn$Bc6A2Ju_?lejaa^H9rribu#M%*R%6h>!PW@Q+_aI6 z=C?+8Y{fs;;m;yJB+TzLmf0Y(+(dmRVdhp#C95+}q}e*9s0+@wMmnW>QNtmwS5A){ z>{z|Z^i@p>JNVfL%PNn10 zv$9y3;^lE=bnkTeQp`y1XtKPv0yTIO>@PrAuJB#l`uEto-1A-zEf73sz|f!* zjoHBmBtu}-hL9+0OF@v-Y8g%U$0^S#eG1Bc8X;te3iy%W2s&p!{hEW$J2Ip~`WX{i zl4i}`AgCoT1g&`Q#AC3s#>x|xi?Fd+VL5n?!VDdk7pCQl`v#VWd5j!$DYw@8P>m@q>Et^%bu-t1CkJlrHH{^kxQI8?M3kgr8Vt z7zWh?b(r#R2#^q9d2bZd_`J$*`CwVYhKI|ejk9Kbo6$3EHzk(J=ko7u<)e7+iG<3@ z=d6dB?;!kF6OU+lc~^%b{|(HqZ&-M^Rx$6PUFdIg+fsiTRd$tA@?aT!uYnr|Bk-Bv zSm#3{LJ0y?FWvPaf6c-krzz(;xmTK0o#?1`ZsD?%fBXOIk1V={tzWBQbc; z80%3UIAEBKVea0o4?1`1Y6QAx&%PKxZVCzWLM&Ri+-A<0IAJRK_8WwL{fA(~rYwBM zfcW+4Z?S#Xe)JtU6zw~8K}x4o8{(1DNj=tXjN-t7190S%gI2y*403OpG`Fj>Fa7A3 zV4K0QFOUC`Lu1r_ZhncI-(N=m-aXN;XK&1%IS-2#EI~@g6m;v>4Ov_A@!4nRiHoSL zK8z&{zCC*lKnDfgSu^nL=s~>wbQj+3D#p(w&srw?57k@n2NLH$Y+Hxl@6giO8ThBF zZ2Swa{{xBgpAVMfpZAsH-*<1teMd zn7)Q-=)CW}ku8LFPk5bIL(%Dbbgea*jkgNg%M03#Y{1*D*|O|2vQeLg{H(3Y#`}|D zoi;()5}B(3M_~?mh6bu>rr3bk;t&YeSs`t6$328m>e$ zWh}C6^EPwa%(A>@)fhK+ zA_nyzi2f|sh84@h-sN9kHrkYl??@=$G-$>c5@ZtQ=Em0sEe)?;T^Ey`o;m;GbU6?%US8yF=)FVFY5$P)L(K%fwT)IMJOG1lxgt{mlrtVQ!ecIu04eFzt-NEa#8-n77 zba-gi5^@V(Z{l;-xfH;C^`WgO9|`YFgx78DTTE0QgMEM8Dq`jG=MtVHA5tf(wy)}G zH}&Kf)bBX-Z2T*fLoB@*LL20D*`T)SH1PgFjWzL6*SL$yrTx%H`g6>m_kY%d-$egG zv`HqJwjUUCNS26rpfv<9FwkMTKTBD;Mw(ecPxPEfvoc8q3CMth3;1!R3I)l6TSfWv zs#H2dIa&83p#Hq|WSNd5gv)yzzD#krPu4AAq&c|eKk<6_3W=1+?K@JU>%JURXd0z= zuC0@1cg>GQb!C*(_efOgFdzg$#DYMqf~5e7&kNDL$?#=LB+XJN1sX?~J(6Yg*GaRC zJ1SQ)qW8I=)>{2|cqJFHeNCe2r*Cpjnlh$rtTnEB2p|LUg!ykH%}%!>FkdW(5mcrX z!br376b0jtgZWUPr2ad-Wd4jaGaoERGHLekJhuMeF?SIrPML)X zlV)SgSoID~!00g(F?85y^zJhVy?Xa2+3m`JtlsCYwlwd=3DX#)=VH*n;UwODF?q^N z?Adz=r_Owf!=HVD5u+xcO?wjM)UN2zu@l;-bTrbefIWKD2wc4QjeXvH@=!}+*W)dN zp0^+G>_XclR2GqDO+n|Krbx!ySB-egpg&^BKn&>J7wPLVuzKZM<}(FdyLHFLO*zch zSJ<)ZD;)atEDn5j0{JDCm^6M0W)2^M^W_C-IzrM+68vs^0p4%VCwb1nuiLZmTXi~q z*|`}%@7jz%@5siVdHxUF{$X!E{{2uH{*V1-_&!@Ka$hj!!d4{gV*ZF#tzyB0qdZN?A8y^<_6@2|q|kL}01 
z&-USJX+AcN9gc34)xf@eapA%@rfZrzSHqjsi>a?8*2~AOeXR*>$Jq~c(c1=X#eq85 z3Y<@q{y2GCkQYfdJh3^41y7!hAO`g!n~B?fT;IcW&hSw5fg^&L&tnc>0M`${%Qf6{EE`4Qd-ESD`5LYZiO^`CCuV15Q;&6@SNdFwVF z)jY*h#`mX>>#(h2C(<|O*zgS3)9i?|NVCQ@YdFTLl^d~g#YXFSUcP)Crc9fK0c@D} z=-vzc`}9SZ4joCFm*Mwsn(=c(gAr%-GTV?O*1ac{nm^i+x-3YGEzffstj&Ba(@Io$z)YoNEm}wi!eiGI@NQGz%g$5gfTxCGkEZ zY08(|I>U;cBittx2MxbyHt>AR=?Y=ltr95{zDr1u6X61b&b#|P{1`m9xJeu?%_!|e z`1*^JW+{{J5sfDI=RFBBa!Qy;nqA#%>(h$oF&X}z=$(mm2nwH<=1DCQ&5UgVm#6Uc zsq>gKe+k-mN<+^+{n4dscM@*(G&{mPal&}qxPHy*vG&meJgs?T_1N4*=g)dJ8!~%T zS)4=m<95TIX0r{Jk@tCLI%(V(4B-39OR5-%Gf0{{*(d6?Ycp~1&>8I6cN)7$o~w5s zMs{I2h728nw03Qcj2(Py1p}9I6zf|XvFz4WxJ=tzp#_l}) z<+C#U>&cz?AE)->zaHCx|3*UmU%ohi|90Xq{>$f|aQ_hgndJ8$j_t#5pYFn|eHEy! zD8RKX8*ygsGJLjl4l1XPLH3woST}Ao<__$K$vwJaX5XHe#O;XGj_B9E4Z64QfZ+oM z7&WUkp%l(Z6GoB|H=^V>#O3WBg`Vo`qW$> z_zX?yreP3`Joa){{87)W(tOoOIjA?TLAL$`-(CC`g9i^ZyU`D{{RRxgpuxj1aL`cn z={pcTdh|uN?zF|Oz0ju*8;vBD!-kEbosJ|%VDOM(wnX;OVZ)G`*2U7%7oPif@8X~T z`7dZB!F>Aop^;|a0EkGlV!q(`fg{b_r{8;}In&j%Y`x8{r`bsJvqzLe4Ssp^5|_U{ zZ{%D!9I6_GOzo;&hGrWBx_H zBO-1dzL9Q2NvU@pA8G`~5$R{7FCGc+i0Poq4{b4S6VkmvCrz)V(3Sz}1#aNQ%rWb59+T87uMI;Z&TjbZiJPVY)oeSXa zGZ_lBsSxH9oZNjKNPgl^aa;D#*CTB`X;v?@C(YK={Gzrtfm~X`M4T{rl92#YmQ=}LXv=)zdm{N#Hh=K_qD3~i`j9@TrPcS{%i4$bMA1_lJY7p28O|kMt zWo$`iypVl<;*i3aQc-E0f>9PUOz)u0NfXur4U{{)R{<+DFv^ARP9#lkKaS-08v(TQ zG>Gs%m!`ibA^(K0zm}v~@<}Ak2@NMIU+bjV;Xl&T9KA28Zg@T!(nUA+Hp+Y0Q|1xW zL&A7S)`JW75xisb=@@B_=}|a7o@Gh+K`EF|*5@fX->pKyP=V0Wlq@M5Hu~IRBZ+zz zuVY{Lf;l$Hz}JC+Ws!X`-_8yKd0L)HWN7I13b52ON193A>5x3kIhmcQJj$nQbiSy4 zg4%mEc=5QN!R#sn>>zYX?SW5@eSyb~O}KvdK7M3SyK(;sw{^Jnpcc38Kg2zj>++2s zuqD3~8#m`+V|p%Dt=WhL3zlN+n8^&%L(!{8e+KALsNS&`XU=?sCCgT$W9M$@K?2*G z(9}>`K*4&r1`WdHi5~8=|ph z#d7rR)epOO9>A8(x#--fi~Y#Ba#cF^?LUdV`_5p;?k})s|1o6bl%Qw-!AR|rg5^_2 z;?$PaxRvsWJ+{p<_;{PuTvOrr5h%SQ*gWx7Pf1E%191a>6h zW28CZJZ&d#yP+LW_q3dF+-C^B!D&IY6`vs64Q-M6i2F;A)6Vfq>tWrBoby1o)=*ztZ^L&&(%K>oIi9jXZ~3a zHjbIT=J|L{ZL`(Y`bteLegyIj^Muh zwnV7Iv2!2mX-?2VJ10MuXtR-SI^8z?4#=|gn8)DlKa4AumLMBa+OE9ucef}jA-)#g 
zt9uCThp{pV$z}7;wg&08h_fTm)GNj>KRm<9+Uc3|9gf6D`EvKg-xtC80dG&fK0ZX6 zJ^YZhR!MV0PcxN19;D+6;T@fCb%d=CW~_D6taD0(${8aD*7IRX!~*!j#-L-TEUs`W zWH>i6^ARdtAP7%X;`<(c%vdsf`4Y>i^r?KY{7TUh^Be>7=pCu=@k}PoA%goH5-ZQ` zYa6}uKLejnpKg?PfLL87lV*p+0AyqC=){xBvfsw>{}yQ$)bR59=eHio>6;_?fDAoh zu==HrCO5u3X?B4(X8ZwZPR_60`=JJ!dSz^1WnliyE|?GNX{J7SJ(I04zmXG*;7BvC zn_UHR5cz(hb1qF?^F7V>8I^iiM~FD6#?a~}%I zwjr;$0=Wfc$jUCV>7b@gnMoo%!g`za6X5vqQ_(rK8_8-9bnnr}#xl2S=T`dD6j2BE z?#H{AZ&1f_J)>VzZ~JowTm=nHx8xnVbB319Bg)sttwiTBE%{PwOH`*-a^ z*Umk#Z}(y3W)-1xN*dC-^u+Szo3Q`D>9D7H`|cxHMk1ZoXCOLt>x^0B2IK48wYZbB z8ZQdg;d#k=yr{~+o1J-hTat;otkrmyxduO%(rLpR7~yG6+_#% z#Xu4pErz0HxqA~DS|MVr9x($+v3`DR)0QNtT?cMOV%po#m9!o`Fks|xjGs9jBPUNl z+s++PSzU$ZS1-}P29H}vD$-@s8|u9D$bHpIgwC~wmkhe`W3P+eH<_+7?L@f6D#CwF zw+Bzpz-n2kn=`NoeR;TaJd>TP5n1m7L^S+ZS`PiMQ%e~8HF`FOfUCJnhUiHc4{zAWeD#w0&_o|0;#e;qqj{QZm}f_v)hn)YxOf`H2$d@u zf>Iko^T5Ah8ZVzc!Y|E@s46eT=ux9^`t&*6yZ;yuYBb$kGcI1diHwW_Y{@J|#-_rM zG^cMQHsxT&@(q|gX(omZ9gRUG%7XrvP>;I4Y;{UWwMN9eS+nr|brarEUaBu6KqAZB zZ)&6-H4xm_5CIM6i%7HMA?^#EdOunIKrcf5AT0J#=i-nqiL3Zn=OO)zA3LHN>5loZ zSo-#kn0`6ZESr=rI3$d>4|UrOJ86wHdw-Bfnsw~+(|Ug6oo)J};^$ya*8jf+cJBvJ zr#LHJ3w(zw7q`YS=;N3VKfKJ1qfaExg6|CSw&TlXTb_%TPkC4R$z)kSK*wP}0fu<- zq2U=6Mu0?xAehNSxXe0((f!Pff_c!>oIs>?%?K&?RY82#!#f!X&sT=Q?u$s0pu)!y z5|t#Fkjz0bZ^=+#Z2{(wjyRFvQ`v0ml*rAbKrml^{d$jZ&P6JsRZ1MmWeO@Uj_Cf# zxqCy1D}QJVFA{r^9Q5vZS`PMEhwm^MT0JDb+sQIcnwd|^(0WM1`gQO`TH}Ru%?-A1 z`Oaj--}RyTvN}`Vf_*{b=A8Z|>Vc4)z6i6gJ1-OcdH*_?Mkt(0NJD*CUX{5u(yXzX zVeij#1&PQ$;(;v=f87v}X4ezU{Fx0fKdq5w-NXEA*pmAR)H$|@H0xQ)K!lkDS7;J= z+(Y5|X{u<9wE)8{!EK>uCX+2&L_0O8{=uR!3J$Q}-yAPpz z*PaZb9Z+vRk9o`mg#LLnQ{C;OS?iJ-?eEW9j!t#g^cW{ydk zdd5b)dEH2vXh_Cmyk)?VExI8Lv>U=~LcYKg*I2-f6XPcRC=@_4PbSQQ?8y=280=h} zF!LGJpX5%x5!`rsQTzb&Gdk^g2e*j;fJVhNog5CQL+7GtEj-A}!eh_74 zJ5g9tg}lP8Mx3|g7GZOCJ~m}$V^ex21`io-jWd09>D{*FcijbKvG z8lJ1CIhizb8(*|HVs8}fqQ<~$?Inw4k#xTzR1e5~~{Yw7;JY}^VJ728oo zr072h`mb}RPU81JzqQz`-ojV*Gb{C$^{6qlHlEfx5}bcjM?#!58vzrE`x{BK#iU>> zy^6;sgX)tp%IT=;m+uhgArjK|dKS|;Bh6-)PA7!Fl{EV}C>!>1B@S0+2j$bpX^#Zb 
ztaH52b&@%_o?MnCK-@2f{y7f+jLH;(_Yo#{m6zof)U$0olkbfPv*5-admd_aA$#M# z@@zA7F`r)6k<4BGRZb^2)|Vh1;kGsM+$w2K4*DM?&AtF37@!QimqsJvsrwv3(AMuq z5Lt24q{*2%B{6{oggH{07+640k_01G5&*Y?RZNigunfU9|JnAhg9a{vQn3=Dy!ve* zhaex;3W_VPNOMpjEy43v7N>}4iB%PY42s zCR_2gqaO(a(rm<8q*-~E{?i^5RE-?d;3LxPjx}pdJ>^lyve%I?*S~nlK;6W1FY%kuZOE?;)-|dV=pCsJHo{5$B`l zzQMNrpQ38dVN~t<1Qpf$ky}uK)oU`abmvJyQc4YOnp_a7h%2ftWz~e zbiKK;5jU=W&-V;K1xZ9@#ZDvgojY~Mym>3}$>9swyYDpi9XNrll{+zM@+_pJ^+HKsISb%>%DoX+n#}!+cx8U>1O;~u^Dfxv+#ar3GQ#(ig8_1(4j2} zaJv+A>DU$hNt%cD=#Ej{yI@Sut{6M0A7)G#hxxNKcXeOuW!C*2+Y#LM@7*2a#*8K= zV%(%@7&Boi#&bJrbdfP-SQH=)91ne5bb}mX$^6|YWX(eqI9Etg+0xz zXGfZy9f+(tInX`?D=Te>@Z=)&P4Xes5g&^@d|o{gt}#tdfDC5i@%KkiN0JR$5U8t; zG&2ouHEvlxN6)<=p7Zzx8^O;te>2Nh$7f$qcN-Z48i;pq-r}<(pP_rV9=2$QhGR^a zFd5gb-NflL)ZYV#v188x66OP_W+QeRZLz9qA4&5r6qi<`u(Z-hv&eI9K?(P_V%6#m zW{>J=)>L#;r%lK0pKcpze%VA>k=%x)*$8t)n*H>1p7@#1BYZEq;P0{cYS$i;i0J;u zSU7Js7A#nZ8@GPK9r~0E*)eI4_eOqyE^ z(crQ2=3=g7%q?O6{qVZp<;$u2668n3mjU-fu;M+=4$nXQBj8^r&7Lp|bOe@hcfTjd z$*`br`?^O3lyLao;PbZ$ibznF0P%N63dZLHgLN(4s1iL~4{{>$^9Y{}p**<&;Sed0 zlqHk{&)a;re7_3-E(lm&dEP6YC$S1jkuoMKWSD;VOdW%utDrE57;#n$(h=g&@$()x zL4VYmY_*p=25-Qit&M z$M-q?ipr2YE>VaRQY|hZf*dEzo-_+0)t&(SwVr0%(ndVugt;{v`j3!iFBg-Sl{+8= zERUY`q}kh0WJkPa={woAB+aZxf$2Gr;80f@>Y5m&Nt{WTUp#HXgS(Fzs2h==U5rj0 zI$O{wC@w$^1KZ8}cW~|QJtNK69@JRx@|C;yar0pfzPWx2yAK~n)n1b3-A7Qp>j4H`5Q{re9@mo8mM7~7I8w#U9*yQ$xesDI3I zKW?B8(nRzk&yoT4!s{Gibrh=$le>sUFmps4@@2YTbEeilxR2osve}u1*hNyI-l>iq z(lBe*Qj+HL*4upG&f1h4hF?l?`%-V>@d&<##d>64DZ$I6E zKOEVGKYmh$KOQQ>uX`)-aAyU^q-snhiEzhG7{am+A21LTh7ZP~Nn@~K;Y@5=y%_mf z>B!u;9+Sq5LZ7ak(KoF<`lq$QkRI)@aoH^7Y*>ZaQ>S3;q?s5weg;O2n26Mr?ikd2 zFm7DBhF@R3b$Z;OI;G#z4`L0J6Uj~3J7&FAiO^?ugZyaBCgVEjkbJEran{hF_*9C5 z8=k>yVQ}YrVvvut0Oft#2M`Zt&aWy~INPvsn6w=cW;Z-TjeYM!Sbmn>^9Mb||KKqc7geLUaGR0lqF{(d5y^7_x8)VvgQc?DqjTpjMw+v;b4fm%Nt&Cv zZ8GBQmR6TM<@-G-t5)Kuj}ZxRB8HdrrOk|&4{rZ};lqYtn)ZX#)JqR%I3)_lntr`<^Xib%8INvuOur;cBIj5ND>`Xg=-@&m%+ z65+q}c$*0IQu-0o!^rkphvy{z*zAaTx1LeH$HB$B)=9IADalY?tlZotzAv&BUpDau 
z5z)B5$C2!p<&zCMj&*bhaXS$bWK3UWKUvZiLV3mz=YxraS@jnAs7CoCo{KkVz5L_$ z9q$0ObCNBgMd=bqGp~g`%?X7eS`cQ-D7Rr|x6Kz3k-|G6TD~YSm%j)y_Gnrl2kXsZIlZ!Ys6wJ=l?N|}p!^cF7j{hX z5_)cu(oBZ#_heX$4(Kz0ep=A{s4kF*I5Su@&K5+H`Ci#XJfE_$nEYJ1qq@-ZzE|Aal;F-OediSF zC?ApLaNIH>dkTBnOlPEz`UJ1}z?SnbetK{h*Kd0c5y=!G}K`I1}j@#4dqx%-FwnV*@-j@s-MWl z6MOuf+)e(3Efx_Y&LSqhKCEF!dzv_5GS;q6$EQcWAZZ@NJfvarzCnJ##)@o&B6~Ag}7gy zgQmSZ@batAF}7Qpm8(}z2JnSTv3}JW~B-OdZ(^3#Sdkf$aq-$xX-P3F9$p>~xG6!$wDU%^Terd#m^2*VpgyuHhA$8=6c% zUcY>Wh6c)te!&f6pv{=iB4jUw4JR}3y7WQ&(z^iKj~~b3eT9ej72&vg`gBj8-S7gJkMw1h?gTOi}!UAIf}=Z zWVDc*#XH{X_06Xl?8{DF%p)kL=t~i0McKJopxDTLLi=T%J$uSVED45IlDmBITU1w8 zVfoS(=+V6oMzb+Df8H`%%p*M`2bER3aG1J$;?xD4Id>W7zPW<$uiwVm3sRO`8tr*0m=F4;+H8Pn^PUzy1zyUcbhh=2v+2 zsu3?mnpqEtau8vbESzn+Au$2u`;yL?{qniSXSMj{?JInCL$vfV}Lj$Vks+IoHrij>^^NsCapM(R^;r<@wecwk!J5${IPGLn(hxMOp~ zfO^EOIRwI!1@8<&ZpHdFIV8+EShqUUNb~X~YcOZl0vmtWy?ZbF@vmJlAMErgQ}F2SeaewCuc7@t zX+T456P;Zn%PP_=-J%?0`e$`Sf)bx*vbCOOkr;IVn66TuZ<-siX7yUkp0yBPe{~K6 z2Mk4r4xKS(%oH5hcM?ZFJ&&sG2eEqHdQ6)-0~06Bz~~9nF@E|2OqeOUJnnriB$^jUQ({h_`%9b=uFez|=&EKYW%t$d^Wbp#s+o^;M& zelW6gL3*e3^z&ip=cV6#pB6Z&_c|z_82T*fBGdDvxskTqNIPx?Er~PJuzbsAr56#r z9hkqSukbyC-hU(`xhz(mz^@5Dt~h`0&c&g??&F+#jTOsG*QA49=LqKuWS`8RY}OhV zloey(%Uaqp8;FhEZ+KjXI_|%COv22DU~SDqynEY>6JLFSKD~RIF6rks{Tr}+*;@M{ zE)u*dFB$ny?hh5exkkJx`*& z>(ayKd*8G%)1ZGP7B5A_kw;^fv9bJU-peY^gza)#>e;qDX zxHw_4kNLKqP;Kp4sK*=Hg#O`i>C#@}agNWngkp`e4I}6BC)^s8cWyOJrA-mZ`^{eH z7X%mM+}8Q!h%gIr(yVDP{eAv9!P}t8kLi1aGK`(`@UwVCcInbl{`~aB2HIPr8l@J~ zrGvh{h4}M{b}!gZf=t)s%e38m>?F;}88-yR;1Ool9`cBhM#2%w$dD3AGlR83`|g}M z8m8k6N{Z+V>u)2?iHH}xMMW@AS_LbaIJZukoj~nbzQ?B-f6o5mYvFavs7aDmSkzr4m zKTMk4d5v|pKZ;fEB+^{>l!Q1S%~~l>3xE7AgxSfF`>l{>^(@Jd9f{G_BeF5c?8Btl z_o_v34lmGK$&8M;k)%VtDz+qY6K$qRTRNd8>Q)oW&g+^!=iTdU9DcZc6F)upktF#RuHU^!!u$v~NSbfld5Eh&{)ppWe}$cU4iblnPqAayr`W#Z z2y${Ov1V;9HmuLZx;0x!nm60R1B>P_!z_}**|X-`3S{cd77^&d_g*}Io^pDQ7msLv zBoK|YEMqN$++$CgStgdz{DZes$yfD6f|Bf@MP58%HX;-F0&Rc!`7Nrds;$TBjdBSVR#$&ko_BJ 
z4s|$!%@NLgxt?a}rF<22TZoZnbqFMrWsg=#GxggM2QPt2#s*{Kv&Xe~*Zc}s zX_sS1k1;!G-@XGTOqhbq%zP9TX_dcy*j{}Id-i;W!-u~jX+Dqh-`=3zJ~C8P?ZL*3 zToe{pqNuQfq;i`h&ph5r(ySils;XUBxL~Q-d{+|EUOjrFd}{@M_sj3_w)qX-y?H~@ z+#HZ*=S!k`AQbORb|RiFdzKwRrc7ecIW08w?S->QN$G$oQ>WtlYd7(r_8CdD=4gIR zyS#wnf(qnh7Gm`Z+Tolu+q3Il|oBs4rcmS7w{y z2~5Ywf5DMv*`4cY_M|x?&avlX@H#HiY`lZ_M?W5tG{+r|C%hgannL)1J=7AayksXU z((FFj2_g-wJSzZ|Ru#nbjd*<^!*Svjhg;M_2%KaLRuNL>Sb7P)0)GYm$FbJZj(Ra- zc?^5JJ#>%Hi^?cPb-f|hcxdIFUJ>FX&@G4(fO&CnWsZ|iGeX|){(R@LpmV&>pkT#3 zh4m8T-Ic>BnRgZu;$?144gr#6;maMqJ3x#4@|=w?44qYk?7Ovd=0oU%_#f`g~ zUhY25T)v2%2lis;u6+#JpO7$rf?a!#qG;=GtlLz8)f;lLc0(S?^A>Daw*@Pfug9!e z3ov)?Ld=`H06ly5=KCCJ&dbVyejXEHu75-wAbAjhXn0IKQN74^x1x)+66Mn`x$^ZA2cm8V3o4*{3 z7p-BCUrIvI72Ue`LVkWBs;jo6vTQrHRggGu-H+Ui3anX@gOv-jv0{EMmd?$_^7&a< zxhNeg7Og^gNf}Oj@g-I+UxmKC`=NKYH1zG!1qXIip{e!}>K{D7YwDkRKU~0cf!U9n z6r>OGEzR`7Z|Q&EKW2l0FrtZm(6=94hOOgnh?=n)R@ z+i%N2w`i# zr&zl-3)$JlC?sJnC@43=TvD=~+iDW-og~dw*tTt#efbe-PD|^CVZ(;u(xprI?RO;2 zZ<~o%c>CrRG`(H38sPLPit}DJig!K+NPZzv4&bk;^FT87`@j9%jKcgJHU!$CxTMs2 znD3A%8|9X1O<5& zHdH0GQ+K4LcEdE1@K;S5hfUu>Qu^v8$)=Okg**`L$jkHP7-*L9{F(^HFO zdQS{CM3K+5OowSGX2^c&7wwBk03W~N`Sx&)-5{p;{owsp_pv(nunwb-AsE{nAL6O^ zCdc|X;#@!-ezI?G%f3#OmbX{ciH?nD1~J`^$%A}8y^cEQ_zCl&PSKbTNP>7&8z)o> zhZ*;Su0uz`8{{}5mhA|#_4rYMeqTuJwa}*vRSql3NV9VsHik=1!^ot9Ag?jac;-W; zpb%Rku4GP`<8VTWRiK0r6Xaim>ve6Ju0w12)HV*?6I7IyFCfj&XuNcwW?V_{^or1$ zq9j8yh%2zErZ2xZ(bsAOQsRnl$2DI7H?$`dw^>s%HQ3+f$#3$!qo>q;p*+5 zY;5z@pMJujV@I)L?_Lt-gV?p}Aj$I~lvV7*rpywo5?Lm3Ub8U|>o<}-uicD=3zm~u zE+DB~h|#0RphJgF_P!y52IJA42iDW};sNbcKmR?X{*g4-8)5x&W*p zG%pC7-!|CVnCAd)%0%zOC?688r?pQocJx^0eJ?g_+-!LnJ8>qCets68e0mZ&BF)P( zFnz`nOqe(aW5>`+Eg%^B*G3@q64e&R>M_BkUDZ171Ff{+_nk5I~F7xW;|(HH{=#Q#idz zBFi5m&929eY4Dx?{Z1zm%VK$D`SkN^oHW}MQ~~0IS^mrU1P2!&<(p$N@Nyw+Tyo4F z@=-tJE?BStFDS2HfBhBB&9Ct8-D{HOm)0|^uT!m)W=Aw6 zKa-`8y8$FiZGDW;@ozuB<-6vgU%!6%;;WOm$A*HYnyY#G64!6tM`_tk`i%@WKv!b^ z+{G9(asuTs7*i(AM?r2iit=|NC#w=UTPkdb%9M$-(V30Z?p?d%)5D+O=l5?Dh;r+s 
znfg#qx!aiL#Q2p6)){3Yoezh=5%FHj1e=j=5dL}D*lQv_T*Je7R%kyTMx^IVlXCa6 zG&@n1tooSnRs_hc^Q zj}xFW5S}!fJqPvR*B8-N$FANKE8Ke=wC@m?Wki}4F93-_cD^FM)Ee6O?so~uT0!v) zvXV58<3ps`BW_HBTUMKym!qIVaO9bl7LsNc^pi<*yh0Kxu2qEy6e$K1N>GU&{~^*G z5|PAo9in_j`3+#lk#n%tjFpcDryrUa{5@$l1B<=W%Q|jQAA)2Mmy6*)K$`VwG0G>) zW95=l@bw`?3a8`N0+JypYlz@{R96nk=X7j;-%{C=uPN{0XRQw)PMG7+HP_SZNwXu% z$)wrK#K32yOHtXK4jCCHVNM{;K~J+*&QnkG$4Im4KO)Qy(K|$#nHK{I1R?!#y2QLu zCeFbzZwk1rz|{1e(c?I z7?o9fke*qDHS2S+KD`iY(+jYcq|VE`XRTEHG{^-NV9a$$yvm>p8NIM58#G+Irtw)%`aIUJhQ**H*MHpvjm9L zDnE;tuEBR#Z{yRWr%_O}9m`i_ko+#j#0j%7j-***o=15X}YcNV7<=Nb}!BmOaeBMeqEeZwk;r`}TxcFrOn?aT_PhLg*6+r!RaT;mhyk z;^G?9p*+1kdeSVy+=4XQy$-enegNI~oJ3jv;2G;vGiUsG^A?sYUV`r3dtyNU!PbzK ze)jF#7gIikX0xMT%Va2K}lvf@gfxdv8ymD+JX)Y+JM1H>O zWfoZ$%F1_;bZ^J@9ec4kGaDUKIw7S~XQZX2@%{q*;SYbn&p-c6!u%HR-fIR8*VAm% zwb7^8e6|+foxk$2L^5+RK(UYKg00SLtGZL&ih#d){R&!$W!CID`0o2_xbv_U_iLZy z@v~R>?D%P{ThGSivW+CnD=>Y^TnrgF3X{gqLT+|BN{e=(FmDIVD$U$BWEn135;6v)=5 z&}{`imIx%xa&{4^NXW39heu#ALNaOg#MzT)_Y!78j>1T@z)CUFOs8QPcW9k7(@_Qa zv5E^w)qjLETV>Ly|M!q)osaU_I)ZnG%0ne3DNx`S{#Md#Ez@8>4xRVJG6DYq(kvvC zW>4fo!wMA2LFbcY5g?)LL~uT;D~AuAYxRES-9Bfrep@5Pf|o~Y=S)r_%LWE#ZsVlc z^e2#sfye&St}DCKp?FWT5$C5O&OD|r`ym<$KcRWj97D2m4dqNEkY?$d$x!tY)QR+s z_r-~F0+deR4CC@sV6$Z=a%rL*nqC@len&ld#~>!+{NmAlOd99LP0pA-16S{S zhfBA=$EABe;o{vpxN`qKuHL_k>vw*@tsigT;3xYTaJOU6uKn0e(p*)w8(CY5ux?Wh z33EPCfDH^}>(leFe!~_lU%u9U*qJFIKPKjOiMhF?I&Vj-G0SS%2_rnk)U3GCLTC*yU zIIG9VNOMs4(LVrAhZR^})8{0U<~M=g@%ERjx8|Rcd{q4Y2&xP3i)CMe8w+W+76#gn zlV;!3oIs90guuVWeO$|YMR5O+cW0{eovKf9F(^^ui)sy>SaS@7%>xa*(Gl znk>I-*KEPc<(si&{(4LpI};1$E+5}^B+6AND%g&^oJthrRblnYP3YOJudVEP z=)foV%Rm0vW)F&bm}7)Ffi$}@%|@Du)=0C{7y9hhbjaIUU@u`@54N%+pN{NMIuwud ze%}o>jch0m@82UZJ(W#)^$6aP2-!sPvx@EVvwS8-m?PrMbiF?FUcF!c`*>SRv}f7@ z>(&2z=JR5DL4s-0$u5KU#fF=Cv|e82JFy;A zFDh>oe^qa+cgn_r$H6`3CxiNM!}wTVfsV%!?|@0@Y4!+voC5=h6Jl+X39~27eyhZ7 zB|cV$Bh4&a0CO75m}Qy;3I!Haf?&ZvmLI_hPFt^7fg&X%NYBGN{l5i%ct&z%31%co zX-8n?d$R1o^xW_a5ohL?<&IaZe`i=>e*=_{WVw8lG&9d8&sgQhNwZU&WFw0d>O-X2 
z(vH=ar5C>zLu=>!`S>;ETVpfhWH=!; zfrQMBVeG*>pz!$#lo*ntY6| zd(!+fb*Gu6Sw!0INzls(=9^;v$-~>LM`TYP-Y>gYl1!Q-;!J%sTMh`b$nf6?7Z-%c z&$XTxUGp}Qly`rR_ggW$ONRaE7`MGx91}F`LPIt5qmgS!M}EP@Ig&2wdfc}N{!@P) z_3?4-BfMsVcIWmTNK5UCw(UD1ty^!58Z#c#XUxEY1@kd%*dPoV*cZcw4937dL(sSD z5De)z9?Or+>7;?#G@%dfKn2$Xp< z>&1r5HPGiY&_C;EJds!G=fD5!zv9Z3Yqprmr^k-t!ljEid+{4wy81oNT=*6nH*LY9 zg{v`n!hB4hyaek>p!0LLVr$6`6y|TE9qmG1uIpva%Pq&u>GP4&Ar)grjKy!v^Y4Cn zXJ4IO2P9ovk-S&bv#^(xTTML~?`8hak!GiRvW+mNYnx;%o;3SeFXH$RX^#8%2uhFo zlfW?|or=IbCdCMyx87Ef=8!DYKR&6otzMw)q#>dL+{@mc%o#I0hF`^U)1Nt{o0 zVY;h*rRnq43R)%2Zi*3Ro(b6BgM6t+oG`m31s(jkmZVv&8i54~9F1#jy9$mwEnfk` z-$t4fg&2Z`il>+SBcLg3>^KhVX$aD80hY^sl4=R3#C{v>OHnOAi8NDrxvx(MaahGG zKXLF~al?wk=OvPh^2PiH!N`+lCok2dQ*2-1QqquESkPqtm~Py_;<9wcm!zzrJfiv% zTD>O;Q9Zgex$%s@2 zR}SZVl8G~q1Kmvsj680gH0$}mS%uL1oSgO3shdZd?>9YSy{LZd{X8dwj(q|S1GjW3 zNW;=$`BXO38`__(YD8!T7C$?I$n)z)1~BfwZJg*`hIitE*t6TdDuvj zyna(2Hf$AcrF%H0#@fHZ4a={RX_3W&3ux5#JR;~G49_z*j{Rg@NqcC>N6igUD4dcd6wS7mJ$JyWcqehM6@i-Fc@fbI55++WZiYZfO zVDhvXm@;!VCh$0=Ll=x1G6p-#c47P09jGYYft+mhFqdH!N%NYF-B?Sk-n1Rd*OX%6 zvTRJ9xr`)f3L6*wkdorYd}n8FA`y6vhR2VH8v4*$)*E$*q*<%UHLWm5Hp`Ka{M7!T(H;>=z-a9q91X2;(Mt#_G7oMCm^^-gin6U`|-f`6Tou2 z`Fi<`Y%hpm9vXTfsOLB&%<>QN34GRiwUh?a);r^$_+PLQq`8B8(n00$sbM@}6$!*`psOjh~IeyzMx2=p>GOb{0pEpEp!g z9YAJQ3376_8fjL~t$qg7IOeMAeWX)`w1|gL|NNhh$SI%nb2i&Y zpfm0~>p)x8IZ@BNd%~PdnjPU{J+aQ@uQV;4*^BCs@LzOVBW|Wgl8?58&6iHQa`O)- z;zPqT;{M&k#cJl2{w$11t;C&(G)D-pg``=8IUvp4I(>FJZk;*IgX?LI;%x-p=VfX> zSh_84A%7H+WqlEIU-}}vwrz}ndQe`P>>&AFiFU2BM0Mol9+!Ps2Rvu>sQP2PH?qun z6fFKoE-a^!X2N(v09P+j9^+6xjWlb5d@ax1YG%Dq2q~W@&2hpUhps76y+A+VMB`@! 
z7A9~me~mQT`&t#8Q!=MWkq{H^3G(XG5n+;0;zCY7_J|M9NJ!HVr5Vq+5Rv5=Oi`t< zs+>USz48W#zbEVgA(X$?;IAjZ0#QJkV|D1D>#m}G#fPMs=iK|_GIeE-@NqfEk@(!l zk&q4{!Y79tl4X`hco%G6F|Z_)W)G+6ABSrI@_yn^q z`eU!p;WHxGdX6^G(y{4XPS!@8r5C0%B6*SC*tlidix1dg;0|@v!^zOf$~ui2w%i(N z=J_U;-+G-%pc+IX?{WJG`*)Nety3GMbZn1p`^xaih21!N_Y5vRypC`6GvMtXaO?gJ zoV{=i`}dV%S9KApwiY3OOD-}pvTX{ijLrGjzyOw>S%~!;bBsi%r{~&KP<{LKN5_sL zfo(Bz_((h;3D;Cm>Xp+xsP412y7YgVsE#}29J)VZ52lrU)EFzZcry~-1b z33fbk*$TnrjA5?#CYAZcia0e$=6{_R`P z@EQ?j>wqB1(z3+rH5cqSrdJB;{&TTEC*P~F&u^$lZz$`RY@}F&ox1)XCe6I&?WBeM z2DTU;tB&L^x$mYy41NdH(TS(yWy17~SU4z_8CjO?Gp)!EP?vcu7-5z?jNJ2n+P7p8l4kj0%Hi~> z)6^j?Dx|Lq-7#dqa7-RI6$|Dp#=O~!FmS*S^rdcPZpp&@#Y-?}{t}Yk1331@1swbQ zJU;*OTYPfl6tG~cxqaDWWsrPA@)LACh&)>3C-B{G z>llYv_BSNWzr1;azx&;9_~UQC*|=iOIlX`XUQC-g9i7<#7&~bUdJiH=?$i!#+P6WQ zwry-Uisp`9v1}7EGfHfR43TB^G?x_ZKvB_l8#WJ#B&b%;m`P(k!1(_ zg2h-Sy>|AZ&L-7qA}0(%8;p}?KP{V;QD7cz#x>^4dYvf`-)Z5e=Ji}8k_Y1y%j@4^ z@(s4u(A75B_hm_Lu$a#xj5r5n_l+FcVVpE;Q5i2+KfSVgm_sfR=BstIu{mz3?8$`0 zGD(4$acD~+`QGN%2(yRvG*dV)DKJl(;||9=7pqiLE-9IMh}^qEctIt=DU?&D)~-iR z)f_9+X^k|02!42mRZ3F2andYA!yBA3JK__AomV+kIf5&5GQ1MR-|5L-G7|ISUhdjuUzCW-ko#AIReNO&&7$I2|3`?AGJv&gc?8wFYE zrY+2&pc|D@h|8x{PzHWVw`AQ&Ce4EKmrQh>bCTZD=nRn_d-(eGjx6jAAX+Z*?%yZb z(jih`;*QYL3Ig?CBhCKvk}s?5&k<)wnl*!ldh%Y`{J4+NT>pr+c^AJnKS5#6dbDS- z>ef9Kdyect#gQVMym12GKe~ZScdp~cgCB77&i6R}`CjbaQH;v1xhTlVLT+|0vU3ZO zk(I|_UVzP;g+d0rd?U;&majvP9(~ZZZF_VgX-@6b8D~zO!n;?^HZI(H@8qa$RWz3M z3CRTIYNT0&nf9V>9n0r?rcJ*?(xkFDnC_WAi!di1CqvG=?(svLb6cP39zCeB#Sye3 zT54KP^yt|Ssa?7ox}F@W6Z>9=udLw zdgR++M^z<$ef!FOK$Rc0KoEku#6aitDyE|>kFJXp>sUX)sAoxIpI?zUtCzWnq}isJ z(si$=Nw6_zM9Nw zqgwG#Q=^?da|T0)4rQIUqt11;M&{IU)3A8X63m~q1cUmsA(hq#Ma32P=DVx-{M1<- zJ9!?Tenql;>M~BAy@ErBPoSuD2MSB7QM7djiF0{Cm|YLEhG%TtxCQg)ufSNQ-LKzJ z^z7Xa1BDCJ9j=VoWF#3?|#Fd|3T#W?`*ziec^f2q_2JSJ0xtXTQ%ZT z9(CB5tXWnqf&cUCX8h^*f50Dq_j_v`U%&D_N{dS{WzuB&0jJyT+P6cy6lX8(I<>`+ zk%KUA@mv&^7UIa!<2Z2W7*?;&vWBL{FqanZAZgyka4I6) zX5i0%`V-#2c}1C1*8|e*338k;`*VV3+wo{=OX`UxSoV-Kv)()jvkormQQSr%9dr5= 
z>M+ysy{R597Bddp`vQH95#{*4;7N1*KO5!Q4-FKe{CmBO$`gTk6G=`+K$^YX`*@<5 zY+rR+*r9_b&S9CDz9-EdIv%I=0Uu2pdIv3pI z>de_=9CnQ|@#I_MnSEZI?`jdI6$v(;cNHrBDewO+(ro$UeNIWKv~swC0UAu1 zMWVceO;+HjjB)5)zVeb0&rkRa_ZwMm-vSn*I&^uC43+Y>o;ZSv>+W#|8X0LkKRS-y z<ha3u5P0M>h#;{KCTNyVtR@4a-ZCS9t6`Ho)ba-E2P<0tx77p*(bMe zSZ2R&elyFePphx%9up5Kcl9*aAbq_SH_*%(L$LqTgV=Vs62~qa#rF@c;bzSbxbyfY zoc-nms;i4pUXnw?yczj9*(l5}L{@e_HWS&o#mLIyF_B5~ynbCK2J|0__U$^_A|Fb> zs=S;5^^Gl)t=<@$E0{jv6@7wD-$l}_Ic$AT^K+K##dAlRZ9E$5+E&k^e{r_0x^ytz zGe6=2mETW>)jxG#a+V{lXWB~^uRyo1y)bh0cnlag4E_2KMxVX|*@z&4?$H}Pd-gT* zFT&h~ggG^}D>{)ZtM^<}=XFWzj$Xa`WAKm>m^EiUHfQFLuoPi)Mh-Tt-%Mm+{rXI- zU6+du>1EiwWf#)7>?3L3fi;^-F@MQAOr9|dW5$i+yLyp;x5e0zqwwUxL#wX_l6&_r zgo6cH5|-9=O8P;0NdI{3r^S-*efgOFRbTsR`5gCk-p^^Bj7Gwq3DWXN)?3-0`;x_D z`D-s@^SP2W&lz+NeGBc_k82j0v_M85Q??Jg!5(kyzKrN zaJHeZS-Nia#`4EWvIAwKMLGh6Uw$lqe2-+Q#&688JDZP?Oqi{KKwsQUzoprBG<;6v z>-DQw_~ybl_QgW>(W!G+bnDq06DLf;LN>bQ&svB<{YIf{mx0L1uE4nq*Ky(UEu6gY zJ-+_tD!#fvoWF|0M@5*o5!EQG+>HVfLPj{@ov@l4$8uPRpk!F!B``STS zs?6_RzQ%i!YyD{al=^ky>^W4Hm1FUOg%~n$5IT0~U}YRWWEhq&TZR#%M_|IViP*4t zBg(dw;n-Klas0#=#0i`_`z`h#I&S|(d8e(qc;Vpk@6&Q+iQx3tf2m3U zpOt@cgj4xbJ{e0`$2K%qHm!~qaX{@0!FfL(+4GAQq?zgYl}2sogKcG#>hMZ}cxYg& z_FtVp4$j$BnSq=`KO`vMG58;NtxkB|8ahlP5jsbm5DyYm2Hqzm`Wy$(9fUS&joUac zDo6rhCQQNjfLAyP$_L`s2(txfz1WmO6~Px0o-`}y`%1J-o6~coIqn1vDs22-gHtFY z3>2vH7$?kX{rb;&7F;z}j`vwBhR??pyfu7?G&{Mp6lRn!=F=Q(43_t}F}SiAtg`&G z5$t`e2cPFSbk64|{)`)crPdRVgX`gacFrlYN7zfF^4NVY$aq8wCH(It&HkJe*~?r8 zr1IHnMjs;0!Qanr6{P?Q#>|HgilMBipUIHk`h5Q9NVA|iWZ*aA?8vioX7M_9$L7T7 zj6At%NpKwOoitYIi@VP`LDL%%j}2zP)h{pM7}{l~qM3%F9M^UOq{4J_-wp z7|;rlonM4(62vVz#1@{*EkR~x0VYqHiPX;B87$q{+=UC~QsyK|%==qSNk)J0mU(|e zIW^0{v5Z<`U-P)zkARf#b6$TzU94k$+ITe9nSKNm>1ausB}4g)Ko8?Sq!nm|M-30S z^vxxtbnJ}b!^a>qD<7LOvax3ECahe&4oj9U$C70$ZGZLJ^;o`gH5M*jiaGNZU^+?e z>NV@IZo?*IkR)r-iK3zk>)Es6D(QLHv@shSi47aJkT7R4jT~&;RL1L?ZDc>vGpn(5 zMHXhxU5v@oref%@!RV6O3GFGb1H1O%XA%*OJA5gEV)ZWhI3E(%t3-XGUIe~YI^?EW zBIJA3%S^J|KvJVXs~^y`C`_CzdqjWKNB&#z@9}oxd_c^WVz5q=K;Gt;M3M;&&oHoV zMK0^ 
z1F>;KG0vX*2^TKj#p(06aQ-qIv6p_pnae-mlh4m0znFSgQjXH{?I(duwM~&sRC0M#}C6+HoceD_31M>`o&i`_T@<&`{E={ox6;KhfiQb zMiKIgSf-L)s3_ly(yhB`UsX2uw92+(`C9a1W43>v{cENWDo?!Wo#*dL^7c(N5F2#=Ba6sB^You97 zh*w@WBU^PkPg$_s{@orre$9JDG!3zQ3Kln1j^JyhTdi0fF^T2!{KBQ@L0K^W&WA`w zv_-vxHsfqlV8bk!?aFu#ZOHKfcbqW%lFxcs%cgwUlzuX4{?NS%keQUICIxvyL5{5M5=cQkV%7n4rSB%2dThnf~jf*~1l9`IA$0 z`3%Xj2lEpV=YWW*0<5B(;Nt>~!;NWnjx{2cfow$5V#2e45nOjYElJMR^6Z?cSK`ES zB*NaqGU;75GlUdI%AdhxX&p6`Vll9XyWPr z?$lFP5_@z$S?C_GEHzo4lXz{ zA0eV(8HaOpA}onc*#a=_p=G+fReltZp(}u z=j0b7r$8vdmRtt7JO=8#tysKhmHo)1#Rl|)&dA{-aPRhQ(<}Lz*CfZ9Cs>48Kc;DB zhh}a)X>NSlXg>hPNwcjeN8jQ;^9#N%snfR4b5@Uj>*_I~UJaBh%dAxsMFh5FW?R{N z^&UV1GYS0%4nyz$gVA^35DXYN7=wq6z`&uyFl59?j2brq6Q@qYgeg-oa_l(Dr3{tT zyRo%mJBmuSSwPRqEU_UfncQw7an|_mbsKZBW?eQ`uHyc>QeNMM%+0$=&WkX2{whqF zHVfk?j6>hPJuELXnZBms({i;U&Gu6bb?qggAuZC4kTh$UMof=PXBb@l3SUN=rH9gG z2G(TKT=xOeT(6&mJZX**XQ$upU7<|n_mTpQ{Ehj3^W(fFq>mIMN)Y8NvInICwthtDPT{PxF3_x)oldj0ok)$F>l4EO%XP03VaQT?#IrhCtqK zIdVsmeJ`^o&Ymde`x}i(gp45RUY?Lj-6gJ5E_8*9D zJ^EtR+ATPB<`&MJyF=3a0O!BChi@+5!`X{J;M32~Be$rU*oxw^N|cmWkvx+y^EfxZ zlr~yQo7`%>viSwBcQ$=ffh}#nV(B`}nm!kk_|5@+2cmtu4(Q#zH*zv^apKr1e17C4 zjvP9Ug9kpr>90@WNzEg?5J`JR!dCkL&+gyF{U7KjNLH_1x`figBJ}A^0^6@Yh71{k z{sa1>TeoiL-n~2K&Yg=RM~>k7_3Jo)?i`LCJBDMQAIBG8eTn0re}T`x_!=ipp2HU> zH2cRToV{=Z$G`j*o41rBuXrc6Z953{G#3_D6Wi!xod3~`JgJ@1FlFLoJh*c^&=vUu z>6@!JkvwhnKZDnmc%3Gb=FpDJA4jCwuPQ0%D}nUhdOHKVh`2*=jd}5eS@4`G1h0FN z=}0rvr5|*&2#FlKbN;^>KgFqAg*GD1mTzz0OegYr@`2LJAV0y@cmNS6w+>IjxZ$P_ zjN>WuETYVP%F})yN!Gw&7Qj@ONsL+fz55&D#mpp6wqnW8gsnU-_mi+ItKWxubk zYrmwkmTz^0E+8u^Ag{R|T|N^ip&Pdo-)ALLppIxJXFP&)PEH}h=b1Mn3`XwU*3JGw zfzv^0T-6)yJ1B3g3|-S))aFDT*)TA#*3%filXDoBNfQb6KQ0KSNK|P54i7gr zNC@heTbF+)zlc1$KoGsh?<)`622RJtnSsX)>|a@z2cr1T}`%yq*vinngVzx}uW51QZqhMPa!#?IY)P*Pfo;^JZy z6_%io~kl};6D}Gr%$G7 zjxo?T(>Z~4oB%t|XV{Q_{^T*nj2>lWPf6)wtDUG50 zLXziFY|7Y1RA9m4wU{@!n^x zJo&P&iGGK1$9z{k-o1W}dq3XB*iq_XZi`MSX*S$v{P;l-oiKE-NpIKcX5IE=JHQCO2S-Nw4FLvMm?-VY56vrMlQdIM6<9Q1;v%f 
z$txvFZ0J;GPN6k&^N989(lKY&d|S%CWBU|zpx(D{+m2|@e6cZ@Uxd$(et~l*PvO$} zv$$~jB==9?>Nn@{!}pi*!<9?8_U#2+I&%u=PJD@TUwwhfk|GT5-yfrf>j&DA=+e0h z8=+cpa}Tbu5q|UL4>)`FJkFdwN1VaQQz!A|mtW!Y&qQ)N zxNfkRL*1i3DQ38aYkcd7vqQ{IX`lM9!G;Ub*V_zD@(p~y?-lp?5f^Ynq+;)7xn!$u zj4>M+%!7!pI$GS9N)L6u1YrCyqRiMD=Ovz)x?vbZxOFi<>H*e0?p{ZNolUyG{p(DKGk5MF?Mo=9})0a!^+ab3Et!MEt$G+#UppI3n+M-?Z{f=F|- z{n#k-?C{!FsUoyul}I6L1mS~xJ5HslI9P?m%;X# zmQ!wLm{wVG{5+OnsbLZ)&0exrPUshqCNYVgGBk9GQ*gxN^52()^h)zci5 z!$4VjXXJaM{FYkUKL38D_)<5g{J@fvaWfI;!Wqo_ntZ~h}R&S)a(UE56 zz21m3<-jt$6j^R00@7@@8+stC|Aq+wnu z`yOmc&%(HIld*8&GBzj{VcN_HRMF$~>%_d#0sJ{UD-8VU1$l$Py7LD6o?LrGf7K&IhL-QS#4G(~7lkElZqlBMJnhjb{?Cr5Bu*F5&9qh8hVoFmOzOoIVfgt;bo zM~pP&~G zCu9Eng_tsB8v2kh_wL&dYc^!ytJB}$?8R$1d-*2Len--I`3HP+7{tjj3du)#4 zk|L62Hg=24G^V-A$g)VYhG+=-0gziM%wlBs-~J0+P8E2 z(nI~UJ8SA>Y%eRqm!E!uV~6(R_$LQ&^7v7Fee5%wB~d>8#c`bY>{FaPb`<$po6)~l zccO<4FPb@H2EP6FBJSOPzy|n_`0mP8e0%9KzPb1;IwybOe0*XIlg|)o7Pznb2&;G1e*oW$>F#%WWxdVZf6dqvf}~UD zdeKsAW-7+7Z>x5r?S|iOQ%CgZ0!jCOihm_eq(^NkFvhltXD~kPmAE7vDoehH7 zt4OnYo1ezWa!b(^pzz2a?`duUQ&v|n!5_B1RpRN+Dol9Qpp7s*!T2z!u%vc~0L zvPjB{kU*Ma`V~ibZ_;_Mmmh-DUj^Dg*IP-$*%-^f`&%c>f^3TQBq;F9kx)jy&R)}j zx$=8?I5{}S$YaW?vEc;?a~+;HHR5SgGivJJ;lcAam^5>-1gU%p}E zlQS~%kiIF$#^!EL&qvngLY^eTTgSGH1k-ps3+NYL9(Oa8E!nGdYV77#LCI_ zLrB&Ek>+3CzOmUURBrXMbxG}ly?giL@};ZR6SsKba_ro>AA9y5!rpy{@Y&Jh*p!)x z4xLlcrd=nDoiM{zU@K%m&nqBV&eK@SDmE}Gk(*tDA~qz-%eSGt!u1BOU6+qlYw}2* z3$bc#9%jv3Ny0n>Lx+;UFwJ(%hgN00b^QvO*dTaP`;dW=WurbQ&{%I$f{r>lnT=77 ziKMx8y^I2}W)ETD)enK1N#j33n)N;}->5MxIR9z;ywB^b^93T!&Zj7i=tspw!fZcj zikQa8jU&#EAiHDvp2YJZX*N6-K| z@O8cpT=}Sjl0#r`o^%QFPwGh*S#IKc)j)XtvWdF2&UCJGN*APb?1C|)##`fT@{}3q z+ixIx_Uemu8#8d?%mtjIE}r`4dwhHSHqKr75tna0z=`wUqkP)|Y|bi1VL=s&Ntz3b ziPCNM1E7eq^)jc~8DyjYPM|kkCy(vi9hS4(-~TEp%+( z0qxk36oKy5H5ECT8}Z4$T{w350FECyWcc#f5gg<8#Ia-8US5GwLxv!=V|%2uZ-eDa z7vawBpYZ76LlWj&xb@>LT)ln`7cXDJwAFBEzK=|&>%f9qU=E( z3COa=)mUAIVBNUhWf!;Nh{xxUG)rG-51s`2;Y-#*C?BKP!+WCmrq~@JjCW=qOiRxh zl5N-7TO5u=J3FHyZNlj!T0NjX_s&1mAa^SVfrrpW}b{&2+snL->}iyOuhbV 
zq&XtaK2AsFiDDJ=!+d&Zh9sN4C%6`m{~q2Bz1%$Fq}hQo3;c~pv#o>RIZq%z1Iwbi z{a+!?UO&9vB%_VyoU}jKgP`RdV?sso)sxp5tf7Lm}A6i%Et(-!s^GioZvjhSvsL(iD82=nHz#*$^5uwrc{ z)@|B?_32qQRAcFi)#%hIEu4Do=#eA%yI+2>{Oj|Z^w)m$6uAGEBw0fWjOrMLU{0z5|?~IdX*?E-5IIJ$D+uk-?{pLcTdfsHQYF7NAR5zxiQMDNGglpy8V`fSull@ z;swyy=1`zPz65_R842WzXq7aZvQU;@$vu?c;95(^|MB;GQfYxeWQ&HSC2Z}d$pDF@ zSzFgLl&BA?gJcDb)sN|q^5uJ#Tv`zU)_eFqzQZja>hq@ibR50g!Ijg>%KO}~iy#f# zx^l&#H2+#Jv(K;7PaVhM%R-oe3Q@gT-34WDNt7M@_4vKEA5S}k-s#47whG;&JgZ!i z5z8n!H0e{H^@y?DLC=^Y$OMCkdX^jYC%Lw~x2~ZP&(*`+@D@KkeS=$1-{I`FN9aF% zk_FS@<3^$HfPU!EF%6wM_dutVZVc|+@7M#KQ+o1P!L}E=^%#VH14m=Xh)EbVaT+E} zn~6CKmtf@B34GSgmXW?;Bi_G$WsXV?)#fPWzW>SdE$j0w_2exHv$h(Ot)6BPDyL`q zdeT4y<6mhXTJBb#N*e_dXZvhQxz-VyPqj6fh4vs&&I)*vTg{^f7&^o)Cef>VFMM|R zD6U+*imR8e_Ut;0J={Nh_!#EQ(GQcFI;#UFO`S{9vK`sE6^1P& z%X#@*QCwJtin3~K-?j%ks`sIk_h@m3_3Lu6mV|lviVc`Mcd?CO?bfY38yIcSGp!4D zRF*^YF+Z!lk2(#_AaNFHl&1e8ItCpCz=7dj@u|5r75`J9odOw8~M#R!F6Nn>1TwXZUysU!E-93#ta)l zR)c`H<6&vA&f^ea_CDF($6)`ut~MadMEp8oJ;3Gz&z)PQjd%&6hQt?EW;So#jmKGm}dDt+!-l1(<%$_j=XHK2L%^NrI(@%FvnD61{t=qVM;|E;5b_-XoUdOi=*%&x; z0bkL^4(vNhTiNA?n6M1m?qZ|h)o4q z+KM+#iZj#~`h?~N`kH#`0O1-`+|o~kq*<|mb>oLEgk$S%n?#rmBA@EH)%|+b>AjJ^ z7>w(f&yh}0&uAm=%RO}~2FE*`y$aGl*-+ef`FPMe^d7H+ESK_P{!O|kos+K=obJ60 zf3Bl{B{Ug=&7?$sD!ir4M4I2wcA9B7!W+uT^+5X`=>ROh>ojzv+i{}E7CkYxACXCz z9at74&+eMZjpZ|bXg)yp$?Gv_9J&9$%VHhhmpslqPL}^>(oFbx5Jxgu_K4qOJcaOa zL}*j5g#{R(Y`j^kmF1Mh6o&V`Vn8#p8mkoN6r~JGG^R*@4ZP1gXE|+6j}$p;79hAU zEI?2Qd+#&7Up>eq$HBPe*BV2X0Fh?@Q=odrBf{${zBPD99Xn^rlSpJmY~#WW^C$tg z1}bAnyolCFvwz;TNMr9agY#(xM3Uz`X-;?#5hs!&n7#+adQnKE;)K}|&^T#!pEPX@ zVzLgjP|&E35=pb*KaED@*TKEV{iLTnx^s?X`)A_Mv5dS|h?mLl$Dy+Ods`#Sg3qsd z36d8IaRF7=c`xhT*H;|A?5z=I!C&|H`cKM{oZ?V^K0=xUG*U-YCO=kNvq;#uUezVx zdYb8!wBJO-;DdunsNQ%OI*HPk3C1vA&n&1hE%k3J;ZI>bwJ0I zG<5IY2mSgEM*n_8FlgWi3>+{VefthUTDQJP>Cy|Gy7oqDuYO3?4@H{0xJy@ie&U3Q z%wHYeY8hS0`*AHvv3@dfJ^(vb=xK5$xXa2@dW*hQps6$GEYR(5{^t1sySM<^tpwZX;o?B+8LxGZbuPU@ybA 
z${pCTZ7&1*J{06sB4cBrVfD%kEL*zTruj-s>xvG++_ekm&c^eahj>x@&E>RMyxMZKAy~&G&5~CnH`Y1N3(~0s{}}|VkT7dSK#^uooP`9^%!UZxBgE5jNG8pE zXGoZnNi+Q??=yJP%(`@b%?DlD7lU?d1X-}fc>+YAlY<~``%lnJg4A5k`;9m&oxor7 z8CyC&un+g+D19rRGXhN=aSb}AL!TkvuKV6JQAb}=H(ye=Jht(lESnoU8j{IsS9Pl(44AJG;YagIb%Bvf_XwQE-l89EG;CQrl2QRC3F_W(zjGxBik%X9eZ z)J1$w()h*cOC-%#ap8Ln%eaSa`;K5^b`dgi9BJ0L<-%f;T@vSfqOe56Gs=;XxrJnW z9tI2;M6%o-9ou)tfUy~nU)`+kzpYLu7nBw1w|B+(^o?3ENrf6Gyj zSA^L!=b%T|ZY0EQ(TRGhuOUMQ^vBNXZ8&%O6iN#6(YuG1{BDC3maS9!PUzO92WCv2 zO(MSAhFn~}eADm^%W?MHC46<_3=SUr%&u#7zoG*7V_#XRi2E*-Z{1~$Ogm3#*=(H? zVJ>E)puB7cQH>>wm!m6v&0rFF^?v{Q_BCytbwYoj86K?XR64<9(<9ZL>YO^Gy7E2B z`T|gING8e=d5&*=9gEX=TT39#L}(LFWslSy>3j?kPjYZy))GmxhVn!)m%0_-*So9% z7vzD@dL48zl0K90q*?!Pd48rZ?4C5gr7R3F((G-nRl;nDaSI;X4RVAYThHVjL1@O#sh;(A?i4)a|h;8a<#m=YyLUGrQ>ufUcRtwNu=4vI2$5awg%Imm|i*kj_Yx(US&M3Aa0& zpFMwtM^9ejhx-jU``vwPKXeikrY}a@4jNL?(MVjUPMsLkM`FREW!SJG1DiHxV&#f8 zwuE&5{zEW^#Bsvp=@>O`5{5G94;nTKJ^Kwr*Is=|rZt|q4OT8+LBgS*u;_dVIp^0T ze$6bWo8^Nx$oh@L$RF!KbtAcZJ&Jq}^-l7n&8UaDj*SqHkTlypk*vaAAL&fs@HCa$ zn>VjWHi|5-A_eK|HsSJx@9-^2^EHx`ufIHxo!bv#&u)_Dokwthr1{{1PceYymz>&? zAk3e?90f(y*h0d*g@ie`pd2N|WvD7E#}1O_>dHMNBGt%BFG1#}B5n(@Zgn~qEm(p9 z{rcOx+qP?q)Rd0+>iAKc!s$irBNA2D%j|=IKEaA8zb285K>rhv=GJ5n#;qimJc=GTb_wV9)?R}nqNc*gz zT$yf=j{C$N!4~hZ2Cn4Kw4T)9#p8#T-qZVc@Z{cYJiT|vrd)eo^MKcQy`~n=9zNu| zh?+;JeQ*y?9@XH{J-(A=J%9EbI<#j4scl=@NLO2eeZasW=+(Qw4a-=co^3?=%afP! 
z^_lPS#mP$~&6jcJ;tgE5@&opK@;Ndz14bSRbCLVWPgBWh;gUk`7nW2~2Mei_i_veu z0MjdtN$%UbFGdU>g{+Jme17CB93o*ZBU#PZl#hzC-9|#oNS;dyD@al`tVqPU45fTW zQ9&s-r)OgP*a_&^AtfaJy?gb*n9(D7uDuavzi5fb`G}$8uyI3Qg_T^{PaI)1-1L7Q#XY!Qzv9_AGUUuc}%{mPOdsCy|`z>!f zBBuV>6K0FMye5*YyxNc;ZUy5;-rw@N1LZ^8v&LwHWTwW3voYCZYn-SB4^f8=FyA52=Qyxz-0EwV=X1)ti?2~!jKew&Sf75_ov(`sI>$71FMYy?NVD@zUUrn1 zy|X34=Yupu@cH3AI?((LB>bOP}ii4qgJh$DKz$9tH4Ae6$$2|L29 z>5w!Q#zC2j6wmiy$6+64&ef8#5X=2v(@nLc~^68G=d;p+7o?B4$s7O%`g zuYMyf;B`vvM#7wG%S10+u#~~A6lI#eD!&BTTk?^)IS0#@uELPPBhbA^Z=1Jx%*08U zI(rVL&6$e{!Q90YCrz>O%o>{Ub8~axB&#-<2J&?l`nj0JOnB;WQ#ECX`vu9cl?A@`Kr1=Ui z64$TZ#^=Y+VB7YC*tz>t?Aql>^RC?o(W6ISv?F2eOnI$Xv4QfbL~d>ca`Q+M3M(1Z zE3vJj3fn7pVQbM2Ee zXZmA9GDt2Nh*~4fnySzBTG2K_eGY<&ea3af&_3s~;i6vu##)D_ni?DW(G<+d9Fb-P zaY4S{LpH9h@+#Rn`8wG$Kdg&n((K1z+prQ|Q~kPeoxEQ@)B>(-LnQVQ>T4h1`NR9f zJ=BqqHc%cO_4+ZA=jt92+}bhGz`Cs0^(VZKWZ9A3CwRtdwIsQZ?)^mkNZcZRz>^0* z;c3lX-usaG7SU$;H5`Vd`uRhWY!d9p+}1w4L-I>-`-I0&b&ba!I)1J%12$i7t&wL9 zXL&)0{AfDECzN$P>-M`#m(evX4IMkWhG~b6Dd^p&AA0oci)Aa;;{3PY<7*AcICYW4 z`6AAIa|0*OUB&rJKj4$kPGZC6e4BwKC%+VVgr6Q(PSk6Xm{{e&1zJvR6 z(xXRDOq)8LcAJG&yr$mYJv;Ve_xAlZJY&;_TvV2eq>7x9kZQOG3AiK972K~tQGOYU z3)K)T$MR(>(XVfRbY_EEdaEHtLOc61)t2YGVCIbZ$l6?t{G2NKjB;DSFe7s-HfL#p zl`0e#RNE5R>NVDT9riFEdx?DpPnz91?e8IB=6yDvc^ey|yHU)t_wP;FQJ!;VOv8=u zFX12l@EiW|kH5z+Z(oI92O6dO)C2tl-5kVJ&)S;iFow5OGHwbf64t`}3WSw27T zJ=9tAx2C_&N6EK&5-9k%68bM)GY&)OUJ^%t>=zl5&1qk~vszl+$R_p1Vk&h?Hfv+| zY0t75kzLu3zSz0<1b&XkzW3add;?>x{HmL7l)jeN{di`#IGrGpEVAqfbDT6s4O>_4 zIBe#RfX8@v(yX5q*Mb^OZJw*3yx^uLa&S`>d1tANC?OV(6&Ml;+m{fO z0q(~#ll)Q#otH8?_@#*J83Z)0T2oFn5%Ct8Cd1a&IozPqn2zS9bwd_nFj6McECS_7 zl$#@y&m|$55dSTRNVD5_GKk9T;SDqaR79s>R3JHb?th##M`Zb9q}i8OV}z||SqL13 zAg3X`p&Wz{_cXio5=iqik!JP!sh$)tsS5!)lyN&a9nrRtj=_(OkH~H!9LW`29q5mB zHy-^ZpCODmH}RdygHI=tqn?kK@3qDr+EjF|2f+~B>+3_hr-oeU*kT6YTI$l>MEy;G z)tjqt_amf(TV%q6hRbdIx_izcZp?{rHsOeJ93LXh%s=I4a?too^)wUjnpDSAU;bci8a| zPe(}-C2~i+k^NFHjOcl?taE~qX4Wg^?CZuY;^FizPMSS5<3b(fE8?uJ&9EaG5&mZl 
z3yyv(UEq1mnvtEIiEdrHqI1Vi7(HwhzC8LBE}p-P%imrnX}*iYpMH%BlICqB%-eT; zf&&MSp|Wb1k>&OsI+5)3!^Vv{BnFaE1@iN%ET~tMZ)c#d!q(y{@Rl8-_PkBb{;T5YUYbZlIDgdB*Y@ZB*E$lwmD?wYq`~!XZu+&NYBQq60ezt zb#&M;CD{e-hWFR;UhDATzIyIoJXM3_q4gTqaofOSk!6u)BhEZ-&^2zKYiv28ZQV2a zH^Tco3$8JkA0+&blOoJ7C{H!aG;Y3${`K1n7i@+Uk!F$SPMtL!V>{M=H_V>55C;w& zC2_unlV>jD^!ckebN)K6-F$>oB+i-HTd^@cA6s&YZRO1({ou#OY+(@zDQ$B6xG6@$ zMP9r2=!MBsreoc@bn0XY32GGz@=7sd+Fa!E`9hvsFmDAmrx#FPt4WwOEMlkiK5x~S z;zH^%QL1sog&JdAi2{;T4T;Fe$ijeugGiR!lK8i`t#r6sw;ouvaxLwk7`Y_M+0=WH z<}F!7j%M+wW?uP@t$VSRM7fOb6k)C`Cn|QMs$#Fr!(3Gn5ob@L%lV8*vl^fk%;WBA zo-3}v$RQ&zV#pB8o<0=|=gz{qRV%QgsscB!UcoPKUQq|Aqtr`%ZF~3Xm2Ka!v22UB z1@S|jOg5%jKEp_uKzp;eES;3j$(Gz!u{9*Cw2e4?FSAIYCx;@%FKJKV@QvV{hYg7$ z9C@Y=QD^FCi*?j}L5;yiHpKj}tk=~0H`KQ`)O{n*%7@vYVuB;g8bV^jNUUrwZpyFP z)U|{ki)^d;DfV;g?916$M3N&o(oWk-f;u2vJQP?q-e(-Ze299QBh;&mJZHni81H;M z^<2t+>JH#YGwm!+gy}O9;p@uZ@1BXs&rk?@oe`DPfu(tou0JkYmb; zTO-X+S+Hj^cvgy=21tR)?<*ih8Si11)S}Ch1Z3z(B3FKt0D$4JFl{EWa zbxk=IEi_Y)Xn~McU_H&07t1N~Yy{d!vrEejryyYsdwe~4p_2&%s6*7#%yMvRq*;w0 zDyk9QWEl|ZDfOh;eq7Tr-`n(pL|Gtl7W5;S#+{j5>>kQJs9R5%qnRV3uL=g$zkeU= z!x>QI01}*@8I05CvSqnn ziX@XTH#eysolIUgy(IB!!gDr4zP)@M*#*@YJ9z~eL$=A|T9;G_&5!#sk}p)s0Qx6#574I~f{b z)@O4gD}is)wLq4>o>cE%2TXp{n_u6(#W&~9qDz;~=-#axx^?M_bqx6DPM*Ve-(JO) z%eQdl%1_w6{}@WE4`A!|LnP1pvG3qf6qZ&XrBf=}w(E!i1BS5?QGhL(B*P>Lg}Ede zBpDT3t4Tg8kw>EAhGmpuOGXI^a}mkNR;*gF(x&xF=^&ERh7FaCc>Ssg&lnt)cPkI` zDqm19@+Y65vPdopoD4jI1!BrpKLY9pHVX)ppO=9rt3sSKH)=5iqJi)8Xk@T9(ya69 z^p_y+k(QNwW*IankIDPiXi?IF)7yo~x%l)brh1eNV*sIrH;EV15Jg z;$~EeNUL-}tGm7CJ#NlszE7Wt=|>byCBtVd>r=|<8T0>~HfeKGE4Ha8n2i(l=s$n- zFzD63k2<3Mp?a_%vMwH35AjpV<0=ePI6;uC_O_>2v3@=r&-Ui+_g+k z#Cg+(bx27`VS_8hwtf2aL9gDu(N4#m(lBbw1Y20>%=s($=He}Uf9(OTT)mHiqV33J zgG)w7odF2i%u+AyaGLY^rarPN7ruMF@EAitY@Q43l9|*R-)L(2J45^16aCb z9abz~Z$JMnTe=P#*XJN>bCC^^*vk9LNu0G6ITcDZA9BHVlI%)01b3iV^RG{sYO{Wb zEUSlE#92hWfB%6-n)P#`#w8aNY-1j@GG-~V*r?1?4|560`BoC!vxaMI<#VM)yocnuqHsI*?>K}#JN9Avq-hv4Vk8Fl?~Bxqt^q3a=+@O{ 
z?D^&GJN)+kXMY;ICaps$qA40YCRr*OvQ-DxB=45^uUdrQ2h?!RY#jR1=ys0@m4 zjQ1kTb%Yy#P5UHY60B!hL^%fio8{)XryOE787)wxedb-hg!Uefm7$MLuz$r^=YM=0 zbmUq7#My&!175fLS%&y~ZR$`X&Af*;-9R6psRP|G4f-7BCo%r%n6ei_(oC4Y2oU%} zPnwO7nD27u<8bfL^Ff*(J}ts8J+5AVEf}U@hi#s*U@vIszW2rvWx)k1dyok^$YI4n zM@Hx6gy4uuGQ5C2DNKZV6dhczvfEm~X9EE7{hInPWiU{X(O+u>#xdog; zwFY0&eqZuvt-Q&w>5N1Y?Rm+Ha^$|71)wEq_McYbgxQzfrAa5?NwazuNrwL>!Yqhn zwL+S;nxdb9LDV{u9vxW((Fl-^*$;;G#YR8+V_sY6S7dhb^izTH8MQ! zAUxuR7(UPRyx|#Q_e4;dfqWC-u19HcKgeS}Wl%@h($dmd%B8tUKY*}ouU_L}%~Nd3 z%EO@H{D&^>FS!)YX>+YTaq38Xow7dg1Z#zUeUz0TPz8APaWgnE=s z-?XKSyrcd3Z8O%cU4>4aQqZesPYfO~2)nlJ#FcNq$Mq{e;QF=O`1aCGY~8jG`Q^Kj zU$GlS4DNgOA3QJNSH}(wqzC~GoA9< zOwzDL)6kWYP?TZT%o#?S72t;r9f=1I?psgfGX)rxoAzga>Qk=`w0HFw+0=3YX?A0T zRbJ^aWk)!@q`pW_(iiG;oH%QyhmbH6ehFSd=l(OK+5J50jKvYr?b)O#3;XQ(z5(t7cGn!SX+2-k&m`PgstpwIt59 z&q#O$#x`zW)H2T1u&j?L3zFx0qT!K-fINaakDlO~d%{w9S44Y;NYBz4~ME z^0hen`5F9p`zeY`cVqRMEhr(e)eIT=)S+TFymGS(Fme1;v~QPUV~vLo9fcJuR`I?v zY%L=}CDB%|ZrRrTB&LV3YsXRQYAKeIEU#Oa#e3Id!@4a<-J5WLWpFVvKiL&d>Zr6b%wXGwm4(f4kPx3qzt5;``EEgkdi-wX^ponC= zkohkqSuP`4){lppw_IOjswzdE1JZ27c~3x`-H?nD5%(hfsJIK;O7~)K)hF0fbpR_D ztwNvfz0tF4H>8lfcTQ37eJVQ92Pr@LL2&xi86@51HZ0~D>r-=OD@KSszkBu4e)3ez zd(*@%V};_N^wRW|dZ#ZQjZaBfsmE{WUu{le+L!x@&UZ{T&}Y%^WFxh#d;KRsoiW>_ zuE?fZf?SC<<{&$!ttf6#UtT=Aj~5T_;n}@AczX9Xp5FP9+aJ;J@E%^)JizP64@jD8 z@H2HhTv){7HRC4t?dLz{%}6uzWh-xTUj#Xv9?oK?{7hhn<`?NN6@S%<#e3X&#X!X~ zqSd%2+jsV_Sm|b^amW5X`DP=|vJp_Ko*}w9n z_sPEpc@+4(C(SwzV?>Y-#a1KCv|VwR=zIjnarvyCkHONDuiS2D%V60CwuN$2#j4+c#Td%F9Cp0o`-{Y*_W~ZzQ$~NqP`k(+B z-s%FY9HUj ztsgEDmvH9X*VwUZFY=3bU`y_HtY2S%#fvv$)5b#N<&rdKm)RvbmewQ6d_AF#Jy8!c_3(KU-m-yY zD;cVBEb>hG8dWF4e2>(OVT5P zpw$AkfJt6b<10-yaHZgt~y)cJPi zk9Ol%84HI~5YFCoj`sxDqktLcv&gT(1xQUF=JRg7jm#@RKf!ZC zjgY4-!!yRS=K}Yi^PKB7eoW#_)QG4&;`WjCFgI|kG1BT$ep*{g)R1u166$T%vn1Lq zzlS=Ntaq6WkLQp1JoEFk_5t5b;{CJ+w{BiXX-N@=4;f6N+zBZiJE3F8&NlwHcb~pC zgT}ID%du+p8jKn<23@=LM85%pF?;Sp|4sbg(Bv^bSu*%X|5DGu2LR%6FabN+aBhPU?X)~)jpDZ zN1BTZx7qNKqJkLV%v_&~Nu#GCwPPCEx9v!h+=*~w#M^ghk6yidA+<{v%b#FB 
zG!Gt(RV!EFz}|hf`seLiH*xpJAMnHX-{IzW7jgCC1$=kmJih1F;XJNiI*%W&T*SSb zSMiX3^3JvI@x!Hy#3lT6*#Zu5Q~Gw#hjgFqmjeG$z>wOpXEsLODw)kQ7kPIp>@ZNJ1is?DVd8t+m&) z&$&3~_WPgO*IFO5q$wG{d+q(4i}qB_Icrvp8a2Xi^y;GzfuF5X$hs(=kQ?7e_LU#l z-)nNDS*`h-V<@({8XU5$+rnvTcdak-)q9`B3(nZ)>&w}X>v!{S=JnN2nb(5a539rK z6U9Mlr@60oz{f-aUpEPw&};)|pZlkQGmrJIt3Fz=t^4tv!{5qm5X@_dP1Gj1>-+j^ z-}jDN)V8Rd()raEsa@f@cuwOnwKKW~|6>+=rdw8i4O*lx-ts}bW5@R1#DtlcI9|*F z5F2Wq;B%JTDH!$b`^#Aq)v&t8-_w+3$zS|l=bn8I4{ANxd3AlB)3dlr9^gp+-#b`y z=k-621IC%>HnG$4ES3HiOaJ|8`Cn`HZ~l2Qoo|+RKItL(uUg|TXF2D+@;v6)Pq}b1 z?`-QqvleanU1H&tJdpgPSfdGtl&-IE7@R-jZkRe@9p zS)n|DG>Bv+^TJzAmm582|2%SOHeYjAT7?PJKJ$&CEy1%Wnf^{zq^ju+S*nZ3Pxet2@pHce18@{2eDxYSE5`2qe^{ z=T;-HydI_X^{DS`#nAX5Mkf04_-O|Q9=D;YhMj7vV!b9Jk{wiJ%sm7L$Ff5Yz=aEe z2nmT}#~g3gqaZ?a(5-OXBseRoM!>b3ICJ4LzTUbWa~CYM4s7Y7Md<77z#kvqTN!py zYS_^5`TW5$s!(%(2$(fEn&7i!J7dqDWz1ssPCXmtiOXvn$eY;U89?({`0`lL{Ts`V z2F&*SR7O}w)#k_&s65f}zrO!Km>#3|i2K~G* zYae~}LVk`4=k}{L<^ogYnJly{7jkd#15mvQvW`c9=YwJw(o-Fq>MP5dUJ;M8W8v@K zF`k=&gJ>rYnhl(pCTGF-t1n)gMyXs+w*g8&pB0~n_eOBqA}#sX1^$m6nX{-%?w&Om zon1#<+0eZ>SvvV2`4!7G!$|-=hUbLamj<{#jVmVhDEA4?+!ruU5`G21PGf#@|9LGd zJv)C-UZf4Z&ew5oSV!gVG&InQ@UT!DpsnHnY5p9Aep^VmTZC1szrxp>T=Wc020V25 z2rgc_gk9h5HaCmaYu8u|@6fPlg5^D%kQ*1DOt=m(U-~b%Q((Gur| zwr<^5g#TkmN{B~Mb~eiM@=%(SWhyJkMRi#*YAZ@mUsZ;dnkt_v**=t^x-cK5*^f~C zCZtk}v^LbDt+CE+DQ)$= z_M2*}Y&%L=em&-!1k0`VUvnV$;>i=d8qsH#)u;S@L~cb2X*kIN+8dTfa=?pEtVh3d z(EA>fqpX{wtf!-6c+WEZk>&p5i&1Vz@PT^IGHP#>Yw^$n?v)KZTkC9?1w(EO(4wkKd*U=G|Bqxy^)d3b3y%< z*H&;}vbRP2{Wa5Ua}ZdsHsIA~O6Svgls1#S za1az5OzJpx#-E2~?_Rz8arfk3C|$vv*>O#a_v0%s)19U7UhIB#Zud%QN(RibHZrex zoM1qmr3G%cbMQX~&D{E)q4yl!K!6ZtkUSHb6^6|{NBvi>Ke7_L`w&bhUtRgu{mm?! z)6d3(p_-HyeLuq0e1{9e1o z`wor8<$#-5yuvL(eBi)ggonp7Kjg9!$2{@rb^?Mz1jtE<Chl1MxOFs)wuK73GfBW+)f%mv)oa(fQ_KpmxtbfQdT1u1KE`gZ?G{vd1hL! 
zeELJP;L;6nrv|y>JcgR2WQEmn^5fNnHqZL(%DdNr3YwL_YLEC#l;2ZzM|I4VXAPb` zO_ym8m>oQ;e5>=DQ4jdpADZ>w&q4E4J*1{wN_=|B?>umpf3x{Gv+T*z?8Jcqx8~z) zVh3%0%hLGNaT+{mYyamtwMUM zIdvXY)eSiK!!Zl%xM}k?ELpnJ0Q&5?i`ce(7dC#q8N2uF!-)hf$ZqnZzoH}_CB}KK!$u7sk2l+_P zEI{gmOl;e^n`v8#>(@i@@L`U@a=cJh(Z`&B%qq6!V=6($Y3TdQl)QQv^}x_Y$L5Q?exItR(^O@!#?dbAUowcpv& zNHy@c8iI2*8f$o->T>(8zq1Vk-R&6a?ZlIT9y}fF!?Pzt76C@vQ7udUc!>K?@r=Mb zHad>+5th3#)djt2{)9g!KM?5Dm#TfxBtil6Fze4S>(4O3dYE;?UMR^)@hFm#0bBcTMM@z$P*vf=e~cBKR)o<)NgM;m@ABodvEwZ z3!%q0N|s*1tLOXgL9^ghpS`Vq*Wl9)PMy^D$ao?a@eEk@L))2*qQR{1 z96Z01OOM)%=@k_{@&0uVrrR?H*uAUGqC998EKfo6e{+!y3L$lKCZ*g^>wa}x%WrGQeI;Sd%{_A^&4w~G38Z`UlEX)p|<+JT38p^+! z6mC^sXx^&G3W+&aNDxR!dv!GdNwtMGweUfjkllJwS@`QOf zB{wh<_)P!Ts}~L`1k7IJAaIsswKGWf{a3H7{`~znT)c3>I=ai(f{^?m7a0YWsAFS2 zFx-gY(FP2SR-^xE4a#eCk(QZ>w2UHTWK`hZgF-@c29i=9n(v{k%IDADz~#$92o9FD zIRU}!T!U^#5}Lyhcq0Ti2+r4Uh2ZGVC#IlT*NzMg#&BOR+SnPX1MxbQ_UamR)YRZf zS34oiHRK;(u`z!+#D=_w9Rq>Bk8t19fWh8o4EML=$zTUYpY-ARv&V#H@yZn}U+tjz=#M867LiCuj>9cV+u+c6 z1QU`&!jlmZu1EyQh~qJFk@3)iCx?DGjJdqF;@X}*d6H>j`QbA(v4IvedlE2vmSz+0 z$0iZ6TsR4wDgIXj6G5u8uCP<^0jsB}SCms$W0hS8%m&Y9W#(_sJa}}niEjpy4z?9p zLv`E(=4ohV-BEtC-nh2FN$wK<&@8!j-V^UhgBoACc4_t1CCYP6uUrZ;U{+k^n}Ol{4?#RS2?=qj z2n~wF#;>**D62jnKJp{$dng`dmLfZ=3h8NO$a+*^5WIQI4y;(U3X###c<>0d1{~=xA-GS_s^&=xy&nUx(C&A;S05-gb-+b>rpJUc4IZ z#_MN2n0(e}dOg<9@A~oVaTf-=8qwWai>{Vh{A8DIa?y|rm+FRd=U|=nlimiZt&&@X+zQpNn`QVpf!W^d zD9R1n9@Z&E-;l-Ij9Us2A#1q$NY+K`BY8g8Kl=JmwI`HZFEprC+vmAZx&b!Rp!SIS z8YH-OMl9eyJ4^etyf@`yl)pgpYc>)yED%$Gz>Ap0hbI;EN$RA0@&{1^8g^U?06sNw zjhgDnn1M6kgmQ4;8c}o4U?=FEb2cd32#4}MJJ9pDkh<&4zLtIG1Forb`lWle{?5|u zf6l4%{x4+zxAXD%DXY8A=QM5Qt%%d+Cp8Z^Is z&+AXGlKAW7$G?T<>HPd(L$l;M$?5!1nGqm*!0gif*SvCh%6*9y$US=%{%L5QatGoy z%(`#+=boWA5%90jENe6Kf!}|6$F%d94+Q3quRrn}?_J(0O-!GGsV1rsnrDWhQJUPn zdm~$ECN#57XsgP}iYnQtl@)L{G#i-DkoUbW9el=K99#eYP352N-}1uckE9iwYzb86 zr6we(vCy)=dKI=~#}0fsZvk%H4nqn%*xb?@boaMmgrGb^U>+W=LQPXP(sLdlGrJIv zvTJbvVFgm|77&=zk#I*AqI+gtmbLlf#hbW!Guq-23!ZI~+f7-U!_B|>#?25MKXnF+ 
z7B4YBLUpo74jn|ZtS#z1YS}0^HkrB`8_`zH4xyr$AYF=q&MG#_^%#HJglA8hFf!Cg zSgu1)M>RWx3bZwrv(sS5(on|ENBfoNY^z0gdp&yEd2DwJ|KE)lBg1T<)nK}^r7|XH zmbE}orHP@fy#7wPkuZI!iLrR`bP%I`U3fXri?KeQb7URUJO9%={#Lb`tLbDb(5iBPVr0>`&fl*W}_v0esxs63+ zSQNHyasJGkHh#@}=)kY<-y8HfXlC0h3$q_EYl5I6qX?R1cu?+DCq#@ICT=5^%r{A_d-pFVs2 zf@cBqojVSgWtr9z*-;#19y$CoR`R@azEq@|jrjh+KK%UC5rXt3wm(6vPmxGs-reN@ zKkY#-GSiEgwld`B)u6br7NsRMsI6&1b5obOc`RMB3JVu3#X=5XPH+IwLbz=r05=kv zn<|UZOmJ?gE@NB8Hl{|_<#MwwH+!<|s~@M_nbiR9yx4D=N?+%X3XNp_=ETn(GL) z_4QO8+baUJ0do!Od%bO&>iM5qLOHkkO=@o7btru|Q%oNLx`Anu8tWaNA(o*F#qt0(w2VswD*EVobk8Zk@_ zcCo!}b;tI$YJNu}`rDdK+V5#@;JNDAcGsb+p$?tA$BsItqlRf`J~dR7qOPn2^&CJn z*VLJsq-qT!^jX_lv@u^go0@qoLVjDjv#fWuQysQ1tM@S5<7dMIctIF{IYw}1JER!T zu0M48m3_DX))>S3MD?9|Tj}dZy}nd!iM4rZ1J&lSy`ap4g5TNyS%>{45Ld`!19XD2 z+*P#ipqcDx;OwATA~eeip%Z)1EHH5LV9A4K4?3s7(+4r?usm4y^sg+gFh z7PS5anwcgqFFa||)^z%3{`QwV;WJQK()m0XlssrQaQ1!YCZDqGP35Oc=WJ*;dC*K* zyk_3_2M!bz3FL#KI52Gn%o?c*n4j~0`Q3C{b-$l~HctNKSoz@wl>U}xcmKK$)64Vw zhf!37n791Rjh-o1a=l{L{W_3l9$6!yXFjcIUr+)w;YzUlC(G#{lr3)Z@jb!#3D&M% zgGEc1<7Q|C67Q#@prRIoL+yC+jBq;EfTorLJj%UC11m`k5?yJI+fohCCZov4H4nlA@UXJ#e zMu)rbWUvj7`&%(c_#YZz<2ulRr-Pk%I@FEv;XXWnI%JKkED7GT@vOLl=PowV#||B~ zrGU45{WV(I_>T_s5te%}*3-pp7siMB*kBK0*N$ywG1nq3vJ?thELwmxcRp6G*@Vki z!*KI<5|vC~PC*FaB{VV-5n2?2(xMOo=6FTTh>XR}>o>4$sas+^B_##Fe)PwUo)Xl79WFdp7#yY;X(5hIBUza*#tq~*7_2k1zXa-s~f)Z%>QWX zrPCHlddcH_(5wNd>V!Hc3(rOQyMgH0%4((byyo>L{kG2HEK9o5!F}GRCq>Y3Q8c*a zJ$Wu1ub76{l>Q^&7G#^X+9b=hn`Aicp22*1G0t;Pti!LFro)GiVB^M3Sh~!;kyyY1 zR??kR{`L-IqZ0(@F-#D`M@AG~<0Zd)ZY~B|Y4WSDzQ(d;%du+J8rHK=4p^^Y#qu=- z#>0pRk0YeIc*Y7N7p}><9IRPfZ}LNA+8Qz zy%K~gmv5SHvqDMU4hqNFGncS>m0PWH$M(G_EULu7<7a4T8{~c+@^T5x*;RP>pa|Qy zeUEi(wh(d?2)_@_|JZZ0kV}TQ^`KdnW%)Wwf@i^XO6nsA%am3(ymaY0wrt*k`STXD z9hr~StJmW4m8(pj!sT5>{T)oBn3b*?YW!n@Reb+4R8R=S(D4*#q z;eaL40DKXldC}a3ID7nr`5ZS_5-`gN%jHE>AzC<)Z>}gINEg}njTL1SkEtTSQu19E zJUf*eMEif1HQL-QDD!0|*b0gT!1Xl*%jyyXW_{P%(AfI)U-DbgcjXbezDu zy|orCtvY|Tvs@FL8wuFDuC6a|ZYDIh$-qEx?rEzu^|n`{ucLy1%4gbDg?@eJ_xQ 
z6Kf~jV4Lll^|uq;pD-7{U)hJ6e0`*#S(BUHYN)<;k8<;9)GtrVR#fFyzq#MlcfVmD z&34%OZP%YGOsXvs!!%gPx!KTc76~?5Qt?+M zPx7GiFW|{e&O4_E#gc-P&zAr1n!)LSPtfB#mtQ06#UPv!vrzEildTr$``%nC49;fC z|9oD$CI9RFujIk51GRr8rh(^nv*Vd2ABZ`9o)M-j_$rTfwhh%j#KpGwD4TwCBV7HDH!l2^=)*I9Jvr z|FSW2BA}B(@mMIjH;MwH0Ry2~*5;Q4XjT9Rvz}a~Fe^FJK7HTvf%d+0&)(D2eKB2A z^+J|TcBZqR)h9&)5iDzV^9xPf(1Z(jZdv6%*uBcCrctu~>*N}TS@PwZ%II&r{_m8a zS(fHMe*M5s_$gMdT8ViC*ju5Ih)KSSdyn$a(OJv9=*CcAEwZ!Ik)0z8GlBV0Dek5f zBISNA?y$p4WJf0pkzSK3*6U4nxUx1+Tbkpi+%s-m55cv7AOzkF#<8DHU_Jq1-n{u( zFnoEYTu zdxk&Wzr-IOc(3o&=Ex6D3r&m?k~Fj0MO#o=G}jb0kjzI9n#~W6c_rwpt*pS$M~`8F zFVtJWrOS9a@Px45gW*oXJk$2Hs}nDuK1N+-3DfS5n=|JN-pf1#=!FXxVIJ>q@v^n} z>BJ=m%t5KR9daKb;r9_9nQBWLYvGCbs5rz`L23h^Nk+=k=KiQ~4cu|VqT)oX;P zH+cTcy?XSl%}VcIfZ3;CIY_jIntA8RCdm29iM`EXxisHsmJQ{Besj<)Ig76Wv}E7& zdgCuF?3zNZDfHT_XVkbBS8%?El3-TtzYSV>Zs+gEYjJB4mRL7bM_hU4^_ez-stpJ} z1I%y`&Q;vzuBKyUGjB)F0_;DPxQvptN-dFWSv*O$&XmjYHJMec{|X5 zj!C(d@ZL0u%8SSRa^xwVkBpc<`SVe3DTRic7=On9^S?Zo7TWUKG1X6d#mxKEn~W(x zOTxVtLD5YAiLs~n^zj|i(;u+>&o%H>`B}mqlRELJdS^W(lbPAXQZv^A$u@k)R zN=y6Ry@yd<)qFRVspW+A@axgYD+Zbn#WJnp7ELSmdk#}Vo| z=oTz{?ip%JT%-)=yL|87Biz5AHRYa>%;N;qH*bdF+g*FHc<~CG=%^?o^XD(L#gc-8 zf|*uLTD*m;mu~z;moHqym5YRE?Q?sBZBkfB6rv;IaQ^INY+V11xyUSCxCG1CE?zow z25oh+G?$~hz8XV>!6yW^$1M#QXskz1eJwhwD$!h4h6WD&8_Ef>@?j=G*AlFyy2?^Q zvr|JgkMl{u>Zz`(7&VnesIDkPE%yb&4s_+FAvcR@aIUW_LoLC$j$5gwrUcd1{6Du+ zEmdD%h9*LAOLH|^3D2@*%XOo@xdBae)ucBp|qtE-2`L3!Rza& zauO(a)H=1-5t!?^m6h44ueFZQT#N3eN^~)uU5(Y~rn-3Du6jaq1Mh_(FE#6aON&uS z5UwudJr@;IOm|5sYD#z?#YLzsW`32Gqoulr=WJ%b(2k)lMI!0Iy$(->SCK2gaY+K3Th| zc8GP)V40AtRcaO4NwDlBCQzU4eeB<+acH zYGQtA0@R1Mghqn%hc~je%F_JN^wFS_h2{Skn)N!>D_M}ndzhI=E*zEe`GsjF zGz$Qj|CC#ChsO$_l?Pr~WBz$*^zPM#+H?H6uL#XAxNjAL zc|Qft?z5L}e`xmZw1lZbYj*(N@2?vMMXHfI=f{i%!AvFIb9_R!l~II8nN@h0UV#T`MM!;+g`~R=k(m4d@rh}OjlGZP=wt$5vPI4iGzSJoa$hTb z#ritrFm}EI=U{||MdR49R_|@H z%AYKkzp)&Bd^y7Ve@w{fXPfhw^ScfGnIzG8#nRs{U<^*;fpY3mgav2%~qym;h4nlER$-q%~gZt 
z!Cn*U-dP5e7xLlP%x}vVmLcae%zR>9^4xsPC4&E#1!i(=0^#X--;^t>$!aVgwD=xgoj5WH#ZOEgtHAB))N-zViC)!LK}Xw zdAm)#jb(k*LOQZ4hlS`(#$A4Q&n(Wj0;8Qpna2r~RX1&-E~R~&dq6-II0q7-ZBd?} zNL#Gv`n4b)7h=(1=FQg%p>tTy&!VNJ2dyprC@pP8MP(n#$~#b1(~hE&O8ju>7*?)a z!|R12H6`5^AW2TlLh>Cd`4NFR!`w0i%}$yGC_m?ixOe{%mC3r5j?}y9Y_Ae<=Kfx4Sto4`kg+!i{SYxOzF5aC{p9mkH5VZc#UJozQ>n z$_;*Zow|nW+`n-4A`a|3h&8L$VKHG@v9VWjFtdEoVq7_YhM>o1UQuGuJgxzAUoTz{ zunig>#(S2@w=Acxho4}ge*gn43+=UaXeLOv)Ks9Umf&YV%FjNv%Hqmx8=Oeg4dK=p^KIwKv(;QwNXJ?*iZ^LTCfeQO|QX5_Vf;sU{pZ6Rw*C*Sv=Io8|V= zP-C#%L740!WU9>9*HodF&}^!zKwS+%mg%jpE=6-gIgcgqw%4GCu-w;4K<;k9U>DU% z;OuI_lb#OK<8H#I)FrTOAgtD+pJ3fjuEpGkOA0AM za%l-6nga@1>X{xz7iq5MJ<0#Owg#OItdDIiCJVpEy4u_7Lh%_aGw%e%+BX*welK|L zY-Ihe=l$2obw+;ZrF_RrO21f?U)4=SY*>zeT$jAzy z2HdlJdU9?A+A=XdDFK2J&oUwT!-~I^S5V}2hMxmxfA@_~zQsDGCfv=^{}P%h?{`gN z^wQvIHZV3xbHje@8qCrTA%azos!=<{XxuwW;a33%QwlFULP>NBS^jzNN}qO zai7e$S=Q(OZ=u;qWoPCy_0K2W%T)e)(Cj;>eaC`m;(5H+&XRTVUj9zyjh9XVgM$i# zO}}TU<6Yzd4jf);X>Ll;%nIQ_vjb;U4p%1q%i;8A@0x#=^Hb+_cn^OE&F*=*JbA~u`tc+Vet#5eS8uRas+0`>;osiA!0+!~;Sbj1KUlAwtIZqx{3iiY%Y>Ud4`EPlRUZY) z@BG?>_dGYH@}SS!K1fZ$o^N;J__1RsDJVqx{Rc?9cb}b28yovBHvT;rXzw9Z`C_jU z(jK$pd`fs59UDboUq6bA%gpyOI_4g(veOnYU%MG^&>R$W7ooxT5Eh(-=FvUB zPnX^AKzkd%ms`r9S!O1NhcGhO&wS`&r`m?Tu68Hw%m1*u1A{!@0M)O39`~5n86w~g z5^e>Vg4&+8Hg=><<^t0}IO$LaPbh6yn5{Z>nx0%IY7<1M7DBIHG1s#LY+@(dBH#B) zc7#;~1*);K5;bLIs3eS*7Z#zSu!LLg7jeJ1h=amnRIt;oDlI~NS&8q}dnsDAtt>!u zMFGDnKtp*U>Lo%`ZDlE{%8F4|QiKwzxX=b#rTO_NE1(MVP*ISFniBrEvV8EVJ@ipJaJ|rGf5mzhQLr z86G@HW4#i@hThzw`l5SXP7)Yf)b^bak>ax>^Q?j$M7MT#wV9Uqf{q=alAG|NRJgAy=1pJq1+ zFdg?FWa4337E)3l;?kAtSieDo$$5n3`LSh;*P z))Mp$n$Mjf%$1>v&$c>07uC5rsO9sn$;(4M+ldC2uX?JHWvy9}GRkB{En}G~MNN5; zS(;^4ZYH>yC6^G}K`7R?osg{kPC}*m7_*!=v0OK^{A)m`I?<|-bUx|(W?8*`$GG2p zgE{`+>I2o_i|Qd**6$9MS)@P=%iD{ITyQ#jMAYYCC zo<mpf5uKaxtn7wB_oiEdJ-@CG=d-yyw^BC13=8XW^ z{4RNZ=@rkf&w?dewTb1{K=L#4hh~d)%lucUxsQAnqip1k{@^AI>Gi)R6>ZqG9qTr1 zLsV=k?xz(XB02#f5z&ZMIH`LDc<`_c_wE)rAH?LlNW7DbxWrUM#oa|@v|KZiOyS{j z$)oxH)kP%d5OhNU9w~umM&e2rAwA#F+2Xqun+>T 
zCTO})fbykx{`cObP1l$yQ2ZZk-2Py>Q!M8{-^d#M7XKg^%i{bS!P)A!%Bj!wNb$$j z76_L8rFKE?8F!QJ;Pi>(ICAiNJRN$>|Gp#W^`WOj%VoE-!)aqf-iDrLEjZAE&ek@x zw|AnswG(*-Wo9J{2}vXz-@&yTaX80L`_k12TnmiF^*|0Ff|3w=I|&iD;}FY1M0`jX z;&0wS>dos&ynGRNu3W;6qlXc2>ju@9Ec%(5hYIT8WP3L?Pa6MzLJj`A@n_~zq3wIQQaqFOb_6fS+=;9I6HtI zCuqycJV9{Q@q%VqdIjIZ16l}z|IwCkED5N4yIOr#vIg^~Y-^&}k!xnZ`Eu5Czg~+* z5F{*omtxq;N+)Ymg^Nl+kgX@s%BQiRvfTU{8z@0+ovcPR{2xJ}t%e<9b&Y|upjnfz zYRbz{OOUBAB}A82pqg+~&2v?i7rS`7Wkm#?LNt{XpoOs8La=Qq%R#eLnrq)T5Pp<~ zI-aYF|F59ROUq3aB?R-5QbKtlfjJ))1gna?Y?S9@qB8Fhs`4@jT4|`reu&D#9OP$b zVC^dB*05sf3LH9k6u+D}LwF5Fb7QN;)YsgScYL1jSZ+Ul{0Oa9n442b=)7YS-)`Lu zMF$6P*RI{L^C;58zP*Rdl_DYTzWMA1UgtoS1GgL3!fjEK>j8?DEWa(}K**FkM!?lToIP{FCLYe6r-(Oma3vrR9i4q>Y3W5pbsH+G zI#5~Fg$e?5bMv6RZCbl_lSTbVNaP?}@uTBX&C;Bd@QCp2fZ0KFy3fk|5V357JWI2n zIW{f{hmZV>g^T3lKL=lGY3?}$=ehGRcfQ_KE#-B-!t!OSaPs&Wgak+8+SO28y&TFs zh_ZaR$pOP{rYDSpg)l<(jVpmTar6YXZQO!YOIHz&SD3ygl<(br0PEIlz^Y}dv3%iD zLi2f4aiCR_m4#BOA}1GB`2~Eo1#U}lmMRJg`0Oc_10G+_ZFzBlse$g3dWSF)^^ zvksINW}%SfHY+m;*_p{GF34bd^K1YhSnq7nA}kynceLApx~HA++EK^)UQf_&K!1ND z1_oO&^tc_5hY8ffT^J>FKO5e6E?@d)d+Yt9A=5Zg7H|fod2IOpS+ndqP z0r_BWryGd-2HgFf9q8A9-qN7OVcP7?lA?q3cXF`b!@)TR?ET&CroL`BnD->(fZ8}2 z5o~@GIl!AR@AajBl84l<`YK4EB5$X6saa z(EK@Y_CWdXGy}Srx&G;J(5iIK1VQ)xU$5mIKP?aTe9+8%wb!P4u-e)=X3AS~TCZEJ zGu3}7m%s10Pq1Zv*;}%bdjVCO}r67#Q$u%P;CY8{9$6_|e z#3Y*)OYvp(+H}DJ7k5^RnJiwg3@ewf#qQnvas2oR?Ay1`mZ)CH&iKNaGx+!4{@@7T zL9?5zss_sgX&*qp=02rGPt1LT(EJYq@xKw6|6#E}`Fu4nHnAPh^0(CQ1n1vB_~e6T zf%MC({Koqlf7R&Wh9A(m>c#htTCLs6B11Xs)eAX>lpi(z1}i&OJ0j zt`{i?3B8Y?+xKwsN(|0jj=%*$^UXNsMI3@}Cmj`!M<1E)>csnub7V98PzZ`jj7f**N!ZBf5pe$DuO}H6WOxT_-^8+-?w50C^ z$Yw=$om6iJLAj;T0NG7&@~KJwnR01xE2??mEbEypZ3fGFolVK_v01*K@&`3&CbSZA z1<|rRYeJQPS&}7C6S0~q_@BxuSfm3&LY;wgxj}Osfw{h1pMBrSa!g>ZBQQ6V<`b0j z&{~pQ48s5+wEqYKc)wRJT*KxK+pu!!YOGka64x#S5QYhxSp?V|2eJa#vI2sy4`M3_ zu{LmGIVhLB&)im&5JXGew!FBQFk58cTFSB_33dx|atOItmR5~{m88`0U$HlWqHp!9VStot-kyW1vfk31$^4{N`h z(5yFCEe6hmf?T#Ya)BD|>%)`&0X!Yh+pZy-|1vu87$XBu_<0zkgW5i^Hhg4gh}!`? 
z;oyFFfa=$ODYa?*PH)`gW+rzt&10~IVyMZ{5i>M=d^t`=aBu6L@qFW~k1yn=Hu@Z| zN7ycpJfp^W{J4wWG0rx1ToVe}zYChV)x>HogZ{BG`rvS=e zfq+6daL`Qn5+F-H_;KJv5fZeu@LuKy;t$Of!N-=nouz*X&FnO$Gz*%i!P!Bm0LmYd z&EL}b0v!Kph&skDu%2}D2 zFHDDnW~PTAXAr2Z&gTcsQv(iPhimQ`0%pmtBX;>Ha8{ma>)W?H<}r5PpMhqk*Pc6{ zxrJKebDsug_nLXiAD&w;#`g!ytG|I}pEX!Zg`01t;FqI5la(8$OaJxdwU-}mvM9f! z6x&-qoqA?7!I{^v|MEP9<&Q7jL_iPbr=@!nC~Fx}ch57S*~^n@>ALnCuOokX<%ef! zw#XblXtpp}Oox7V&^&3-JjnsVq_+I5NmX(?kPCv1iv1)YGAQD?ov%0Z|GAqrtc5E= zgRWx^J3VzkN00uD1K%ITB6c1ZF9+jRP#o5;--Rnz!jN)5AHj0DApFK85|on^6EztL zgk*(z(qaTr(MdL$Ej%(2u`!CT>N>r^z;OKV!_Qc^ZZkXC#TEuiQ8HGnSdMSL`36T0 z{bZnU_RK|`J$n%+PoBb-Z?>3U=HA`A@sfZj*NjihcRgQ?@PB9BHD7ds^B-?NIAETn z_*uZL?H@c&RsnO*u(wHk#@Y&?6+Oobu1_g)$R8hgo(}|PZrhrgkQf)s4)+QcFItF) z_wKVH?>A^3>g~s1Uk|D)N|6*Fi*u)c!FS*8VqR^+_lJ(-R>&PhL}ws4j0#QX{U;GP zB5>te6fOmX<9c9}TPpijG$KP|5vkY50RafSa27XC{*24t@5hO4TX8?=7XIV;IR4*1 zf5&fZVBc#cE|!m%nt1p867`KUAhzZdeb!RXjat{{5Y^qRUmo@{q?) zyX>ewIcWay@)_Pee@1Zjt#W7JEbH?iMr1K2#A_0cW~!SM+qI(|L;TK#fND1@P;Yw+ zx?7vp9eqL#<;cY(7)Q#ESRlnN}eCSfV2Zes34j0wz5R5PKum9X5R{gQlsr{8sKDPcuc<#NK2ta}wGqbdk4l?0cHB7#SuCNvQq zB!WwIUM@;A(@}6g1;y$2ksK4nLE{3<;ed6=7DC?jU|c#cSPeo@U<5*LMItfg4rWA>VL2v>VttE_R9xhHIC1;}=FeM!d2^THFw0;<++FGcB0`f0z$|mrl}m!< z+Xl>vcyaxjfSFqpp_$jdMqpNa<13f15wx%3JnN(C<@xihpXV;)^qC7da^#pr=#X_+ z1MN#!0?^$zgxdO6RMs?8Z73}lIJcsvu9bu7L;~#w{{I?M?`q+ahio4n5V$gU4|2Uo z=QagNK2R1gdyz8)%!N>R+`n@32z85CrL z@})~wauD&0`DAO!@4)L}W?h$Gw*0Y`&p-3`t(&%+MS0QOC4}X**uH5OemrynSI*zS zjVreiaFKa&;Rd#Ry#p(jtfE%p#^oD?))EwF<)b(|55h|pZg zvL{%sEUn}<2+l0q6#{49d8f`@RLFfYd5?N~qg6{QOY_)HQk8l3->Mqcfm#B1E$eW7 zIT{*SFFAN_Y^Xs!>#%&OW$CU}d~3E7iuj?)j&04==I`Cg{r0A64zd-m+P$IDyqE6Q zR`hkq+T6qTN9uFx?z8Xu+Pm2vb)mPD$8;04dwMvycafQfdwZv#*}&Phg$B1!X8?O!R{~ac!c*|BV`O<|dVKjb1=6$haq=DSOBO%*H*3%#fM#BL)@I5;(4GU&BZ%a+ z-9#dH|C*KRp1lXka_gCv;=($&2mQz8oATDZE^*RG?^oq1p;^yLR_|Z6wdcTnUf;=C zSAV1Q+dCffme2Jqx7xQwb|`Oy#d@}zf{?`gVrMDi;!?%2v z?*z^!m*={F4Ipd~!7}9BTzrEwUtTNU^{VCr2c7P|UD+`49^AlF15vqS2$bd1Ofg*w 
zJ*VsF&4w*+&NRzS!(2637Cb1D`+}km*z(7G7Fw1xHZBr#=6*@d!NJ4Fu>ITbv1;9R zghng=at&hQA7RHXeSZW=so98%)pF2x5lL{4zjGfkgk@QfquF^!MM=r#k|7Io5<5Rl zEIM}d7pz^g2@B>eB`hz*lEo{r>Fcf7zwaQ-iz#%&YuausT1e%%TH%Iq(lzmStsr z&oWAR;OyX;Q0cQU|MU0XFx20Vt5>xI_yz3VyN3gcFLD3=eLNk0LI7-ICsu%Km#<^< z*WVCSWHEQEoh@9p4!@kgh49!+1P0$j5TQ9JEDaYghTzf#z2*)jq=d8Mmb8jmB;hC! z2lxMg^|F*LUxp=o7P~iWLiwW{{QJ9)_~-kNws?pp{0WXHCs;?GJ;&QIe&$xzW&yNY zNXpfJuUz>X2Xt>5T>SC&qve;vDoQFBRz{ewwld&Lg0qEe8=nH_SEElbHpmW$a4f$; z$xXs(o$=Y*?`d|k);)jZPIii>rW$lL)uFq|i`!a{PQBKxuSGi{Sw4kbjrw1mxmVay z!nRdBS9Y$Id z6osg;1jL2kL2PIO_v3BB8!c0v$aV`0^y#zs z>8IoPcIUSo@Gdl$2$lPjr!JtWtrykx9Vn@2LRnP{$|||7Y(zst2Og$pV%^spaOB6I zk$g9k?M4RT6J=@6Bq*m5o(0Dg0Y5SR0m1np5)%m0lqLxVaX=|o3@!hyI;OnZvSlYy z?`5L4sRwl}eW-2j;(iBOI(t!B*MJS1wwjBX!p|v=bikEhLNe>zbxmvx<#+PK-i;M2 zm)pIpU^}D5WPUk$j`>5tz822>y3N50+oto^aQ<8XpUpM{@uf?a;p(N!X00vE$w7W* zCW^DNQAWs>AG>qGAguaS!E#Z_a#77~H9yNLTt&c@syVOmpR;Azyi)Roj4FwIvi!iU0ASS!pXZ6{D18yris< z&y3}>n&}oKR}*ZjinuL8bq!T3*eyVHbsoRVK{>&>v@{!~WqJH>5sFzKig~?KgZC;_ zabPOf4}o)aIsaeI|5k8cR%_PLDh_O`I1sL7y>Do!r&L~=(A3myvC3g1cPdR~kM0=Ea#n%vmLcB83IajhE()y*7`w-BVA`<#5X+Z#D>Cs=o~U22za zGu7K7U+50sd=y3DXvgEOPMf#kso&bT7F&=-@N6MQwJ-}cqRGbm?&;$pj7S9PCqqLR zCL9Z(hYh0pD1v#P;N49^*4&Me!9fejso3>jn8jC+BuKNZ58~J$n+Pp`2h9Sw4?Mlnoxd`z4V46`I)z&w^%) zvF!t9<(XMdnKpM#`TOvkN-JU5R?*=z*7NdT4R3mx$_ZO-X9}7ng0u40L9>_74r)CK z618>t>^p|{=P!LHP=3ebBoCOq1$JgXH#?t|Lw{&yKKMbiZGF%@4a_`7|FP%GEeAy& zG=EO^th_R8`Q|IXglD^ddtS;fg0F$HqGx#GGvfCeX#URpH&~XKe$Q3I3pr@;^@8Gc zVxl4mgI{3bqWRdn{{YsrQ`q>;Zp0@)!h=VZNWEXo4zC=S0wSE)`2`Ei4*&2I!oy++zITwwI&~-hE>Z}QDG3je zoS23b4oK4<|2H3t7x90GPS{%?)j@@NRC&96AqbZ*-m-;Au4w7&0Jj*) zRl@RB^HmPArR?R#U~U@x{?f&3ICJ^}jvW38dv+fnq;4}GVMU}+-0?X+3$tLhvAG+y z4c%s0E~#iiQE3Cgs|A%cEy&F+#m-%OvG@B!9CSZKY{COX$37%9XAzbO&aoL(ItST` z#jNmh_i-ojF2VUee^0{CKb@L_<#`+wtodp)PM^Di{K5(}@Vcc{Z78ZFq}Q~fteU{p z){XQ>xn_;tf8Zzw06`qA1`?JdZNj4_AZr1ZGpEm6cs{KXxOVMYoIiiTTv9Y(P@8n| z9Jj2?mkG_6FA|(DYJf~=Uh1Iv%B4$Yp-sD+f(OZo$W42Qf^5Pw0k_PcSx_nwJ}JSn 
z+<2TT4?&r#Au!h%FcYQ;%~b?i4X9)dmJ3BuK@JM@1kbsCz$}Zlpjnn<2h61`Z{;kH zm9k=&5ymN%NvT?&S)S!GQN^-bWiBF2levg^<(21P`7WL1mQf{WrWCfWy0!>aK4`9~ z5jf|gvN9JH6?rJHAmnr4T2fv{c&^~N2=C?9s3bI35Tq*z)KwL7?I=WLh1;(*UumY1 z^|+dY;W`3ueZ7Nb2hJ@9(RFpT6zgbxHGgLvmW!66j5JoGO%o;+VZf;xZJ{YtX#zO=1TzGO`z7K;~ow8+vW1nMzD1*ciQT^E*?MB(?!tj zhn6OP(l4uYzpYj~L=E=#n6&?RKrSDTF*-C%u$CTU?6K1*fm*QaN#~Fs_7}FAmJhzn zdd3D^{-WlFAP8XR?UvI1D>Qqr>jeo@Kqz1^3kqS;2NeceKI^3acc0S~dmLb-m~w>`*)q^;|4w%R-$=JiiO+rPuu)cVA8>rg+G6jZS>Wm1 z4VYHTf91P#k~fzci}xz|tjh#A51Jh~vy)bbEf}TD`fBAt9lGw9dC2s7VbkQNYreYN z7bwn#=GovZ7ZwBNsrz@&Sn2j$Gjy(*&(8OZ44SpD3Ganv(!sKWX7~IwFmbX~11WRs zkW?r1hF~gBXG+h>7Fl@CJYb%fzc?CBdgd+Hia5|&S$yg+C^hf_bF#{NCu(;Ku?%jK^VufINF2P~GgXN_w@XeNQaro#N zT)G^A+rbYI9`OiK(b)upTwJ~qhBM~^2+iz(2`o3Rh1$d`LCcxb7tN1bz_f7wLTul< z1u03180FwV;UBe%mE2Nn@+Zrf0W)PQ&hh(KV-xoKOF=Zh_rTfO2-SU+E#{90%?_L$ zFq>-zfAmTc(Bw&^%h6nJt+eOb17z>&`aP~lX#_V*n3|H9JW~4XN z+3zm=lV-dNo(ay1>8gFImoTkk8w9~KTMwEW2+0lX(Cg)rLD;R~ceU)?>t=|*6_;6DSXrCv1;Dmm z2zmu=ZW)Ay6wPLPBB) z!C?fw8#qV6JbU(vLEqJ@H*uX%eC0|Y;V{tLFs@v=L14UsGiNU1#PQP>#b(F$@33*h zH(1Vr;G%`ge1Pl*hz76rW?X{#5u(~rRNR2F$~F|2w-BCM%q^p^xDE#o9>=!r z`w)NUfdO+A;W#dy^@D@S#DrWV#Agwj(-0S@xb9la{% zKZVgdf9^6ie!YqJHkX6rZ_M(1j`?){>~(9KE}rASfzW*Eyy|Tr2N8tkP1_Bc7cW|Z z3+K*qz;hSZE}chsP#_*XxNFd?!B7#OZCOzPpS!c}&V=SNlgdMll?ws02hA>`i+qnu zWW_Gb%S#70kDj4Ym@#2%<^LvELRnoFLQOB1Lx|RLOyT8b9Di#stU|yL&ul1zLpV! zOIgpPase~zX?Zz+(|HNxm6bZbyN3Lv1?(I-O~9_qFEB%SR|6v4K=6}^h=Loj{zbd-9Ak;fx=+&_qc-d+s! 
zbfd3}ZC6Jdy4e1hwYh}@ew(bw^LOz7ovl-mel-8W#Qko;v|P@Z21WcC;B^MO<#y4{ z@3{4cW`gjO0YdSBe5xPgnE|weWr6eflP4zaJMiwq7wWt{0QIcJ{`x3zF>3~4zy~e= zEogq>AngA{9xQtjAbLZ_v!^@*FPj zweN#wJx9Gpo|uMaTg!b4ngz^G?+A|{eb8)g9{BuSh!({mmXEIM=shc$gG2$Z@>qE- zP;~qL(z7x@Z@+ujciwX{5c8eWK*G+WJP|Ytn0fzpFT78ItU;}w0|CXJp_7Z);R9xB zx=iqUY6_n9e3W0jh9+{^f)LNg2+nTspn74~QGPJLKI!$l)dx*PbinMc@6zeg@8yfT z58cB%TdY9)yFv3P9#C4^E<&4g2$nQM`OUj+Y$Z zyihnfK34(pa}9WWz^vco(>x`9*7t5ciLA{k<18a8TdJ2FFnAVcbJNfOLiu5XD_eUOy>JmVUNhe{DNy;YNr190J8N>?l-1b*vkM6)Q1+r+XsgePt12tC z3maF9&Zb&)X@xLZm}PNpZ?^x|ma%hYC+ik|Aox}(Dn>1#xq;g{G*uF+3D5OXWwlQP zXu`7rG&^efEY}N=3C5meoz_;cD=F-pMbEJB1kx3RWr8#z!sI*7lYw(d31PaJpG)j_ zi>MbBn%o8g0&Wm06OqiiCLi5h-+qsU3zr*QuUxSXr+zulL2C*^!=iEHMyLVv zNq%?q$KyD9>^P1b{Rw;beUB|$cM#AwvusFZgPapg7B(F0Uixw?M%H2%VHnbDQ>&^9}w6Yn6 zr458;0&`_MO1Z5dIM+3Gqq)70ZBa1u@&~xp3kAyxtEXroT7A^Toj!Tutc9b~HIx?t zSJ@_=4X}3U;`t!LauBXuxQ)x_Z`nX!)@Rk%#R@;ScnRzCdEAMM#;M~!BH-dVBym6Y zQ94Qra#6x(T2_?Dvf(0T2$~I;3CFc810FaVJR2}ORk8e(v%HiNwiUXtxJXen2$}r9 zfv#_BZa!u`CMdJ4npK(Q-JrR=6g8UAN@%W?-~J5Af1mY(s`Gs(-~MXWmns5p6$hz; zWm$m@nhDD_lonN~AvD+2a=(V+wnpw6)kQqc0dNKXE6Z^?>t89;Qp);P!F1?5J68pd zktLfd_x-1mbz9I}#Q|_N>u_y#E$V6r&b18`2g!uy?#?bekq_~+F0`e`?i`iC_Nn-Cjm6` zo|VvBEyz{{;(dD0WU4IqG&63pD<1~W{LM-E!HPZh46jB<%!kxw?2Z#&EH4DjS`ygh zg<=+as}j0FCd(A>O)%u$i?bZ^H{PQx)|&hndfNf>nsw{3edlh%-!hy!cNJ-wvM`t6 z?!!W}EiPIm1L4{G9@@4i((5yI`njC6_3AtfV=KF3F0woWc zS&n%g4Gv{-e#0_(_`q&L34!5D0<*!x9DFfn5mv6-ik*9o;l~qKaPevwt_8+0FCOCV z{SqW46(Bh1F7GK6XMVZC?N#36d9x}DY&0?H`~61*jEHad)P zJ%82_vncyOd6HYnS(vqu$2eY%Jj07oig2xCrmfAa=Y0O&TZ|dd%zOQv_x!u^c{PD{CnS&UF$yUr!z=)Da#WG!r7Yt+6<{x`sl<$v0URX$Qsz%Y+*T%{-3ZdzLx} z&-zUwoXE<|@64iHOh_m7shM z`wtw(ZoR1?6fa-3$^p2RZ?`h+No9H-%ijW9xMS7wRanl!^Qx6=ux8a-!t{E4vuP{K z>27>Stz&!P0rQIGtMMqa0PU>cnS+ha27BML+Z9`3UH-Y#8fq55!GX#kV8HkR$ zhZxq8nCJ(#jf%X7+d&Bk4@-g~>qN)gK|*3G)_tWF4VUBa;bV3!4aTc#Taa5=Nl>mw zQCSm;DtN7$PL$Pj5th4ANkCVmj?Ao5i^_4GV63J4!$RV4`&JBY1VkV(AkvoY*E|Bv 
zv-t7YNppkIKw--_J8eLF@jT&}uzc}+FfI$2*=Ak6H2d5t;ZcT*!@&cAipQ_}NQE0$iTg&Sco@F_%lx5n)bQmO8n6zKXx+M@- z2*)a(ubRiz5T2_m`915psglRbI?el$KXz3`HNUIjmM~9{Zm47X($;}KS#k#lF!JOn zMxTyiOiRf>n_huaOaCj1l-kRe9v{q z-bwAI`V7~8YO8+q?c^JR`6REcCHI}7Vv>Ed`tOOyiqz4MX9N8hlOOcZ(3ExAbIs6- zqVi`}gr+Y9ezugUPc|WonsQ%|3>XD_1SkTGlVFYzWsv1SSMD78>_M~Vf9tQ=fc&rI z{bwc&I%qby@nDAg(r3YW7Jzx0f+XKLrWVTJe`Z6o13H%n4z&27c%T2L(CnWdQ{r`} zz?s*YI=AH^^TW3_@i*OrZEfF|N6HJIwD^a)7;pqP6PVvI4+YW=nsse2zg-jz0W%9L zA@AJ;HO}8Bn4Zr-^Lx$MXQ!>L!lAvNaG>Zctm@b$LG!eQOe@)`<5$=$3!SBB!TU5= zH!HjHLO#3RDrYYVPcKHtZ1ppRx%t5S@B*TLAXK?YgQ_3A zKJ&(1mua?Zsf_WSnLfE2X`~z;d>ixU&NCN-mFyH3En9`l*MpIkU5eDSeB4bha-f_> zaXy5+XIhNZTuf$xGO`_Xl>=o3LcTa;zszp8n-59zICJrAwEva@7iK zT))ozTQz_X0RPvYzfb9Z{6YQx2mZ^a|M43?OP~J0fByPA{`u>#_{S$j#!v&vXXl%* zAb=J`eiTF#zU9Afk$2cIzT{x#+il+%G|!#mCLb*#v~1qK2m6nl#<7#vaOr9+0)tYJ ze7ArAGC|G09d;SQBJSYg#oPGl*hL&Wd>+SsID@0#pTwa9KM|CVVn6fehl9sVf@Uqv zJNL_l*s^&W9^QX|Q4R*)YF>aE^!FcG2bsQiAMhLXP94G61k1s5ycyLBd>mXo(TaEk zW}g*WR%RzRnbxy7s}s;xpY`oZD~bct0oe<5PjYJ+}YGZ*lt8? zT?3l+TDH0twe09?$_2l!!*8r=AbeATW*;#7Lvw?~&Q>lPb=*2lgL5rAalvzqtZ;+_ zb>f2NT1pF;C}f>tJ=YS7t4oVfP3Wj0h}7^u_L^J&6+rWMNzpPa0s_HAmcep^of4D? 
zc8ZD!Jc4CGvlmNOufrWYE5d?ajcc-(fLW8IY|HCv(o_ZECqDiTwrtsfx%1p&A@dh3 zwRpse7O{HudV=j&Sh-@YZ8!QX#;aG$-*}}hX7Z(3j0L#PYCL!TT*ByD?E3Co?Ayok zyKf&3{qO^RJaQCA4>b@{?2+`4&-z|PM*THEpkh6%X({R z?!<&-w6=5;bZQAV1m==P-giBLxfvDZt;o)*Ac!8oiQ`w1nv#R~I7Q01Zwr_x{&I5C zBP1th;=zMl-s@c)IB*2t?cUEcZnSG`+PoEK&tJjmbC*%U!BTNq9f}CdCFMMSO$W*e z%f;nw$SZ2Z-3LX8B#_^@5sfPxzy=0JAuKErvC(%C5q1Zmx8o7SL2b~D7_&SF25KVi zZ5%m#+$`*Z=51SdLD57MlFw_a4-#H=B}z92Dngqm7K6kla*o41AK4X@NN)D*BRJ6r%R?OwRLKNrv zB4$wexp~OT%j3Qxb9j^gSVrtTJV!pu82?vlF`Hcoxe{)R^W22NBL2Uazm?>v%$4#O zDvtxU+#;0a_+%mMcugK#q{)W7ha#rGnCX{F3bS}#-cJ#~D}IE6!VDA@WwCtcqNF5` z&ztEsF)b{=6|5UphFO0E`(?ffjb-^1x0OX49I~v-0z9sR<|ir><7#K(CqTa|2YVn zS?HcmvhX-)e$Ra;UjH-D{NA8>j1W3bdC)8itG9KKJ8cQ3n09aCn?=(Q1n5nM|NT(j zq(2AE4tPCi_I#e5#abZlBmmd*`;GVb`A!##U9D0Jt3r~6@gT|^jJ zzbEl0N4)=Fe*M9G{F9&mjd}VHotNcI%NpA<-YPdd?$`*EB0U%fKg=9~+1=pY*)m`G8)7gn#_{!F=tOEm?$BtJh%p^3_@1++Ll*`Pz{2;v$k8(;7f9D-)(9W>v-83MCl`Sd9l#!9OJZXi6aS+fD_)_jeF`*{A4 zC_KEEhO9?fNJtQv%ddD5?j-b2{6<9qDebrc7_JBIz#_XmEoxVf6( zwRg`!^9S9!)vGJZ@V?Kx{!vsz?vJ+3(s6QQLd3(o!z(4T+{@4G7|N8A8 zc=zlD#s`Km);ENao&myZzs=I_QwP!0Yp)G8i(7j7Sg}Y4p%n`IySvauP%UT2RaaGJ z@lCaA*E0^L^j1R_Wpl-N_7txto)Vg!rP=do_S`gHbMU6PrLsC}Ho0Kgg=G>jJ6Lvl zY^w&!qO6I0n#5;Pyku_8#loz^1mv=6f@mcHql}$w2_+bn)wu?Bl>}*mQ)6|5A22(s z*%UMrk{f-h=lT4hxu!xQ^j4G`G*=UpCC}3A7ANsRGo|mV1jd47{$IdcW#CNUG2o^M z%>ri6uhb@FvGXq>Bo`BsN-6oDdNNBhkDYec&?|R>6M?zPB#0(J@w~+XBi=(!b|Efb z3B=K(C(L?$?)+7&d&>#VbLTkM4V6!)IaoMV+BIY>N?q&u02j7+t#iov&| zrgZ&UFn;{;7wp=#ABz?(vGXrjs5ez#VhOJi7!-l5+!DfatxYH_;knB+!LXzOnK>1< zuu^#BJzNclVjhJvOJEA zzL0~Fd2<(;54V;Hzi{rlS(^pUnlu@pH)Txg<%`$tcv<fo}a%Lvk!kozbD`B?(z zOah;vIoCmAVJ@F(0m~8LkPut07|w)T#dR*r=irJ!sB$5AF6VwJ!L``pF%yyrumu8U zg0EDNLuKcnAS=hh&z0ol6L|R_Zc7#FE?Xh#2)hQ#+~#CiL=Sf!!gsb*$p2H12%=ee zD9IwMN*2n_o!=Hw$<1=mT;S;u?};Kj7Zs!(ruz-_*%ka?`F`;O=0vBk<;=CQU#8XRbl!TKjxmYS+6R9DF& z&U#&4VLsJ@@Wwh0%sF_k)8M_jhUHm+E*BEEBW%0sYHH2;?O@$4X4Tu_B4Rx0lL2AK z`eQ|EQlC93w-Q;O3Co(u?xc9pLp}@iQ+y%#H8>NX5M-Q>>olx*&@6opnr*ThJ2?k+ 
zo(KWldK2ONCF_agzi(C+KXCSkXK#NtG*7)ce?=gAC2*ESkz2}p?QUsdI^6`pnNa54 zlLwaS?0zEzdGP$(8=hO1P|5>mmmltbpR;P~db7XtLC;4nD89u&yg{7#TWcW}9iz|u&4sa(RhnS@ z<|Y1k>#Wt1!84E5D{M``d#mMa$6s;u$ny0}kqgER*aQb7{e2i@#nD>zg1P4cAA-p9 zkzrdY?X@g*Joe2fN1ji|@R~sMnic0YE6t060gU$#U}EqwUObjHZw$}J#tBX@2vRTb zjMp3I_2hf?Y~-mebv?rWj8f0!GsVhh;{Mp+p!rG3x9SPEeN2y*Db}h}2ljktR#Q#l z*|K#f5(vU653`Z@AO}h5`AE$ybe86fVmzSIGfR-2S4M!&$02syyY~~A4<3emChy(P zFh5Ck*6Y@8AON4Y$u*ZQ%2)ZC>B^;GLh>!{-=yTGpal_b;llarIQ7c~9Q^(WHgDRB z^(*X}F*K0I5m$koGVg_wL=pZl-j7># zV|C(l<}Sono3`W7PiJuYQXs{9xE6}!heaqY?Lk~@76Jmo5F8SL*tj@cyl?^EY}$f7 zyS~SPy+2~lw@2~A{*#2|Jc1#neMIp-nf{?3g(~XD zU{?>zRyXgvkAnmr)7gX0#ujumHk-QJWL4=zV|{})nYpZ6X zH}IHRv({??kS3qcu{T*yQlyK{=6VeFbYN_lK&+*3`(;_~vRKT0ZO)fj`R|2)a~9fS z)RYu)TV&g60-?UQmHOBzNeUC^`4`K*!4?CNFEBg7qMRb+u|v=rGN>do3*WgXlMV5SUwO@!l$QiEo*GBe%Pl$OHQR*Ozl z8DQ&JS+>ei!O!-)9Ye6yt8)TwSpm1Qgb@@<3uN)53Iqs<~52b zh2mmfRqUjir1`1d08Dnyoz-phjr(gS)jl9W(&Uk_B(w4{Xq)@ zr=_mLL!%HI9nbO@Z-cO?h$zHFM3zVc`h99gN7(FvNsMAR#J-kbIXg z^$3Lm7{bLv*0;kv@1lA0*sCVUE*v~~kRX4Q`UxjkXHWcensxg! zE}jcOct|XM`S~2aTJsG~p16p8dyZhkhMicoay>zF6_zhsjn%8Z!u)y8B}rPfYOVQO z%Pr%p^_vhCort`GDh@`=*#1-^i;$f8s1!*l*$5{H2V9H5CB>R1C^%@KeveBNo5mC3DJrxt^Z3!f^Pn&Ajd&-A)&dDz$^)x zr<1hqr4gLdsSM=uKe@bazV93s7Eh~QGTnk>gJgbJ#e1k?nkopyl}uMT_ls5D zbRSG#QEstMgkrNeYXYRp3xj422(+LJ>qT)P2W+f!2E~Qm+ns57ze!ryMS1K7wXDA! 
znA^fHtYdmpRK~$-Dch@Zwo`5;MK{RS;8p)s>}}h!UP#(k{k4cClqO&5vrWik{ngup za{fopMW1VVPMc$4ZB-51+&X)kHuXl0gL3($fAMCLa6zDa_lBTB*`zC<3@`~fnrvo} zNHEd&v!Qt=|Ox4It}2|^Mu&xU46!0dzO7lh{L9yIeBQ^4$85;V!piQsG^Je$RN zlrTMlca*HpT5#dx>j_?af_eDD=a;E`6$~@2gktB8;Rni;?ekvD`b_Zl{4`}b^A>CH z^i;2&31*5Epty~LU4#z;M-xYwdOh5zj+>RBkt5amDndY288pLRuXC%^`IZ&2f|S`} z4GL|ekTpGojZQ5StmS+wE2(O41qymSWmd!uz4#kT)u|HJ9pW6#b60L z!G#N#;TVDX!ubH4KPUg>D>!-l5>B7Iit~iz%NJ!`4j@!t!kN>T*~y;8?(Yr|lr<4) zD}MOl7(1!}L`K9RDd_>U7(;wa5|ZOnkV0s_fA1mE(=)LDzIL_oJsHAlSyNdyI@?-t z;z#F8sR=^!7p=gy-3M{}LLg3EzK!!*93dnQ`Le3jcOg3JK5hjPOgNATBa|d2#3SJ9 z6>J~?Z2RV0?Av`52loApAHM$ud-oj1{(Xl`d)X0xyK^509A6VQuA--_AKirEo~~XD z_77p8Zvan*9&^CchmID_;_hNuYO`nDCRpZot(sX*37C7jx^2QtUr#Uk`JXz-vre1P1b6A@=tcF@A-kS zeOKy(V$V02u*puOh9D@I_MqC^cTg?pEVa%tJ0la>>6wIOb-c1P6|j?JMv-Bxhm8yK;EWc?FnpJ0Q5e8`r zn#=5X!Lo^I*AmAbG*=b5*Xm|PWJg(=PY5CCln^>3fkV0Axk#2~b>0Lgi;pW;4MDTJ zo}jsspsl62xEx%04Q#Y${gw~lpnJ=UyQ<$LR*Nr7{*XyHQ4n=_w-*~M7CcsW)sU1_W3Eu6au z>shY1Z~czP$$Gqi1NB8bZV4e(lL_4_ezIC0*#84=-wwx#ljm^c$VsH7u{>m#En?gYDN6-qQl?-+z!W9l*i-S^RMDC?R==4QO|4`wl-HJC5_` zFEX8>h>DCsVqy~RPzfBI+=+|Fy`*G3NKQq@{rf0jUY5y{Drh7$Yx(pN0;R3s$LCjE zP;SzoNN}ezE~~SHW`ZQ)&|E8c4wJyypxFWQ6f{$%1hW!93DmUI_6%Sqgyw2MN+8WM zXeN*wY%@)i0kaQy1mpaW3Ik%F{mUzBhoNEPVof%R}Z_o`Y+1 z=ivE@n0Ic07Um_P*4#Vzdoj;tON|R;S*KjwbiKh+40SC+#OtwM7PF2jQi%;{&8>v> zkNfK>Cvf(@n+E5&Sl3CC! 
zOEdLOe$58U)CBX8&@3yi+%Ei~`8^@#{mT)iafIb$l-GL3{30m7eQsMpv-B&=*C&>@ zPlV);Z~0vLOkO`9#q+UYvmOeFMjsDgxUU;py=;KsARkMGz4D~Eu(GhVR1*|xWPxKR zY%7luh!ys#%7wks(z}|mU#kweoE3&0YE1=UK+Ep(cYUta@l-9pS1|WVt*}(A*K|eJ zL91hLVMS|Whwk9H5-sfLn+VT!l;T>Jl~<#*q83F}4JfKL*g~KS35~-ec2Fr9 z1xS8Wh*aucb}=4i$(Omv+%ht<3b0QW;cYvycnst2R*aF zx&JXn2b_z-2nR=Fl*)t#O48_HAIs+iMuz$j6&8XuD_0s6%$f5g<}F-`{Y>Y{D}?1+ zakw0O2REZqQPr%46nbzgI1(XLltS(inB$_Onf^HZ!h7DZZZr0NcZjfj90v~kh`sv` z8z{@l{N1ht)c1TYd)P5=#i0YoaP07Le7|4EAE6H6$D=%Ou?5xn*mnE8zNRX@`Knh-K zS^gCwOE9U9!f<7Y&%fB9ncwj@51Pw(?Q;8GY3BX#_i8~iJMD74cBb5` z=xlbD9yIf_y{hML*6FHamc^O>W2db9uVmYx^Hi&ipvp8kkmoMu@nyXC5_a+>g#=~- zKxSGNveL7W$w6d#8UZ*xoqA-jd^h( zB`=F0LRcxHiVd2x2rmL00u6y8mUaBnne!}@EVH}5!&cTgLG${r*0BDr!PlGC1%wwVFSKey&6ji(TfSii{>q0+24!fKmUTmhmKi{<>kv)5wP8Wec^%y*uG;2 zu3QOV{g1&p4jO;?J2n02NQhBJ*po!Zeg04Xx2Rw!O6;wHal+es}tgwL3gK#DA^M_`+IhcPj z0aTXOJf1^?oM}rl%NRj=HaL6GEEk15!e=h8n?vOiG_{|XBg-w1BistI9gsTrhXP(p z{Y62(z}cyguq+9h3-ccGm~@_-z|694?jwX|xqm2)vUC^oyrqP1)q^tDfil$r)eC~N z!86lVCQGvbm}!wqN(q6!Bv<~{&gWVK3-u&MhG>i#+RwEXd3=&-z=;a%)hl z)fiSOGtpP9pt2Y29p5Na3@EMi(uBY!c zcfsOdv%Ru6QEV5gG{9!tB}0MYY`eEkMQkSo)z!9j1AayLac*hamxbAL*ZN|Tz%0Fa z>!4YyuKB~UfB3fdK1;LHG&D;d08T+Ofz12OU!Q|!h3@j;cs5Xa`yMcV7MfqaGkqdZ z`h&AvOSJs+w6)n;H2)r&6%+M6C4bmS7gN?Oqdsu`epXSXc4BoKENXKh`+^Vz&q zNHm_)wl1FVJ6=a&**wd=&h?S!x9bs{<$7Tgl4J>f{sfZ~L)2rudNzcYV}p1x(vOKJ zeR%e`m%!7DF=})`YtDC>8^Sj6b00a{>OBM@dsU8h&W)hzTpUyY!d-|IE6_U%SXSFCoQ}ntF&He zxAMQO?98QRrHP$-V+}j`>S|bA%j!CmRM(@hsvh~(jmWKTLVj%nN(sW{yml2;%l&4v zFgG&&3jL+{woTk`VcK1M%R-Kf_xfBiv_$rrHLG#<+<7DH4{>dkf zpEFDI(Zi>3VDC=`hue2-!_J-CdHgCYXUBB-kQN&VL|AAvVKWh-!LjU^5)c)U!cHg| zvC+wdPbw;j+eF00D#TU_zCZ9iR`LH|v17Y`=MMS_f(migPZ%5`Fb)wEA9r@+2?2AM z;P{xIhvaA0*2QC8Y|X)5g0JW=WA~0d*s^gucJDlh z1N)9)_jd>J?XJDpMTq!r*FJ)>{Fry+t99RC@uF2&vS>9H&0k3^$AY;_ERx2T76aP( zk?z~S-{O=CkiBgafw_?bqB^ahMF{Tb>@ZOFpt+~pL9++WU7Ze`+uI4<93%>wTm7K9 zg#)n`HstoEgYv?*)eKT_VfEC1PdHZD)WM2S(r=84VJZL0!VdQ>a@+8#?Dt0prir?*z@B72K1-J?mIK 
z>zGtq!+NT^Y9c_hzNwyi(5$q}r&xa82DjXnGkvl)SMi<|5uk$GavrCM0+l6sOh-Pd znHGgUtT2Dg98?-05s(E4goV-qLNkF`{;eif-C{I2nL8?^s)7W?;d(7$m9RxDk^Hp@l-P@8Cm1a241 zn~(X_^5x43%e!#>`VB-x5Q?K?Z5tI8Lup@aRy6B70h-?>#Kz%%%3VI2O!GG`^#P&$ zb;}5j_~k@3|NSgsTGPvHWq1cFYh=&cU-y)+6A(Q}BO3OS4&K zeGurNQXtH8*@QVl^jm`NJ8l)~NnxIRGI-WLkFm*LZ$I$(kJNk1AA@?q>Qp#4cg#nD zuUU8f&wV0HJUs1KgiS4&*GS@rVw<$dvd zN`tx9D8G0gTFB&A4FcYCP(bNgnh~nmpqeeJ_1HCIR1WatPc(DX%^vl>8+|-ztLqH+ z6LJW>k9#}tbf60(LtXs71ASdh=;>%gS9?7=T5HhJQj3n}TC_FR*z4ssLPJ{vfw8_B zoehMFT7FkokDlgM3u`6onJl=10TVmSsnA!o)n;98tFIteY93IXn1r>wK!1&I$rdWlcRQ2+D%zBEoYqw&-5pWOAG`Rppx+_7W7;EOL8;H&jt1_j_H}9jD;H<#?n(Wut-Hu+Vr=8&3PMB^*XL~c+wQxu) z2U*QLUn8NpfzVvfd?Gl@zgd3HDu?z~g=N*YEW26}ua1CgmSVFQ%LR#bf^vW-$drZF zz22-6C~LA|Q7#&avr%92i2uuEoyazLu3@KBT~KD0&+LcUcp(4v#Cu4Izl+4!R9pG( zUP=Zs)77D?(<1C z`8(m+>#*3tRg@5Nm@fHjnm?ytMH89S5t`MxfcM0Xp2w6GKVs*Y&i`dFJ(&dEOm@gr zVHUUiy?_wovogy~qnNNP%eh|vmJ*7UPaY_n6p~DDqBLPl-|HAbJO8672+qPtFrfrX znye+OWkDukh9Ira@^#g3HknG2MJ|h=oJkO%v@Oo$_f$a!VJck^M=0UIxL6=UfL1hx z3PMH&!K$3_QpP+l&n@NsG4G11P{nOk5%a8=TP*;?e3ap!nQc@n+Z;{C9O&vr`0WrZ zoHvhyPOVJ15Sun_WF26A`2GNX{^=ynpSgtSY+#^^gncwpDYY`l5YmpM_N-zR0+{6t{(z|loVxXVmfc}Rcv?$9i ztpDn3TVZpt7G==@l=W!gyoFdeXD${JlvgiYfUj3A$F>b$;lS>1@ypLYu^qU~IvCD& zC*Bku9*fY37(~SIbKD)o$8jLfc2h1H2{BPNA^B1IJ#$sCs2Pg4o%e{(EsMaFMG(qz zZyKmVLS!K!&`n+|VfiW}gc6!5xoOCaK+jL0s1^Up2+1Wrcs77$IS`Z<5SBHOui%j; z^bxfA9E&*k(K9z_G!W*o{46(&g6u+qty3PKvnB)DfB64ms!0CL{Eyr+Jmtx<&GIWb zcy`ifEyPohlga@ImP*LojU{1-P6 zuwe6dLi6v0X2nPS!w;PQc*DA4hxt1L?JAM{Uy)w2q zt%O|pJPM}WLJK}1UZKxJ78Y-vydv&FY8 zca?$C1Sj{3T7FJ~X-zob@#>&ik?fT;&m|aXB6!O+!>r8&X1!K6Kc|XncKCIuscA$t zRaM)Bs=8)W)-|A-kSvI5X8PI)&mEOD=w|2D!T)p-l>2$_Exh+L$4|1OSb(LAm*U*n zi%3jNK~zivg2U7iBqJvAK4Oy65SN;X*t==C%LX~UpaKtb%5Xok1X+dEh)cSU^_#X} z)0S;GK`6X)=dMix+{TVcv14}<2={+~5WBwHXRy3u$6nLct-G*k^EQI^9vnY@3fBSx z5g8dv2u?&KJEM@>vI>Rc_RR?MGYq{Qi|}A}RD|b=blLqFJ{qpO|+g$9;64iuWotI?q6EWl&_+Ujk5M)iFD^0QS~F%6g$Vr;Oxlb~3O z9b2~gV0j+g3TJb%Wa(P`eBv5z1SjKq@I3^_W+Ni80O?r`>^zzY_{oS3=b$4Z7V(i$ 
zytixEy?Y;aZr=?}^4muM*t7cw^V`-E;F~yTShjo_<}c8)!?GYRAW*Epp@Sz35K@z~ zk(8Lh{rgC|^AL9kj{#Q$`M)ieHbHlDOABGSc^aCjcKNhxSy;VcXl&*HpdI~$<^e); zf0qE7g9zTIq?N3+s+Cr}>S=F5C!t%bVaej$Tt|3jU1;JUQiDaQp5;-l8QRvDD^FFX zS(hkTlP%6Mr9~7ZS)?25c#fu8G_l^cv{Vty%hB0dg6{SLf_nkFdJ56eU4qWeYNn%^ zY3w3sX$e`XcLKjRi;;moj1FlT zTMiaF8YrGmD;4&006s8)K@P+RI1ume(|>z-|J=$~oacGc|CrCe4?P@Yb+@&mj{}O{ z&L(uVv7KtFLlfJCdbST*X4oxKk>v0zAXddb-^0+N9)s*G4bIjxMa_wM8 zNtLsXtFyL+DL9DG;uQK$zRhI?9vlmz)#1vr>|8VCBVDQg<_mgqk!xTq_>nsW!9=g+ z3C+c_H0pbOr{4ragoZ)_RG}tOQbifurb#@8pq!sZ_@)HSf-XfX(CcmwKFTD)M>b)> zPbIQaXUV05;_tHb$x3JOe_7}1%GE~JvTZA6{xzE)^ zKX(;ZFDt5!;+y&UA2+}-34Yfw(EX5$J)M?%U0ldnLFpgEt=Tu9(Gt22Rj zO8jhtN)MQGvRE#hTaU`4gJuD<$|BR^b1(UOXx1D6mNx>jxn!{pY2cyyVKK}Jzyf9u znkxj&`XBFa8kk*~HEEw^)1X=TNdVR)!D7`zN)rSH%O=6HRKT)KXszV2l@?3A)U3+h zwt~;5lFvy1U&3oEl%U#!B0k%aLbeMOwjZ6zXQ&n2WI^ymjl82 zU;pF5uBB}Xn&qA$Ycm0suaI0I-wC$BAVmivNRP=OFmVAjG5 zwg3a6(}U(e-YNu{17=S;_E$o*t*&Rl>|Mi`Zo;t#&C|ZbP9M#BKPykYd?fU~=kK0% zSzB4Ff2HK+Vd++gGQ~!IKF-esXIY+gZRU?X6Q)VfEH@fK^PivI;q~~afwDT@#_DQ= z;@Qxw-=$7={MyQjTgOqCI%V^zWJl#muZ`OpYq_mKGXc7xR=$-iU~+em+XJDeo}IJ@ z&6+viz=pet4Y#b#U4+K=Izexo=CC+SXa@bk1f1 zNqa>lT6v9TrHRr~zt$O5)S|Yk9yNsLYHq7JD5xPY*DzfQh0|2U{NX*cbKA=QXxl-U zZY;0D)idX@VD6WgH)jrh{^@7jy?Y<5Uula9pHY$V*xL~>dV z(sRm@RZxwLoJtgwHsI{V8~Ez$t@vi!x4261eVCSo^A`iKblEC`;A-sH`JI8Ypn3D= zo!GEp2exlNfFFN6Yq3zH2))sf@dV*mTY*cHa6*G35ps(Pit;I%iZO6Dcn)D2&GH3wZ16MrJozlxNS3nk*S3<4Z8h`Uz5XxY?;M=S zUBI4`0HuH(yIwozVe95izK&f>7He|TTzpMX4Y*F24@<}G@JEPEEaC->fZ2s1kAeO~g*I2!JHNM@s zoBugss{v{`-?$id#>rV66mamu{n(gP4ib{My@S)I&sn;5@7{~%<`&cuZW|j|E@fSo zHJu>c(c&U!bT$+G+S>`BT^Q=_!IQpzj1CTAOcNl7o?x7sc&sQdqj(`{F%&HfprvMq zpHOac^Z9WP%$!}GUK@nUii6R*4R z{7ow+-q+*#rv|+Ey&2DcYsd2sy?FKh3EqET-|&I$)&~yWKYqmf_aE_&?b=%n>fe6E zhj*Xw{@n*#)$7%ZSHAeEZm!CkS6bC_f`j;R)7#hDmx1OL-m*=7_m+JE+b1nxpt#!~ z%s9jLlIQ=G=l%5NgSD43CVYA$1Hdb8U7zqy*O++ji~!7Exq;M_6>wX~Hll`y&*U~UA9)#h$P*+#q#kISgFK(| zSLnGny4tDMQ z*5)8+CC_c!wphf8Q@@o*X1h4tscMO-;`8i&674jb02#JW|>v2pDh?B21R|2>K; 
z=Pn@ZW)KIq(MXDkF-?)Z5k>SX^mx`>156u5NH93`o)>I@CobM#v zLsHU1B&Vh!^+6^cFfX$RnfW;-4m{-#On@xOCnRg49O2N3ASqC!O3WQ0$1KL4pK&qU zs3JnMxoIe3j6t*90GQ8m%^<+ay}@&}n1*KeHesfv**$+N2P_+A-PN(&Dnf>!S*{p) ze9kU%2LB^5{rcU?kgV5!!0bsNt≠fLM^rYjf*9^S5k*Gw_MB48>m)wS?EY19HDeuqRi>R5< z%yKJeCNP_1!S_M4fZ4M)J7|{QF^^|m6D->zF>;gOewo}cd@83Ze9)|ctl(Io1WO6Z zvMy_|th#H1Yq?-C?Pg`>HMBIjmMnLRBauPuO|x62v4+rWEJD??F4gdOLNgmSKVN1w zMv7=){=0Htm}L?DoIGguz}bKQbD9m!?tAB_JNsPHG>kfEme1}~%wGfNcic_`^9*SA z!MOi<zsa090rb0_#r!?!nS?(AF zb6K$6tbCM zo5wd1l1~^rjY2#M=E!=M=FgKQQFi_?O11{z)0lJduQB)0f$QHVWYGL|Xc+Wal zJ3D1r1<$i1nfv9J)>*AuwbEQ8PM$o4u<$5bVpkTF$e0vD^L^Y&euN}KbMpNxq(00> zCZRbyuNpadHMaWSx4REw>yACxv1>oVBIA&ll7>C|4&lo=3$c8~8m!;232WDFz=n-m z@%{HdAvic5$w^rRzXynjNJ8*!g@cMB3`gVkO$W-_ms`d(G;801bm*-ZgaySSLgIfT zL!t@L(TItT$1lfEVeRU52F~l(t;3zT7&dI`K-96YBbRl6jjdj7%j%~ISXxv-0~t-K zQRCfQU(Lp>2;c48;d~j{Vb7hXC9&sY-n_;5ZqI(4IUk6iP(o~SG2&B;keXgw zga&iK5W;)pJ!wU<>j5DIZ$$`sWQvcy&(BE+368QjvZqfuXx_ekC+h1P2+YnM!?|Rz zd^gEAxe4t}&FF3GKu=qn`E`%>58(Om2;o=@;!NQ6$O}wTZ$}A}V{c6F$GJbjeM$=s zX`=4rbGKBtRuh!8Z!2v*cdHCetu#o0)#53dBO%$+`fR_nRlEQ4%?Mt<9VQs};ni;) zc=d0sc=^v}y!;<+c>SNOJwwp*boC&!=S*<&r{(t!b4^qJRv9HeN#qBf?9 zfZSNyOkiwBdrJ?xI|k6#{TTgSgOn`f+;;ZZAYM?c__Y-rxG7>r6=6Y>0qgZvgq^7@ zU*79@MGDdJ7EZfT6 zLvT~fa|#Hga>;iM$*v(_mc?4G9fGwfXjc0tH;q~wD99~MMgswf;tMODttM{@p3k2> zhm9M)!fMvr)vMQH(L!g1o-;?GFTcRzMf369t{pgflIC1m`Ts?aV*DstUFrUP=GpBL=>}dpEI*-u60K|uf;%2W0<_{q$o4_oKYp&$ndL+-KLX-OB+gxI=^?gvwyerCM9{F4|3fX5B3783F7Gv25 z%m&RqV3q~hL9^r!&7P&%!LM_T5MT?MJz(~UpL4m*=eEE=S;tE}C+|U^oKFy*hGwOO zzniOvwi>_*gmrFb`IZFBp1ki&JWkN;z?o1fXtwpqZM6W)Apxy!l7_z*+vtih1r2%vvBt(CoQ}R1sPQuYzauW%dEH zK{KCG1-pcq=!oH=%?Hfc3nhSlv&uhp9M9^#&W!3}73gi|U3RyR733D+B zw|X0{LBtnYN#)HX8)M2Q`mynn#nT{;-w7yfu@eCq8>Y``HaPz)2_|O(vja>)^XHEB zhh52_S)glDq=?sqX6L3c8=618d^-c0o#f;E+!t$Di)9cvd0vG<)8s$Fuv^ww+i6(- z2S5MY+xOglAT+->%d{eDXeneZ^Q`l{XGbC_RGNzhqC9iZEX%U3(8shYZ5}YYNrdu| zmZjUte5#oSxBWmk|G>0;Ak1^i?+xzxY`mDqdPWwyO|Y)cN6(SJ%hEH+dsJ-OzRorl 
z1dgHzh0{xSy2fbszK#)Ks_+O-Jy=$pRTslh)>Zi%I;)LT!v@*24%ZV5O)O}FWl6tj zWu!_Kc2+7?G;{oCqh86;qhi1sD63&FmX)W{iTl+ov|3WPURLWWCkMe*1nX)-ueKGe z7{$(l$qr8NEFd;0mcOG}huHBIxtO$a|B&BodxHz-)zM7Qsnx;_4)SG1ZuF_Sgpf=K zYv(muO_hY_YP1oM4V+6kfFMLS5}GA}vKE2RtI$Tue07;;ZPhhsudBCNyQhBo2@B@V zVP`bo!XhOlr()T17Z&c@UHfq9@^!?^0rEk!}~d)hnkk59kh zfBxrxupN4h=TFD*g0MU}O4y^`jIz#-5D-V+QSb3?lp1@BcjNplO%Nmr&yy3Acr{LN zo}dVgvg*EMd!w!NnwliEYFXO10#UXv+S)$bH@9*Yf&9&DexAS^g7f63UcCNCCtm*t z^`D)X{P%vm`EvyCJ~3S%UgPDjye9RM@cevo0x#I!y<$5jy=0sBl3@Ju)y?N%UqPVu;>W)o zXC97m;4${pRz-A?eDoHMP_Ey3+$8hr#n=lxdpdz<9GFZzo5VOZHu4&yPhMg|-;F#c zaF3y{waea^w}w6k64WINHV2tKV%7$AUaE<&pY{fbA;I1xO6LtgzvMif*neCb_`Ai9zZ52C$mFlIw*953*SFYNd zmK7^kVEx7oHZl0%!2`HR_>GPZM`rp1!bU#Q?k2I$2jc4aGq`Z_C$?vYah#z1%TGse z{?u{YARLF^ypFiAV8n-oAR!_Q$uUuQn3RCbyD7+ia1Yt{@8i+^2grVyj@%bk9tetfmTO=KNh zE{g*f`S*H0zG```oDvH51KWAm;ua8n+?!x@K|J%^c03s?imi6 z39|;x0%lJ>a8_AW8MH-Ncr4R21y5tM!Ik z{30teS%}9e(uVSj(5%MP*NFPySH8>+njMfCI13Q<%6}Fx|DQm!G}Fqg22>5@XO3mV zI~$t4g)*kVnT@EdxN={R#rZcvt+_J1RHN&i-#e#(mJO;8n7suvew}<_0IDUEwaT0P zm#2XFouK(0q4@*#5&w8Y5ukO9gH}c3VA=?&<}1zob5?VviN|TGZ_hjc0MZOeL_t(p zU2U~$+$W!_g`23zduB(VP;PH0#;Ipc?06=hYn}5^OgtUIv!P*(4-Vtm;A4yp3}Ljt zAESM}c-q@zkp+eo=e0vV%8h8LuR{}Iu#pXYLk(e<5@c%StZKrd`R@AQ_;Z>8%p7gW zcUOf;&|0g)MTndh!I{8VOHirhwvI4a&+k0xvy+2nbH8B2rxBx@4J(%j2h9e+2Fx5u zN****`raJ!Z3O6mS&%HCt|+dg%2`2Vy;P+sK$R!CTgWd_lZ9+HutGJtAoj8tc~J#4 zDM|j(8f9vxyoKN5`DzQxP*+rjhN5yba@)l38mT7jGfhnf)dXhUx7|Bc?%NXP8_H^U zOtty&dM+IW=@~e*XAhPxlwa-_*tc&#hK5El{A3&%nfcgHs9U~b9oDYfgnb7NA>i6A z#3!WS?!9z8%*aMMJK8KZK)JbP>@X^D?|wG+e}4=M7A?b;?cd_mxeK@*7KOFzH)HPH zMcA@!mo4ur*o=sbLwb4vQd4u0no@w+m<)u4-bHALTU;O_IFTJ8fjKA^!GvNDnu7v8 zpXNwP&>W7?8&`g{Q2>B{6GKq|Bc^2ej-E? 
zX4wWzj!zPrC-G+N9o~+9ASnNe_hY}|!?WM;c8uUW?&&R$eS_CL{`GTNp0y=36C~vp z@tWI70;byZH*9;}3XnNyf6GDpTY)pT@7WGXA16Put^C9`Ro_1&7!Tp~AANZB@4a~W z&tAOx$78(tjqT)nUi1BXOnjQei(lX3`Nvn7ctc2Ln<;CvEZh#73CjfM=iI(zJNuex znB-QAT-iJewvC?%_P=V&c3;cdYxPZoXT3esR&C~#Chbj|wt{4XX8|+s*OtbA@z%gu z7Uwr(;}{1+&+CWfsv;#FiKb+ zdCKnv&D@R+vMzQF@LYo!>wk=qUWG%XyyeG-`mHS<>mO$OJBTMz-w=lT22D@3@}vIO z_ZUyR2Qbvpi{7>_bP>e6T079*+=j;bMzip0j)j&m)009K|22t z!HFOxP;-_$SvRu@C2G%{MNe^n<*O=Jjsk_KBPa=)6&g*}G{J!Zb4jU1_^=h(@(J)f zhtBO;B{fk|mefZXnQV(Pk)Dx(M+CHDLWZs0#`ZJkQ3i4{(wSyK2;oX%dul$U1v$Qb zSunHhxJUON;?aYL$WBwtX$}PV|3aQq7J9SH3hoF`MLEUh&nxAsedB%P@}BYxEcsb* z$!$L2)kI(}R6Oj$3R6*Gm4oKoYQk+5L9D{8`vSNULUU0TubV9qoO!QyU4onQ7xyG% zj@#Go#oTw1G~}v5@q2->xjFE;C|0(lh#L-+dEO}zR!w{kf>=Vc!UxKAM*|=SuLOTe zv6B_UJB#O0OmF3RR@NirWGm_lLD>LT1_cMr1%hT>ljpD~7`%>6C?znLPzKJrKZE9( z;Owl=?tGHM*_lr>f!Uy$&&5Hr^G}w)vS(=)kShLhF$YbuI1600SWqdS$22rc{7nN8 zKC{nSn*G4pgJv01T%2WoM_G6}mR~Jc;z6@mK{9q=xi(9&8QwGzmK1b)>ngO z0kdA&3G%$qXtSYN@^{Jbhh~4!oh|?WeD3#Y{q;PbgJ#KFZO)V85?gxBdaHZzNfrs6 z!zV4GVWIM5`FO6yH>SZ^F@$aL4d#U`&Hs2OYcrvk*SF=MU%a&ClRr+dk!IsMIYOv? 
z@)R!~4|Dqj&xfD770u*x$VPR9fcunyt5w3D^s_DMNLX1!{3z+6SSRfF^@r-GGD)@DzN&FbQ_Qf$oi zs#TJ;MB$^VD9yaLn6H|J&vVEUtaR2C3Y<$(Pk659`RkbODKYJJ?1ZNBM)$`-g^gON z`Vzu)Ni`a|HMr(+vRd7`e8oaNEnT$0z;Mr={b+Az=RWcVV}z!*wm}OMw`tQ3tXL^{ z-iYsZ@3YB$*Kgc%{?3Hoo43Mn`0xp=AYgy-<$TPazYuHIugAr!SMl>Nrw!r`9r+nI z0)w$|@iHu0yaLBhoI_GlCKBQwS*WP^*en8bDnf#j44fkb&fJFHjwcinn6;d)wo?{n zMbQZ3HVmNzN5S&#fKY^7Cq!$q;EfOi=Tl7U#`PPqZp~M=n8N;jdy&ZwI58m!_fpgF z@O~!J9%SMnJJXExhtNt|S?ttyY~SGnWe3dCy!rDDmVY^Q1|eY)h>T4@RO}rjr9MQ` z-896d+(RJ0U%y3@@xH{|`SUDP*0P1mF@Mftg8OD{-+2UEb{xXWHJdPZ!7^<4dIwIQ z4nW|IaEpWy6P1F5*n5bLOhQap0-~u%LUV*q;bC#iA1%%x7Zz=qCqWSe=5X@|KlFoJ zVC5TLw^DCp>gp`^ww8IV=T^|H(1IP!?Q9c<3A(TFV&Xa8sy*QV{KMoY0wMK+5IgZ3 z-cS6F_v63g-Lqe*5ByBfop4*N)G26wHU65=%>5UHQ?_-Ie2U+2KrUJIoj3ds2l{Sb zuWTw8Aa1KabD94vgIwEf6I!w0sLwnYw2@=vkVlEdpa$~_)teT z2HJbrc6XwKgCR}slx(7BQ?pH6eB7yUihYD{=d=Fo@e>RW^rNq<3;kM?58HJjN*djN$G+^tQK|Y}wzQUX!<)=EM-w-PJ=FlM4dDNuXz7=PacP zyQNp$rKJQC4q6M?>F4DM)MUlXA&}{nKVgb(qu``SmQuo7K7l8fz>>?)Sv+@UCP6ME zix8aUoA4+AlSNfl&uoIQ067cU=>$E+Cnmt<33$2H%63KhrO?W7iear+@CpSc|6p5U zIj@uh<`SxmKwm;oEcU60DnJhJC6{m|Y4HyMa{>vC4MVA(0#2hFoB%?8i>U9!OxpO-y*o>Ra)1;_WKWkkTNXEzPa2F_;9*C2-HqGToZe3L7fCz1!w8sHVnwZTL{=65rJS<>J`Wl@7V z2gjZt{uD5CAYnkw@2ASF%Am@kfSJ-c{M@@Wv&{PmFBM6 z)zabz&66H9zjosHwm^o@MZ-b!ObGt(!0>tWwkm zf-xUnBLP<-&06a!(NbT@PL@(!R)Vy`wY4|!+_TllhuWl&RGLX%%kvUeSqQ3mKE){3 zzVkWe`K$E$VWxY=Y)GChEqb5`6AqR=eFmCM9vnMR_GF*S_S>MruknVxkG0ck&3$ zIfTM{1i~bQ5tt*xlgtev^tM*(QmDBIO3^d8B_IX`I$%~9s-Wv32)+^O1LZJ;-;O{; za3mtaq70hlp0RqRLP;&c`t|E@`*si_BV%m&Tg869ab2IIapz7V?%qwoce}nd@LaG! 
zt{7in*|Oz0_Tw=G-nxnCm^j46-$87AGUD#sMasQ2JV?*M!;E~~OUuK9%zQ+|C1K;1 zZJ0ZEKJ#QQ7BOE|FJFhPTfW0r8@BP>i_LfW*pH_XD8RoRWtQVR3CxQ`!gqA4O={HS zMupXj2%8PfF$m^=z>vO2$cV{ z&f-iceZ_m={v`MHCQ(oK}w#>33P2`Q1m#Yr`ck-n=IV&H``t2j;)bK19}K z+n3lkQ11!DZwSV3#>Z@q$EVjag7CWU*q<@IPTYUXyy3mSWj@F%|AOE?@$5M!#(1A& zO#2w`gWK2RFL|z)cqdJ~H206U&&CPk&urQDNghAR<6e!8+eFRh3fD;>$SBV- zX5l>LI$~+$y~z#n_46@X8vf;^1M%YuBVtB9mAVvydL`?UH28Q z`HJ^7$^PlR=CLr}WwqBNaO?ZnPs-w)=Zim_Pgrx-TLKfo!$gS4%Ol(ph-6)sJaCpT ztN=w;Lv00ewrU|IHwY<*&?U*eq%g<1ESRf-!5Dud2 z;<>YV4Q>U)&Y~@AZH5zJ)L@@rXx4ba8l|v)iY+X_^9B$uT7>{x_vy1FXG#v1od~f$ zU^f4AA29z_v*6kCPHrgEgP0tq%Tq34)?#i;2E1}zQQ93a8#F(1_aYE>{>syqUG00| z>|IC8XWJl2TRSKJOZeA$+_@BGBiGz2JZPS_HWOGWi?v*kZ}4m}j(uQU&b*Tp3bBlX zj%oj9A4vOtmpo`T@xPYmdJf8WmRsj8GcAK>O4epUvpB(^SuPeHIGd%pn0>d7VSmm( zT`}6_o>5y~WQ(CG1g0&!U!r5k2JG|AU*0#_Uwowa@y(SL7GhSRJN11lf!ZRCk-7c` znjLhymE;7Wf@WEs*?9S2_P;BCVD^XRzkV#A7r`UXHANe%Y`k$ zOepsJ=_EBiYNVbG_2Wrz7X~}qFwoIztCjV(wxXxC4c$%6=xS`R1sS@jUS`z)zqkKv zj_kPh^+9^?DT)&1DTz*oGL$HZ@)9LVBt`GKn{02;@E(P7Kve++4>ZsK8V%Dx!?z)+ zbMBctcYMTsF*p7ZGaq1LzRQ~Dw{mAyq1n_q6B98nev!L&ZIjz&KDlz`%1KR!OT{;m z2UdaRiNdfAXG&I1muxr-DCYp_{1E?*@PLM_%r+WyqCskdeE_xw2o+R$vX0{nI9MY< zT4B?C#;m-cIRhwXfM6h+<-r2brDzpJMXD&dh7}GDkahvgnlVh!TyzfmK9g30HB=X9T3?Lki8b1qoNf#`Y-EuUqt|OiahmdkX|-GCEKW5 zM03g4D$Ds43Q11+>aq+e5Y71_G<$qyMOOg0GhNZroV8x^*VDzvr`E5xHmeFPWqtU3 zK>CRJFjESZ!$&Vy4k^Qa89t&2&1tOzoV8A)=W|Lqz7%Ez(_q6mcre~a;Gb&?&>Sqy zMZmWtc;;RRniURAy#_D+;SYBE%{STYfb-n|@69*eWk3AE^MG=nt*-6@H=o$n*2h5f zmZfQ&z5VV1`}J@CWZ!-IhxV1PJ#Jrm>>Kvzm%m|;eeK)!^!I*jFZ||@_V)W9*o%LA z8DRdFJ@m+9_RJ4{X21W#-|XpUo^@zeJma7K^fz|+a1$VS%&pA(-fwhE^K1V&U~d7C zZ@+uc-UJfgcw?W#p}+QD!^taw-cieuPJ@%Ey?6JqbZVx{2MZ53bN9->0bKkuWvhFJY z^4+Fc{d^Y0#4Y3QyYIGVzV|(@;ZOG3>u=ei!^iA&JrxTDx7=77n>wwX%2*X`XHUPS zbe}#rV0YZj{p8%5R`>z=h~IXX-Fn*{_RViUVSoDb-|fve_xW{w@ctnW8@K<1qwbm^ zfR4V)vOd54InXS<_K$a6p;7UUJ^4hK%2@6hX}+O6-&(10JIB& zwE%hBz@$P{YV5?PJD2UBm#@0}$H$j0J3zZ@2Ee#`>5^T#%sL?WGXGyf+n2U&{Swf8 
zb-`BtdDfQy+gV%sk2zcV_XS)24?|5)A;;0cfc$WJ0P^ZOI;xs%}|7(Zg@%*!>BAwvcY0jEPJfLnoN%~0ynddhW< zFZVVV!SSSH;BYBzyeS*|tsa)WcPC--2JYu_gSx^_kP@L2lPI~o@xgN+7cYtDcxyVQfq z+ZM-P=Y9dy8?ufA-CIES_651>aV$Vt^_E=rb~%UN7u_Syqx<%6yF1iX16FtFi2DW0 z3M@DZ6V3okNb!7w1kF`&);0nab%*=FE1*Mw*a`@0B9<2UT?2Zp4#0}!AS-b%+ob`o zE^BK5s8KWDV+(3+V7mqlb_{9;np>KC0Nb>60L>j;NWk1FkWO1`YY&jz1)z74c6-zd zsaXRifmr!0YlgfQK+7lmAQhZDq$63fquR-5o7R!nHu9=e>w;CefqPO9I7$E_aM-E_ zhTxG#f*Kie(~#9u{^0@E5@>FgMOPMhwv{EBd-I<|vnr0DKLory3&lv5aViZk%laJtBjtTa^UNw>?7&jTWgEo@H(J`dyGLj&YU|6OeYLsHX+as>cP*VPd0nPvV}@9mX=r z5UDXt3gfA8o|;rmldH{)4w+a`?p^1n#%*PG+Sa@$ND+wU?A*+>O^uE?Jm0Vlv<7Gv zv~Kf(2XKzi9Kc!PziXiOKU0L_|J?8SUqQ1#S=QkI%#w#!qcHiDV`|{u$Li8k;kTlJ zHRs$ye36S)7v06BbG9@;ZFAG(Ha(%(rvUG8n2vc2*bFM#_<*drA!39EYiXUJIcU(v z3IiTvRu<(kfO)DkZWDcDHj){34QHtc^JDx!X2W^Fxj%rjEXrfB7h}$)`SL9p^wV8SW#J$&!pTlihUek-NCYM1z?sm zEu^4XT@V`hrg+1G=FeD|6JU?bjuq;;#`6#1i_!1;E&|Ne*V_1E8V|K@-G^MBax?!VaJ_uNPNVY}Xbi?>=F7+;gwpbPMOa>rT7puDk7vk9^U7`@7%SOMida;a~8( zf8P;+`GoCL3~b<8VEpDA0^|JwnBP>4>fp~Th5rLI3zlVJ7A)(xqHBEP>%mH{=o!6y zLz<*RKBVgL>C4b~&GK#LC>_vv|C`{$xv_;kfqudLg}E0=7Yeb<2I zHK6(8&bn#PpCE4s$<1wBz=tkgTn3s~fa(?7K-++xVE8KM`XtfEl-mx-4#WZcD$ooF zUgj8*jw=9;kSI3~8AYTJDpborGyhe_f+W&9)f2l(WT;)aL(&ne@L>N7wdm{)jLqt!oI^PU8WJgP=KFqhi3KE0ko|GPWA!Fd^%`gFIt$p z+ca(@6QEg8>(Ja9pt+u97x}0cJas=#otLPU} z6U)uOUUbE1YwrnDN~3nl^frJ?uj6#DR?Vnu*Z4EVN88OC(S9*ZrR>e_GzXe(ZD(6 zs?qM_yA-lkp>hS#lOuyRHvufeh|9B+wsvlY^7EXnFQ0d4UR#1AVM~ST+@Ya~nL|M@h`fYTek5+;UP$VF%|y zS(kwAVt>&JfODb06~6SqY>A zOh?xZheUxR`)Oab0P}Lk0Gji?b^2UB zK#m`vSyK#m`)q=>E-bnaLBTDh>b9bF@(tH^JX@ej0}J-Q^R7Mi$fFLK3OlDr8uH7P zONJ~hmoNV}`#=7l{+IpxfBa9oeD(iq=U1-S(Ad1?fT~n>$U0L4*4D}UQ=YODbr-?8&E|wFe)16mWgOUU=a}JAS;?4t~^R2S04F4-Yii z8*dy0{NAnZ!nSH5MBeBm*>{~qA`)_d*N z8}B5aikG|>ZtuVEL3`{=U$!6r=*RZUUp#NW`Sq{t&wu*8z4qFxcI?L<| z>O_kjKYH2@>_6nQ=xP6(Z@vlLY&QYV_uO-jJ^0{5cK6-)IE*W5M$j$x@WWrWKmYk< z>!C3=eP)izzr~*Z-jjCAZQMIewRUU}so_V>TPVz0jXI`I1e zpnRwjUs)3)yz@4o{N_G;^UV+3y8Pzr2i)rX9*u}7fwQ)e^?}|?e__EER~Br= 
zE0lA#NxHEFpv$%3!g;&6w&cLPz7Zm6M3({$0F)G4J^=HY!{DVY{v-W_^891gU1h&3 z8!JfTO@Q*%CEHv9M3)!cT~3zQV7cXZfTs5(%CdU_a0fKES7o7PJI?EasW%YE-$o9x z4$Lyd2ws8KWMHAJ%BmDEaIOoSXKgvSSZuB&KG7?{v;@#@0L)6)wO+u?IoHVN8u#ZS zsi0XQ_O^h1oztz}z_1Szr_OS#H5G4lJkU0IqMmd^mFnJbU2-$gwYi0v{3|~7zg^m{ z=szy++J9V9B&AK??!xSxzZX3Kjr?$BNp1nAnwuLbO9aXQ7ZB*vKeq?#a_}wgvM!|n zM-SzwCYq2huLhvCxs_EgDu@GS6t|diNfAHPVnD0CUJkhi}9G4$RRq9n?|ZWSx|OI#w6?mQ`9HTyevY$a|E>=%z8@;$8{HKLgDg zCrEOoa2k02ib-eH;gT81#W zVu%yuhb-4q&B*KSrx}*hS=c6okk#Z^Lz+XvXCl_5Yk8kkX?A?b&P|Qm;_Mk)oSSwS zc9*C`>&tVtzBp?a7tY$bY1j_-8!1wrO2GB7jH-hJHZwkMH|zqYvh->exd5&S+_GLD zvd3voUQi?6viugEv%1tT9&;QCE{^FMb;V{Ln?v3>AZ$CvNsoS3a z`A__}TssuYS<7Gl+T%0|j@Vmo0JwjD-QIrlJ$vt+1NPy*Blf|2hwK9l_(Si!CEw)` z5#wFWZs-v*4g$)DeOa-X<-Vcq-g9n$IcLk7oMmCb{d!k{?6oz(d;L6MxCF#5Q`Q5_fb^B^i_`@d-BK)z^$L}lrtUw> zG3Tbnd@BFNvs8XFQ|@B7!1B_$SzDnZT;=%d=NAE9z*pAq)kRx#pW#J7cR5J^p^J-4 zc7Z%;px(kf=a`+bIn^a-aej{DkzaIyV@jG-=G=^8IZtw3<3XJN9Qimud)6nASw5%9 zWYncQOXaMIW@cIgo{r1s%-#L2jy@l=kHVsG`9oHVelU*pe0MR z!m}yjgpTR{zoe}k#|3PLWdXDD8LYAGKynArthmow*XH0UNY%CJI(5AQNXmVb0vHwj zL806NplUzl*Q(|Ks!q7_>b+{xEHP56YU>-lpEMjAj~%8!9&z3 zFF=(+(m=`)vK!K*g-={Jq>C+sn?;kO)2MR zS!!pwoptTNx1d{-x5$mr!P#Lq^wtV?^VFAF&e6;LRp>*Hw9x_B%B@j7w;KE71~=+N z>N>%!!rA&1)t&6m|1#3)zdqLKe>dK-E=(|{(6?|2aO{KgRK}EtY>M|^Gn1-uzi{@f zEzN82tR{NWn2?#|8$84J%(sQs&QH_Wo~6MH2&45CpqNJg#@cz?ytqi2vjS(XIaexd z@TcgX9+~Z;ZSh@Rn9~4Y%9p80FLP$bhiz(Tz|M@6fo8R~_ds(MY{S4mEd$9FRiXGl z188-D1!yLfUzzV8jvb&`lXwIn-n{Zt4cnuS}M-DN}8X1Qp{w>hyS=Z63v zNz);ZQu##*s0GdZr*2kRoCV7+;MrkW6`}?L%AYysq1`k|$bbM^6L|Dz0BuC-6M7)Q zvMkOXEu+8BE!3rcz_gE-P$AR!Etd?pY-Ol4GyT?=?oTYef>kMWXXu611Reuaq?*#$ zr!|(3G79jkWgu56;9Stl&P!bY6#Kh7*#@P6#$LcVEqIp2mHjBFHLKsX(Ci`Udh-s= ziY`%wW=YFQHw*h#2LVRaLbF^mfRY57r2x%6$&5`go~8=E(Ct$T&CzW`vj+tWa|C8d zT|mJ#BS5nxR}Kn>1e_&>s}nc}A7`#Tf#yAM?7+-Dt(6)k$$c(y9~7o;kmscJM?XAZ zKX~d1yY;4L=_S^5a`yYJ7zVM~5pvUYBU;Z-te#7p2@C$aAW(U0ILA&FQdpW*fResfX+P$oQ z;NJV~`%gY$FaG}b_U0?E+XrubVEX}0MUwacu-x~~A)xc99Y0cU^(UziPgC$xIW#9$ 
zqQ-^})M=+rx7q2sW~)a{0JJLECWWx7YhZnw)t&6LV@F!;(7^_K>gh0m?xvf$)|+m& z+ky7q{^3QxKE;&&?i0`0cb@v5{ph*p?e{PK#eVRkpV$w7_~Xj7)tdVE?z`@=+itzZ zZe>|e{SAOwe%Eim@qrIqeE*#f?cF!`*}?tCe6Zhp0^>Ke?xT>pFSA0*@!O+jd}MFE zb-Of*a$*1&Ww}IILAjv8!vbcdK>?P1d;%4UW|%B% zfH5#$&eGV==(v59J$cF^McWAtW;LAitj{INv#yUaM`cf0ZXX(N(Vs|TE(GjeE)^+{ zXmI;@1is-6->)X;ku@7oA0}@@vV3dyLk&WvO!5Xi_gyPvbmKIg{mkqs%F#DZ_Gn zYk5r#PT9zI3b`3Tvt&bq+#4Aid|XKAv6CyCi{*hnu1OO!snV9^NSLaTwsfs+XWZLt0j|XjG6vL4e^_G|E+#7k!%4cmS=fetdvaPz=z^S6NXrJaR?@za;@MWn>rdtVkPer-Q3V zXA2${4)bE|VgVezTynE#gY{mOJ zPUC${vCH+X@tuyrS!1I#o~2>F75NFXPP1@o;;;eUefbT`e|VhrTCUYpA~h7xPK?{i zxpTm7=;hg7TdNvR;Y0P#X#R?;+ffQ@DOac!yR<0~R!`G{?W``??#1)AO}eql@g#uR zp?Pi5worB4#Hv*yvB8?viSxvnVow=r*I>=CJ5ezarTkqa^9eM2x19U6g`oBxYw1QgbUaKrCo3%aTlmSEd4Vmkig4 zrvhg8OYGx+_EQ(DEHwoxni4IjK>%Fq2a!O!NTpenugUm1uq}4 z#|+g6%LkfnZnhjBVLz}~?9r6RK>}wfxO@a3BU!hqkPfh`u&?~qatv4mdl2qS&Gmctj<7l z(sGW`G$J_1Fm(S~RK99NsA17fg`O+gw(EDaBQ81b)$}2CF}Q(oP&Y zYDYghYDW&8u)~K=+L6Pj?bL}T{s*S(S`f!#zh)|`({-mErca${wWEic?2XqyurE9k zrpLZnE)}Fd_|dcW`Wx@quV47RJ^rmH?Hk|tj(r8--+%CkW%*v3d5-0wF$(-{`_rFa za@UT#?zqDvblh^2Ld|^>2>t*a@KAYz-u>?%wj&zwM=Ago7%M`C^xj(s?X5TV+k5YR zXz#!Gk^3!2A7+J|d+%*+8>S0a%xZzNEX}eq3(Vhn^L_j7x1S;pvdrIX&;8^%w{V>} zdfYxb@S*K{_dWaYy$`%MAwu&A-}n%qTu$@-1G)~pSy_sK#6JF0c>rLB*)HW;6)e^6 zVx_q00FuvEnLu-i?|L9Zd6K3~aG(v)EK9RoHZ*l|p2{wV@~rCfOfS;gIm7*H>f?5a49PdB>d!t^~arKBH zty5Ucz6{@drkgzRP4;$K#{Cb&NcDcs(JwbKuCF>SgZxO8>*{$EKo4l(G{-4q0O&0H z0L}%CcxO5E+5wxIRvW;U{CsNQEBq$^hq(0`m%w_yTZQRVq#=)Mul%ZUqWh=@2Ir2F zVMAk2G?o9L#-I4q{@lwV_n$_Ge9VW7L;f7q_$mAS)wAMW$s(MQRhZ-QEHvw&dIMAr z3}k2+c)6`}#OE$U3VC85#f$Az8A+K}N>kobUwA`Gkk2`^y~@uRmP-TsG>J(Mu$$`9 zSP+$?p-jzn3yO1=?UJ>*CbTE(Vyt|3e`6mFx>k9stji>9}jiAe+On|PQy8vpj zdtC$Ax@C|cuO1qda;>)~YiX3>TC<;{3}s=uM+)1f$xko&ib?^_-5N;PrpZQV475|` z^IPib=p|2z%mJ9|Kk72YR+cZUEapDw4ak-2MEhx3cLmHr!B?1TV!H^TE`AGMqf10| zrI0_gtl089b|30SuDMab+X?7$FZkWlz_Cylu-skGar`%63b1!@4?7x81HFK=EWVVp z0o-*D1;^~Cdn`*b+jjx?Aw-?5(E*eN>dJGx?*Wume}%fNyIl>IR?BMeKi_F4m$6>H 
z&s46*vZPYJk9UD@A&@=omOYocmFHDUmR^e%zpPsboQ5u@CjI)+Q z^MpgQ#ubDK;?_SSaiRXxU|VThE>Ua1^7=BMybLriy0pA|amoMNy13{T=FL^0IZ`IEj zA?>ms|8Hp!Lt-&r@Rze9(Q0?*C{O_ z3PTOzk!!{XseBYQSVxuMkSs^u-3lz{1ktQh1*xu23R4xtc?x>H#91nZth!ERU0|Dj zw#}wXKwlp% zsMS@3LN`yFwQHT?Y-%P7M@{(46iL>rx6$SApgPpxNHR8To`L4$oBJ zvbb~2p%KA7s5BOmhJ#ys02oCUQQA$5oM+$HO-0JPpaTD|e|pKj1vK7!?}K*7?TQF- zyMyz?4?bv5e*0VY^y81)li&KLeeb*9u^&J4w7u}$bFBNeJ^HXF6uHChyXS6u^ow7x zr=I+-z3{7F*z2#oW*_c9V8@Rh1$^tRiTA5PlMhgMH`LeJp@aK?=MU`I(U0ut(ft7M z2X^ELB0Y2veYB5d{yPGM9zJ3xjvsYDn#eI7cA;u_`<{X8hL#2#g_olXY4zVKW*Ro=94}c zacFqL+IYULJl}5aV=iB`ehTu#M~>Uqzy1yPr+)N{UvVGh{re8P8-?614(vN_2R}S% zAG~+ie^{Q7_t{(jc*`4%f`+VE0C3nf@!w%nDOi{Xiamhv^TnS?6glsU(Aw6S-L4Z3#1;R zGpqX~_y#)r1F)BMY~_h#76k0b^TF|5{1)I!%A?L7@&Gg|4~hCIlO0Y4$_X3`_LZvf zP-$35{8m`d;X)GW$Sue+*64rg4mEuGN-WbD%JDq)EM%eg#!ZymXShbvBF|qcq}5Og z&pq}oXj#^Mp9Y)Mqx10o54eGaeK@C7l(;wAE*Fgq+)p1TLiw8ZW!9q*-&SQOQ0(p* zY0AAYJ|spXlJr|{8!3QPGp=d&HI=0v77Tg$>TYh7yE05f0jxbOGDzrL)#G_+P=Gj# zg8e~mx&0?9rb{! zEX;DxsKB!<&H+j#0d!OVYq@f?c-Ko&GdaBe@#cD9_>?_kxI!J>t2V}0m@ z-{or9#dD79sI%nT>~3Ow$N2(fNtthGsAQ$Vg7xvt9$8a!4vNp$MXHOVVgK6dKANh1wGk|Vze7Si6 z(Gi9Py83Tx9dJh5fV3c63ec=*I|&#kgOcO#T3y)$tlf2N4QS?~G)rEDWTmnWhvfjx zmo~X*GNy1jm2f)jql*hx<-NchBnz^jIUYMev&TyI!43jo!LXzo%y9(EN~HvpBOph3 z*0N$Lhj_}Xwz;%ik>FXeG_|~b{=CnecLr#lP+TT z06RkSINOgGM_n5IK3p6Io&jVEFtQnJ98tbh7)JZ!@ zWqJ(QICkWe)tzj%hPrmr;Olwp&`G!89X@ab9d;c*bes? 
zAnWS4Ob3-C5ZzBBs=c|#Ew@L3(0%V8vTr~BjD7vFuh_4C@*{fz{qE<_*j(B0KF4aa^dQa5Z$)m?vf6R^_0eS)8<3|sZ z9t4a5>SKrO#EHXJUw_0-)qRNeSzY~pJ9QfI`{c<(1q(SH5w zUpO@X;F%xUcfa*Ld-6L!w4eU?1sa-zmf{{Ywoqa7ezmA#20Zq2{71j^RlD)VTkQV( z9=5mMe4l(Z0KEt8gZGaD`KP&O$M}83-g@Ihd-L`EfbKr`O@7;>WRSiUpjm?hr4K(m z$}#Hg7tjC3zWUfVf#bK_4MTt|3-cSmxByyi8gl=5>BU#<{(EVp+$@*;9vwfPN4-xjj_;EtQ5g;ai$heN0#L*o`qXIVIx=kEE4ZWb*7wraE4b(Wp?=EM0%cOsqY!B<2=FWyki4w6RmgV`C(lwr z`6oX`VOSJM(g=Pn5T?^OT&oK%TpdT%}l3TU~&VaP;mF~AUR9L4$=0p9IpGxI& zp>=t=tPJo!<)@4#NY>!8e9n1a%iQZS&p1Fd&rOX3o>P(Ql>D7wTlEggx*w+ESFfPn zBi;j)6BP3OCK^B`QC4!#dnu#b-?>wU1`VPH zKn19jPZ_|~t-+)j$|hNQ!*9-u;s9>B$f=xF#0>}VPEyqCR&JHWf^eQKjl?Ybhw@K7 z7QBn8Zz%4X5onKL$hX41ySMr;&~ zjgQ&TaM=doo8iG>8!Hdn^ysk7PmI_(Ab1|lvCTZkKkt+J%-Dt5`7lmkX~}k1R_tTo zRn}MyLj73d)wTiMO@VA;4Hn!coTh=myQ=`R2J5aYSJTxH8ZrX&r#o^H4sy?f1e`xl zfh7YlUjk@dz_CxCuc$My>MrN@^Rv7IP;ajXSat(261;n)9S38a21q&Acm(}C-7xsB=fgZIwzOAhVSQa92G6jK(nmEI(ZE=E96hr zPg(!VKRKcMi~PRG14wi&D0_IJ8erD409he=^g9Bx9`413^R{t*$=2o zKAs_tBOg2d=ffEm&<_H=g%KNOzfsb0g|RB~pTn;#9!0lE4Xg0ZD=$ENKY8pAgMU1_ zN3@D4uN2sFALygNPzbl4E?S;F?G&8qx&fY3>b~iuLIR-EomAjKDo`}FwZpP^=Su3{ z10GrKqtFS3dfu-)SYWHx=K#fmXSI|SF{3K^J_m3Pt>_rgJcx57a&V@wi-0V33z|_C zq-!k8@?TEiH|qM}%(gTbe9BlaSg#SskFyg+1>jG*LsbEEQxSPrC{XOcoItbu2s^1v z0OtPgg5^5$R091}4DtsnQ=yLm3FB5SOOy|Mp8dkejD;3L}yz#jtC578Lh|Ne3N$1CsK!w)Ic+)W-i$TJ!gjj-+|Hu5Z`W1EnU_mf5x_MJ9etxzW(@k+5cv{<>ou>m%sR} z9X(QK2lkU6fcfBur|tcBkC~5UVHp&-?5J7F1~^A zEh}n_x~A2D%ISTe($4`E8fe?kwe< z7yRVaWe@Q3UBzkJy`ij%TyRca1Y^OHsr7=u;Km`}XaQ&d=HPZ9tE~Vzo9Er7g3lIE zKi9(ld3E;!r0ONg0lYbW=YUV8J_9NF&^Im6&7C$2S4^ zW`Nc-*X!jW@R6rX%Jr2j-#-Yj50WQI)>U9c`DQ;&LLfJc3Iw|Z$N`!i>>ZkAab{b$ zo@dxUNbVSN8IhvDcvNLdT_g*0Tpp37nR_wR-)rSQQn_B}e}`tuTVQ!m?l##l09@;X z|G6yVh-a(m^xd_FWq;mC??F+$07&p0t4l;eGUG_--K{{G;n zu7Rh1{{_rmUdn>s#kv5_YMk`4K1CUtqFhP=dl}x9Ebm5^_eIk00MGi(b09zcNcoVD ze#nbpSUr9M=TMHyhqgL#RJTK}%IZAjdS1o@HHiEjMyw0sStZMIZ;LK&nm|fDcs{Eu z<+Ebx#|fvx+yOjOj-$FtZZ0nJILQ0qlZEM?%Qc2)HNw5|u75yuvRtz1kwH5czqSI 
zUohs%!pw0uB^h=Vc}LJ2@)q)-W5|tzd#Zb#Sef0@T$2w6X0A^ba0g}w=^(dO3$kTx z-jRC-Ka_z8%zGv1trZuIMD#Lq2j}a!FE>>#89W%>M8CnJf1q1tbcNs1>`{6_{6TW@ zy5VxPzU&VnSeE6kp%~FVla!~dXFo|dUJ6sDE+Jm(1zu2%q&z!oOKNFNYWh+>2GTi= zL>?RjbPF~{Yt3CP3Pl_1mru@+P01xgz>M75ENJ#AQLErwu{=9Cb1b%TOEde9a%{PD z$k*Ae%2ZNANNIp(6{cz>QlL_(X&PmZ#DO$rHa zQ_vuzXjzGtR=i`TE5R2L29yI@En0T#c%odrDPqEzaPLH z0-cj{xM4`2FDttN#!{K5z_(J`P0l`ygPP=rpQB4M*y&;kaCR8m;M+($jYO zc%2>PxQ7mYWQPwQu){|`up=klv!f^92cSQ&<0lW;$&*K{@$@N=igBX3!HzYZwj&L7 z_R*=6cHjg$4iq0fZu7I>yWM)ry$-#1-Fb)o z_}OReneRSrU%3Af`}WtLvLAo%m-fnE-k||GVePGHhi1ifZRc*oXZD&9XGV@sD@x{dW%7?|%I! zkFkCGt#eJSFeTxhrTV|vkua~@~fx=!I^%=^#91Z+DvO}|c$2HDI_t$3{a|!ra$Gwxkvi#K*OW&V=R)aqicuv$u{=y(m z%1?gx`+Z;;DD3M8X33YkYH-W~_s<(vyc?Q2SW=XuEWgu8Q);U`O!IDKXk_Jh-}0PO zX;|hwYSeKJ9vYRptt2B0_0=HtXt0V3=tDz8{@i0jB|9@bX!Da}A)*N&Dkz;D8?%{_ z5s&^cK2V~rR4;GQrzk#C9Jyy7ZluWk3uraWDg*WL4C%TM%It2Of2;!w*2!C3VqXI0wX@OF|f`dtQJ9 zV37){Jj9n#e|B2RC4n4e}zbt-vD}6oPQYR)Oo)6~*yhb=dal z+)3B@e`6JPSzYpFg*#kBzW#!uyv+0eJjY+)ycgV(TMLvC9&6neu2qCxS$rL2#ZjAz zh6W^~k2iN=Ews*Ww!g}?Yr>(+a-HC~nwmaBvrn><+#A7YD z?~>VFQNO=bK{`qBg;wSOX5d&eFl}=~cDS*+5Z!16eb*|Tm^ z|0o~UFdx{EyIPzb8Ro+pW1ZHI@O#An3l`-f@R)VK8v*JNg^aq>M*F2c*BG$rmS$jg zYLI=3LE{CDjvllTK5)fq9vJ{`6+uJs`64)%27u;Zadv2C{kTK3EX@Iy_d+wvw0Z^2 zBc!8L1THF}5%QwKw5S0nnO+Ldlv{cRWeFkgrMyP6Q&4GOphw@3v@9unoeD2)C*L>< zYwz+4-F~4$R6(yozt8W3;8`tmL9v6atimZOCzPX-6~qc|Wd&EaW*?PXZ%WXd2uSW# z!3!j3WVs-}eN;%nCy>^^SA^~T zvz@xaLZvIKmG)O!RTxn&$pS~7Wyz35uTyZLZP=IVmvuYKwyNL-Ct4OXYXGE1qI=~c z3kPYRz#JHH3x|VAfNzCwk~ET7)(*`$6sP2t&v$PdKuzlLo`Gjs6f+Lc0&M}htS+2e zz}%$~>r_y#IC`CY0#^#{mzO14mgbHUFf>e_CeReyoZ&cA*3&v>y)6Z6Kh{q}18^C<0-E`CK!1HbP?QcGAKX~#7 z_QKD8YfpabhxYyN{>(aBcptMP9H-SMsbHJ8$`N00e?gRaY-~Yv)dHPxVnqpBu z^kuvAwtMVrkA2I2`|Cfj-<$T{JNtaVpFmjy00q#R5%FD)_4DU`R?&wO4QM+_^dpOMvcsu>s6Drd1$L< z4<#jEd(DWc?@Li@6vd^pxxuRAR#d57gw@0Mxc*w5|Y9i*^MiesuN``t$xU{ur| zx4b64ziwrw49W|TDYw$x^At+c(9ibMm;>0de51WwTQ6lsFW+nmDNQHV$}k96P|UGZ z2Bg)G!2XnJy})A%AnXCidMU3noI6K(mlq%chcYa< 
zm62<734AH{gVmU82TTic4M`-x8=wqy_ZPO^E6PlVW{KY%KT5zW-)|jX(MSZxL4xIM z0?aDMDdT131T;&M3?6bPktoY$;g+AcW@UA^5;RD@BuzTye&22_r#|33#RQfW+q<7K zQ|hB^mtVSOnAG4%pTLUqOL68`cQMg3R=GwatH8S=37qqk-C6E?mS-Z#Xp-ldNeVwG zOLCFCDm%`Yb7yVs{5jhJhBRIIKQFD@zhBw1|NiN& z{qO&I#s1sXZGP|AKX*5Q61h$;*aq9LF9L!~bGEfS56mn8Pm4g?3Jkz;5?zHSE?-=7 zT@o~|avlJ32aw!escPrKdH!DpyjIu;;H%LUm)3`93W}nz$vz50wmd&!%X3q(me5U+6T3&8Ly zpuDD--fp1|pscCX*L}Yb{xAZwqCD)bpSMqTFW5gXUnG?!b`zkh!sPXU@>)5-qG)^! zQY>(^zG{Xee(;*jrW1aYH7dUocc4+24!EzCruO#OQ&@5P%n=N_O z6!&Xrb;FNU3(e8LSuPU+MmbNST4283HOBSVhi1;fx(Ln6&o1ZRUESpUQ3C@GO(e^` zA5-j$E+t?b!8usxIiC!`g89T#LeluW1e(dj9%yERohU9S(UsjOGM6l0-ic7HloRnn z?f6S2uqG2bK=SscT_mH+ODlH%+@dWlEZfSt^RyZPj@dD3Qq-Wi~H67U_Ta2%m9 z7^V;zqA(evz)?Cv;d7>30+xqdb3n78S`tVrbeswz!Evw%gyW1+7)NV!kic0jUJVM= zAVK*!mjPkmcyPdmIgTbb5i|$M>KtYZRE*NmqI`jYYe0HTpbf+hQOFHtWo-@?=PEP@ zpXOl-pb;0xpaK|{%LdTw6&Qu4x<|_qnq}?ae-HfxILqQ3tj+>vrL5x^J}R9;JWg!j zgG^+7$pOqN*eOgaqA-qOsQSEOqi|HEMJo4@@LLuxK{J(aI^}SjrgHXfJqp}3z^ZQ4 z40+B0r0R|p)CNCBS*KM=QXzUp85AlmbwhIQOvzQs43cM*8KBWna(I?+ub?{kU52hn zx4=;0)k>ATx~9Ehz%`fIUKTT1*9yRMk;*_Y;3LYBEK_~5;&_Ds&;Y;Mt^{ls0gM=< zI{J%7pHYEFKleib?xCOrBY=-$GUo)zKy$XOD~V;P-+;EDxkFv?vNQwLvQEpQ4G`xd zGw`d?P&o>A_}BoBr z-DaKj?bc4JfutH}dg9pO;Hq(w^u%F1%I_1$PI~-h%?zdiKdF{3^3x7!gw;+cVF6pL4!tFTM0P z`^%sIYJdFgAMDq^{FVLc`Cr)cKmD0K_v4?~55M<)d-exEv}d3Fp*{7~llIMTe#0Jn z>@j=vkuQ>$FW46zddMDr;6Z!n{`*Mpw=X>SfPLwahk^Jn+jk%TmOcCZ@7eRuJ!gOT z?XT^p$x`Oz3WJs}3;@(L=m0o1wtFcvdMQg%NNJk&ISHVy zG@dC-G~h4vOsQ;$jcY-D&a#wqY0BpeX`XU92OwuSub??aIo?ZoEo*m=rCAY3B$fRoe+F!$a*E^0 zC*BA8Wfdp83eN6gBe+){xhI7_t`ij{>dF$wQ#lqwUQ%Ash>Ge{FCEXsUFF;$5qkcD zdyrtA`c}QG3e~4H0p)2~e*xvCb7yT;&*_*-@x5#pz}L)x}v`JwIm`&+!|O7tjZA28a`2c4Hr~ z?7-|WCM95Y4eK~K1Hb`zmwbYsHGqGO#=@E=z5$dq0ghXm<&u$Ti)}Xr%9^~#C1@s} z7iIyub9QBudkOIC9twN~%~fdpd}xmUyJfisnB(lupKNa0r}`}cxB}>_4!{XCvs|$t z`?26T7*Y2_lG{QRngf(a>$2p)?9j||4Kzn^7BI^_C0g}mv3F}O3|f1AQM#sRQLnC# z&|Ld$IFBsB94pKg$~{;M(7a2knfx@VQ0=`6WAmy~gTSM4aQtnZTlZWbufP=!&0Hre zyBC^k1teb!(RxV~hVJSmZusSG`}h(!Mo>(~cX-kClIbOqHhF2(np#~sZwm|aHZ^s| 
zMn=bNaB#%Rr6C&vIt84v_D%!K^TR`Sc4(000h^Nlk^GR0vJPi}%nXp3wy|Q?rbkLP zH$G(Z6Tx|1-8i#j!=6qmG#iD71Gl=+07pp>y%(D0s!`zFp}R=S(x?|2f@YvL(a1o* zgR`bwbf^6OekynYG~3GhTm-ZQ%@mS?X7_0Z46D#A|6xFy3PHiTT^yrKVJa&|v<8Q6 zKw!@cPFb5N1O>%Z6s5lCu2DuRjQP!aw=Mvd6}JkcE=Glc3SEbZIFBqOvW65@5K~#? z(?!bxtl8e69KQw8?l$3;VYyVu@+?c4thDNSr6Nt|`5h`e6}&~(YYJ@#XYYy)3WIM_ zp;A_b=A*;~$z{NFhys2X4FRMBz-a(y--de@z*&`DQ0!V2F#AY!u36BmMuIHp3bo~a zoaD)Yn(Z92*)LWsYKUlHQmzYV=tZ`b7D9r)bf@M;` zGS{j5G0J_Kvi{VRWqW{Tz`3_$+&Wvz)&VqkH31aOfCXu`rC_~{z)NG+x`E8LlbzOj zyu(_6;?@(bz_s8Rn3jO**5kl3(A<8q3d|k#ilox&;M{qd^?-5X>C<-N_%WbaI&Q~~ z9k-JLVz!g-ZmOl*Bd%z+s&?SJvxX6F~q+xYOLO^ux8-JP(OhBWH6hPqCxBcCTuG}*~hjrQjk|7Kr# z>}z)a{SVl8zWr@`^_5rc&9~mMci#hY-`{U=YI0M59-_Z-be0sAy5_^%c3k;R+Nla|Ne$A zYn@=>`b9DStX$`mF(bSs*LJ`=8~T%mQ#(zRMin zrN@1j-*%6Bsd(=wGX&2mlvX1gcvd+PLd)^J0nWiiOV-Wkz9kEImUCn+&9{}IJj!s{UBe{iW(3?Xu+j#^E@>1>%%c0B&gDPv~)>-7gNi{fnfc*w~ zfn7A%TjV&Drw+U-XM+L=t2|1MO@87!?^OhDWem{-RRNeO+ccn$2-%K2MZ6^{ub1XqKKpw5k z+fofQM^*E__qwi)o>7J7psG8upc%1^EX{)VOThaj<%4C-NfP&u=kjHpTX`Tq+cXBY z0B#r8wJmUFyI=+9INU>^SvQ4DTno)o1)3!?$hu7~R!;H@vom&fYQoNpkD<{-BWT#B zfY?dW@v$Kr9WL9@5U>Xvmj;Gy0F{bErr5+IK=aJVm|KkJhk@Xcvds<)oJ$Ef1I>K_ zm`4j48!zS9zUaWbFgfa)RR-JnBr$$YwN z&OJyTkN$Rxkp|R-Wd-^d zG%J3n!ZPiFX3kU04zlea&^t)wRklp;5I{V{GV4+$3Q!7pDn40g1f?1QT;R9XtFp{{ zs5B}gmJ2DM8I_a*pG9D~l$E-JWT<3R={PWliYA6~s!ElT`~}%YjSqn{&sLTg@4DqU^4FFnW{`?|kY%aN zy^EEY)(3Fr7=VDRQbXirM9`d6c8XFkOy#TfKFAWOr$JUPRphc_=KvK=0^@#*9H+!R zD*=whZp8H#{WsUBd&%#7m;9IqEZ05cR_6ZHBsv3JOj@RA+)|w**47~zE^Dppu+~%U)`pbIvg{V->-kx+JN2$J9_x29Xxax9kD})j@scv$L;8m6L$RQDLZ+z&Q2XY?K*zs6zfjf zv7`Kd?5G`6IJy(Ztgh*_wRd$|H}73%|FCuBhV6HMdB<+J`2oB6*8A+<`ya9oKKQ^b zh-W5esI^3|`i{AKhNd*H!G?4Elcv|De#+itu`Q_-goc4_P_%VknczAp@$!`2OfIBzVPsa z_UNOJ*w?=H6?^>gZ`u=2e8--A^1J8>d+Mnt?3rhtvhRQIY5TzszGu(<^e6VKU;V;< z|NGzApZ@fFd-26T+Dm`^v%UKAOZNI}f4BGEd(G=D^~e=O zzWG|;X6epW8gnhAO~7-r^>sJ#4Y%-qw(zbamJ2=2yt~cT+ol+>iuc;)QpB{zCHKDl_i^f3Y@RN_KI`vV*~{AE+CsU%Xg@A zW+|UMDhJjwO*BTqy5y8lX=3 
z?coFSJxIRbQjZPv_1H);O`}$W>H4Tgvo$CX-l4ipT(vsoP!v3)D(m0!ogvxg@Kz;8_;y;I6SF zD>Gp2)@t-A(EcgF{D~~rvd{vYI-bDZeV4s!Uce;FXMl6LciOR_Xx+D1wfU6!SNYm;NF*@mpjnBWMJ6;&WGNK?l9wDFomZd((~ zaSZYv1nf#&SDd&bDuQwqn1ck&+qxgVU4YXHFiV1Fm%REH3^n$rFDLN?lz(PeI7J(jk0z-GzWOTth_~QIPhHS2IBWic@f0xylk`0 z_S@_yXx;&q18{4h`GvDKF*)qBH3|IW|2teNhbR(* zBQ~n24nw5Fqc$=;;vr3jP?`S*N+ZDXuoVksD;0(SM)uXL2Kp(yWv!4ahJ&-L$bfQ$W?3kJ-Vz0>2KU6mS|Ov2hxu!w(8D!re01sd+b37lAm>;xKZj{1!6lfl$pp?b|wGqn$Yx$lsphiJ0@Z%hU zP%q4p{4*6h(+9OtVf9mhyE_C0@*wFD1$Q~!6_n`)77^=wpbqKGvw;ukn5$rSe(V6vm@en)^Id!6jd$!H zufE0cUIU*0VSjz`W&7J-U*-4f4$H5;{D!^ukGEa={}<2y+P?ekC+$m*KIT)6-*m%` zcJqzsh8yhZC!ey{-+0qref3q2`KrC~`s?<=d+&3<_S@luhqxa!AWoA1qnZdtVg8QL zC^%|$Cr()XsZ-WaSLc>}SN&;gZfLMpP4n9#|8ef2EX9K1&JO?G-qK*5Z4Ce>0GtAb zQ?0xgvN8k6-C;tDp6;F?083#PJ*ol65j1Cj;!Jy^W!vfj5d@;7+lv!h$KDZc1@=kwTUAb|FC8-9IR2dMRhplrKK0(w8&IfM4K|{qw+l2AEg) zFHN@5<>%@F^i$SJDs?M=9Mj_j1K5(>rShGLx&+vKKl)!5bk3Vr9+2m!CPI%U{ZPE) zoZKBKa|PxKtLA>}93x0@uiiP%$u?;qR+EH;iJI6~JyTpmzhG7=|0&9ZACvMZ2Smr1 z%rV-Kp9>K7aGN~eY)=>Jp!^g#1H0_&;$D?hE_>N5Xhs~{Csd+5b5W*AazB&nnOxEG z95+h?H{H=5-V=q_>`?Cn?@L!Jb#Mpu2lb0A+m!1?0lJJ18Mzd4F1hw(gUb9?gpsjg z-s|yek&~57Qno@B6o#zVG3llv7$*Y z0CLvJx5Ua{51^R^~84f3=le`t;;xbw&m z0HXl%W6q<-!6zL1ldTJg-=tSJ75PCfCAzM8*S5Q;$WkxxCtoh|Me5PS1d%L9U2+Sm zEVg1gpb!b@b^TXK6&}u|djmYj7|8*g1)JOtfb?2zv1|^V+ zb^*>y7q^k%6@FA+xkt*k`xOhCllvoWC8_eQd#wh@XMtG_1pzJ%0nQbnc@Hp4Ss=%53CBHV#scZ59faD=GK!9Zj<|;JHRU^7<+zRLS3%|FZlD2cDkRhL3#C9(_n<;4DO$uxe;FWF=r|xe&3+UF znnrdcC#V620VBbS!u$+ztRa?%C9Ma-2Z3<;O%DT=nnh5-yEXHTW(IU{c8hZ&O|LBh z%Zh*@Rn{S&mQ|4IU?$)XfEEXj$VEhMBZBi`hi|TtWsO9Z1|_c9XW?SK`>N#uMnNKg zCU}(vQ&x&nN|wt+J%DsqmV!|rC`bnE9GJPr1e|4gR-8wQW#EA zV2)d#2A%=V#Z)N)T)`;EM1pe_+8HW~EU60iehP5;%9T)=!g?g#ZDZ*U8%=kz+{HZ> zME3%Y&3^pv$LBl)T&tvf%EiN>8R@+lW?M<1EGu&WX1!y5 z?yjLUE=%KjhmfrM8jLDc{Oh$nzcmn45nSY3@BZ}#H5@f5&H(p`#uE*S0lBV__NNCB z_e?G|JnxLY2c($PT`e8fa=OV*9|OEjoVI%DWWCj&ZnQ?9R7BDYfBdd%^avTp4j!`u z@9npb-v7u>9y)0)bqewijG^uZprgLiI_h=YR*u(TCy%kuk(0pQaiI9P#}Gexq|Qzq 
zuCvoe>#b1`eyYXVHC=2ou%=lh0W86V$97dE?bbW40UbM5Z$EtYdAswThwX-&Zg+52 z=sCGd+;RKe_V_oSus{CpC7-gp?o_j#CLgCxwgAx0i1cW)9om1=-h1ssd*!dM+aLb$ zXZzEi|6+gn%irxUFaF*B`j?l1+}8s5zVfCI-jfAcQ)=tCe3~_Q@UQ5lzbbv%{`AMc z+MfYx?f<*q{mGvH`3v@oU;NsB_VZublTSTk4?gq-pH;EeHR9HrZ?XsOzT2Mu_IK>~ zp`(0Pp*vm?2x3?;pV5xz?!NWBH~MZgF^|vUg}PeWEKa7N8NtG=U`jQ~0-)U1&T~$p zv?!tr>TYdjzXtX{4ItJ5rS*WPtjDq_3sf~bW*R`v)JUM(0}N@vswOUIb!&1K@Xoe1 zvfKbXA^|f{p5weZo?{L`&b0u4&3yl@KpV=m0h4W6@)9PH@R+zX($cM+4%#`SP+=N7 zpeQx^ue()~OaRSoY@YykhE&P{(^;TaK&rUEA^tD0PWd7UHgmvhALUyqI7Z2Ztg=D>R?&WEP(FO@dd&hr`PKfmgN@JU8?}dafVTEaJZ7lJIa?L zm_ve_kQWy@_maz}do} z2^`8L!(B5H0q~y!&~ch*ch5+mH-IIe`?=D#vM>vnr4X;#BVTZgU?~=0Zo34W0_PZh zPLVhqg2|)6S>g6-6)eyij7fS8G;e$8xWw8kiBD^Rd0T_@kVA7~4fj0Dx*V=6PNo;G zmEYt$TFoni0MRyiR^DqBL-$EH2$ruWw>T%R8S>Hj=f}JDzy9z4S6f<|vB|M8HRAN} zh)oTSq6wP>kQKjp42=NF!+`Q|VT_Mt6kr<#*oJ{+fLYc~Dj30x;L?GDk4ba$3tWc; zk!~3bK4_|RJ<cy06Ldi-~bi!Sf&&l(g4fB z!l(g;iTi@!2H271Pt#^=I_>DsJT$=nC<5{bpmRNFcDD$?*rQb>a28x65@1f?JV0~5 zX&USZ%~9@q3;gvV4B_R=3-ve*9hy6t;&pB;Vwu$?$?%uXCUVMjkYW`{LF$%hB+qXP%s(tKdw zLE8^(?|<(jJMiuS`|#Zl?eM?n{0id4-vj+9O_U3!` z-KTzNci#Je-FV9#cH@n=IXvHV<1KdM4Y%6ecRgU=0-zOx`PJ9nvV$MVr@D~_fh^G- zfP168|IUX#2vG3*=Rdt2W9EET)(1T zXj*Yi$^FnnkJvr;+-G;(ahEAFM|BGG8_3^HZq5Gkm%nV^f987}@2_^^@Dbp(9iWkG z0`LfQNOB=i8lW!#QiMUp^o}!A3Z82fr^AiY_-dOFt2}~NfJ4nlndxrj*e$4)-x}lx zG%H$*tg1kAEi~)5th<`fC(Q84^J;GfkR_lwQNDdIG-q1?+GgH0pgHSSW?&QW?G^k2 z%sl{ek1WP+kp`Aq0rEET2AIpQKiw{9RznK#1e^uT4$TQLX94Og`I4(gjIkc&vwk86 zOWrA!?|G!&t|~Nl1N#6h_b3e{hlz-SJBUI}cF{->L@V{sle#}Tw!^vZCC7J>2mSVK zI0ouL-2!nztbjI=tmuN(Ac3o%ae(Hk_)WR!&)=WhXBEQ`1_8*N&-1Skm)D%nLk&8t zlYf)~+NA)^os{E>#~l2Y1-?q%C4=P<3t0f1tmj>gz#Nw&G;5nY_o9#d37UfJ(K*J1}#6kEF!;faX%V*UQaV4@P<1WEq{jjver+ljY{9u|7Vqp5JFiM(y16 z8Jj;lZgVpuHa|UVb7w|uZfcZt)MlqxCY_%ivxT!0c7Ape(4Mx9MY%V~)k2oZ0B;)? 
zHqknQV}Lfnt>PhXTx4A$#Vy_hNH^ED(&Y z1nxQjP;CNY6=>c7{x)Q7Wacbt6m#?U6zR z$qvrRU_8=VV6K&**?pS#Kr_oga|C9&#RO>H3zJdT{=VXH&Grt<0hnu{Syte^&@8tV zS%@zJqSr#O;535rb#(2vK=Y0wVsK2iHvda#76`NK`3}%rdySRrN`Sc*nq_&u9yBY) zwSZao)Gf`muv}|(mLH3vX2>;Ti--5W{jdLv{XhP<|C_(0a}%Q;8g6Ootj!Kj+S$QL zJ5w6BNi+^HkM#r1!17Rj1i&3K#jO& z0h-n2l?N93frgS>DT_L0ei$ep1_lR#KPqMc7ZtQ-;1X>4%sdnd`nZFY7679{kMeGL z@-ZZ60(O*!sUiWYVsOV05D%4<0yzR^fwX{Ht{K;b=5P%jmIy!!VjID-#CGoH;eO5i zR^mLR0w#p11t-?)u56ubnu5pyc?k+3EzW9{s5lk zrsJdcxo(f9!aGr#w|S zZ@&JP>#aB5wzuDW*ZqGLhVA9Qy=Jez^s2r5;@|B}emB)M@oa+yP?Odu6ktE^Y3vU6 z=mtpER)s4|qh6k6LraIf50Jm`+dtShzx{;Wd*6e=s-_E<Zu9zhkew{JQ<+#h2~ZzxuuX!b+L zTF$b4@MBfCKEMUoWi+LGXA96MsViMw>KsGh=>kpx<>>ZMg=PRvAPzV?90Sd=G*=~v zu7zgKmq2p{fR=iJ=TuwDVY$0q?iYf80KXM@Zv*6`0OxcYd1~c)5@?osQ5t2`8^Jl; zor8SI67JB?J*t2+ayaI_>g$#ZLFjdi(5zY8JUpa;Qs4>T>RtzM=Ki|m)p6sGoJS(a)KXP2nJ`JyzI#wYBeE>Ji@xl{(rbW*=k#_bK+SG19WdUJ<> zF#w#q46JT?L9 zTniMpI4W9BD-kAYpm_@@-GH4oVW&+WEC?`5g635K_=0@9mpHD#Spz1Qf!PZHtlTs- zRWsM)Dc}rjE6((;e4Y~va~xFXfOicj*Gh9>$o=37KzkKHu1XLsOEW3gD_s{T(ca%- z*1ROrdaxXI?Y7+S;GPlWt~J*JGm@)G$WzS! 
zHrh(^9@5G+eGW8-{04UnxoSi=9QUQ>8uz$oL|B%t>~P>+c9QeAyJ)Q1ryp&kYF4)reS(__Q*>q{brV3*=0W^>GkJ|{)JOKVc0l%a`@3}!B zNWhE$h6p^0cHwf^6le;N0ACcKISlN`An(docXf6MfDkN?0II_X;N$-QdsNU#>)E@a z9NI#wI6|in#E?age8>-31NF+llKhedFoGjlyk)T+E(vmj;-J8a+lE7QNCm=!0C8pT zU2-jgW`&oF(CmKA@@WQ|2L#Nh2rv~{?f?S9sZSfPeTJhR&)nvxol8C zg%E8hf#ssBU@?SIKhGea0hD`3spy9&n9Ecoq@4p)!UYQEe#`Y}(4VZm0hl$aTLbEp z7fm_qp^ZpG%+a*Fyo~bSIM+I=Yvs8Kn$;cWt{vQajnGzCy?2QM`I@>_euiCCl$wUL z6Y%M1?y%N|7He;8vF=v+(`owdP73sHFR%s9p`jHTB7#=EKe9#(nDxvi0ACLaSSnE< zYoKYcW+>8${O3oD6w*b_>KNi|4^c>$%M{qrp9gwwrIh!ydTrVf)UvpR}I?`+xZTU+r%|{>v}F;h_9W;Qv|R`rF@n z!oKj(7afXk)Xaf5-DbDndZ*nCgx};)o>-7?xY2F{)bGFVKKs%aAGPm2{j~k|*T1%R z-g?UpeR$C7>l&Afs+U5%*G&83f3^T)QTW@Y&WJnB;Dd+;`S_%x|DD zDgtsXFh_vi1I^r1Mt-PjJaJE};0!3cYM~iGV>`J|y`-3!97F3oOWmiejS>7f{Tg#*4Z!Bs=Vo7)Sd+%Of z7&Na5!A44&Ol+94S)o)*)Qe@x+p?8`=HW6hU8bBJ?6i|;@V&(L0f^xt zhvvkp>yvb_el>w-wi9$YG$UEI1>StfP&{MwB|N8&fEX|pZ`1iKm7NrHeT+x zrRhogkKJASkDbf5v$$$2lk;|NXxe53&c$(?D2&@!A0I`Qk0G1PT9*M9sWjbMNDDPc zZW4T8RHy>xV40HDQI_S{g|Dz^iV>;+@_bkvx7H1LV+SQj}G!bpj@lZh2}&7m^qF>oa4Kd z8I{;?phU{~r6T|NY)_n9E*c@&J^A(Pk=2#_%gU=lOWkA?jem(@U!;D!;VXH0kOt+cRZ)GKKo~$_4L!^8p*A>T0{Vv}hODW_fC>kH>x8t^hZKH^iijt{6iQWb`vKUmG(HydD<)kV|2%BMZ}Ve<{} z+b!nafaHDyoVwqB&uiX+5aol4k9#ctLs^w%iBuQ9KtcnHG-yeR7EgDvkp-9rW0b;N zg~@aV{I?pbas}~O-?XouAK#KJr~*ZgNI>PP<5iB&xpeG4mMfK}@<)Elyd#5|6xTt; z%e5%pcZv$T6{u)x@3v;3sRejy1?)O_2Rcy;uyyQso$Wtx$V15e;g5d>q`z+uf8k4j z@ZA7!faqIpzMJ3o*sZtRWw+g`@Nak99k<_Ok39S(`}$YE0bm2&x7=nA-v5Ao<;!2S zC%^lYJ@=EJ*-w7-W7416?|$=Jd*$yh+x`#s*@n1aL-Zc5nu09d3bL!Lo~Mb)V!`6roub=4Ou5 z9Dp~4dRk>+)`To=*2OWp0p(O%BhcJH{u%*n;Ffbyuwx&=b~pR&g=Vf%(40v3 zLM}(C2+q1c$}igoi?qrJK`6_K;i= z`&5)q!h<|IhyJGy=+{i8s32EB{u4Myw+;WkBQ&Q&h|${E{O-%F#y$0d*TpFGD$C#@ zBY|AwRPP33hN5TD2&A}+uwK7tv9?skNaKBAW47dx&(!#W@DP)_% z$tfC!26O>Hg0BE+K$pNv0H%R_7glYfrmhf_MeAw=VZm~ky>8VFgPK}8TAJfD(>|b# zeFWRV(!2pQuh}}#yAiF-aw8Dj3lTLGvd-b2Uyi>&S;U7J%?5cXx5|n|!;tmUw@Z`ZDVr znxmyTE^8#?=9*#-F4@Hk=WT6e!L@L1($+4{+lA$`HawWK$&mr~`O&o2i)Uu+?9hZw 
z502YpVbsR*!#0{5vf&IL0I)llqGHydH7Xj7?5%X+P_b6x<2uyJdKmm6!z3L2q`)(< zozEyN5S6>8jqVdTmjGO~Tmeo#dBc;Qf!%95yM!I6BK zWxX5}EVJJLFf~x*nos~|SvqMj$VG!|Q7Ad(R|Stph~OUSI>Tsb3M>jq4FCx-je#Kf z#Znj*xi6(Y`HySRWil~JVVM8MtdJj}0WwJ8+)u@qvz|DpN0qAzeV&nKhzgUe$Z9Q- z2f@D@4r(ZkT2oDV8VgkaX!WsI3I(M}H<5o- zp+J*VfN0Ri^6a#`FbSGRdB5CGUJWMhs{^xJVDvjoek19941*VgTZv{Ut3opsk2h4p zd*BU1z8^`Im@2c-4I3(1L3AxNYc@ndZ4ebB-`ie=L3>hj8C6DWp&3-*oh;D z?dZXe?D*kBY_5xsv;?N5~2$-dmBp+MKDMd*TtkgnsEfhz&PciUv zz5gO8?~x|W5iGU?itYzYN+U!5!)aM^+W}Jv0Cr0=$8o=1x7teNTa8kVA!}}$eZz6N zCb?5M*j_7++sg4;$gco5QExN*qV87S17KG(6n3^~rotxF$nt4mx}Lnq%|n)E9V?L!RS$1NAc!P|BS_(5yxd=M^*q%z}8m6B-;T zP?md)Vz|2t3ymHP@C>da0i2_}L7@B!*!5i}l9gHwW4UH%ntvZ~$hYo|Hr)gAAaHhR zo!`swUI^f?1?LFMVZFrqltN+VO`-e(RLhNn{~XA<-d-9HJ|((ld8851+nM~<06^+m z8p#8ye<{0S`6ai-YEL0`ruPa8ng^&8VFXQ1hRW2h=XsXuh7MS$Wj~Zn(b_!0Y0}?6pzyTLyyV z^57G5$g-_2_#*28zkxn|;DVF@C=Sg?`vglf=LL9&`N)R{_>kiV5A15dnL`fpFlcTy z2P|1uAt7LrCENXMWpM_W2Pr5VmMJ`B87^>)0^7=`IiIEQRkxQ~?BqAVl0ZbD6o9#y z(x_<-x&Wd8Ee~BMUtPd;q&#NhG-9Sk&)CewjLlBX+T59Wo1b2=`59SR7i^Z_bJO#7 zmjATP^!T)$8Jn`{iD|Y!%Xa9jV0XsO&nejbtX)`~w~I^XY<+p&Hj!d-?$E;Dr8T~@ z#s{$ql&-F$YKR_nU2P%FvU7=-_{#P!Kufx#+)9cMZ3(4tyy@y(9w&X6jBkK zrSJg+f2H7WrH>)`$ocIbaq^MvatwWp@{8IeZ<{L{Xv;QzfXJ$?uhi-S*R_mjG)hY> zuTZ#hJ(^i)m%Qwf4@rfD&iBa{dEsN#2c-f`14zzKPueu!mLkJ=Zv)TPXMuDOj^HfI zGw%V=j`g1#MK&0Ua?EW8>RB_G&e1*3xR7BBd*J(19NDl=?lh`!sh~Zy6&1dIbDWC&KULWL&pVi-k(vjXV?!@m z!ac4v-(*>nXejn84cDxonPYU4zc$p`2|Tx_tr<9LXc9DcdD__6WzB$EE9=`@Q?7R4 zv4d?pJ2EJ3tw2~CfY%9FrYR@-vPB2kUXJULBmj(Vprf;;&6?{Qt*NfT{k!G+U7rB6 z)Bq$mDNJ5Ny|pzrSx2)#usLy)06c*Wk3|e9d%R&JSB(tUsC(lB(KuI%bM*>L1zrw) zRYfcBwP5%eSVlUp4-9NZO&z3c+akE+SPse8Kr`$8c;vyYiq}Z6E%zPyAItrzNA47R zq~p{=GuO?1)j3m|%tpY>ZvnIN)z#9%{b@qFPYtM^^$nb(*?~FZP~OIIxt0jbZF1?5 zs|I;;_}8^44U$#auYok)hiGZe3Yc{bx|RfvgZRG&nqw%s9PdM(_an-Gw+E08*5@jO z=Jd_D1iJjMUN6!p`CJDG=lz!4sw_~}l>CYKM$~W>JPVowIP=Y11I==+$pOrHByd)E zw_b;4fil;A4K#cH5@^<-N2L+!gL?+c`aU_g+)`wT53WJvUBAhv-U-FrPWk=QHPM)+ zE{^c5wDz~heBIn 
zVVfEnwb}7WTbiBm84TAIGx_3@ZEgb0>r1w}dfqOqEC9`ml>JN8Q3CAX9uPySDg2ms z$E%#D&T!WaQkRyK|70+^EX!vC&2DL?z7a52tjv;tY)t}YBRF%;TEN{4%pCVxsXp}c zs6JEvpeADZ1CoMT=B zlBHm6PJGJ~aIQkL?xVzY2#71t9R90VoD*rnng3s*S=Sm;xnHn-a$jnpS-#-c0<*3y zG#FUs_&4-W=z8~BM=KR!hf7^kVN`C?dQ1an0Q4ZOyb)S;!$U<{gS@EfGAV}_*?Xn>|t%FP~Acb97u{3AwMg1E%^W&FT5yx_*m6t ztVumeIzB)lV3s?8tZx14o>KQB*P*aUrJhzB&2`ySDPw0xa&~q+Z?h9cJ9lQp&d;2& zb29+q*o2MCm$kr02=vQ}JXS6Onge!bxD4SeMVMO4Nl;r93K*pD$JgIvdZi~ z$n}R1dZEi5021r~II&ym%+Rn$8=2&sYCTVKtf}z{n?w_1<2J_cQT`w0|B)g2bOGcP z6mn6R93J#MC?0i`;!{iW6BD*HGiyt8=WOMiToG1mZSf+70zkO32_$X;i?o2(1hN+? zDCEzz4#=_&sq18yk71V=D#Rnyi@n2(xyyQ9eEp{nHhD>}?p(BwHSl8hqJ5&}?cj@b znfa?ROES>x0+fBfia0kADk-j}x?)!dH_V7XS&{Ub_lAG>U;K zk2#;Zdj4(ql0&o3C16(1i{7zGJwJe16@}b$ywT*}M*!xiILJ}&j5Hu<7Az-e1)|x{ z-8ba^LW5F@qu?dY^yPz+1j|&Ms@OHSCsc%hLsSCGRcH>N8>HS1!86P9PfWU7U9u!o zVFt^!mgW9IC5{}L)ySi9SLq$$*d7A02AXAQR%1LvL$HT$T&@`HT^VcZ%vdu(+|bZr zb@lCbs$PTp+O2`*X5cx(vf#O;sS^Ryq|J&y4Q#aocAbD#Cs5GV(rN9jtOxqqo7zx| zTb3pH%xhY2S29r22TSr_hr++fdI~&t0EAtXB`O!Y+uFilH$Y5&_$lBbO?f1@m0nr+ zXe_8vp)PhQhK7?g5KfA)d_8Ecz+uhr2+RsU*WDxmxy>9qLbHw?f!PJI>a!Y1c6R~Qq5M{= zc`tl+yX1I+aKTvY{*MFsqIK2(V;cugOi?LYhYEej;-ye)l-dJHRj;kpOg1fadBw z+>e+DIT4=tWrP-lbz^v!9yi!}c$RON}idNrv|A0;CH?TV@ zFv!X}7A(!~hnwZ&&?G0NAFlV_RGhIzSa(G<9bJM4%X(- z6A|5R1k(=G@@dw36n&f(QAGnp)u@cn>`_#Jsj*$aONCn=? 
zm-0GDqo6E*=4=-YfdrZ<0~Bqdvny?#omp#d&sbA)%IZ#c*r`)(R#(?zCr>rm>C?^D z)X;8i&44q&C}3`G41*D6U2X-C^}7|Ii~|%Mn2`oCYOrDlfY;U>pjm@V+W=<`dQ^OC zrLFZqa|7__v5{k}WDQ0YSPGtjRv@(tAngKz6u;LaWXSRk9Ap88OgHdS1I@A=2J5eY zuc{2-QVY!yaIaMW)3zjDv+~o%Z-I0GYLDC8(Sl^`P^vi2+E*aj>HBtb4LvBOw5iFu z8tb{ndh2KgaN7XgcCMMU3wTxop(=p5M%_wVJ+-nFim{T-UM4c8Z}%>gPM7)jO365%&k zmwCsdc)xN$Z#Gd*@C+Q+LUQ~qMOdzckkfJpkSGD0dEhoS7y~R;4NPu*<-1ULK+m7v z7xob}yPJhunE~eZR`MbrY7HH__6lt% zcQFkf^d4oETPh2vzkOh!3`~h%^H6cGFJM`N+%;yW z*f(go+=%sN#x0W>Mf_HnPRdM0k&+w7$Y9!Jt;vXDrRS zH@ZFtb)5rfj-V{J3<2_lKo|%<0}M}(2b~!ovgwHtmd9*%V#>~rkK5ejge{z%w#B&_ zTbV!W-64xJEYHr_<}wcu_`3vX>;jvYmH<5s)>=Go>j2~G+&No5d(IYTH8^Y0&d)E| z;<+VT0xp*qmTeW7T?1AGaPlc#J%8So=jJ@%_=SaYyzon;KnehQL4&~-X+18XMa_>t zgQkIC`8Q2*zH!?)KVe%dvk46Am^?VOg4TC!do|pcPX%bnL(mkHnlZ;mi1M;|Y(|a3 zRae8sOT464xan8?Ww^vKTpK_py3EVrt*kX*bu~y9N{zgg!m;!*>gC{n9;}O($sOl; z={ESlE|OPuAE_0*Enr5K5x5S_z^j&lK)FWfe(!o?AVBl>D(Nc6<$8D7CbmTNVO`p| zz_kP5>r4Cw!1>TF1DD=iLcT5m&2IHo*dCuviF>rm%d^A%*;!?ok8)cdr@(p1Q*|GO zk6bS#&%X9!KW%T)3Mu>*EkjXA%D}2e zkjcw6DEI*CzZhmtzU4(K2~8=k@O_%pMybNN$<0V^6;h5$O;IdlU3TBfN_T4j<|;Jv zpFlT(U4gD!iUrXQ%?U75`4?m{B9#>_R+K(Mp8qu~SOMtvINkzg$=yA&rtoqO&636# zXnJ*^xgTkonkQ}Vdg6FE5D}d_2Nu}?w zm_TzU0Neq{bpX9x09&uRy?cNKMOo+q5Ib8PTzlF8MI>mD_5d@`;?nPb0nCEpT4-+P zTEoOAT9>$1v;)GfM%3)U5@0%56a6~mj}0uRnn?lXuEx_GqYhX-&Gj_U2mqK}K(kbV zXI*ntvvtyVkoCQja|d7+#FMAyh8i_;Y_$H9rC8Zr&%! 
z?`@lao^P8*Y=GuGWk6q0uR?($xkq{#5~3l=faN_;$}@Qhjlj@o3(ri}@BriaE*j(A zG^AB-rDf~_W`Wv)Y>~3A2t=1W9ptyx$^CLbW0%rlqOGFNF9SeND1Z91>JgJGS)XNn zyi>_~dPl6acgi}_lh&0Uu~d4%vU2B?@qzMnK&~Q4y3?jCftR?t+qf@ zROlHE+Jpf%5^YhpZBe&qWomY4}vR8xc<+;wCl6Ld_8Ec<|_eS&Ub};`*h+_oFkrF zs(tRVd{?14+!wz_L9@n41!$H(Z=ZLOjFkr*mS-o&ZT`%pEueGLK?|CxXHL^!F1sal zTiw`hjay2fj)%rePzhn8lk$MB0wh;8I@Nu80yyr-MSxVWa`D2ttpXscfZv(`hMT@E zXybv&B6$UUyaNdB07l!qlw9<#U=HY#wt2C3bQAm~(n}t}NRIytws~RBwxv}*j8(wU zVO9_G5|DWbfaaKbA>2|*UU$f|fEhWg=_Bw~BrmR@nf-Py3YaC1u@3Md0l5DbkSDMl zfSDITX#%#FfG*90F5lPy&T^FJnz`;Rww3FE0U@ZV|4N^|CZ8?qq1?atyUd1<%~0upE!2>szkhkR6g2nyk&ZWxDV3YabKgSyeScxN^x-qj7R z-Q^&yfv`bwStG8u76EG-cgr(C^5m4w(uka!7`LgBA@7+{SU5>BzC8kl_n1^IBR-%~ zzbW*KvLK^D_3!|j0_GtH=Cn&bfa?8`G{!)w11c4lDv<(}m@2X`U^Y)hEoct@i4Mye z(CN@D2o^N+TUKHRW-1>Szd2?OWm(qjW%7;A_XJ=0qEagH0%?E6e=~GH`=mj=f^8fwtC0U{k;;kd|c`X}JwxZWA!O zg*iyjtdM(xHNk3dNeE!t?E>Hc;Z9(!6AlAyUqQ0j25h!p~6xm9ppS!o3=f>hwM zOQ9P9ZdsXontUiR*?ctP8Ff0gfe)t^rg=`Fh?-pXOR_ zPDnjA>bY@12Tfn0_ZU}unfN!duZ!bvDoJ#|tq>5}IKjq+>!T#;?XAVBi z1)y3DQuUDL6Tr>@&@pu@Gs}Wxxi$LCeo9e6FzuG>q`|;8!8I!Yf91MpIC}$ted6`Y zUt3ml`IP&6Csz&mZ0jCW8rs}DFI)O6WlT_LoGbNmhq5Sysbe2^5u^M8x*e9)K+*XU z%YI)Mjr^WI;FseFo>PFc&wdIlqk%NbnpBJR{-39P+#|{p<%{Q~slFABDNbST=(ls%)v=66-bRfEw>AtGI96*FG97nh{eC-b`<&jSrV>ZE@b2!KYD9 z;OTn6ENSu*LGv~ecsV#zhlJVk1iwk0gS`IndKIn^M^MjPrmhbbRi)H}-b<|duvP(> zuOzA0mDFhg^ujqKG)D!0Y}Iqr<$i97I+yJtG%J4*nmu**j96FJLi3Kx>(LsyF^W1k zz^SC`<+>`@M!v)jeh$E2#zRT$gxrOb^8`zBa{qwQC|!GKJg`6aagPMZHF9X)X1Nxa zJzs)mDX_-JY7}tH0L(gHNO%4Dg`V&L%OUkamU?cwXL_D&TWe*$#5ryl8!FowD*yTE zX`doet_ZTK?EuBQK=HP^rjbW50QdxbvR=x1>Q3^!EQ&1PEiEz`LVB>w@6h_-gn~Bq zk5br|P2g`G7+FiS%8kCr%d^Qbw$Kj8(Tn0Qjc&FsJh}L7(oNQF@ZaXR%j6Ad9eLQ*Jpz6&lLsGlo51oV@+MVVV&SFos15+L8VsA*O^z8ikA$q|8&!4k}b8~inVcwUQNEK#oaemGRLeBAhOpj06*wC;yE+>ZuZDLphLi=fOs?pi! 
zQ8MJeEbFmbVg#bSRAw0}5W%S|x-4sOr`(DZVZtYgk)@gaG+@v}Bcf<=mgE=h;o34O z-!}~215Bf92C9UPq5_NEtePRSpGsen^*9dRsVFVi3i1aiYao%r)Tv7~j5!Dub*!*U zR9ZeJL82lK)^1sj1jn35*G2yNfn&LN==wEDjb=FXY0g>i@xG~SGhKO0b!35xe(Ubc zSVwyg>b4Fbp_B7#`r{6up%XBOY68;g>+9{*DS-LpNl)ueowlYrxm_s6H&E=p$pCV* z{FED!e3l#ePyWmT31;hX=(z4dc z%G=$38r1=+4em4CE8tGH>j98F1!m<>8e>{(eeiddK zW?9zBr}D}*xTP2{P64keStWs5S%u|N(UHJ>C-=Lv9=NUN|HdBcZc20PEP16d)Z9yc z5zAf88K4=pW@&iPK;pkvuBjEk?m#^Nbh?r2;&{C%RnPrx2$uDB&f5tvceg0y9q&tX zj=a(6YUN#T=bh&r5+Dc5DnOk`ajkuF5C0YfT(Wkmu_s@D{V(zElU9ER$dj~AGwNxQ zEJb+|z-DROXUobN{w5sM{3Aclm&hz$`T*TP$LaU*Z86SjK&(p@RXVkzZjHZ*`Zn?DuH2C)VHoO zp$E98v7#9t6}?23e>GzI1i|5t}kau$YZpuByPFJ)K4VKp6c}=!8-k-BG#XdVTSg@I~VOyG;wyl*# z>Q2>(p)M6P37}L*Y@qZ4USOl213q#t=w2ezND(9wdNRtb2uQ>`BllH>h-efXc?ubgFn##eHpE; zQ5cxWbtZX<3J}jdfdzy3jr?9I&n|vDCvcB-Pq=3lz7PPD1{kl(t@dpHG1~{%i=eC9 za8E`2uKWkP_&)1Z=HQwHyi)Lo=6uSBKkLu~9iP`O>5kTI1=fkPVqa1(JB+CHH>|HL z`wJrI@QVkeBHUhH<-nvd)Q&4GEHV{NhDHpdEGX2%aNTmr zs6z84blFn?(7~AFagPMeZl%`slc(!R5FMbIyz)YCI5hLYa!(}hevzg5bD&wR*a*$K zK3#i&fk7ok#+ zUvroUN50Nkegn&-LAgY1?=VUwl2mNhgJxNa`jT$eFew25@j1|}0e+r0z`$d@CeSQP zvY@#F$;d5SQB-0cI#LxR*AVg)pxMDWfo1`6skc8sGyiK_(GZ zz>HcOP6NyJ)&e{?dkQ$qTFUX{cF=gjxIeTw=%xg`9e`a&i}iLkS*o+% zdOGX)-2k`>UM2pM0Bawl*A|w8!oWSnSI)}fC7)K8;5mWy7Fk*4R?_165~K>Cy8&3` z%csER{Mn9XSB7Pe*Ua(MMczZ>LjxE6K5)EFKL!m8V7a}if!}iJXd&NRmn`KRL-#A( zx7aX{n+5-~EMHiMc+M?XjSlx^mgQRRA8ZFedzXK!B4ni47Fbk>I?_}N;LN%%es_{~ zwn<^YW~W?91Uns^SI{o$nYA=q3+HI!TutPwm2DImB?WZ#pzihz*9wFIU}_-B60Cu! 
z>E0g8sP{zQmcAPwH=u7R$Y&`;k2xfV&OEhTRjjF)+PyjMrdF!JQ+@It7;M-LwMtv6^`<`QlnO=3C(0KNg zbp>ssbCFM-bCC6t{vV{wkOov{@Lf9$Uo*zXC#53aaXCzQC77mc;utz_RjlLx5QCrX z5;ri-h$?BlyEo|g{hCo$Qr!QNX_B&17G+ADtAO}V>IJ5OVnr(vG|NRsmS=?w3}q7M zRM{aoMkRr=fEoB!THrrzqhq*B1`U|lXwf7-8aV0S84VVXMen_1Y?tk`Y(II^EQ6zy zHaD%o@$(^$|9OSQJrBFA*vHg;pQug%v;;}si%XptBTa;84gri+PdF&oihcE8fI)$3 zs2^2VI^=mztiU-!bEprm2g$XH!0gbhy7C%nT?J541m#+V^Mq|2nsuE@llm|~vsgi* zj)oZ=qAPX2wu|Qsy|-1t2A;A{0-iBHo+m$dsQ+sf?^9I*Rl%uvfqNsyNK`qll-yUp zx9k&LXRC_9EODLeQwz<{A2{Y)( zr*sXd1)h01H`sQQeYV+uhvNvS0x$)_RrZl}R&aj#5~*NW&gJ!wJ#8Io!r}?wcEVm8L#XY#h zaY9!PdD8RK2djHf5ywLQ5%5x1x-i%;xV?md2UTcJz}a8K@X=^r9is}bHS*)@!`Cy` zv)&8MJ3#Yp0?m@3*;~J|`u}H^<~`6{?Uo835&7dfJnkdQi;L%Ms661lxThBC)#RV!dpNSFxu0k`woIs1*K$M>X6{5OY z1FWzgFw&Eiix0S}#AZyWDI3-`a7``<((rZI?Rtt7eI_s0%w4`2N>z;0FLFCs{6t|a!u(0P@>C5 zcZUGGp5Lc|-P6EygM|rAnnL4%?R+34(z5&8a&P6zl0qK(PpPh_hwDk99ss=;Wjd@! z(sGK%5ZBY(-C|8$E!M()=;ZmN0Z7frnC3gk@s0EWbec7_FVAV^&luuS%jLWVt%c}En(kf$ zdwn)M-md}Pai3R!JZP|)CZy^EJoA8OAIplhUnuk^0L^!xbt+%nRf|+(7UZ7g(zhSV z03YC{ZEF5g9z1j@Y1A-f#SoCFf%85^y<3sl7kG@nYb9W=LbKpKs7SfzQ}c5jMH>CO z7RBR_<(=Hf%Eh8-60kwerJ4IQV1KZ05KxaZ{3a4W^S!$dInXRWba&H1J@RvIlbZ|W zqAdS>yRroPRQt*!4gMl!qEt}X$aRDWLm@<+CLGfwQHn}*c5KAXO;7pwrp*f)o3r8d z$OiSt7TVraof7H~x%XZ1`iMFJslHMj=mY2gq)-=9H&B;4FtZ$?SS}cH#i)g54G644 za{#aiV_DAYk$zWIh2|Y+|YV%w@el>CjjO7iSpk0m`K{+b^c4*!O3@`5jbAo3M z?w01nD(u!~(qPFXGXP%b#%EbKnoJ311uviQf@-3PPXU@*XqGDAh+@3M7|O+4a}wLh z!o0(a6QS9CT3z8h5uEj(wyTBaZHMLn%mTzpL<(+#0*(vj0;uvat?jz5$mdrutmBez zDMFb{`2uHu2-=t9g%_N2?15&b(XW|vbARNT5v}nRa3)Vtv4y7>R@=$9S&=k0*iMSj z>>=8ckNjUjvn<*=&S#;y>Yfqu&NjRxdSSu~PaXuX$p;%!u2b)UJ^+8Q_5yR14?Iww zROl#=UO2GNe*(>FKna@T0LUFL;65LkulT$5Ux#K{o~x~FNgog&R&w3)X_oJj3PZVP z$l^RXK5Ap5!!|Ztwz2Y{Tb2i@@MIy96-zK6|IUtPprNhVAxd%0=chFo_KeLR5Qtt%Q3qi#)A#wb*FK&Sv2DUFq3tO!3JnyIMd zUsz*l?(62wG;+X}M}ml!=0tlyxpo;~_2@U8*JDHrl65>mxz3?3 zRduiS1ZWOcIzh9n-dVs9aPG=cf#v|K0KL8BF~D#QG&?K@g_#Hi&^6%9K5_pH=My-m zx%RZy`+rfa=eK~cr&}=DZXJMP2hX)rAP20c$e&;?1GuUYp{N+yG;$wRu1oRivjB6B 
z#ucDe$a6pPfOEbB-~g0!T{L(UAp`-@1z@)5YC!eer+PrE-Uic+G?rRzkZq(gus8^C zdmQUrH_%O^h2H_nIj_3=bzc70c}jWqL*;y1Rk=3S0r=!8P2PG@8nDl>f0|?Va;!AR zlrmXhHrruY&YR=cN|IvOA8G|&VTr_%s6=gx5L=?uV5qobSrr)hv>kkVeR zCDqquo#e9P7JK|aZq z+$w4m+%syRnMS@;l$AV6)^Z;|5ujZbd!0MXLqMZK4GW;T>`|`5#92OhRt|tpYtUYi z#&$l5RFbmp#&&D3SI-OYuArZ?acpqNCWc0AYGlmAzRrwK+U(RB%F$U{Id>iqR@l@< zTU%VVt>wf$^un@fAbhNE+-G-Z!#=sRZ69lZI(5Mgb@-+;0is^nrmhK*sem@>$QUjz zh&oNcr1}j-pVr{V3@nohj%#5#Dn`%H{_d(+17ks27a-4ntA0wrI2hcb;<}&+%-a51 zV3q{U6$ZMd&WqA|7j=8A`>ORI3__lr8Q_arc~3h3#qGRTcMlx@CrZxi^}j1zlh^MY zM|EvgYTzWsIMP~ZjxbxR2(x}a{{=M5m;732j{G5Lj_w(?E*e34PJ4m;T41h~L-XZJ zJ9hPwW@1z8N5DrW6IfO%;M?P3p_}YCIzY3)Mb_Dmd4WFxzU9&&a8{Rk0H&~Slpg~r zV0e6qeMG8WaBjSAw4gXZbM-~3mHrc8>*H{62E4W%mH{mXV;&%27G7?BAbb=K%^Y)$ zZB~J138<9gy1F~U$VhH5_aTmw+yl+Bj_O6Ifo6en&@TTa7H)u9F@5FN?2Qjyvx{?F zN43zr$##NfrB!I=c{m*FSp-;)3Lil7tUd$HQNcCCTYg+i#aD-yIecv4URR}cNeeb9 zOhOk%7(}P=ce@l|>e31?C)da}!G(qUM&5&EK0x!ez#O4j1)83B(VU{hoo*B-7CKi@+yCN6^yz@y(8Ot|G1AD@VCBt!B)xkk<{ zzs%6RO};uamg@v|r4E3$Q(znXs%1U)sn$^h<}4L{OvAFa5jYF9gQZ$79v;d}F@QUJ zU0Nq~cl5eTfV+2~Hq_cCb-FsZF1d%a1JQzMS)T>hJw3o%PZz+}4fv%zzO=j90I@ls zGz-XPfzVt=fE}#TeF9#~{k^y&RKg1sKVCn@*>!him9bai6-Itf{Tun%f$ARH}I?)0REdzvjrgCRrO%Y8_5jadL#pq$A7(h4I;|V{?xF#ZA>SDq7a8t(mTSrNaZUX+Zu)o@{2naz+W2svO^z0T zX298@IZVN<=r)pih!iDC)ASGI1;4zT2{`)1UT%d2rh!$tJ}5kzr1$0oIj(dcNNR-*S=fUUCaG5X*X5a|Z3sRG|U_e0-x8x65G8j>7iW@5^g7tVXxszLAS-F0_C>Z{;3 zsCs?J{(1ST{qxdQ%5``Ied-}S<*q=TLb)wymK=_N8`TBT_@gmUo6ZB&DeU8HKphhd zajIto(#f)eTm)gtebNZY>iLd<92H(%vJbinxPGjZbNehhdN|xb zcN@?kyfwB1CWE+UfvfT)o2e)o6=-Ig@G?Yb7AQxB)}ucDZJ>GEeZh4; z!7LA3*SyEV92Bh4HPDPU&=v|U-X!0AtO1($Dh%ukiqNcZdSO7M;2DVb0*e219KDe4 zucMdsUqiD?_f3Ta=dR4(zbaRqRnD!Ck9UFN`Y2P*Q4#m*T40XQtY;sl162Vgcvi0g zaG!wlF7l`v2{da;Z9%gdVse+c>hBuIVx53mA8YjS5&)}V;3Hm1!^b4)BO>n(&9a&+ zrx54q@g< z6KD=2{e${^;FEj-fhhO?l%-kF9Q-Q#fumYzmNhloT?5UXVbp#Vn)%&NFTykj6Sc!eqt*rr81^!dMG+{&<^#Zl-J|UM1_7w;RXci>r0o4G@QZAEs zfKIt(RPmlQGhoVEnmeqivCY~5XAKIfD&_xow5I^(E`S+zbX!|zkF~PCjcqzOMhDyJ 
zzji^i2JF#DNcD94c|0yN_c+V_&H+m~x#?u|TObWka}Rra!}b}{42_UvuA+z%|P+g}#SVt>{K3Tfaa z#XHqw6N5cAMmkmu8XZX45TIBZ;+-i1%nqELZdsOOSr*vmfzDEYw+#0XY4Vrh7*dvV6#9B>a=2g%Gh=q){0t4?X}hpEYby)0fZJJH)~u%I=WT6i z(bkt0x^#hUXKZG2%%&$secpiy8oT2f&rn91##yt!@?FbHIypRQqpTk&0;)wC#Dirk zmq)BTG-iY4QGSnFd2qyr%ELA^IBWxh!1DkNUO>H{XIKQF1niae?hgWK`bw#gED|6vHlYtiiZkOcLC>iKO2Ws&H+ryHN*Xj1+&Vh z&Y=%Fo=*_X1NGJ!`P=K$9L^b+gJkLE1>rtv@(e+KGZ9K;-JuQ zPF)lCnE$VG-Tr)QbS*H~LbL9ro?&Pxao@OqYWO%b$CkKTnG>I83Ix4V4$a&gFiXPAPxJ5 zFo0XDE4Fh1a9&!r^Hb9{GdyBb@lplNf@nZ8gqI^_+c@EXhm`_K-3naIlxztE@&>1 zUqSN#&^#b$W?zS9@{{WXIsxWP2jHAQvrqra|AOW$m2Q@MBMFqHJol-Odz6u@2FJ_; z1v&0pn)}hszCFIK2x!Vzy$F=`0c=_81c()(qqVKeT3S1U_^p`;+quu}q-`F-Bjup2 zaCpH=9l$&Qyz;;^;G7zCBybm4Ynu>rntTiL0hGL8M%Fm$ZEdX80PD=?piKj->cu!SMmjcPXU0by;$~;g*zDO^o1LAt`Pmtpo0+t;lOuL^tYov} zC7YWVuyd1TJ3l>Y%X5=DSd?d;RZjS2lVUmKv-Po^Gz=sU=K*9j?Dvijsx>G3O$@GQ!iCSJ_0}oc#e~r1Yi!(9NYy$U0MOh8u;1+WSl1qo)aiX zXbyZ6z%r?069phnu3IVjmtrrgj#mrKwcxB}KOg(*9hZ;q$1d)ZAXE1YRR-eiQNjj$ zr0a~*HzZ&V(5!XU>#oU{pS$MzP_*&~F9M#mzCB$p%g$!1yKBxH^W<=!{ML88H$S`w z4$O+&h4{bveq001oI8Zct9dSoD(_DM%&uLwy$&eX%Axu4WghB|E;3k|9TL&UK#_D6 zK$0AoWog!LGUlTdIeCQU3Os8aH&ZwA^Po8@o=*ef9Jshp-_KvTL^@vb0tILmYy}|H z%PxRctX={0r>y@c5;TXF2j}6q5mJNu1`pN6xsa^D4*34ygTkm&_IF6F-je{%df_81 zhY!I)9te!nRV(#@m4e#{(QbXN<|XR$pjrBSXyzqLfVm3I>{GEcZvuT)X!e(qkLUBC zxeCtVBXsFQX5V^D+8_ z5AXAQnnQO`xQ<;EEX~P>M*1o5jrP;WBxrVU7VwgK1xe*toe?nrvv*NZ$X_c(FcB~d zz{54Rj;C1&qov!Y*A*})adG=Q zxi+^nbA9AVmS%VBaA@Wl*``-k=JsAo37R>!PdCi*vs|Yd40$SIsV@zz%3XyD+`F4u zE*A!EVr;_BP%%$V0(g^CEYH}yCQDiXUd}Ds-0XQfGc{*pW0N*KJZi<#u=N+pmItc) z`i88(IAXj?dWi)LA<_J!f-g7XYC}8y`Ib z#0*-2$zZL++!dciKN0)o8f%j?Uw zvbk)F7thVnO&@6_380Q^k#9sq>r<`+Gk<9g?2=51d8otZ^5fc=0rtI9*lr5j(`lq@6xjEb7o^1oMm-se)nqsa+X^6X1@q9F?R8lVZ?5P|}k)BkRYk#jFC{wn17Qqs!-oZ^cuGwTVaHRyO4$R4DTQx?4U zAwXQx%%x*Muf}GSG>DgmwIY#}0e4MhJvvaf;gY~v*7z}-9FYs#DDV&Tmt>8W0f6lR z`PnlQ9`;oO?RR)T-LkAQJE_~EbYC?e!za6{1Hv|Ll;hp6f{>J0PZQ{I=73$qFFSfJ ziMIHEQv#;9fNHlklWtLOxb->d-Q40`5JM;kfxfe99jyA3Z;raR0&=QD^_{8yOj21t 
z_dr(##8gMgg>VO{P74-hrIj91hsLU+@vfG2+*stIj^%iPhmfwBRCQkj=qfX@AMd8m zGvVx%^tyVzAL~dO14>cViq3T=>!j$LeSK(7uAk$(ud$!E+CaFn30JD2;7qYi{fuIL zohTaEqBJ10VysH;xewC24m8Jbx0=u;y6re4@w~!wO1@R(gBZ!@wYti?5tvo)5%*Z> lCALlA_$td$)peWp{{bIfznsf79GL(B002ovPDHLkV1hCk<;4I1 literal 0 HcmV?d00001 diff --git a/docs/examples/te_llama/media/transformer_vs_llama.svg b/docs/examples/te_llama/media/transformer_vs_llama.svg new file mode 100644 index 0000000000..a872d6edec --- /dev/null +++ b/docs/examples/te_llama/media/transformer_vs_llama.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/media/weight_swap.svg b/docs/examples/te_llama/media/weight_swap.svg new file mode 100644 index 0000000000..b2ff3ddf23 --- /dev/null +++ b/docs/examples/te_llama/media/weight_swap.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples/te_llama/te_llama.py b/docs/examples/te_llama/te_llama.py new file mode 100644 index 0000000000..fba35ed30c --- /dev/null +++ b/docs/examples/te_llama/te_llama.py @@ -0,0 +1,172 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +from contextlib import contextmanager + +import torch +from torch import nn + +import transformer_engine as te +from transformer_engine.pytorch.attention import RotaryPositionEmbedding +from transformer_engine.pytorch.fp8 import fp8_model_init + +import transformers +from transformers.models.llama.modeling_llama import LlamaModel, LlamaForCausalLM, LlamaRMSNorm, LlamaConfig +from transformers.modeling_utils import _add_variant, load_state_dict, _load_state_dict_into_model +from transformers.utils import WEIGHTS_INDEX_NAME +from transformers.utils.hub import get_checkpoint_shard_files + +@contextmanager +def replace_decoder(te_decodder_cls): + """ + Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`. 
+ """ + original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer + transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls + try: + yield + finally: + transformers.models.llama.modeling_llama.LlamaDecoderLayer = original_llama_decoder_cls + + +class TELlamaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `LlamaDecoderLayer` and easier to replace it in the code. + + Args: + config: LlamaConfig + args: positional args (for compatibility with `LlamaDecoderLayer`) + kwargs: keyword args (for compatibility with `LlamaDecoderLayer`) + """ + def __init__(self, config, *args, **kwargs): + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=False, + normalization="RMSNorm", + activation="swiglu", + attn_input_format="bshd", + ) + te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads) + self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda() + + def forward(self, + hidden_states, + *args, + attention_mask, + **kwargs): + """ + Custom forward to make sure we only pass relevant arguments to the + forward pass of the `TransformerLayer`. Also, make sure the output + format matches the output of the HF's `LlamaDecoderLayer`. + """ + return (super().forward(hidden_states, attention_mask=attention_mask, rotary_pos_emb=self.te_rope_emb),) + + +class TELlamaForCausalLM: + """ + Causal LM created with `LlamaModel`. The underlying `LlamaDecoderLayer` + class is monkey-patched with `TELlamaDecoderLayer` class before + initializing the causal LM with `LlamaForCausalLM`. 
+ + Args: + config: LlamaConfig + """ + + def __new__(cls, config: LlamaConfig): + with replace_decoder(te_decodder_cls=TELlamaDecoderLayer): + llama_for_causal_lm = LlamaForCausalLM(config) + return llama_for_causal_lm + + @classmethod + def from_pretrained_local(cls, pretrained_model_name_or_path, *args, config, **kwargs): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + vanilla_model = cls(config).to(kwargs['torch_dtype']) + is_local = os.path.isdir(pretrained_model_name_or_path) + subfolder = "" + variant = None + if os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)) + ): + # Load from a sharded PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant) + ) + is_sharded = True + else: + raise AssertionError("Only sharded PyTorch ckpt format supported at the moment") + + + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + archive_file, + ) + + # If the checkpoint is not sharded, it's a trivial sharding case + if not is_sharded: + assert not isinstance(resolved_archive_file, list) + resolved_archive_file = [resolved_archive_file] + + error_msgs = [] + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + replaced_layers = replace_params(state_dict, vanilla_model.state_dict()) + + error_msgs += _load_state_dict_into_model(vanilla_model, state_dict, start_prefix="") + + # Force mem release. 
Taken from huggingface code + del state_dict + gc.collect() + + return vanilla_model + +def replace_params(hf_state_dict, te_state_dict): + # collect all layer prefixes to update + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = 'model.layers.\d+.' + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + + for layer_prefix in all_layer_prefixes: + # When loading weights into models with less number of layers, skip the + # copy if the corresponding layer doesn't exist in TE model + if layer_prefix + 'self_attention.layernorm_qkv.layer_norm_weight' in te_state_dict: + te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.layer_norm_weight'].data[:] = hf_state_dict[layer_prefix + 'input_layernorm.weight'].data[:] + + if layer_prefix + 'self_attention.layernorm_qkv.query_weight' in te_state_dict: + te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.query_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.q_proj.weight'].data[:] + + if layer_prefix + 'self_attention.layernorm_qkv.key_weight' in te_state_dict: + te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.key_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.k_proj.weight'].data[:] + + if layer_prefix + 'self_attention.layernorm_qkv.value_weight' in te_state_dict: + te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.value_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.v_proj.weight'].data[:] + + if layer_prefix + 'self_attention.proj.weight' in te_state_dict: + te_state_dict[layer_prefix + 'self_attention.proj.weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.o_proj.weight'].data[:] + + if layer_prefix + 'layernorm_mlp.layer_norm_weight' in te_state_dict: + te_state_dict[layer_prefix + 'layernorm_mlp.layer_norm_weight'].data[:] = hf_state_dict[layer_prefix + 'post_attention_layernorm.weight'].data[:] + + if layer_prefix + 'layernorm_mlp.fc1_weight' in 
te_state_dict: + te_state_dict[layer_prefix + 'layernorm_mlp.fc1_weight'].data[:] = torch.cat((hf_state_dict[layer_prefix + 'mlp.gate_proj.weight'].data[:], hf_state_dict[layer_prefix + 'mlp.up_proj.weight'].data[:]), dim=0) + + if layer_prefix + 'layernorm_mlp.fc2_weight' in te_state_dict: + te_state_dict[layer_prefix + 'layernorm_mlp.fc2_weight'].data[:] = hf_state_dict[layer_prefix + 'mlp.down_proj.weight'].data[:] + + return all_layer_prefixes \ No newline at end of file diff --git a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb new file mode 100644 index 0000000000..974077de57 --- /dev/null +++ b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f37565e", + "metadata": {}, + "source": [ + "# Accelerating a Hugging Face Llama 2 model with Transformer Engine\n", + "\n", + "

\n", + "\n", + "Goal\n", + "\n", + "This tutorial showcases how to accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "ab4c0b82", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial\n", + "\n", + "The following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_llama.py`\n", + " - This file contains the code to load a Hugging Face Llama 2 checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `LlamaDecoderLayer`. This is used in the following two sections of the tutorial - \"Improvement 1\" and \"Improvement 2\".\n", + "2. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "3. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "markdown", + "id": "466ff515", + "metadata": {}, + "source": [ + "## Table of contents\n", + "1. From \"Transformer\" to \"Llama\"\n", + "2. Hugging Face's `LlamaModel`\n", + " - Hugging Face's `LlamaDecoderLayer`\n", + "3. [Baseline] Running HF `LlamaModel` (Precision: `BF16`)\n", + "4. [Improvement 1] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n", + " - Transformer Engine's `TransformerLayer`\n", + " - `TransformerLayer` options explained\n", + " - Mapping weights from HF's `LlamaDecoderLayer` to TE's `TransformerLayer`\n", + "5. [Improvement 2] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", + "6. Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "8e84bcaa", + "metadata": {}, + "source": [ + "## From \"Transformer\" to \"Llama\" \n", + "\n", + "
\n", + "\n", + "
Fig 1: Llama visualized as a transformer. (generated with [Nvidia's AI-foundation models](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/sdxl))
\n", + "
\n", + "\n", + "A flashback:\n", + "\n", + "- 2017: [\"Attention Is All You Need\"](https://arxiv.org/abs/1706.03762) paper introduced the pioneering \"Transformer\" architecture and changed the NLP field forever.\n", + "- 2018-2020: Emergence of GPT model series that showed causal decoder architectures are a great fit for pretraining, few-shot and zero-shot learning.\n", + "- Fast forward to 2023-2024: Following GPT-3/GPT-4 success stories, researchers and companies raced to produce the next best pretrained model that could further be finetuned for application-specific use-cases. \n", + "- One of the latest in this line of pretrained models which is also open source is Meta's [Llama 2](https://llama.meta.com/llama2) models (Large Language Model Meta AI). \n", + " - These models range from 7B to 70B parameters.\n", + " - LLaMA 2 was pretrained on 2 trillion tokens.\n", + "\n", + "For more information on Llama 2 consider reading the [Huggingface tutorial](https://huggingface.co/blog/llama2). As a quick summary, here are some of the important differences between the conventional transformer decoder architecture and the Llama 2 architecture:\n", + "\n", + "1. Decoder only model (causal language modeling and next word prediction)\n", + "2. RMSNorm in place of the LayerNorm\n", + "3. SwiGLU activation function\n", + "4. RoPE as positional embeddings \n", + "5. Grouped Query Attention\n", + "6. Trained on 4K context length\n", + "\n", + "
\n", + "\n", + "
Fig 2: Comparing GPT and Llama architectures.
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "e31303c7", + "metadata": {}, + "source": [ + "## Hugging Face's `LlamaModel`\n", + "Hugging Face provides an open-source implementation of `Llama` model in [modeling_llama.py](https://github.com/huggingface/transformers/blob/3d2900e829ab16757632f9dde891f1947cfc4be0/src/transformers/models/llama/modeling_llama.py#L4).\n", + "\n", + "Here's a block diagram that shows how Llama model is implemented in the Hugging Face repo. Notice the modular encapsulated form and `LlamaDecoderLayer` at the core of the model implementation.\n", + "\n", + "
\n", + "\n", + "
Fig 3: Causal Llama Model Block Diagram.
\n", + "
\n", + "\n", + "The above diagram translates to the following text output of the model in PyTorch. Notice that the core of the model has 32 `LlamaDecoderLayer`s. \n", + "\n", + "```\n", + "LlamaForCausalLM(\n", + " (model): LlamaModel(\n", + " (embed_tokens): Embedding(32000, 4096, padding_idx=0)\n", + " (layers): ModuleList(\n", + " (0-31): 32 x LlamaDecoderLayer(\n", + " (self_attn): LlamaFlashAttention2(\n", + " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (mlp): LlamaMLP(\n", + " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", + " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", + " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): LlamaRMSNorm()\n", + " (post_attention_layernorm): LlamaRMSNorm()\n", + " )\n", + " )\n", + " (norm): LlamaRMSNorm()\n", + " )\n", + " (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n", + ")\n", + "```\n", + "\n", + "#### Hugging Face's `LlamaDecoderLayer`\n", + "\n", + "Let's take a closer look at `LlamaDecoderLayer`. It is composed of `input_layernorm`, `self_attn`, `post_attention_layernorm` and `mlp` modules. Each module has associated weights as shown in the diagram.\n", + "\n", + "
\n", + "\n", + "
Fig 4: Causal Llama Model Block Diagram (with simplified illustration of the [LlamaDecoderLayer](https://github.com/huggingface/transformers/blob/e770f0316d2a9b787c9d1440f204fcb65e176682/src/transformers/models/llama/modeling_llama.py#L695)).
\n", + "
\n", + "\n", + "##### Self_Attn Layer\n", + "For simplicity in the block diagram illustration of the \"self_attn\" box, we omit the \"Grouped Query Attention\" operation and only showcase the modules which have associated weights.\n", + " \n", + "##### MLP Layer\n", + "\n", + "SwiGLU is an activation defined as follows in the [modeling_llama.py](https://github.com/huggingface/transformers/blob/7c4995f93d8d24aae05e1e43279c96dce736e5c8/src/transformers/models/llama/modeling_llama.py#L236) file in the Hugging Face github repo:\n", + "```\n", + "\"\"\"\n", + "1. `self.up_proj`, `self.gate_proj` and `self.down_proj` are \"Linear\" layers\n", + "2. `self.act_fn` is a \"Swish\" function\n", + "\n", + "\"\"\"\n", + "down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))\n", + "```\n", + "It requires a set of 3 weights as compared to 2 weights in conventional \"MLP\" layers e.g. in the traditional transformer or GPT architectures. This is also illustrated in the following figure:\n", + "\n", + "
\n", + "\n", + "
Fig 5: A look inside the feedforward layer with swiglu activation function.
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "686df4ef", + "metadata": {}, + "source": [ + "## [Baseline] Running HF `LlamaModel` (Precision: `BF16`)\n", + "\n", + "Llama 2 weights are loaded into the Hugging Face native implementation `LlamaForCausalLM` (refer to [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py)). \n", + "\n", + "For this and other subsequent runs, the `batch_size` is `8`. The `LlamaDecoderLayer` is left unchanged in the baseline as follows:\n", + "\n", + "
\n", + "\n", + "
Fig 6: Revisiting \"LlamaDecoderLayer\".
\n", + "
\n", + "\n", + "
\n", + "Note\n", + "\n", + "The baseline implementation will be run in `BF16` precision.\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "107a8146", + "metadata": {}, + "source": [ + "
\n", + "\n", + "Note\n", + " \n", + "This tutorial loads and trains a Llama 2 7B model which takes up most of the GPU memory, and therefore we need to restart the jupyter notebook each time before running the following sections. A small utility method `restart_jupyter_notebook` is defined in the accompanying `utils.py` file. This function restarts the jupyter notebook so that the GPU memory is flushed before the model is loaded again from the checkpoint in order to avoid running into OOM (Out Of Memory) errors.\n", + "\n", + "If the utility doesn't work, comment out the line `restart_jupyter_notebook()` in the following cell and manually restart the jupyter notebook before running the cell. Repeat the same for other sections in this tutorial.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "975f9184", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "Average time taken per step: 289 milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_baseline_model(hyperparams)\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "id": "c2d5b174", + "metadata": {}, + "source": [ + "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 289 | 1 |" + ] + }, + { + "cell_type": "markdown", + "id": "a7d436bf", + "metadata": {}, + "source": [ + "## [Improvement 1] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: 
`BF16`)\n", + "\n", + "In addition to basic layers like `Linear` and `LayerNorm`, Transformer Engine offers larger modules like `MultiheadAttention` (combines \"LayerNorm\" and \"Self Attention\") and `LayerNormMLP` (combines \"LayerNorm\" and \"MLP\") that could replace their counterparts in the `LlamaDecoderLayer` and potentially provide a speedup. Transformer Engine also offers a full `TransformerLayer` (which further combines `MultiheadAttention` and `LayerNormMLP` layers) which could replace `LlamaDecoderLayer` and provide a speedup (with careful mapping of the weights since the names of the weights are different for those two layers). Let's take a closer look at Transformer Engine's `TransformerLayer`. \n", + "\n", + "#### Transformer Engine's `TransformerLayer`\n", + "\n", + "At a higher level, TE's `TransformerLayer` could be visualized as an apt replacement for the `LlamaDecoderLayer`. But the internals of the `TransformerLayer` are organized a bit differently. \n", + "\n", + "
\n", + "\n", + "
Fig 7: Transformer Engine's `TransformerLayer`
\n", + "
\n", + "\n", + "Just like Hugging Face's `LlamaDecoderLayer`, Transformer Engine's `TransformerLayer` encapsulates `self_attention` (as `MultiheadAttention`) and `mlp` (as `LayerNormMLP`). A major difference is that the two `Norm`s are included in the `MultiheadAttention` and `LayerNormMLP` layers as shown in the following output prompt:\n", + "\n", + "```\n", + "TransformerLayer(\n", + " (self_attention): MultiheadAttention(\n", + " (layernorm_qkv): LayerNormLinear()\n", + " (core_attention): DotProductAttention()\n", + " (proj): Linear()\n", + " )\n", + " (layernorm_mlp): LayerNormMLP()\n", + ")\n", + "```\n", + "\n", + "Another difference is that Transformer Engine implements an efficient version of feedforward layer with SwiGLU in which the weights from the `up_proj` and `gate_proj` modules are merged together and SwiGLU is applied using a custom fused kernel. This is done so that only one big and efficient Matrix Multiplication operation is issued to the GPU instead of two smaller ones.\n", + "\n", + "
\n", + "\n", + "
Fig 8: Abstract illustration of the SwiGLU implementation in Transformer Engine.
\n", + "
\n", + "\n", + "#### `TransformerLayer` options explained\n", + "\n", + "
\n", + "\n", + "Note\n", + " \n", + "Here, we go over some of the options in `TransformerLayer` that are needed for the tutorial. For a complete list of options, refer to the [TransformerLayer API documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/pytorch.html?highlight=transformerlayer#transformer_engine.pytorch.TransformerLayer).\n", + "\n", + "
\n", + "\n", + "In the accompanying `te_llama.py` file, `TELlamaDecoderLayer` is defined as a wrapper over TE's `TransformerLayer` with a few needed options that make `TransformerLayer` a plug-in replacement for the HF's `LlamaDecoderLayer`.\n", + "\n", + "```\n", + "class TELlamaDecoderLayer(te.pytorch.TransformerLayer):\n", + " def __init__(self, config):\n", + " super().__init__(\n", + " config.hidden_size,\n", + " config.intermediate_size,\n", + " config.num_attention_heads,\n", + " bias=False,\n", + " layernorm_epsilon=config.rms_norm_eps,\n", + " hidden_dropout=0,\n", + " attention_dropout=0,\n", + " fuse_qkv_params=False,\n", + " normalization=\"RMSNorm\",\n", + " activation=\"swiglu\",\n", + " attn_input_format=\"bshd\",\n", + " )\n", + " te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)\n", + " self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()\n", + "```\n", + "\n", + "Here's a list summarizing each option briefly:\n", + "\n", + "1. `hidden_size`: size of each input sample.\n", + "2. `ffn_hidden_size`: intermediate size to which samples are projected.\n", + "3. `num_attention_heads`: number of attention heads in the transformer layer.\n", + "4. `bias`: switch to add additive biases to the submodule layers.\n", + "5. `layernorm_epsilon`: a value added to the denominator of layer normalization for numerical stability. Default is `1e-5`.\n", + "6. `hidden_dropout`: dropout probability for the dropout op after FC2 layer (fully connected layer no. 2). Default is `0.1`.\n", + "7. `attention_dropout`: dropout probability for the dropout op during multi-head attention. Default is `0.1`. \n", + "8. `fuse_qkv_params`: if set to True, TransformerLayer module exposes a single fused parameter for query-key-value. This enables optimizations such as QKV fusion without concatentations/splits and also enables the argument fuse_wgrad_accumulation.\n", + "9. `normalization`: type of normalization applied. 
Default is `LayerNorm`.\n", + "10. `activation`: type of activation used in the MLP block. Default is `gelu`.\n", + "11. `attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules. \n", + "\n", + "\n", + "Further, note that `RotaryPositionEmbedding` is defined as part of the TE's `TransformerLayer` itself since it expects this rope cache if RoPE is used in the model. \n", + "\n", + "Let's revisit how `LlamaDecoderLayer`s form the core of the decoder layer stack in HF's llama implementation:\n", + "```\n", + "ModuleList(\n", + " (0-31): 32 x LlamaDecoderLayer(\n", + " (self_attn): LlamaAttention(\n", + " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (mlp): LlamaMLP(\n", + " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", + " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", + " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): LlamaRMSNorm()\n", + " (post_attention_layernorm): LlamaRMSNorm()\n", + " )\n", + ")\n", + "```\n", + "\n", + "A major portion of the Hugging Face model implementation (32 `LlamaDecoderLayer` layers) could be potentially replaced with Transformer Engine's `TransformerLayer` layers. 
Let's see how it is made possible.\n", + "\n", + "\n", + "#### Mapping weights from HF's `LlamaDecoderLayer` to TE's `TransformerLayer`\n", + "\n", + "Refer the accompanying file `te_llama.py` which provides a reference to create a Llama 2 model with TE's `TransformerLayer` after replacing HF's `LlamaDecoderLayer`.\n", + "\n", + "Briefly, following pieces of code are put together:\n", + "\n", + "1. `TELlamaDecoderLayer` is added as a wrapper for `TransformerLayer`. \n", + "```\n", + "class TELlamaDecoderLayer(te.pytorch.TransformerLayer):\n", + " \"\"\"\n", + " Wrapper class over TE's `TransformerLayer`. This makes the wrapper very\n", + " similar to HF's `LlamaDecoderLayer` and easier to replace it in the code.\n", + "\n", + " Args:\n", + " config: LlamaConfig\n", + " args: positional args (for compatibility with `LlamaDecoderLayer`)\n", + " kwargs: keyword args (for compatibility with `LlamaDecoderLayer`)\n", + " \"\"\"\n", + " def __init__(self, config, *args, **kwargs):\n", + " super().__init__(\n", + " hidden_size=config.hidden_size,\n", + " ffn_hidden_size=config.intermediate_size,\n", + " num_attention_heads=config.num_attention_heads,\n", + " bias=False,\n", + " layernorm_epsilon=config.rms_norm_eps,\n", + " hidden_dropout=0,\n", + " attention_dropout=0,\n", + " fuse_qkv_params=False,\n", + " normalization=\"RMSNorm\",\n", + " activation=\"swiglu\",\n", + " attn_input_format=\"bshd\",\n", + " )\n", + " te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)\n", + " self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()\n", + "\n", + " def forward(self,\n", + " hidden_states,\n", + " *args,\n", + " attention_mask,\n", + " **kwargs):\n", + " \"\"\"\n", + " Custom forward to make sure we only pass relevant arguments to the\n", + " forward pass of the `TransformerLayer`. 
Also, make sure the output\n", + " format matches the output of the HF's `LlamaDecoderLayer`.\n", + " \"\"\"\n", + " return (super().forward(hidden_states, attention_mask=attention_mask, rotary_pos_emb=self.te_rope_emb),)\n", + "```\n", + "\n", + "2. Before creating a `LlamaForCausalLM`, `replace_decoder` context manager is used to monkey-patch `LlamaDecoderLayer` with `TELlamaDecoderLayer`.\n", + "\n", + "```\n", + "@contextmanager\n", + "def replace_decoder(te_decodder_cls):\n", + " \"\"\"\n", + " Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.\n", + " \"\"\"\n", + " original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer\n", + " transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls\n", + " try:\n", + " yield\n", + " finally:\n", + " transformers.models.llama.modeling_llama.LlamaDecoderLayer = original_llama_decoder_cls\n", + ".\n", + ".\n", + ".\n", + "class TELlamaForCausalLM:\n", + " \"\"\"\n", + " Causal LM created with `LlamaModel`. The underlying `LlamaDecoderLayer`\n", + " class is monkey-patched with `TELlamaDecoderLayer` class before\n", + " initializing the causal LM with `LlamaForCausalLM`.\n", + "\n", + " Args:\n", + " config: LlamaConfig\n", + " \"\"\"\n", + "\n", + " def __new__(cls, config: LlamaConfig):\n", + " with replace_decoder(te_decodder_cls=TELlamaDecoderLayer):\n", + " llama_for_causal_lm = LlamaForCausalLM(config)\n", + " return llama_for_causal_lm\n", + ".\n", + ".\n", + ".\n", + "```\n", + "\n", + "3. A custom `pretrained_from_local` method is added that copies the weights from the checkpoint (which is meant for HF Llama implementation) to the modified `TELlamaForCausalLM` by carefully mapping the weights from the `LlamaDecoderLayer` (HF) to `TransformerLayer` (TE). The method `replace_params` maps and copies apt weights from `LlamaDecoderLayer` to the `TransformerLayer`. 
Refer to the following diagram for more details.\n", + "\n", + "```\n", + "def replace_params(hf_state_dict, te_state_dict):\n", + " # collect all layer prefixes to update\n", + " all_layer_prefixes = set()\n", + " for param_key in hf_state_dict.keys():\n", + " layer_prefix_pat = 'model.layers.\\d+.'\n", + " m = re.match(layer_prefix_pat, param_key)\n", + " if m is not None:\n", + " all_layer_prefixes.add(m.group())\n", + "\n", + " for layer_prefix in all_layer_prefixes:\n", + " # When loading weights into models with less number of layers, skip the\n", + " # copy if the corresponding layer doesn't exist in TE model\n", + " if layer_prefix + 'self_attention.layernorm_qkv.layer_norm_weight' in te_state_dict:\n", + " te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.layer_norm_weight'].data[:] = hf_state_dict[layer_prefix + 'input_layernorm.weight'].data[:]\n", + "\n", + " if layer_prefix + 'self_attention.layernorm_qkv.query_weight' in te_state_dict:\n", + " te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.query_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.q_proj.weight'].data[:]\n", + "\n", + " if layer_prefix + 'self_attention.layernorm_qkv.key_weight' in te_state_dict:\n", + " te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.key_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.k_proj.weight'].data[:]\n", + " .\n", + " .\n", + " .\n", + "\n", + " return all_layer_prefixes\n", + "```\n", + "\n", + "The following figure shows how the weights get mapped from the HF's `LlamaDecoderLayer` to TE's `TransformerLayer`.\n", + "\n", + "
\n", + "\n", + "
Fig 9: Replace `LlamaDecoderLayer` with `TransformerLayer`.
\n", + "
\n", + "\n", + "After initializing the modified Llama model this way, the core decoder layers get changed to `TELlamaDecoderLayer` (wrapper around `TransformerLayer`) as shown in the following output:\n", + "```\n", + "ModuleList(\n", + " (0-31): 32 x TELlamaDecoderLayer(\n", + " (self_attention): MultiheadAttention(\n", + " (layernorm_qkv): LayerNormLinear()\n", + " (core_attention): DotProductAttention(\n", + " (flash_attention): FlashAttention()\n", + " (fused_attention): FusedAttention()\n", + " (unfused_attention): UnfusedDotProductAttention(\n", + " (scale_mask_softmax): FusedScaleMaskSoftmax()\n", + " (attention_dropout): Dropout(p=0, inplace=False)\n", + " )\n", + " )\n", + " (proj): Linear()\n", + " )\n", + " (layernorm_mlp): LayerNormMLP()\n", + " )\n", + ")\n", + "```\n", + "\n", + "In summary, the model gets changed as follows with a large chunk of the implementation (core decoder layers) coming from Transformer Engine.\n", + "\n", + "
\n", + "\n", + "
Fig 10: Language model after the HF's `LlamaDecoderLayer`s are replaced with TE's `TransformerLayer`s.
\n", + "
\n", + "\n", + "\n", + "
\n", + "Note\n", + "\n", + "Let's first run this \"TELlama\" implementation in `BF16` precision.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "48dc8935", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "Average time taken per step: 242 milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_llama_model(hyperparams)\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "id": "3c3d228a", + "metadata": {}, + "source": [ + "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `LlamaDecoderLayer` gives a speedup of **19%** even when using only BF16 precision!\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 289 | 1 |\n", + "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 242 | 1.19 |" + ] + 
}, + { + "cell_type": "markdown", + "id": "b92d6792", + "metadata": {}, + "source": [ + "## [Improvement 2] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", + "\n", + "Now that most of the HF Llama model implementation (`LlamaDecoderLayer`s) has been swapped with Transformer Engine implementation (`TELlamaDecoderLayer` or `TransformerLayer`), let's see how finetuning in `FP8` precision helps improve performance.\n", + "\n", + "#### How to run the model in `FP8` precision\n", + "\n", + "After the substitution, the model can be run in `FP8` precision by the following change over the previous BF16 runs. (For more information, refer the corresponding `wrap_with_accelerator` function in the accompanying `utils.py` file).\n", + "\n", + "```\n", + "# Specify the `FP8RecipeKwargs` (additional argument required to run in `fp8` precision)\n", + "fp8_kwarg_handler = [FP8RecipeKwargs(backend=\"te\")]\n", + "\n", + "# Pass the `FP8RecipeKwargs` to the `Accelerator` init call\n", + "accelerator = Accelerator(\n", + " ...\n", + " kwargs_handlers=fp8_kwarg_handler\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6bba7cc1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "Average time taken per step: 231 milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/llama/weights\"\n", + "hyperparams.mixed_precision = \"fp8\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_llama_model(hyperparams)\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "id": "602239d7", + "metadata": {}, + "source": [ + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 289 | 1 |\n", + "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 242 | 1.19 |\n", + "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 231 | 1.25 |\n", + "\n", + "\n", + "After turning on FP8 precision, we get even more speedup of **25%**!" + ] + }, + { + "cell_type": "markdown", + "id": "372867d5", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides speedup over Hugging Face's native Llama 2 implementation. This needs careful initializing of model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speed up is even more pronounced!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_llama/utils.py b/docs/examples/te_llama/utils.py new file mode 100644 index 0000000000..04abe39b6a --- /dev/null +++ b/docs/examples/te_llama/utils.py @@ -0,0 +1,180 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import time +import sys +import IPython + +import torch +from torch.optim import AdamW +from torch.utils.data import DataLoader + +from transformers import AutoModelForCausalLM, AutoTokenizer, get_linear_schedule_with_warmup, AutoConfig +from transformers import DataCollatorForLanguageModeling +from datasets import load_dataset +from accelerate import Accelerator +from accelerate.utils.dataclasses import FP8RecipeKwargs + +class HyperParameters: + def __init__(self): + self.mixed_precision = "bf16" + #self.model_name = "" # <== Add model weight location here + self.dataset_name = "timdettmers/openassistant-guanaco" + self.dataset_text_field = "text" + self.learning_rate = 1.41e-5 + self.batch_size = 8 + self.max_seq_length = 256 + self.gradient_accumulation_steps = 1 + self.num_training_steps=10 + +hyperparams = HyperParameters() + +def get_dataloaders(accelerator:Accelerator, hyperparams): + dataset = load_dataset(hyperparams.dataset_name, split="train") + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize(element): + outputs = tokenizer( + element["text"], + truncation=True, + 
padding=False, + max_length=hyperparams.max_seq_length, + return_overflowing_tokens=False, + return_length=False + ) + return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + + with accelerator.main_process_first(): + dataset = dataset.map( + tokenize, + batched=True, + remove_columns=dataset.column_names + ) + + # Simply pad to the multiple of 16 for both FP8 and BF16 precision + pad_to_multiple_of = 16 + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=False, + pad_to_multiple_of=pad_to_multiple_of, + ) + + dataloader_params = { + "batch_size": hyperparams.batch_size, + "collate_fn": data_collator, + "drop_last": True, + } + train_dataloader = DataLoader(dataset, **dataloader_params) + return train_dataloader + +def init_baseline_model(hyperparams): + # Init the model + config = AutoConfig.from_pretrained(hyperparams.model_name) + # make sure to use flash_attention to do iso comparison with TELlamaModel + config._attn_implementation = "flash_attention_2" + model = AutoModelForCausalLM.from_pretrained( + hyperparams.model_name, + config=config, + torch_dtype=torch.bfloat16, + ) + # Needed for the cases when using TELlamaForCausalLM. 
So adding here for 1:1 comparison + model.config.use_cache=False + + return model + +def init_te_llama_model(hyperparams): + # Init the model + from te_llama import TELlamaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + model = TELlamaForCausalLM.from_pretrained_local( + hyperparams.model_name, + config=config, + torch_dtype=torch.bfloat16, + ) + # Needed for the cases when using TELlamaForCausalLM + model.config.use_cache=False + + return model + +def wrap_with_accelerator(model, hyperparams): + # Create FP8 kwarg handler if required + fp8_kwarg_handler = [FP8RecipeKwargs(backend="te")] if hyperparams.mixed_precision == "fp8" else None + + # Init HF accelerator that's used for training + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision=hyperparams.mixed_precision, + kwargs_handlers=fp8_kwarg_handler + ) + #accelerator.print(f'State: {accelerator.state}') + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # Wrap model, optimizer/scheduler, dataloaders in accelerate + optimizer = AdamW(params = model.parameters(), lr=hyperparams.learning_rate) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=100, + num_training_steps=hyperparams.num_training_steps, + ) + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + return accelerator, model, optimizer, train_dataloader, lr_scheduler + +def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler): + model.train() + total_loss = 0 + optimizer.zero_grad() + train_dataloader = enumerate(train_dataloader) + + time_vals = [] + + for _ in range(hyperparams.num_training_steps): + step, batch = next(train_dataloader) + start_time = time.time() + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + total_loss += 
loss.detach().float() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + end_time = time.time() + total_time = end_time - start_time + time_vals.append(total_time) + + accelerator.end_training() + + # ignore the first couple of time vals + time_vals = time_vals[2:] + print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: {(sum(time_vals)/len(time_vals)) * 1000:.0f} milliseconds") + +def restart_jupyter_notebook(): + # Try restarting the Jupyter kernel + IPython.Application.instance().kernel.do_shutdown(True) + + # Check whether the device memory has been flushed + if torch.cuda.memory_allocated() != 0: + import warnings + warnings.warn("The device memory hasn't been flushed, trying with a second method!") + + # Try restarting the Jupyter kernel another way + # Restart the kernel + from IPython.core.display import HTML + HTML("") + + if torch.cuda.memory_allocated() != 0: + print("The device memory hasn't been flushed, try manually restarting the Jupyter kernel!") + + # Suppress the warnings + if not sys.warnoptions: + import warnings + warnings.simplefilter("ignore") + torch.set_warn_always(False) diff --git a/docs/index.rst b/docs/index.rst index a64aa729a0..d64cebbfa2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,6 +44,7 @@ Transformer Engine documentation examples/fp8_primer.ipynb examples/advanced_optimizations.ipynb + examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb .. 
toctree:: :hidden: From 2c14d6863d51140c00556ca87f31395278eed8bb Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Wed, 20 Mar 2024 12:54:29 -0700 Subject: [PATCH 087/427] Llama accelerate tutorial (#720) * tutorial and doc fixes Signed-off-by: Sudhakar Singh * remove extra code Signed-off-by: Sudhakar Singh * fix typos Signed-off-by: Sudhakar Singh --------- Signed-off-by: Sudhakar Singh --- docs/examples/te_llama/te_llama.py | 7 +- ...tutorial_accelerate_hf_llama_with_te.ipynb | 74 ++++++++++--------- docs/examples/te_llama/utils.py | 33 ++++++--- 3 files changed, 65 insertions(+), 49 deletions(-) diff --git a/docs/examples/te_llama/te_llama.py b/docs/examples/te_llama/te_llama.py index fba35ed30c..c73bed45b4 100644 --- a/docs/examples/te_llama/te_llama.py +++ b/docs/examples/te_llama/te_llama.py @@ -21,12 +21,12 @@ from transformers.utils.hub import get_checkpoint_shard_files @contextmanager -def replace_decoder(te_decodder_cls): +def replace_decoder(te_decoder_cls): """ Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`. 
""" original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer - transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls + transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls try: yield finally: @@ -56,6 +56,7 @@ def __init__(self, config, *args, **kwargs): normalization="RMSNorm", activation="swiglu", attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, ) te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads) self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda() @@ -84,7 +85,7 @@ class is monkey-patched with `TELlamaDecoderLayer` class before """ def __new__(cls, config: LlamaConfig): - with replace_decoder(te_decodder_cls=TELlamaDecoderLayer): + with replace_decoder(te_decoder_cls=TELlamaDecoderLayer): llama_for_causal_lm = LlamaForCausalLM(config) return llama_for_causal_lm diff --git a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb index 974077de57..178922c9d2 100644 --- a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb +++ b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1f37565e", + "id": "2cac9d39", "metadata": {}, "source": [ "# Accelerating a Hugging Face Llama 2 model with Transformer Engine\n", @@ -11,14 +11,14 @@ "\n", "Goal\n", "\n", - "This tutorial showcases how accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n", + "This tutorial showcases how to accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine 
library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n", "\n", "\n" ] }, { "cell_type": "markdown", - "id": "ab4c0b82", + "id": "401f7fb1", "metadata": {}, "source": [ "## Dependencies for this tutorial\n", @@ -35,7 +35,7 @@ }, { "cell_type": "markdown", - "id": "466ff515", + "id": "33bdb5fe", "metadata": {}, "source": [ "## Table of contents\n", @@ -53,7 +53,7 @@ }, { "cell_type": "markdown", - "id": "8e84bcaa", + "id": "7645f176", "metadata": {}, "source": [ "## From \"Transformer\" to \"Llama\" \n", @@ -89,7 +89,7 @@ }, { "cell_type": "markdown", - "id": "e31303c7", + "id": "d0cfa787", "metadata": {}, "source": [ "## Hugging Face's `LlamaModel`\n", @@ -166,7 +166,7 @@ }, { "cell_type": "markdown", - "id": "686df4ef", + "id": "f4f21369", "metadata": {}, "source": [ "## [Baseline] Running HF `LlamaModel` (Precision: `BF16`)\n", @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "107a8146", + "id": "24a8d0a5", "metadata": {}, "source": [ "
\n", @@ -206,8 +206,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "975f9184", + "execution_count": 1, + "id": "e36ff380", "metadata": {}, "outputs": [ { @@ -215,7 +215,7 @@ "output_type": "stream", "text": [ "10 finetuning steps complete!\n", - "Average time taken per step: 289 milliseconds\n" + "Average time taken per step: 315 milliseconds\n" ] } ], @@ -247,19 +247,19 @@ }, { "cell_type": "markdown", - "id": "c2d5b174", + "id": "a64f0f33", "metadata": {}, "source": [ "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n", "\n", "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", - "| HF (baseline) | BF16 | 289 | 1 |" + "| HF (baseline) | BF16 | 315 | 1 |" ] }, { "cell_type": "markdown", - "id": "a7d436bf", + "id": "d9898383", "metadata": {}, "source": [ "## [Improvement 1] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n", @@ -322,6 +322,7 @@ " normalization=\"RMSNorm\",\n", " activation=\"swiglu\",\n", " attn_input_format=\"bshd\",\n", + " num_gqa_groups=config.num_key_value_heads,\n", " )\n", " te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)\n", " self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()\n", @@ -339,10 +340,11 @@ "8. `fuse_qkv_params`: if set to True, TransformerLayer module exposes a single fused parameter for query-key-value. This enables optimizations such as QKV fusion without concatentations/splits and also enables the argument fuse_wgrad_accumulation.\n", "9. `normalization`: type of normalization applied. Default is `LayerNorm`.\n", "10. `activation`: type of activation used in the MLP block. Default is `gelu`.\n", - "11. 
`attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules. \n", + "11. `attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules.\n", + "12. `num_gqa_groups`: number of GQA groups in the transformer layer. Grouped Query Attention is described in [this paper](https://arxiv.org/pdf/2305.13245.pdf). This only affects the keys and values, not the queries. GQA-1 is equivalent to Multi-Query Attention ([MQA](https://arxiv.org/pdf/1911.02150.pdf)), while GQA-H is equivalent to MultiHead Attention, i.e. `num_gqa_groups = num_attention_heads`.\n", "\n", "\n", - "Further, note that `RotaryPositionEmbedding` is defined as part of the TE's `TransformerLayer` itself since it expects this rope cache if RoPE is used in the model. \n", + "Further, note that `RotaryPositionEmbedding` is defined as part of the `TELlamaDecoderLayer` (wrapper around TE's `TransformerLayer`) itself since it expects this rope cache if RoPE is used in the model. 
\n", "\n", "Let's revisit how `LlamaDecoderLayer`s form the core of the decoder layer stack in HF's llama implementation:\n", "```\n", @@ -422,12 +424,12 @@ "\n", "```\n", "@contextmanager\n", - "def replace_decoder(te_decodder_cls):\n", + "def replace_decoder(te_decoder_cls):\n", " \"\"\"\n", " Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.\n", " \"\"\"\n", " original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer\n", - " transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls\n", + " transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls\n", " try:\n", " yield\n", " finally:\n", @@ -446,7 +448,7 @@ " \"\"\"\n", "\n", " def __new__(cls, config: LlamaConfig):\n", - " with replace_decoder(te_decodder_cls=TELlamaDecoderLayer):\n", + " with replace_decoder(te_decoder_cls=TELlamaDecoderLayer):\n", " llama_for_causal_lm = LlamaForCausalLM(config)\n", " return llama_for_causal_lm\n", ".\n", @@ -530,7 +532,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "48dc8935", + "id": "4974b738", "metadata": {}, "outputs": [ { @@ -538,7 +540,7 @@ "output_type": "stream", "text": [ "10 finetuning steps complete!\n", - "Average time taken per step: 242 milliseconds\n" + "Average time taken per step: 252 milliseconds\n" ] } ], @@ -570,20 +572,20 @@ }, { "cell_type": "markdown", - "id": "3c3d228a", + "id": "85c78c7f", "metadata": {}, "source": [ - "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `LlamaDecoderLayer` gives a speedup of **19%** even when using only BF16 precision!\n", + "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `LlamaDecoderLayer` gives a speedup of **25%** even when using only BF16 precision!\n", "\n", "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", 
"|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", - "| HF (baseline) | BF16 | 289 | 1 |\n", - "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 242 | 1.19 |" + "| HF (baseline) | BF16 | 315 | 1 |\n", + "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 252 | 1.25 |" ] }, { "cell_type": "markdown", - "id": "b92d6792", + "id": "e2fb88e9", "metadata": {}, "source": [ "## [Improvement 2] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", @@ -608,8 +610,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "6bba7cc1", + "execution_count": 1, + "id": "8f2b752e", "metadata": {}, "outputs": [ { @@ -617,7 +619,7 @@ "output_type": "stream", "text": [ "10 finetuning steps complete!\n", - "Average time taken per step: 231 milliseconds\n" + "Average time taken per step: 226 milliseconds\n" ] } ], @@ -649,27 +651,27 @@ }, { "cell_type": "markdown", - "id": "602239d7", + "id": "67ec126c", "metadata": {}, "source": [ "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", - "| HF (baseline) | BF16 | 289 | 1 |\n", - "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 242 | 1.19 |\n", - "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 231 | 1.25 |\n", + "| HF (baseline) | BF16 | 315 | 1 |\n", + "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 252 | 1.25 |\n", + "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 226 | 1.39 |\n", "\n", "\n", - "After turning on FP8 precision, we get even more speedup of **25%**!" + "After turning on FP8 precision, we get even more speedup of almost **40%**!" 
] }, { "cell_type": "markdown", - "id": "372867d5", + "id": "41b80b0f", "metadata": {}, "source": [ "## Conclusion\n", "\n", - "Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides speedup over Hugging Face's native Llama 2 implementation. This needs careful initializing of model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speed up is even more pronounced!" + "Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides a speedup over Hugging Face's native Llama 2 implementation. This needs careful initialization of the model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speed up is even more pronounced!" 
] } ], diff --git a/docs/examples/te_llama/utils.py b/docs/examples/te_llama/utils.py index 04abe39b6a..54b329f12b 100644 --- a/docs/examples/te_llama/utils.py +++ b/docs/examples/te_llama/utils.py @@ -26,7 +26,9 @@ def __init__(self): self.batch_size = 8 self.max_seq_length = 256 self.gradient_accumulation_steps = 1 + self.num_warmup_steps=5 self.num_training_steps=10 + hyperparams = HyperParameters() @@ -132,11 +134,9 @@ def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, optimizer.zero_grad() train_dataloader = enumerate(train_dataloader) - time_vals = [] - - for _ in range(hyperparams.num_training_steps): + # Warmup iters + for _ in range(hyperparams.num_warmup_steps): step, batch = next(train_dataloader) - start_time = time.time() with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss @@ -146,15 +146,28 @@ def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler.step() optimizer.zero_grad() - end_time = time.time() - total_time = end_time - start_time - time_vals.append(total_time) + # Get the timers ready + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start.record() + # Training iters + for _ in range(hyperparams.num_training_steps): + step, batch = next(train_dataloader) + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + total_loss += loss.detach().float() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + torch.cuda.synchronize() + end.record() accelerator.end_training() - # ignore the first couple of time vals - time_vals = time_vals[2:] - print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: {(sum(time_vals)/len(time_vals)) * 1000:.0f} milliseconds") + print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: 
{(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} milliseconds") def restart_jupyter_notebook(): # Try restarting the Jupyter kernel From 297459bd08e1b791ca7a2872cfa8582220477782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Sun, 31 Mar 2024 12:11:43 -0700 Subject: [PATCH 088/427] Llama tutorial fixes (#730) Llama tutorial fixes - all Signed-off-by: Pawel Gadzinski Co-authored-by: Pawel Gadzinski --- docs/examples/te_llama/te_llama.py | 46 +++++++++++-------- ...tutorial_accelerate_hf_llama_with_te.ipynb | 9 ++-- docs/examples/te_llama/utils.py | 1 + 3 files changed, 34 insertions(+), 22 deletions(-) mode change 100644 => 100755 docs/examples/te_llama/te_llama.py mode change 100644 => 100755 docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb mode change 100644 => 100755 docs/examples/te_llama/utils.py diff --git a/docs/examples/te_llama/te_llama.py b/docs/examples/te_llama/te_llama.py old mode 100644 new mode 100755 index c73bed45b4..aa23b638f0 --- a/docs/examples/te_llama/te_llama.py +++ b/docs/examples/te_llama/te_llama.py @@ -56,7 +56,7 @@ def __init__(self, config, *args, **kwargs): normalization="RMSNorm", activation="swiglu", attn_input_format="bshd", - num_gqa_groups=config.num_key_value_heads, + num_gqa_groups=config.num_key_value_heads ) te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads) self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda() @@ -121,12 +121,12 @@ def from_pretrained_local(cls, pretrained_model_name_or_path, *args, config, **k assert not isinstance(resolved_archive_file, list) resolved_archive_file = [resolved_archive_file] - error_msgs = [] for shard_file in resolved_archive_file: state_dict = load_state_dict(shard_file) - replaced_layers = replace_params(state_dict, vanilla_model.state_dict()) - - error_msgs += _load_state_dict_into_model(vanilla_model, state_dict, start_prefix="") + # 
replace_params copies parameters relevant only to TransformerEngine + replace_params(state_dict, vanilla_model.state_dict(), config) + # _load_state_dict_into_model copies parameters other than those in TransformerEngine + _load_state_dict_into_model(vanilla_model, state_dict, start_prefix="") # Force mem release. Taken from huggingface code del state_dict @@ -134,7 +134,7 @@ def from_pretrained_local(cls, pretrained_model_name_or_path, *args, config, **k return vanilla_model -def replace_params(hf_state_dict, te_state_dict): +def replace_params(hf_state_dict, te_state_dict, config): # collect all layer prefixes to update all_layer_prefixes = set() for param_key in hf_state_dict.keys(): @@ -142,32 +142,40 @@ def replace_params(hf_state_dict, te_state_dict): m = re.match(layer_prefix_pat, param_key) if m is not None: all_layer_prefixes.add(m.group()) + + for layer_prefix in all_layer_prefixes: # When loading weights into models with less number of layers, skip the - # copy if the corresponding layer doesn't exist in TE model - if layer_prefix + 'self_attention.layernorm_qkv.layer_norm_weight' in te_state_dict: + # copy if the corresponding layer doesn't exist in HF model + if layer_prefix + 'input_layernorm.weight' in hf_state_dict: te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.layer_norm_weight'].data[:] = hf_state_dict[layer_prefix + 'input_layernorm.weight'].data[:] - if layer_prefix + 'self_attention.layernorm_qkv.query_weight' in te_state_dict: + if layer_prefix + 'self_attn.q_proj.weight' in hf_state_dict: te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.query_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.q_proj.weight'].data[:] - if layer_prefix + 'self_attention.layernorm_qkv.key_weight' in te_state_dict: + if layer_prefix + 'self_attn.k_proj.weight' in hf_state_dict: te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.key_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.k_proj.weight'].data[:] - if 
layer_prefix + 'self_attention.layernorm_qkv.value_weight' in te_state_dict: + if layer_prefix + 'self_attn.v_proj.weight' in hf_state_dict: te_state_dict[layer_prefix + 'self_attention.layernorm_qkv.value_weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.v_proj.weight'].data[:] - if layer_prefix + 'self_attention.proj.weight' in te_state_dict: + if layer_prefix + 'self_attn.o_proj.weight' in hf_state_dict: te_state_dict[layer_prefix + 'self_attention.proj.weight'].data[:] = hf_state_dict[layer_prefix + 'self_attn.o_proj.weight'].data[:] - if layer_prefix + 'layernorm_mlp.layer_norm_weight' in te_state_dict: + if layer_prefix + 'post_attention_layernorm.weight' in hf_state_dict: te_state_dict[layer_prefix + 'layernorm_mlp.layer_norm_weight'].data[:] = hf_state_dict[layer_prefix + 'post_attention_layernorm.weight'].data[:] - - if layer_prefix + 'layernorm_mlp.fc1_weight' in te_state_dict: - te_state_dict[layer_prefix + 'layernorm_mlp.fc1_weight'].data[:] = torch.cat((hf_state_dict[layer_prefix + 'mlp.gate_proj.weight'].data[:], hf_state_dict[layer_prefix + 'mlp.up_proj.weight'].data[:]), dim=0) - - if layer_prefix + 'layernorm_mlp.fc2_weight' in te_state_dict: + + # It may happen that gate_proj.weight and up_proj.weight will be in the different files, so we need to + # load them separately. 
+ if layer_prefix + 'mlp.gate_proj.weight' in hf_state_dict: + te_state_dict[layer_prefix + 'layernorm_mlp.fc1_weight'].data[:config.intermediate_size] = \ + hf_state_dict[layer_prefix + 'mlp.gate_proj.weight'].data + + if layer_prefix + 'mlp.up_proj.weight' in hf_state_dict: + te_state_dict[layer_prefix + 'layernorm_mlp.fc1_weight'].data[config.intermediate_size:] = \ + hf_state_dict[layer_prefix + 'mlp.up_proj.weight'].data + + if layer_prefix + 'mlp.down_proj.weight' in hf_state_dict: te_state_dict[layer_prefix + 'layernorm_mlp.fc2_weight'].data[:] = hf_state_dict[layer_prefix + 'mlp.down_proj.weight'].data[:] - return all_layer_prefixes \ No newline at end of file diff --git a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb old mode 100644 new mode 100755 index 178922c9d2..cc77b484f9 --- a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb +++ b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb @@ -231,7 +231,8 @@ "\n", "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", "## !!! `model_name` attr must point to the location of the model weights !!!\n", - "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/\n", + "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/ and then coverted to the HuggingFace format.\n", + "## Instructions for conversion are available on the website https://ai.meta.com/blog/5-steps-to-getting-started-with-llama-2/ - steps 1 and 2.\n", "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n", "hyperparams.mixed_precision = \"bf16\"\n", "\n", @@ -556,7 +557,8 @@ "\n", "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", "## !!! 
`model_name` attr must point to the location of the model weights !!!\n", - "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/\n", + "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/ and then coverted to the HuggingFace format.\n", + "## Instructions for conversion are available on the website https://ai.meta.com/blog/5-steps-to-getting-started-with-llama-2/ - steps 1 and 2.\n", "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n", "hyperparams.mixed_precision = \"bf16\"\n", "\n", @@ -635,7 +637,8 @@ "\n", "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", "## !!! `model_name` attr must point to the location of the model weights !!!\n", - "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/\n", + "## Weights can be downloaded from: https://llama.meta.com/llama-downloads/ and then coverted to the HuggingFace format.\n", + "## Instructions for conversion are available on the website https://ai.meta.com/blog/5-steps-to-getting-started-with-llama-2/ - steps 1 and 2.\n", "hyperparams.model_name = \"\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/llama/weights\"\n", "hyperparams.mixed_precision = \"fp8\"\n", "\n", diff --git a/docs/examples/te_llama/utils.py b/docs/examples/te_llama/utils.py old mode 100644 new mode 100755 index 54b329f12b..9c36e5bd17 --- a/docs/examples/te_llama/utils.py +++ b/docs/examples/te_llama/utils.py @@ -91,6 +91,7 @@ def init_te_llama_model(hyperparams): # Init the model from te_llama import TELlamaForCausalLM config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" model = TELlamaForCausalLM.from_pretrained_local( hyperparams.model_name, config=config, From 35a8754cb284f15e3f3768f7164564bd20b597c1 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 28 Mar 2024 21:53:59 -0700 Subject: [PATCH 089/427] [PyTorch] Fix backward compatibility with checkpoint API (#740) * Fix backward compatibility with checkpoint API Signed-off-by: Kirthi Shankar Sivamani * review comments and fix lint Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/distributed.py | 37 +++++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index 67fc4db0d0..6a2a801efd 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -516,6 +516,12 @@ def checkpoint( kwargs : dict dictionary of string keys for keyword arguments to :attr:`function`. """ + only_tensor_args = True + for arg in args: + if not isinstance(arg, torch.Tensor): + only_tensor_args = False + break + # Pop out te.distributed.checkpoint() arguments global _USE_REENTRANT_ACTIVATION_RECOMPUTE _USE_REENTRANT_ACTIVATION_RECOMPUTE = kwargs.pop("use_reentrant", True) @@ -523,6 +529,27 @@ def checkpoint( tp_group = kwargs.pop("tp_group", None) get_rng_state_tracker = kwargs.pop("get_rng_state_tracker", None) + # Ensure backward compatibility. 
+ if not only_tensor_args: + warnings.warn( + "Passing non-tensor non-keyword arguments is deprecated and support will be removed in " + "future releases of TransformerEngine. `distribute_saved_activations`, `tp_group`, and " + "`get_rng_state_tracker` must be passed as keyword arguments to `checkpoint`.", + DeprecationWarning, stacklevel=2, + ) + assert len(args) > 3, "Incorrect number of arguments for deprecated `checkpoint` API." + assert ( + isinstance(args[0], bool) and callable(args[1]) + and isinstance(args[2], None | dist_group_type) + ), "Incorrect arguments for deprecated `checkpoint` API." + for arg in args[3:]: + assert ( + isinstance(arg, None | torch.Tensor) + ), f"Expected tensor argument, found {type(arg)}." + + distribute_saved_activations, get_rng_state_tracker, tp_group = args[:3] # pylint: disable=unbalanced-tuple-unpacking + args = args[3:] + # Trigger the native PyTorch checkpoint if: # 1. `function` is a `torch.nn.Module` # AND @@ -555,16 +582,6 @@ def checkpoint( assert torch.distributed.is_initialized(), "torch.distributed is not initialized." tp_group = torch.distributed.GroupMember.WORLD if tp_group is None else tp_group - # Make sure at least one tensor input has `requires_grad=True` - input_requires_grad = False - for arg in args: - if isinstance(arg, torch.Tensor) and arg.requires_grad: - input_requires_grad = True - break - assert input_requires_grad, ( - "`use_reentrant=True` requires at least one input tensor with `requires_grad=True`." 
- ) - return _CheckpointFunction.apply( function, distribute_saved_activations, From 6a9edc38bf9b941b7d369af5103fa8fe0b121d61 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 4 Apr 2024 09:59:14 -0700 Subject: [PATCH 090/427] [PyTorch] Fix backward compatibility for checkpoint API (#748) * Args can be None Signed-off-by: Kirthi Shankar Sivamani * Fix other arg types Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/distributed.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index 6a2a801efd..239cecf39b 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -516,12 +516,6 @@ def checkpoint( kwargs : dict dictionary of string keys for keyword arguments to :attr:`function`. """ - only_tensor_args = True - for arg in args: - if not isinstance(arg, torch.Tensor): - only_tensor_args = False - break - # Pop out te.distributed.checkpoint() arguments global _USE_REENTRANT_ACTIVATION_RECOMPUTE _USE_REENTRANT_ACTIVATION_RECOMPUTE = kwargs.pop("use_reentrant", True) @@ -530,23 +524,14 @@ def checkpoint( get_rng_state_tracker = kwargs.pop("get_rng_state_tracker", None) # Ensure backward compatibility. - if not only_tensor_args: + if (len(args) > 3 and isinstance(args[0], bool) and callable(args[1]) + and isinstance(args[2], None | dist_group_type)): warnings.warn( "Passing non-tensor non-keyword arguments is deprecated and support will be removed in " "future releases of TransformerEngine. `distribute_saved_activations`, `tp_group`, and " "`get_rng_state_tracker` must be passed as keyword arguments to `checkpoint`.", DeprecationWarning, stacklevel=2, ) - assert len(args) > 3, "Incorrect number of arguments for deprecated `checkpoint` API." 
- assert ( - isinstance(args[0], bool) and callable(args[1]) - and isinstance(args[2], None | dist_group_type) - ), "Incorrect arguments for deprecated `checkpoint` API." - for arg in args[3:]: - assert ( - isinstance(arg, None | torch.Tensor) - ), f"Expected tensor argument, found {type(arg)}." - distribute_saved_activations, get_rng_state_tracker, tp_group = args[:3] # pylint: disable=unbalanced-tuple-unpacking args = args[3:] From 1187e655aaa1ec58150a86dc1b3c1de44d90bcd8 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 16 Apr 2024 08:58:16 -0700 Subject: [PATCH 091/427] [PyTorch] Use __torch_function__ as a class method (#783) Use torch function as a class method Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/float8_tensor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/float8_tensor.py b/transformer_engine/pytorch/float8_tensor.py index 9923d24a42..c4aebf1a8b 100644 --- a/transformer_engine/pytorch/float8_tensor.py +++ b/transformer_engine/pytorch/float8_tensor.py @@ -766,5 +766,8 @@ def _set_data(self, tensor: torch.Tensor) -> None: _transpose_invalid = property(**_make_fp8_attr_property_funcs("transpose_invalid")) _scale_inv = property(**_make_fp8_attr_property_funcs("scale_inv")) - # Do not force the Float8Tensor type on the returned tensor - __torch_function__ = torch._C._disabled_torch_function_impl + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return torch._C._disabled_torch_function_impl(func, types, args, kwargs) From 09d576df5c1879d8554197045eaa18517014f8b7 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 17 Apr 2024 09:02:41 -0700 Subject: [PATCH 092/427] [PyTorch] Misc fixes for release_v1.6 (#784) * fixes; docs Signed-off-by: Kirthi Shankar Sivamani * Check for FP8 Signed-off-by: Kirthi Shankar Sivamani * Fix LoRa-like use cases Signed-off-by: Kirthi Shankar Sivamani * 
Reviews Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/pytorch/module/layernorm_linear.py | 9 +++++++-- transformer_engine/pytorch/module/layernorm_mlp.py | 8 ++++++-- transformer_engine/pytorch/module/linear.py | 9 +++++++-- transformer_engine/pytorch/utils.py | 8 ++++++++ 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 5df4950276..a4e6b8c5b9 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -28,6 +28,7 @@ cast_if_needed, assert_dim_for_fp8_exec, clear_tensor_data, + requires_grad, ) from ..distributed import ( set_tensor_model_parallel_attributes, @@ -328,7 +329,11 @@ def forward( ctx.requires_dgrad = inp.requires_grad ctx.normalization = normalization ctx.primary_weights_in_fp8 = primary_weights_in_fp8 - ctx.is_first_module = FP8GlobalStateManager.is_first_fp8_module() + ctx.reduce_and_update_bwd_fp8_tensors = False + if ctx.fp8 and requires_grad(inp, ln_weight, ln_bias, weight, bias): + ctx.reduce_and_update_bwd_fp8_tensors = ( + ctx.reduce_and_update_bwd_fp8_tensors or + FP8GlobalStateManager.is_first_fp8_module()) # Row Parallel Linear if parallel_mode == "row" and sequence_parallel: @@ -661,7 +666,7 @@ def backward( else: wgrad = None - if ctx.is_first_module and not is_graph_capturing(): + if ctx.reduce_and_update_bwd_fp8_tensors and not is_graph_capturing(): FP8GlobalStateManager.reduce_and_update_fp8_tensors(forward=False) return ( diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index 6efb72b8db..9b80ea3a21 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -33,6 +33,7 @@ cast_if_needed, assert_dim_for_fp8_exec, clear_tensor_data, + 
requires_grad, ) from ..distributed import ( set_tensor_model_parallel_attributes, @@ -544,7 +545,10 @@ def forward( ctx.requires_dgrad = inp.requires_grad ctx.normalization = normalization ctx.primary_weights_in_fp8 = primary_weights_in_fp8 - ctx.is_first_module = FP8GlobalStateManager.is_first_fp8_module() + ctx.reduce_and_update_bwd_fp8_tensors = False + if ctx.fp8 and requires_grad( + inp, ln_weight, ln_bias, fc1_weight, fc2_weight, fc1_bias, fc2_bias): + ctx.reduce_and_update_bwd_fp8_tensors = FP8GlobalStateManager.is_first_fp8_module() # Row Parallel Linear if ub_overlap_rs: @@ -1121,7 +1125,7 @@ def backward( else: fc2_wgrad = None - if ctx.is_first_module and not is_graph_capturing(): + if ctx.reduce_and_update_bwd_fp8_tensors and not is_graph_capturing(): FP8GlobalStateManager.reduce_and_update_fp8_tensors(forward=False) return ( diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 3c055270b0..9829719c86 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -25,6 +25,7 @@ assert_dim_for_fp8_exec, clear_tensor_data, init_method_constant, + requires_grad, ) from ..distributed import ( set_tensor_model_parallel_attributes, @@ -320,7 +321,11 @@ def forward( ctx.tp_size = tp_size ctx.requires_dgrad = inp.requires_grad ctx.primary_weights_in_fp8 = primary_weights_in_fp8 - ctx.is_first_module = FP8GlobalStateManager.is_first_fp8_module() + ctx.reduce_and_update_bwd_fp8_tensors = False + if ctx.fp8 and requires_grad(inp, weight, bias): + ctx.reduce_and_update_bwd_fp8_tensors = ( + ctx.reduce_and_update_bwd_fp8_tensors or + FP8GlobalStateManager.is_first_fp8_module()) # Row Parallel Linear if ub_overlap_rs: @@ -530,7 +535,7 @@ def backward( else: wgrad = None - if ctx.is_first_module and not is_graph_capturing(): + if ctx.reduce_and_update_bwd_fp8_tensors and not is_graph_capturing(): FP8GlobalStateManager.reduce_and_update_fp8_tensors(forward=False) 
return ( diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 09eb433957..25e6a74b34 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -8,6 +8,14 @@ import torch +def requires_grad(*tensors: Tuple[Optional[torch.Tensor], ...]) -> None: + """Check if any of the given tensors require gradient.""" + for tensor in tensors: + if tensor is not None and tensor.requires_grad: + return True + return False + + def clear_tensor_data(*tensors: Tuple[Optional[torch.Tensor], ...]) -> None: """ Trick to deallocate tensor memory when delete operation does not From 4f5723e8657a078e500bc8650b13709fe3c05fd4 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Fri, 19 Apr 2024 14:00:14 -0700 Subject: [PATCH 093/427] [PyTorch] Fix typo from #768 (#795) Fix typo Signed-off-by: Tim Moon --- transformer_engine/pytorch/module/base.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 7e0cf5c106..3c5887d942 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -684,13 +684,16 @@ def grad_output_preprocess( grad_output_c = ctx.ub_obj_gradout.get_ubuf_output(0) else: grad_output_c = torch.empty_like(grad_output_mat, dtype=torch.uint8) - cast_to_fp8( - grad_output_mat, - ctx.fp8_meta["scaling_bwd"], - tex.FP8BwdTensors.GRAD_OUTPUT1, - fp8_dtype_backward, - out=grad_output_c, - ) + if not isinstance(grad_output_mat, Float8Tensor): + cast_to_fp8( + grad_output_mat, + ctx.fp8_meta["scaling_bwd"], + tex.FP8BwdTensors.GRAD_OUTPUT1, + fp8_dtype_backward, + out=grad_output_c, + ) + else: + grad_output_c = grad_output_mat if not ctx.ub_overlap_ag: grad_output_c, _ = gather_along_first_dim(grad_output_c, ctx.tp_group) grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) From 
78358957f1b656c7184b4002b51d201468f3876b Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:30:19 -0700 Subject: [PATCH 094/427] [JAX] Cherry-pick #785 and #780 (#800) * [JAX] Fixing CI failure due to incorrect use of `static_argnums` in jax.jit (#785) * fixed static argnums for jax.jit in single gpu encoder test, changed warning filtering for pytest Signed-off-by: Alp Dener * propagating the fix to the JAX mnist example Signed-off-by: Alp Dener * fixed missing space ibetween flags i QAA scripts Signed-off-by: Alp Dener * added TE warnings into the ignore list Signed-off-by: Alp Dener --------- Signed-off-by: Alp Dener Signed-off-by: Tim Moon * [JAX] Allow multi-dims for dgamma and dbeta in LN descriptor. (#780) * Allow multi-dims for dgamma and dbeta in LN descriptor. Signed-off-by: Ming Huang * Fix the jit error in examples/jax Signed-off-by: Ming Huang --------- Signed-off-by: Ming Huang Signed-off-by: Tim Moon --------- Signed-off-by: Alp Dener Signed-off-by: Tim Moon Signed-off-by: Ming Huang Co-authored-by: Alp Dener Co-authored-by: Ming-Xu Huang --- .../jax/encoder/test_single_gpu_encoder.py | 2 +- examples/jax/mnist/test_single_gpu_mnist.py | 2 +- qa/L0_jax_unittest/test.sh | 9 +-- qa/L1_jax_distributed_unittest/test.sh | 2 +- tests/jax/pytest.ini | 28 +++++++++ transformer_engine/jax/cpp_extensions.py | 25 ++++---- transformer_engine/jax/csrc/modules.cpp | 60 +++++++++++-------- transformer_engine/jax/csrc/modules.h | 16 +++-- 8 files changed, 91 insertions(+), 53 deletions(-) create mode 100644 tests/jax/pytest.ini diff --git a/examples/jax/encoder/test_single_gpu_encoder.py b/examples/jax/encoder/test_single_gpu_encoder.py index 85e03342b2..b892437925 100644 --- a/examples/jax/encoder/test_single_gpu_encoder.py +++ b/examples/jax/encoder/test_single_gpu_encoder.py @@ -55,7 +55,7 @@ def __call__(self, x, mask, disable_dropout=False): return x -@partial(jax.jit, static_argnums=6) 
+@partial(jax.jit) def train_step(state, inputs, masks, labels, var_collect, rngs): """Computes gradients, loss and accuracy for a single batch.""" diff --git a/examples/jax/mnist/test_single_gpu_mnist.py b/examples/jax/mnist/test_single_gpu_mnist.py index dc28a9fd46..ae74a66337 100644 --- a/examples/jax/mnist/test_single_gpu_mnist.py +++ b/examples/jax/mnist/test_single_gpu_mnist.py @@ -74,7 +74,7 @@ def loss_fn(var_collect, disable_dropout=False): return grads, loss, accuracy -@partial(jax.jit, static_argnums=2) +@partial(jax.jit) def update_model(state, grads): """Update model params and FP8 meta.""" state = state.apply_gradients(grads=grads[PARAMS_KEY]) diff --git a/qa/L0_jax_unittest/test.sh b/qa/L0_jax_unittest/test.sh index 9f20769045..b640e3ee4f 100644 --- a/qa/L0_jax_unittest/test.sh +++ b/qa/L0_jax_unittest/test.sh @@ -5,14 +5,15 @@ set -xe : ${TE_PATH:=/opt/transformerengine} -pytest -Wignore -v $TE_PATH/tests/jax -k 'not distributed' + +pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/tests/jax -k 'not distributed' pip install -r $TE_PATH/examples/jax/mnist/requirements.txt pip install -r $TE_PATH/examples/jax/encoder/requirements.txt -pytest -Wignore -v $TE_PATH/examples/jax/mnist +pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/mnist # Make encoder tests to have run-to-run deterministic to have the stable CI results export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops" -pytest -Wignore -v $TE_PATH/examples/jax/encoder --ignore=$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py -pytest -Wignore -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py +pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder --ignore=$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py +pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py diff --git a/qa/L1_jax_distributed_unittest/test.sh b/qa/L1_jax_distributed_unittest/test.sh index 
51512d0744..1966f35208 100644 --- a/qa/L1_jax_distributed_unittest/test.sh +++ b/qa/L1_jax_distributed_unittest/test.sh @@ -5,5 +5,5 @@ set -xe : ${TE_PATH:=/opt/transformerengine} -pytest -Wignore -v $TE_PATH/tests/jax/test_distributed_* +pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/tests/jax/test_distributed_* diff --git a/tests/jax/pytest.ini b/tests/jax/pytest.ini new file mode 100644 index 0000000000..4da88e1476 --- /dev/null +++ b/tests/jax/pytest.ini @@ -0,0 +1,28 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +[pytest] +filterwarnings= + ignore:sharding_type of.*:DeprecationWarning + ignore:major_sharding_type of.*:DeprecationWarning + ignore:Fused attention is not enabled.*:UserWarning + ignore:The hookimpl.*:DeprecationWarning + ignore:xmap is an experimental feature and probably has bugs! + ignore:the imp module is deprecated in favour of importlib.*:DeprecationWarning + ignore:can't resolve package from __spec__ or __package__:ImportWarning + ignore:Using or importing the ABCs.*:DeprecationWarning + ignore:numpy.ufunc size changed + ignore:.*experimental feature + ignore:The distutils.* is deprecated.*:DeprecationWarning + ignore:backend and device argument on jit is deprecated.*:DeprecationWarning + ignore:ml_dtypes.float8_e4m3b11 is deprecated. 
+ ignore:np.find_common_type is deprecated.*:DeprecationWarning + ignore:jax.numpy.in1d is deprecated.*:DeprecationWarning + ignore:The numpy.array_api submodule is still experimental.*:UserWarning + ignore:case not machine-readable.*:UserWarning + ignore:not machine-readable.*:UserWarning + ignore:Special cases found for .* but none were parsed.*:UserWarning + ignore:jax.extend.mlir.dialects.mhlo is deprecated.*:DeprecationWarning + ignore:jax.experimental.maps and .* are deprecated.*:DeprecationWarning + ignore:The host_callback APIs are deprecated .*:DeprecationWarning diff --git a/transformer_engine/jax/cpp_extensions.py b/transformer_engine/jax/cpp_extensions.py index 08bcb94239..3356aafef5 100644 --- a/transformer_engine/jax/cpp_extensions.py +++ b/transformer_engine/jax/cpp_extensions.py @@ -385,8 +385,8 @@ def lowering(ctx, x, gamma, beta, *, zero_centered_gamma, epsilon): hidden_size, wkspace_aval.size, barrier_aval.size, - 0, # no dgamma_part in FWD pass - 0, # no dbeta_part in BWD pass + (0,), # no dgamma_part in FWD pass + (0,), # no dbeta_part in BWD pass jax_dtype_to_te_dtype(x_aval.dtype), jax_dtype_to_te_dtype(gamma_aval.dtype), jax_dtype_to_te_dtype(wkspace_aval.dtype), @@ -464,7 +464,6 @@ def partition(zero_centered_gamma, epsilon, mesh, arg_infos, result_infos): f"Enforcing no sharding of parameters hidden dim! 
" \ ) - x_sharding = NamedSharding(mesh, PartitionSpec(*x_spec[:-1], None)) g_sharding = NamedSharding(mesh, PartitionSpec(None)) b_sharding = NamedSharding(mesh, PartitionSpec(None)) @@ -589,8 +588,8 @@ def lowering(ctx, dz, x, mu, rsigma, gamma, *, zero_centered_gamma, epsilon): hidden_size, wkspace_aval.size, barrier_aval.size, - dgamma_part_aval.size, - dbeta_part_aval.size, + dgamma_part_aval.shape, + dbeta_part_aval.shape, jax_dtype_to_te_dtype(x_aval.dtype), jax_dtype_to_te_dtype(gamma_aval.dtype), jax_dtype_to_te_dtype(wkspace_aval.dtype), @@ -791,8 +790,8 @@ def lowering(ctx, x, gamma, *, epsilon): hidden_size, wkspace_aval.size, barrier_aval.size, - 0, # no dgamma_part in FWD pass - 0, # no dbeta_part in BWD pass + (0,), # no dgamma_part in FWD pass + (0,), # no dbeta_part in BWD pass jax_dtype_to_te_dtype(x_aval.dtype), jax_dtype_to_te_dtype(gamma_aval.dtype), jax_dtype_to_te_dtype(wkspace_aval.dtype), @@ -968,8 +967,8 @@ def lowering(ctx, dz, x, rsigma, gamma, *, epsilon): hidden_size, wkspace_aval.size, barrier_aval.size, - dgamma_part_aval.size, - 0, # no dbeta_part for RMSnorm + dgamma_part_aval.shape, + (0,), # no dbeta_part for RMSnorm jax_dtype_to_te_dtype(x_aval.dtype), jax_dtype_to_te_dtype(gamma_aval.dtype), jax_dtype_to_te_dtype(wkspace_aval.dtype), @@ -3588,8 +3587,8 @@ def lowering(ctx, x, gamma, beta, amax, scale, scale_inv, *, out_dtype, zero_cen hidden_size, wkspace_aval.size, barrier_aval.size, - 0, # no dgamma_part in FWD pass - 0, # no dbeta_part in BWD pass + (0,), # no dgamma_part in FWD pass + (0,), # no dbeta_part in BWD pass jax_dtype_to_te_dtype(x_aval.dtype), jax_dtype_to_te_dtype(gamma_aval.dtype), jax_dtype_to_te_dtype(wkspace_aval.dtype), @@ -3840,8 +3839,8 @@ def lowering(ctx, x, gamma, amax, scale, scale_inv, *, out_dtype, epsilon): hidden_size, wkspace_aval.size, barrier_aval.size, - 0, # no dgamma_part in FWD pass - 0, # no dbeta_part in BWD pass + (0,), # no dgamma_part in FWD pass + (0,), # no dbeta_part in BWD pass 
jax_dtype_to_te_dtype(x_aval.dtype), jax_dtype_to_te_dtype(gamma_aval.dtype), jax_dtype_to_te_dtype(wkspace_aval.dtype), diff --git a/transformer_engine/jax/csrc/modules.cpp b/transformer_engine/jax/csrc/modules.cpp index 1c4c468d51..4ac6fa58b1 100644 --- a/transformer_engine/jax/csrc/modules.cpp +++ b/transformer_engine/jax/csrc/modules.cpp @@ -71,17 +71,28 @@ pybind11::bytes PackCustomCallCommonWkDescriptor(const std::vector &shap return PackOpaque(desc); } -pybind11::bytes PackCustomCallNormDescriptor(size_t batch_size, size_t hidden_size, - size_t wkspace_size, size_t barrier_size, - size_t *dgamma_part_sizes, size_t *dbeta_part_sizes, - DType x_dtype, DType w_dtype, DType wkspace_dtype, - DType barrier_dtype, DType dgamma_part_dtype, - DType dbeta_part_dtype, bool zero_centered_gamma, - float eps, int sm_margin) { - return PackOpaque(CustomCallNormDescriptor{ - batch_size, hidden_size, wkspace_size, barrier_size, dgamma_part_sizes, dbeta_part_sizes, - x_dtype, w_dtype, wkspace_dtype, barrier_dtype, dgamma_part_dtype, dbeta_part_dtype, - zero_centered_gamma, eps, sm_margin}); +pybind11::bytes PackCustomCallNormDescriptor( + size_t batch_size, size_t hidden_size, size_t wkspace_size, size_t barrier_size, + const std::vector &dgamma_part_shape, const std::vector &dbeta_part_shape, + DType x_dtype, DType w_dtype, DType wkspace_dtype, DType barrier_dtype, DType dgamma_part_dtype, + DType dbeta_part_dtype, bool zero_centered_gamma, float eps, int sm_margin) { + CustomCallNormDescriptor desc; + desc.batch_size = batch_size; + desc.hidden_size = hidden_size; + desc.wkspace_size = wkspace_size; + desc.barrier_size = barrier_size; + desc.dgamma_part_shape.from_vector(dgamma_part_shape); + desc.dbeta_part_shape.from_vector(dbeta_part_shape); + desc.x_dtype = x_dtype; + desc.w_dtype = w_dtype; + desc.wkspace_dtype = wkspace_dtype; + desc.barrier_dtype = barrier_dtype; + desc.dgamma_part_dtype = dgamma_part_dtype; + desc.dbeta_part_dtype = dbeta_part_dtype; + 
desc.zero_centered_gamma = zero_centered_gamma; + desc.eps = eps; + desc.sm_margin = sm_margin; + return PackOpaque(desc); } pybind11::bytes PackCustomCallSoftmaxDescriptor(size_t batch_size, size_t padding_size, @@ -529,7 +540,7 @@ pybind11::tuple GetLayerNormBackwardWorkspaceSizes(size_t batch_size, size_t hid } void LayerNormBackwardImpl(size_t batch_size, size_t hidden_size, size_t wkspace_size, - size_t barrier_size, size_t *dgamma_part_sizes, size_t *dbeta_part_sizes, + size_t barrier_size, Shape dgamma_part_shape, Shape dbeta_part_shape, bool zero_centered_gamma, float eps, void *input, DType in_dtype, void *weight, DType w_dtype, void *ograd, void *workspace, DType wkspace_dtype, void *barrier, DType barrier_dtype, void *mu, @@ -563,14 +574,14 @@ void LayerNormBackwardImpl(size_t batch_size, size_t hidden_size, size_t wkspace auto workspace_tensor = TensorWrapper(workspace, workspace_shape, wkspace_dtype); auto barrier_shape = std::vector{barrier_size}; auto barrier_tensor = TensorWrapper(barrier, barrier_shape, barrier_dtype); - auto dgamma_part_shape = std::vector{dgamma_part_sizes[0], dgamma_part_sizes[1]}; - auto dgamma_part_tensor = TensorWrapper(dgamma_part, dgamma_part_shape, dgamma_dtype); + auto dgamma_part_tensor = + TensorWrapper(dgamma_part, dgamma_part_shape.to_vector(), dgamma_dtype); if (is_layer_norm) { auto mu_tensor = TensorWrapper(mu, intermediates_shape, intermediates_dtype); auto dbeta_tensor = TensorWrapper(dbeta, weight_shape, w_dtype); - auto dbeta_part_shape = std::vector{dbeta_part_sizes[0], dbeta_part_sizes[1]}; - auto dbeta_part_tensor = TensorWrapper(dbeta_part, dbeta_part_shape, dbeta_dtype); + auto dbeta_part_tensor = + TensorWrapper(dbeta_part, dbeta_part_shape.to_vector(), dbeta_dtype); layernorm_bwd_func(dz_tensor.data(), x_tensor.data(), mu_tensor.data(), rsigma_tensor.data(), gamma_tensor.data(), xgrad_tensor.data(), @@ -664,8 +675,8 @@ void LayerNormBackward(cudaStream_t stream, void **buffers, const char *opaque, auto 
hidden_size = desc.hidden_size; auto wkspace_size = desc.wkspace_size; auto barrier_size = desc.barrier_size; - auto *dgamma_part_sizes = desc.dgamma_part_sizes; - auto *dbeta_part_sizes = desc.dbeta_part_sizes; + auto dgamma_part_shape = desc.dgamma_part_shape; + auto dbeta_part_shape = desc.dbeta_part_shape; auto in_dtype = desc.x_dtype; auto w_dtype = desc.w_dtype; auto wkspace_dtype = desc.wkspace_dtype; @@ -689,8 +700,8 @@ void LayerNormBackward(cudaStream_t stream, void **buffers, const char *opaque, auto *dgamma_part = buffers[10]; auto *dbeta_part = buffers[11]; - LayerNormBackwardImpl(batch_size, hidden_size, wkspace_size, barrier_size, dgamma_part_sizes, - dbeta_part_sizes, zero_centered_gamma, eps, input, in_dtype, weight, + LayerNormBackwardImpl(batch_size, hidden_size, wkspace_size, barrier_size, dgamma_part_shape, + dbeta_part_shape, zero_centered_gamma, eps, input, in_dtype, weight, w_dtype, ograd, workspace, wkspace_dtype, barrier, barrier_dtype, mu, rsigma, xgrad, wgrad, dbeta, dgamma_part, dgamma_part_dtype, dbeta_part, dbeta_part_dtype, stream); @@ -786,8 +797,9 @@ void RMSNormBackward(cudaStream_t stream, void **buffers, const char *opaque, si auto hidden_size = desc.hidden_size; auto wkspace_size = desc.wkspace_size; auto barrier_size = desc.barrier_size; - auto dgamma_part_sizes = desc.dgamma_part_sizes; - size_t dbeta_part_sizes[2] = {0, 0}; + auto dgamma_part_shape = desc.dgamma_part_shape; + Shape dbeta_part_shape; + dbeta_part_shape.from_vector({0, 0}); auto in_dtype = desc.x_dtype; auto w_dtype = desc.w_dtype; auto wkspace_dtype = desc.wkspace_dtype; @@ -797,8 +809,8 @@ void RMSNormBackward(cudaStream_t stream, void **buffers, const char *opaque, si auto eps = desc.eps; auto zero_centered_gamma = desc.zero_centered_gamma; - LayerNormBackwardImpl(batch_size, hidden_size, wkspace_size, barrier_size, dgamma_part_sizes, - dbeta_part_sizes, zero_centered_gamma, eps, input, in_dtype, weight, + LayerNormBackwardImpl(batch_size, hidden_size, 
wkspace_size, barrier_size, dgamma_part_shape, + dbeta_part_shape, zero_centered_gamma, eps, input, in_dtype, weight, w_dtype, ograd, workspace, wkspace_dtype, barrier, barrier_dtype, mu, rsigma, xgrad, wgrad, dbeta, dgamma_part, dgamma_part_dtype, dbeta_part, dbeta_part_dtype, stream); diff --git a/transformer_engine/jax/csrc/modules.h b/transformer_engine/jax/csrc/modules.h index e392931d04..04f0039b02 100644 --- a/transformer_engine/jax/csrc/modules.h +++ b/transformer_engine/jax/csrc/modules.h @@ -69,8 +69,8 @@ struct CustomCallNormDescriptor { size_t hidden_size; size_t wkspace_size; size_t barrier_size; - size_t *dgamma_part_sizes; // 2D tensor - size_t *dbeta_part_sizes; // 2D tensor + Shape dgamma_part_shape; + Shape dbeta_part_shape; DType x_dtype; DType w_dtype; DType wkspace_dtype; @@ -82,13 +82,11 @@ struct CustomCallNormDescriptor { int sm_margin; }; -pybind11::bytes PackCustomCallNormDescriptor(size_t batch_size, size_t hidden_size, - size_t wkspace_size, size_t barrier_size, - size_t *dgamma_part_sizes, size_t *dbeta_part_sizes, - DType x_dtype, DType w_dtype, DType wkspace_dtype, - DType barrier_dtype, DType dgamma_part_dtype, - DType dbeta_part_dtype, bool zero_centered_gamma, - float eps, int sm_margin); +pybind11::bytes PackCustomCallNormDescriptor( + size_t batch_size, size_t hidden_size, size_t wkspace_size, size_t barrier_size, + const std::vector &dgamma_part_shape, const std::vector &dbeta_part_shape, + DType x_dtype, DType w_dtype, DType wkspace_dtype, DType barrier_dtype, DType dgamma_part_dtype, + DType dbeta_part_dtype, bool zero_centered_gamma, float eps, int sm_margin); struct SoftmaxDescriptor { size_t batch_size; From 9e4091e742638721f35f67f710815cfb36309831 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 24 Apr 2024 09:20:31 -0700 Subject: [PATCH 095/427] [PyTorch] Avoid using LRU cache for cu_seqlens (#798) * Try using global buffer for cu_seqlens Signed-off-by: Kirthi Shankar Sivamani * Avoid using 
functools.lru_cache Signed-off-by: Kirthi Shankar Sivamani * fixes Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Vasudevan Rengasamy --- transformer_engine/pytorch/attention.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index f57b58d736..862ae8adf8 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -5,7 +5,6 @@ """Attention.""" import collections from contextlib import nullcontext -import functools from importlib.metadata import version import math import os @@ -265,8 +264,7 @@ def get_indices(max_seqlen: int, cu_seqlens: torch.Tensor) -> torch.Tensor: return indices - -@functools.lru_cache +_cu_seqlens_cache = {} def _get_full_cu_seqlens( batch_size: int, max_seqlen: int, @@ -277,13 +275,16 @@ def _get_full_cu_seqlens( All sequences in batch have the maximum sequence length. 
""" - return torch.arange( - 0, - (batch_size + 1) * max_seqlen, - step=max_seqlen, - dtype=torch.int32, - device=device, - ) + global _cu_seqlens_cache + if (batch_size, max_seqlen) not in _cu_seqlens_cache: + _cu_seqlens_cache[(batch_size, max_seqlen)] = torch.arange( + 0, + (batch_size + 1) * max_seqlen, + step=max_seqlen, + dtype=torch.int32, + device=device, + ) + return _cu_seqlens_cache[(batch_size, max_seqlen)] @jit_fuser From 090e72412e06f44fe43aa4c4564ae11469961c9a Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Fri, 19 Apr 2024 09:37:33 -0700 Subject: [PATCH 096/427] FP8 attention and all post fixes Signed-off-by: Kirthi Shankar Sivamani --- 3rdparty/cudnn-frontend | 2 +- qa/L0_pytorch_unittest/test.sh | 2 +- tests/pytorch/fused_attn/test_fused_attn.py | 594 ++++++-- tests/pytorch/test_numerics.py | 2 +- .../common/fused_attn/fused_attn.cpp | 71 +- .../fused_attn_f16_arbitrary_seqlen.cu | 23 +- .../fused_attn_f16_arbitrary_seqlen.h | 2 +- .../common/fused_attn/fused_attn_fp8.cu | 1205 ++++++++++++++++- .../common/fused_attn/fused_attn_fp8.h | 56 +- transformer_engine/common/fused_attn/utils.h | 7 +- transformer_engine/common/recipe/__init__.py | 21 +- transformer_engine/pytorch/attention.py | 853 ++++++++++-- .../pytorch/cpp_extensions/fused_attn.py | 78 +- .../pytorch/csrc/comm_gemm_overlap.h | 4 +- transformer_engine/pytorch/csrc/extensions.h | 9 + .../pytorch/csrc/extensions/attention.cu | 171 ++- transformer_engine/pytorch/float8_tensor.py | 89 +- transformer_engine/pytorch/fp8.py | 6 +- transformer_engine/pytorch/module/base.py | 57 +- .../pytorch/module/layernorm_linear.py | 46 +- transformer_engine/pytorch/module/linear.py | 148 +- transformer_engine/pytorch/utils.py | 9 +- 22 files changed, 2991 insertions(+), 464 deletions(-) diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index a86ad708db..1b0b5eac54 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit 
a86ad708db725e4d29919bb6fadf8e6cdfa5dc06 +Subproject commit 1b0b5eac540b7f8fd19b18f1e6b8427c95503348 diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 50f54cd714..ded45dd377 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -6,7 +6,7 @@ set -e : ${TE_PATH:=/opt/transformerengine} -pip install pytest==6.2.5 onnxruntime==1.13.1 +pip install pytest==7.2 onnxruntime==1.13.1 pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py diff --git a/tests/pytorch/fused_attn/test_fused_attn.py b/tests/pytorch/fused_attn/test_fused_attn.py index b2c8f69ef3..40cfdd34b7 100644 --- a/tests/pytorch/fused_attn/test_fused_attn.py +++ b/tests/pytorch/fused_attn/test_fused_attn.py @@ -2,6 +2,7 @@ # # See LICENSE for license information. +import math import functools from importlib.metadata import version import os @@ -12,9 +13,10 @@ import torch from transformer_engine.common import recipe -from transformer_engine.pytorch import TransformerLayer, fp8_autocast +from transformer_engine.pytorch import TransformerLayer, fp8_autocast, fp8_model_init from transformer_engine.pytorch.attention import ( DotProductAttention, + MultiheadAttention, RotaryPositionEmbedding, ) from transformer_engine.pytorch.constants import TE_DType @@ -939,52 +941,415 @@ def _run_transformer_layer( return out, inp.grad -model_configs_fp8 = { +model_configs_fp8_vs_f16 = { # test: b, h, hg, d, sq, skv, p, mask, bias - "fp8_1": ModelConfig(1, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"), - "fp8_2": ModelConfig(4, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"), + "fp8_9 ": ModelConfig(2, 24, 24, 128, 2048, 2048, 0.0, "no_mask", "no_bias"), + "fp8_10": ModelConfig(2, 24, 24, 128, 2048, 2048, 0.0, "causal", "no_bias"), + "fp8_11": ModelConfig(2, 24, 12, 128, 2048, 2048, 0.0, 
"no_mask", "no_bias"), + "fp8_12": ModelConfig(2, 24, 12, 128, 2048, 2048, 0.0, "causal", "no_bias"), + "fp8_13": ModelConfig(1, 32, 4, 128, 8192, 8192, 0.0, "no_mask", "no_bias"), + "fp8_14": ModelConfig(1, 32, 4, 128, 8192, 8192, 0.0, "causal", "no_bias"), } -param_types_fp8 = [torch.float16] +param_types_fp8_vs_f16 = [torch.float16, torch.bfloat16] +qkv_layout_fp8_vs_f16 = ['sbh3d', 'bshd_bshd_bshd', 'sbhd_sbhd_sbhd'] +qkv_format_fp8_vs_f16 = ['bshd', 'sbhd'] + +def _rmse(a, b): + return math.sqrt((torch.pow((a-b), 2)/a.numel()).sum()) @pytest.mark.skipif(_cudnn_version() < (8,9,3), reason="cuDNN 8.9.3+ is required.") @pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) @pytest.mark.skipif(get_device_compute_capability() != (9, 0), reason="FP8 tests require Hopper.") -@pytest.mark.parametrize("dtype", param_types_fp8) -@pytest.mark.parametrize("model", model_configs_fp8.keys()) -def test_dpa_fp8(dtype, model): - """Test FP8 dot product attention +@pytest.mark.parametrize("dtype", param_types_fp8_vs_f16) +@pytest.mark.parametrize("model", model_configs_fp8_vs_f16.keys()) +@pytest.mark.parametrize("qkv_format", qkv_format_fp8_vs_f16) +@pytest.mark.parametrize("input_layernorm", [True, False]) +@pytest.mark.parametrize("fp8_dpa_bwd", [True, False]) +def test_mha_fp8_vs_f16(dtype, model, qkv_format, input_layernorm, fp8_dpa_bwd): + os.environ["NVTE_FLASH_ATTN"] = "0" + os.environ["NVTE_FUSED_ATTN"] = "1" + config = model_configs_fp8_vs_f16[model] + + os.environ["NVTE_FP8_DPA_BWD"] = "1" if fp8_dpa_bwd else "0" + if _NVTE_DEBUG: + print() + print("[test_mha_fp8_vs_f16]: run with fp8_mha = True") + fused_attn_fwd_fp8, param_names, fused_attn_bwd_fp8 = _run_mha_fp8_vs_f16( + dtype, config, True, qkv_format, input_layernorm) + if _NVTE_DEBUG: + print() + print("[test_mha_fp8_vs_f16]: run with fp8_mha = False") + fused_attn_fwd_f16, param_names, fused_attn_bwd_f16 = _run_mha_fp8_vs_f16( + dtype, config, False, qkv_format, input_layernorm) + + tols = 
dict(atol=5e-1, rtol=5e-1) + rmse_tol = 0.1 + fwd_rmse = _rmse(fused_attn_fwd_fp8, fused_attn_fwd_f16) + fwd_range = max(fused_attn_fwd_fp8.max().item(), + fused_attn_fwd_f16.max().item()) - min(fused_attn_fwd_fp8.min().item(), + fused_attn_fwd_f16.min().item()) + if _NVTE_DEBUG: + print() + print('========== {:^25s} =========='.format('forward output')) + print('fused_attn_fwd_fp8 min {:.6f} max {:.6f}'.format( + fused_attn_fwd_fp8.min().item(),fused_attn_fwd_fp8.max().item())) + print('fused_attn_fwd_f16 min {:.6f} max {:.6f}'.format( + fused_attn_fwd_f16.min().item(), fused_attn_fwd_f16.max().item())) + print('fused_attn_fwd RMSE: {:.6f}'.format(fwd_rmse)) + try: + torch.testing.assert_close(fused_attn_fwd_fp8, fused_attn_fwd_f16, **tols) + except Exception as e: + print(e) + print() + assert(fwd_rmse < rmse_tol * fwd_range + ), "FWD RMSE {:.5f} is over tolerance {:.5f} ({:.5f} * {:.5f})".format( + fwd_rmse, rmse_tol * fwd_range, rmse_tol, fwd_range) + for i in range(len(param_names[:1])): + bwd_rmse = _rmse(fused_attn_bwd_fp8[i], fused_attn_bwd_f16[i]) + bwd_range = max(fused_attn_bwd_fp8[i].max().item(), + fused_attn_bwd_f16[i].max().item()) - min(fused_attn_bwd_fp8[i].min().item(), + fused_attn_bwd_f16[i].min().item()) + if _NVTE_DEBUG: + print() + print('========== {:^25s} =========='.format(param_names[i])) + print('fused_attn_bwd_fp8[{}] min {:.6f} max {:.6f}'.format(i, + fused_attn_bwd_fp8[i].min().item(), fused_attn_bwd_fp8[i].max().item())) + print('fused_attn_bwd_f16[{}] min {:.6f} max {:.6f}'.format(i, + fused_attn_bwd_f16[i].min().item(), fused_attn_bwd_f16[i].max().item())) + print('fused_attn_bwd RMSE[{}]: {:.6f}'.format(i, bwd_rmse)) + try: + torch.testing.assert_close(fused_attn_bwd_fp8[i], fused_attn_bwd_f16[i], **tols) + except Exception as e: + print(e) + print() + assert(bwd_rmse < rmse_tol * bwd_range + ), "BWD RMSE {:.5f} is over tolerance {:.5f} ({:.5f} * {:.5f})".format( + bwd_rmse, rmse_tol * bwd_range, rmse_tol, bwd_range) + +def 
_run_mha_fp8_vs_f16(dtype, config, fp8_mha, qkv_format, input_layernorm): + reset_rng_states() + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker: + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER - FusedAttention uses fused_attn_fwd/bwd_qkvpacked from cpp_extensions, - and UnfusedDotProductAttention uses plain PyTorch operations in FP16 - and converts inputs/outputs from/to FP8. + fp8_recipe = recipe.DelayedScaling( + margin=0, + interval=1, + fp8_format=recipe.Format.HYBRID, + amax_history_len=1, + amax_compute_algo="most_recent", + fp8_dpa=fp8_mha, + fp8_mha=fp8_mha, + ) - """ + with fp8_model_init(enabled=fp8_mha): + mha = (MultiheadAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_heads, + kv_channels=config.head_dim, + num_gqa_groups=config.num_gqa_groups, + attention_dropout=config.dropout_p, + layer_number=1, + bias=True, + get_rng_state_tracker=get_dummy_cuda_rng_tracker, + params_dtype=dtype, + input_layernorm=input_layernorm, + fuse_qkv_params=True, + attention_type="self", + qkv_weight_interleaved=True, + qkv_format=qkv_format, + ).to(dtype=dtype, device="cuda") + ) - config = model_configs_fp8[model] + seqlens_q = torch.full([config.batch_size], config.max_seqlen_q, + dtype=torch.int32, device="cuda") + seqlens_kv = torch.full([config.batch_size], config.max_seqlen_kv, + dtype=torch.int32, device="cuda") + cu_seqlens_q = torch.zeros(config.batch_size + 1, dtype=torch.int32, device="cuda") + cu_seqlens_kv = torch.zeros(config.batch_size + 1, dtype=torch.int32, device="cuda") + cu_seqlens_q[1:] = torch.cumsum(seqlens_q, dim=0) + cu_seqlens_kv[1:] = torch.cumsum(seqlens_kv, dim=0) - # Skip if not supported - fused_attn_supported, fused_attn_backend = _is_fused_attention_supported( - config, dtype) - if not fused_attn_supported: - pytest.skip("FusedAttention does not support this 
model config") + dim_to_num = { + 'b' : config.batch_size, + 'sq' : config.max_seqlen_q, + 'skv': config.max_seqlen_kv, + 'h' : config.num_heads, + 'hg' : config.num_gqa_groups, + 'd' : config.head_dim, + 't' : cu_seqlens_q[-1], + 'tg' : cu_seqlens_kv[-1], + '3' : 3, + '2' : 2, + '1' : 1, + } + layout = '_'.join(qkv_format) + layout = layout.replace('s', 'sq') + tensor_shape = [dim_to_num[j] for j in layout.split('_')] + tensor = 0.01 * torch.randint(-100, 100, tensor_shape, dtype=dtype, device="cuda") + hidden_states = tensor.view(*tensor.shape[:-2], -1) + hidden_states.requires_grad = True + tensor = 0.01 * torch.randn(tensor_shape, dtype=dtype, device="cuda") + out_grad = tensor.view(*tensor.shape[:-2], -1) + + with fp8_autocast(enabled=fp8_mha, fp8_recipe=fp8_recipe): + out = mha(hidden_states, + attn_mask_type=config.attn_mask_type, + checkpoint_core_attention=False, + core_attention_bias_type=config.attn_bias_type, + is_first_microbatch=None, + ) + out.backward(out_grad) - # Run dot-product attention with different backends - fused_attn_fwd, fused_attn_bwd = _run_dpa_fp8( - dtype, config, "FusedAttention") - unfused_attn_fwd, unfused_attn_bwd = _run_dpa_fp8_ref( - dtype, config, "UnfusedDotProductAttention") + param_names = [] + param_names.append('hidden_states.grad') + params = [] + params.append(hidden_states) + for name, param in mha.named_parameters(): + if param.requires_grad: + param_names.append(name+'.grad') + params.append(param) - tols = dict(atol=2.5e-2, rtol=2.5e-2) - torch.testing.assert_close(fused_attn_fwd, unfused_attn_fwd, **tols) - torch.testing.assert_close(fused_attn_bwd, unfused_attn_bwd, **tols) + return out, param_names, tuple(x.grad for x in params) -def _run_dpa_fp8(dtype, config, backend): - """Run FusedAttention FP8 backend, i.e. 
- fused_attn_fwd/bwd_qkvpacked from cpp_extensions""" +@pytest.mark.skipif(_cudnn_version() < (8,9,3), reason="cuDNN 8.9.3+ is required.") +@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) +@pytest.mark.skipif(get_device_compute_capability() != (9, 0), reason="FP8 tests require Hopper.") +@pytest.mark.parametrize("dtype", param_types_fp8_vs_f16) +@pytest.mark.parametrize("model", model_configs_fp8_vs_f16.keys()) +@pytest.mark.parametrize("qkv_layout", qkv_layout_fp8_vs_f16) +@pytest.mark.parametrize("fp8_dpa_bwd", [True, False]) +def test_dpa_fp8_vs_f16(dtype, model, qkv_layout, fp8_dpa_bwd): + config = model_configs_fp8_vs_f16[model] + + if (config.num_heads != config.num_gqa_groups and '3' in qkv_layout): + pytest.skip("qkv_layout not applicable for MQA/GQA"); + + os.environ["NVTE_FP8_DPA_BWD"] = "1" if fp8_dpa_bwd else "0" + if _NVTE_DEBUG: + print() + print("[test_dpa_fp8_vs_f16]: run with fp8_dpa = True") + fused_attn_fwd_fp8, fused_attn_bwd_fp8 = _run_dpa_fp8_vs_f16( + dtype, config, True, qkv_layout) + if _NVTE_DEBUG: + print("[test_dpa_fp8_vs_f16]: run with fp8_dpa = False") + fused_attn_fwd_f16, fused_attn_bwd_f16 = _run_dpa_fp8_vs_f16( + dtype, config, False, qkv_layout) + + tols = dict(atol=5e-1, rtol=5e-2) + if _NVTE_DEBUG: + print('[test_dpa_fp8_vs_f16]: ', tols) + print('fused_attn_fwd_fp8 min {:.6f} max {:.6f}'.format( + fused_attn_fwd_fp8.min().item(),fused_attn_fwd_fp8.max().item())) + print('fused_attn_fwd_f16 min {:.6f} max {:.6f}'.format( + fused_attn_fwd_f16.min().item(), fused_attn_fwd_f16.max().item())) + print('fused_attn_fwd RMSE: {:.6f}'.format( + _rmse(fused_attn_fwd_fp8, fused_attn_fwd_f16))) + torch.testing.assert_close(fused_attn_fwd_fp8, fused_attn_fwd_f16, **tols) + for i,_ in enumerate(fused_attn_bwd_f16): + if _NVTE_DEBUG: + print('fused_attn_bwd_fp8 min {:.6f} max {:.6f}'.format( + fused_attn_bwd_fp8[i].min().item(), fused_attn_bwd_fp8[i].max().item())) + print('fused_attn_bwd_f16 min {:.6f} max {:.6f}'.format( + 
fused_attn_bwd_f16[i].min().item(), fused_attn_bwd_f16[i].max().item())) + print('fused_attn_bwd RMSE: {:.6f}'.format( + _rmse(fused_attn_bwd_fp8[i], fused_attn_bwd_f16[i]))) + torch.testing.assert_close(fused_attn_bwd_fp8[i], fused_attn_bwd_f16[i], **tols) + + +def _run_dpa_fp8_vs_f16(dtype, config, fp8_dpa, qkv_layout): + reset_rng_states() + _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) + def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker: + """Get cuda rng tracker.""" + return _DUMMY_CUDA_RNG_STATE_TRACKER + + fp8_recipe = recipe.DelayedScaling( + margin=0, + interval=1, + fp8_format=recipe.Format.HYBRID, + amax_history_len=1, + amax_compute_algo="most_recent", + fp8_dpa=fp8_dpa, + ) + + qkv_format = ''.join([i for i in qkv_layout.split('_')[0] if i.isalpha()]) + with fp8_model_init(enabled=fp8_dpa): + dpa = ( + DotProductAttention( + config.num_heads, + config.head_dim, + num_gqa_groups=config.num_gqa_groups, + attention_dropout=config.dropout_p, + sequence_parallel=False, + tp_size=1, + get_rng_state_tracker=get_dummy_cuda_rng_tracker, + tp_group=None, + layer_number=1, + attention_type="self", + qkv_format=qkv_format, + ).to(dtype=dtype, device="cuda") + ) + + seqlens_q = torch.full([config.batch_size], config.max_seqlen_q, + dtype=torch.int32, device="cuda") + seqlens_kv = torch.full([config.batch_size], config.max_seqlen_kv, + dtype=torch.int32, device="cuda") + cu_seqlens_q = torch.zeros(config.batch_size + 1, dtype=torch.int32, device="cuda") + cu_seqlens_kv = torch.zeros(config.batch_size + 1, dtype=torch.int32, device="cuda") + cu_seqlens_q[1:] = torch.cumsum(seqlens_q, dim=0) + cu_seqlens_kv[1:] = torch.cumsum(seqlens_kv, dim=0) + + dim_to_num = { + 'b' : config.batch_size, + 'sq' : config.max_seqlen_q, + 'skv': config.max_seqlen_kv, + 'h' : config.num_heads, + 'hg' : config.num_gqa_groups, + 'd' : config.head_dim, + 't' : cu_seqlens_q[-1], + 'tg' : cu_seqlens_kv[-1], + '3' 
: 3, + '2' : 2, + '1' : 1, + } + inp = [] + for i,layout in enumerate(qkv_layout.split('_')): + layout = '_'.join(layout) + if i == 0: + layout = layout.replace('s', 'sq') + else: + layout = layout.replace('s', 'skv') + layout = layout.replace('h', 'hg') + layout = layout.replace('t', 'tg') + tensor_shape = [dim_to_num[j] for j in layout.split('_')] + tensor = 0.1 * torch.randn(tensor_shape, dtype=dtype, device="cuda") + tensor_count = 1 + split_dim = 0 + for dim, l in enumerate(layout.split('_')): + if l.isdigit(): + tensor_count = int(l) + split_dim = dim + break + tensors = torch.split(tensor, 1, dim=split_dim) if split_dim != 0 else [tensor] + for j in range(tensor_count): + if split_dim != 0: + inp.append(tensors[j].squeeze(split_dim)) + else: + inp.append(tensors[j]) + for i in range(3): + inp[i].requires_grad = True + + qkv_format_kv = '_'.join(qkv_format) + qkv_format_kv = qkv_format_kv.replace('s', 'sq') + out_grad_shape = [dim_to_num[i] for i in qkv_format_kv.split('_')] + out_grad_shape_new = [*out_grad_shape[:-2], out_grad_shape[-2] * out_grad_shape[-1]] + out_grad = 0.1 * torch.randn(out_grad_shape_new, dtype=dtype, device="cuda") + + with fp8_autocast(enabled=fp8_dpa, fp8_recipe=fp8_recipe): + out = dpa(inp[0], inp[1], inp[2], + qkv_format=qkv_format, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=config.max_seqlen_q, + max_seqlen_kv=config.max_seqlen_kv, + attn_mask_type=config.attn_mask_type, + checkpoint_core_attention=False, + core_attention_bias_type=config.attn_bias_type, + is_first_microbatch=True, + ) + out.backward(out_grad) + + return out, (inp[0].grad, inp[1].grad, inp[2].grad) + + +model_configs_fp8 = { + # test: b, h, hg, d, sq, skv, p, mask, bias + "fp8_1": ModelConfig(1, 1, 1, 64, 512, 512, 0.0, "no_mask", "no_bias"), + "fp8_2": ModelConfig(4, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"), + "fp8_3": ModelConfig(1, 1, 1, 128, 2048, 2048, 0.0, "no_mask", "no_bias"), + "fp8_4": ModelConfig(2, 24, 24, 128, 
2048, 2048, 0.0, "no_mask", "no_bias"), + "fp8_5": ModelConfig(1, 1, 1, 64, 512, 512, 0.0, "causal", "no_bias"), + "fp8_6": ModelConfig(4, 16, 16, 64, 512, 512, 0.0, "causal", "no_bias"), + "fp8_7": ModelConfig(1, 1, 1, 128, 2048, 2048, 0.0, "causal", "no_bias"), + "fp8_8": ModelConfig(2, 24, 24, 128, 2048, 2048, 0.0, "causal", "no_bias"), +} +param_types_fp8 = [torch.float16, torch.bfloat16] +cudnn_frontend_version = int(os.getenv('NVTE_FUSED_ATTN_FE_VER','1')) +models_v0 = ['fp8_1', 'fp8_2', 'fp8_5', 'fp8_6'] +models_v1 = ['fp8_3', 'fp8_4', 'fp8_7', 'fp8_8'] + + +@pytest.mark.skipif(_cudnn_version() < (8,9,3), reason="cuDNN 8.9.3+ is required.") +@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) +@pytest.mark.skipif(get_device_compute_capability() != (9, 0), reason="FP8 tests require Hopper.") +@pytest.mark.parametrize("dtype", param_types_fp8) +@pytest.mark.parametrize("model", models_v1 if cudnn_frontend_version == 1 else models_v0) +def test_custom_mha_fp8_vs_f16(dtype, model): + """Test FP8 dot product attention implementations based on cuDNN frontend + v0.9 and v1.0+. Each test compares results from a custom implementation of + an FP8 MHA module, i.e. Custom_MHA_FP8(), to results from an F16 MHA + implementation, i.e. transformer_engine.pytorch.attention.MultiHeadAttention. + Both paths take F16 input and output. 
QKV layout is t3hd or bs3hd""" + + config = model_configs_fp8[model] + + fused_attn_fwd_fp8, fused_attn_bwd_fp8 = _run_custom_mha_fp8( + dtype, config, "FusedAttention") + unfused_attn_fwd_f16, unfused_attn_bwd_f16 = _run_ref_mha_f16( + dtype, config, "UnfusedAttention") + + tols = dict(atol=5e-1, rtol=5e-1) + rmse_tol = 0.1 + fwd_rmse = _rmse(fused_attn_fwd_fp8, unfused_attn_fwd_f16) + fwd_range = max(fused_attn_fwd_fp8.max().item(), + unfused_attn_fwd_f16.max().item()) - min(fused_attn_fwd_fp8.min().item(), + unfused_attn_fwd_f16.min().item()) + bwd_rmse = _rmse(fused_attn_bwd_fp8, unfused_attn_bwd_f16) + bwd_range = max(fused_attn_bwd_fp8.max().item(), + unfused_attn_bwd_f16.max().item()) - min(fused_attn_bwd_fp8.min().item(), + unfused_attn_bwd_f16.min().item()) + if _NVTE_DEBUG: + print('fused_attn_fwd_fp8 min {:.6f} max {:.6f}'.format( + fused_attn_fwd_fp8.min().item(),fused_attn_fwd_fp8.max().item())) + print('unfused_attn_fwd_f16 min {:.6f} max {:.6f}'.format( + unfused_attn_fwd_f16.min().item(), unfused_attn_fwd_f16.max().item())) + print('fused_attn_fwd_fp8 vs unfused_attn_fwd_f16 RMSE: {:.6f}'.format( + fwd_rmse)) + try: + torch.testing.assert_close(fused_attn_fwd_fp8, unfused_attn_fwd_f16, **tols) + except Exception as e: + print(e) + print() + print('fused_attn_bwd_fp8 min {:.6f} max {:.6f}'.format( + fused_attn_bwd_fp8.min().item(), fused_attn_bwd_fp8.max().item())) + print('unfused_attn_bwd_f16 min {:.6f} max {:.6f}'.format( + unfused_attn_bwd_f16.min().item(), unfused_attn_bwd_f16.max().item())) + print('fused_attn_bwd_fp8 vs unfused_attn_bwd_f16 RMSE: {:.6f}'.format( + bwd_rmse)) + try: + torch.testing.assert_close(fused_attn_bwd_fp8, unfused_attn_bwd_f16, **tols) + except Exception as e: + print(e) + print() + assert(fwd_rmse < rmse_tol * fwd_range + ), "FWD RMSE {:.5f} is over tolerance {:.5f} ({:.5f} * {:.5f})".format( + fwd_rmse, rmse_tol * fwd_range, rmse_tol, fwd_range) + assert(bwd_rmse < rmse_tol * bwd_range + ), "FWD RMSE {:.5f} is over 
tolerance {:.5f} ({:.5f} * {:.5f})".format( + bwd_rmse, rmse_tol * bwd_range, rmse_tol, bwd_range) + + +def _run_custom_mha_fp8(dtype, config, backend): + """Run Custom_MHA_FP8 with FP8 FusedAttention backend. Both input and output + are in F16. QKV GEMM, DPA, and projection GEMM are calculated in FP8.""" reset_rng_states() os.environ["NVTE_FLASH_ATTN"] = "0" os.environ["NVTE_FUSED_ATTN"] = "0" @@ -993,13 +1358,14 @@ def _run_dpa_fp8(dtype, config, backend): if backend == "FusedAttention": os.environ["NVTE_FUSED_ATTN"] = "1" - inp = 0.01 * torch.randn( - config.batch_size * config.max_seqlen_q, config.num_heads * config.head_dim, + inp = 0.0001 * torch.randint(0, 100, + (config.batch_size * config.max_seqlen_q, config.num_heads * config.head_dim), dtype=dtype, device="cuda", requires_grad=True) seqlens = torch.full([config.batch_size], config.max_seqlen_q, dtype=torch.int32, device="cuda") cu_seqlens = torch.zeros(config.batch_size + 1, device="cuda", dtype=torch.int32) cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) + out_grad = 0.01 * torch.randn( config.batch_size * config.max_seqlen_q, config.num_heads * config.head_dim, dtype=dtype, device="cuda") @@ -1013,22 +1379,21 @@ def _run_dpa_fp8(dtype, config, backend): amax_compute_algo="most_recent", ) - dpa = DPA_FP8(config).to(dtype=torch.float16, device="cuda") + mha = Custom_MHA_FP8(config).to(dtype=dtype, device="cuda") with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe): - out = dpa(inp, cu_seqlens, config.max_seqlen_q) + out = mha(inp, cu_seqlens, config.max_seqlen_q) out.backward(out_grad) - context = torch.load("ctx.pt") + out = torch.load("out.pt") dqkv = torch.load('dqkv.pt') - return (context.view(config.batch_size, config.max_seqlen_q, -1).transpose(0,1), + return (out.view(config.batch_size, config.max_seqlen_q, -1), dqkv.view(config.batch_size, config.max_seqlen_q, 3, - config.num_heads, config.head_dim).transpose(0,1).contiguous()) + config.num_heads, config.head_dim).contiguous()) -def 
_run_dpa_fp8_ref(dtype, config, backend): - """Run UnfusedDotProductAttention as a reference, i.e. - plain PyTorch implementation in FP16 and inputs/outputs - are converted from/to FP8""" +def _run_ref_mha_f16(dtype, config, backend): + """Run reference F16 FusedAttention. Both input and output + are in F16. QKV GEMM, DPA, and projection GEMM are also in F16.""" os.environ["NVTE_FLASH_ATTN"] = "0" os.environ["NVTE_FUSED_ATTN"] = "0" @@ -1043,7 +1408,7 @@ def _run_dpa_fp8_ref(dtype, config, backend): cu_seqlens = torch.zeros(config.batch_size + 1, device="cuda", dtype=torch.int32) cu_seqlens[1:] = torch.cumsum(seqlens, dim=0) out_grad = torch.load('out_grad.pt').to(device="cuda").view( - config.batch_size, config.max_seqlen_q, -1).transpose(0,1) + config.batch_size, config.max_seqlen_q, -1) _DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() _DUMMY_CUDA_RNG_STATE_TRACKER.add("model-parallel-rng", seed) @@ -1069,13 +1434,14 @@ def get_dummy_cuda_rng_tracker(): get_rng_state_tracker=get_dummy_cuda_rng_tracker, tp_group=None, layer_number=1, - attention_type="self" + attention_type="self", + qkv_format="bshd", ).to(dtype=dtype, device="cuda") ) - q = inp[:, :,0,:,:] - k = inp[:, :,1,:,:] - v = inp[:, :,2,:,:] + q = inp[:,:,0,:,:] + k = inp[:,:,1,:,:] + v = inp[:,:,2,:,:] out = block(q, k, v, attn_mask_type=config.attn_mask_type) out.backward(out_grad) @@ -1088,14 +1454,14 @@ def get_dummy_cuda_rng_tracker(): _2X_ACC_WGRAD = False META_QKV = tex.FP8FwdTensors.GEMM1_OUTPUT +META_DQKV = tex.FP8BwdTensors.GRAD_OUTPUT1 META_O = tex.FP8FwdTensors.GEMM2_INPUT META_DO = tex.FP8BwdTensors.GRAD_INPUT2 -META_DQKV = tex.FP8BwdTensors.GRAD_OUTPUT1 +META_S = tex.FP8FwdTensors.GEMM3_OUTPUT +META_DP = tex.FP8BwdTensors.GRAD_INPUT3 -META_S = tex.FP8FwdTensors.GEMM3_WEIGHT -META_DS = tex.FP8BwdTensors.GRAD_INPUT3 -class _dpa_fp8(torch.autograd.Function): +class _custom_mha_fp8(torch.autograd.Function): @staticmethod def forward( ctx, @@ -1110,6 +1476,7 @@ def forward( fp8_meta: 
Dict[str, Any], workspace: torch.Tensor, is_training: bool, + mask_type: str, ) -> torch.Tensor: assert inp.dim() == 2 @@ -1117,14 +1484,10 @@ def forward( h = num_heads d = in_features // h b = cu_seqlens.numel() - 1 - is_nl = False - if b < 4 and b > 1: - max_s = 512 - is_nl = True fp8_dtype_forward = fp8.get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) - inputmat, inputmat_t = ext.fp8_cast_transpose_fused( + inp_fp8, inp_t_fp8 = ext.fp8_cast_transpose_fused( inp, fp8_meta["scaling_fwd"], tex.FP8FwdTensors.GEMM1_INPUT, @@ -1142,12 +1505,12 @@ def forward( ZInv = None philox_unpacked = None - qkv_out, _ = ext.fp8_gemm( + qkv, _ = ext.fp8_gemm( qkv_weight_fp8, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - inputmat, + inp_fp8, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, @@ -1160,26 +1523,29 @@ def forward( use_split_accumulator=_2X_ACC_FPROP, D_dtype=fp8_dtype_forward, ) - qkv_out = qkv_out.view(-1, 3, h, d) - qkv_out_fp16 = ext.cast_from_fp8(qkv_out, fp8_meta["scaling_fwd"], + qkv = qkv.view(-1, 3, h, d) + qkv_fp16 = ext.cast_from_fp8(qkv, fp8_meta["scaling_fwd"], META_QKV, fp8_dtype_forward, - tex.DType.kFloat16).view(b, max_s, 3, h, d).transpose(0,1).contiguous() - torch.save(qkv_out_fp16, 'qkv.pt') + tex.DType.kFloat16).view(b, max_s, 3, h, d).contiguous() + torch.save(qkv_fp16, 'qkv.pt') + if cudnn_frontend_version == 1: + qkv = qkv.view(b, max_s, 3, h, d) # bs3hd # FMHA - context_, aux_ctx_tensors, *rest = fused_attn_fwd( + out, aux_ctx_tensors, *rest = fused_attn_fwd( is_training, max_s, max_s, cu_seqlens, cu_seqlens, - qkv_out[:,0,:,:], - qkv_out[:,1,:,:], - qkv_out[:,2,:,:], + qkv[:,:,0,:,:] if cudnn_frontend_version == 1 else qkv[:,0,:,:], + qkv[:,:,1,:,:] if cudnn_frontend_version == 1 else qkv[:,1,:,:], + qkv[:,:,2,:,:] if cudnn_frontend_version == 1 else qkv[:,2,:,:], fp8_dtype_forward, FusedAttnBackend["FP8"], None, fp8_meta["scaling_fwd"].scale_inv[META_QKV], + 
fp8_meta["scaling_fwd"].scale_inv[META_S], fp8_meta["scaling_fwd"].scale[META_S], fp8_meta["scaling_fwd"].scale[META_O], fp8_meta["scaling_fwd"].amax_history[0][META_S], @@ -1187,20 +1553,17 @@ def forward( attn_scale=None, dropout=p_dropout, fast_zero_fill=fast_zero_fill, - qkv_layout="t3hd", + qkv_layout="bs3hd" if cudnn_frontend_version == 1 else "t3hd", attn_bias_type="no_bias", - attn_mask_type="padding", + attn_mask_type=mask_type if cudnn_frontend_version == 1 else "padding", rng_gen=None, ) - M, ZInv, philox_unpacked = aux_ctx_tensors - context = context_.view(-1, in_features) - context_t = tex.fp8_transpose(context, fp8_dtype_forward) + M, ZInv, philox_unpacked = aux_ctx_tensors ctx.save_for_backward( - inputmat_t, qkv_weight_t_fp8, workspace, - qkv_out, - context_, context_t, + inp_t_fp8, qkv_weight_t_fp8, workspace, + qkv, out, fp8_meta["scaling_fwd"].scale, fp8_meta["scaling_fwd"].scale_inv, ) @@ -1210,14 +1573,16 @@ def forward( ctx.p_dropout = p_dropout ctx.max_s = max_s ctx.fast_zero_fill = fast_zero_fill - ctx.is_nl = is_nl ctx.hidden_size = in_features ctx.num_heads = num_heads + ctx.mask_type = mask_type + ctx.dtype = inp.dtype - context_fp16 = ext.cast_from_fp8(context, fp8_meta["scaling_fwd"], + out = out.view(-1, in_features) # (bs)(hd) + out_fp16 = ext.cast_from_fp8(out, fp8_meta["scaling_fwd"], META_O, fp8_dtype_forward, tex.DType.kFloat16) - torch.save(context_fp16, 'ctx.pt') - return context_fp16 + torch.save(out_fp16, 'out.pt') # (bs)(hd) + return out_fp16 @staticmethod @@ -1226,11 +1591,10 @@ def backward( ) -> Tuple[Union[torch.Tensor, None], ...]: with torch.cuda.nvtx.range("_DPA"): ( - inputmat_t, + inp_t_fp8, qkv_weight_t_fp8, workspace, - qkv_out, - context, context_t, + qkv, out, fwd_scales, fwd_scale_inverses, ) = ctx.saved_tensors @@ -1243,51 +1607,59 @@ def backward( proj_dgrad = ext.cast_to_fp8( grad_output, ctx.fp8_meta["scaling_bwd"], META_DO, fp8_dtype_backward - ) + ) # (bs)(hd) dq, dk, dv, *rest = fused_attn_bwd( ctx.max_s, 
ctx.max_s, ctx.cu_seqlens, ctx.cu_seqlens, - qkv_out[:,0,:,:], - qkv_out[:,1,:,:], - qkv_out[:,2,:,:], - context, - proj_dgrad.view_as(context), + qkv[:,:,0,:,:] if cudnn_frontend_version == 1 else qkv[:,0,:,:], + qkv[:,:,1,:,:] if cudnn_frontend_version == 1 else qkv[:,1,:,:], + qkv[:,:,2,:,:] if cudnn_frontend_version == 1 else qkv[:,2,:,:], + out, + proj_dgrad.view_as(out), fp8_dtype_forward, + fp8_dtype_backward, ctx.aux_ctx_tensors, FusedAttnBackend["FP8"], fwd_scale_inverses[META_QKV], # d_scale_qkv, fwd_scale_inverses[META_S], # d_scale_s, fwd_scale_inverses[META_O], # d_scale_o, ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO], # d_scale_do + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DP], # d_scale_dp fwd_scales[META_S], # q_scale_s - ctx.fp8_meta['scaling_bwd'].scale[META_DS], # q_scale_ds + ctx.fp8_meta['scaling_bwd'].scale[META_DP], # q_scale_dp ctx.fp8_meta['scaling_bwd'].scale[META_DQKV], # q_scale_dqkv - ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DS], # amax_ds + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DP], # amax_dp ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DQKV], # amax_dqkv - None, - ctx.p_dropout, - ctx.fast_zero_fill, - "t3hd", - "no_bias", - "padding", + attn_scale=None, + dropout=ctx.p_dropout, + fast_zero_fill=ctx.fast_zero_fill, + qkv_layout="bs3hd" if cudnn_frontend_version == 1 else "t3hd", + attn_bias_type="no_bias", + attn_mask_type=ctx.mask_type if cudnn_frontend_version == 1 else "padding", ) - dqkv = torch.cat([dq.unsqueeze(1), dk.unsqueeze(1), dv.unsqueeze(1)], dim=1) - - dqkv_grad_output_c = dqkv.view(-1, 3*ctx.hidden_size) - dqkv_grad_output_c_fp16 = ext.cast_from_fp8(dqkv_grad_output_c, + dim = 2 if cudnn_frontend_version == 1 else 1 + dqkv = torch.Tensor().to(device=dq.device, dtype=dq.dtype) + dqkv_shape = list(dq.shape) + dqkv_shape.insert(dim, 3) + dqkv_stride = list(dq.stride()) + dqkv_stride.insert(dim, int(dqkv_stride[-3]/3)) + dqkv.set_(dq.untyped_storage(), dq.storage_offset(), dqkv_shape, dqkv_stride) 
# bs3hd + + dqkv_c = dqkv.view(-1, 3*ctx.hidden_size) + dqkv_c_fp16 = ext.cast_from_fp8(dqkv_c, ctx.fp8_meta["scaling_bwd"], META_DQKV, fp8_dtype_backward, tex.DType.kFloat16) - torch.save(dqkv_grad_output_c_fp16, 'dqkv.pt') + torch.save(dqkv_c_fp16, 'dqkv.pt') - qkv_bgrad, dqkv_grad_output_t = ext.fp8_transpose_bgrad_fused( - dqkv_grad_output_c, + qkv_bgrad, dqkv_t = ext.fp8_transpose_bgrad_fused( + dqkv_c, ctx.fp8_meta["scaling_bwd"], META_DQKV, fp8_dtype_backward, - torch.float16, + ctx.dtype, ) # QKV DGRAD @@ -1296,25 +1668,25 @@ def backward( fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - dqkv_grad_output_c, + dqkv_c, ctx.fp8_meta["scaling_bwd"].scale_inv, META_DQKV, fp8_dtype_backward, - torch.float16, + ctx.dtype, workspace, use_split_accumulator=_2X_ACC_DGRAD, ) # QKV WGRAD qkv_wgrad, _ = ext.fp8_gemm( - inputmat_t, + inp_t_fp8, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, - dqkv_grad_output_t, + dqkv_t, ctx.fp8_meta["scaling_bwd"].scale_inv, META_DQKV, fp8_dtype_backward, - torch.float16, + ctx.dtype, workspace, use_split_accumulator=_2X_ACC_WGRAD, ) @@ -1334,7 +1706,7 @@ def backward( None) -class DPA_FP8(TransformerEngineBaseModule): +class Custom_MHA_FP8(TransformerEngineBaseModule): def __init__( self, config, @@ -1345,6 +1717,7 @@ def __init__( self.hidden_size = config.hidden_size self.head_dim = config.head_dim self.fast_zero_fill = True + self.mask_type = config.attn_mask_type self.qkv_weight = torch.nn.Parameter( torch.empty( @@ -1374,7 +1747,7 @@ def forward( cu_seqlens, max_s, ) -> torch.Tensor: with self.prepare_forward(inp, None, num_gemms=3) as inp: - out = _dpa_fp8.apply( + out = _custom_mha_fp8.apply( inp, self.qkv_weight, self.qkv_bias, @@ -1385,7 +1758,8 @@ def forward( self.fast_zero_fill, self.fp8_meta, self.workspace, - self.training) + self.training, + self.mask_type) return out def get_fp8_weights_scratchpad( diff --git a/tests/pytorch/test_numerics.py 
b/tests/pytorch/test_numerics.py index ddb3ecf49f..0cda82e0c4 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -1091,7 +1091,7 @@ def test_layernorm_linear_accuracy(dtype, bs, model, normalization, zero_centere torch_outputs = _test_granular_accuracy(torch_ln_linear, bs, dtype, config) # Check output. - atol = {torch.float32 : 2e-4, + atol = {torch.float32 : 2.5e-4, torch.half : 2e-3, torch.bfloat16: 2e-2, } diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 43e7d17350..2d9759898f 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -85,15 +85,25 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); auto cudnn_runtime_version = cudnnGetVersion(); - if ((q_dtype == NVTEDType::kNVTEFloat8E4M3) || (q_dtype == NVTEDType::kNVTEFloat8E5M2) - && (sm_arch_ >= 90) - && (max_seqlen_q == max_seqlen_kv) - && (num_attn_heads == num_gqa_groups) - && (max_seqlen_q <= 512) - && (head_dim == 64) - && (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) - && (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) - && (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD)) { + if (((q_dtype == NVTEDType::kNVTEFloat8E4M3) + || (q_dtype == NVTEDType::kNVTEFloat8E5M2)) + && (sm_arch_ >= 90) + && (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) + && ( + ((cudnn_runtime_version >= 8900) + && (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) + && (max_seqlen_q == max_seqlen_kv) + && (max_seqlen_q <= 512) + && (head_dim == 64) + && (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK)) + || ((cudnn_runtime_version >= 90100) + && (max_seqlen_q % 128 == 0) + && (max_seqlen_kv % 128 == 0) + && (head_dim == 128) + && ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) + && 
((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) + || (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK))))) { if (cudnn_runtime_version >= 8900) { backend = NVTE_Fused_Attn_Backend::NVTE_FP8; } else { @@ -269,7 +279,7 @@ void nvte_fused_attn_fwd_qkvpacked( #if (CUDNN_VERSION >= 8900) fused_attn_fp8_fwd_qkvpacked( b, h, max_seqlen, d, - is_training, attn_scale, dropout, qkv_layout, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, input_QKV, input_output_S, output_O, Aux_CTX_Tensors, input_cu_seqlens, @@ -379,7 +389,7 @@ void nvte_fused_attn_bwd_qkvpacked( const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); fused_attn_fp8_bwd_qkvpacked( b, h, max_seqlen, d, - attn_scale, dropout, qkv_layout, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, input_QKV, input_O, input_dO, input_M, input_ZInv, input_S, input_output_dP, @@ -476,7 +486,18 @@ void nvte_fused_attn_fwd_kvpacked( "cuDNN 8.9.3 is required for BF16/FP16 fused attention with arbitrary sequence length. \n"); #endif } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { - NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. \n"); +#if (CUDNN_VERSION >= 8900) + fused_attn_fp8_fwd_kvpacked( + b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_KV, input_output_S, output_O, + Aux_CTX_Tensors, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); +#endif } else { NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); } @@ -580,7 +601,23 @@ void nvte_fused_attn_bwd_kvpacked( NVTE_ERROR(err_msg); #endif } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { - NVTE_ERROR("The FP8 fused attention API only supports packed QKV input. 
\n"); +#if (CUDNN_VERSION >= 8900) + const Tensor *input_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + const Tensor *input_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + fused_attn_fp8_bwd_kvpacked( + b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, + input_Q, input_KV, input_O, input_dO, + input_M, input_ZInv, + input_S, input_output_dP, + output_dQ, output_dKV, + input_cu_seqlens_q, input_cu_seqlens_kv, + input_rng_state, + wkspace, stream, handle); +#else + NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); +#endif } else { NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); } @@ -662,8 +699,8 @@ void nvte_fused_attn_fwd( } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { #if (CUDNN_VERSION >= 8900) fused_attn_fp8_fwd( - b, h_q, max_seqlen_q, max_seqlen_kv, d, - is_training, attn_scale, dropout, qkv_layout, + b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, + is_training, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, input_Q, input_K, input_V, input_output_S, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, input_cu_seqlens_kv, @@ -775,8 +812,8 @@ void nvte_fused_attn_bwd( const Tensor *input_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); const Tensor *input_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); fused_attn_fp8_bwd( - b, h_q, max_seqlen_q, max_seqlen_kv, d, - attn_scale, dropout, qkv_layout, + b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, + attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, input_Q, input_K, input_V, input_O, input_dO, input_M, input_ZInv, input_S, input_output_dP, diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 8ffd8608b6..180759f327 100644 --- 
a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -76,7 +76,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl( scaling_factor, is_training, dropout_probability, layout, bias_type, mask_type, - tensorType}; + tensorType, tensorType}; namespace fe = cudnn_frontend; using graph_and_tensors = std::tuple, @@ -147,7 +147,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl( fe::graph::SDPA_attributes sdpa_options; sdpa_options = fe::graph::SDPA_attributes() .set_name("flash_attention") - .set_is_inference(!is_training) + .set_is_inference(false) .set_causal_mask(is_causal) .set_attn_scale(attn_scale); @@ -199,11 +199,9 @@ void fused_attn_arbitrary_seqlen_fwd_impl( layout, NVTE_QKV_Matrix::NVTE_O_Matrix); O->set_output(true).set_dim({b, h, s_q, d}).set_stride(o_stride); - if (is_training) { - Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) - .set_dim({b, h, s_q, 1}) - .set_stride({h * s_q, s_q, 1, 1}); - } + Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) + .set_dim({b, h, s_q, 1}) + .set_stride({h * s_q, s_q, 1, 1}); std::tuple, // Q std::shared_ptr, // K @@ -211,7 +209,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl( std::shared_ptr, // attn_scale std::shared_ptr > // O key_tensors_tuple = std::make_tuple(Q, K, V, attn_scale, O); - auto Stats_tuple = is_training ? std::make_tuple(Stats) : std::make_tuple(nullptr); + auto Stats_tuple = std::make_tuple(Stats); auto bias_tuple = is_bias ? std::make_tuple(bias) : std::make_tuple(nullptr); auto padding_tuple = is_padding ? 
std::make_tuple(seq_q, seq_kv) : std::make_tuple(nullptr, nullptr); @@ -258,11 +256,8 @@ void fused_attn_arbitrary_seqlen_fwd_impl( {K, devPtrK}, {V, devPtrV}, {attn_scale, &scaling_factor}, - {O, devPtrO}}; - - if (is_training) { - variant_pack[Stats] = devPtrSoftmaxStats; - } + {O, devPtrO}, + {Stats, devPtrSoftmaxStats}}; if (is_bias) { variant_pack[bias] = devPtrBias; @@ -321,7 +316,7 @@ void fused_attn_arbitrary_seqlen_bwd_impl( scaling_factor, true, dropout_probability, layout, bias_type, mask_type, - tensorType}; + tensorType, tensorType}; namespace fe = cudnn_frontend; using graph_and_tensors = std::tuple, diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 55a5638b26..a8866908ce 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -19,7 +19,7 @@ namespace transformer_engine { #if (CUDNN_VERSION >= 8900) void fused_attn_arbitrary_seqlen_fwd_qkvpacked( size_t batch, size_t num_attn_heads, size_t max_seqlen, - size_t head_size, bool is_training, float attn_scale, + size_t head_dim, bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, const Tensor *input_Bias, diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index 76c1a44b0d..66185c0c41 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -8,6 +8,7 @@ #include "../common.h" #include "utils.h" +#include "../util/system.h" #include "fused_attn_fp8.h" namespace transformer_engine { @@ -984,7 +985,7 @@ static cudnn_frontend::Tensor createdSQBMM( return After_dSTranspose_Q; } -// fused attention FWD FP8 +// fused attention FWD FP8 with FE 0.9 
void fused_attn_fp8_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, bool isTraining, float attnScale, float dropoutProbability, NVTE_QKV_Layout layout, @@ -1295,7 +1296,7 @@ void fused_attn_fp8_fwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, in } } -// fused attention BWD FP8 +// fused attention BWD FP8 with FE 0.9 void fused_attn_fp8_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, int64_t d, float attnScale, float dropoutProbability, NVTE_QKV_Layout layout, void* devPtrQ, void* devPtrK, void* devPtrV, @@ -1846,6 +1847,707 @@ void fused_attn_fp8_bwd_impl(int64_t b, int64_t h, int64_t s_q, int64_t s_kv, in } } +// fused attention FWD FP8 with FE 1.0+ +void fused_attn_fp8_fwd_impl_v1(int64_t b, int64_t h, int64_t hg, + int64_t s_q, int64_t s_kv, int64_t d, + bool is_training, float scaling_factor, + float dropout_probability, NVTE_QKV_Layout layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + void* devPtrQ, void* devPtrK, void* devPtrV, + void* devPtrM, void* devPtrZInv, + void* devPtrO, + void* devPtrDescaleQ, void* devPtrDescaleK, void* devPtrDescaleV, + void* devPtrDescaleS, void* devPtrScaleS, void* devPtrScaleO, + void* devPtrAmaxO, void* devPtrAmaxS, + void* devPtrcuSeqlensQ, void* devPtrcuSeqlensKV, + void* devPtrDropoutSeed, void* devPtrDropoutOffset, + cudnn_frontend::DataType_t fwd_tensor_type, + void* workspace, + size_t* workspace_size, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + bool is_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + bool is_alibi = (bias_type == NVTE_Bias_Type::NVTE_ALIBI); + bool is_causal = ((mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) + || (mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)); + bool is_padding = ((mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) + || (mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)); + bool is_dropout = (is_training && dropout_probability != 0.0f); + auto bias_b = b; + auto 
bias_h = h; + NVTE_CHECK(~is_bias, "FP8 fused attention does not support pre/post_scale_bias yet!"); + NVTE_CHECK(~is_alibi, "FP8 fused attention does not support ALiBi yet!"); + NVTE_CHECK(~is_padding, + "FP8 fused attention does not support padding/padding_causal mask yet!"); + NVTE_CHECK(~is_dropout, "FP8 fused attention does not support dropout yet!"); + + try { + FADescriptor_v1 descriptor{b, h, + hg, s_q, + s_kv, d, + bias_b, bias_h, + scaling_factor, is_training, + dropout_probability, layout, + bias_type, mask_type, + fwd_tensor_type, fwd_tensor_type}; + + namespace fe = cudnn_frontend; + using graph_and_tensors = std::tuple, + std::shared_ptr, // Q + std::shared_ptr, // K + std::shared_ptr, // V + std::shared_ptr, // descale_q + std::shared_ptr, // descale_k + std::shared_ptr, // descale_v + std::shared_ptr, // descale_s + std::shared_ptr, // scale_s + std::shared_ptr, // scale_o + std::shared_ptr, // attn_scale + std::shared_ptr, // O + std::shared_ptr, // amax_s + std::shared_ptr, // amax_o + std::shared_ptr, // Stats + std::shared_ptr, // bias + std::shared_ptr, // seq_q + std::shared_ptr, // seq_kv + std::shared_ptr, // dropout_seed + std::shared_ptr >; // dropout_offset + + using CacheType = std::map; + static thread_local CacheType sdpa_fp8_fprop_cache; + + // Get plan from cache if cache is available, otherwise create one + auto get_graph = [&](CacheType &cache, const FADescriptor_v1 &descriptor) + -> graph_and_tensors { + // if hit, return + auto it = cache.find(descriptor); + if (it != cache.end()) { + auto graph = it->second; + return graph; + } + + // otherwise, build the op_graph and the plan. 
Then update cache + auto mha_graph = std::make_shared(); + mha_graph->set_io_data_type(fwd_tensor_type) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + std::shared_ptr Q, K, V, attn_scale; + std::shared_ptr descale_q, descale_k, descale_v; + std::shared_ptr descale_s, scale_s, scale_o; + std::shared_ptr bias, seq_q, seq_kv; + std::shared_ptr dropout_seed, dropout_offset; + + std::vector q_stride(4); + std::vector k_stride(4); + std::vector v_stride(4); + generateMatrixStrides(b, h, s_q, s_kv, d, q_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_Q_Matrix); + generateMatrixStrides(b, hg, s_q, s_kv, d, k_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_K_Matrix); + generateMatrixStrides(b, hg, s_q, s_kv, d, v_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_V_Matrix); + Q = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim({b, h, s_q, d}) + .set_stride(q_stride)); + K = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("K") + .set_dim({b, hg, s_kv, d}) + .set_stride(k_stride)); + V = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("V") + .set_dim({b, hg, s_kv, d}) + .set_stride(v_stride)); + + attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + + descale_q = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Descale_q") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + descale_k = mha_graph->tensor_like(descale_q, "Descale_q"); + descale_v = mha_graph->tensor_like(descale_q, "Descale_V"); + descale_s = mha_graph->tensor_like(descale_q, "Descale_S"); + scale_s = mha_graph->tensor_like(descale_q, "Scale_S"); + scale_o = mha_graph->tensor_like(descale_q, "Scale_O"); + + fe::graph::SDPA_fp8_attributes sdpa_options; + sdpa_options = 
fe::graph::SDPA_fp8_attributes() + .set_name("sdpa_fp8") + .set_is_inference(false) + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale); + + // sdpa_options.set_alibi_mask(is_alibi); + // if (is_bias) { + // bias = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("bias") + // .set_dim({bias_b, bias_h, s_q, s_kv}) + // .set_stride({bias_h * s_q * s_kv, s_q * s_kv, s_kv, 1})); + // sdpa_options.set_bias(bias); + // } + + // if (is_padding) { + // seq_q = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("seq_q") + // .set_dim({b, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT32)); + // seq_kv = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("seq_kv") + // .set_dim({b, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT32)); + // sdpa_options.set_padding_mask(is_padding) + // .set_seq_len_q(seq_q) + // .set_seq_len_kv(seq_kv); + // } + + // if (is_dropout) { + // dropout_seed = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("Seed") + // .set_dim({1, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT64)); + // dropout_offset = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("Offset") + // .set_dim({1, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT64)); + // sdpa_options.set_dropout( + // dropout_probability, dropout_seed, dropout_offset); + // } + + auto [O, Stats, amax_s, amax_o] = mha_graph->sdpa_fp8( + Q, K, V, descale_q, descale_k, descale_v, descale_s, + scale_s, scale_o, sdpa_options); + + std::vector o_stride(4); + generateMatrixStrides(b, h, s_q, s_kv, d, o_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_O_Matrix); + O->set_output(true).set_dim({b, h, s_q, d}).set_stride(o_stride); + amax_o->set_output(true).set_dim({1, 1, 1, 1}).set_data_type(fe::DataType_t::FLOAT); + amax_s->set_output(true).set_dim({1, 1, 1, 1}).set_data_type(fe::DataType_t::FLOAT); 
+ + Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) + .set_dim({b, h, s_q, 1}) + .set_stride({h * s_q, s_q, 1, 1}); + + std::tuple, // Q + std::shared_ptr, // K + std::shared_ptr, // V + std::shared_ptr, // descale_q + std::shared_ptr, // descale_k + std::shared_ptr, // descale_v + std::shared_ptr, // descale_s + std::shared_ptr, // scale_s + std::shared_ptr, // scale_o + std::shared_ptr, // attn_scale + std::shared_ptr, // O + std::shared_ptr, // amax_s + std::shared_ptr > // amax_o + key_tensors_tuple = std::make_tuple(Q, K, V, descale_q, descale_k, descale_v, + descale_s, scale_s, scale_o, attn_scale, O, amax_s, amax_o); + auto Stats_tuple = std::make_tuple(Stats); + auto bias_tuple = is_bias ? std::make_tuple(bias) : std::make_tuple(nullptr); + auto padding_tuple = is_padding ? + std::make_tuple(seq_q, seq_kv) : std::make_tuple(nullptr, nullptr); + auto dropout_tuple = is_dropout ? + std::make_tuple(dropout_seed, dropout_offset) : std::make_tuple(nullptr, nullptr); + + NVTE_CHECK_CUDNN_FE(mha_graph->validate()); + NVTE_CHECK_CUDNN_FE(mha_graph->build_operation_graph(handle)); + NVTE_CHECK_CUDNN_FE(mha_graph->create_execution_plans({fe::HeurMode_t::A})); + NVTE_CHECK_CUDNN_FE(mha_graph->check_support(handle)); + NVTE_CHECK_CUDNN_FE(mha_graph->build_plans(handle)); + + auto return_tuple = std::tuple_cat( + std::make_tuple(mha_graph), key_tensors_tuple, + Stats_tuple, bias_tuple, padding_tuple, dropout_tuple); + cache.insert({descriptor, return_tuple}); + + return return_tuple; + }; + + auto [mha_graph, Q, K, V, descale_q, descale_k, descale_v, descale_s, + scale_s, scale_o, attn_scale, O, amax_s, amax_o, Stats, + bias, seq_q, seq_kv, dropout_seed, dropout_offset] = get_graph( + sdpa_fp8_fprop_cache, descriptor); + + auto plan_workspace_size = mha_graph->get_workspace_size(); + + // Exit to request upper level API to allocate memory if needed + size_t actual_seqlen_workspace_size = 2 * b * sizeof(int32_t); + if (workspace == nullptr) { + 
*workspace_size = plan_workspace_size + actual_seqlen_workspace_size; + return; + } + + // cuDNN stream check needs to be moved here to support dummy kernel calls with + // null streams for sizing the cuDNN workspace. + NVTE_CHECK_CUDNN(cudnnSetStream(handle, stream)); + + // Build variant pack + std::unordered_map, void*> variant_pack = { + {Q, devPtrQ}, + {K, devPtrK}, + {V, devPtrV}, + {descale_q, devPtrDescaleQ}, + {descale_k, devPtrDescaleK}, + {descale_v, devPtrDescaleV}, + {descale_s, devPtrDescaleS}, + {scale_s, devPtrScaleS}, + {scale_o, devPtrScaleO}, + {attn_scale, &scaling_factor}, + {O, devPtrO}, + {amax_s, devPtrAmaxS}, + {amax_o, devPtrAmaxO}, + {Stats, devPtrM}}; + + // if (is_bias) { + // variant_pack[bias] = devPtrBias; + // } + + // if (is_padding) { + // constexpr size_t nthreads_per_block = 128; + // const size_t grid = (b + nthreads_per_block - 1) / nthreads_per_block; + // void *devActualSeqlenQ = static_cast(workspace) + plan_workspace_size; + // void *devActualSeqlenKV = static_cast(devActualSeqlenQ) + // + b * sizeof(int32_t); + // cu_seqlens_to_actual_seqlens<<>>( + // b, static_cast(devPtrCuSeqlensQ), + // static_cast(devPtrCuSeqlensKV), + // static_cast(devActualSeqlenQ), + // static_cast(devActualSeqlenKV)); + // variant_pack[seq_q] = devActualSeqlenQ; + // variant_pack[seq_kv] = devActualSeqlenKV; + // } + + // if (is_dropout) { + // variant_pack[dropout_seed] = devPtrDropoutSeed; + // variant_pack[dropout_offset] = devPtrDropoutOffset; + // } + NVTE_CHECK_CUDNN_FE(mha_graph->execute(handle, variant_pack, workspace)); + } catch (cudnn_frontend::cudnnException &e) { + NVTE_ERROR(e.what()); + } +} + +// fused attention BWD FP8 with FE 1.0+ +void fused_attn_fp8_bwd_impl_v1(int64_t b, int64_t h, int64_t hg, + int64_t s_q, int64_t s_kv, int64_t d, + float scaling_factor, float dropout_probability, NVTE_QKV_Layout layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + void* devPtrQ, void* devPtrK, void* devPtrV, + void* devPtrM, 
void* devPtrZInv, + void* devPtrO, void* devPtrdO, + void* devPtrdQ, void* devPtrdK, void* devPtrdV, + void* devPtrDescaleQ, void* devPtrDescaleK, void* devPtrDescaleV, + void* devPtrDescaleO, void* devPtrDescaledO, + void* devPtrDescaleS, void* devPtrDescaledP, + void* devPtrScaleS, void* devPtrScaledP, + void* devPtrScaledQ, void* devPtrScaledK, void* devPtrScaledV, + void* devPtrAmaxdP, + void* devPtrAmaxdQ, void* devPtrAmaxdK, void* devPtrAmaxdV, + void* devPtrcuSeqlensQ, void* devPtrcuSeqlensKV, + void* devPtrDropoutSeed, void* devPtrDropoutOffset, + cudnn_frontend::DataType_t fwd_tensor_type, + cudnn_frontend::DataType_t bwd_tensor_type, + void* workspace, + size_t* workspace_size, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + bool is_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + bool is_alibi = (bias_type == NVTE_Bias_Type::NVTE_ALIBI); + bool is_causal = ((mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) + || (mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)); + bool is_padding = ((mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) + || (mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)); + bool is_dropout = (dropout_probability != 0.0f); + auto bias_b = b; + auto bias_h = h; + NVTE_CHECK(~is_bias, "FP8 fused attention does not support pre/post_scale_bias yet!"); + NVTE_CHECK(~is_alibi, "FP8 fused attention does not support ALiBi yet!"); + NVTE_CHECK(~is_padding, + "FP8 fused attention does not support padding/padding_causal mask yet!"); + NVTE_CHECK(~is_dropout, "FP8 fused attention does not support dropout yet!"); + + try { + FADescriptor_v1 descriptor{b, h, + hg, s_q, + s_kv, d, + bias_b, bias_h, + scaling_factor, true, + dropout_probability, layout, + bias_type, mask_type, + fwd_tensor_type, bwd_tensor_type}; + + namespace fe = cudnn_frontend; + using graph_and_tensors = std::tuple, + std::shared_ptr, // q + std::shared_ptr, // k + std::shared_ptr, // v + std::shared_ptr, // o + 
std::shared_ptr, // stats + std::shared_ptr, // dO + std::shared_ptr, // attn_scale + std::shared_ptr, // descale_q + std::shared_ptr, // descale_k + std::shared_ptr, // descale_v + std::shared_ptr, // descale_o + std::shared_ptr, // descale_dO + std::shared_ptr, // descale_s + std::shared_ptr, // descale_dP + std::shared_ptr, // scale_dQ + std::shared_ptr, // scale_dK + std::shared_ptr, // scale_dV + std::shared_ptr, // scale_s + std::shared_ptr, // scale_dP + std::shared_ptr, // dQ + std::shared_ptr, // dK + std::shared_ptr, // dV + std::shared_ptr, // amax_dQ + std::shared_ptr, // amax_dK + std::shared_ptr, // amax_dV + std::shared_ptr, // amax_dP + std::shared_ptr, // bias + std::shared_ptr, // dBias + std::shared_ptr, // seq_q + std::shared_ptr, // seq_kv + std::shared_ptr, // dropout_seed + std::shared_ptr >; // dropout_offset + + using CacheType = std::map; + static thread_local CacheType sdpa_fp8_bprop_cache; + + // Get plan from cache if cache is available, otherwise create one + auto get_graph = [&](CacheType &cache, const FADescriptor_v1 &descriptor) + -> graph_and_tensors { + // if hit, return + auto it = cache.find(descriptor); + if (it != cache.end()) { + auto graph = it->second; + return graph; + } + + // otherwise, build the op_graph and the plan. 
Then update cache + auto mha_graph = std::make_shared(); + + mha_graph->set_io_data_type(fwd_tensor_type) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + std::shared_ptr q, k, v, o, dO, stats, attn_scale; + std::shared_ptr descale_q, descale_k, descale_v; + std::shared_ptr descale_s, descale_o; + std::shared_ptr descale_dP, descale_dO; + std::shared_ptr scale_s, scale_dP; + std::shared_ptr scale_dQ, scale_dK, scale_dV; + std::shared_ptr bias, dBias, seq_q, seq_kv; + std::shared_ptr dropout_seed, dropout_offset; + + std::vector q_stride(4); + std::vector k_stride(4); + std::vector v_stride(4); + std::vector o_stride(4); + generateMatrixStrides(b, h, s_q, s_kv, d, q_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_Q_Matrix); + generateMatrixStrides(b, hg, s_q, s_kv, d, k_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_K_Matrix); + generateMatrixStrides(b, hg, s_q, s_kv, d, v_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_V_Matrix); + generateMatrixStrides(b, h, s_q, s_kv, d, o_stride.data(), + layout, NVTE_QKV_Matrix::NVTE_O_Matrix); + q = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim({b, h, s_q, d}) + .set_stride(q_stride)); + k = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("K") + .set_dim({b, hg, s_kv, d}) + .set_stride(k_stride)); + v = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("V") + .set_dim({b, hg, s_kv, d}) + .set_stride(v_stride)); + o = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("O") + .set_dim({b, h, s_q, d}) + .set_stride(o_stride)); + dO = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("dO") + .set_dim({b, h, s_q, d}) + .set_stride(o_stride)); + stats = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("stats") + .set_dim({b, h, s_q, 1}) + .set_stride({h * s_q, s_q, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + + attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + 
.set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + + descale_q = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Descale_q") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + descale_k = mha_graph->tensor_like(descale_q, "Descale_q"); + descale_v = mha_graph->tensor_like(descale_q, "Descale_V"); + descale_s = mha_graph->tensor_like(descale_q, "Descale_S"); + descale_o = mha_graph->tensor_like(descale_q, "Descale_O"); + descale_dP = mha_graph->tensor_like(descale_q, "Descale_dP"); + descale_dO = mha_graph->tensor_like(descale_q, "Descale_dO"); + scale_s = mha_graph->tensor_like(descale_q, "Scale_S"); + scale_dP = mha_graph->tensor_like(descale_q, "Scale_dP"); + scale_dQ = mha_graph->tensor_like(descale_q, "Scale_dQ"); + scale_dK = mha_graph->tensor_like(descale_q, "Scale_dK"); + scale_dV = mha_graph->tensor_like(descale_q, "Scale_dV"); + + fe::graph::SDPA_fp8_backward_attributes sdpa_backward_options; + sdpa_backward_options = fe::graph::SDPA_fp8_backward_attributes() + .set_name("sdpa_fp8_backward") + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale); + + // sdpa_backward_options.set_alibi_mask(is_alibi); + + // if (is_bias) { + // bias = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("bias") + // .set_dim({bias_b, bias_h, s_q, s_kv}) + // .set_stride({bias_h * s_q * s_kv, s_q * s_kv, s_kv, 1})); + // dBias = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("dBias") + // .set_dim({bias_b, bias_h, s_q, s_kv}) + // .set_stride({bias_h * s_q * s_kv, s_q * s_kv, s_kv, 1})); + // sdpa_backward_options.set_bias(bias); + // // shapes [1, 1, s, s], [b, 1, s, s], [b, h, s, s] + // // are not supported for dbias calculation but they are + // // supported for forward bias calculation + // if ((bias_b == 1) && (bias_h == h)) { + // sdpa_backward_options.set_dbias(dBias); + // } + 
// } + + // if (is_padding) { + // seq_q = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("seq_q") + // .set_dim({b, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT32)); + // seq_kv = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("seq_kv") + // .set_dim({b, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT32)); + // sdpa_backward_options.set_padding_mask(is_padding) + // .set_seq_len_q(seq_q) + // .set_seq_len_kv(seq_kv); + // } + + // if (is_dropout) { + // dropout_seed = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("Seed") + // .set_dim({1, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT64)); + // dropout_offset = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("Offset") + // .set_dim({1, 1, 1, 1}) + // .set_stride({1, 1, 1, 1}) + // .set_data_type(fe::DataType_t::INT64)); + // sdpa_backward_options.set_dropout( + // dropout_probability, dropout_seed, dropout_offset); + // } + + auto [dQ, dK, dV, amax_dQ, amax_dK, amax_dV, amax_dP] = mha_graph->sdpa_fp8_backward( + q, k, v, o, dO, stats, + descale_q, descale_k, descale_v, + descale_o, descale_dO, descale_s, descale_dP, + scale_s, scale_dQ, scale_dK, scale_dV, scale_dP, + sdpa_backward_options); + + dQ->set_output(true) + .set_dim({b, h, s_q, d}) + .set_stride(q_stride); + dK->set_output(true) + .set_dim({b, hg, s_kv, d}) + .set_stride(k_stride); + dV->set_output(true) + .set_dim({b, hg, s_kv, d}) + .set_stride(v_stride); + amax_dQ->set_output(true) + .set_dim({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT); + amax_dK->set_output(true) + .set_dim({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT); + amax_dV->set_output(true) + .set_dim({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT); + amax_dP->set_output(true) + .set_dim({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT); + + dO->set_data_type(bwd_tensor_type); + 
dQ->set_data_type(bwd_tensor_type); + dK->set_data_type(bwd_tensor_type); + dV->set_data_type(bwd_tensor_type); + + std::tuple, // q + std::shared_ptr, // k + std::shared_ptr, // v + std::shared_ptr, // o + std::shared_ptr, // stats + std::shared_ptr, // dO + std::shared_ptr, // attn_scale + std::shared_ptr, // descale_q + std::shared_ptr, // descale_k + std::shared_ptr, // descale_v + std::shared_ptr, // descale_o + std::shared_ptr, // descale_dO + std::shared_ptr, // descale_s + std::shared_ptr, // descale_dP + std::shared_ptr, // scale_dQ + std::shared_ptr, // scale_dK + std::shared_ptr, // scale_dV + std::shared_ptr, // scale_s + std::shared_ptr, // scale_dP + std::shared_ptr, // dQ + std::shared_ptr, // dK + std::shared_ptr, // dV + std::shared_ptr, // amax_dQ + std::shared_ptr, // amax_dK + std::shared_ptr, // amax_dV + std::shared_ptr > // amax_dP + key_tensors_tuple = std::make_tuple( + q, k, v, o, stats, dO, attn_scale, + descale_q, descale_k, descale_v, + descale_o, descale_dO, descale_s, descale_dP, + scale_s, scale_dQ, scale_dK, scale_dV, scale_dP, + dQ, dK, dV, + amax_dQ, amax_dK, amax_dV, amax_dP); + auto bias_tuple = is_bias ? + std::make_tuple(bias, dBias) : std::make_tuple(nullptr, nullptr); + auto padding_tuple = is_padding ? + std::make_tuple(seq_q, seq_kv) : std::make_tuple(nullptr, nullptr); + auto dropout_tuple = is_dropout ? 
+ std::make_tuple(dropout_seed, dropout_offset) : std::make_tuple(nullptr, nullptr); + + NVTE_CHECK_CUDNN_FE(mha_graph->validate()); + NVTE_CHECK_CUDNN_FE(mha_graph->build_operation_graph(handle)); + NVTE_CHECK_CUDNN_FE(mha_graph->create_execution_plans({fe::HeurMode_t::A})); + NVTE_CHECK_CUDNN_FE(mha_graph->check_support(handle)); + NVTE_CHECK_CUDNN_FE(mha_graph->build_plans(handle)); + + auto return_tuple = std::tuple_cat( + std::make_tuple(mha_graph), key_tensors_tuple, + bias_tuple, padding_tuple, dropout_tuple); + cache.insert({descriptor, return_tuple}); + + return return_tuple; + }; + + auto [mha_graph, q, k, v, o, stats, dO, attn_scale, + descale_q, descale_k, descale_v, + descale_o, descale_dO, descale_s, descale_dP, + scale_s, scale_dQ, scale_dK, scale_dV, scale_dP, + dQ, dK, dV, amax_dQ, amax_dK, amax_dV, amax_dP, + bias, dBias, seq_q, seq_kv, dropout_seed, dropout_offset] = get_graph( + sdpa_fp8_bprop_cache, descriptor); + + auto plan_workspace_size = mha_graph->get_workspace_size(); + + // Exit to request upper level API to allocate memory if needed + size_t actual_seqlen_workspace_size = 2 * b * sizeof(int32_t); + if (workspace == nullptr) { + *workspace_size = plan_workspace_size + actual_seqlen_workspace_size; + return; + } + + // cuDNN stream check needs to be moved here to support dummy kernel calls with + // null streams for sizing the cuDNN workspace. 
+ NVTE_CHECK_CUDNN(cudnnSetStream(handle, stream)); + + // build variant pack + std::unordered_map, void*> variant_pack = { + {q, devPtrQ}, + {k, devPtrK}, + {v, devPtrV}, + {o, devPtrO}, + {stats, devPtrM}, + {dO, devPtrdO}, + {attn_scale, &scaling_factor}, + {descale_q, devPtrDescaleQ}, + {descale_k, devPtrDescaleK}, + {descale_v, devPtrDescaleV}, + {descale_o, devPtrDescaleO}, + {descale_dO, devPtrDescaledO}, + {descale_s, devPtrDescaleS}, + {descale_dP, devPtrDescaledP}, + {scale_s, devPtrScaleS}, + {scale_dQ, devPtrScaledQ}, + {scale_dK, devPtrScaledK}, + {scale_dV, devPtrScaledV}, + {scale_dP, devPtrScaledP}, + {dQ, devPtrdQ}, + {dK, devPtrdK}, + {dV, devPtrdV}, + {amax_dQ, devPtrAmaxdQ}, + {amax_dK, devPtrAmaxdK}, + {amax_dV, devPtrAmaxdV}, + {amax_dP, devPtrAmaxdP}, + }; + + // if (is_bias) { + // variant_pack[bias] = devPtrBias; + // if ((bias_b == 1) && (bias_h == h)) { + // variant_pack[dBias] = devPtrdBias; + // } else { + // variant_pack[dBias] = nullptr; + // } + // } + + // if (is_padding) { + // constexpr size_t nthreads_per_block = 128; + // const size_t grid = (b + nthreads_per_block - 1) / nthreads_per_block; + // void *devActualSeqlenQ = static_cast(workspace) + plan_workspace_size; + // void *devActualSeqlenKV = static_cast(devActualSeqlenQ) + // + b * sizeof(int32_t); + // cu_seqlens_to_actual_seqlens<<>>( + // b, static_cast(devPtrCuSeqlensQ), + // static_cast(devPtrCuSeqlensKV), + // static_cast(devActualSeqlenQ), + // static_cast(devActualSeqlenKV)); + // variant_pack[seq_q] = devActualSeqlenQ; + // variant_pack[seq_kv] = devActualSeqlenKV; + // } + + // if (is_dropout) { + // variant_pack[dropout_seed] = devPtrDropoutSeed; + // variant_pack[dropout_offset] = devPtrDropoutOffset; + // } + + NVTE_CHECK_CUDNN_FE(mha_graph->execute(handle, variant_pack, workspace)); + } catch (cudnn_frontend::cudnnException &e) { + NVTE_ERROR(e.what()); + } +} + #endif } // namespace fused_attn @@ -1853,9 +2555,10 @@ void fused_attn_fp8_bwd_impl(int64_t b, 
int64_t h, int64_t s_q, int64_t s_kv, in #if (CUDNN_VERSION >= 8900) // fused attention FWD FP8 with packed QKV void fused_attn_fp8_fwd_qkvpacked( - size_t b, size_t h, size_t max_seqlen, size_t d, + size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, Tensor *input_output_S, Tensor *output_O, @@ -1866,11 +2569,18 @@ void fused_attn_fp8_fwd_qkvpacked( cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - // QKV shape is [total_seqs, 3, h, d] + const DType QKV_type = input_QKV->data.dtype; void* devPtrQKV = input_QKV->data.dptr; - void* devPtrQ = reinterpret_cast(devPtrQKV); - void* devPtrK = reinterpret_cast(reinterpret_cast(devPtrQKV) + h * d); - void* devPtrV = reinterpret_cast(reinterpret_cast(devPtrQKV) + 2 * h * d); + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + size_t stride = 0; + if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) { + stride = typeToSize(QKV_type) * num_attn_heads * head_dim; + } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) { + stride = typeToSize(QKV_type) * head_dim; + } + void *devPtrQ = static_cast(devPtrQKV); + void *devPtrK = static_cast(static_cast(devPtrQKV) + stride); + void *devPtrV = static_cast(static_cast(devPtrQKV) + 2 * stride); void* devPtrDescaleQ = input_QKV->scale_inv.dptr; void* devPtrDescaleK = input_QKV->scale_inv.dptr; void* devPtrDescaleV = input_QKV->scale_inv.dptr; @@ -1882,21 +2592,19 @@ void fused_attn_fp8_fwd_qkvpacked( void* devPtrM = nullptr; void* devPtrZInv = nullptr; if (Aux_CTX_Tensors->size == 0) { - if (is_training) { - Aux_CTX_Tensors->size = 3; - Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); - Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); - Tensor *output_rng_state = 
reinterpret_cast(Aux_CTX_Tensors->tensors[2]); - output_M->data.dptr = nullptr; - output_M->data.shape = {b, h, max_seqlen, 1}; - output_M->data.dtype = DType::kFloat32; - output_ZInv->data.dptr = nullptr; - output_ZInv->data.shape = {b, h, max_seqlen, 1}; - output_ZInv->data.dtype = DType::kFloat32; - output_rng_state->data.dptr = nullptr; - output_rng_state->data.shape = {2}; - output_rng_state->data.dtype = DType::kInt64; - } + Aux_CTX_Tensors->size = 3; + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + output_M->data.dptr = nullptr; + output_M->data.shape = {batch, num_attn_heads, max_seqlen, 1}; + output_M->data.dtype = DType::kFloat32; + output_ZInv->data.dptr = nullptr; + output_ZInv->data.shape = {batch, num_attn_heads, max_seqlen, 1}; + output_ZInv->data.dtype = DType::kFloat32; + output_rng_state->data.dptr = nullptr; + output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; } else if (Aux_CTX_Tensors->size == 3) { Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); @@ -1919,11 +2627,27 @@ void fused_attn_fp8_fwd_qkvpacked( void* devPtrDropoutOffset = reinterpret_cast( reinterpret_cast(rng_state->data.dptr) + 1); - const DType QKV_type = input_QKV->data.dtype; size_t workspace_size = 0; + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) { + fused_attn::fused_attn_fp8_fwd_impl_v1( + batch, num_attn_heads, num_attn_heads, max_seqlen, max_seqlen, head_dim, + is_training, attn_scale, p_dropout, qkv_layout, bias_type, mask_type, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleS, devPtrScaleS, 
devPtrScaleO, + devPtrAmaxO, devPtrAmaxS, + devPtrcuSeqlens, devPtrcuSeqlens, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_fe_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else if (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) { fused_attn::fused_attn_fp8_fwd_impl( - b, h, max_seqlen, max_seqlen, d, + batch, num_attn_heads, max_seqlen, max_seqlen, head_dim, is_training, attn_scale, p_dropout, qkv_layout, devPtrQ, devPtrK, devPtrV, devPtrM, devPtrZInv, @@ -1935,6 +2659,9 @@ void fused_attn_fp8_fwd_qkvpacked( devPtrDropoutSeed, devPtrDropoutOffset, get_cudnn_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle); + } else { + NVTE_ERROR("FP8 fused attention only supports qkv_layout=t3hd or qkv_format=bshd/sbhd. \n"); + } if (workspace_size > 0) { if (workspace->data.dptr == nullptr) { @@ -1950,8 +2677,9 @@ void fused_attn_fp8_fwd_qkvpacked( } // fused attention BWD FP8 with packed QKV void fused_attn_fp8_bwd_qkvpacked( - size_t b, size_t h, size_t max_seqlen, size_t d, + size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, const Tensor *input_O, const Tensor *input_dO, @@ -1966,11 +2694,19 @@ void fused_attn_fp8_bwd_qkvpacked( cudaStream_t stream, cudnnHandle_t handle) { using namespace transformer_engine; - // QKV shape is [total_seqs, 3, h, d] + const DType QKV_type = input_QKV->data.dtype; + const DType dQKV_type = output_dQKV->data.dtype; void* devPtrQKV = input_QKV->data.dptr; - void* devPtrQ = reinterpret_cast(devPtrQKV); - void* devPtrK = reinterpret_cast(reinterpret_cast(devPtrQKV) + h * d); - void* devPtrV = reinterpret_cast(reinterpret_cast(devPtrQKV) + 2 * h * d); + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + size_t stride = 0; + if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) { + stride = 
typeToSize(QKV_type) * num_attn_heads * head_dim; + } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) { + stride = typeToSize(QKV_type) * head_dim; + } + void *devPtrQ = devPtrQKV; + void *devPtrK = static_cast(static_cast(devPtrQKV) + stride); + void *devPtrV = static_cast(static_cast(devPtrQKV) + 2 * stride); void* devPtrDescaleQ = input_QKV->scale_inv.dptr; void* devPtrDescaleK = input_QKV->scale_inv.dptr; void* devPtrDescaleV = input_QKV->scale_inv.dptr; @@ -1985,15 +2721,14 @@ void fused_attn_fp8_bwd_qkvpacked( void* devPtrScaleS = input_S->scale.dptr; void* devPtrDescaleS = input_S->scale_inv.dptr; - void* devPtrAmaxdS = input_output_dP->amax.dptr; - void* devPtrScaledS = input_output_dP->scale.dptr; - void* devPtrDescaledS = input_output_dP->scale_inv.dptr; - - // dQKV shape is [total_seqs, 3, h, d] - void* devPtrdQKV = output_dQKV->data.dptr; - void* devPtrdQ = reinterpret_cast(devPtrdQKV); - void* devPtrdK = reinterpret_cast(reinterpret_cast(devPtrdQKV) + h * d); - void* devPtrdV = reinterpret_cast(reinterpret_cast(devPtrdQKV) + 2 * h * d); + void* devPtrAmaxdP = input_output_dP->amax.dptr; + void* devPtrScaledP = input_output_dP->scale.dptr; + void* devPtrDescaledP = input_output_dP->scale_inv.dptr; + + void *devPtrdQKV = output_dQKV->data.dptr; + void *devPtrdQ = devPtrdQKV; + void *devPtrdK = static_cast(static_cast(devPtrdQKV) + stride); + void *devPtrdV = static_cast(static_cast(devPtrdQKV) + 2 * stride); void* devPtrAmaxdQ = output_dQKV->amax.dptr; void* devPtrAmaxdK = output_dQKV->amax.dptr; void* devPtrAmaxdV = output_dQKV->amax.dptr; @@ -2008,11 +2743,33 @@ void fused_attn_fp8_bwd_qkvpacked( void* devPtrDropoutOffset = reinterpret_cast( reinterpret_cast(rng_state->data.dptr) + 1); - const DType QKV_type = input_QKV->data.dtype; size_t workspace_size = 0; + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) { + 
fused_attn::fused_attn_fp8_bwd_impl_v1( + batch, num_attn_heads, num_attn_heads, max_seqlen, max_seqlen, head_dim, + attn_scale, p_dropout, qkv_layout, bias_type, mask_type, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, devPtrdO, + devPtrdQ, devPtrdK, devPtrdV, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleO, devPtrDescaledO, + devPtrDescaleS, devPtrDescaledP, + devPtrScaleS, devPtrScaledP, + devPtrScaledQ, devPtrScaledK, devPtrScaledV, + devPtrAmaxdP, + devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, + devPtrcuSeqlens, devPtrcuSeqlens, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_fe_dtype(QKV_type), + get_cudnn_fe_dtype(dQKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else if (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) { fused_attn::fused_attn_fp8_bwd_impl( - b, h, max_seqlen, max_seqlen, d, + batch, num_attn_heads, max_seqlen, max_seqlen, head_dim, attn_scale, p_dropout, qkv_layout, devPtrQ, devPtrK, devPtrV, devPtrM, devPtrZInv, @@ -2020,15 +2777,278 @@ void fused_attn_fp8_bwd_qkvpacked( devPtrdQ, devPtrdK, devPtrdV, devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, devPtrDescaleO, devPtrDescaledO, - devPtrDescaleS, devPtrDescaledS, - devPtrScaleS, devPtrScaledS, + devPtrDescaleS, devPtrDescaledP, + devPtrScaleS, devPtrScaledP, devPtrScaledQ, devPtrScaledK, devPtrScaledV, - devPtrAmaxdS, + devPtrAmaxdP, devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, devPtrcuSeqlens, devPtrcuSeqlens, devPtrDropoutSeed, devPtrDropoutOffset, get_cudnn_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle); + } else { + NVTE_ERROR("FP8 fused attention only supports qkv_layout=t3hd or qkv_format=bshd/sbhd. 
\n"); + } + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = { workspace_size }; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = { 1 }; + workspace->data.dtype = DType::kByte; + return; + } +} +// fused attention FWD FP8 with packed KV +void fused_attn_fp8_fwd_kvpacked( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, + bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, + const Tensor *input_KV, + Tensor *input_output_S, + Tensor *output_O, + NVTETensorPack* Aux_CTX_Tensors, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle) { + using namespace transformer_engine; + const DType QKV_type = input_Q->data.dtype; + void* devPtrQ = input_Q->data.dptr; + void *devPtrKV = input_KV->data.dptr; + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + size_t stride = 0; + if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) { + stride = typeToSize(QKV_type) * num_gqa_groups * head_dim; + } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) { + stride = typeToSize(QKV_type) * head_dim; + } + void *devPtrK = devPtrKV; + void *devPtrV = static_cast(static_cast(devPtrKV) + stride); + void* devPtrDescaleQ = input_Q->scale_inv.dptr; + void* devPtrDescaleK = input_KV->scale_inv.dptr; + void* devPtrDescaleV = input_KV->scale_inv.dptr; + + void* devPtrO = output_O->data.dptr; + void* devPtrAmaxO = output_O->amax.dptr; + void* devPtrScaleO = output_O->scale.dptr; + + void* devPtrM = nullptr; + void* devPtrZInv = nullptr; + if (Aux_CTX_Tensors->size == 0) { + Aux_CTX_Tensors->size = 3; + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); 
+ Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + output_M->data.dptr = nullptr; + output_M->data.shape = {batch, num_attn_heads, max_seqlen_q, 1}; + output_M->data.dtype = DType::kFloat32; + output_ZInv->data.dptr = nullptr; + output_ZInv->data.shape = {batch, num_attn_heads, max_seqlen_q, 1}; + output_ZInv->data.dtype = DType::kFloat32; + output_rng_state->data.dptr = nullptr; + output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; + } else if (Aux_CTX_Tensors->size == 3) { + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + devPtrM = output_M->data.dptr; + devPtrZInv = output_ZInv->data.dptr; + output_rng_state->data.dptr = rng_state->data.dptr; + } else { + NVTE_ERROR("Unexpected Aux_CTX_Tensors->size."); + } + + void* devPtrAmaxS = input_output_S->amax.dptr; + void* devPtrScaleS = input_output_S->scale.dptr; + void* devPtrDescaleS = input_output_S->scale_inv.dptr; + + void* devPtrcuSeqlensQ = reinterpret_cast( + reinterpret_cast(cu_seqlens_q->data.dptr)); + void* devPtrcuSeqlensKV = reinterpret_cast( + reinterpret_cast(cu_seqlens_kv->data.dptr)); + void* devPtrDropoutSeed = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr)); + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + size_t workspace_size = 0; + + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) { + fused_attn::fused_attn_fp8_fwd_impl_v1( + batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim, + is_training, attn_scale, p_dropout, qkv_layout, bias_type, mask_type, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, 
+ devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleS, devPtrScaleS, devPtrScaleO, + devPtrAmaxO, devPtrAmaxS, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_fe_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else if (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) { + fused_attn::fused_attn_fp8_fwd_impl( + batch, num_attn_heads, max_seqlen_q, max_seqlen_kv, head_dim, + is_training, attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleS, devPtrScaleS, devPtrScaleO, + devPtrAmaxO, devPtrAmaxS, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else { + NVTE_ERROR("FP8 fused attention only supports qkv_layout=t3hd or qkv_format=bshd/sbhd. \n"); + } + + if (workspace_size > 0) { + if (workspace->data.dptr == nullptr) { + workspace->data.shape = { workspace_size }; + workspace->data.dtype = DType::kByte; + return; + } + } else if (workspace_size == 0) { + workspace->data.shape = { 1 }; + workspace->data.dtype = DType::kByte; + return; + } +} +// fused attention BWD FP8 with packed KV +void fused_attn_fp8_bwd_kvpacked( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, + const Tensor *input_KV, + const Tensor *input_O, + const Tensor *input_dO, + const Tensor *input_M, + const Tensor *input_ZInv, + const Tensor *input_S, + Tensor *input_output_dP, + const Tensor *output_dQ, + const Tensor *output_dKV, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t 
handle) { + using namespace transformer_engine; + const DType QKV_type = input_Q->data.dtype; + const DType dQKV_type = output_dQ->data.dtype; + void *devPtrQ = input_Q->data.dptr; + void *devPtrKV = input_KV->data.dptr; + NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + size_t stride = 0; + if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) { + stride = typeToSize(QKV_type) * num_gqa_groups * head_dim; + } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) { + stride = typeToSize(QKV_type) * head_dim; + } + void *devPtrK = devPtrKV; + void *devPtrV = static_cast(static_cast(devPtrKV) + stride); + void* devPtrDescaleQ = input_Q->scale_inv.dptr; + void* devPtrDescaleK = input_KV->scale_inv.dptr; + void* devPtrDescaleV = input_KV->scale_inv.dptr; + + void* devPtrO = input_O->data.dptr; + void* devPtrDescaleO = input_O->scale_inv.dptr; + void* devPtrdO = input_dO->data.dptr; + void* devPtrDescaledO = input_dO->scale_inv.dptr; + + void* devPtrM = input_M->data.dptr; + void* devPtrZInv = input_ZInv->data.dptr; + + void* devPtrScaleS = input_S->scale.dptr; + void* devPtrDescaleS = input_S->scale_inv.dptr; + void* devPtrAmaxdP = input_output_dP->amax.dptr; + void* devPtrScaledP = input_output_dP->scale.dptr; + void* devPtrDescaledP = input_output_dP->scale_inv.dptr; + + void *devPtrdQ = output_dQ->data.dptr; + void *devPtrdKV = output_dKV->data.dptr; + void *devPtrdK = devPtrdKV; + void *devPtrdV = static_cast(static_cast(devPtrdKV) + stride); + void* devPtrAmaxdQ = output_dQ->amax.dptr; + void* devPtrAmaxdK = output_dKV->amax.dptr; + void* devPtrAmaxdV = output_dKV->amax.dptr; + void* devPtrScaledQ = output_dQ->scale.dptr; + void* devPtrScaledK = output_dKV->scale.dptr; + void* devPtrScaledV = output_dKV->scale.dptr; + + void* devPtrcuSeqlensQ = reinterpret_cast( + reinterpret_cast(cu_seqlens_q->data.dptr)); + void* devPtrcuSeqlensKV = reinterpret_cast( + reinterpret_cast(cu_seqlens_kv->data.dptr)); + void* devPtrDropoutSeed = 
reinterpret_cast( + reinterpret_cast(rng_state->data.dptr)); + void* devPtrDropoutOffset = reinterpret_cast( + reinterpret_cast(rng_state->data.dptr) + 1); + + size_t workspace_size = 0; + + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) { + fused_attn::fused_attn_fp8_bwd_impl_v1( + batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim, + attn_scale, p_dropout, qkv_layout, bias_type, mask_type, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, devPtrdO, + devPtrdQ, devPtrdK, devPtrdV, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleO, devPtrDescaledO, + devPtrDescaleS, devPtrDescaledP, + devPtrScaleS, devPtrScaledP, + devPtrScaledQ, devPtrScaledK, devPtrScaledV, + devPtrAmaxdP, + devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_fe_dtype(QKV_type), + get_cudnn_fe_dtype(dQKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else if (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) { + fused_attn::fused_attn_fp8_bwd_impl( + batch, num_attn_heads, max_seqlen_q, max_seqlen_kv, head_dim, + attn_scale, p_dropout, qkv_layout, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, devPtrdO, + devPtrdQ, devPtrdK, devPtrdV, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleO, devPtrDescaledO, + devPtrDescaleS, devPtrDescaledP, + devPtrScaleS, devPtrScaledP, + devPtrScaledQ, devPtrScaledK, devPtrScaledV, + devPtrAmaxdP, + devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else { + NVTE_ERROR("FP8 fused attention only supports qkv_layout=t3hd or qkv_format=bshd/sbhd. 
\n"); + } if (workspace_size > 0) { if (workspace->data.dptr == nullptr) { @@ -2044,9 +3064,11 @@ void fused_attn_fp8_bwd_qkvpacked( } // fused attention FWD FP8 with separate Q, K, V void fused_attn_fp8_fwd( - size_t b, size_t h, size_t max_seqlen_q, size_t max_seqlen_kv, size_t d, + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, @@ -2074,21 +3096,19 @@ void fused_attn_fp8_fwd( void* devPtrM = nullptr; void* devPtrZInv = nullptr; if (Aux_CTX_Tensors->size == 0) { - if (is_training) { - Aux_CTX_Tensors->size = 3; - Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); - Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); - Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); - output_M->data.dptr = nullptr; - output_M->data.shape = {b, h, max_seqlen_q, 1}; - output_M->data.dtype = DType::kFloat32; - output_ZInv->data.dptr = nullptr; - output_ZInv->data.shape = {b, h, max_seqlen_q, 1}; - output_ZInv->data.dtype = DType::kFloat32; - output_rng_state->data.dptr = nullptr; - output_rng_state->data.shape = {2}; - output_rng_state->data.dtype = DType::kInt64; - } + Aux_CTX_Tensors->size = 3; + Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); + Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); + Tensor *output_rng_state = reinterpret_cast(Aux_CTX_Tensors->tensors[2]); + output_M->data.dptr = nullptr; + output_M->data.shape = {batch, num_attn_heads, max_seqlen_q, 1}; + output_M->data.dtype = DType::kFloat32; + output_ZInv->data.dptr = nullptr; + output_ZInv->data.shape = {batch, num_attn_heads, max_seqlen_q, 1}; + output_ZInv->data.dtype = DType::kFloat32; + output_rng_state->data.dptr = nullptr; + 
output_rng_state->data.shape = {2}; + output_rng_state->data.dtype = DType::kInt64; } else if (Aux_CTX_Tensors->size == 3) { Tensor *output_M = reinterpret_cast(Aux_CTX_Tensors->tensors[0]); Tensor *output_ZInv = reinterpret_cast(Aux_CTX_Tensors->tensors[1]); @@ -2116,8 +3136,25 @@ void fused_attn_fp8_fwd( const DType QKV_type = input_Q->data.dtype; size_t workspace_size = 0; + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) { + fused_attn::fused_attn_fp8_fwd_impl_v1( + batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim, + is_training, attn_scale, p_dropout, qkv_layout, bias_type, mask_type, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleS, devPtrScaleS, devPtrScaleO, + devPtrAmaxO, devPtrAmaxS, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_fe_dtype(QKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else if (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) { fused_attn::fused_attn_fp8_fwd_impl( - b, h, max_seqlen_q, max_seqlen_kv, d, + batch, num_attn_heads, max_seqlen_q, max_seqlen_kv, head_dim, is_training, attn_scale, p_dropout, qkv_layout, devPtrQ, devPtrK, devPtrV, devPtrM, devPtrZInv, @@ -2129,6 +3166,9 @@ void fused_attn_fp8_fwd( devPtrDropoutSeed, devPtrDropoutOffset, get_cudnn_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle); + } else { + NVTE_ERROR("FP8 fused attention only supports qkv_layout=t3hd or qkv_format=bshd/sbhd. 
\n"); + } if (workspace_size > 0) { if (workspace->data.dptr == nullptr) { @@ -2144,8 +3184,10 @@ void fused_attn_fp8_fwd( } // fused attention BWD FP8 with separate Q, K, V void fused_attn_fp8_bwd( - size_t b, size_t h, size_t max_seqlen_q, size_t max_seqlen_kv, size_t d, + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, @@ -2182,9 +3224,9 @@ void fused_attn_fp8_bwd( void* devPtrScaleS = input_S->scale.dptr; void* devPtrDescaleS = input_S->scale_inv.dptr; - void* devPtrAmaxdS = input_output_dP->amax.dptr; - void* devPtrScaledS = input_output_dP->scale.dptr; - void* devPtrDescaledS = input_output_dP->scale_inv.dptr; + void* devPtrAmaxdP = input_output_dP->amax.dptr; + void* devPtrScaledP = input_output_dP->scale.dptr; + void* devPtrDescaledP = input_output_dP->scale_inv.dptr; void* devPtrdQ = output_dQ->data.dptr; void* devPtrdK = output_dK->data.dptr; @@ -2206,10 +3248,34 @@ void fused_attn_fp8_bwd( reinterpret_cast(rng_state->data.dptr) + 1); const DType QKV_type = input_Q->data.dtype; + const DType dQKV_type = output_dQ->data.dtype; size_t workspace_size = 0; + NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if ((qkv_format == NVTE_QKV_Format::NVTE_BSHD) + || (qkv_format == NVTE_QKV_Format::NVTE_SBHD)) { + fused_attn::fused_attn_fp8_bwd_impl_v1( + batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim, + attn_scale, p_dropout, qkv_layout, bias_type, mask_type, + devPtrQ, devPtrK, devPtrV, + devPtrM, devPtrZInv, + devPtrO, devPtrdO, + devPtrdQ, devPtrdK, devPtrdV, + devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, + devPtrDescaleO, devPtrDescaledO, + devPtrDescaleS, devPtrDescaledP, + devPtrScaleS, devPtrScaledP, + devPtrScaledQ, devPtrScaledK, devPtrScaledV, + 
devPtrAmaxdP, + devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, + devPtrcuSeqlensQ, devPtrcuSeqlensKV, + devPtrDropoutSeed, devPtrDropoutOffset, + get_cudnn_fe_dtype(QKV_type), + get_cudnn_fe_dtype(dQKV_type), + workspace->data.dptr, &workspace_size, stream, handle); + } else if (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD) { fused_attn::fused_attn_fp8_bwd_impl( - b, h, max_seqlen_q, max_seqlen_kv, d, + batch, num_attn_heads, max_seqlen_q, max_seqlen_kv, head_dim, attn_scale, p_dropout, qkv_layout, devPtrQ, devPtrK, devPtrV, devPtrM, devPtrZInv, @@ -2217,15 +3283,18 @@ void fused_attn_fp8_bwd( devPtrdQ, devPtrdK, devPtrdV, devPtrDescaleQ, devPtrDescaleK, devPtrDescaleV, devPtrDescaleO, devPtrDescaledO, - devPtrDescaleS, devPtrDescaledS, - devPtrScaleS, devPtrScaledS, + devPtrDescaleS, devPtrDescaledP, + devPtrScaleS, devPtrScaledP, devPtrScaledQ, devPtrScaledK, devPtrScaledV, - devPtrAmaxdS, + devPtrAmaxdP, devPtrAmaxdQ, devPtrAmaxdK, devPtrAmaxdV, devPtrcuSeqlensQ, devPtrcuSeqlensKV, devPtrDropoutSeed, devPtrDropoutOffset, get_cudnn_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle); + } else { + NVTE_ERROR("FP8 fused attention only supports qkv_layout=t3hd or qkv_format=bshd/sbhd. 
\n"); + } if (workspace_size > 0) { if (workspace->data.dptr == nullptr) { diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index 3373e0cb3b..3b0ea6c2c2 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -14,9 +14,10 @@ namespace transformer_engine { #if (CUDNN_VERSION >= 8900) // fused attention FWD FP8 with packed QKV void fused_attn_fp8_fwd_qkvpacked( - size_t b, size_t h, size_t max_seqlen, size_t d, + size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, Tensor *input_output_S, Tensor *output_O, @@ -29,8 +30,9 @@ void fused_attn_fp8_fwd_qkvpacked( // fused attention BWD FP8 with packed QKV void fused_attn_fp8_bwd_qkvpacked( - size_t b, size_t h, size_t max_seqlen, size_t d, + size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_QKV, const Tensor *input_O, const Tensor *input_dO, @@ -45,11 +47,55 @@ void fused_attn_fp8_bwd_qkvpacked( cudaStream_t stream, cudnnHandle_t handle); +// fused attention FWD FP8 with packed KV +void fused_attn_fp8_fwd_kvpacked( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, + bool is_training, float attn_scale, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, + const Tensor *input_KV, + Tensor *input_output_S, + Tensor *output_O, + NVTETensorPack* Aux_CTX_Tensors, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + 
cudnnHandle_t handle); + +// fused attention BWD FP8 with packed KV +void fused_attn_fp8_bwd_kvpacked( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, + float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + const Tensor *input_Q, + const Tensor *input_KV, + const Tensor *input_O, + const Tensor *input_dO, + const Tensor *input_M, + const Tensor *input_ZInv, + const Tensor *input_S, + Tensor *input_output_dP, + const Tensor *output_dQ, + const Tensor *output_dKV, + const Tensor *cu_seqlens_q, + const Tensor *cu_seqlens_kv, + const Tensor *rng_state, + Tensor *workspace, + cudaStream_t stream, + cudnnHandle_t handle); + // fused attention FWD FP8 with separate Q, K, V void fused_attn_fp8_fwd( - size_t b, size_t h, size_t max_seqlen_q, size_t max_seqlen_kv, size_t d, + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, Tensor *input_output_S, Tensor *output_O, @@ -63,8 +109,10 @@ void fused_attn_fp8_fwd( // fused attention BWD FP8 with separate Q, K, V void fused_attn_fp8_bwd( - size_t b, size_t h, size_t max_seqlen_q, size_t max_seqlen_kv, size_t d, + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, const Tensor *input_O, const Tensor *input_dO, diff --git a/transformer_engine/common/fused_attn/utils.h b/transformer_engine/common/fused_attn/utils.h index 49d056ff1c..11da5cf56c 100644 --- 
a/transformer_engine/common/fused_attn/utils.h +++ b/transformer_engine/common/fused_attn/utils.h @@ -111,19 +111,20 @@ struct FADescriptor_v1 { NVTE_QKV_Layout layout; NVTE_Bias_Type bias_type; NVTE_Mask_Type mask_type; - cudnn_frontend::DataType_t tensor_type; + cudnn_frontend::DataType_t fwd_tensor_type; + cudnn_frontend::DataType_t bwd_tensor_type; bool operator<(const FADescriptor_v1 &rhs) const { return std::tie(b, h, hg, s_q, s_kv, d, bias_b, bias_h, attnScale, isTraining, dropoutProbability, - layout, mask_type, bias_type, tensor_type) + layout, mask_type, bias_type, fwd_tensor_type, bwd_tensor_type) < std::tie( rhs.b, rhs.h, rhs.hg, rhs.s_q, rhs.s_kv, rhs.d, rhs.bias_b, rhs.bias_h, rhs.attnScale, rhs.isTraining, rhs.dropoutProbability, rhs.layout, rhs.mask_type, rhs.bias_type, - rhs.tensor_type); + rhs.fwd_tensor_type, rhs.bwd_tensor_type); } }; diff --git a/transformer_engine/common/recipe/__init__.py b/transformer_engine/common/recipe/__init__.py index 9abbb69cbe..989dd03d62 100644 --- a/transformer_engine/common/recipe/__init__.py +++ b/transformer_engine/common/recipe/__init__.py @@ -96,7 +96,7 @@ def scaling_factor_compute(amax: Tensor, where `Tensor` is a framework tensor type. override_linear_precision: Tuple(bool, bool, bool), default=(False, False, False) - Whether or not the execute the `fprop`, `dgrad`, and `wgrad` + Whether or not to execute the `fprop`, `dgrad`, and `wgrad` GEMMs (respectively) in higher precision when using FP8. reduce_amax: bool, default = `True` By default, if `torch.distributed` is initialized, the `amax` value for FP8 @@ -106,6 +106,20 @@ def scaling_factor_compute(amax: Tensor, GPU maintains local amaxes and scaling factors. To ensure results are numerically identical across checkpointing boundaries in this case, all ranks must checkpoint in order to store the local tensors. + fp8_dpa: bool, default = `False` + Whether to enable FP8 dot product attention (DPA). 
When the model is placed in an + `fp8_autocast(enabled=True)` region and `fp8_dpa` is set to `True`, DPA casts the + inputs from higher precision to FP8, performs attention in FP8, and casts tensors + back to higher precision as outputs. FP8 DPA currently is only supported in the + `FusedAttention` backend. + fp8_mha: bool, default = `False` + Whether to enable FP8 multi-head attention (MHA). When `True`, it removes the casting + operations mentioned above at the DPA boundaries. Currently only standard MHA modules + i.e. `LayerNormLinear/Linear + DPA + Linear`, are supported for this feature. When + `fp8_mha = False, fp8_dpa = True`, a typical MHA module works as + `LayerNormLinear (BF16 output) -> (cast to FP8 ) FP8 DPA (cast to BF16) -> Linear`. + When `fp8_mha = True, fp8_dpa = True`, it becomes + `LayerNormLinear (FP8 output) -> FP8 DPA -> Linear`. Notes ----- @@ -116,6 +130,9 @@ def scaling_factor_compute(amax: Tensor, FP8_MAX = maximum_representable_value(fp8_format) new_scaling_factor = (FP8_MAX / amax) / (2 ^ margin) + + * `fp8_dpa` and `fp8_mha` are Beta features, and their API and functionality are + subject to change in future Transformer Engine releases. """ margin: int = 0 @@ -126,6 +143,8 @@ def scaling_factor_compute(amax: Tensor, override_linear_precision: _OverrideLinearPrecision = _OverrideLinearPrecision() scaling_factor_compute_algo: Optional[Callable] = None reduce_amax: bool = True + fp8_dpa: bool = False + fp8_mha: bool = False def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." 
diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 862ae8adf8..4bb39b913f 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -18,6 +18,10 @@ import torch.nn.functional as F import transformer_engine_extensions as tex +from transformer_engine.pytorch.cpp_extensions import ( + cast_to_fp8, + cast_from_fp8, +) from transformer_engine.pytorch.cpp_extensions.fused_attn import ( fused_attn_fwd_qkvpacked, fused_attn_bwd_qkvpacked, @@ -30,7 +34,10 @@ AttnMaskType, FusedAttnBackend, ) +from transformer_engine.pytorch.fp8 import get_fp8_te_dtype +from transformer_engine.pytorch.float8_tensor import Float8Tensor from transformer_engine.pytorch.module import LayerNormLinear, Linear +from transformer_engine.pytorch.module.base import TransformerEngineBaseModule from transformer_engine.pytorch.utils import ( divide, attention_mask_func, @@ -73,6 +80,12 @@ from flash_attn.flash_attn_interface import _flash_attn_varlen_forward as _flash_attn_forward # pylint: disable=no-name-in-module,ungrouped-imports from flash_attn.flash_attn_interface import _flash_attn_varlen_backward as _flash_attn_backward # pylint: disable=no-name-in-module +META_QKV = tex.FP8FwdTensors.GEMM1_OUTPUT +META_DQKV = tex.FP8BwdTensors.GRAD_OUTPUT1 +META_O = tex.FP8FwdTensors.GEMM2_INPUT +META_DO = tex.FP8BwdTensors.GRAD_INPUT2 +META_S = tex.FP8FwdTensors.GEMM3_OUTPUT +META_DP = tex.FP8BwdTensors.GRAD_INPUT3 _NVTE_DEBUG = int(os.getenv("NVTE_DEBUG", "0")) _alibi_cache = { @@ -811,7 +824,7 @@ def backward(ctx, dout): dq_, dk_, dv_, _ = fused_attn_bwd( ctx.max_seqlen_q, ctx.max_seqlen_k, cu_seqlens_q, cu_seqlens_k, - q_, kv_[0], kv_[1], out_, dout_, TE_DType[q.dtype], + q_, kv_[0], kv_[1], out_, dout_, TE_DType[q.dtype], TE_DType[kv.dtype], [softmax_lse, ctx.rng_states[cp_size-i-1]], tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen, attn_scale=ctx.softmax_scale, @@ -851,7 +864,7 @@ def backward(ctx, dout): 
dq_, dk_, dv_, _ = fused_attn_bwd( ctx.max_seqlen_q, ctx.max_seqlen_k//2, cu_seqlens_q, cu_seqlens_k//2, - q_, kv_[0], kv_[1], out_, dout_, TE_DType[q.dtype], + q_, kv_[0], kv_[1], out_, dout_, TE_DType[q.dtype], TE_DType[kv.dtype], [softmax_lse, ctx.rng_states[cp_size-i-1]], tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen, attn_scale=ctx.softmax_scale, @@ -891,7 +904,7 @@ def backward(ctx, dout): dq_, dk_, dv_, _ = fused_attn_bwd( ctx.max_seqlen_q//2, ctx.max_seqlen_k, cu_seqlens_q//2, cu_seqlens_k, - q_, kv_[0], kv_[1], out_, dout_, TE_DType[q.dtype], + q_, kv_[0], kv_[1], out_, dout_, TE_DType[q.dtype], TE_DType[kv.dtype], [softmax_lse_, ctx.rng_states[cp_size-i-1]], tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen, attn_scale=ctx.softmax_scale, @@ -924,7 +937,7 @@ def backward(ctx, dout): dq_, dk_, dv_, _ = fused_attn_bwd( ctx.max_seqlen_q, ctx.max_seqlen_k, cu_seqlens_q, cu_seqlens_k, - q, kv[0], kv[1], out, dout, TE_DType[q.dtype], + q, kv[0], kv[1], out, dout, TE_DType[q.dtype], TE_DType[kv.dtype], [softmax_lse, ctx.rng_states[cp_size-i-1]], tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen, attn_scale=ctx.softmax_scale, @@ -1247,6 +1260,14 @@ def forward(ctx, ) -> Tuple[torch.Tensor, ...]: ctx.split_dim = split_dim ctx.split_size_or_sections = split_size_or_sections + if isinstance(mixed_x_layer, Float8Tensor): + return tuple(Float8Tensor.make_like( + mixed_x_layer, + data=x, + ) for x in torch.split( + mixed_x_layer._data, + split_size_or_sections=split_size_or_sections, + dim=split_dim)) return torch.split(mixed_x_layer, split_size_or_sections, dim = split_dim) @staticmethod @@ -1263,6 +1284,37 @@ def backward(ctx, dims = len(grad_outputs[0].shape) split_dim = (ctx.split_dim + dims) % dims + if isinstance(grad_outputs[0], Float8Tensor): + noop_ok = True + strides = grad_outputs[0].stride() + data_ptr = grad_outputs[0]._data.untyped_storage().data_ptr() + shape = list(grad_outputs[0].shape) + for i, tensor in enumerate(grad_outputs): + 
shape_i = shape + shape_i[split_dim] = split_sizes[i] + offset_size = sum(split_sizes[:i]) * np.prod(shape[split_dim+1:]) + if (tensor.stride() != strides or + list(tensor.shape) != shape_i or + tensor._data.untyped_storage().data_ptr() != data_ptr or + tensor.storage_offset() != offset_size): + noop_ok = False + break + if noop_ok: + ret = torch.Tensor().to(device=grad_outputs[0].device, + dtype=grad_outputs[0]._data.dtype) + new_shape = list(shape) + new_shape[split_dim] = sum(split_sizes) + ret.set_(grad_outputs[0]._data.untyped_storage(), + grad_outputs[0]._data.storage_offset(), + new_shape, + strides + ) + return Float8Tensor.make_like(grad_outputs[0], data=ret), None, None + + grad_outputs_data = [x._data for x in grad_outputs] + return Float8Tensor.make_like( + grad_outputs[0], + data=torch.cat(grad_outputs_data, dim = split_dim)), None, None noop_ok = True strides = grad_outputs[0].stride() data_ptr = grad_outputs[0].untyped_storage().data_ptr() @@ -1277,7 +1329,6 @@ def backward(ctx, tensor.storage_offset() != offset_size): noop_ok = False break - if noop_ok: ret = torch.Tensor().to(device=grad_outputs[0].device, dtype=grad_outputs[0].dtype) @@ -1849,6 +1900,35 @@ def forward( return output +def _combine_tensors( + tensors: List[torch.Tensor], + dim: int, + ) -> torch.Tensor: + """Combine tensors along a particular dimension""" + + num_tensors = len(tensors) + new_shape = list(tensors[0].shape) + new_shape.insert(dim, num_tensors) + new_stride = list(tensors[0].stride()) + new_stride.insert(dim, int(new_stride[dim-1]/num_tensors)) + if isinstance(tensors[0], Float8Tensor): + combined_tensor = torch.Tensor().to( + device=tensors[0].device, dtype=tensors[0]._data.dtype) + combined_tensor.set_( + tensors[0]._data.untyped_storage(), + tensors[0]._data.storage_offset(), + new_shape, new_stride) + combined_tensor = Float8Tensor.make_like( + tensors[0], data=combined_tensor) + else: + combined_tensor = torch.Tensor().to( + device=tensors[0].device, 
dtype=tensors[0].dtype) + combined_tensor.set_( + tensors[0].untyped_storage(), + tensors[0].storage_offset(), + new_shape, new_stride) + + return combined_tensor class FusedAttnFunc_qkvpacked(torch.autograd.Function): """Function for FusedAttention with packed QKV input""" @@ -1856,15 +1936,83 @@ class FusedAttnFunc_qkvpacked(torch.autograd.Function): @staticmethod def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, - rng_gen, fused_attention_backend, use_FAv2_bwd): - out, aux_ctx_tensors = fused_attn_fwd_qkvpacked( - is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, - fused_attention_backend, attn_bias, - None, None, None, None, None, - attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, - rng_gen) - - ctx.save_for_backward(qkv, out, cu_seqlens) + rng_gen, fused_attention_backend, use_FAv2_bwd, + fp8, fp8_meta, tp_size, tp_group): + if fp8: + if _NVTE_DEBUG: + print('[DotProductAttention]: using FP8 forward') + if fp8_meta["recipe"].fp8_mha: + assert (isinstance(qkv, Float8Tensor)), "qkv must be Float8Tensors for FP8 MHA." + fp8_meta["scaling_fwd"].scale_inv[META_QKV] = qkv._scale_inv + fused_attention_backend = FusedAttnBackend["FP8"] + fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + # 1: qkv packed, 2: kv packed, 3: qkv separate + qkv_group = len(qkv_layout.split('_')) + assert (qkv_group == 1 + ), f"qkv layout should conform to 3hd or h3d, e.g. sb3hd, \ + but found {qkv_layout}." 
+ if fp8_meta["recipe"].fp8_mha: + qkv_fp8 = qkv._data + else: + qkv_c = qkv.view(-1, qkv.shape[-3] * qkv.shape[-2] * qkv.shape[-1]) + qkv_fp8 = cast_to_fp8(qkv_c, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(qkv.shape) + out_fp8, aux_ctx_tensors = fused_attn_fwd_qkvpacked( + is_training, max_seqlen, cu_seqlens, + qkv_fp8, fp8_dtype_forward, fused_attention_backend, attn_bias, + fp8_meta["scaling_fwd"].scale_inv[META_QKV], + fp8_meta["scaling_fwd"].scale_inv[META_S], + fp8_meta["scaling_fwd"].scale[META_S], + fp8_meta["scaling_fwd"].scale[META_O], + fp8_meta["scaling_fwd"].amax_history[0][META_S], + fp8_meta["scaling_fwd"].amax_history[0][META_O], + attn_scale, dropout_p, fast_zero_fill, qkv_layout, + attn_bias_type, attn_mask_type, rng_gen) + if fp8_meta["recipe"].fp8_mha: + out_ret = Float8Tensor(data=out_fp8, + fp8_meta=fp8_meta, + fp8_meta_forward=True, + fp8_meta_index=META_O, + fp8_dtype=fp8_dtype_forward, + dtype=qkv.dtype, + ) + else: + out_ret = cast_from_fp8( + out_fp8.view(-1, out_fp8.shape[-2] * out_fp8.shape[-1]), + fp8_meta["scaling_fwd"], META_O, + fp8_dtype_forward, qkv_dtype).view(out_fp8.shape) + out_save = out_ret + if fp8_meta["recipe"].fp8_mha and not int(os.getenv("NVTE_FP8_DPA_BWD", "1")): + qkv_c = qkv.view(-1, qkv.shape[-3] * qkv.shape[-2] * qkv.shape[-1]) + qkv = cast_from_fp8(qkv_c._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[qkv.dtype]).view(qkv.shape) + out_save = cast_from_fp8( + out_fp8.view(-1, out_fp8.shape[-2] * out_fp8.shape[-1]), + fp8_meta["scaling_fwd"], META_O, + fp8_dtype_forward, qkv_dtype).view(out_fp8.shape) + fp8_tensors = (qkv_fp8, out_fp8, + fp8_meta["scaling_fwd"].scale.clone(), + fp8_meta["scaling_fwd"].scale_inv.clone()) + else: + if _NVTE_DEBUG: + print('[DotProductAttention]: using non-FP8 forward') + out_ret, aux_ctx_tensors = fused_attn_fwd_qkvpacked( + is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, + fused_attention_backend, attn_bias, + None, None, None, 
None, None, None, + attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen) + fp8_tensors = (None, None, None, None) + out_save = out_ret + + ctx.fp8 = fp8 and int(os.getenv("NVTE_FP8_DPA_BWD", "1")) + qkvo_tensors = (qkv, out_save) if not ctx.fp8 else (None, None) + ctx.save_for_backward(*qkvo_tensors, cu_seqlens, *fp8_tensors) + ctx.fp8_meta = fp8_meta + ctx.tp_size = tp_size + ctx.tp_group = tp_group ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen = max_seqlen ctx.qkv_dtype = qkv_dtype @@ -1874,15 +2022,23 @@ def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, ctx.qkv_layout = qkv_layout ctx.attn_bias_type = attn_bias_type ctx.attn_mask_type = attn_mask_type - ctx.fused_attention_backend = fused_attention_backend + ctx.fused_attention_backend = \ + fused_attention_backend if ctx.fp8 else FusedAttnBackend["F16_arbitrary_seqlen"] ctx.use_FAv2_bwd = use_FAv2_bwd - return out + return out_ret @staticmethod def backward(ctx, d_out): + if ctx.fp8_meta["recipe"].fp8_mha: + assert (isinstance(d_out, Float8Tensor) + ), "Gradient of the DPA output must be in Float8Tensor type for FP8 MHA." 
+ d_out_f8tensor = d_out + d_out = d_out._data + d_out = d_out.contiguous() - qkv, out, cu_seqlens = ctx.saved_tensors + (qkv, out, cu_seqlens, + qkv_fp8, out_fp8, fwd_scales, fwd_scale_invs) = ctx.saved_tensors if not ctx.aux_ctx_tensors[0].is_contiguous(): ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous() if ctx.use_FAv2_bwd: @@ -1899,13 +2055,65 @@ def backward(ctx, d_out): ) dqkv = dqkv[..., :d_out.shape[-1]] else: - dqkv, *rest = fused_attn_bwd_qkvpacked( - ctx.max_seqlen, cu_seqlens, qkv, out, d_out, - ctx.qkv_dtype, ctx.aux_ctx_tensors, - ctx.fused_attention_backend, - None, None, None, None, None, None, None, None, None, - ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, - ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + with torch.cuda.nvtx.range("_FusedAttn_qkvpacked"): + if ctx.fp8: + if _NVTE_DEBUG: + print('[DotProductAttention]: using FP8 backward') + fp8_dtype_forward = get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=True) + fp8_dtype_backward = get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=False) + if ctx.fp8_meta["recipe"].fp8_mha: + d_out_fp8 = d_out + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO] = d_out_f8tensor._scale_inv + else: + d_out_fp8 = cast_to_fp8( + d_out.view(-1, d_out.shape[-2] * d_out.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DO, fp8_dtype_backward + ).view(d_out.shape) + dqkv_fp8, *rest = fused_attn_bwd_qkvpacked( + ctx.max_seqlen, cu_seqlens, + qkv_fp8, out_fp8, d_out_fp8, + fp8_dtype_forward, fp8_dtype_backward, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + fwd_scale_invs[META_QKV], # d_scale_qkv, + fwd_scale_invs[META_S], # d_scale_s, + fwd_scale_invs[META_O], # d_scale_o, + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO], # d_scale_do + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DP], # d_scale_dp + fwd_scales[META_S], # q_scale_s + ctx.fp8_meta['scaling_bwd'].scale[META_DP], # q_scale_dp + ctx.fp8_meta['scaling_bwd'].scale[META_DQKV], # q_scale_dqkv + 
ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DP], # amax_dp + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DQKV], # amax_dqkv + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + if ctx.fp8_meta["recipe"].fp8_mha: + dqkv = Float8Tensor(data=dqkv_fp8, + fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=META_DQKV, + fp8_dtype=fp8_dtype_backward, + dtype=d_out_f8tensor.dtype, + ) + else: + dqkv_c_fp8 = dqkv_fp8.view(-1, + dqkv_fp8.shape[-3] * dqkv_fp8.shape[-2] * dqkv_fp8.shape[-1]) + dqkv = cast_from_fp8(dqkv_c_fp8, + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dqkv_fp8.shape) + else: + if _NVTE_DEBUG: + print('[DotProductAttention]: using non-FP8 backward') + if d_out.dtype == torch.uint8: + d_out = d_out_f8tensor.from_float8(qkv.dtype) + dqkv, *rest = fused_attn_bwd_qkvpacked( + ctx.max_seqlen, cu_seqlens, qkv, out, d_out, + ctx.qkv_dtype, ctx.qkv_dtype, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + None, None, None, None, None, None, None, None, None, None, + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) # if no_bias or alibi, return dqkv if ctx.attn_bias_type in ["no_bias", "alibi"]: @@ -1924,16 +2132,90 @@ class FusedAttnFunc_kvpacked(torch.autograd.Function): @staticmethod def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, q, kv, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, - qkv_layout, attn_bias_type, attn_mask_type, - rng_gen, fused_attention_backend, use_FAv2_bwd): - out, aux_ctx_tensors = fused_attn_fwd_kvpacked( - is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, - q, kv, qkv_dtype, fused_attention_backend, attn_bias, - None, None, None, None, None, - attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, - rng_gen) - - ctx.save_for_backward(q, kv, out, 
cu_seqlens_q, cu_seqlens_kv) + qkv_layout, attn_bias_type, attn_mask_type, rng_gen, fused_attention_backend, + use_FAv2_bwd, fp8, fp8_meta, tp_size, tp_group): + if fp8: + if _NVTE_DEBUG: + print('[DotProductAttention]: using FP8 forward') + if fp8_meta["recipe"].fp8_mha: + assert (isinstance(q, Float8Tensor) + and isinstance(kv, Float8Tensor)), "q/kv must be Float8Tensors for FP8 MHA." + fp8_meta["scaling_fwd"].scale_inv[META_QKV] = q._scale_inv + fused_attention_backend = FusedAttnBackend["FP8"] + fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + if fp8_meta["recipe"].fp8_mha: + q_fp8, kv_fp8 = q._data, kv._data + else: + # 1: qkv packed, 2: kv packed, 3: qkv separate + qkv_group = len(qkv_layout.split('_')) + assert (qkv_group == 2 + ), f"qkv layout should conform to hd_2hd or hd_h2d, e.g. sbhd_sb2hd, \ + but found {qkv_layout}." + q_fp8 = cast_to_fp8(q, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(q.shape) + kv_c = kv.view(-1, kv.shape[-3] * kv.shape[-2] * kv.shape[-1]) + kv_fp8 = cast_to_fp8(kv_c, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(kv.shape) + out_fp8, aux_ctx_tensors = fused_attn_fwd_kvpacked( + is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q_fp8, kv_fp8, fp8_dtype_forward, fused_attention_backend, attn_bias, + fp8_meta["scaling_fwd"].scale_inv[META_QKV], + fp8_meta["scaling_fwd"].scale_inv[META_S], + fp8_meta["scaling_fwd"].scale[META_S], + fp8_meta["scaling_fwd"].scale[META_O], + fp8_meta["scaling_fwd"].amax_history[0][META_S], + fp8_meta["scaling_fwd"].amax_history[0][META_O], + attn_scale, dropout_p, fast_zero_fill, qkv_layout, + attn_bias_type, attn_mask_type, rng_gen) + if fp8_meta["recipe"].fp8_mha: + out_ret = Float8Tensor(data=out_fp8, + fp8_meta=fp8_meta, + fp8_meta_forward=True, + fp8_meta_index=META_O, + fp8_dtype=fp8_dtype_forward, + dtype=q.dtype, + ) + else: + out_ret = cast_from_fp8( + out_fp8.view(-1, out_fp8.shape[-2] * out_fp8.shape[-1]), + 
fp8_meta["scaling_fwd"], META_O, + fp8_dtype_forward, qkv_dtype).view(out_fp8.shape) + out_save = out_ret + if fp8_meta["recipe"].fp8_mha and not int(os.getenv("NVTE_FP8_DPA_BWD", "1")): + q = cast_from_fp8(q._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[q.dtype]).view(q.shape) + kv_c = kv.view(-1, kv.shape[-3] * kv.shape[-2] * kv.shape[-1]) + kv = cast_from_fp8(kv_c._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[kv.dtype]).view(kv.shape) + out_save = cast_from_fp8( + out_fp8.view(-1, out_fp8.shape[-2] * out_fp8.shape[-1]), + fp8_meta["scaling_fwd"], META_O, + fp8_dtype_forward, qkv_dtype).view(out_fp8.shape) + fp8_tensors = (q_fp8, kv_fp8, out_fp8, + fp8_meta["scaling_fwd"].scale.clone(), + fp8_meta["scaling_fwd"].scale_inv.clone()) + else: + if _NVTE_DEBUG: + print('[DotProductAttention]: using non-FP8 forward') + out_ret, aux_ctx_tensors = fused_attn_fwd_kvpacked( + is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, kv, qkv_dtype, fused_attention_backend, attn_bias, + None, None, None, None, None, None, + attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen) + out_save = out_ret + fp8_tensors = (None, None, None, None, None) + + ctx.fp8 = fp8 and int(os.getenv("NVTE_FP8_DPA_BWD", "1")) + qkvo_tensors = (q, kv, out_save) if not ctx.fp8 else (None, None, None) + ctx.save_for_backward(*qkvo_tensors, cu_seqlens_q, cu_seqlens_kv, *fp8_tensors) + ctx.fp8_meta = fp8_meta + ctx.tp_size = tp_size + ctx.tp_group = tp_group ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen_q = max_seqlen_q ctx.max_seqlen_kv = max_seqlen_kv @@ -1944,15 +2226,23 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql ctx.qkv_layout = qkv_layout ctx.attn_bias_type = attn_bias_type ctx.attn_mask_type = attn_mask_type - ctx.fused_attention_backend = fused_attention_backend + ctx.fused_attention_backend = \ + fused_attention_backend if ctx.fp8 
else FusedAttnBackend["F16_arbitrary_seqlen"] ctx.use_FAv2_bwd = use_FAv2_bwd - return out + return out_ret @staticmethod def backward(ctx, d_out): + if ctx.fp8_meta["recipe"].fp8_mha: + assert (isinstance(d_out, Float8Tensor) + ), "Gradient of the DPA output must be in Float8Tensor type for FP8 MHA." + d_out_f8tensor = d_out + d_out = d_out._data + d_out = d_out.contiguous() - q, kv, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors + (q, kv, out, cu_seqlens_q, cu_seqlens_kv, + q_fp8, kv_fp8, out_fp8, fwd_scales, fwd_scale_invs) = ctx.saved_tensors if not ctx.aux_ctx_tensors[0].is_contiguous(): ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous() if ctx.use_FAv2_bwd: @@ -1971,14 +2261,77 @@ def backward(ctx, d_out): dq = dq[..., :d_out.shape[-1]] dkv = dkv[..., :d_out.shape[-1]] else: - dq, dkv, *rest = fused_attn_bwd_kvpacked( - ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, - q, kv, out, d_out, - ctx.qkv_dtype, ctx.aux_ctx_tensors, - ctx.fused_attention_backend, - None, None, None, None, None, None, None, None, None, - ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, - ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + with torch.cuda.nvtx.range("_FusedAttn_kvpacked"): + if ctx.fp8: + if _NVTE_DEBUG: + print('[DotProductAttention]: using FP8 backward') + fp8_dtype_forward = get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=True) + fp8_dtype_backward = get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=False) + if ctx.fp8_meta["recipe"].fp8_mha: + d_out_fp8 = d_out + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO] = d_out_f8tensor._scale_inv + else: + d_out_fp8 = cast_to_fp8( + d_out.view(-1, d_out.shape[-2] * d_out.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DO, fp8_dtype_backward + ).view(d_out.shape) + dq_fp8, dkv_fp8, *rest = fused_attn_bwd_kvpacked( + ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q_fp8, kv_fp8, out_fp8, d_out_fp8, + fp8_dtype_forward, fp8_dtype_backward, 
ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + fwd_scale_invs[META_QKV], # d_scale_qkv, + fwd_scale_invs[META_S], # d_scale_s, + fwd_scale_invs[META_O], # d_scale_o, + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO], # d_scale_do + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DP], # d_scale_dp + fwd_scales[META_S], # q_scale_s + ctx.fp8_meta['scaling_bwd'].scale[META_DP], # q_scale_dp + ctx.fp8_meta['scaling_bwd'].scale[META_DQKV], # q_scale_dqkv + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DP], # amax_dp + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DQKV], # amax_dqkv + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + if ctx.fp8_meta["recipe"].fp8_mha: + dq = Float8Tensor(data=dq_fp8, + fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=META_DQKV, + fp8_dtype=fp8_dtype_backward, + dtype=d_out_f8tensor.dtype, + ) + dkv = Float8Tensor(data=dkv_fp8, + fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=META_DQKV, + fp8_dtype=fp8_dtype_backward, + dtype=d_out_f8tensor.dtype, + ) + else: + dq = cast_from_fp8( + dq_fp8.view(-1, dq_fp8.shape[-2] * dq_fp8.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dq_fp8.shape) + dkv_c_fp8 = dkv_fp8.view(-1, + dkv_fp8.shape[-3] * dkv_fp8.shape[-2] * dkv_fp8.shape[-1]) + dkv = cast_from_fp8(dkv_c_fp8, + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dkv_fp8.shape) + else: + if _NVTE_DEBUG: + print('[DotProductAttention]: using non-FP8 backward') + if d_out.dtype == torch.uint8: + d_out = d_out_f8tensor.from_float8(q.dtype) + dq, dkv, *rest = fused_attn_bwd_kvpacked( + ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, kv, out, d_out, + ctx.qkv_dtype, ctx.qkv_dtype, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + None, None, None, None, None, None, None, None, None, None, + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, 
+ ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) # if no_bias or alibi, return dqkv if ctx.attn_bias_type in ["no_bias", "alibi"]: @@ -1990,32 +2343,153 @@ def backward(ctx, d_out): None, None, None, None, None, None, None, None, None, None, None, None) - class FusedAttnFunc(torch.autograd.Function): """Function for FusedAttention with separate Q, K, V tensors""" @staticmethod def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, q, k, v, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, - qkv_layout, attn_bias_type, attn_mask_type, - rng_gen, fused_attention_backend, use_FAv2_bwd): - out, aux_ctx_tensors = fused_attn_fwd( - is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, - q, k, v, qkv_dtype, fused_attention_backend, attn_bias, - None, None, None, None, None, - attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, - rng_gen) + qkv_layout, attn_bias_type, attn_mask_type, rng_gen, fused_attention_backend, + use_FAv2_bwd, fp8, fp8_meta, tp_size, tp_group): + if fp8: + if _NVTE_DEBUG: + print('[DotProductAttention]: using FP8 forward') + fused_attention_backend = FusedAttnBackend["FP8"] + fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + if fp8_meta["recipe"].fp8_mha: + assert (isinstance(q, Float8Tensor) + and isinstance(k, Float8Tensor) + and isinstance(v, Float8Tensor)), "q/k/v must be Float8Tensors for FP8 MHA." 
+ fp8_meta["scaling_fwd"].scale_inv[META_QKV] = q._scale_inv + q_fp8, k_fp8, v_fp8 = q._data, k._data, v._data + else: + # 1: qkv packed, 2: kv packed, 3: qkv separate + qkv_group = len(qkv_layout.split('_')) + if qkv_group == 1: + dim = qkv_layout.find('3') + qkv = _combine_tensors([q,k,v], dim) + qkv_c = qkv.view(-1, qkv.shape[-3] * qkv.shape[-2] * qkv.shape[-1]) + qkv_fp8 = cast_to_fp8(qkv_c, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(qkv.shape) + q_fp8, k_fp8, v_fp8 = _SplitAlongDim.apply(qkv_fp8, dim, [1,1,1]) + q_fp8, k_fp8, v_fp8 = [x.squeeze(dim) for x in [q_fp8, k_fp8, v_fp8]] + if qkv_group == 2: + q_fp8 = cast_to_fp8(q, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(q.shape) + dim = qkv_layout.split('_')[1].find('2') + kv = _combine_tensors([k,v], dim) + kv_c = kv.view(-1, kv.shape[-3] * kv.shape[-2] * kv.shape[-1]) + kv_fp8 = cast_to_fp8(kv_c, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(kv.shape) + k_fp8, v_fp8 = _SplitAlongDim.apply(kv_fp8, dim, [1,1]) + k_fp8, v_fp8 = [x.squeeze(dim) for x in [k_fp8, v_fp8]] + if qkv_group == 3: + q_fp8 = cast_to_fp8(q, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(q.shape) + k_fp8 = cast_to_fp8(k, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(k.shape) + v_fp8 = cast_to_fp8(v, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward).view(v.shape) + out_fp8, aux_ctx_tensors = fused_attn_fwd( + is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q_fp8, k_fp8, v_fp8, fp8_dtype_forward, fused_attention_backend, attn_bias, + fp8_meta["scaling_fwd"].scale_inv[META_QKV], + fp8_meta["scaling_fwd"].scale_inv[META_S], + fp8_meta["scaling_fwd"].scale[META_S], + fp8_meta["scaling_fwd"].scale[META_O], + fp8_meta["scaling_fwd"].amax_history[0][META_S], + fp8_meta["scaling_fwd"].amax_history[0][META_O], + attn_scale, dropout_p, fast_zero_fill, qkv_layout, + attn_bias_type, attn_mask_type, rng_gen) + if 
fp8_meta["recipe"].fp8_mha: + out_ret = Float8Tensor(data=out_fp8, + fp8_meta=fp8_meta, + fp8_meta_forward=True, + fp8_meta_index=META_O, + fp8_dtype=fp8_dtype_forward, + dtype=q.dtype, + ) + else: + out_ret = cast_from_fp8( + out_fp8.view(-1, out_fp8.shape[-2] * out_fp8.shape[-1]), + fp8_meta["scaling_fwd"], META_O, + fp8_dtype_forward, qkv_dtype).view(out_fp8.shape) + out_save = out_ret + + if fp8_meta["recipe"].fp8_mha and not int(os.getenv("NVTE_FP8_DPA_BWD", "1")): + # 1: qkv packed, 2: kv packed, 3: qkv separate + qkv_group = len(qkv_layout.split('_')) + if qkv_group == 1: + dim = qkv_layout.find('3') + qkv = _combine_tensors([q,k,v], dim) + qkv_c = qkv.view(-1, qkv.shape[-3] * qkv.shape[-2] * qkv.shape[-1]) + qkv_no_fp8 = cast_from_fp8(qkv_c._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[qkv.dtype]).view(qkv.shape) + q, k, v = _SplitAlongDim.apply(qkv_no_fp8, dim, [1,1,1]) + q, k, v = [x.squeeze(dim) for x in [q, k, v]] + if qkv_group == 2: + q = cast_from_fp8(q._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[q.dtype]).view(q.shape) + dim = qkv_layout.split('_')[1].find('2') + kv = _combine_tensors([k,v], dim) + kv_c = kv.view(-1, kv.shape[-3] * kv.shape[-2] * kv.shape[-1]) + kv_no_fp8 = cast_from_fp8(kv_c._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[kv.dtype]).view(kv.shape) + k, v = _SplitAlongDim.apply(kv_no_fp8, dim, [1,1]) + k, v = [x.squeeze(dim) for x in [k, v]] + if qkv_group == 3: + q = cast_from_fp8(q._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[q.dtype]).view(q.shape) + k = cast_from_fp8(k._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[k.dtype]).view(k.shape) + v = cast_from_fp8(v._data, + fp8_meta["scaling_fwd"], + META_QKV, fp8_dtype_forward, TE_DType[v.dtype]).view(v.shape) + out_save = cast_from_fp8( + out_fp8.view(-1, out_fp8.shape[-2] * out_fp8.shape[-1]), + fp8_meta["scaling_fwd"], META_O, + 
fp8_dtype_forward, qkv_dtype).view(out_fp8.shape) + + fp8_tensors = (q_fp8, k_fp8, v_fp8, out_fp8, + fp8_meta["scaling_fwd"].scale.clone(), + fp8_meta["scaling_fwd"].scale_inv.clone()) + else: + if _NVTE_DEBUG: + print('[DotProductAttention]: using non-FP8 forward') + out_ret, aux_ctx_tensors = fused_attn_fwd( + is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, k, v, qkv_dtype, fused_attention_backend, attn_bias, + None, None, None, None, None, None, + attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, + rng_gen) + out_save = out_ret + fp8_tensors = (None, None, None, None, None, None) from .cpu_offload import CPUOffloadEnabled if CPUOffloadEnabled: - tensor_list = [q, k, v, out, cu_seqlens_q, cu_seqlens_kv] + tensor_list = [q, k, v, out_save, cu_seqlens_q, cu_seqlens_kv] qkv_layout = 'sbhd_sbhd_sbhd' for tensor in tensor_list: if tensor is not None: tensor.activation_offloading = True - - ctx.save_for_backward(q, k, v, out, cu_seqlens_q, cu_seqlens_kv) + ctx.fp8 = fp8 and int(os.getenv("NVTE_FP8_DPA_BWD", "1")) + qkvo_tensors = (q, k, v, out_save) if not ctx.fp8 else (None, None, None, None) + ctx.save_for_backward(*qkvo_tensors, cu_seqlens_q, cu_seqlens_kv, *fp8_tensors) + ctx.fp8_meta = fp8_meta + ctx.tp_size = tp_size + ctx.tp_group = tp_group ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen_q = max_seqlen_q ctx.max_seqlen_kv = max_seqlen_kv @@ -2026,15 +2500,23 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql ctx.qkv_layout = qkv_layout ctx.attn_bias_type = attn_bias_type ctx.attn_mask_type = attn_mask_type - ctx.fused_attention_backend = fused_attention_backend + ctx.fused_attention_backend = \ + fused_attention_backend if ctx.fp8 else FusedAttnBackend["F16_arbitrary_seqlen"] ctx.use_FAv2_bwd = use_FAv2_bwd - return out + return out_ret @staticmethod def backward(ctx, d_out): + if ctx.fp8_meta["recipe"].fp8_mha: + assert (isinstance(d_out, Float8Tensor) + ), 
"Gradient of the DPA output must be in Float8Tensor type for FP8 MHA." + d_out_f8tensor = d_out + d_out = d_out._data + d_out = d_out.contiguous() - q, k, v, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors + (q, k, v, out, cu_seqlens_q, cu_seqlens_kv, + q_fp8, k_fp8, v_fp8, out_fp8, fwd_scales, fwd_scale_invs) = ctx.saved_tensors if not ctx.aux_ctx_tensors[0].is_contiguous(): ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous() if ctx.use_FAv2_bwd: @@ -2055,14 +2537,112 @@ def backward(ctx, d_out): dk = dk[..., :d_out.shape[-1]] dv = dv[..., :d_out.shape[-1]] else: - dq, dk, dv, *rest = fused_attn_bwd( - ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, - q, k, v, out, d_out, - ctx.qkv_dtype, ctx.aux_ctx_tensors, - ctx.fused_attention_backend, - None, None, None, None, None, None, None, None, None, - ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, - ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + with torch.cuda.nvtx.range("_FusedAttn"): + if ctx.fp8: + if _NVTE_DEBUG: + print('[DotProductAttention]: using FP8 backward') + fp8_dtype_forward = get_fp8_te_dtype(ctx.fp8_meta["recipe"], fprop_tensor=True) + fp8_dtype_backward = get_fp8_te_dtype( + ctx.fp8_meta["recipe"], fprop_tensor=False) + if ctx.fp8_meta["recipe"].fp8_mha: + d_out_fp8 = d_out + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO] = d_out_f8tensor._scale_inv + else: + d_out_fp8 = cast_to_fp8( + d_out.view(-1, d_out.shape[-2] * d_out.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DO, fp8_dtype_backward + ).view(d_out.shape) + dq_fp8, dk_fp8, dv_fp8, *rest = fused_attn_bwd( + ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q_fp8, k_fp8, v_fp8, out_fp8, d_out_fp8, + fp8_dtype_forward, fp8_dtype_backward, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + fwd_scale_invs[META_QKV], # d_scale_qkv, + fwd_scale_invs[META_S], # d_scale_s, + fwd_scale_invs[META_O], # d_scale_o, + ctx.fp8_meta['scaling_bwd'].scale_inv[META_DO], # d_scale_do + 
ctx.fp8_meta['scaling_bwd'].scale_inv[META_DP], # d_scale_dp + fwd_scales[META_S], # q_scale_s + ctx.fp8_meta['scaling_bwd'].scale[META_DP], # q_scale_dp + ctx.fp8_meta['scaling_bwd'].scale[META_DQKV], # q_scale_dqkv + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DP], # amax_dp + ctx.fp8_meta['scaling_bwd'].amax_history[0][META_DQKV], # amax_dqkv + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) + if ctx.fp8_meta["recipe"].fp8_mha: + dq = Float8Tensor(data=dq_fp8, + fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=META_DQKV, + fp8_dtype=fp8_dtype_backward, + dtype=d_out_f8tensor.dtype, + ) + dk = Float8Tensor(data=dk_fp8, + fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=META_DQKV, + fp8_dtype=fp8_dtype_backward, + dtype=d_out_f8tensor.dtype, + ) + dv = Float8Tensor(data=dv_fp8, + fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=META_DQKV, + fp8_dtype=fp8_dtype_backward, + dtype=d_out_f8tensor.dtype, + ) + else: + qkv_group = len(ctx.qkv_layout.split('_')) + if qkv_group == 1: + dim = ctx.qkv_layout.find('3') + dqkv_fp8 = _combine_tensors([dq_fp8,dk_fp8,dv_fp8], dim) + dqkv_c_fp8 = dqkv_fp8.view(-1, + dqkv_fp8.shape[-3] * dqkv_fp8.shape[-2] * dqkv_fp8.shape[-1]) + dqkv = cast_from_fp8(dqkv_c_fp8, + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dqkv_fp8.shape) + dq, dk, dv = _SplitAlongDim.apply(dqkv, dim, [1,1,1]) + dq, dk, dv = [x.squeeze(dim) for x in [dq, dk, dv]] + if qkv_group == 2: + dq = cast_from_fp8( + dq_fp8.view(-1, dq_fp8.shape[-2] * dq_fp8.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dq_fp8.shape) + dim = ctx.qkv_layout.split('_')[1].find('2') + dkv_fp8 = _combine_tensors([dk_fp8,dv_fp8], dim) + dkv_c_fp8 = dkv_fp8.view(-1, + dkv_fp8.shape[-3] * dkv_fp8.shape[-2] * dkv_fp8.shape[-1]) + dkv = cast_from_fp8(dkv_c_fp8, + ctx.fp8_meta["scaling_bwd"], 
META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dkv_fp8.shape) + dk, dv = _SplitAlongDim.apply(dkv, dim, [1,1]) + dk, dv = [x.squeeze(dim) for x in [dk, dv]] + if qkv_group == 3: + dq = cast_from_fp8( + dq_fp8.view(-1, dq_fp8.shape[-2] * dq_fp8.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dq_fp8.shape) + dk = cast_from_fp8( + dk_fp8.view(-1, dk_fp8.shape[-2] * dk_fp8.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dk_fp8.shape) + dv = cast_from_fp8( + dv_fp8.view(-1, dv_fp8.shape[-2] * dv_fp8.shape[-1]), + ctx.fp8_meta["scaling_bwd"], META_DQKV, + fp8_dtype_backward, ctx.qkv_dtype).view(dv_fp8.shape) + else: + if _NVTE_DEBUG: + print('[DotProductAttention]: using non-FP8 backward') + if d_out.dtype == torch.uint8: + d_out = d_out_f8tensor.from_float8(q.dtype) + dq, dk, dv, *rest = fused_attn_bwd( + ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, + q, k, v, out, d_out, + ctx.qkv_dtype, ctx.qkv_dtype, ctx.aux_ctx_tensors, + ctx.fused_attention_backend, + None, None, None, None, None, None, None, None, None, None, + ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill, + ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type) # if no_bias or alibi, return dqkv if ctx.attn_bias_type in ["no_bias", "alibi"]: @@ -2075,7 +2655,7 @@ def backward(ctx, d_out): None, None, None, None, None, None) -class FusedAttention(torch.nn.Module): +class FusedAttention(TransformerEngineBaseModule): """Dot product attention, with multiple backends: 1. 
FusedAttnBackend["F16_max512_seqlen"] @@ -2111,6 +2691,8 @@ def __init__( attention_type: str = "self", layer_number: Optional[int] = None, deterministic: bool = False, + tp_size: int = 1, + tp_group: Optional[dist_group_type] = None, ) -> None: super().__init__() @@ -2137,6 +2719,15 @@ def __init__( if os.environ["NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT"] == "1": os.environ["CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"] = "-1" + self.tp_size = tp_size + self.tp_group = tp_group + + def get_fp8_weights_scratchpad( + self, + is_first_microbatch: Union[bool, None], + ) -> List[Float8Tensor]: + """Needs override.""" + @no_torch_dynamo() def forward( self, @@ -2158,6 +2749,7 @@ def forward( cp_group: Optional[dist_group_type] = None, cp_global_ranks: List[int] = None, cp_stream: torch.cuda.Stream = None, + is_first_microbatch: Optional[bool] = None, ) -> torch.Tensor: """fused attention fprop""" @@ -2165,9 +2757,9 @@ def forward( != tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend ), 'No fused attention backend supports this input combination!' assert ( - (query_layer.dtype in [torch.float16, torch.bfloat16]) - and (key_layer.dtype in [torch.float16, torch.bfloat16]) - and (value_layer.dtype in [torch.float16, torch.bfloat16]) + (query_layer.dtype in [torch.float16, torch.bfloat16, torch.uint8]) + and (key_layer.dtype in [torch.float16, torch.bfloat16, torch.uint8]) + and (value_layer.dtype in [torch.float16, torch.bfloat16, torch.uint8]) ), 'FusedAttention only supports FP16 and BF16 data types.' 
assert ( query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda @@ -2249,24 +2841,43 @@ def forward( if qkv_format == 'sbhd': output = output.transpose(0,1).contiguous() else: - with self.attention_dropout_ctx(): - output = FusedAttnFunc.apply( - self.training, - max_seqlen_q, max_seqlen_kv, - cu_seqlens_q, cu_seqlens_kv, - query_layer, key_layer, value_layer, - qkv_dtype, - core_attention_bias, - 1.0/self.norm_factor, - self.attention_dropout if self.training else 0.0, - fast_zero_fill, - qkv_layout, - core_attention_bias_type, - attn_mask_type, - None, # rng_gen - fused_attention_backend, - use_FAv2_bwd, - ) + with self.prepare_forward(query_layer, + is_first_microbatch, + num_gemms=3, + allow_non_contiguous=True) as query_layer: + with self.attention_dropout_ctx(): + forced_fp8_dpa = "" + if self.fp8_meta["recipe"].fp8_mha: + if not self.fp8_meta["recipe"].fp8_dpa: + self.fp8_meta["recipe"].fp8_dpa = True + forced_fp8_dpa = " (forced)" + if _NVTE_DEBUG: + print("[DotProductAttention]: " + f"""using fp8_recipe.fp8_mha={self.fp8_meta["recipe"].fp8_mha}, """ + f"""fp8_recipe.fp8_dpa={self.fp8_meta["recipe"].fp8_dpa}""" + f"""{forced_fp8_dpa} and """ + f"""NVTE_FP8_DPA_BWD={int(os.getenv("NVTE_FP8_DPA_BWD", "1"))}""") + output = FusedAttnFunc.apply( + self.training, + max_seqlen_q, max_seqlen_kv, + cu_seqlens_q, cu_seqlens_kv, + query_layer, key_layer, value_layer, + qkv_dtype, + core_attention_bias, + 1.0/self.norm_factor, + self.attention_dropout if self.training else 0.0, + fast_zero_fill, + qkv_layout, + core_attention_bias_type, + attn_mask_type, + None, # rng_gen + fused_attention_backend, + use_FAv2_bwd, + self.fp8 and self.fp8_meta["recipe"].fp8_dpa, + self.fp8_meta, + self.tp_size, + self.tp_group, + ) # ...hd -> ...(hd) return output.view(*output.shape[:-2], -1) @@ -2464,7 +3075,9 @@ def __init__( attention_type=attention_type, layer_number=layer_number, deterministic=self.deterministic, - **attn_kwargs) + **attn_kwargs, + tp_size=self.tp_size, 
+ tp_group=self.tp_group) self.unfused_attention = UnfusedDotProductAttention( norm_factor, **attn_kwargs, layer_number=layer_number) @@ -2533,6 +3146,7 @@ def forward( alibi_slopes: Optional[torch.Tensor] = None, fast_zero_fill: bool = True, inference_params: Optional[InferenceParams] = None, + is_first_microbatch: Optional[bool] = None, ) -> torch.Tensor: """ Dot Product Attention Layer. @@ -2636,6 +3250,19 @@ def forward( Adjustments of the sequence_len_offset should be done after a complete forward pass. If rotary positional embeddings (RoPE) are utilized, they must be prepared beforehand. Supports "sbhd" and "bshd" layouts, with the "sbhd" layout being more efficient. + is_first_microbatch : {True, False, None}, default = None + During training using either gradient accumulation or + pipeline parallelism a minibatch of data is further split + into microbatches. Between the microbatches of the same minibatch + the model weights are not updated. Setting this parameter indicates + whether the current microbatch is the first in a minibatch or not. 
+ When set, this parameter enables additional optimizations: + + * during FP8 training, it allows caching of the FP8 versions of + the weights + * it also allows skipping gradient accumulation during the + first microbatch (since it is the first gradient being + produced) """ assert ( @@ -2747,8 +3374,14 @@ def forward( ), """Sequence lengths indicated by cu_seqlens_kv must be no greater than the sequence dimention in 'key_layer' and 'value_layer'!""" - qkv_layout, query_layer, key_layer, value_layer = _get_qkv_layout( - query_layer, key_layer, value_layer, qkv_format = qkv_format) + if (isinstance(query_layer, Float8Tensor) + and isinstance(key_layer, Float8Tensor) + and isinstance(value_layer, Float8Tensor)): + qkv_layout, query_layer._data, key_layer._data, value_layer._data = _get_qkv_layout( + query_layer._data, key_layer._data, value_layer._data, qkv_format = qkv_format) + else: + qkv_layout, query_layer, key_layer, value_layer = _get_qkv_layout( + query_layer, key_layer, value_layer, qkv_format = qkv_format) # The priority for attention backends (subject to availability and clearing the filters) # is: FlashAttention > FusedAttention (cuDNN) > UnfusedDotProductAttention. @@ -2768,8 +3401,13 @@ def forward( if (query_layer.dtype not in [torch.bfloat16, torch.float16] or key_layer.dtype not in [torch.bfloat16, torch.float16] or value_layer.dtype not in [torch.bfloat16, torch.float16] + or any(isinstance(x, Float8Tensor) for x in [query_layer, key_layer, value_layer]) ): use_flash_attention = False + if (query_layer.dtype not in [torch.bfloat16, torch.float16] + or key_layer.dtype not in [torch.bfloat16, torch.float16] + or value_layer.dtype not in [torch.bfloat16, torch.float16] + ): use_fused_attention = False # Filter: Device and dimensions. 
@@ -2866,8 +3504,10 @@ def forward( if use_fused_attention: fused_attention_backend = tex.get_fused_attn_backend( - TE_DType[query_layer.dtype], - TE_DType[key_layer.dtype], + TE_DType[query_layer.dtype] + if not isinstance(query_layer, Float8Tensor) else query_layer._fp8_dtype, + TE_DType[key_layer.dtype] + if not isinstance(key_layer, Float8Tensor) else key_layer._fp8_dtype, QKVLayout[qkv_layout], AttnBiasType[fu_core_attention_bias_type], AttnMaskType[attn_mask_type], @@ -2880,7 +3520,9 @@ def forward( ) # DPA does not support FP8; for FP8, use cpp_extensions modules directly is_backend_avail = (fused_attention_backend in - [FusedAttnBackend["F16_max512_seqlen"], FusedAttnBackend["F16_arbitrary_seqlen"]]) + [FusedAttnBackend["F16_max512_seqlen"], + FusedAttnBackend["F16_arbitrary_seqlen"], + FusedAttnBackend["FP8"]]) use_fused_attention = ( \ use_fused_attention and is_backend_avail and \ (not context_parallel or \ @@ -2951,6 +3593,8 @@ def forward( qkv_layout=qkv_layout, cu_seqlens_q=cu_seqlens_q, cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, attn_mask_type=attn_mask_type, attention_mask=attention_mask, fused_attention_backend=fused_attention_backend, @@ -2960,8 +3604,7 @@ def forward( cp_group=self.cp_group, cp_global_ranks=self.cp_global_ranks, cp_stream=self.cp_stream, - max_seqlen_q=max_seqlen_q, - max_seqlen_kv=max_seqlen_kv) + is_first_microbatch=is_first_microbatch) return self.fused_attention( query_layer, key_layer, @@ -2969,6 +3612,8 @@ def forward( qkv_layout=qkv_layout, cu_seqlens_q=cu_seqlens_q, cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, attn_mask_type=attn_mask_type, attention_mask=attention_mask, fused_attention_backend=fused_attention_backend, @@ -2978,8 +3623,7 @@ def forward( cp_group=self.cp_group, cp_global_ranks=self.cp_global_ranks, cp_stream=self.cp_stream, - max_seqlen_q=max_seqlen_q, - max_seqlen_kv=max_seqlen_kv) + 
is_first_microbatch=is_first_microbatch) assert (not context_parallel), \ "Context parallelism is only implemented with Flash Attention and Fused Attention!" @@ -3553,6 +4197,7 @@ def forward( mixed_x_layer = self.qkv( hidden_states, is_first_microbatch=is_first_microbatch, + is_first_module_in_mha=True, # specific to FP8 MHA ) num_queries_per_key_value = (self.num_attention_heads_per_partition // @@ -3604,6 +4249,7 @@ def forward( mixed_kv_layer = self.key_value( encoder_output, is_first_microbatch=is_first_microbatch, + is_first_module_in_mha=True, # specific to FP8 MHA ) if self.qkv_weight_interleaved: @@ -3634,6 +4280,9 @@ def forward( key_layer, value_layer = torch.split( mixed_kv_layer, mixed_kv_layer.shape[split_dim] // 2, dim = split_dim, ) + key_layer, value_layer = (x.reshape( + x.size(0), x.size(1), -1, self.hidden_size_per_attention_head, + ) for x in (key_layer, value_layer)) # Attention head [sq, b, h] --> [sq, b, hp] if self.input_layernorm: @@ -3649,6 +4298,7 @@ def forward( query_layer = self.query_layer( hidden_states, is_first_microbatch=is_first_microbatch, + is_first_module_in_mha=True, # specific to FP8 MHA ) # [sq, b, hp] --> [sq, b, np, hn] @@ -3663,6 +4313,9 @@ def forward( # ====================================================== if rotary_pos_emb is not None: + assert (not isinstance(query_layer, Float8Tensor) + and not isinstance(key_layer, Float8Tensor) + ), "RoPE is not supported for Float8Tensors!" 
# duplicate the pos_emb for self attention if not isinstance(rotary_pos_emb, tuple): rotary_pos_emb = ((rotary_pos_emb,) * 2) diff --git a/transformer_engine/pytorch/cpp_extensions/fused_attn.py b/transformer_engine/pytorch/cpp_extensions/fused_attn.py index 0f9a88454f..574627ac5d 100644 --- a/transformer_engine/pytorch/cpp_extensions/fused_attn.py +++ b/transformer_engine/pytorch/cpp_extensions/fused_attn.py @@ -84,6 +84,7 @@ def fused_attn_fwd_qkvpacked( fused_attention_backend: tex.NVTE_Fused_Attn_Backend, attn_bias: torch.Tensor = None, d_scale_qkv: torch.Tensor = None, + d_scale_s: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_o: torch.Tensor = None, amax_s: torch.Tensor = None, @@ -119,6 +120,8 @@ def fused_attn_fwd_qkvpacked( shape [1, num_heads, max_seqlen, max_seqlen], same data type as qkv d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of QKV in FP8 computations + d_scale_s: torch.Tensor, default = None + input tensor for the dequantization of S in FP8 computations, S = Softmax(Q * K.T) q_scale_s: torch.Tensor, default = None input tensor for the quantization of S in FP8 computations, S = Softmax(Q * K.T) q_scale_o: torch.Tensor, default = None @@ -206,6 +209,8 @@ def fused_attn_fwd_qkvpacked( assert (d_scale_qkv is not None ), "d_scale_qkv is required as an input for FP8 fused attention." + assert (d_scale_s is not None + ), "q_scale_s is required as an input for FP8 fused attention." assert (q_scale_s is not None ), "q_scale_s is required as an input for FP8 fused attention." 
assert (q_scale_o is not None @@ -220,7 +225,7 @@ def fused_attn_fwd_qkvpacked( max_seqlen, is_training, attn_scale, dropout, fast_zero_fill, QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], cu_seqlens, qkv, qkv_dtype, - d_scale_qkv, q_scale_s, q_scale_o, amax_s, amax_o, attn_bias, + d_scale_qkv, d_scale_s, q_scale_s, q_scale_o, amax_s, amax_o, attn_bias, rng_gen, rng_elts_per_thread, ) @@ -235,12 +240,14 @@ def fused_attn_bwd_qkvpacked( o: torch.Tensor, d_o: torch.Tensor, qkv_dtype: tex.DType, + dqkv_dtype: tex.DType, aux_ctx_tensors: List[torch.Tensor], fused_attention_backend: tex.NVTE_Fused_Attn_Backend, d_scale_qkv: torch.Tensor = None, d_scale_s: torch.Tensor = None, d_scale_o: torch.Tensor = None, d_scale_do: torch.Tensor = None, + d_scale_dp: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_dp: torch.Tensor = None, q_scale_dqkv: torch.Tensor = None, @@ -272,6 +279,8 @@ def fused_attn_bwd_qkvpacked( same shape as Q, i.e. thd, sbhd or bshd (see `qkv_layout` for details) qkv_dtype: tex.DType data type of QKV; in tex.DType, not torch.dtype + dqkv_dtype: tex.DType + data type of dQKV; in tex.DType, not torch.dtype aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors of the forward pass when its is_training is True, e.g. aux_ctx_tensors = [M, ZInv, rng_state] @@ -285,6 +294,8 @@ def fused_attn_bwd_qkvpacked( input tensor for the dequantization of O in FP8 computations d_scale_do: torch.Tensor, default = None input tensor for the dequantization of dO in FP8 computations + d_scale_dp: torch.Tensor, default = None + input tensor for the dequantization of dP in FP8 computations q_scale_s: torch.Tensor, default = None input tensor for the quantization of S in FP8 computations q_scale_dp: torch.Tensor, default = None @@ -336,6 +347,7 @@ def fused_attn_bwd_qkvpacked( assert (d_scale_s is not None), "d_scale_s is required for FP8 fused attention." 
assert (d_scale_o is not None), "d_scale_o is required for FP8 fused attention." assert (d_scale_do is not None), "d_scale_do is required for FP8 fused attention." + assert (d_scale_dp is not None), "d_scale_dp is required for FP8 fused attention." assert (q_scale_s is not None), "q_scale_s is required for FP8 fused attention." assert (q_scale_dp is not None), "q_scale_dp is required for FP8 fused attention." assert (q_scale_dqkv is not None), "q_scale_dqkv is required for FP8 fused attention." @@ -348,8 +360,8 @@ def fused_attn_bwd_qkvpacked( output_tensors = tex.fused_attn_bwd_qkvpacked( max_seqlen, attn_scale, dropout, fast_zero_fill, QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], - cu_seqlens, qkv, o, d_o, qkv_dtype, aux_ctx_tensors, - d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, + cu_seqlens, qkv, o, d_o, qkv_dtype, dqkv_dtype, aux_ctx_tensors, + d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, d_scale_dp, q_scale_s, q_scale_dp, q_scale_dqkv, amax_dp, amax_dqkv, ) @@ -368,6 +380,7 @@ def fused_attn_fwd_kvpacked( fused_attention_backend: tex.NVTE_Fused_Attn_Backend, attn_bias: torch.Tensor = None, d_scale_qkv: torch.Tensor = None, + d_scale_s: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_o: torch.Tensor = None, amax_s: torch.Tensor = None, @@ -410,6 +423,8 @@ def fused_attn_fwd_kvpacked( shape [1, num_heads, max_seqlen_q, max_seqlen_kv], same data type as q and kv d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of QKV in FP8 computations + d_scale_s: torch.Tensor, default = None + input tensor for the dequantization of S in FP8 computations, S = Softmax(Q * K.T) q_scale_s: torch.Tensor, default = None input tensor for the quantization of S in FP8 computations, S = Softmax(Q * K.T) q_scale_o: torch.Tensor, default = None @@ -496,12 +511,25 @@ def fused_attn_fwd_kvpacked( rng_elts_per_thread = (max_seqlen_q * max_seqlen_q + BACKEND_F16m512_FP8_THREADS_PER_CTA - 
1)//BACKEND_F16m512_FP8_THREADS_PER_CTA + assert (d_scale_qkv is not None + ), "d_scale_qkv is required as an input for FP8 fused attention." + assert (d_scale_s is not None + ), "q_scale_s is required as an input for FP8 fused attention." + assert (q_scale_s is not None + ), "q_scale_s is required as an input for FP8 fused attention." + assert (q_scale_o is not None + ), "q_scale_o is required as an input for FP8 fused attention." + assert (amax_s is not None + ), "amax_s is required as an input for FP8 fused attention." + assert (amax_o is not None + ), "amax_o is required as an input for FP8 fused attention." + # execute kernel output_tensors = tex.fused_attn_fwd_kvpacked( max_seqlen_q, max_seqlen_kv, is_training, attn_scale, dropout, fast_zero_fill, QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], cu_seqlens_q, cu_seqlens_kv, q, kv, qkv_dtype, - d_scale_qkv, q_scale_s, q_scale_o, amax_s, amax_o, + d_scale_qkv, d_scale_s, q_scale_s, q_scale_o, amax_s, amax_o, attn_bias, rng_gen, rng_elts_per_thread, ) @@ -519,12 +547,14 @@ def fused_attn_bwd_kvpacked( o: torch.Tensor, d_o: torch.Tensor, qkv_dtype: tex.DType, + dqkv_dtype: tex.DType, aux_ctx_tensors: List[torch.Tensor], fused_attention_backend: tex.NVTE_Fused_Attn_Backend, d_scale_qkv: torch.Tensor = None, d_scale_s: torch.Tensor = None, d_scale_o: torch.Tensor = None, d_scale_do: torch.Tensor = None, + d_scale_dp: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_dp: torch.Tensor = None, q_scale_dqkv: torch.Tensor = None, @@ -562,7 +592,9 @@ def fused_attn_bwd_kvpacked( input tensor dO (gradient of O); same shape as Q, i.e. 
thd, sbhd or bshd (see `qkv_layout` for details) qkv_dtype: tex.DType - data type of QKV; in tex.DType, not torch.dtype + data type of Q and KV; in tex.DType, not torch.dtype + dqkv_dtype: tex.DType + data type of dQ and dKV; in tex.DType, not torch.dtype aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors of the forward pass when its is_training is True, e.g. aux_ctx_tensors = [M, ZInv, rng_state] @@ -576,6 +608,8 @@ def fused_attn_bwd_kvpacked( input tensor for the dequantization of O in FP8 computations d_scale_do: torch.Tensor, default = None input tensor for the dequantization of dO in FP8 computations + d_scale_dp: torch.Tensor, default = None + input tensor for the dequantization of dP in FP8 computations q_scale_s: torch.Tensor, default = None input tensor for the quantization of S in FP8 computations q_scale_dp: torch.Tensor, default = None @@ -631,6 +665,7 @@ def fused_attn_bwd_kvpacked( assert (d_scale_s is not None), "d_scale_s is required for FP8 fused attention." assert (d_scale_o is not None), "d_scale_o is required for FP8 fused attention." assert (d_scale_do is not None), "d_scale_do is required for FP8 fused attention." + assert (d_scale_dp is not None), "d_scale_dp is required for FP8 fused attention." assert (q_scale_s is not None), "q_scale_s is required for FP8 fused attention." assert (q_scale_dp is not None), "q_scale_dp is required for FP8 fused attention." assert (q_scale_dqkv is not None), "q_scale_dqkv is required for FP8 fused attention." 
@@ -643,8 +678,8 @@ def fused_attn_bwd_kvpacked( output_tensors = tex.fused_attn_bwd_kvpacked( max_seqlen_q, max_seqlen_kv, attn_scale, dropout, fast_zero_fill, QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], - cu_seqlens_q, cu_seqlens_kv, q, kv, o, d_o, qkv_dtype, aux_ctx_tensors, - d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, + cu_seqlens_q, cu_seqlens_kv, q, kv, o, d_o, qkv_dtype, dqkv_dtype, aux_ctx_tensors, + d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, d_scale_dp, q_scale_s, q_scale_dp, q_scale_dqkv, amax_dp, amax_dqkv, ) @@ -664,6 +699,7 @@ def fused_attn_fwd( fused_attention_backend: tex.NVTE_Fused_Attn_Backend, attn_bias: torch.Tensor = None, d_scale_qkv: torch.Tensor = None, + d_scale_s: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_o: torch.Tensor = None, amax_s: torch.Tensor = None, @@ -710,6 +746,8 @@ def fused_attn_fwd( shape [1, num_heads, max_seqlen_q, max_seqlen_kv], same data type as q, k and v d_scale_qkv: torch.Tensor, default = None input tensor for the dequantization of Q, K and V in FP8 computations + d_scale_s: torch.Tensor, default = None + input tensor for the dequantization of S in FP8 computations, S = Softmax(Q * K.T) q_scale_s: torch.Tensor, default = None input tensor for the quantization of S in FP8 computations, S = Softmax(Q * K.T) q_scale_o: torch.Tensor, default = None @@ -798,12 +836,25 @@ def fused_attn_fwd( rng_elts_per_thread = (max_seqlen_q * max_seqlen_q + BACKEND_F16m512_FP8_THREADS_PER_CTA - 1)//BACKEND_F16m512_FP8_THREADS_PER_CTA + assert (d_scale_qkv is not None + ), "d_scale_qkv is required as an input for FP8 fused attention." + assert (d_scale_s is not None + ), "q_scale_s is required as an input for FP8 fused attention." + assert (q_scale_s is not None + ), "q_scale_s is required as an input for FP8 fused attention." + assert (q_scale_o is not None + ), "q_scale_o is required as an input for FP8 fused attention." 
+ assert (amax_s is not None + ), "amax_s is required as an input for FP8 fused attention." + assert (amax_o is not None + ), "amax_o is required as an input for FP8 fused attention." + # execute kernel output_tensors = tex.fused_attn_fwd( max_seqlen_q, max_seqlen_kv, is_training, attn_scale, dropout, fast_zero_fill, QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], cu_seqlens_q, cu_seqlens_kv, q, k, v, qkv_dtype, - d_scale_qkv, q_scale_s, q_scale_o, amax_s, amax_o, + d_scale_qkv, d_scale_s, q_scale_s, q_scale_o, amax_s, amax_o, attn_bias, rng_gen, rng_elts_per_thread, ) @@ -822,12 +873,14 @@ def fused_attn_bwd( o: torch.Tensor, d_o: torch.Tensor, qkv_dtype: tex.DType, + dqkv_dtype: tex.DType, aux_ctx_tensors: List[torch.Tensor], fused_attention_backend: tex.NVTE_Fused_Attn_Backend, d_scale_qkv: torch.Tensor = None, d_scale_s: torch.Tensor = None, d_scale_o: torch.Tensor = None, d_scale_do: torch.Tensor = None, + d_scale_dp: torch.Tensor = None, q_scale_s: torch.Tensor = None, q_scale_dp: torch.Tensor = None, q_scale_dqkv: torch.Tensor = None, @@ -869,6 +922,8 @@ def fused_attn_bwd( same shape as Q qkv_dtype: tex.DType data type of Q, K and V; in tex.DType, not torch.dtype + dqkv_dtype: tex.DType + data type of dQ, dK and dV; in tex.DType, not torch.dtype aux_ctx_tensors: List[torch.Tensor] auxiliary output tensors of the forward pass when its is_training is True, e.g. 
aux_ctx_tensors = [M, ZInv, rng_state] @@ -882,6 +937,8 @@ def fused_attn_bwd( input tensor for the dequantization of O in FP8 computations d_scale_do: torch.Tensor, default = None input tensor for the dequantization of dO in FP8 computations + d_scale_dp: torch.Tensor, default = None + input tensor for the dequantization of dP in FP8 computations q_scale_s: torch.Tensor, default = None input tensor for the quantization of S in FP8 computations q_scale_dp: torch.Tensor, default = None @@ -941,6 +998,7 @@ def fused_attn_bwd( assert (d_scale_s is not None), "d_scale_s is required for FP8 fused attention." assert (d_scale_o is not None), "d_scale_o is required for FP8 fused attention." assert (d_scale_do is not None), "d_scale_do is required for FP8 fused attention." + assert (d_scale_dp is not None), "d_scale_dp is required for FP8 fused attention." assert (q_scale_s is not None), "q_scale_s is required for FP8 fused attention." assert (q_scale_dp is not None), "q_scale_dp is required for FP8 fused attention." assert (q_scale_dqkv is not None), "q_scale_dqkv is required for FP8 fused attention." 
@@ -953,8 +1011,8 @@ def fused_attn_bwd( output_tensors = tex.fused_attn_bwd( max_seqlen_q, max_seqlen_kv, attn_scale, dropout, fast_zero_fill, QKVLayout[qkv_layout], AttnBiasType[attn_bias_type], AttnMaskType[attn_mask_type], - cu_seqlens_q, cu_seqlens_kv, q, k, v, o, d_o, qkv_dtype, aux_ctx_tensors, - d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, + cu_seqlens_q, cu_seqlens_kv, q, k, v, o, d_o, qkv_dtype, dqkv_dtype, aux_ctx_tensors, + d_scale_qkv, d_scale_s, d_scale_o, d_scale_do, d_scale_dp, q_scale_s, q_scale_dp, q_scale_dqkv, amax_dp, amax_dqkv, ) diff --git a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h index 3c039b9a88..dfbcfe3e8a 100644 --- a/transformer_engine/pytorch/csrc/comm_gemm_overlap.h +++ b/transformer_engine/pytorch/csrc/comm_gemm_overlap.h @@ -786,9 +786,7 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder, UbufBase { // Get communication and GEMM output chunk sizes const int comm_bytes = _ubufs[0].numel() * _ubufs[0].element_size(); const bool do_gelu = pre_gelu_out.numel() > 0; - const int output_chunk_bytes = (do_gelu - ? (n_chunk * m) * D.element_size() - : (n_chunk * m) * HALF_BYTES); + const int output_chunk_bytes = (n_chunk * m) * D.element_size(); const int aux_chunk_bytes = do_gelu ? 
(n_chunk * m) * pre_gelu_out.element_size() : 0; // Get output and workspace data pointers diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index bf0bb576ec..abbecb1609 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -32,6 +32,7 @@ std::vector fused_attn_fwd_qkvpacked( const at::Tensor QKV, const transformer_engine::DType qkv_type, const c10::optional descale_QKV, + const c10::optional descale_S, const c10::optional scale_S, const c10::optional scale_O, c10::optional amax_S, @@ -51,11 +52,13 @@ std::vector fused_attn_bwd_qkvpacked( const at::Tensor O, const at::Tensor dO, const transformer_engine::DType qkv_type, + const transformer_engine::DType dqkv_type, const std::vector Aux_CTX_Tensors, const c10::optional descale_QKV, const c10::optional descale_S, const c10::optional descale_O, const c10::optional descale_dO, + const c10::optional descale_dP, const c10::optional scale_S, const c10::optional scale_dP, const c10::optional scale_dQKV, @@ -74,6 +77,7 @@ std::vector fused_attn_fwd_kvpacked( const at::Tensor KV, const transformer_engine::DType qkv_type, const c10::optional descale_QKV, + const c10::optional descale_S, const c10::optional scale_S, const c10::optional scale_O, c10::optional amax_S, @@ -95,11 +99,13 @@ std::vector fused_attn_bwd_kvpacked( const at::Tensor O, const at::Tensor dO, const transformer_engine::DType qkv_type, + const transformer_engine::DType dqkv_type, const std::vector Aux_CTX_Tensors, const c10::optional descale_QKV, const c10::optional descale_S, const c10::optional descale_O, const c10::optional descale_dO, + const c10::optional descale_dP, const c10::optional scale_S, const c10::optional scale_dP, const c10::optional scale_dQKV, @@ -119,6 +125,7 @@ std::vector fused_attn_fwd( const at::Tensor V, const transformer_engine::DType qkv_type, const c10::optional descale_QKV, + const c10::optional descale_S, const 
c10::optional scale_S, const c10::optional scale_O, c10::optional amax_S, @@ -141,11 +148,13 @@ std::vector fused_attn_bwd( const at::Tensor O, const at::Tensor dO, const transformer_engine::DType qkv_type, + const transformer_engine::DType dqkv_type, const std::vector Aux_CTX_Tensors, const c10::optional descale_QKV, const c10::optional descale_S, const c10::optional descale_O, const c10::optional descale_dO, + const c10::optional descale_dP, const c10::optional scale_S, const c10::optional scale_dP, const c10::optional scale_dQKV, diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cu b/transformer_engine/pytorch/csrc/extensions/attention.cu index 0a84ea3089..cc747655c4 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cu +++ b/transformer_engine/pytorch/csrc/extensions/attention.cu @@ -97,6 +97,7 @@ std::vector fused_attn_fwd_qkvpacked( const at::Tensor QKV, const transformer_engine::DType qkv_type, const c10::optional descale_QKV, + const c10::optional descale_S, const c10::optional scale_S, const c10::optional scale_O, c10::optional amax_S, @@ -126,22 +127,24 @@ std::vector fused_attn_fwd_qkvpacked( // FP8 auto h = q_shape[q_shape.size() - 2]; auto d = q_shape[q_shape.size() - 1]; - if (set_zero && ((h * d) % block_size == 0)) { + if (set_zero + && ((h * d) % block_size == 0) + && (nvte_get_qkv_format(qkv_layout) == NVTE_QKV_Format::NVTE_THD)) { mha_fill(O, cu_seqlens.index({torch::indexing::Slice(-1, torch::indexing::None)})); } else { O.fill_(0); } - if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) - || (!amax_S.has_value()) || (!amax_O.has_value())) { - std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, scale_S, scale_O, amax_S and amax_O "; 
NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); } te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), qkv_shape, qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - at::Tensor descale_S = torch::empty_like(scale_S.value()); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, amax_S.value().data_ptr(), - scale_S.value().data_ptr(), descale_S.data_ptr()); + scale_S.value().data_ptr(), descale_S.value().data_ptr()); te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { @@ -261,11 +264,13 @@ std::vector fused_attn_bwd_qkvpacked( const at::Tensor O, const at::Tensor dO, const transformer_engine::DType qkv_type, + const transformer_engine::DType dqkv_type, const std::vector Aux_CTX_Tensors, const c10::optional descale_QKV, const c10::optional descale_S, const c10::optional descale_O, const c10::optional descale_dO, + const c10::optional descale_dP, const c10::optional scale_S, const c10::optional scale_dP, const c10::optional scale_dQKV, @@ -284,26 +289,29 @@ std::vector fused_attn_bwd_qkvpacked( auto h = q_shape[q_shape.size() - 2]; // create output tensor dQKV - at::Tensor dQKV = torch::empty_like(QKV); - auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto options = torch::TensorOptions().dtype(GetATenDType(dqkv_type)).device(torch::kCUDA); + at::Tensor dQKV = torch::empty_like(QKV, options); // construct NVTE tensors TensorWrapper te_QKV, te_O, te_dO, te_S, te_dP, te_dQKV; if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { // FP8 auto d = q_shape[q_shape.size() - 1]; - if (set_zero && ((h * d) % block_size == 0)) { + if (set_zero + && ((h * d) % block_size == 0) + && (nvte_get_qkv_format(qkv_layout) == NVTE_QKV_Format::NVTE_THD)) { mha_fill(dQKV, cu_seqlens.index({torch::indexing::Slice(-1, 
torch::indexing::None)})); } else { dQKV.fill_(0); } if ((!descale_QKV.has_value()) || (!descale_S.has_value()) - || (!descale_O.has_value()) || (!descale_dO.has_value()) - || (!scale_S.has_value()) || (!scale_dP.has_value()) - || (!scale_dQKV.has_value()) - || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { - std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; - err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!descale_dP.has_value()) || (!scale_S.has_value()) + || (!scale_dP.has_value()) || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, descale_dO, descale_dP, "; + err_tensors = err_tensors + std::string("scale_S, scale_dP, scale_dQKV, "); + err_tensors = err_tensors + std::string("amax_dP and amax_dQKV "); NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); } te_QKV = makeTransformerEngineTensor(QKV.data_ptr(), qkv_shape, @@ -311,14 +319,13 @@ std::vector fused_attn_bwd_qkvpacked( te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + dqkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, scale_S.value().data_ptr(), descale_S.value().data_ptr()); - at::Tensor descale_dP = torch::empty_like(scale_dP.value()); te_dP = makeTransformerEngineTensor(nullptr, {0}, - DType::kFloat32, amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), - descale_dP.data_ptr()); - te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), qkv_shape, qkv_type, + DType::kFloat32, amax_dP.value().data_ptr(), + scale_dP.value().data_ptr(), descale_dP.value().data_ptr()); + te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), qkv_shape, dqkv_type, amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { // BF16 or FP16 @@ -327,13 +334,13 @@ std::vector fused_attn_bwd_qkvpacked( te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, nullptr, nullptr, nullptr); te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, nullptr, nullptr); te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, nullptr, nullptr); te_dQKV = makeTransformerEngineTensor(dQKV.data_ptr(), qkv_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); } else { NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. 
\n"); } @@ -433,6 +440,7 @@ std::vector fused_attn_fwd_kvpacked( const at::Tensor KV, const transformer_engine::DType qkv_type, const c10::optional descale_QKV, + const c10::optional descale_S, const c10::optional scale_S, const c10::optional scale_O, c10::optional amax_S, @@ -458,24 +466,26 @@ std::vector fused_attn_fwd_kvpacked( // FP8 auto h = q_shape[q_shape.size() - 2]; auto d = q_shape[q_shape.size() - 1]; - if (set_zero && ((h * d) % block_size == 0)) { + if (set_zero + && ((h * d) % block_size == 0) + && (nvte_get_qkv_format(qkv_layout) == NVTE_QKV_Format::NVTE_THD)) { mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); } else { O.fill_(0); } - if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) - || (!amax_S.has_value()) || (!amax_O.has_value())) { - std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, scale_S, scale_O, amax_S and amax_O "; NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); } te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); te_KV = makeTransformerEngineTensor(KV.data_ptr(), kv_shape, qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - at::Tensor descale_S = torch::empty_like(scale_S.value()); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, amax_S.value().data_ptr(), - scale_S.value().data_ptr(), descale_S.data_ptr()); + scale_S.value().data_ptr(), descale_S.value().data_ptr()); te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { @@ -608,11 +618,13 @@ std::vector fused_attn_bwd_kvpacked( const at::Tensor O, const at::Tensor dO, const transformer_engine::DType qkv_type, + const transformer_engine::DType dqkv_type, const std::vector Aux_CTX_Tensors, const c10::optional descale_QKV, const c10::optional descale_S, const c10::optional descale_O, const c10::optional descale_dO, + const c10::optional descale_dP, const c10::optional scale_S, const c10::optional scale_dP, const c10::optional scale_dQKV, @@ -635,15 +647,18 @@ std::vector fused_attn_bwd_kvpacked( auto d = q_shape[q_shape.size() - 1]; // create output tensors dQ and dKV - at::Tensor dQ = torch::empty_like(Q); - at::Tensor dKV = torch::empty_like(KV); - auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto options = torch::TensorOptions().dtype(GetATenDType(dqkv_type)).device(torch::kCUDA); + at::Tensor dQ = torch::empty_like(Q, options); + at::Tensor dKV = torch::empty_like(KV, options); // construct NVTE tensors TensorWrapper te_Q, te_KV, te_O, te_dO, te_S, te_dP, te_dQ, te_dKV; if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) { // FP8 - if (set_zero && ((h_q * d)% block_size == 0) && ((h_kv * d)% block_size == 0)) { + if (set_zero + && ((h_q * d)% block_size 
== 0) + && ((h_kv * d)% block_size == 0) + && (nvte_get_qkv_format(qkv_layout) == NVTE_QKV_Format::NVTE_THD)) { mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); mha_fill(dKV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); } else { @@ -651,12 +666,13 @@ std::vector fused_attn_bwd_kvpacked( dKV.fill_(0); } if ((!descale_QKV.has_value()) || (!descale_S.has_value()) - || (!descale_O.has_value()) || (!descale_dO.has_value()) - || (!scale_S.has_value()) || (!scale_dP.has_value()) - || (!scale_dQKV.has_value()) - || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { - std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; - err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + || (!descale_O.has_value()) || (!descale_dO.has_value()) + || (!descale_dP.has_value()) || (!scale_S.has_value()) + || (!scale_dP.has_value()) || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, descale_dO, descale_dP, "; + err_tensors = err_tensors + std::string("scale_S, scale_dP, scale_dQKV, "); + err_tensors = err_tensors + std::string("amax_dP and amax_dQKV "); NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); } te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, @@ -666,16 +682,15 @@ std::vector fused_attn_bwd_kvpacked( te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + dqkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, scale_S.value().data_ptr(), descale_S.value().data_ptr()); - at::Tensor descale_dP = torch::empty_like(scale_dP.value()); te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), - descale_dP.data_ptr()); - te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, qkv_type, + descale_dP.value().data_ptr()); + te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, dqkv_type, amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); - te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), kv_shape, qkv_type, + te_dKV = makeTransformerEngineTensor(dKV.data_ptr(), kv_shape, dqkv_type, amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { // BF16 or FP16 @@ -686,15 +701,15 @@ std::vector fused_attn_bwd_kvpacked( te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, nullptr, nullptr, nullptr); te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, nullptr, nullptr); te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, nullptr, nullptr); te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); te_dKV = 
makeTransformerEngineTensor(dKV.data_ptr(), kv_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); } else { NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); } @@ -806,6 +821,7 @@ std::vector fused_attn_fwd( const at::Tensor V, const transformer_engine::DType qkv_type, const c10::optional descale_QKV, + const c10::optional descale_S, const c10::optional scale_S, const c10::optional scale_O, c10::optional amax_S, @@ -832,14 +848,17 @@ std::vector fused_attn_fwd( // FP8 auto h = q_shape[q_shape.size() - 2]; auto d = q_shape[q_shape.size() - 1]; - if (set_zero && ((h * d) % block_size == 0)) { + if (set_zero + && ((h * d) % block_size == 0) + && (nvte_get_qkv_format(qkv_layout) == NVTE_QKV_Format::NVTE_THD)) { mha_fill(O, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); } else { O.fill_(0); } - if ((!descale_QKV.has_value()) || (!scale_S.has_value()) || (!scale_O.has_value()) - || (!amax_S.has_value()) || (!amax_O.has_value())) { - std::string err_tensors = "descale_QKV, scale_S, scale_O, amax_S and amax_O"; + if ((!descale_QKV.has_value()) || (!descale_S.has_value()) + || (!scale_S.has_value()) || (!scale_O.has_value()) + || (!amax_S.has_value()) || (!amax_O.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, scale_S, scale_O, amax_S and amax_O "; NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. 
\n")); } te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, @@ -848,10 +867,9 @@ std::vector fused_attn_fwd( qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); te_V = makeTransformerEngineTensor(V.data_ptr(), v_shape, qkv_type, nullptr, nullptr, descale_QKV.value().data_ptr()); - at::Tensor descale_S = torch::empty_like(scale_S.value()); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, amax_S.value().data_ptr(), - scale_S.value().data_ptr(), descale_S.data_ptr()); + scale_S.value().data_ptr(), descale_S.value().data_ptr()); te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, amax_O.value().data_ptr(), scale_O.value().data_ptr(), nullptr); } else if (qkv_type == DType::kBFloat16 || qkv_type == DType::kFloat16) { @@ -990,11 +1008,13 @@ std::vector fused_attn_bwd( const at::Tensor O, const at::Tensor dO, const transformer_engine::DType qkv_type, + const transformer_engine::DType dqkv_type, const std::vector Aux_CTX_Tensors, const c10::optional descale_QKV, const c10::optional descale_S, const c10::optional descale_O, const c10::optional descale_dO, + const c10::optional descale_dP, const c10::optional scale_S, const c10::optional scale_dP, const c10::optional scale_dQKV, @@ -1011,7 +1031,7 @@ std::vector fused_attn_bwd( auto h_q = q_shape[q_shape.size() - 2]; auto h_kv = k_shape[k_shape.size() - 2]; auto d = q_shape[q_shape.size() - 1]; - auto options = torch::TensorOptions().dtype(GetATenDType(qkv_type)).device(torch::kCUDA); + auto options = torch::TensorOptions().dtype(GetATenDType(dqkv_type)).device(torch::kCUDA); at::Tensor dQ; at::Tensor dK; @@ -1046,7 +1066,7 @@ std::vector fused_attn_bwd( torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); break; case NVTE_QKV_Layout_Group::NVTE_HD_2HD: - dQ = torch::empty_like(Q); + dQ = torch::empty_like(Q, options); tmp_shape = std::vector{k_sizes.begin(), k_sizes.end()}; tmp_shape.insert(tmp_shape.begin() + tmp_shape.size() - 2, int64_t(2)); 
dKV = torch::empty(c10::IntArrayRef(tmp_shape), options); @@ -1058,7 +1078,7 @@ std::vector fused_attn_bwd( torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 3); break; case NVTE_QKV_Layout_Group::NVTE_HD_H2D: - dQ = torch::empty_like(Q); + dQ = torch::empty_like(Q, options); tmp_shape = std::vector{k_sizes.begin(), k_sizes.end()}; tmp_shape.insert(tmp_shape.begin() + tmp_shape.size() - 1, int64_t(2)); dKV = torch::empty(c10::IntArrayRef(tmp_shape), options); @@ -1068,9 +1088,9 @@ std::vector fused_attn_bwd( torch::indexing::Slice(0, torch::indexing::None, 1)}).squeeze(tmp_shape.size() - 2); break; case NVTE_QKV_Layout_Group::NVTE_HD_HD_HD: - dQ = torch::empty_like(Q); - dK = torch::empty_like(K); - dV = torch::empty_like(V); + dQ = torch::empty_like(Q, options); + dK = torch::empty_like(K, options); + dV = torch::empty_like(V, options); break; default: NVTE_ERROR("QKV layout not supported!"); @@ -1085,7 +1105,8 @@ std::vector fused_attn_bwd( && ((h_kv * d) % block_size == 0) && dQ.is_contiguous() && dK.is_contiguous() - && dV.is_contiguous()) { + && dV.is_contiguous() + && (nvte_get_qkv_format(qkv_layout) == NVTE_QKV_Format::NVTE_THD)) { mha_fill(dQ, cu_seqlens_q.index({torch::indexing::Slice(-1, torch::indexing::None)})); mha_fill(dK, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); mha_fill(dV, cu_seqlens_kv.index({torch::indexing::Slice(-1, torch::indexing::None)})); @@ -1095,12 +1116,13 @@ std::vector fused_attn_bwd( dV.fill_(0); } if ((!descale_QKV.has_value()) || (!descale_S.has_value()) - || (!descale_O.has_value()) || (!descale_dO.has_value()) - || (!scale_S.has_value()) || (!scale_dP.has_value()) - || (!scale_dQKV.has_value()) - || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { - std::string err_tensors = "descale_QKV, descale_S, descale_O, scale_S, scale_dP, "; - err_tensors = err_tensors + std::string("scale_dQKV, amax_dP and amax_dQKV"); + || (!descale_O.has_value()) || 
(!descale_dO.has_value()) + || (!descale_dP.has_value()) || (!scale_S.has_value()) + || (!scale_dP.has_value()) || (!scale_dQKV.has_value()) + || (!amax_dP.has_value()) || (!amax_dQKV.has_value())) { + std::string err_tensors = "descale_QKV, descale_S, descale_O, descale_dO, descale_dP, "; + err_tensors = err_tensors + std::string("scale_S, scale_dP, scale_dQKV, "); + err_tensors = err_tensors + std::string("amax_dP and amax_dQKV "); NVTE_ERROR(err_tensors + std::string("are required for FP8 operation. \n")); } te_Q = makeTransformerEngineTensor(Q.data_ptr(), q_shape, @@ -1112,18 +1134,17 @@ std::vector fused_attn_bwd( te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, nullptr, nullptr, descale_O.value().data_ptr()); te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); + dqkv_type, nullptr, nullptr, descale_dO.value().data_ptr()); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, scale_S.value().data_ptr(), descale_S.value().data_ptr()); - at::Tensor descale_dP = torch::empty_like(scale_dP.value()); te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, amax_dP.value().data_ptr(), scale_dP.value().data_ptr(), - descale_dP.data_ptr()); - te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, qkv_type, + descale_dP.value().data_ptr()); + te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, dqkv_type, amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); - te_dK = makeTransformerEngineTensor(dK.data_ptr(), k_shape, qkv_type, + te_dK = makeTransformerEngineTensor(dK.data_ptr(), k_shape, dqkv_type, amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); - te_dV = makeTransformerEngineTensor(dV.data_ptr(), v_shape, qkv_type, + te_dV = makeTransformerEngineTensor(dV.data_ptr(), v_shape, dqkv_type, amax_dQKV.value().data_ptr(), scale_dQKV.value().data_ptr(), nullptr); } else if (qkv_type == DType::kBFloat16 
|| qkv_type == DType::kFloat16) { // BF16 or FP16 @@ -1136,17 +1157,17 @@ std::vector fused_attn_bwd( te_O = makeTransformerEngineTensor(O.data_ptr(), q_shape, qkv_type, nullptr, nullptr, nullptr); te_dO = makeTransformerEngineTensor(dO.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); te_S = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, nullptr, nullptr); te_dP = makeTransformerEngineTensor(nullptr, {0}, DType::kFloat32, nullptr, nullptr, nullptr); te_dQ = makeTransformerEngineTensor(dQ.data_ptr(), q_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); te_dK = makeTransformerEngineTensor(dK.data_ptr(), k_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); te_dV = makeTransformerEngineTensor(dV.data_ptr(), v_shape, - qkv_type, nullptr, nullptr, nullptr); + dqkv_type, nullptr, nullptr, nullptr); } else { NVTE_ERROR("Fused attention only supports FP8 and BF16/FP16 data types. \n"); } diff --git a/transformer_engine/pytorch/float8_tensor.py b/transformer_engine/pytorch/float8_tensor.py index c4aebf1a8b..f93d6ae5cb 100644 --- a/transformer_engine/pytorch/float8_tensor.py +++ b/transformer_engine/pytorch/float8_tensor.py @@ -4,7 +4,7 @@ """Tensor class with FP8 data""" from __future__ import annotations -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple, Union import torch from torch.utils._pytree import tree_map @@ -233,6 +233,87 @@ def forward( def backward(ctx, grad): return grad.to(ctx.input_dtype), None +class _ViewFunc(torch.autograd.Function): + """View function + + View the Float8Tensor using the provided shape. 
+ + """ + + @staticmethod + def forward( + ctx, + tensor: torch.Tensor, + shape: Tuple[int] = None, + ) -> torch.Tensor: + + # Return input tensor if shape is not provided + ctx.shape = tensor.shape + if shape is None: + return tensor + + # Construct new tensor if shape is provided + if isinstance(tensor, Float8Tensor): + return Float8Tensor.make_like( + tensor, + data=tensor._data.view(*shape), + ) + return tensor.view(*shape) + + @staticmethod + def backward(ctx, + grad: torch.Tensor, + ) -> Tuple[Union[torch.Tensor, None], ...]: + + if isinstance(grad, Float8Tensor): + dgrad = Float8Tensor.make_like( + grad, + data=grad._data.view(ctx.shape), + ) + return dgrad, None + return grad.view(ctx.shape), None + + +class _ReshapeFunc(torch.autograd.Function): + """Reshape function + + Reshape the Float8Tensor using the provided shape. + + """ + + @staticmethod + def forward( + ctx, + tensor: torch.Tensor, + shape: Tuple[int] = None, + ) -> torch.Tensor: + + # Return input tensor if shape is not provided + ctx.shape = tensor.shape + if shape is None: + return tensor + + # Construct new tensor if shape is provided + if isinstance(tensor, Float8Tensor): + return Float8Tensor.make_like( + tensor, + data=tensor._data.reshape(*shape), + ) + return tensor.reshape(*shape) + + @staticmethod + def backward(ctx, + grad: torch.Tensor, + ) -> Tuple[Union[torch.Tensor, None], ...]: + + if isinstance(grad, Float8Tensor): + dgrad = Float8Tensor.make_like( + grad, + data=grad._data.reshape(ctx.shape), + ) + return dgrad, None + return grad.reshape(ctx.shape), None + class Float8Tensor(torch.Tensor): """Experimental tensor class with FP8 data @@ -453,6 +534,12 @@ def cpu(self) -> torch.Tensor: def clone(self) -> Float8Tensor: return _IdentityFunc.apply(self, {"data": self._data.detach().clone()}) + def view(self, *shape: Tuple[int]) -> Float8Tensor: + return _ViewFunc.apply(self, shape) + + def reshape(self, *shape: Tuple[int]) -> Float8Tensor: + return _ReshapeFunc.apply(self, shape) + 
def expand_as(self, other: torch.Tensor): if other is self: # Note: expand_as is hackily used to create dummy autograd nodes diff --git a/transformer_engine/pytorch/fp8.py b/transformer_engine/pytorch/fp8.py index d06443efb6..b871169a11 100644 --- a/transformer_engine/pytorch/fp8.py +++ b/transformer_engine/pytorch/fp8.py @@ -202,6 +202,11 @@ def add_fp8_tensors_to_global_buffer( # `fp8_param_to_autocast`. This is used for keeping track of FP8 weights # in an autocasted region and cross reference them in `float8_tensor.py` # to perform the forward amax reduction. + fp8_meta_tensor_key = cls.get_meta_tensor_key(forward=forward) + if fp8_meta_tensor_key not in fp8_meta: + # Handles non-parameter FP8 modules, e.g. DPA. + continue + if forward and fp8_weights is not None: autocast_key = cls.get_unique_autocast_key( fp8_meta["recipe"], fp8_meta["fp8_group"]) @@ -217,7 +222,6 @@ def add_fp8_tensors_to_global_buffer( key = cls.get_key_in_buffer( forward, fp8_weights is not None, fp8_meta["recipe"], fp8_meta["fp8_group"]) - fp8_meta_tensor_key = cls.get_meta_tensor_key(forward=forward) if key not in cls.global_amax_buffer: cls.global_amax_buffer[key] = [fp8_meta[fp8_meta_tensor_key].amax_history[0]] diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 3c5887d942..e0bf5efbbf 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -268,6 +268,9 @@ def adjust_amax_history_length(self, length: int, fwd: Optional[bool] = None) -> fp8_meta_tensor_keys = ("scaling_fwd" if fwd else "scaling_bwd",) for meta_key in fp8_meta_tensor_keys: + if meta_key not in self.fp8_meta: + # Handles non-parameter FP8 modules, e.g. DPA. 
+ continue curr_len = self.fp8_meta[meta_key].amax_history.shape[0] if length == curr_len: continue @@ -568,6 +571,7 @@ def prepare_forward( inp: torch.Tensor, is_first_microbatch: Union[bool, None], num_gemms: int = 1, + allow_non_contiguous: bool = False, ) -> Generator[torch.Tensor, None, None]: """Checks and prep for FWD. The context manager is needed because there isn't a way for a module to know @@ -610,7 +614,10 @@ def prepare_forward( FP8GlobalStateManager.copy_forward_fp8_meta_tensors_for_recompute(self.fp8_meta) with torch.cuda.nvtx.range(self.__class__.__name__ + " forward"): - yield inp.contiguous() + if not allow_non_contiguous: + yield inp.contiguous() + else: + yield inp if self.fp8 and in_fp8_activation_recompute_phase(): FP8GlobalStateManager.restore_fp8_meta_tensors(self.fp8_meta) @@ -645,8 +652,11 @@ def grad_output_preprocess( R4: bias gradient on R1. """ - grad_output = grad_output.contiguous() - grad_output_mat = grad_output.view((-1, grad_output.shape[-1])) + if isinstance(grad_output, Float8Tensor): + grad_output._data = grad_output._data.contiguous() + else: + grad_output = grad_output.contiguous() + grad_output_mat = grad_output.view(-1, grad_output.shape[-1]) gather_grad_output = row_parallel_mode and ctx.sequence_parallel # No-FP8 case: bgrad is fused with wgrad for this case. 
@@ -696,7 +706,10 @@ def grad_output_preprocess( grad_output_c = grad_output_mat if not ctx.ub_overlap_ag: grad_output_c, _ = gather_along_first_dim(grad_output_c, ctx.tp_group) - grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) + if not isinstance(grad_output_c, Float8Tensor): + grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) + else: + grad_output_t = grad_output_c.transpose_2d() else: grad_output_c = ctx.ub_obj_gradout.get_ubuf_output(1) grad_output_t = None @@ -705,28 +718,38 @@ def grad_output_preprocess( # FP8 case without gather: cast, transpose, bgrad fused if ctx.use_bias: + grad_output_mat_no_fp8 = grad_output_mat + if isinstance(grad_output_mat, Float8Tensor): + grad_output_mat_no_fp8 = grad_output_mat.from_float8(grad_output_mat.dtype) grad_bias, grad_output_c, grad_output_t = fp8_cast_transpose_bgrad_fused( - grad_output_mat, + grad_output_mat_no_fp8, ctx.fp8_meta["scaling_bwd"], tex.FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, ) else: if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: - grad_output_c, grad_output_t = fp8_cast_transpose_fused( - grad_output_mat, - ctx.fp8_meta["scaling_bwd"], - tex.FP8BwdTensors.GRAD_OUTPUT1, - fp8_dtype_backward, - ) + if isinstance(grad_output_mat, Float8Tensor): + grad_output_c = grad_output_mat + grad_output_t = grad_output_c.transpose_2d() + else: + grad_output_c, grad_output_t = fp8_cast_transpose_fused( + grad_output_mat, + ctx.fp8_meta["scaling_bwd"], + tex.FP8BwdTensors.GRAD_OUTPUT1, + fp8_dtype_backward, + ) else: grad_output_t = None - grad_output_c = cast_to_fp8( - grad_output_mat, - ctx.fp8_meta["scaling_bwd"], - tex.FP8BwdTensors.GRAD_OUTPUT1, - fp8_dtype_backward, - ) + if not isinstance(grad_output_mat, Float8Tensor): + grad_output_c = cast_to_fp8( + grad_output_mat, + ctx.fp8_meta["scaling_bwd"], + tex.FP8BwdTensors.GRAD_OUTPUT1, + fp8_dtype_backward, + ) + else: + grad_output_c = grad_output_mat grad_bias = None return grad_output_mat, 
grad_output_c, grad_output_t, grad_bias diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index a4e6b8c5b9..7d7bb0bbd5 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -44,6 +44,7 @@ from ..graph import is_graph_capturing from ._common import _apply_normalization, _noop_cat from ..float8_tensor import Float8Tensor +_NVTE_DEBUG = int(os.getenv("NVTE_DEBUG", "0")) __all__ = ["LayerNormLinear"] @@ -191,6 +192,9 @@ def forward( ln_out = ln_out_total if fp8: + if _NVTE_DEBUG: + print('[LayerNormLinear]: using FP8 forward') + bias_dtype = ( torch.bfloat16 if activation_dtype == torch.float32 @@ -231,6 +235,15 @@ def forward( ) weight_t_fp8 = None + if fp8_meta["recipe"].fp8_mha: + out_index, meta_tensor, output_te_dtype, output_dtype = ( + tex.FP8FwdTensors.GEMM1_OUTPUT, + fp8_meta["scaling_fwd"], + fp8_dtype_forward, + torch.uint8) + else: + out_index, meta_tensor, output_te_dtype, output_dtype = ( + None, None, None, activation_dtype) out, _ = tex.fp8_gemm( weight_fp8._data, fp8_meta["scaling_fwd"].scale_inv, @@ -240,7 +253,7 @@ def forward( fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, - activation_dtype, + output_dtype, get_workspace(), bias=bias, use_bias=use_bias, @@ -248,8 +261,22 @@ def forward( ub_algo=ub_algo if ub_overlap_ag else None, ub=ub_obj_lnout if ub_overlap_ag else None, extra_output_tensor=ln_out if ub_overlap_ag else None, + out_index=out_index, + fp8_meta_tensor=meta_tensor, + D_dtype=output_te_dtype, ) + if output_dtype == torch.uint8: + out = Float8Tensor(data=out, + fp8_meta=fp8_meta, + fp8_meta_forward=True, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_OUTPUT, + fp8_dtype=fp8_dtype_forward, + dtype=activation_dtype, + ) else: + if _NVTE_DEBUG: + print('[LayerNormLinear]: using non-FP8 forward') + # Cast for native AMP weight = cast_if_needed(weight, 
activation_dtype) bias = cast_if_needed(bias, activation_dtype) if use_bias else bias @@ -343,7 +370,6 @@ def forward( # [*, in_features] -> [*, out_features] except first dimension changes for SP out = out.view(-1, *inp.shape[1:-1], out.shape[-1]) - if return_layernorm_output: if return_layernorm_output_gathered: shape = list(inp.shape) @@ -357,6 +383,10 @@ def forward( def backward( ctx, *grad_outputs: Tuple[torch.Tensor, ...] ) -> Tuple[Union[torch.Tensor, None], ...]: + if isinstance(grad_outputs[0], Float8Tensor): + ctx.fp8_meta["scaling_bwd"].scale_inv[ + tex.FP8BwdTensors.GRAD_OUTPUT1] = grad_outputs[0]._scale_inv + with torch.cuda.nvtx.range("_LayerNormLinear_backward"): ( inputmat, @@ -470,6 +500,9 @@ def backward( ub_obj = None if ctx.fp8: + if _NVTE_DEBUG: + print('[LayerNormLinear]: using FP8 backward') + fp8_dtype_forward = get_fp8_te_dtype( ctx.fp8_meta["recipe"], fprop_tensor=True ) @@ -491,7 +524,8 @@ def backward( fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - grad_output_c, + grad_output_c._data + if isinstance(grad_output_c, Float8Tensor) else grad_output_c, ctx.fp8_meta["scaling_bwd"].scale_inv, tex.FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, @@ -508,6 +542,9 @@ def backward( ) clear_tensor_data(grad_output_c) else: + if _NVTE_DEBUG: + print('[LayerNormLinear]: using non-FP8 backward') + # DGRAD: Evaluated unconditionally to feed into Linear backward _, _, _ = tex.gemm( weight, @@ -556,7 +593,8 @@ def backward( fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, - grad_output_t, + grad_output_t._data + if isinstance(grad_output_t, Float8Tensor) else grad_output_t, ctx.fp8_meta["scaling_bwd"].scale_inv, tex.FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 9829719c86..cb2f6871b3 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -3,6 
+3,7 @@ # See LICENSE for license information. """Linear API""" +import os from typing import Union, Optional, Callable, Tuple, List, Dict, Any import torch @@ -47,6 +48,8 @@ from ..graph import is_graph_capturing from ..float8_tensor import Float8Tensor +_NVTE_DEBUG = int(os.getenv("NVTE_DEBUG", "0")) + __all__ = ["Linear"] @@ -82,11 +85,16 @@ def forward( ub_overlap_rs: bool, ub_overlap_ag: bool, ub_name: str, + is_first_module_in_mha: bool, ) -> torch.Tensor: + is_input_fp8 = isinstance(inp, Float8Tensor) + if is_input_fp8: + fp8_meta["scaling_fwd"].scale_inv[tex.FP8FwdTensors.GEMM1_INPUT] = inp._scale_inv[0] + # Make sure input dimensions are compatible in_features = weight.shape[-1] assert inp.shape[-1] == in_features, "GEMM not possible" - inputmat = inp.view((-1, in_features)) + inputmat = inp.view(-1, in_features) if fp8: assert_dim_for_fp8_exec(inputmat) assert_dim_for_fp8_exec(weight) @@ -104,29 +112,40 @@ def forward( inputmat = cast_if_needed(inputmat, activation_dtype) inputmat_t = None inputmat_no_fp8 = inputmat + if fp8: fp8_dtype_forward = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) - if ( - not fp8_meta["recipe"].override_linear_precision.wgrad - and is_grad_enabled - and weight.requires_grad - and not sequence_parallel - ): - # FP8 input for forward, FP8 input transpose for backward wgrad - inputmat, inputmat_t = fp8_cast_transpose_fused( - inputmat, - fp8_meta["scaling_fwd"], - tex.FP8FwdTensors.GEMM1_INPUT, - fp8_dtype_forward, - ) + if isinstance(inputmat, Float8Tensor): + if ( + not fp8_meta["recipe"].override_linear_precision.wgrad + and is_grad_enabled + and weight.requires_grad + and not sequence_parallel + ): + # FP8 input for forward, FP8 input transpose for backward wgrad + inputmat_t = inputmat.transpose_2d() else: - # FP8 input for forward - inputmat = cast_to_fp8( - inputmat, - fp8_meta["scaling_fwd"], - tex.FP8FwdTensors.GEMM1_INPUT, - fp8_dtype_forward, - ) + if ( + not fp8_meta["recipe"].override_linear_precision.wgrad 
+ and is_grad_enabled + and weight.requires_grad + and not sequence_parallel + ): + # FP8 input for forward, FP8 input transpose for backward wgrad + inputmat, inputmat_t = fp8_cast_transpose_fused( + inputmat, + fp8_meta["scaling_fwd"], + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + ) + else: + # FP8 input for forward + inputmat = cast_to_fp8( + inputmat, + fp8_meta["scaling_fwd"], + tex.FP8FwdTensors.GEMM1_INPUT, + fp8_dtype_forward, + ) # Column Parallel Linear if parallel_mode == "column" and sequence_parallel: @@ -135,6 +154,9 @@ def forward( inputmat_total = inputmat if fp8: + if _NVTE_DEBUG: + print('[Linear]: using FP8 forward') + bias_dtype = ( torch.bfloat16 if activation_dtype == torch.float32 @@ -175,8 +197,16 @@ def forward( ) weight_t_fp8 = None - proj_out_index, meta_tensor, proj_out_tetype, proj_out_pttype = ( - None, None, None, activation_dtype) + if is_first_module_in_mha: + proj_out_index, meta_tensor, proj_out_tetype, proj_out_pttype = ( + tex.FP8FwdTensors.GEMM1_OUTPUT, + fp8_meta["scaling_fwd"], + fp8_dtype_forward, + torch.uint8) + else: + proj_out_index, meta_tensor, proj_out_tetype, proj_out_pttype = ( + None, None, None, activation_dtype) + if ub_overlap_rs: ub_obj_projout = get_ub(ub_name+"_fprop") out = ub_obj_projout.get_ubuf_output(1) @@ -203,14 +233,15 @@ def forward( else: dim_size = list(inputmat_total.size()) dim_size[1] = weight.size(0) - out = torch.empty(dim_size, dtype=activation_dtype, device=inputmat_total.device) + out = torch.empty(dim_size, dtype=proj_out_pttype, device=inputmat_total.device) _ = fp8_gemm( weight_fp8._data, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_WEIGHT, fp8_dtype_forward, - inputmat_total, + inputmat_total._data + if isinstance(inputmat_total, Float8Tensor) else inputmat_total, fp8_meta["scaling_fwd"].scale_inv, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, @@ -227,7 +258,18 @@ def forward( fp8_meta_tensor = meta_tensor, D_dtype = proj_out_tetype, ) + if 
is_first_module_in_mha: + out = Float8Tensor(data=out, + fp8_meta=fp8_meta, + fp8_meta_forward=True, + fp8_meta_index=tex.FP8FwdTensors.GEMM1_OUTPUT, + fp8_dtype=fp8_dtype_forward, + dtype=activation_dtype, + ) else: + if _NVTE_DEBUG: + print('[Linear]: using non-FP8 forward') + # Cast for native AMP weight = cast_if_needed(weight, activation_dtype) bias = cast_if_needed(bias, activation_dtype) if use_bias else bias @@ -320,6 +362,7 @@ def forward( ctx.ub_name = ub_name ctx.tp_size = tp_size ctx.requires_dgrad = inp.requires_grad + ctx.is_input_fp8 = is_input_fp8 ctx.primary_weights_in_fp8 = primary_weights_in_fp8 ctx.reduce_and_update_bwd_fp8_tensors = False if ctx.fp8 and requires_grad(inp, weight, bias): @@ -343,6 +386,10 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: + if isinstance(grad_output, Float8Tensor): + ctx.fp8_meta["scaling_bwd"].scale_inv[ + tex.FP8BwdTensors.GRAD_OUTPUT1] = grad_output._scale_inv + with torch.cuda.nvtx.range("_Linear_backward"): ( inputmat, @@ -417,6 +464,18 @@ def backward( if ctx.requires_dgrad: if ctx.fp8: + if _NVTE_DEBUG: + print('[Linear]: using FP8 backward') + + if ctx.is_input_fp8: + out_index, meta_tensor, output_te_dtype, output_dtype = ( + tex.FP8BwdTensors.GRAD_INPUT1, + ctx.fp8_meta["scaling_bwd"], + fp8_dtype_backward, + torch.uint8) + else: + out_index, meta_tensor, output_te_dtype, output_dtype = ( + None, None, None, ctx.activation_dtype) dgrad, _ = fp8_gemm( weight_t_fp8, fwd_scale_inverses, @@ -426,13 +485,27 @@ def backward( ctx.fp8_meta["scaling_bwd"].scale_inv, tex.FP8BwdTensors.GRAD_OUTPUT1, fp8_dtype_backward, - ctx.activation_dtype, + output_dtype, get_workspace(), use_split_accumulator=_2X_ACC_DGRAD, ub_algo=ub_algo if ctx.ub_overlap_ag else None, ub=ctx.ub_obj_gradout if ctx.ub_overlap_ag else None, + out_index=out_index, + fp8_meta_tensor=meta_tensor, + D_dtype=output_te_dtype, ) + if output_dtype == torch.uint8: + dgrad = Float8Tensor(data=dgrad, 
+ fp8_meta=ctx.fp8_meta, + fp8_meta_forward=False, + fp8_meta_index=tex.FP8BwdTensors.GRAD_INPUT1, + fp8_dtype=fp8_dtype_backward, + dtype=ctx.activation_dtype, + ) else: + if _NVTE_DEBUG: + print('[Linear]: using non-FP8 backward') + dgrad, _, _ = gemm( weight, grad_output, @@ -460,11 +533,19 @@ def backward( # WGRAD if not ctx.fp8_meta["recipe"].override_linear_precision.wgrad: if ctx.ub_overlap_ag: - grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) + if isinstance(grad_output_c, Float8Tensor): + grad_output_t = grad_output_c.transpose_2d() + else: + grad_output_t = tex.fp8_transpose(grad_output_c, fp8_dtype_backward) if inputmat_t_total is None: - inputmat_t_total = tex.fp8_transpose(inputmat_total, fp8_dtype_backward) + if isinstance(inputmat_total, Float8Tensor): + inputmat_t_total = inputmat_total.transpose_2d() + else: + inputmat_t_total = tex.fp8_transpose( + inputmat_total, fp8_dtype_backward) wgrad, _ = fp8_gemm( - inputmat_t_total, + inputmat_t_total._data + if isinstance(inputmat_t_total, Float8Tensor) else inputmat_t_total, fwd_scale_inverses, tex.FP8FwdTensors.GEMM1_INPUT, fp8_dtype_forward, @@ -563,6 +644,7 @@ def backward( None, None, None, + None, ) @@ -855,6 +937,7 @@ def forward( self, inp: torch.Tensor, is_first_microbatch: Optional[bool] = None, + is_first_module_in_mha: Optional[bool] = False, ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: """ Apply the linear transformation to the input. @@ -882,10 +965,14 @@ def forward( if skip_fp8_weight_update is not None: is_first_microbatch = False - with self.prepare_forward(inp, is_first_microbatch) as inp: + with self.prepare_forward(inp, + is_first_microbatch, + allow_non_contiguous=isinstance(inp,Float8Tensor)) as inp: assert self.fp8 or not self.primary_weights_in_fp8, \ "Need to run inside fp8_autocast region when weights are stored in FP8." 
+ is_first_module_in_mha = is_first_module_in_mha and self.fp8_meta["recipe"].fp8_mha + # Get concatenated weight and bias tensors if len(self.parameter_split_sizes) == 1: weight_tensor = getattr(self, self.weight_names[0]) @@ -944,6 +1031,7 @@ def forward( self.ub_overlap_rs, self.ub_overlap_ag, self.ub_name, + is_first_module_in_mha, ) out = linear_fn(*args) diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 25e6a74b34..f60f8c29c7 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -23,10 +23,15 @@ def clear_tensor_data(*tensors: Tuple[Optional[torch.Tensor], ...]) -> None: Must be used carefully. """ + from .float8_tensor import Float8Tensor for t in tensors: if t is not None: - t.data = torch.Tensor() - del t + if isinstance(t, Float8Tensor): + t._data.data = torch.Tensor() + del t + else: + t.data = torch.Tensor() + del t def get_device_compute_capability() -> Tuple[int, int]: From 9f0a4a4b4d3617152e7bc2f57fff257ae4caddd4 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Mon, 29 Apr 2024 13:22:54 -0700 Subject: [PATCH 097/427] [PyTorch] Fix tp_group_initialized error (#819) remove tp_size/tp_group as amax reduction is handled by fp8_group() Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- transformer_engine/pytorch/attention.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 4bb39b913f..3bf4598fc1 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1937,7 +1937,7 @@ class FusedAttnFunc_qkvpacked(torch.autograd.Function): def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, rng_gen, fused_attention_backend, use_FAv2_bwd, - 
fp8, fp8_meta, tp_size, tp_group): + fp8, fp8_meta): if fp8: if _NVTE_DEBUG: print('[DotProductAttention]: using FP8 forward') @@ -2011,8 +2011,6 @@ def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, qkvo_tensors = (qkv, out_save) if not ctx.fp8 else (None, None) ctx.save_for_backward(*qkvo_tensors, cu_seqlens, *fp8_tensors) ctx.fp8_meta = fp8_meta - ctx.tp_size = tp_size - ctx.tp_group = tp_group ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen = max_seqlen ctx.qkv_dtype = qkv_dtype @@ -2133,7 +2131,7 @@ class FusedAttnFunc_kvpacked(torch.autograd.Function): def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, q, kv, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, rng_gen, fused_attention_backend, - use_FAv2_bwd, fp8, fp8_meta, tp_size, tp_group): + use_FAv2_bwd, fp8, fp8_meta): if fp8: if _NVTE_DEBUG: print('[DotProductAttention]: using FP8 forward') @@ -2214,8 +2212,6 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql qkvo_tensors = (q, kv, out_save) if not ctx.fp8 else (None, None, None) ctx.save_for_backward(*qkvo_tensors, cu_seqlens_q, cu_seqlens_kv, *fp8_tensors) ctx.fp8_meta = fp8_meta - ctx.tp_size = tp_size - ctx.tp_group = tp_group ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen_q = max_seqlen_q ctx.max_seqlen_kv = max_seqlen_kv @@ -2350,7 +2346,7 @@ class FusedAttnFunc(torch.autograd.Function): def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, q, k, v, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type, rng_gen, fused_attention_backend, - use_FAv2_bwd, fp8, fp8_meta, tp_size, tp_group): + use_FAv2_bwd, fp8, fp8_meta): if fp8: if _NVTE_DEBUG: print('[DotProductAttention]: using FP8 forward') @@ -2488,8 +2484,6 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql 
qkvo_tensors = (q, k, v, out_save) if not ctx.fp8 else (None, None, None, None) ctx.save_for_backward(*qkvo_tensors, cu_seqlens_q, cu_seqlens_kv, *fp8_tensors) ctx.fp8_meta = fp8_meta - ctx.tp_size = tp_size - ctx.tp_group = tp_group ctx.aux_ctx_tensors = aux_ctx_tensors ctx.max_seqlen_q = max_seqlen_q ctx.max_seqlen_kv = max_seqlen_kv @@ -2691,8 +2685,6 @@ def __init__( attention_type: str = "self", layer_number: Optional[int] = None, deterministic: bool = False, - tp_size: int = 1, - tp_group: Optional[dist_group_type] = None, ) -> None: super().__init__() @@ -2719,9 +2711,6 @@ def __init__( if os.environ["NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT"] == "1": os.environ["CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"] = "-1" - self.tp_size = tp_size - self.tp_group = tp_group - def get_fp8_weights_scratchpad( self, is_first_microbatch: Union[bool, None], @@ -2875,8 +2864,6 @@ def forward( use_FAv2_bwd, self.fp8 and self.fp8_meta["recipe"].fp8_dpa, self.fp8_meta, - self.tp_size, - self.tp_group, ) # ...hd -> ...(hd) @@ -3075,9 +3062,7 @@ def __init__( attention_type=attention_type, layer_number=layer_number, deterministic=self.deterministic, - **attn_kwargs, - tp_size=self.tp_size, - tp_group=self.tp_group) + **attn_kwargs) self.unfused_attention = UnfusedDotProductAttention( norm_factor, **attn_kwargs, layer_number=layer_number) From 3c604eb0d3a8b3418bea7a9ff62dbdb677d8f6e1 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 30 Apr 2024 14:12:46 -0700 Subject: [PATCH 098/427] Avoid amax roll for non-run modules (#825) Signed-off-by: Kirthi Shankar Sivamani --- .../common/recipe/delayed_scaling.cu | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/transformer_engine/common/recipe/delayed_scaling.cu b/transformer_engine/common/recipe/delayed_scaling.cu index 38e71b74de..de48a53ebf 100644 --- a/transformer_engine/common/recipe/delayed_scaling.cu +++ b/transformer_engine/common/recipe/delayed_scaling.cu @@ -197,16 +197,18 @@ 
kernel_bulk( const auto last_amax = ((amax_reduction_buffer != nullptr) && (amax_reduction_buffer[offset_in_buffer+count] != 0.0f)) ? amax_reduction_buffer[offset_in_buffer+count] : amax_history[0]; - for (size_t off = 0; off < length; off += bsize) { - const size_t i = off + tid; - float a = 0; - if (i < length) { - a = (i < length - 1) ? amax_history[(i+1)*stride] : last_amax; - amax = fmaxf(amax, a); - } - __syncthreads(); // Inplace roll - if (i < length) { - amax_history[i*stride] = (i > 0) ? a : 0; + if (last_amax != 0.0f) { + for (size_t off = 0; off < length; off += bsize) { + const size_t i = off + tid; + float a = 0; + if (i < length) { + a = (i < length - 1) ? amax_history[(i+1)*stride] : last_amax; + amax = fmaxf(amax, a); + } + __syncthreads(); // Inplace roll + if (i < length) { + amax_history[i*stride] = (i > 0) ? a : 0; + } } } From c81733f1032a56a817b594c8971a738108ded7d0 Mon Sep 17 00:00:00 2001 From: cyanguwa <8636796+cyanguwa@users.noreply.github.com> Date: Wed, 1 May 2024 20:41:59 -0700 Subject: [PATCH 099/427] [PyTorch] Miscellanous fixes for FP8 DPA module (#804) * initialize tp_group for FP8 DPA Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * fix cuDNN version in unit tests for cuDNN v9 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add hook to ignore missing fused_attn._extra_states if training from old checkpoints Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove test and redundant implementation from last commit Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove warning message and replace with docstring Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove tp_size/tp_group in FusedAttention; amax reduction is handled with fp8_group Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * move core_attention.fused_attention._extra_state to core_attention._extra_state 
Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * simplify post_state_dict_hooks between FU and DPA Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * add temporary test Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove previous attempts to move core_attention.fused_attention to core_attention; keep the test Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * remove the test Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * disable pylint self arg for hook which is required by hook Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com> --- tests/pytorch/fused_attn/test_fused_attn.py | 3 ++- transformer_engine/pytorch/attention.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/fused_attn/test_fused_attn.py b/tests/pytorch/fused_attn/test_fused_attn.py index 40cfdd34b7..caba385d46 100644 --- a/tests/pytorch/fused_attn/test_fused_attn.py +++ b/tests/pytorch/fused_attn/test_fused_attn.py @@ -70,7 +70,8 @@ def reset_global_fp8_state(): def _cudnn_version() -> Tuple[int, int, int]: """Runtime cuDNN version (major, minor, patch)""" encoded_version = ext.get_cudnn_version() - major, encoded_version = divmod(encoded_version, 1000) + major_version_magnitude = 1000 if encoded_version < 90000 else 10000 + major, encoded_version = divmod(encoded_version, major_version_magnitude) minor, patch = divmod(encoded_version, 100) return (major, minor, patch) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 3bf4598fc1..af6c151cab 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -2711,6 +2711,17 @@ def __init__( if 
os.environ["NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT"] == "1": os.environ["CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"] = "-1" + def remove_extra_states_check(self, incompatible_keys): # pylint: disable=unused-argument + """ + Temporarily remove fused_attention._extra_state as a missing key + when loading older TransformerEngine checkpoints. Will phase out + this hook in TransformerEngine 2.0. + """ + for key in incompatible_keys.missing_keys: + if 'fused_attention._extra_state' in key: + incompatible_keys.missing_keys.remove(key) + self.register_load_state_dict_post_hook(remove_extra_states_check) + def get_fp8_weights_scratchpad( self, is_first_microbatch: Union[bool, None], @@ -3063,6 +3074,7 @@ def __init__( layer_number=layer_number, deterministic=self.deterministic, **attn_kwargs) + self.unfused_attention = UnfusedDotProductAttention( norm_factor, **attn_kwargs, layer_number=layer_number) From 7413843fd7d9b4a98f9abdb8843b24821a6b96a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Mon, 20 May 2024 09:33:04 -0700 Subject: [PATCH 100/427] [PyTorch] Fixed bug with loading calibrated weights (#771) * Calibration fix Signed-off-by: Pawel Gadzinski * Lint fix Signed-off-by: Pawel Gadzinski --------- Signed-off-by: Pawel Gadzinski Co-authored-by: Pawel Gadzinski --- qa/L0_pytorch_unittest/test.sh | 1 + tests/pytorch/test_torch_save_load.py | 37 +++++++++++++++++++++-- transformer_engine/pytorch/module/base.py | 19 ++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 2c14664dce..2aa58e6018 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -17,3 +17,4 @@ NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_a pytest -v -s $TE_PATH/tests/pytorch/test_fused_rope.py NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py pytest -v -s 
$TE_PATH/tests/pytorch/test_float8tensor.py +pytest -v -s $TE_PATH/tests/pytorch/test_torch_save_load.py \ No newline at end of file diff --git a/tests/pytorch/test_torch_save_load.py b/tests/pytorch/test_torch_save_load.py index 85ec7685b3..211030fe6d 100644 --- a/tests/pytorch/test_torch_save_load.py +++ b/tests/pytorch/test_torch_save_load.py @@ -65,6 +65,9 @@ def __init__(self, precision, use_bias): self.inp_type = tex.DType.kFloat8E4M3 self.weights_type = tex.DType.kFloat8E4M3 self.outp_type = precision + + def get_fp8_weights_scratchpad(self, is_first_microbatch): + raise RuntimeError("Method get_fp8_weights_scratchpad is dummy and should not be invoked.") def forward(self, inp, weight): inp_fp8 = cast_to_fp8( @@ -145,14 +148,11 @@ def test_fp8_model_checkpoint( params_dtype=dtype, device=device, ) - # Keep track of model output x = torch.randn(dims, dtype=dtype, device=device) with te.fp8_autocast(): y_ref = model(x.detach().clone()).detach().clone() - # Keep track of weights and FP8 scaling factors - weight_ref = model.weight.float().detach().clone() fp8_meta_ref = { "scaling_fwd": {}, "scaling_bwd": {} } with te.fp8_autocast(), torch.no_grad(): fp8_meta_fwd = model.fp8_meta["scaling_fwd"] @@ -168,6 +168,18 @@ def test_fp8_model_checkpoint( fp8_meta_bwd.scale.copy_(fp8_meta_bwd_ref["scale"]) fp8_meta_bwd.scale_inv.copy_(fp8_meta_bwd_ref["scale_inv"]) del fp8_meta_fwd, fp8_meta_bwd + + # [ This is part of logic that tests save_fp8_model=False and load_fp8_model=True ] + # This line copies the fp8 scale_inv from the model metadata to the weight fp8 tensor. + # The sole purpose of the following lines is to set the scale_inv of the weight tensor, which is the simplest method. + # It is essential for these values to be equal, so setting scale_inv only in the model metadata is insufficient. + model.weight.data.copy_(model.weight.float().cuda()) + # After copying, the tensor computes the meta scale_inv based on the amax history; we then reset these values. 
+ model.fp8_meta["scaling_fwd"].scale = fp8_meta_fwd_ref["scale"] + model.fp8_meta["scaling_fwd"].scale_inv = fp8_meta_fwd_ref["scale_inv"] + + # Keep track of weights and FP8 scaling factors + weight_ref = model.weight.float().detach().clone() # Save checkpoint byte_stream = io.BytesIO() @@ -214,6 +226,18 @@ def test_fp8_model_checkpoint( with pytest.raises(AssertionError): torch.testing.assert_close(y, y_ref, **tols) + + # [ This is part of logic that tests save_fp8_model=False and load_fp8_model=True ] + # When save_fp8_model=True, we load a model with weights in high precision, + # which does not include _scale_inv, + # but has the fp8 scaling factor in the meta data. This scenario can occur + # when using te.fp8_autocast(enabled=False, calibrating=True). + # + # In such cases, the default behavior of load_state_dict is incorrect - it loads tensors first, + # followed by the fp8 metadata. This results in an incorrect _scale_inv for the tensor. This behavior + # is corrected by overriding the _load_state_dict method from PyTorch in TransformerEngineBaseModule, + # to load the fp8 metadata before loading tensors. + # # Load checkpoint model.load_state_dict(torch.load(io.BytesIO(model_bytes))) del model_bytes @@ -232,3 +256,10 @@ def test_fp8_model_checkpoint( with te.fp8_autocast(): y = model(x.detach().clone()) torch.testing.assert_close(y, y_ref, **tols) + + if load_fp8_model: + # [ This is part of logic that tests save_fp8_model=False and load_fp8_model=True ] + # We need to ensure that the tensor's scale_inv parameter matches its meta data. + # This is crucial to avoid confusion about which value is correct. 
+ meta_index = model.weight._fp8_meta_index + torch.testing.assert_close(model.weight._scale_inv.item(), fp8_meta_fwd_ref["scale_inv"][meta_index].item()) \ No newline at end of file diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 0803b474f6..31011be897 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -858,3 +858,22 @@ def get_fp8_weights_scratchpad( is_first_microbatch: Union[bool, None], ) -> List[torch.Tensor]: """Needs override.""" + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """ + This function loads tensors and extra state including fp8 metadata. + This metadata is essential for copying fp8 tensors, as the copy_ function + uses the scale_inv parameter from fp8_meta to set the correct scaling factor + for the new tensor. + Hence, this extra state must be loaded before the tensor copying process, + not after, as is typically done in _load_from_state_dict. + Tensors are copied into fp8 tensors only when self.primary_weights_in_fp8=True, + otherwise, this behavior is not required. 
+ """ + if self.primary_weights_in_fp8: + extra_state_key = prefix + torch.nn.modules.module._EXTRA_STATE_KEY_SUFFIX + if extra_state_key in state_dict: + self.set_extra_state(state_dict[extra_state_key]) + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs) From b2f2e1dc09faa9329f17fb36f9fed6357e0e9c50 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 21 May 2024 15:41:36 -0500 Subject: [PATCH 101/427] [PyTorch] Replaced deprecated `pkg_resources` with `packaging` (#860) replaced deprecated pkg_resources with packaging Signed-off-by: Alp Dener --- setup.py | 1 + transformer_engine/pytorch/attention.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 769d62a25b..e7bf2f38b7 100644 --- a/setup.py +++ b/setup.py @@ -246,6 +246,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: install_reqs: List[str] = [ "pydantic", "importlib-metadata>=1.0; python_version<'3.8'", + "packaging", ] test_reqs: List[str] = ["pytest"] diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index d4198e688d..841f2ba8af 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -5,14 +5,14 @@ """Attention.""" import collections from contextlib import nullcontext -from importlib.metadata import version +from importlib.metadata import version as get_pkg_version import math import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union import warnings import numpy as np -from pkg_resources import packaging +from packaging.version import Version as PkgVersion import torch import torch.nn.functional as F @@ -67,13 +67,13 @@ from transformer_engine.pytorch.graph import is_graph_capturing -_flash_attn_version = packaging.version.Version(version("flash-attn")) -_flash_attn_version_required = packaging.version.Version("2.0.6") -_flash_attn_max_version 
= packaging.version.Version("2.5.8") -_flash_attn_2_1_plus = _flash_attn_version >= packaging.version.Version("2.1") -_flash_attn_2_3_plus = _flash_attn_version >= packaging.version.Version("2.3") -_flash_attn_2_4_plus = _flash_attn_version >= packaging.version.Version("2.4") -_flash_attn_2_4_1_plus = _flash_attn_version >= packaging.version.Version("2.4.1") +_flash_attn_version = PkgVersion(get_pkg_version("flash-attn")) +_flash_attn_version_required = PkgVersion("2.0.6") +_flash_attn_max_version = PkgVersion("2.5.8") +_flash_attn_2_1_plus = _flash_attn_version >= PkgVersion("2.1") +_flash_attn_2_3_plus = _flash_attn_version >= PkgVersion("2.3") +_flash_attn_2_4_plus = _flash_attn_version >= PkgVersion("2.4") +_flash_attn_2_4_1_plus = _flash_attn_version >= PkgVersion("2.4.1") if _flash_attn_version >= _flash_attn_version_required: from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_forward_func # pylint: disable=no-name-in-module From 5895eab18609829c793c2112c6a3d1b358a5aee9 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen <36155692+phu0ngng@users.noreply.github.com> Date: Tue, 21 May 2024 17:01:26 -0700 Subject: [PATCH 102/427] [Common] Added Alignment Requirements for CuBLAS heuristics (#845) * added alignment requirements for CuBLAS heuristics Signed-off-by: Phuong Nguyen * minor rewords Signed-off-by: Phuong Nguyen * added unit test for gemm with unaligned inputs Signed-off-by: Phuong Nguyen * added pytest skip if fp8 is not available Signed-off-by: Phuong Nguyen * changed offset so that it has alignment with 128 Signed-off-by: Phuong Nguyen --------- Signed-off-by: Phuong Nguyen --- tests/pytorch/test_sanity.py | 62 +++++++++++++++++++ .../common/gemm/cublaslt_gemm.cu | 28 ++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index cf17eccd1b..91e67e8f9a 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -29,6 +29,10 @@ 
get_cpu_offload_context, ) from transformer_engine.common import recipe +import transformer_engine_extensions as tex +from transformer_engine.pytorch.cpp_extensions import gemm, fp8_gemm, gelu, cast_to_fp8, cast_from_fp8 +from transformer_engine.pytorch.module.base import get_workspace +from test_onnx_export import create_meta # Only run FP8 tests on H100. fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() @@ -924,3 +928,61 @@ def test_model_multiple_cast(): y2 = m(a) assert y2.dtype == torch.float16 + + +@pytest.mark.parametrize("N", [32]) +@pytest.mark.parametrize("offset", [1, 3, 5]) +@pytest.mark.parametrize("datatype", param_types) +def test_sanity_gemm_with_unalignment(N, offset, datatype): + scratchpad = torch.randn(N*N + 2*offset, device="cuda", dtype=datatype) + inp = torch.reshape(scratchpad[offset:-offset], (N, N)) + weight = torch.reshape(scratchpad[offset*2:], (N, N)) + + _, _, _ = gemm( + A=weight, + B=inp, + dtype=datatype, + workspace=get_workspace()) + torch.cuda.synchronize() + + +@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) +@pytest.mark.parametrize("N", [32]) +@pytest.mark.parametrize("datatype", [torch.float16, torch.bfloat16]) +def test_sanity_fp8_gemm_with_unalignment(N, datatype): + offset = 16 + scratchpad = torch.randn(N*N + offset, device="cuda", dtype=datatype) + + fp8_tensor_inp = tex.FP8FwdTensors.GEMM1_INPUT + fp8_tensor_weight = tex.FP8FwdTensors.GEMM1_WEIGHT + + nb_inp_scales, nb_weight_scales = 1, N + scale_factor = 1. 
+ meta_inp = create_meta(scale_factor, nb_inp_scales) + meta_weight = create_meta(scale_factor, nb_weight_scales) + inp_type = tex.DType.kFloat8E4M3 + weights_type = tex.DType.kFloat8E4M3 + outp_type = datatype + + scratchpad_fp8 = cast_to_fp8( + scratchpad, + meta_weight, + fp8_tensor_inp, + inp_type) + inp_fp8 = torch.reshape(scratchpad_fp8[:-offset], (N, N)) + weight_fp8 = torch.reshape(scratchpad_fp8[offset:], (N, N)) + _, _ = fp8_gemm( + weight_fp8, + meta_weight.scale_inv, + fp8_tensor_weight, + inp_type, + inp_fp8, + meta_inp.scale_inv, + fp8_tensor_inp, + weights_type, + outp_type, + get_workspace(), + bias=None, + use_bias=False, + use_split_accumulator=False) + torch.cuda.synchronize() diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index d68c21cd19..a4c65661dc 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "../common.h" @@ -34,6 +35,16 @@ cudaDataType_t get_cuda_dtype(const transformer_engine::DType t) { } } +uint32_t _getAlignment(uintptr_t address) { + // alignment are in bytes + uint32_t alignment = 256; + for (; ; alignment /= 2) { + if (address % alignment == 0) { + return alignment; + } + } +} + } // namespace namespace transformer_engine { @@ -260,6 +271,22 @@ void cublas_gemm(const Tensor *inputA, NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize))); + const auto A_alignment = _getAlignment(reinterpret_cast(A)); + const auto B_alignment = _getAlignment(reinterpret_cast(B)); + const auto C_alignment = _getAlignment(reinterpret_cast(C)); + const auto D_alignment = _getAlignment(reinterpret_cast(D)); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, + &A_alignment, sizeof(A_alignment))); + 
NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, + &B_alignment, sizeof(B_alignment))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, + &C_alignment, sizeof(C_alignment))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, + &D_alignment, sizeof(D_alignment))); const auto status = cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, 1, &heuristicResult, @@ -271,7 +298,6 @@ void cublas_gemm(const Tensor *inputA, if (returnedResults == 0) throw std::runtime_error("Unable to find any suitable algorithms"); // D = alpha * (A * B) + beta * C - NVTE_CHECK_CUBLAS(cublasLtMatmul(handle, operationDesc, static_cast(&one), /* alpha */ From 08042a509c999844685dfeda7d4332be2da12c7e Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Wed, 22 May 2024 13:52:38 -0500 Subject: [PATCH 103/427] [PyTorch] Support `torch.amp.autocast` in TE checkpoint (#791) TE checkpoint now preserves the torch autocast context from the forward pass during the recompute phase Signed-off-by: Alp Dener --- transformer_engine/pytorch/distributed.py | 46 +++++++++++++++++++---- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index caaef91985..b0fb80b6a1 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -228,6 +228,26 @@ def in_fp8_activation_recompute_phase() -> bool: return _FP8_ACTIVATION_RECOMPUTE_PHASE +def _get_active_autocast_contexts(): + """ + Returns new CPU and GPU torch.amp.autocast(..) contexts that match the active autocast state + at the time of this function's execution. 
+ """ + autocast_cached = torch.is_autocast_cache_enabled() + + gpu_autocast_enabled = torch.is_autocast_enabled() + gpu_autocast_dtype = torch.get_autocast_gpu_dtype() + gpu_autocast_ctx = torch.cuda.amp.autocast( + gpu_autocast_enabled, gpu_autocast_dtype, autocast_cached) + + cpu_autocast_enabled = torch.is_autocast_cpu_enabled() + cpu_autocast_dtype = torch.get_autocast_cpu_dtype() + cpu_autocast_ctx = torch.cpu.amp.autocast( + cpu_autocast_enabled, cpu_autocast_dtype, autocast_cached) + + return gpu_autocast_ctx, cpu_autocast_ctx + + class _CheckpointFunction(torch.autograd.Function): """This function is adapted from torch.utils.checkpoint with two main changes: @@ -262,6 +282,10 @@ def forward( forward_ctx, recompute_ctx = context_fn() else: forward_ctx, recompute_ctx = noop_context_fn() + + # Preserve torch autocast context for the backward pass + torch_gpu_amp_ctx, torch_cpu_amp_ctx = _get_active_autocast_contexts() + with torch.no_grad(), forward_ctx: with activation_recompute_forward( activation_recompute=True, recompute_phase=False @@ -287,6 +311,8 @@ def forward( ctx.get_rng_state_tracker = get_rng_state_tracker ctx.tp_group = tp_group ctx.recompute_ctx = recompute_ctx + ctx.torch_gpu_amp_ctx = torch_gpu_amp_ctx + ctx.torch_cpu_amp_ctx = torch_cpu_amp_ctx ctx.kwargs = kwargs return outputs @@ -331,11 +357,11 @@ def backward( # Compute the forward pass. detached_inputs = detach_variable(inputs) - with torch.enable_grad(), ctx.recompute_ctx: - with activation_recompute_forward( - activation_recompute=True, recompute_phase=True - ): - outputs = ctx.run_function(*detached_inputs, **ctx.kwargs) + with (torch.enable_grad(), ctx.recompute_ctx, + ctx.torch_gpu_amp_ctx, ctx.torch_cpu_amp_ctx, + activation_recompute_forward( + activation_recompute=True, recompute_phase=True)): + outputs = ctx.run_function(*detached_inputs, **ctx.kwargs) # Set the states back to what it was at the start of this function. 
torch.set_rng_state(bwd_cpu_rng_state) @@ -639,8 +665,13 @@ def checkpoint( user_forward_ctx, user_recompute_ctx = context_fn() te_forward_ctx, te_recompute_ctx = get_activation_recompute_contexts() + # Preserve the torch autocast contexts from the forward pass during recompute phase. + torch_gpu_amp_forward_ctx, torch_cpu_amp_forward_ctx = _get_active_autocast_contexts() + def recompute_fn(*args, **kwargs): - with torch.autograd.enable_grad(), te_recompute_ctx, user_recompute_ctx: + with (torch.autograd.enable_grad(), + te_recompute_ctx, user_recompute_ctx, + torch_gpu_amp_forward_ctx, torch_cpu_amp_forward_ctx): function(*args, **kwargs) # Initialize a new checkpoint frame for each new forward pass. @@ -650,7 +681,8 @@ def recompute_fn(*args, **kwargs): ) new_frame.cache_rng_states(forward=True) - with _checkpoint_hook(new_frame, args, kwargs), te_forward_ctx, user_forward_ctx: + with (_checkpoint_hook(new_frame, args, kwargs), + te_forward_ctx, user_forward_ctx): out = function(*args, **kwargs) return out From 7190c30a4d9159a0b5466d2f85f5bb29e63fe3f9 Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Fri, 24 May 2024 18:42:32 -0700 Subject: [PATCH 104/427] [C] Allow bias support for sm80/86/89 for cuDNN 9+ (#863) allow bias support for sm80/86/89 for cuDNN 9+ Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- transformer_engine/common/fused_attn/fused_attn.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 2d9759898f..71f8e6c6d9 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -148,7 +148,10 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && sm_arch_ == 90) || (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS - && sm_arch_ 
== 90)))) + && sm_arch_ == 90))) + || ((cudnn_runtime_version >= 90000) + && (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS + && sm_arch_ >= 80))) && ((cudnn_runtime_version < 8906 && attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) || ((cudnn_runtime_version >= 8906) && (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK From 0c4cc05d369acd7b103bc0a49e46355334459446 Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Wed, 22 May 2024 14:33:22 -0400 Subject: [PATCH 105/427] [JAX] Fixed the shape miss-matching issue in MLP. (#859) * Fixed the shape mismatching issue in MLP. Signed-off-by: Ming Huang * Add a corresponding test Signed-off-by: Ming Huang --------- Signed-off-by: Ming Huang Co-authored-by: Phuong Nguyen <36155692+phu0ngng@users.noreply.github.com> --- tests/jax/test_layer.py | 2 ++ transformer_engine/jax/flax/module.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/jax/test_layer.py b/tests/jax/test_layer.py index 1493b50cf0..a3a506f1c1 100644 --- a/tests/jax/test_layer.py +++ b/tests/jax/test_layer.py @@ -177,6 +177,8 @@ def enable_fused_attn(): _KEY_OF_SELF_ATTN_BIAS_TYPE: "no_bias", }, { _KEY_OF_ATTENTION_DROPOUT: 0.3, +}, { + _KEY_OF_MLP_ACTIVATIONS: (('relu', 'relu')), }] ATTRS = [{**BASE_ATTRS, **attr} for attr in ATTRS] diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 442396d47c..1f827b505a 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1148,8 +1148,8 @@ def kernel_1_init(key, num_kernels, stack_axis, *init_args): x_i = _convert_to_activation_function(act_fn)(x[idx]) activations.append(x_i) z = functools.reduce(operator.mul, activations) - if num_activations == 1: - z = jnp.reshape(z, (*z.shape[:-2], -1)) + # Remove act axis + z = jnp.reshape(z, (*z.shape[:-2], -1)) z = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=self.intermediate_hidden_dropout_dims, From 
ad24fc549bb276c015b2e50c4ec1141626cf3e43 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Tue, 28 May 2024 16:38:55 -0700 Subject: [PATCH 106/427] Use correct FP8 group in multi-GPU docs (#852) * Use correct FP8 group in multi-GPU docs FP8 process group should be tensor-parallel group Signed-off-by: Tim Moon * Synchronize FP8 scales over world group in multi-GPU docs Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon --- docs/examples/advanced_optimizations.ipynb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/examples/advanced_optimizations.ipynb b/docs/examples/advanced_optimizations.ipynb index c7cd006dae..3d889859ba 100644 --- a/docs/examples/advanced_optimizations.ipynb +++ b/docs/examples/advanced_optimizations.ipynb @@ -115,12 +115,13 @@ "# Configure parallel groups\n", "import os\n", "import torch\n", - "world_group = torch.distributed.init_process_group(\n", + "torch.distributed.init_process_group(\n", " \"nccl\",\n", " init_method=\"file:///tmp/rdzv\",\n", " world_size=1,\n", " rank=0,\n", ")\n", + "world_group = torch.distributed.new_group(ranks=[0], backend=\"nccl\")\n", "data_parallel_group = torch.distributed.new_group(ranks=[0], backend=\"nccl\")\n", "tensor_parallel_group = torch.distributed.new_group(ranks=[0], backend=\"nccl\")" ] @@ -132,7 +133,9 @@ "source": [ "We only initialize with one GPU to keep this example simple. Please consult the documentation [torch.distributed](https://pytorch.org/docs/stable/distributed.html) for guidance on running with multiple GPUs. Note that we require that each distributed process corresponds to exactly one GPU, so we treat them interchangeably. In practice, there are multiple factors that can affect the optimal parallel layout: the system hardware, the network topology, usage of other parallelism schemes like pipeline parallelism. 
A rough rule-of-thumb is to interpret the GPUs as a 2D grid with dimensions of $\\text{num_nodes} \\times \\text{gpus_per_node}$. The rows are tensor-parallel groups and the columns are data-parallel groups.\n", "\n", - "Enabling data parallelism with Transformer Engine is similar to enabling data parallelism with standard PyTorch models: simply wrap the modules with [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html). FP8 training requires extra synchronization for the scaling factors, so the data-parallel process group must also be passed to the [fp8_autocast](../api/pytorch.rst#transformer_engine.pytorch.fp8_autocast) context manager. Transformer Engine modules also have native support for tensor and sequence parallelism. If the user provides a process group for tensor parallelism, the modules will distribute the data and perform communication internally. If sequence parallelism is enabled, it will be applied for operations that are not amenable to tensor parallelism and it will use the tensor-parallel process group. In this case, the tensor parallel group must also be passed to the **fp8_group** argument in the [fp8_autocast](../api/pytorch.rst#transformer_engine.pytorch.fp8_autocast) context manager, either directly or as a subset of a larger distributed group." + "Enabling data parallelism with Transformer Engine is similar to enabling data parallelism with standard PyTorch models: simply wrap the modules with [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html). Transformer Engine modules also have native support for tensor and sequence parallelism. If the user provides a process group for tensor parallelism, the modules will distribute the data and perform communication internally. 
If sequence parallelism is enabled, it will be applied for operations that are not amenable to tensor parallelism and it will use the tensor-parallel process group.\n", + "\n", + "One important consideration for multi-GPU FP8 training is how to synchronize the FP8 scaling factors between GPUs. If tensor parallelism is enabled, the scales must be synchronized over the tensor-parallel group. However, synchronizing over both the data-parallel and tensor-parallel groups is recommended for the best convergence. This can be configured with the **fp8_group** argument in the [fp8_autocast](../api/pytorch.rst#transformer_engine.pytorch.fp8_autocast) context manager." ] }, { @@ -166,7 +169,7 @@ ")\n", "\n", "# Training step\n", - "with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=data_parallel_group):\n", + "with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=world_group):\n", " y = parallel_transformer(x, attention_mask=None)\n", "y.backward(dy)\n", "\n", @@ -179,7 +182,7 @@ " fp8_autocast_kwargs = {\n", " \"enabled\": True,\n", " \"fp8_recipe\": fp8_recipe,\n", - " \"fp8_group\": data_parallel_group,\n", + " \"fp8_group\": world_group,\n", " },\n", ")" ] From 4e4aecbd11faefbba6d5e2789a7747bca73890b4 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Tue, 28 May 2024 18:25:25 -0700 Subject: [PATCH 107/427] [PyTorch] Make sure RoPE frequencies are in FP32 (#875) Make sure RoPE frequencies are in FP32 Signed-off-by: Tim Moon --- transformer_engine/pytorch/attention.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index 841f2ba8af..a6e2a7a21a 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -1432,6 +1432,8 @@ def forward( tensor_format: str = "sbhd", cu_seqlens: Union[torch.Tensor, None] = None, ) -> torch.Tensor: + if freqs.dtype != torch.float32: + freqs = 
freqs.float() if tensor_format == "sbhd": output = tex.fused_rope_forward(t, freqs, False) elif tensor_format == "bshd": From 61ffb58357291cac967bc1d1579f31b9afff46b8 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Wed, 29 May 2024 09:50:57 -0700 Subject: [PATCH 108/427] New NVIDIA footer in documentation (#876) * Change the documentation footer Signed-off-by: Przemek Tredak * Update docs toolchain versions Signed-off-by: Przemek Tredak --------- Signed-off-by: Przemek Tredak --- .github/workflows/docs.yml | 4 ++-- docs/_static/NVIDIA-LogoBlack.svg | 1 + docs/_static/css/nvidia_footer.css | 29 +++++++++++++++++++++++++++++ docs/_templates/footer.html | 23 +++++++++++++++++++++++ docs/_templates/layout.html | 4 ---- docs/conf.py | 2 ++ 6 files changed, 57 insertions(+), 6 deletions(-) create mode 100644 docs/_static/NVIDIA-LogoBlack.svg create mode 100644 docs/_static/css/nvidia_footer.css create mode 100644 docs/_templates/footer.html diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b4eeefa70b..581ff1e935 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,8 +17,8 @@ jobs: uses: actions/checkout@v3 - name: 'Install dependencies' run: | - pip install sphinx==5.1.1 sphinx_rtd_theme==1.0.0 nbsphinx==0.8.10 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==2.15.7 - pip install breathe==4.34.0 sphinx-autoapi==2.0.1 + pip install sphinx==7.1.2 sphinx_rtd_theme==2.0.0 nbsphinx==0.9.4 IPython ipython_genutils==0.2.0 ipywidgets==8.1.3 astroid==3.2.2 + pip install breathe==4.35.0 sphinx-autoapi==3.1.1 sudo apt-get install -y pandoc graphviz doxygen export GIT_SHA=$(git show-ref --hash HEAD) - name: 'Build docs' diff --git a/docs/_static/NVIDIA-LogoBlack.svg b/docs/_static/NVIDIA-LogoBlack.svg new file mode 100644 index 0000000000..c612396c71 --- /dev/null +++ b/docs/_static/NVIDIA-LogoBlack.svg @@ -0,0 +1 @@ +NVIDIA-LogoBlack \ No newline at end of file diff --git a/docs/_static/css/nvidia_footer.css 
b/docs/_static/css/nvidia_footer.css new file mode 100644 index 0000000000..9d18fb3b47 --- /dev/null +++ b/docs/_static/css/nvidia_footer.css @@ -0,0 +1,29 @@ +footer img { + display: block; + width: 137.5px; + position: relative; + left: -9px; + margin: 0 0 15px 0; +} + +footer p { + color: #666666; + font-weight: normal; + font-size: 12px; + line-height: 1.25em; +} + +footer p:not(.notices) { + display: inline; + margin: 0; +} + +footer p a, +footer p a:link, +footer p a:visited { + color: #666666; +} + +footer p a:hover { + color: #666666; +} diff --git a/docs/_templates/footer.html b/docs/_templates/footer.html new file mode 100644 index 0000000000..1ef5505d34 --- /dev/null +++ b/docs/_templates/footer.html @@ -0,0 +1,23 @@ +{% extends '!footer.html' %} + +{% block contentinfo %} + +

+Privacy Policy +| +Manage My Privacy +| +Do Not Sell or Share My Data +| +Terms of Service +| +Accessibility +| +Corporate Policies +| +Product Security +| +Contact +

+{{ super() }} +{% endblock %} diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html index 65b5b90931..cb372b3a72 100644 --- a/docs/_templates/layout.html +++ b/docs/_templates/layout.html @@ -61,10 +61,6 @@ } - {% endblock %} - - {% block footer %} {{ super() }} -

3I1uO|3tF{uqxQoyX-Xtjmvu5!!vRVWT&eFQ1F$OU7f#oWU47s2zs) zYK>ui+hS1fmT>CX5(9g*!oZ%b(XU$z^z8TpI<@*1O}?&)dNnGcPWAGrS+zW>SFMPu z@|^>>HEvuVEt)k%)5dksqDd{ZYE%tv8dgTr+GSCvQgM_kTM#9R*`c6gZazPD_HAro z%V#Y0mocuqwPc&r{l@4(xlH*X>oIbEuH1Q$J$GIdu(QE7&m}m0Bow~8x3F#%!-KfV zg|ZF}VckkbkT-W?hzMex9L&0v3JtQ70%-0Nh?F)%c#NE*I8m5UiCjUx5W}Iop;Qyr z@LF2rJJ#o7viL!9tDC8nF*RTIEx5Y3`7I1j*Q3v-2bF6L9@CUxiO7BjvmD>NVcGBoesqoko(EYH>t>A%Z3M=dFR z52m;?(GAEKWlnsG_aV@1%;jzYTMID%duW!2OpwR|&DPdsOI)na=Qa6Ul!aTMnII$6 zWy(h)Ap{%w@Zs_8TX=NyI^sfunYj<5e%;!bFnR>x@)O>_(a`*!5NEMAnef?CuF21ASNOV*DjvNpTGQww=bS5OYLETvAH*C#dizii;!I#F}YU< z^l4B7Ka?nhllyn$Z^l(R7C-;`23M|~MWgQ;z&>wYSD#hWpG)x5{JXW z5fc=IXm)<0`Fv$5!%JDU_vsj$jVN_GEGH)5fT!l-eEDGeYO0v%56*#lbk#kh}=DJkSfl`qZ(*5_CJIR1ZiC13zQSv_sz_K?1<*V*=h|!t5jY&!w!(ws^v0f*OfJTaxvN7vdH{5%y%vWdTOz;y_?bmbmbGGK-#A zxF5S_Ry^T8fzX=hNvJg`$&+BoutWmENuKf%BE#akY)d6frx1{3nI>Rul3YK9$0ZVy zW#Sb>3fxsZwlq`8?lOM$0A5G@qn)^}x2G`eQ>iqD zo#8^TcaaHA#3C$`|Lg=XEz_j~v4&~7(g_cjGc2yBx$s_Oyk<&<^Ly?&7B9WWnUDxh*IX2e`g_Zy>hr+K)}!ec`ipH#~i|!NYqCJU48mcEHPLH$2wwf+w|Mqc2vj z^g{o>BQSp4G%Q-U0?U@K#*(GWv3%8PtZ;F`YEL(;S?`98d$wSI*j@xi`XMyRj{v?G zu}A%J_;@IeoQlMWGtoG8CI+XeGh9D?I>M|ICqr@gXb_?fv7U%#y%xc`A$%7iqkNUZ z!gnJybQkY)C;S7pA~0w>LL+<;6t)WogSO$=sStJ|_T%K?9XNg14<`;CL|pV<#Kr8w ziNjlP;lx&4IlUYAu7=>jjW9g88L4#ldN}T03&+hXVYqTG2$xO=;>^*#ICa<;C*yo^ zI%Yf0#csi==#4lQ;fCZiZijExOBu5H%_j{y>mWzaMl}lk9pxntOu?{ti{=2 z*5A7pBh+g)_N*KS@7aB^XncD(4{m``1KXp2?+$3)x-oueSRLP0DvH)sN}y}aQW)R1 zA$rs+hlVBW(6D4ad|M(Pnv`)w^}_bBx6Ol`ws}#S>!E%d@bKn&W&M@vQj2!&azM48{;c)Kn-5g|9LFqkS&-{sYbeB0+=O0 z$`fU~f?ZxzC9IVU$o+Q`_ze?_wN+%y=bIX788$`@MvOHHBl;pVGk!WONu+-V z&EiI}pOCDr&HlatUxa1_&U`4kXCq6KWIfXZl!t0)pK@=o9zgdgK5B z3u|B&{dZ^U}lFX7h-Sis)k))GxzC350o_d~mRyct{N|7O zTf8sv&3=xzJmv%A`jH)gq?AOQJbMnIF|qIu3ueVG1$+-ecW*+2^2IT#cV~Qjo`N^e zQt(k+Pk3MAA|pfilq^vkd2MZx*EGxBclE!)(zy$;XUkT0c0!daMkG5t#%ngQAQ+Ml zGW0t0376uR7RYTf?Wrtf!XL9t`(<72H;&;_L-Sq*&5?WeYD=>pxBKlSG)sAxa?1QI z{&9?lF|Di?%L1i11m-}2<=rOjVL6iNfLVbtZV?dyY`9%f 
zs7OH*G}|D1jvUG_y?e#V@FX<*eBTTknl*!atF~}y)edf4y0<0_w`l`!La|8T*r#nf zY;4;>$*Y~HBR0r7!*Ffe2`=qA!@W%xxOVQ1(cd&iXB|0kF=gc?c_gb8f2EnfLZ6EICu|0(s~RRB~#R7QdzToRhqM{tJU zu1PXZvGPxLAF4)uNLV`nkJt4jfcGUh_oMoAoe=HXAMu3H1S-*!aWg52Fq+~e;H+IG zk~{>Yd2e2_o$@B2t{;q4E@dobnZjBmV=If#vAAO-xe~G&&m@U6VKsr0`x5sQxd+*o zVv>S?<@P|yQmu73rV_x#JwwAZ#sA_*p17J?#p}w4dTV){3(pga zHqS|xkN*hl(z!`k#SYF2p0l(+5|;KuGT}Lm=ccUWw&lDZKG&zNqw&;z3|_By#+%Jk z@pk)6yxukoAAJ|$#g;kvuzd+`%pQW1qq^hV*giNjrZ>(`=!eUb2I2;Fi|~AX&IlZt zF&sN53`CQ%#g$btcdk5W(xf@2I#0*qnTxP&?h-7TwE%Ob&%w0G(=cw#7z`ZHA03#l zYgDO%$`vZ2RIw85WZPrdun~miF!-`f*u!>t4?CwjSw^?`?#1TazFhiY)9&4H_1S=h zD_3LEtoax=b`pB@AA)w>dZTUE-e}jQmr~nKJz>}lEtFyq{^T5Zez>hb8pQvAM1AjaO{*g!} z0sl$?kYUnO>KXL}_-865`xAh_?*o6{0e)pTzg_}<7hPmnXBh5bhR^H0iw0iD0xzPd zaQyuu0)M^b{=d`lufN!4{rm(k6EEV-zO^{#GZWW+rr`dTX}IC#jEipLaL&~kr`I{- zg7+-!TQ&)miWNkTY}rwqo%)Ca+wkel6GTUavkhSV92`al5NI~Jzj2#Dr?_KSV443F zFpD%a%LH&5nyrCZyYw3WjUjl2Q)C9_Kxs?3{2ZDI%DlE2oHMbO$Y3$;K5KBD;ys4> zKSML)Va=hROJG>U_QsGf8uevp<^i9(Ls;O;7#>9Qe*tF&8EGMypcNFNp!v>~%QzAlhW)#@WBi!W3PfbcG;w7R@O=9!T|)rXKq|lU zTf#H-;hli92IfzMR`DDD?Wa#jc=!;9qT&diC)o=>g~L&Y8GaltoH|FqeTDb0(+IrU zA6ncpe)^z4E)-yvVd(DNyo!rwPAb3_-)9ZQ0=^b&v;ecXLI^bLv7-s3giG^;D<)XB zmO!(B@*lr`z+bX@t_KZ3Z}NF*jaz@LBoj1Rp36UX8Z6CQ-~gz4u`QrMwje8h!>aeZ*-3eNC4 zkvu0jDjIto@ZIoP#%Gr>~NN+nPks{GHGN4}sylhjwv2U#9ihOCwO zqYjHQL$e_b&ta6j?oypX$@^mMHo>wC{2vi)Kr<^Ov&0H5+hlo=#~eUp&^|;2iaW+` zcH(z(Dbmm!!T3h9f(&Oo;=+Qk-oph&3gjnPWQR?5LUZ=)YUscoRja|XLl>-X-U8k& zn!~L{E4Z|1376(Ax!w|5AL3u+5!gl*k*wm&Ifx06+ zlsdzsWhc0G=!qe<>Y{`|b50x7%9RUW=d#6jd2^#@iEP-?xgegctB)tEYa-RNGahZ6 ziaQ$?;DP&eB)E;D^-Ps`QN<1jB z8mI7HQ`YcW0>Dfs^}!yO$!i3R3FI<{wF|H3GMx7_3=dZjqE-&Xqt*Qg=Dm519@=Nz zqYobQd5D$%q02ZtS~mglD|yZ`rVAC%^AcC_*$}25uO!5~j=>Y}NqD|_I^OM^hxdCH zG+TfbF!xqTTf%ov2Tll$ZR0mp&NWX9xxl90{j^Bj7Y>1R--21`ZucfE|b~J-VZH zySAvqI-%TG6_GcO1M=GCM~l|&@I$kfXwGHhmaWmKRcn0Px;5%IZHjtLzDM=1zee$j z72!~%7z&mwi`oqvqj|e7XxzF3zW$*l8W7lP*8dKr%U4CgqNP!!Xc-hPRt6=@RzRt* zDxhSU@+eWJ9Euh##eF4Fy3AK7$MDLOFUS35xh#*eWvik@iAr2n!&l|1qg2^SDDzb% 
zu2(?0a^j!+6)Kh~gL0)xpm>R5C{nUG3YRE?0!8zqKw&#L7P5uCqYd)rW5?JoJ96g9 zhU_`BAzO~jE(#(sC`jEXX=k`KB@ z^xXFvgZmpN;P&Q;xV>!(?(Lj`2fJtCq3>M8?^}cf|7A!FT7{&L)kqF=L29H2QX)N( z7`Y9PqkNGVw-2c&0`T-=1YQ%ye@r}yzn)#hKVDwJ-!Cr^h|l5gXGgh=!e7rr@aL0# z_$$K?|9r9^|IFA&?Zu~r-FTZ2f`9&*f`9(wHU9OFmjvhxq#oXcJB(MN&q(F6rOFx0 zjl4yQ_ZW-EK2vejbsB0FFR0$Ql`CwI=zu-SO)@qn1_4qx2Za(c+3x!XlSc@;rteJj zP;+DzCD!xGuwndX9c{KWTLZH(fP@8{1*(NJ0{zKdWFPrRFe@+%k7VW)VJu-SldL(8 zvNndQ(KrN>t$_Le3eDPD&ZV?xR%~Xi&y1ZsHk!;6=xc=uL- znc!>!W+kyUn;=yz(m%d`hZ|R}AT~0JaCr>J@mcK$0F%r0^wF%4pMoXSd6va zy-L+q5}w~Fh<=ulj7#TF;l$BcoH!!x83bl7Psc{%+>t}LbezB(9gNe5!g2m6_Z{YO zlvtnT*qc``;qO0x#e@5|c;BHoC2kv+FRG6N->MNn2>9$AoIDba$Z$d}_eY;PgJ7AY z;L>@Vx^)%jE}z3G8K&>TNhG8`K*md^?Z>x__gmhhxTn0vr;qRO#~;68ivnkR0Gb)zBP3ux4319v+H~?k*^r-+>K8b~cJRl%+Wj)1ycA8u0AY z4ITo>EeOs(G=q1Ortl<Z&{7TiUk9whnEnc1qi+9qrp=JGHrE2W)QN0b2vqW32en3H4_3~`}YKOD&$Mj(;P#P!@S z|JM^%y@ui;ve$+8QiwB5c52c}?mu z*B=s4ADJZZ8_#=-=e<1^Uu$)51Z%DfB+G~&vR%%9S)XX7>n>`Y7b4TOw{4t1`I|88uh23y9l)3tssQdSH_ z+G=*}{u}`uVy}Fo{3*OlE;ovddQB` z!&M`B+z8xVF&wv64#hRNz39SOo+RChf*??x1c7{clOXERUKS*(Z`5bahBWHX8nswSSU3fhUG|M&}hUGXT zEJ^RolGh~$G>cn?SeTUp>pf<@0B8>ew-yK6;$sd{l*hiUs!5n|FBsN}yLj zQiiM3%O|P#wDtMzGv$&YR@(UcGCj5oRY%}`r=i)fFw0V=?H0dgnVC?4S$vz5S-pFDbqa7%EGi6kH&#?>bu{cDWI)%W~XApAj0?sGi$E}BVagxw{_*e|C+`5E}=V^HN<~4r#K(K$W zXEl8PUR*WauoLr_8pytIA?=pnn9m-$*%-*OWBayPxqO+*(_t*nqG0CZP^n0ytVORxV>R2aOyTv5<5pXxZ+ly9lFD-MLWW7Yiw>xHED&d&Dvr+ zVR%=kuJG;F6MLxLUAtp!Jg?CRDHTe!ZNz`eC?XKZZW1sf=z_T8|)WoLM_?TS7E z6TC(hHppMIbb>X8CfaXrwk3u5inGv-k;dWEMfd0mDZ@6s2KR`tT; z*b&^_oG1VHxU9EWv~a!fuiquOYGOm0>I;~Q1L=!TUsq)r9&tW*je$`nD{(uL5nq@z-suL__g)wEPW zG%4YT?}|B~QE_{GSHb}eirAxm5j)f?Y==69^PqMiThuMUr9*C1&zl342#t-36~d73 z>Z2(^rg_19XjZ_1>-q3K|2HNmHY}VEUvt~HRNZ`bs7&Q2Xy&oYhwQoR38ey~d5|lo z0ID6=9gsH%0g}L5h_G9f%Ocrxpm5GyD8%~PmjC4qjQINJ;5jzz+^KS+U|Cya<)bm< z<1-nlK|VGVcaY2TDyhgH<#*Q7!*8`U0 z1Szu~EUWzg*qi0no1I~CZ;}s$UD&DKItYKCT#WaJ7U5;=8YD)0k(qUTT3k0STSv*!`oU4xEJxe>5McKqp zQG6CNL$d`|t)bZhuwRw|bxMRvJ@SUtHnS|yY_?!oC5+N%=NKSpNUmp`XAR66bgiw# 
z8k!B?W|2TMLE(RdW)0q21~|(HfZ8oX#0SiWqk&l-nxuzLM-9$QY{QAo$ydYG*ew~3g1XXd*_(<3l3-jAo1j3Jm(N8i?@e{(S2GzH(#O2~G zQsN(~+3+G*5YrPA2+gk{(`y^BEN&UH{ygI$UO!L3n-|Hrck{A>=Jcd^HLkO^GFzZo z090H;1e)c)_&VPvcxEI&QgECVnnl*oY`A6!FpI=j`}KT27=rU*oI7>|ClAHq^fAKq`7<~c8>y_&7mpuP zL(#n^$iHT1K!)Q>esGm(ydSaAtW@G65qjzb4xT%U{Ws3z=;PbCoAelW?%u@d3#V}M z!bv<#e59fI?Hm03{v|%hhmSwK$A|ZC@sVl!kH7y!Tx=97Rw%2UmA!plI8e5^bD~P+ zN^l-C27cSNvO>Y)S zUDJ@dXB8W{1$exEJYH^_fH#|+@lq_Ip4uHEb^RzLc@r$z_$A8lXCim8DEB8Yiz|ir z?2aJVYWD?^Sf0hwESBeo;!+^90Irs8+~>*$pN+f#?c=qk4+X_~D@#g7)ktE)o=RxV z@E(t68zBQ2ZIJ96OqG$I(F~A+4}#KvOL3(*z59G{W32-(W`jdT?%855t?(MgOmBpnI+I zXjiEunwBk$Z;CpgR{lJwDx;YYoXRo(m(7z4rR;K}6t|VK&y6zqa-+0;E|jp%iK2W? zg#Q}%>jyZk8RSP%vC`OlrK;a zUpYFWWWIbTlHY+)XpbTU!lD9*%vX*!wy?{ck6>C9*_5@IaLfD4WkV1o80Katu}mIY zG%r&U!@jPI(ce&n@LFI^%-gklgof}O;x;AQnTASS zw_%*}3H%l<4o5<>W9f=0QmzV$u|rz4bXk-rUIt&4Do2$;UdG3k&&a`!pq)D#>}-6^lOdBo8~C@8`XJq z9Y{DPG|Lc<@+L%vI%JsqTHLS@YK6aur>aB8Gdsw>MemBvn?9;rnpC*nToBod@$ zh{tr#F)Yg?l`B;Wj}fUc&{c<3uFv{{Wi6fMH-2pod^$K5Zv)2T$-aq*S0%u#scJyPkh7sF#%GTr?>Z)bF7`N+?EEaPB{2*Tz+2+hZjonpN& zZg7O;fCxkpn8VG`971RoWwtcy?@vtMp2%DCOwcSt70XoInW0%=+6>KBz#PQ$3{KI| zYyoFw)H53>MPhLlzLT~}V43Zf64w>4lCu=EvMn&PtMHeAvTmz%TSxXWI92u;$7Ygj zH)PpvjL~eZY|2NGI}K=7!*~ejsSB{N6Z1Q7PMO=4b=P$9FUSJ80+t#sGbe%t-ZCeG z{+~%I4C6cr$68Ea6~1A{w}xDiY_pcYt_Ej0Um#jp*7onFcA24BhE}t3fo1%V!p5Nywz}d84gRxHA&!Jgw*MP3! 
zTF22yACq`(^_qkaJ~+?<&EnG>8y-P$4nb5%kQ&QTKK_%6?I7d++pj<2+0!(Fa~hsM zPr-)|&+z7rxM&FQ>d`bly?;tDl@9@*D~OiKK7Lld$S)8V5sZ^ZVwF#>d=MuiW(X)7 z`uU@L+?R^CFOvz+k8$Jj8Js(L1h1Z^e-WBRVr`aDG{n*@6P+B12~%zz{|=fBYqNax zXMyIA%IYj2Etcj~Lh6OHCvk??J}E93N2752(g~bB5{}@VUWnY~h0q<|YLtufhoW$m z&@0d^!_J92hQRWv!_hdybMIZbh<7ia;?)b?+4!01a_Lko;zIY~NNgxVVsP$zmw5M)_wwnjhUSlNxO|Pj z{`?hp@7zG^R(cxgyln9Dv+A(ZKm>0KT@9xdi7UpNMH2X2n?n874&kYU4kwejN zUAG1fdGjdiSkCO)swBRQUzaG2fi>!*ce%>wS-vuQRH%uLr7PoWhk~eOR{#wP6hr-j z#Zb3k5!5MI7tO&FbR+q3tiDt4g--f0)MI-K}x$ zrfJ;WH9&|v5`w$CyF=XF-H2hqHE3``5?tDQK6BO1Ndn#Xd++Z*#`9v1+FHA+&X!ND zvuc&gB7K}cva#}nt(cps_>Fc5UTO{p>q*$BKO8$I6Iv(q!_LY5anNWuj$4ey zG0Sn-Z!!uy4Mt$I-XN?P-4hFj{0y5uonYLpEg`lg#|v@b`7mr+IC3Z&C|g z8`VG;f!YRD(4kHxw5(nhO{t0X`~h?3bjEJxPY(KSs_r;JQ{V`zJVDuS18hyu%VLAeRM-4}xVS~`MS2uK{ zy7#8~bw`hZz0h-LfAkza2%{#ChuvH|Y}>pIyLKGJ=BuJjB42sN6qwcVDeb( z(;JC>(?{Z3F=^2#+?I;xYAz z%L30TT;o=eiTbxhr~67W0g<`VgRKPPd|7gvh|Ak7+3~ej+wW}_u(nhd>StVj$~>QN z`;)CRxI70>x7pzC<|*vF8sV9}5uUnOBXRX81kdV=9H&KuVqRw#%RxxapcFLkHBnRE z%fMR+^O$j97RI-(uj*L0Yze;8kKwQ1enos-qJqdkFLB2RR95By51Hzm;Oxn5gk~)y zsDQF28E2xlNfabczMshu-$L`ZQkG_cWAdct68beT7eF()QmNn{axa}hl;5lZ<{`7_ zfLNH7c@_>iSC~W9_l?T;kjP^vwJ$#p`Ce&xsj)xGbFIb)NPBqy4qm=^g4YDl^z>vrdv+fmKjt7eM_^eOwc@#oq9JY? 
zGOe_v0=K8)!jO6Q2EkFAbVP+vd#erp6KIxHz*z4gM#0BEWG!eC!Y4Y1UC4^Cm9dy@S~6ml1IB3=#;> z7jE6cjT_gI_3{a_^Pu_t>uk1j7Cvw}H~Tq$|NS$s^AvOJW)cocDBr!xl`5%8K4e<( zE*&~y$Cj-G*Z}npgNmBM-`>yLoA-eiJoxOoI67+6$WpEjTIx=SmVZsO!F*l_^7Tca zkNkbdWtrmDhwlb1`+74^-cu5PQ>JotcC^RI<41A$;C}4evx`u>6>CthOWfCgX&1m=c^=oiMj8CjpehD8sg2PD z7=x~DF?(G2 zP92Oxh9lrH#{f<qIDtDsZeDrjB30_yXgslxK<+DGawl4bemie!dGX+x2&{iIF4t*=brAfPLf zeX?G`=3;lourZ_QC!>+%9TTvnl;e4 zX(M#&+8HC*CR6kVV}|u)tXnl3`*y6w(Y;%6^59lD9oq~qr!DaHS_{v>dGLu?1nWXc5-yKU|?#va0MMh!u;(0I{-H#oUY8cV1 z8UhxYBVwjLqD;pi&U6$aO-3NZbOgK%hr)yRNw}R6CO4^prGtKk58>E(^gwv%4T1C2 zfpDER0DjYlB5vkXT-&t_*)h)eMM=iJ1fudd;jr|Ee2APukYBq1ZsA5rc*5$>=C7Xo(R zZuoB8^4);TuB&jtWgS9y&%?GEW3haCf2=hgh&2X%u*!5K7P0(xEaOU>QP{i65GS@- z!Ex6tcpg{?_kGmfCGg(A3ZA=H!EwVP99upcTdk&G-l*P~KdKKquA@}!`vY?nZr))l>>_joR*MS-eLh37oto;?V~MXEjCeZMw-hJm-Arx z9%1*sK=f9w^SJwxa{s;Ug~8g4B2ZHgc37zMWP+TBJI%Om!fO$9m9?42?y%yvnRv9z ziswwhvx8Ic;>a|@vnejF8H?yyeUN=*mWpa4vBo7Vq>MYr*lmQ{@=ukTkVhiYaQk6f zOzYej6!3xWZ{33?I!ur;|RL7TM=+TMm`KJi;Z|Zz7&V%G{p*bI{MbZ{|mhZxK`_|H|ZTl9QdCfd% zE|@7t>ji1htkpwn(?a46Ay(#Zp;;F_Bajf~?(7KLnU?Bf)V_UtJb6Okdr1I(my4Y2 z7pjAG{P+O^b1u}ZdW2c=wG|f!U4#v>HfsR=6t7=B#;CA^ek~1=J;p$afx^w~epWG#YOQ=4!xpXF8XR%JNkWFCz{OK)z z`Q-x+9omQ5c`=JCR;Y+FrOKdqvEpdcx((K?TMvIffB5is_x1JD#C1I|@`0*GuVz`>#wNLM!=+UDa zeiDdn*9J|SHb$*lHI&u6R7n|#RNzA?DoU6tRh%kTNZc+UR`@t|EG6J6AolG(ox~@z zjQCy3I`fuoa=bvE0HB;NuqVrM9UT~#!WR*I%rnDJR0)6+RRG_ssg4psKzCWqFtqO=u@v2Mz?N;ah=*?T<3O} z@MA|z>(LdHe(H>I>@bXI-wFeoH9@QLWl^2y$;a{EfK%$Nt80N3HV@<}aRI1WydTjB;cY>Mi28>4!?W~kMu73wr^k9sXTp;62B zXw;%D8n>^e|lcF7#(dE;cHR?}QZGx_kl8?_5XjqZ`Q0yoj9pXOVmVJhE?}L(Z-9 zczfqOUT2)bySuUY_$-L;vLkr&VL#r!KZ^Gsoblnk58pAN$a)fs8|On2=yVLnx9!3H zb$hUV?QZPY%nso$g7My?IJ948p*xBbgys{+j^a2W`uMSfIC1H&vw-QhC1Cp;$hM4<5yT-ZGy@6UVUkLPFbYc`>m z@cSv}7Cyhei!bj8$ZsFuxA%|n*ZU{<>rE#9%D#s`v+v-Kteg1q_yRr(Am2ub%O$C5qtC_D#SazvA?% zLNtU-MqEVCWGc(ceh z+4rBJS*xFR18Cq}P;apm@>nIV{|9K62nS+i_F^YQ=BFP&c7*z{_UzdUuV25xvu95c 
z9ubO#3+H0&nBho2pN3xvXu0Ax@ivEWETAmlED6ec(LC6!NB!&?()A8klOp&bvW>En&i(m8e#~G}H6B43B)U1Am0a@Vv z+jZsB{O*+kWpU4tRKm@P8_BB|kCdf3gK(P|7pBS(7vaTrIf3gJqVeEnGSZWR5F6@> zw1g1l8geN;4H4nNh>8rQLJ&)EjwLjoOO3_*SC5pnJ(ryaiLre3;u(bdxD%Sa5FX%- zyO|ld`ZNPBNl`eFoCN!XBsj(;;8RS{{Aj}-jCVaWBuk0m@&%+6DEyEzX5&Gp;HGmAS6|dg$ z|1Z`+vBVb5Q*lCRnG!`&g&(Ub6-%LF*hCpPM#7 z+gjD|bMr1Ek5koD3< zS?}UyP^n~j^zG6M3oYkDf0Q034H*l)VdG&uP9GL}Mwn?}iFxL;u*lkunv2CY^RU`( zDYh+If&D91;n=!OaM-*Pt~(FF=Kwp12kqf^)CoZ+T@d2niZCZPM7g;kj_<5wZ!e^W zhax>94C#^KI2##(wD54Ga5*_T5=k+UNQ{a=qRf;T#^;BM3JOLbJLO*dR`_^}n@%jk zA`;*e5P`t3IQWDmz$q{W_P!A~<{64ZZh<)9;){Jw-Z(pu7WC!^A>C?EC zb`Ez_+2K8X0T0eFO*@NBZhM$~29KDE?j@(OMv5@>EGCh+_uTuV+tYFsh`LlfW} z5Qo5o3)r~pDB84Wg^u-WV%CWM*km*T11lGUhs9K+&9}rU>&ZA}ISz5=qY!O85|M_Z z5I$uD;*9lR+@cni4*Urrrc>cQybrv`_k+`fo^Y7Z1J2_K&r|y%Y1>RZO+JYa4`cC1 z?iKv?;WmEza0kDm&4`ZBgC;TL?`R~I;m-V>eupAE^m*BC(0z5jg01rWPZ2QR>#V@9~Se>(0QAixqnpPrbDhYqW8a*h#N5u7WPD5m~J zG5WCyuI;lUe461Fp)+HzIicACnOg;(P1GQ}I|ST}?cB#}$_$$#ZCRqgXp*VxHJBDF zwC1uQmgc)VEGRQ%>@ZQ24c*ykio4?LEY@h|dshbk5}NPrw8X>RJa79nJlZ)0k9X=L z_kb zETQMLxXqdhz*d9duR&S(M_VpzjMaHPh_ujnWKhon>1I_H>GD#nPtuCt*@cTAyU^V~}E!8!kYZMkr)?fi2H?8*wA z*GZ8&JKrg^Lo0ncndTt zKo&^PgJwDZb+&}2%O+?(SI2z%NLc=)S(?9mc*W~xA?xXVJb8FaP4#^)B?bwReuxeC z#Pv&Y$bNnekM5@AU9}-J3fv(th zGAODziWB0BbGbxOO(kXUolZPXLcbLkKxTeoRTe9!xH9NqSO)BrW!cJ=E`c(oi=%wG zQm9bAG-_9`j7GI;pmDw0Xx^|PS~hKp)-AqAo0hHAgbnRlwm`?$E%8H}R_M~U4Z5=2 zJv((opUxf8pB;rkKmUM%gx+DjdSGmyUYJNtA)rp}+Y^QZ`@&>ke~j+b4n|y`MaZ2) z5S}+|02VS`G;|=A4I6^hqlRN0JBJ&`kHFT+Be6?=EcP2s!eN6cIHo@pC#LGd-oOa< zd^YS&P2pr=4rgl%I9i(HxTP5mbA7*s1rAtS;h?oO_L-aGu;mOKw490c(@kJGY7B<= z?183rYN3Ke6Oc9`G#8Z)N%4{>S*#=~m8-%|MGe%fTnn`-)Iycgl~75-Zi)L1f6J=X z8)MRBbF5mn0Y|qVLR!!%MES=dz&8p3J`o7=kK}R){JbP8P88xJ6OkOBhO;T>aVF^; z(&AH*8k>Z)xFnS(MI|65<`hmzT=no+rBjh{lq^Rh+MgXm-(bWChT{yk-8_93&u-qv zhbPbQOI9}idiw$R{0sFf^*iw858%^pO25DPfTtM`kQ|$cgs50{D%ttt{e0!}HKd-q zg5c;B_yotoon=1k;DxPwk747^gV?<15Vq|*jO_;xW6hTBSh0F7wr$&ng9pWedkjtv 
z_OL&46vw&lc<2Zm4jhI3er`W_0v?2a-vbWt-g5$e2OQyhkRQMUuJAw159}cV{6Qc1 z9`=RLAuo6xaEIHTWANg&V)@Rzoe+s9XXEkoTpS*pj>VPeP^1M0BhE7j;SNEFI1+}C z-N6Xn9RlC&LGauX2=~o_aN6jH6YIQje7!eLZuG?o%3+fyTsFJGWve^fcX`5dzb`u` z!SFxI-|0jw0_|fElL;vjy|#uf(FI8!%_iV)X6$ zGk$1P6LZG1gEn;(`cx?n2lGitS!jWz*;5f`H6F2M;}By!4iUx^5ISAfr@^RwU3U0> zK&-Vsf=2g--?-jz9@mrV3zxC|;5>N%0_RP_6^CVbecA;duSeqbtvEco7K=Lk$p7=AH;%xGY(Hqvr`i7fGeR# zao%q~F8J)h756Q;;f-M~)jAFT3(=5@>F0)KqQ)pKXbE3ODuQ8N7D;}ZlwcLC-=0yLiqv8sBB zT)3`TR|6C@dlIafk{`s!S(8V;#NXLpLf8sdp1RP_S0N6G!;NGuSs~;Nd~m% z4{Fa-5f3+Js4`&%Si!HMS#!-$(Co$r&4=c%C6(7fqfVOhbpbg4pU_;WVoLH?ah@ev zS(*ukn#I`#9!}i$Uw~PpYoPxmUBfAu%-i;F`4pB|b#)eI1LqC% z2~}tQ@FE>B{~I)G?NiX((!f~C^Y7BZut-9)NIsIU?zbqMziwZ?c+-3H=Jt`BTOJ%Q zFFru*Ju)xnBm1#q;~LbhRb7FSSOvvGC?lsOD#O+-8}Q}xdjfB^f>51RS*DME|LzrX zbDt?-R;jW!6B-G2BAL)c3rqKy$2_6FhGvmKv!s{P`O=k{0>!HQ<~iOl&o_i$arw~V zG(Se}%LgjbLU4dPoxFFJIo{L(P9fYn{(1fRW#u=Gm^Oie>rYAViL zyN=9f&+saT^~}vumVfzep0kc`Kjb3!O*ZDvvqSZ&RZy>X9aJn=5haRC%-6h;@WqOw zX5ISg?{gX5F5AREw+!J;M%$MxMgW$WrzK1CKG0&iN<7z65X@J24`e5XcAsE!BKL++2fFT6>VM9h> z@Sx!s(0>R!Z^JNb&@c=iGz3HCZ|d$n(WP?-{M5M}di>B11Agv|p}o3bWM2YvUjj2V zl``zt3#QE5j8JSnb_DeM^~AJZKf`=rZ_F6d7qbTUh3$}ju%qTu^9J|B!odTvV)RI? 
zoG=n=r%%8-Li4=QLtx#nH*EU!rTTHXAFTTKg<0R;Fzn4ddv?Plb~wj%>4ecgbi~m1 zZ7{S$I}GX2p35CExNQgYYuOHMYu8n757O2psZwmKB22Zm60uWKtQ<;}tBz7-tD$1W z`lwvK79{p;k>Y$7tJg-$&i&AJ;6w}S+cs^dc3{hftvq%!HmqESwaZsw#iFIKwVnk_V>8S$ zx54}w^Qif-0NOw$zq6W$c~)~V$8;uU8d+n8;SAUq&W73488Db+gQ?@KF=nVS`g9wI zj&1s*Rns15Q?DDESL=Y*wcDdlyIvUAXE-Jf9D&h&24hI~0qEPMAG&w$kDuE2#gA?K zpmUo(=-8?^S~u;ErgeWr-Ky{t zs@zwtY&-t`KcYs(pHRJW7u2f$BWl;|h&nafpicD`s8PNi_t!$zGSzr*R6z0KRrowr zW(TqzdbVRHu~8i?nLG@Z{eDEJl0{%|J`raZTOeho9ulp`BHnTw;!O!zrsEN0G?pMu zXztJe^9OcDs+}pqCk#aJ)PeBQ9|+HB!{9x2Bzz4n5t9Ky$|2k_H=$DBlrrSuD*md*!`C(5ykW&bL`oQ_UjHJeB(f z^OT4hlD|aQct~h|M0kEE+j$L{d?@?yOq^Rh0x`3DA?M^QWrdcIllQkz!Cfk2yS}R5 zot?b)&gp9T$;ri7I30fB^2 z1B~Jl60WK+!0xyvV zROAeTv!AQLGgDR2uYp+yYK247Y0xYnSXlprxdLg>ti@KQ@`1TP9x}4u$x)^M|Agkk zd4GG2{BsGen#EaXd1gNBy{Wj%U)8ywj)lRsa9b2Wb6zTzX9dUqW1R{?b3y(E*ZpcA zZz#C|e0X#F^2YY(gCh@0FxUN@9N>6pFCQ*jl;($7;@y@gPM|4W8l_8?g2D7@Y9Qb{ zamNt13nk6cEP6{gd@Jq<1mm2Xr#yzBOOVq+^IHYY1l~7~na4jZ&60lk@B*)2J;37! zd;z~_{tB{r4xw4jm5EQpHRJPpf!uSvXOH9D>3HP6k`Q=uJ;R9pz_nTf1tcaWR?1UK*9gG+QY_QytJM|?bXB%H#*)bluyd>*?KQ?V;C z3ER1UcUS~YCM3Zv^$grjCBZW;9@-$Y}o=ow`qs2t=pq( zo6hLatPNT=`5x7)S3`-irSM&;Qre&{f_D)@vj%&Vz%}#DUnfWxFHwdc{gMg>B|Mc9 z_bHbR1<`!569}tUs)$Nu%cBC}zG~GPs9&!!nm2ES7A@MMWs7!bMc{1teH*lC)eh~b z`VxwYow;)5%c@XawPmz@^-5~=ebc&i@IB$QO`W=ESF0vE)~<=R_3NNRlZI$jw-y># zs(_X?s-tb4TIfi%uUQ>!s#ihlDizVHVmWlGS{=O`H^V>zZ@(WpphNQps9wGlDwZgw z{2f`i-j0xy$B;r=U|||C3;LSK#y_L(P7{iH29$pnspnDCfx?2F=6ogK10!}|1h-Z zI}|PZ4@F19@DC%#q347t=tn5-HGT@u8;)i_^+wBX{m{7Md`BdRuRjxsgssrb6p z8hwu%Of3`fu2T<$Xj-PyV7F=QyZv7`0#fD!1@cR1R18-x~J z2A~-`9gPUbbz65s?G{~8zv<6tNY!n`y4C(3m1;CXrCL-S)~RlD*0(jPHST~ajXI!e z(>AEyvJI-WXoFg9I-)K+8nxKbsMV@H>T|gfI~$EV{)omMenhhl-O;>5KeXu74=sOS zdv_eb?E_K2%@EXWGXnM6k3pRdqfxWnFw|-{1l7Oqi^`3DLe)l{QKccT)uc13H|>Hl z)tmEqXodC-nxIdE+E}kY76Y5tLu1|##|g)mmRjNTjEP9G8G{7tQHZk`g>cK!2sRys z#F_dq@7x5ld$vdV(piW$8jT3<53n4G0P`^jHJ^ZRi-`!f9)}Ry@d%zf0k0ws1FtU; zR4?G~_t)^(ryIZ*>hpEr<2C&G?mFLbH}K2*TlkW51s|WC$CsDs$i1J4hv)ooE5QM` 
z;!fa6sw-Y!^u>p(0r+?`5T7!`@aywv{FxPtzp|q7_p3d+LKX zet+kQKXYAqeHZ+l?SemFIpeRFTz>8hJaNLW*AL-E#7f*aX@MI@OmTBBp>>y`@<+ZY zv5pD5x3){H^6ALjWsVnnZSiu)9K702a3(Zo?IytPvqt1bQ}k z_S@m%K^uazJ|4-;mOI4aDbHZ)`1%fZxC1&h^{ycJxGuhd+Yc{RoQe;HoGZeh47% zsgR-hlbNs+Oi&gWR&E<&WhOs-4b6GrEQ7_p1egnhvw*Ay&0m9aA>|npm0_{acAW)S z2hHDtv(D12N&K8O=`79Pg0qg73SZ+}%T&?8)46#z~)IHmXIJi2{bE; z7jSkJt8<GCX)ZkqeDa<;!UVXR#y`pb4n2bF-BcKr0A-qF`Akf#t&1=Ij@D3CrT2 ztXYU<3ThpgD}d(qFCShaIUy7gA)dH!HW5z=xmwvYqnfUPT32xj>LsYac;^Knw z{MkL0BL|l+oKesm8x@R{Q_*S=Fb4uNN40?ysZz~fh0c=qBMI}urU^YR69oPj3anhYva(v1 zDP97#DpXXU+?LSXvr#MbYT63D2?Ikqbi$|~e}-Q7{?O|&2-63T#W=!89RjGthb_T- zqa@ESS*#dJF;7uhf_R0}Wl*+6DFx$|3Fwt8RAR@gEJ_fvi{&jI{ zxnGDiz8E{DrBqs!or~|jE2(BzGnu~#2ZF+|-6s%hPI_R$K}XCM9rM5%moQ8qI2S2e zMwO#w*FIRZ-wi7~qp?0B1uMd0u|DoB)`X>CU1$o{hf`rm*cg$F?eS-@IXVT~Vp5gX zhsI-bWHL5|Ct_>lDeR0*!~Vo`IFfP^4(G4Kd`Egbj9z1)7%;(S5`dum`>lKfEiEFQN zaQpQ;+{$@}o7An`_qg-+BksKWgey6(k@ol*k{&)m+&z{(BNK7=A0h0{Eri^@!SoJ7 zGw#Co%5}J&xd{7HXK_3}4aXDD;6xIa6VKvk%sCv4JdM5KN!ZP{*%lg$O+k@Z7Z8rs zzM)v@9fH+9;aKY*g*Du^HZTM$d;_pPC={Fd9Bxwz#jdbWo*%*vZUFX#`eA!WAU1LP z=Fnhl3J$_L{{XBcq%3p_!~&N{Eb>UgJhvp+x+KERB?)uvV_|zN1TzkJVdeo3*z9+Q z`3^^ztlf_h`c|k}xjMQxsD-r#inJWIsixv9Y0G~i7N(?q7>gy10jAyY>`cE{%jC-FXGKi=Hhfj1f3@af?`{Qm41 zuX_Z)W$wWrnLF{P=+PehmAM~(-9Lc8Z|wy#4&$%$d+@?@F76VV@9(w1!(B57o-+ud zGa#WFC4IPSCINH?9&EM2lWjBcZ2N4y;I^l`%~TxoOFL&`yU{2#C{qGeN)|)yQbn+I z+|RgUzleE?Z?RaK3El+fOqs27Tfr3d5*ALTlvk-tRWH-jZ`P)7e?W-VrnA=o`hkk^ z%w?HaXbbl<&-((q1oL|Yb2aUJ-Xup76+lz)&!3lX$Qd1C1DsB=wIL>cJ9XPXI{|%Z2x>cUc zM_p6O|G!Di&?cP}v2;_);{ACzZ9 zpdU(?lId29phoq|h>wfL$9K7!rB;Eb2F)5cKgZ`!xwv}uoU+0SG{1TKJP(@1&snGE zc>5*~mfy-?J{cHDFck?j^SpQ2k5qojvMhem0>XLFES6?*rFfh3SjF{?iSWm*Yv%}_ z_Y_Pk%d`wYBrFR!e|npP$M-Xk6d#3K*Dnz!pDAnfXTtx-_c?^wH_9y|D$EBtFCHoD z@CTWBPX^erbMS!x{(;wg!_L45mhZ{a8;FSsKx}Lfo;=P_?jH~D-B7?RR_3dhPUCb+ zG%lwn6V?gM(mpKruY~3dLbIpa3HTD4qgn3Ed)HKa>+I(b@%GhAWD*!06XLKvnovw= z-Wr!ofIfpQQOQ^znTWM930M&sgLM({SQ8vgMPOq{BsK;FVY819>?0#_{_b4`94`nW 
zFEbx0aDJM3AFl`^Z?a!u|K8oGUAGpBmn@F*<;$XKg$iiXpb>s(*aSb;ZNQIl6ZCJ^ z0)v{j#L(ugFuqM^=ymD=gKmQ{rPnaDtX*G$V=-|NkVzwISI76w8>40OCTQNMKAJYD zr_`)r1AO1OG3wW@g>r;s8TePcH~~^7StuhjcL_j>JAx*GW}VB3W|h^TH{Xp!ZRfc% z(L;$c1ayLT-aZ}ZN@NV3bP+Oi>oORo4zJUs)OToBrYP$3LsE;|YEU()+RUpi%g~^h z?E4ON2*R>mqEbk>I`IpX2p5u%NR}n+oJg)Cl2PiiPg0Q_Bij|oGe4gS%wpj$ zDlRS})}=`C(&#s3Iu1m|;8@C8EO+yVK{wc)48nlvvsqu6+@cu1@75ot zn~%ccxE~e-#lzYs6ej%mnYxF-$R!X4PW~#L?&yPQ_FgL0XKLu=3qxl=7`q0*+$|6m zOf6mfVa3#%>vqg%o?jRi1x8{;cq~@)URxCri#3sPyeCtzB|aIulh5Mhm0NJRc@G{L z53!ffX??^276(pX8DVg1cpQ$Uoq^M(OK=ul=OMKQZv7>q8(;J+6_7D!IlW-_C3CAy`EygxN#lM_in@O;XSxLxQ~PB7qBfc89S2Ld0@IF zK8a}xcAQGZ_ETxt%+Bd1!t=WL(^wvxf~9dOSQ49zMNF3^avS$A;r2yQr!Y4(0rM!k zpja#nW6Cz0>l=*41n336K3G6-Smf)D#Xc^W@9PXZUq{&bIAD&q1Lk@=VIDumvs}Dj ze!>qH_K~n~JOyL>WSBapz}zVbW)3khJQfK3Bfgk+*av!h-Ppb!m^5!S%2leS;twy@ zAA<%ptE&5J^VD&;zGe|F%rU@O+sR0pJpnN@#v#OJJc2C7q^1pUdf6Pr zTTel>%{YV;nxiep5~6v$*#yK|Pe#H_J^0NUi_0!+@W-uS{P8FbzhuTC>q;Q9F8Ja7 z^-%nBFAl#xNx_%rXYk?KS$uhM1s|UXFh?QhZXEu|IZrq~kH2zK@XL!Be0UU$w-5dB z=AI`3*^zl1$GfXX@$t$Le7dw3pDygc=X0C!%lR$%>*8+U62bW5LHu=PH~zZ14S!wR z!E_h?ytEU)o!gFIQ?}x_#BF#NxC+l5=HkJAYh>=W#$!VBV?uD|&RKZ4W2TY-v!t0j zZ18ZK6&`Gnn8}3UZQ}l5hI7je5wXMuQ#yA-6~1$7l`e%&RZGEZfgUms&*%As>n%pO zBkmbH%~j+KnHpZEgqP{z^QVa4YD5_;e`Pgg{bmy`%dz6aY=S$yUWT}4Y-4`n+987r zcZhq2u^MP7KFl(?Nv61F=+YhL$lN7vBC~OMg&smpd*Swm>A10a67Cbs@2lzS3C}Wx zzD#x`K);>q+o#~!(HS^mHV6%iY40zCvBU7k`!~pX@*II)gk)DAgn9%Z*dqu*0?#hq z3Yx`dS!R%xNvSkwlz*i~0+9lZ{|3$O2=pPu76xaD>zoIfq5^OxqvS#Ax6(oRSJ13n zhVwX6QUUlpR}gUF#&F%y_4}zQ* zVNX;Tn#IT2kI?7GR9B#U$$H+g|CDV1U!hr+#g|tb6sJjNeHNFA!f}%S-=JB+?Z2on zH0!`wdLzn5xsYUA;k@$29*mazd#BkO&wWIDg#^-?YU)^8Y6s|0hYD z=EBhYtt9M9J~UG@^{|X+_4n{0F#92x&>Z6Fg>d%T_H5dK=7gRNYgWR{)HoklMvWY* zW(1Td8RC+llK`QBt(waE?MoyiL?AgilFVp9P>nwbJpQ{Gw#YP6;OiHY>48La?MLKYn<7841ar1b` za%Sf|L}Fs7g67PJH&xvqKfHy6*f5-m4_D<%Ka+rSX>o*Rv0`Ttth3be^($w2uh=8d z&kaHTZn#9SmXLhnd;R+PQ@qN_M%>M7*cuv)&C#)h<|J&1JcZ4nr?4(K9xFp)u#6zH 
zGBh5mgA=eam@pF(jkSScSmWb|RZdPg795O=1n0*DlB^dm@bVeU{OSdsJ$sCoSuStnp=9rVLuutB0SOG)4C&jnS`J6AWwK4C9)Ak4a5hLa#*!OltcRO!|#P z@0OjFHMdmJq9{?Un6j>xlIRsBxlSN1MJOy?g4>E|QlQA=#Tu)#-s&v0x^)3s0nAD~ zwgzFcQRVVzS*;>kR4Ipc)hpqLy4BFNK~4NzzZQP1Qyo9adfn>yp>{R=RKGR`5CX=w zZ4861tzr6e8(4H}3(IcpsZOwB9c{Y*gxNj2VNw4+SU+MQ_D>mt5jR>x?$mn9#}N2 zI~EM=3cEgCF}v5#nBBJrtor;6Yvwg;KrhT1)E9PqX68>E2aC}o(TQzaf%TI?d}Yd% zLaCCP#l3imk|ht{;t+1Jz#pm6DCJ}G0!O+15M_#gCYLt zMbWJLK$z@ufa&o7tcp5|?WtEV*DnTUZlN&s2#1M#7z|y4Vc;CdGzj{R{+Q;G`Q2?OnO-%xx|(a&m`>vj>d0 z&5+w>dIexH%PZFI6`?U$6&j7rk@46XO~u4vTU@TFlozUxbgJ6B<7Q&xq!i`|OB)|tVJeYr=4{RgDFq@rryW}|7@mZZiNjf_* z26Iluz&0@sR#6c!4q|;Nqo5F&`UhZka1hp{Bx6~80t_8pSZ+_u_78Mz41$EYt1pb4y{PiQ76 zTaQD6`545Tj3G3S=ei!k%*Vk&e*jJ&Sd2ff1mmv<3HT#35wEX?;Q1MEWS=KIrw8Hf z%^3Xl^elcQG=F|{8t?8U;N$%`{QV*gc$tdd?uFswRWH0tcfkAeC-5=-1U{TSf-mQe z;J5VS_~XnG{GE0fNZyCP;s15idd_8^-Sc_l1*5ZrDI(+t7g%3W<@!oqG-gqrT zw(BBfIW52|r^R@2YyqC^x5KkNbMSb_EIi#g15b8X;R(U`;TB6g+-!*lo2?1TGjV^b z4P}XY1m^qOD4Aeni#amZn&R?OW5n7T;E?_Vbgfzm_4p2JSiA`I+t)?<<{5-${rkC_mUmpjR=R3Cuez@o1+tUhKEUtqn$S9N7x#i)1?cDR{U^ANMy- z<$fc)IKVnH51AqKE_HwF6udk<11{Dh(VXvM>G)3_(idM|J;Uo4SqSk9RDc}n>C5kb z0HN6*fqBp@kuqeeXa%nd8olyV7@7-%vj)lkgl6|V2-ZQf=v!!(U+W#nsk)*Yv@serSjg|GSV(p_7FX79ZE>ZH~C8))_*G<(8>7s!Xa ze8OkS!b|VtD`2MbeN7Z3f318U-F(S<0WgcchUUW2F*HlF4Ej@k%y}*u--7c$Ve{Yh zf5CEL6`rrWaEQ2X^V2o73R$PZ*6H#WXco(|t{n^Xbz6Mh4&UbgZ*csdq%7jjEbBM0 zTv&ynIe=vpXqJHF5_d6}H%PFD7yQ}Ys~670i39r(#*g>+-#1ghT(wFCga-TJmrrjM zG|S*WafJ|&{YZ#?uiPbG;n|bB2n=+`xpVP&LjV@#a-SB}q7XC_Y+q-KpKzvvkEg-1403$MGU$A_3FR^% zk!6+oMMU`G?w!j7@aM?9e_ch>I7?9e%ytkfb4p?)%PEjAos8$o4J0)w2Ch!WAhE$? zB7+Fqx0N+rLGz2p$jW|!TUpO>kbtl$G#nekq6y0}*ch0A4FPdj6BLW(Aq114IIIdz z#G24VtPbIFP&iik2VyCIzZJ)i<3vCJV$)J_CjC6l5lGT6U%+Jo$>kf@apA@_+YZh?W#8)0no#+cTm8BCkEg>manF!-q##`WljDrG7V zmc?C18!#vzYmv@PL8O83cPPta%6(T1l?jJ+Dpo?fdi61&Z5vGK+Y>X!jlfca3D{t! 
zht1{_vD;D)2dt*xC_m(`3oH=0)Ebe?EfBlX3W;lGB6+-C&E;o9E)( z)`d8?Z6Pl1ScEG(=HvQ~dAPM}J~DRB$L(DUk+EwDu5Dd}%NrNs+Lpz*ynZgOueZaM zm6o`)%na#EEO2&_6;c;i;nZ9UMA{f4&~zfAW=_Mo6;`;sVK&mY*x}69c}Ux`0B5%^ z#@Q|Nae3PU<~t8JH_yYJtxNe~TZfyQ)*yZ3S|n{=kEqov5VT}5-0bGyn9U3voN0?i zlcu6mod%i({JWB-w$s5kG`6Kni^aO~10Y>rCA3`Z|mIuNRk z_+bH|d8iqoS$x8IY?B`SVX*Tgj0n^l!qX7(;58i2-hqu%5X_wkVNSs?bqt~;r5p&7 z_I@y?OdR}~`oll~+Swn|T>>!O*$)OTelT+3K3B0s^Efwum{MkvdNB3!hq+GxEC`v_ z{vntV6b74sP}uke!!95k3xgxDkTAL|IEv6rNRCaSPGL(NJAR4D*mx=ln?y-TYFVvw zdB>>~>`OU=W9%$AUcG_+r_aHb?;}fh56lSUwL(K-85ISKc!F_43>85|Vn#v~YzWde z(NQo94uPSsz^xCa69f!A{b59yH*oWY6~WltFBs#FI`3^i`k51gt-OO_=@kSE zLW3ot!P3Q-;OxhG6OP@3VdN2t>Fy!WcV!!~eWp2jv#r?4AVg0Uc=jPkvrPu?-H(YU zPGG9T3AU9J;n@i@{T!$hnBjL6GXjoamj4mV_B}#KKMGs#W0>R3WB6UL;_qN#&-yte z!^S=Z3mg(K=V%zr_WQtyn!3*y6L)&TaF++BE!u)|6>6eitCrX{%N$MXY4O;Xj2?`8 zyVl{_QcGN3WPo#adPtk4hf}r^O=B|7*_vYMu%75rsT7>}+=b3CK&U0}cZ>0eHyw{S z(+LPAH2YahfS2hwoR~HgXZNhYAL-%vJ2Mr3zdDa!pPxp~op`*s9*ykleAnGSjo+SJ zz^~l@+ubPqkr9T!ZinLUYkv6soC|(UK7rrj58$_`UHCJ6I}o-FVET9D9()eihWDNu z@YZQ1ULBr?SNmrnXYUMT@3uzPb{o9fHXFIy=OSmvT)f(0hb*QqspnjNww;i?%@&zk zX5-NoTRh=&`grpkWD;;?2EGSdt#NOYHSTVtHqAoD##sc|S-8ji?(CSObZgrzKF2mV zztRkebEYB6W-|P2^f0eqCwyO~2pW|xiuPrTV8^(gxX!$;tvA4(ZN`LJ8Pvz+%>?H4 zdeju$U@BA7E6ejHLj}z`P!_B6ZCU5`yJD^0Y{+HthbAZ!zB9HP@SVkN1Yacrv?kf7 zf%QEC^!;4~{hd}gFrYC~>_+0|>M07GGkEPsJoe4;xp;9%EbB5Aw*ek*pN^MDY~f=! 
z8LdhdSO1tz?%NF?pFQBW?-jy=WD*lMgb+T(a3;RZVtH1UW)HG~0^Y*FT*%T~$l|Jl z%lRxluAJnD{m4Tbg+>ou7!Vz>46%(SHKx{{@;wI&jvx zVrT>YoZu-@GV-=5{SVMA75{eIKlIh;&2Jiwpe%8fzlCPmr#n9%ygisIXy&#&Xx8fV zZM}8+XWg|Z7+SvnpCvJy3C)GU`9DFlAEm{O=FR6O@f3adz<48sA1QBpdqTA{QWN5_ zlplV18H*-Q7^A>Z05AKcxK8K>Z3TpKzWf5H4jB z>u?rsU%Q0R03TdBmqtK-n`b@F;Mi-0qyvC%I8Yl9;R#L-ymABS~*u>`7UtPG67s=zp` zCP=Rej;CS?&XEM_Fsv49vx6hH@mV?H;{)HYQ22(2As{Lo!Lcz2B3y*T#Ud&>2?^)U z;{1&(n742)%9kyTN~KGpYtx1p-l7Tgnm2<{n!nj%S=4r$MY&bh8GFm7gkx}DgpR9b!C+$ zt}ijh<@tt$is?8r#{g-z28g$rs{A=m5!%l!FkyZ+xVdf?F74!bM;0UF&}xC$>1 ztj3F-i}7shTs+ul%W}@grOhnIp)E+*z8=Tt+G5>QJ`uh+?W?3)XY1O;7l<03xS1yD6D)#U`^QtMqp7$ z6c(`qx0E2gHZU67B9pK!ny?&w3Tp|^tK$-}EG7zKuVc9e(62=5)Q$n*@d^9GplQ^31*YU2N znC#(=DIR{9N~oRg8303;+0-u-287}XC)}V%=+(3L!c=OSJptI!mmP%w)j_au_a&$^ z<+)}afrMrPv0E^VJwvH5LNhxU&TJ<)4a%qSI)>f>(C0gJ{PB~R;^@e{k7J6%Q5d)$ zgQbrHto`g^L1?!0Kg{JLu%QH^XZjw)EbkMT>E#R?zK3nt$+2>bg`FeY$T1qWhgg>V z1m!($n6k$M<9E7onb5ps8%mU^ieW!?hRdeqXxp%s`uAtfpr4V!XX)}P8(dy)g7XV` zFWc!WXpXg+gmd#PuxjF9G%ZmCTc?a-r*tj?ZOsri(+JVlh6pjA3UAA)aI~C^T|90d zm9lLKKA#Q%o}}Q{?9=$1eFk50)A7s83-~+hD)8bOkjdqn@%Zzs2Yyeq$LENHcptbE zZwSP%omb<7>uUVswhF&EF2k3Ti}2~hBIF)jPOYN&JndbAM}*m@1mY)z*{8eg@O1Y) zJm0$**#}osYw>dL8m_PAbH56Y_pZdFy?h?`t-$@g%W!|+ay*dbeM@oY;BsUfUWtr@ zD{*7rGF;uc1ebO!<#W6oSNE;O>1|69vd98{^Gx71-v}OyOyFW?3`gtfaIu;Od)9rg z%@j-@+zoZvxoA+nI0iJRg8lk~;bhbYzLx!QW7iDa++~CN2W;6nx5AAbmbkLj6xRu~ zm$zBs+HSr#w#~qWE!LD3E^W8Q^<9May>pPUZw~Grn8RhR@1BAC2kn^896UH^i-(8o zkV!o{JQt6T%)_HY^YNH^a(Drr9$ARz$Clu|`$ile-4@={yCQRsH6HFX$Mbzw$R#kp zbzF=$4h!+dem>V{BkRa4WV@_H=$e^mRYsfIe_Z!2c=PZUzkgYX2nmD?=+i-S;8)P> z&V9mr0?LF(K2wq^7R&=+H9(LG@De~KuzHat3P}gHI(}49G=wR?VMNVMRVOl%a8)5` zTt;@u17KZ#lD9T%p?je?(VAqx?)ZHD7iiY)D}ZJlEc09e{=9l?A~d_Puytfx9?7E7{ul8qy9|!Jj=A*S<0$>`^H7p=@4J%k23u=L0y!S^@QmomFB$8!I|3`*g^na z9UO*bL6KMy5RKLTF<9*vja6J;84yc&rh;OyGMM`aD{F(pvEDxry8?n>A07dBDlj?* z0hC{KH2kBY5fUGdu!KYcbR2>S&k?5*apv*`>^rarO&ZrnrD8?Uw{b&^Z`l$Ct=ho6 zeP@_;?Sb(>cSFTu5^AmpI<;;Fr~TWJb@e=65bmF(q~O`Ve95>D 
zJiZS+eE>YX1Khujzi(W|ucwpoI@lMPC-&p^&NaBbX(4W}BYduzf!nLCaC?mK?2(WZVwbo#yxI!pAC)}7I|?T=_&zd7om5Qp*r0RQw! zL_t(ksfOy6Dxpfb@~BW!TrP^BewR*I=;nZvS1)6CLMrAE>}H+t!K@f|=??KSpd0hF#?hQ-DwWsfpeB%-99K2!VL@1_=1PGM~ca#AE*?{|}J9sfAkTErs z<3;($5|#;uCeDOpLZYcF0n&{i?oN3I!OV-m>>W&S4uQEZp_xEBlW;pfI10-`W3Zg} z>vC>eMF8Fy5|2%h1m>tDtRgrsiipK*LbX*u2q7|J8Uz)av} zS&az!X3Sfk(68s{MX)7MvyS?d=At6-EG{Z+6T+|&Vc3M{n6NA+ULi2{=CxQC1CJo+ zIs0ImxNQ(H^aNs&zN;^$xp*rp_EaZ#!ud&zKXDY(ob6%i>&ALI!iJDM!}kPc z_#9W7=_Am56t-UWn8SDe9R6POog%TsEe`V?Sz1Q~01ejO0G$`rp}Av9;5 z!r#v#@#o7}{P8pve`hA(w`-C3blMNEf{x-QA@$s5Tcq%LiQ{t=wO|S&=T1h1-9&`j zjz{?HF^IAohu8%Z5yxjZa;Z5&7g-}@;S40LUyife*CAp35=5+=1HXl4aJ8L|<5p8} z&}0I(O&N*xeCAe<8H^Pp2Vm)lepo!bHx`ZPjRnJdVcv)ym^-`&?1py3?7=@_=AfT2 zW5AEF?$Z%gy*t3VcV}2qvj%sE{!eYuy;fy(uTvfUn%77F){QZ!ZF3B1{XK@aYL4M; znqolPrWilCFFJQYNK-ZjcXuYCg;|I`##JzB!z=jJf}u`#A~ zsLyT9VbQY<^nYx@_N|Yh&Ff-F(>fUbeLYNU-w^s;n!&11d)N;C5#~MGK(AdB=yz%k z!yj6}s7ni&bZrUq?yX?is|{>=x5ey!oiLa6SupfxEM^^-jOvMv(??)jtNIw#tQwY& z>49YqEVL2+lH%w!bo*=^ppxH;I{_X@}o+Hxnpr!&yE-W24emkE> zfeI!s(Im!N;Yc!0A^j(47O>BUW&vmdvsjz8YwCbG@7e;wg}_+HQiZpDB?ZemeGAQQ z0?lr4=LI}?F@a`*XAPS3Ow>|PuKTf|vR*KAV1abd{53dB!xV;QQQ`fXbkMAuxlVy3 z*9%*pzjotb`X5{~3d5yN3VaJeu*}ZqFD?rFF_gIc)xQ4$nl*5iYZ98N{E?ZgkA$6* zS!li%>nZ2_CrSMaw~N4Y-ZlSCU!5zKW_~#S8#F6$rc~XejJ#20Mn`S-9^Qy7gFF!P z?QEgVN=8WP*c!=+QTX&eJKxGIa42pG`Oy463->cFBgD@Q36TNFd3G1?b02ZrMZ9=? 
z8(I_#4VndL)%iT{7qL*k<@v9l5SSn1Gq-;fYp(bY3kb`xltA-q0&EU}`IXFq_eNk@ zLaPZ(=Yh2h@FRe}CfMg>J;m9SL;}1ou3R{aPwym(1%a60{rOE6AziaH-@9=Mr((nL z{Lx(%8cv(dPy=TTn#IyBur6*H>;w?#W!i9wKYi=gC8VUp;_9U{NI#o|%=gK_`Pb(ZU~nt@O{9ywX8Bg>urQovbe_j`o5Ol1<7%;#A+6cUc5 z{sCAN8jeN5kysiKg%$o3p?P^=G*$$0nV_5+ zPP=gF$R=FfyAn6HFUIxFcF5Rdhld;I;_*gXWK#FVYP)GJu5X!(OVnvDCvKR7L;`EV z>KT+Z;t9|(tN2k~XM?1zb8+p&2IPb}5X1xU7eC;*8i{9thY-HP9L|ur*l}2bzSYd{U#RdrE$K977cmH`z`GI%0903QzL2#Hh5Y7|& z!Fgm~c#Rtg-|-U=ZeWNoBLjFEjK?9pVX*D@69#TGp?R z)`aFZjT)j&!-n|2b{#aSS{t=WRY1cE)zPSOE!3-A1Jx>2Le;Vr2+a}^<2%&;p#$c- z+QaF_HSAA3gN62fm`5m{byOlP2sAH1(PHBNTm&t855eS}_ONgW!hz^BgyvjA^F7!( z5r&-z!2~=*fnX;;C7mUCx+CGv!Gq#)gh*ooqmlzbTYy=~4<=3oW|sg$GXdG15UUgd z3jtbB!mf9?k~QIWrVpW+fV+bC;!6K;tRPUY@D9W3z$mN^jl+uY1m4H7m=!<(&a+UP z`xCkevm#U9K*BL0l>lwxBaj+NAg2heR$lBdL=ePdLa`*m5A&l5&(UF+5fu(&e?N?O zaKt1BH%xV9y$G`g?gVBx0onixj2|3DafhQQE0SZNwN7?Yj7StfUkvv$IBW>>W=Gv% zbi@tQ4!L3Cem9KW#X9bA#?(cdP`q?`jP3CQPC4wym>%6!XSQwaDws*2v>V+Y^GEl^ zlFpq)t*<%gPIIcj*)){bRC)0k;6dW)efmI_rV?n>xnA^J@ z)ltcousWk>TUc~!31h;d5h2s08^N%93rzd@d-Q5j6HUsNLZecp(5`Yd^la1&eVR7M z4|QswZS@LhQMnYFlrN6DrM^S;;zdw}o%zZ|HObfqC0Q29xD2WSx9JAp%3xd>_$Ge5 zHF->3b`kG)XtRaA9e zuO`b?y98B|&uz(~s9Nkhb7L-LpF$Ufbm zV>w3+K4#vfiwoYpjoEHb_+s)TR=WE=fkc7PyuEia)EGyK&e2p?AL{Q6JXY$ zSv#*VH0uENTR7H9W|G#)TW!-ty!fxstN^(%G*jFjAe^eMDaUKq*2MLHgX3?cgJK;h z7nUEdkq^!8ghmgZ;K@RGvM@SpvzkgYQqXx>nX`Ds;@$#Wrj{O(lF9gkMav5pAuX<(wY&%_N!<+5#4^VdIb|+qk zQu_$i-U_Up3C>Pxzp{Y4@+OmTf?|c{d25z0LFY~#)Q7oY<3`x9ex-^G@sTk2Cg(9h zPhd}jWo_Wyb9{RHQq9a4s(~>{MdK&V8N3?F47BHfIxRvjjSy;pWwg z2oLhdsh9}7dYVaCCIDx@;5A+-E3-hexMtkCd=3}SCTp-Q=kj_wcZ~OMvem#wndMM` zS$vtr7h3>bCJ)IX0AIRv8t2cP!o7^E%9<=~r0T}JUp&5t+c(p>{*)b#R|M$Gc#@f+ z_ABePz&`6Ju>U&yDROc$@g|Gl{G4@p^B%rwXR*vL0P_gV^FkuAATR=p2^~wQWn5k! 
zERYh3RUrfuE^nZ=1%#=PWqyfiaOZkZcr*f|;|R?O1m;8pL=l`L;t&#@h|su1?jux3 zMZrHJ6p?Y!2o4Fx$l=5I(cnj>Vnqz?-VObDFUU*0W6kQg;^m6813Ta~%LG9)3=lqJ z3KHfS;LLJ!oL+8wxaAlntuB|u2&CMpbyUi4t+br>9s}-JZ zqBhS$=9W3QyL}Pv9^Q!OekbucEeyZkI)~pL-N2vEZsV_)cktKqTlnMARs8Yz8vcHE z6L@t8c#{d_X5#lpSCC6^PCc{|F7u|s$#xRF7aJpJ?My^%TZEWh%MrJGIg$^o!DXkN zxIvh|9q53J5C_}}If0ZUx!nrvFJ~MtC77IRr1_0&p+f4;KPF z5$AdWdsfO|cxLDqYs}kx~sg0&h8lX}A+Ne{#GV0Z=jOGn$qGPlA=+&+T z#&-W1#)F4o-o!~5*69aSEma;>%2!6^vK3Lbq)c&M6g4}w$2?ajI975lF&@R^$up|E)t7*ug>)+vc?wfm)??>l zihwlR&Vbv^2*KvAEE~5Oh&zij^K}uo4Z^)xvE>>BXd~9ufB>!U83;XIf1;}w#=Cf6 zG(mVI-?~Mq0c4rtZ+=McOpmCjABJIRpOdUKN67RbhK}t8kRv=8EOJGV; zS=WJI%~D0tu0b8ts#F1GR?r6XR;`HdsU}s*qkj1^ zs9CZYDl%Uwrh6RxUI!b zLj7_@(X>h#d|$H)s+1_E@~cS5twzwS&f_E`n=Dr@RvJ}GmL;5jN9Zh#{w*8ghkDh} zv~qdWEngP(DwJmXltBx2Fj`hGkG8ccq7@+k_&^80zkS^rXjP{=DiznJxi8H;)vTyo zFC}eUt|Y3l{?*D9L-o={*>U*}HHs7Bixal<$|}%ZO;my%1@5m^x;Wo2MNy4#UX##Y zgMeO{Dnmdo%VR1rt-|A~aJd@Os%+azya%c?t;Kz{cwPI-&>ToE`4*bLmBeQ*JS(8ae*XtB z^SSil^DE%|ze2OXvHDk>Du8BpwO#7vmv{XC3e7qg*2$OOIwe`ollXG$_80MDre4fc z7wtg@Y1&1#0;s?9d7OZ>k0+1$H&A8;v_h~qA{A9|CwWOK`Rel4rQDtm%L1JR5|Avg zENifw4~e30p)*gCmrTPb@T$VAQT|F&2c5L?eI*6Q1WKN(O?@rp`p&g7T;UB(jZ;Lwj}I}n~O;KeLFkA+nHcb`8W$eQ-o$Eg0q0KFF`ug#~X+D?82bFy-=fS zWz?=+8z+t*fwhGJwr^gIUq9z6XnvhbI3?IB_#+I9>xK9Ui^Vp;+a4FsMB|UoS-5sF z3DIHR$h>z2ub$sS&dW@K@KfbJA(1vd5R|_NFe~?rJU?bi1{lh~LGf39P1w!N&7s~< zZz+N0SA^y)ydp#^aDF518!T52fiw3Bo@L%ad}J76LIQCk{Tx0sk54(T@ZlAqS$v@h z&0=x>#PhCQJcIkUud4Oz=b0=U;a>K$K00X-Em1d~vrGheSr=;K}}1LbxhmSY_- zi?6q&Vv&EHE$%D=&DnVR=#H{ROY{%vbZGm;h2^nkY36aS6}&$~HtU%F;6Aqa1!AdR z1m^oiVNqZ#7WomF{X(&l;JhkO6hWXMIEO}JZD<5G2Zv*)e-J!kPa!NZ8NuOE2#Sn` zUsQ~;GzZ5eAV@6E1ZlA}3p58t#vmvn625^!@FBPyKjwf*dU_~YvII((Dvb)oHGepZ zE*%hJXNag7W07Dp2L6+K!)IDgB-oC|xkZzZPF-9&1(%oWDHNexX}`K)>-1h3TwnIo`G-zY0S>$h&{R$32w)b8sv(L zi9yJ?5Qmp{&XbkTBj@%xKG`_>}n@HV8l9KjV|H>@z6PGBgCiWLYh<;tTJKWGBNwL1QU1q3bE z3+Hiyzu!tHf7l%hz-(&%34aVXpNk^J1+c$E^WH} 
zxN`I+1QUn}!6tb?Y?|-a>;vPI-Y^mo&P|6XFPKzT<6ptTnHrp!t|PObTbO2lN(i!gr<1Vo>YGNIAL_W}`4Nz3NxTfQB_OqEUV2lCjBT3i@?yt?uFSB}$@AwaVyGy&?wIsf59Gt6*TQ3g}w4 zI0jcQ4U0ClVb`$%=5}g;8SUy}X6Gib>DUzJZ5zXssadxcum&T9fi=#)?vIOtS7}mHZMt@%q<6G9pgl2UyxjC1c)WMj>RMWZ`E#mfZE$U%H z>xP)b{3o}kI*MyUQy6z^1+$-9!?Jrj%;?n_wtaqpUH>1kV9-xkH1sDd9`Q3)jO>AR zV|ru5*gjNmY@W~;TPO9$&S^uibIK6xnmQai^@d^RL`rWs_D&y;J=3{7WiWP59fsXT zCN_U>G*och_`SPer{Zy0f%Z|s$=G8H|bpu%a&=Mw{zGs^>!YI~V zH1d14U$ffEy<-^LYcQ7wHLHbzO#6Oc8yy=|B(Rr4^UB51q+&4weQ7l3Z`Xp*-l|@0 zw5eAY9qQLZhkA9;mfITFtcG%>OQXD4%u5u7N$wrd!x z|4=@Nel3A!t;}DSQ|hlm?&ZmsFF;K?a3(Oj2{bz~6>!$zTgsrpEc0`C%-=$@ zfVSMkn&sPtJtnSyRX(jgTHW}A=ILt}5Dl7@6`bXgFnwN>cOE$VP`-tsSuD=t+QIE% zem+>gVkz3SY>vi!_|~mjjzjx)p$vPsYgWw1U%$O4Rr?C*@TsnK}+;JO0m7f)`h+3!BQd5Wi*H}Fvn0wgpum5EIRm=#P? zxrEN#*8-QmtP_A1D9!_8oiqz_fpqY!2266lxMsX~dOz=Y*)AOfLUQ&a1K}=f{-Idw7lyU|5m*yQU=|mRJZRp?=UI69p?FV~%7c5@53~f4eK#@|#@m;Z^ z%1YI{P8~QHPe!oGU|d=_4j1N*!YRwaNSQSP7Z**y73$iO@wl~eGH$M(jEoJ_aBIU< z+}k)64>#!H3721NH$%oc6NDQNhU?g_@SV~PF*XBne(7YKUTO#*%kkL3kKD3RT`*_p z517Lb;8FvA&QB!g*xJTf*rtH`ffrX?w;auasZy44n)p{Fnqifi!b+2 z;kPHL`18d%{F!wLf4;hgUte6srzcmGzC6B&PmeC(4?^-^yidQpWQXp-4O~Byh+x9Y zjtwheYA_Lh4sOF|rf^aNtp+e&^6>D7A8kJ$t;|FZD)<@XEO-Oe?g4-^~ zam)QAZicwxa=0U|_#MJ6?*q6WVvlRVj)Zw<%$+n2)ytGZ+47}PtW0T?DqTTcqi*}| zSYYn~k8^2oOi08ECqLL8Co~`N!MwxX3YxzwCC3&;(>{Zszxz0rxP{|H+GRxEeF(>h zQgU?E?!QV@^mk zcanj58a(R&+M4h@gU~#akSr2-R_jb>5u#`Kgkol(MAry|g|GMs2f&N~ETCyjxU}#O z)8Lu1_F{g#&n?tNtiA+hf8|~=-6M$59Ed4|W=-Ch;_Rs`&QqLy zFvXQ}@q(U>KdikqChXT&i2F^99sL>!(rnov0in}&9Q zzvWn*U2Kf>g@#C)Jqh8~W8h&h7#{k(*3_X0FdmJNnUfH`z!0aFSRs720lZAc!Od_q zTqpx%sZ-DBf=!^31W{AW&v|13QO%$bVNg$4+k zHw_^R3=qcUs3iu7UNRlAOO24U(gJC#ZIHIg24_}UvPnc+Njc7+LLigU}UWzI-``TR{zY{P{hCtQj1$Ia8xxO*`P_phYk(UmiJ ztaKK4&n02WY;#m9UljH0RfijSKwJRh!a@{t$Ur_HFL!t;%PQGI4e;|&A>4FPDRhx3 zG+5T@TR{GL+t;v42Fl|?#e0e^b$qEiPM3$kYJoKPCHJUhIbM_6p9i$c3jEbMh4a-( z%2B|l|E2;!mi(n2x@*Wjfo1ZY7scfQXl7%`1$3aBUqDI$TV4SQL$l70Su2zdf_3_T 
zfo9RaL9@WJ45G^$NcTTN^H-4k5B)ndOZ?)mp;=dUHk^W0P5CYmEMEaQ>tI;}UroOtEegXNXu^PpK~;>mXz;g6_E`wK$+ca*=5Dg%Fue*tINCxZ}mGYV?4 zt+h~n3YwiLg0k!@49+6ChJ>1%H+u#)tz89&<3}){Z%_3xKd^fX{`&Q!g3339WzA(F z4-6?8jT;^24}X3LJ`#lQWn4h≻^jNm#zCrjM2hL|#0(gUjisaP#VE{Q5DAz$~sA z&j_ban2%UqwGef=1lnBYDbSp&p!p47}%ynWA=ab_KS4qe=FT~0c2U$NzRe&l1gL^)d^uL?isIHHW{e+POSMkkC5{z7cp0_V2!Ic zR?1)B%AYwBs|h0O2+ZpPBtCOA)&#|1O-K~h1xH|ONI3Qcg-h8KHT;rw9lyQ0j$d9}!H37^ z@HX=_-ejf`hEtIJFa>X)pJ!+3GVjxC>^R*<_Ty_fdnyEugpPUEMi|=TC$w)|AN2^y zC5q)uroy@lBv&M$RxMr}H3`p+ixoqkYSl2WR}c6tvcp{;d%V98iC-Qh;kU;u*WL5@ z?btCbq2rQPQ!0EPT}?WP~1xk#Q6wMn2#H-`6!kxiK3-Sswu|hn)N&P z!a@fRxD%Qk2+hkKeF)8dn0d$xcGPh5c_>!43Xd&@=KY63fA2}G@QQ)Mxf}3GzlsyV zv4mzf7>QL>pg9kiCA^%9oS}f3P)QILSrCRzj*GR~8&(8#ap^GUHFdIZB1Ah0XlwbH zIt3D*gY&J;Hk5#JVJV3A<}xAJhMhZWN}_F;5-?4DBoalSa?cRx)GW@yF!253mV@!md?nuP7iu`qFR!01CqFy6rpQzV*&n-&4XJTEk! zF^@O)W<3RNJ-E#6CbG=yh%1HoB#S%6Gy?WicVFlcZYOyJU?Pv3#N!2e_1t~9-3OBh zv*PbOmHCU6+K^=u*f(Q-=3+f(o~B+wsy^bAZOHX$yv8J!X(G>=M9@|)AOvuIet(Sk z5jOR7fjK|EmLATq;d_6Et0!z63C!YJbJ81T$K7GP&mIzjU>wv2sIpt zu<3&lIc+GSrVm4u=}1JHk3x*uXq+PWCD}|yoaF?B8I6R`)sT#t8_XO&F)5>?R=6b_}9y$0Badc*NOG zMB*Ghq|DPts?s!^CcvIuG!1FsCB7V5qGIF?tkX$@<^m}GR>Fnft{1>68K@AP>bO#q?l?__;rCm8eIE56 z^AySPh4U^bgD~P(=jQ3ZL$gjUC?caHC781Y%>uI;(5lS1USMgK!W2NW0_H+s`K@2F z<|e_5=1ahi1u49Zu&5wY0dIkHg8}~;zAX<5dAv^FLbHP0yfOz!xxZ>m>f0YNUo3~z zp>Un@^ly;r%kq6)W&!y^FrQz3u`+wWllgeE>>_Uhwd_|Dlkiw+10B5oD>Sp*0?iuq z7nJ2&2{e1DYdFE9Pm(Qo-&aF%M^gc_$u~<_zIDCq`H!tICY8({{ z562^Tk$DrpeR`#=&Z2j@j|r6mj~X;fq>P;0Z=pGd(EKvb3N8EIASdSy-oMRN?jBi> z?jkNE6v6!9rKcnjjtS)i>bEZm=Hi}_^&B5qo(}|5am{!~Xx7z1EWHBBj~?8@g>%Wu z)k3of{}Y;Z=ZJJLTzFZN4yff^tsQg#Ef!}PP?^IHg$|r$yR_{amgDWq2ZZNLy!nug z%s1IM92$-_{4lQ#ip0u*Fe(J=3CrsPVzDkTj@x6@L>TJ{&07M4u|FgXF5wY`WH$sy6%HtuXy9vTOs&xJpvzZ zM&m<94BlnL;{Ag}ynR4$zMqJ9{2;s|Jion5-Ah3Zm$PptDAsZl;Iu9kUyA7h0ho!J&RV`P-6p{jLZ~mMnr&rKvLCp$yY< zT(8Z}Li>s(F|c|m=zm`aYXX5@IL$@S z;Df!^BbEk14CI%}AQFbSMG3&4f%`yILw zAuJKLCp}?AX<_BWhuN2sG(bVKF`?Oruq=ZE%?ZS2C&jPX3zp7)FcT}Mlfbf0zA)!@ 
z3zq<<1WvI`I|gu_;7p*^;5h;|gyq7TA(m(FFwF4fez6owv<*U~Sso;dFR*~K3<4DY zW3!+Ln1n>Yi1)5pkOtec)d0f?LO1i|KAXTG*aZb(p1&t%@Hd$5a2%5z92JO4gbYjO zB@sAg`h{R7?`f+5@t+QY6`|Og0Bc1sw_s{ah&3ko8gYF(-xbpc%mU3*J^e71u&c*? zdanMMM$k2I_Jtuk4~A?X1GbYyAu(b*n7A=7o-gs8Ej@$PvN=;TN(K!Ydxb*ZL-GnB zI1`#F$y3i&qHlP?kneF5zUR#;OFpAEe5c#Ec*5F&9V&ZIn6mR?LTEPP?`yut5$3yH zV6e>@CVSj4e#u4@DN`OD8`MJd{%x=uKT`QF&m7ndp>xdOr9TFalZW8&!~r-uwjYj- z?T;g)`rz>Jo>V^^AVBXN(-Y>sIxDxUdh86Y96bmrYvj&q_KO)>_C{E2AkHmT7kg#wp zqU}Z?d?q1%_9!II<$3d`AZ_6koL)2)XO~Pv%7TfAuo;3#i@`W$Jrc<_qY1j&HBS?w z&k)Yf&KZlOStF3Zd=h32BQ%f1CBpEP*?PEatA|UrQ*gzOorJmixXkp@oGG|WV7@qW zJkn>3L;5VHvzX2thl^Z4Loh$fZRf4W^O$J}G#n0#MwQ_-p+Ec$h9PG5G;C$N*C;Ae z>leX*1`QB5*M`@fhE&3MnzbIzn@z+yLUWoqLD_sDl0_DSkYq)OwjRc|9Lx^LV7Bid zL|YERrDX)=Bewhvdf~*7#yGcZ5;AsL>9+I}%B4Diz zruS@x2I6~OwKN&W8NYsfhxn*S_{jJ)PcK3dKR+{~elLQN+)~S3$E1 zF{dtC7=W2!9{h?wtq{EOYi2`y9Vw&mIbsPWND2`0oUbL|TzH*%=YewpfO47F6yW^U zLannnD`4ia--5GtF7stQ`~@<fg*{*>Kl>cvN_ILAC`p?kptiV~xMV95- zvM;X;Oy&9l{&GXPJJ=(L56^+!JJGjCcdS{l3^l4$M74_L5bWoTKY#f|(0fX_6L*6w z1%d*Ey4mu?ANK=6FPjjpAoGn393)&4HeWuukLa)f?Ay6f4Ho?J_PLt%@4ZOeGL)5B z1ISzgXHL#bLbF885P#-8XqM%#pjoE#&f)noS;CESpHe&WSI!xMAo(SL$=|dazy^ZDDv8C+bX`a+VVS@j z9Iuww1xI3YNF;U;IQIDZBP2Byr_P;0TtXbe2+Tndgy!f-1Tggt4TE=Z7(9ar%!FoF zFJHKM`eNy_m8e?1x`H>E@lOWJ81(Io^Cwp!X`MdKtr(A+tES@Cn(0ViHV#);jlrE2 zLvVZ9VBB0j5;qCacX-T$Bq=>f>S6oMC*!|>v21hQ^KMQWZjHI_KgI*$sjCe zq#`>b4X+=b$Gaz&@&3s*e0qKhUtZnAyQdkrfAt&^BLm@hWDge3GQ*fb-O;XTO;jpX z1jX42ELN-t%9JUF@)gRXT)DD@*Al2vrUZVdRSnZRw!rMp&9I<-1MC>m1%Y#>Z(rYMhwL7HdwU;$y?cmX2=3R@ zqH*m)BEJ0k9KcI_c$A7GD{auOP7O66s%Wud?4XoEQG#&&PQ9_lCj?&S&){%aC{{Uo zVBQHI%ob>-Mp`XInJTpvG`AQs8U}}4usPx^j-R=PJ>jwNNx6u*_TDf$PVhRZL9^jW zfo4Loh|p|AXf{+50QMoI5ts?bmIOpILYsw?H_V*8U`Aj!ll_EdE4N_Gkb!v~G8m8m z>=FpGJXyJi!kYW6xlg5@;RI*OE1d8g0V@Keg#cxqHC95^iIrI_wI-f{u=Wqdq(dh# zX6HVP+O`v;H*d$p4cjqg{SHjou$#+!Fm~fEjM=mu6Sl~{4ba=T0aLc`gx>B0n7HdO z^!Gc!?5Ha&j=I45xC?CT-C*PFMG$4%5q|A>{dwM@m`f0!#}4P5z$n=Ghr>b*JPah@ z5}v&Xx5^(`oBhw2GItGt31NBqA!isIc7-vQ#Zqq~^(GYCP}a;_O^zaU^axQ_bFoD0 
zbA1W{d@_M~vPy#}g1MUy4BVLy-{WTd##!>6Y|U?qHNPj8j_xqAcW1{&gJ#3M_AuS+ z1k*h((BI|=vwd!uuw*@olr4{T^=rX@*Cx!II8qIWGw;_0VeE{BT2FV2Z!~*0dAk!`v=tHc}z^T)!+xG#K-`XXxPXae;lBrVWmK9dnMXADBD2E)go7ZT=-!;SS;I8UHX zwH=GoRNBmuNFs=**p9&Ub<>f#*G9dU&Jujn3C_hjO zRS`4K5@+Tb5abE`HiTvib|TD16POA5REp&&q*#q6G>;$g^V@z{gTR90&_szt^{j>1!h#l@xw-3$1?Sryetqz-VuwFnEal4bsb3eF=JGB0letVp>CQxfG(<)Ju9Gn(oQr{W*2 zO_ZYXoo0QOvS_L1!>nCjv7?gI-<2YtMbp zI)D`yg|7kmKS=V=hvqzJWkn0aa$yOG>R|alL9=Q&U1PIeI_GvxYz*Ds5nq8b&sCBi zLGqDuNZqvSN{BW;SKc5lY$q3{1mXV-&6EOD%~GyS5l&#uv+`>DcunRnZJ=41^8i`t ze}m?4!C3|{3J{AmTiV5w?JB_RDckd*nBqu)shJ>eC1FY;Cb{t{qAe zFltn(jM%7f{Qk>F1d2Rf}=? z>?!>A@fAV)F}DkVYT)=r#5{5dnz;m2u{7t1bvX~3mDO4NoE0!jtmYgZpM~E(XW?1q z1svVC1}Oxkk1u4fAEBOQP`=0l&MJC_fHb!W)Qh+*65tmrvkcUWj|tUg82lh%;{FYq zwK^9D?ZVLfhG4EN#tN1TmPHaJqwskW?L$CXtn0E(rnqIh|_3pkpoEi8@SBj>WG}Vt|}WcyKWqQ65K-k>anc$~jj;3CslLtMPbx zD+w>}p26!!7x4D+6?}Mh9Uq_GARyl)DBncxmNlRM_0@Fp~SDogW7>xC#+tq$u{9Ib|pQ*IU8VlHBPWD0hML?iz8 zV=QnAfWc7@HL%a*q#uk#dC+V~Xck}=a289mi35R|U~J>!M_BfPxuZZcQv#={vo}l? zK#R56pKuohYj=U?P|Wg)P*&g>5^_%bk-1GLD@naHe`m2e3q)ITn>i)mEEZ?wmcevp zP!tAl+lyjN+M-0YhA39KCQ4MUg;Le(qI8W0C{?o=O4o0L(hb|8OtTIs*P;_Dwd#c0 z9e+lxA9|rim;R{o!vNIoJ`(kM5m0)LK!e^R&}hJDG#xq-O-AUU(Wq%?Ho*ujrkJ6f z(M)u=^8G!NkSu6g0P~UmH%F z*T87pFm-=V?9mQEORW*N%nCtsr^3NxC{7AEPw0!oV|(N9h@Ln?INmqpXB?T>4|Dr> zK~r`fTb3?{RipagG=cf*`q{X(bw186H%G#($w;vokMlDLyygU9vmr=dXNu%iMo3y= zf}|zJIJM9a2@4Gnzi>L@=S@NUTsnGWb=CVFMxUUAH_8`Y$ z8BP=aQwhgub`x=$5H32uc&c)VxWuxZCy<{egbR3|wi{1a9Y;_ehqJTBAeH&0+Ks_E zLUj7<35c_sfI00e!+v59-j)WOI5h4r9aw6fGWt+iH0rNOzGYRq56LD_79&!%Mz@PR0+adB%yT>>G~IuPX3e-NJgI}`Lf|Z*s@OwR_&jY}KD&J* z-F`KwkX+=&@`-#1yDBtt-Z9@ovn+qbs!F=+e{;=z&JyX^%%u>M2{e0ovw>vfA`2~< z@qoe*S{Ryr1QZoOY8RD@>i|&mH7*3Q1UK>N&4=UvOT0z_sDBNs-zp!P397nlsq0aN zp*g=Hya<+GNw#sn0CS!cD0}i;)oT%;WL=aBmxVV$UYHa=zH9}cHLLWu;H;HJiEU5- z&MbGHq`kC!HHiiO{~I*3E&}mR{7uAVg!=``J=vb_{5@1SL2lC|Q>AOMxRvDh<8LnI z31pA+(4IZ`sdFc`hX;lY9)z-GN}*=eDo9Rl73~vq-bDYvJvr&SEjvL9k{?E`a3!EbaKO0Giu! 
z-(=&;)r)X)IEM5yN%->V9X_z5^5(BU5Pj_iw)zKSb7(ZyQES5EG;o$_wh2p{3DnyI zL$J>;5PpeCh(AkcPCbp}=&AIJxP28@uAN0{dOTv2!VnT6{?;e4|L_j1-mna_=i5MU`gjZ;J^-D6?udp> z>Y_@OQYguff_$7LHmZtj@m+D0Dq0$)OOzp)mO^FT#|;}cK*eGu(X>)|jQg=Awi*nB z=j?Itu^a?XlRk)AsD~TJ*W=Ua5a8hjd?Bd6esUGLnV0bP$rZvfmtS1P+gCU6Hv2k0 zyuO8BUf;p*xp(ke?n8Xaeu!V+-$&-nXk1P9B3NGs{{Dc!etUyI-#*6YOR)+Hylf_5 zKm^?a@u9P@yeXS_TJxV!qh(LjY1a=8I}bwBu0znU%Rtomu`lX$ z8Gsr;4MgQ`gHWOC095WV81;uuK>d-E@%8A3YrZC zns*bL_qk)*E=L&evd6?lYf-dJ1+=JF8~au-#niDQ)qOv*YioqAwnN;CnFw8A2%kBV z;BGn+_LB$U$k=`a=H66a92nLEhsO6+Q))NkGt;VUDME8Uq;0UnRf(0pc^>f2{olSXS8=_5mA|ofFQCGvhej-5qupDBUF@g5BMUfgnhC z3W#7LDt33GfHYEKjWgr@-D{uc0d(g7zVG*4pVxJ}PwunN-sd^zw-;-#y?QLJt|gSM z8i&hEWk$@gxUke6=a;zQ{3178TPqB%T#s77<@vZ8u>jX27UEjy0^E#R ziu}0cC`?$1qQuoGPFaInTh`%P$_iXhSjP3$xShNjC34<+Z=77{fK!X@ans)mMWK^$ zeuWDTh^2Z0I|qd0yG~NPlcGbo!^k{8@_^|}$Nw&xxYlcIF zW(CJ~gIK=9jIQ)O-n%}p%M{}Y-uHxn&isRuTOl$TO~%4Xjbkb zPeXGI^A}LofH|7^%EUF%JU5zUj^=qWd^lr8T#o0%9>e3awrs&9uL-akGaB16w&2r` zK17H1ZPD$St~hdFFQK`PAbN|?ScH3b2)5-n6#z>JHG#job+=Jl(+JH9oGZ&w!S$M| z3LHJO4?8l`P+udEstx)RU=$1WZ2@MbTLfZ>%+VB@MFPz_YqJiTWulaOcL=MuPh;zr zRVXglh05Y1D7kSO64^qg!&cy|+%l+g&C;x*X9!daILmr%InJFuhP2c;96XSXx>^FV z+>=N^nbJ~O7ncmpGOqVg*1-2kFs>q?i=N)zC^=69=LTr5Dn||L@bIDim_Kh8f;R== z^tm&re()!X?>|KNo?I+W-h!oDb`X>Vp0luEOC}blZB@X$GBE|~6Os{~z7?riJCL4{ zh4genXWAB|Y)&N{$035xKsaAyq5QoChK3=W>FU+Kc=@I7TI^KAmMV4Z^&_r_uEY7+ zuDCvP6s{ARFB6*2&mTq*9**lXh7y{G;L7yjI4`am(+1-Fw0^iWy$>!;?uAQ}WH6w_ zZ?=ZjD{Ya&k2xL|;jeqw@#x+);Gaskx;dlUEAQds?+4+nulnGtek0M#%nE%*Tj7Vk z{qg>npX0T6-okS)zKAxRI-;eq5t_Bo4bp3(md(WX+5oLu8W50;U`VwzGC~U@0~i<@ zpfyv0=nni~{PcDYEHWL8ICpC#IE_TuDDnCH47;Y9p*VE~{<*jnf93B-ef}X--8_QQ z!eb~aK7oo`r%+93uDx{@b;TEOr|b&u6N2yDmTB7y@vyc8|I{$8xQr7=GH~*ECLY|& z=Y3Y-?}z+t5VEVz>_kOI2y&;`5}HTh_xImW-z;2fbrvsjyL6)+P7x!;=Qk@&<`;VH0@ z>854E6Ur`<;7fRR)Pb@9b5m$`jmdx;fmtlmPO;+mo2F(JbP#KEytr$m5}s2rfzR`2 zLx#h+bsKbG$EL%xU12+69C|xi<7X#3^!6BwAzt1vpE4aza~HyO!4kMHUImY(ei*+p z5K}iqV;;-4Bt8QxQnIitB@+vn|IDZqOd<47qh>~?U{TUmtlyOj|D40{&CbP&?EP4= 
z_YhVbJc=3Hv*EdU3#M$_iD{WTuxM8fBF+{=1f(1JEmrFOrw)v5tRUI*10|F+DY6p z_#SrRJKK)$U~59NWmpuf|I7MJSv&e^lI|;`pTj8+F0Gx6ifTNDTVXxy4xVg#!g?{cR zSm%!7bsi{NKMqA3#^M%Lv|&7osS^M3D5c5*#wnF=^h8DAc$5c?M@7&C$_EvjCgFC- zG~5oKfy#)PsE(VDds}_+X!l0^xi1+19F4}~(+PNdHVJ>7qE2t7Qk4EVor-@>ZpPoI z)A0D*7CgSN6^}1x;h(G9@i=b>9$(J_u51M^Q~W*ub0`}1Ti2pIb|DJGX5zA+7v+ik zfQh)V(i7(wIN{7(dz_kMgR}E&3G~j$_Z^2qs&t(X-+K-?M4&!2ek2Zg${@gDIO;wW zxgJ9lKp$bJ;;?|Nj|IFw?25p_Ut>GrcALjgWKMLz7u}y#?bF@F7!eMmaoES6kUbg) z9nFyI!1El%H+&$|LD=gw2>YA|W3LM#ocZo0jOVxy!5(Ut+aO+Z0LoX|p>Uolc3Xae z9Q&_OvBrUI;fR6&7hK-RPD-E)&TVwTX+LKiU*&?_Ro?LE^_7C=_J)QC*btzg`Pk6| zNKcMOYJxT~iGJog;UaHFD+uNqT0Iyw&W~>KYp0>ZMtSb>K`Xmt*~TPwBs|@n684Ojs5Vg=8{h zGc9t46k32;;I|Q&MKb-giY3e+wpf}q2;^~3NsB?;1YmX0ti%FHGjk#)%jXQXuTs1niHQ88%GDt;>)9@yoaU@s~WcvoEu#= zq|utFq0Az={{Mz%r6-_S@@tg7!NkW{H8$(4DKv|fS+mSbDgdp4a+3I0H-Y9D9uvdz z%1n=HigdzK410$$>?KC?p^s*J$1;uO!yd=&u{=H_H67zUJg|8Fe59o$+n`>tkG>hf6u7)66TZyu=63xO)-6k~4vOsV} zSrLi~t`R&5h!uqdMFKK`Q9!w=lzWEqU2Y1^dT_2(z^tJ8b{Rpv042o-QIMa7`nsd2 zE+^m=UnDFO#w8jB^HR`E)r)I|_+1lF#nLPjnn>tH8ldal=YUHEJdzY%Q`%E z^dQ!)S&dbzRwInh|H0Ge5wkNJYm?KldUGb0r4pVur(tM6e9Rsgi{gBD~!)VPg9z8TkJ7Za5sY z5a(7+#ASX+uFn{W>jdS?3r67Fd}_`pTp~zc<@R&ahT-(&{8Rn&S-bpFY6IPkQ1`;d%V?s0@ETD8OI$3-IUxL1g=8EL^-8OMEwC&dQCLxL`G` z#!W=mo^KM44bi-%F`BhBL309Sb0b1!OCv5DqlKXfA=4N}Ee+9{$93j0om&{89ig)w z*IOC1R1*NSGiL;&M4K0i%7{8!to6}s3@S=5vVT5pVbwpxO5yRcgEuS zh5h*F?MtZWj3M!?#tv3*5J8Db^@MeA?dv z6GKzrmzjz1ox89oA|8_jn%Bp`V|66@*-b#l=X&s1BfQ;v1ROUJTK$sYx*-OE*@tnp z7Fd?P4;BK=gk^zdiH>1SU{;ft1ShD;OvEL_RjkZ}H+Nn`~T>a2%m11?7Ao9W$L|nd(h1vTsJ##zOoji;6C(iI1rx40>!;YOn*x^$MJ9H93`;HQbBVtp*aXc#!V$Ow@3Hq zyJ5DE4~F+2q@JT6-*_1*K4Y=P!y4IB9g#iV5jh03bi0w*L}(7~|1&lX{1pNHzeC{g zpD<^@&-k#d3EnU^fakAYsTnwrEgFv#3w>~L*%X|f<%k1reR0gSH;%ddLO}Z(H&$4n zIKUZ2evT-m$~QQo*xw$7{&oasCraB_!tG@NPK0M3N0k%e$^+d{7Bp6=BFGb!L0+f~ z_C^_15jh!kDf4h|+j897<%@^={c-=mM%3r}tJdj}$LFE@hQDXu>}5`%|F zWAW#)I6OQNk3UZ(;^EmuJUEwxKhJReL@fTMYIkfvS<(_zCM`xq`~utznv8;=NhlAQ z&ax4lS9##t3O8I`E*5Y{Twg(GUhRfkz8)y|^TwsQb~x@c3dhEe!qIUfah%Y6+-;~D 
z+<0j0FxJg*9CaRn13uR9`KTK<_WugIy+&di!FtC;XZ-Sp{9c=>*$>0)%y8Jt9tWLF zanRNjx%R`c-*Fgn9EU3HmFOeR!wJp9kwXyQ<3_m;L$A@%8GDr$4-kQZfvl}^#Dg)^>@O>bxsPJ5Bg4oXWws>PjlNAEwFyg8g}L%;mE-~ z*qj)R%?g?&5QG>`$q)k1gv0oFa!Ug=N2qP$@2h47)WLF7>7hB{DQMP!mTdF{SCO@J zT&03Z2+*qIH2(QtYRXDVa-Qyd#b`RvRj^Bb(o0zC{{+pVC$GhIeLgy1R=&;(nAvfN zixX&;8`SwJ%Z&)^DQV!Rhh_;)rUyVhL^cBRe?T(}+gK^0rj^Cxxn23#vd1l{f^4~Y z?$d7OX=r96C~y`Z-EhPZQbkW*lX*)6@aCIBbAp6bX%O>`=P^{gJRmy7u|fYAI6qk~ zy_((+uN|W)j^M1M$|QB7-KRjU%)+$3Uw!#GS`a#98tq#JmvQe7 zp|LU_kM38Zy!Z;XrbVM5@2s-Csu}JG0Ovp$zYjHPraXZ_4V(p<)dVGkR*}r4S3y7& zAgz!{89Y}T_$OiJbj!CTa>jo_b0aW|YMF0smBeR0iL&w>!tQa@R-Y%>%YZC`w?MOi zGOt_B^3>^|S&6VCz$|cmC@4!bt|-Lp@mm_i{O&6#1mJ-X5sWwTNQ8l+MLn2KHm}-=b7W; zA}gF*WP`H{?QwpgBQ7s?!li{axU$$9r{|av$dJG&Sma`TxXxoKw&=D<++Yy!tl?H}t zqJU-w%?X*!(S{#@E-ei30zvszrms=28yetkla}b&vKijyvpe*)7qNEq5NsXqh?p_M z5isa?M0t+ExsAPF8Ix!MOMoZWEGgZl1xN{0q2!?KCQ{pJ7T^zHtsUw=SWk z>=HX)S8=y04-aqO#NW5`f$Ads%{D7Mo{I~+({MMR_x0#D9^bu9pe{sR!39)Y-izwv zskpl%0+rz_v32Yigt%Ma=l9>xTuF??Ek>ZZHGVL2!=&(3_@-}B(7c2oJJ~-Lf-MQn zR@^QViCF6ZTJNeMpxmISG`JBI-3iOClvB)RI8qLTWn03eSegZzB^|eAJ1kc(L$hbv zqeG8Z(afk7Uj6b*_@6uqpX@9+Y)yq*RysU0QsKjn?Ra)@$8Jx@gq_>fFZh|}i~e(E$rxGn7XNdz+uJbDZpjvU4M zBgZgr$1cp;z5{E|oWp|s2QW2b8zv;Cz>B|^@%#;WhR0wmJD{#1k#Hi^I;u1l&a68b zZ0JNeunz4gTS+658sr$k`;KI1fpUx90*|N+c!a0JBX~1BLN>!Qg4@{^_Ux!wheX37 zBnp;S9eeD)!j*-i6aWh_m#()%*#;Xzv^8#Rv?e?|pm>7= zf!Q8q1nDxuTN$^N5t=J_P6eU4g1}rE;${Tf&6Hym84b^eeQIj$k zbs39LleGx9GZ&(6%QDp!eO@v#MyTBwfM)(@ALjmE9@);K=p52THL8#fo55$+vOwB7;v>l_Hq zPPo3_376M7;{@;N@H%gJ_5B*pu?#X(qTlM3d`I2G!F@ZB98V}rh$j#y6P5*>V<{aE z>7Y3-J`4$QVcf2{R>;7(6n={~^LwHP=Kp1BjuR;hHn~YG*9}bdZ)k4dL>&t?l^$kg zS=dj49jJYZMRiQ3_)L=qyplGmsXIth>CbNjW}S4#)JAC>sh4uk&|8`tFHQi|Sxgfe z;`)XX(s+!5v&M^x{tKGpn2%N&JYEOQjTI)x>!CSDSv&Qx{$J3n0kclru0eCyzpTxY zx4y?Bz%0(;VqO0K8JcxcK-^F_dda=W{WZC-e?hapZrG5lHwDh&Y=m$Pn43bgNNcqJ z3pDfopBMnx0L@ZYu{vlLVAh~nEX$GnQA84yMKQ67I%rPdgG~{*Vx?Ss$Wsy$5grIA+w-cO|tB9^FO{EP4 zlo-v01pa)k6Pn8laP#_Q9NwRcOZU*BINLUY&@2!g#^2GV@JMWmh(KsWI6nF0V+C%lnm0!$V?!)* 
z9)seznD)2EeB?!oMF=D1++Bi2t$K*Xi1oC(aZ=fn;F8;kTN!aF+cJW+2h%kE%2t15#Hgo zJ~C{9FO8bxJHqlGhRtEyy))Jh`UNTDED`817;BvdBV+AUhuc9+}TyX_|mR!dD zvg-ums|4qKRNlIdih?VsxOov3>bwiMbL$ErI1dj9%=fCU<3UXx{;Fe# z?Z!!**`11;M|R<_ngaauXE`3;E5RL^{j%Va8n{?_;|QwGZ%0i|9P&bzBGKC%{$7^& z;k};f_obj&X8CL04nK_Xz+^(zy38yDZOz2u@HqJRMZs%rG+dVkqp$UNbnEfHg68-7 zn!=Mbaj>VHLlX#XNd)C2 zjHkRJ2+o8)4+5SmmmPRb%itJgRkkBATg7gHW$ad_6oJ<&O29d#0h;Xz%}zR4c8Su9 zz|3Ra3CgaNb4)Uv2+Z~|gzWfK!m>z z3B7K+cEfAmUij?UjY-+tnP$P8Fg-PU7v^N|!94C?u`d_Cxw%-oe;)!4A42ev;|Mu; z0>Ot*A?W03>J&B|7f?Qoz$1sS@#rB0965-9qerlE-#$X~4lLwvZ1vI8SjBTz9Xy8V z=~%ldZ68o)h*=cR;G!D1?~y zLg3(E5ImF`{4Iiqe~+aE=^wjwQJqYi&)-1S3@02~;DO@{#6`m!r{}ukprf z1`w1tdZ9Lu&>ZB6`Veo*2lv7!<9_sX+>M!my9u*#CuufnsOqG-s7;uIn%J4t3{)#k zE5txHjrGzZ1ulL*W{C=Z#0icLPa z9prWb0&Xg1K{9Tae=ZqUv(Z+GOu*3%E7Fyx_JZl{B9)=@cLvdu> zU>xxngj}Y%1n47f!*S4g1ols|!=%q&!RlULV26(x(p?8)+axCp`a+9)|BSH_)>{ol z_Bab<6O<)JbB@z+?PI&;xC}!M^WQs`$GHu|PM5y8u)qu@ zYaMW9?npxO2RJ`#C`wrF>+2kGWxWe7vaZki5%$-+;-s&G3P0yfkbj z=+(_|Syh%uZdki%E!7}hoyRih-8(evwx|&_n_fe~tXh^-uEmBD3$sYqLr9bxmb5{< zUjb)Uh`4Apr0f-`UX^NGU76M3K!JH({!&LeDPUHquD+zRAs{c&E0i=~*6NG-t4;^c zQFW+E!oI0+eofRk{kMpaOgK&^*d|eGSqIC>gzsdgNrWU729L*zyGtBXHSmuQfmoEq zx*Vmf%!Fl1pgA^L(j=w{{Lv(+fq_Z<;e-Wm!k3?EKlUL5e?^ToVC%v5JsTdg~IQ zHVaiHc?57muS5$`*VF?uI{@rF2)K)tTgt7h@l8^Z0JCPPW?CTw81qqFaG9`Oh*QUo zU|&uSP8>Uq!-tL_hhUJNxdX8&5?47vS&^gIuHj+Ph-ABmhel~|Ebz={A~-Y>Axyef%yekq9^ZWW3C3HE!W@1W zW{&;=69>PJNrSt?d%(+>I`VBScK!*A#twkD?GSwb*(WeD>CEzUfpd;12HNmlcTYPBL5}!A3h97uc+n2gx*{@$A!g?4Y zJuDD8(;d5_*P!UqemtzbfqU$j+^f8a`;`T_&*$xK$xYM?Fc(}UG+#q`;boNFxPVFl z=Nsoxdi@+B`Vs+HAUlt+eFG@Jjz`6pkauhcPVP=ZMgB?PQJHc%xpSw09kd&$DY=IF zva6^sr`Qqtlj(!A`*1mA1>$`yu+GyG-@W&ys;d^pEeXwrgyy#RVuTC4W3u4CCl?zt zGB7VF1XDKz6Pg3zvcw<#?I+^do*(f%6MWpy3>IsmF=}ZzTs9_P-OhtJefv)=N#6?# z-!RyTr8!6}%>?Hl0(1j3i(5vr(l|n|C*=_-0|GVwX4h~69M7@X6c5|j4A{nJ!77HS zEK7Xlm{cWOi59^;9JuTlBXOJ4;2O0BZaQdo=6)x_qqt@`QR1GVe42TzobQ;h1rC%; z(pJpO+>V!j`~|Jrcf<4D#Mj#dUzuBB6`%cSyZ2zqjvP$hwg)~tc4KmOHYV@fff+k? 
zV&3*$Sh{l${PrJ2&_TlT!GnlAb_`L+P9U7%8-DB*HgJ9Yp~F~zFc*FY_9^+xK+D{{ zSbHEB%lI3cOK4uqPU@<|C$Rj$G5DQ0gC%?RV@gswya~)AFTO7YnmwZk&C~<}vNzW! zh*SniL3&R%F~+!3$5_%fzXnS{ZE z2CL`jtrt7M*I^VQ`Fl&3nd)XcAZMl{HoK2N=$KvzV&^uLpd38-2ZRj!9;*q_eP4S4 zZ?`vu>ANpsvzHYP&-1{Mxt=&ScN|X4al=6`GaPdng3}HIaM=16T%00qM7f_WN;lXLn(a}!fw1iFjN2QWP`%y(bp+8ney*tVb0a)^;0~d=p0HXUNNpOA zyP*?MA5L(Nn1s7gQ*bwWD(YjV;%?kDYC7)3OrvB%k4dPG@J3C<1XPEMyp*cKJy8`g z9#xUt7UP53_^AYHo|iZs_fqHKVfsSc+dL1qqo<%W)Em`N)0oE;LNhz-qOi#*iJXXA z5nd?RQB+r2H2<}w&tyzMdK+s{<{bYbV%*K827-A5zab|7+;YX(Jl-tRby zz)U%h!d|g33oujJZX>8+*gck#uyd|`adysVl(N%uV~II3tv|u>$$eP=gneIoTv_9U zOKh8S>j=&~<}9K4>;{46FBCAh<9jl2?NYFZkKF8SNFmpxh(R$fk#HMN_7P}~V@k#% z3+WAr3a-gN0-s5gHVK;;?nJR%7He}n`BALRVt^HiYX;Q>nl*5y6b}kB@fu`KP2%Yw z($+QCS1zlRz^ay`$Tk`?$$mXQY2eJVlDnEhv*IgnhHMUT=s0DLMd4$tP6xnS(;o=xkWTwLyPe&(gqpIP4ixqe~#R2qe=bWA|y&> zG??HCt4iX_A{GCn{?o*-y9a%Fl$0NHV|n$GI(mAW#Ay~^an0?bQ7S5i-jb{X8;2~6v^@z?|&tGCX_G8HHmtFrhs>#fY12+TUg#U>#pG7c&6iP+@tkIz1QS3%I| zVSNa(h03MkPIVFfsxL;+${FxmHWNiRFGHeZ)ZeMb{kq!<1dEHrWulbuOK>J|s;RFl z#YLl>piGI6v$!o#)dJRKGAkaTx%?)N<8_#a-se{$Y}8j5;;u}9qTE0P&f*L^2nfto z>b^86*84B(CEGR0U`Sc#F}(M@E2nYg!g1yMed*#EY~PlJw6rvA&CEh()()mSu{k{p zu?Z;%;crmF#f7l#1&l)p#vuaAqOeFr2uKr@gM!1ATZZ^IhXzL=lFvjipOYSMzN22| zGVrf^`&P(UzX+u}qH%PcH$q(oVcd@&T%xnOL`UDaH&Ni4WfX03F*u$GnUP zj3(&t%uDF{(yM6Iep|b1>ftk+);rd?fO+4hYd6&=AU+hr+Q+yrgH?2g1j}_L9v&MJt zYV~hu+(N~TZq~dF-XClSw}=b`i0|y~?U)m=9y9#>;j?B9#x7llfi6?gsi$;iP4HQN zGuW(+g6ZlgI0hzS_09u0QTqrBl6S+>Hyrjd(_c`$iZg6YXclX;%rxf`nhdu{0$XGX z#zt%=z-@**VaE9sZh8YGi}NPgnuJcpfxpPby#taVpB^X$toY~2=yFTa4_#yVq8 zQZoGa9l@IY$1x*27gKiSU=~4n&W>y>*tQcZc4lM4-u>7_=#4*h8tGTABIV*mL>xT| zzkU07o>-s{VBNmGSig4<^V-YfbFd+IKf!rF^Vp8rncJ}L_$m0EIgjR~?`A@C3dXY|;K_O$$NKUjG*9HcPmbGyi3I-%n=|2+nu*CW2~koeX0py^ z5}3W>cfvECz#O*&6QZ`VzEk1Oj)QwjCOo(7hF97icqZ?H`{tc+-%8f8z=7%A!-Y_)kfSxbCg1OVCz_kBh^;_)ud|NEB8iXi^?;9l4S+$9LtM|z_!+DB8&MAXMl!u|LuxECi7 zJrQ?!Ol_DKYC=3w7djqyLOoF%G7fdY<4_yyff^C_)pB1o;kPo4oy5GSp)z(V%6Z*#o?o_c9LfT{P`YUn&MmUTmBj?n 
zCH6Qy+Z?C7O>sgj?;fLYXsl?2n&9M+`#>BZJRh25foY$;isgO3#SV5r(jA8)!`lv4 zKYgv*r>mg>)|d{#9yc@WvF(pN4#SYkPR;=bQ{+0D5}M7h&xLX|#a?$)9yf}hJQBOy zhAV&OU4-);ZvAm`_GlEZbH~jURs`yAkn8mmA=-iKHn_IZ9@o~m;F7O1F8Dg(ysslJ zZt%f`-e0J9cxyuQrVY#R_@7$rXQwlboU~b(j8Le>YSy5+(er>zAv~n`r;*`Q_%*VY z3g<@N5NJ-}`y)|-vnIW@Sz|GtFVHNErSmvw1ZG~F;2kf#5-kA=w8&m@3ZR>a%mlS zIo2;f4b2U}tf09mWH(l_NbgYnB!p_fsabDjSs+=?*Gs$Ue?v3NpxY->6N7M@He(@g zij^xCW(CdCGx!%QHwqdCKZ45i=?YtOCp$G*>8S z7PpORZWn8;xHHJ?coMozEWhFkA(mOqI!m|}2vm7Dpkh| z80%B9QtPFMW{E3ZRdy9u&mO>`-D#*Ty^gv{f#yQqbBU@8<;$#tW*Ok8`7AdAv)-~S z(r=UduPDBWy)4W2t(#T(Z{}Ua>C-2Xn3#ybpiPKmJI2MQASpQ=aY>sSfH{Qiy@`-4 z3Jr@wIG+UpXMt#Captz*&}ikp5grnO)RZ(hIlHrMTfo3riy`{?n=fNWgg-*(d1KHQ zZ{TH<=4!$Oi8`T&WCKGJJlE|deEs!zaI|y6s--KDnw)}@Cr{!i0VO#(83zs=!2SDo z`BHv_$AAB&{FLS2qel;M{^AA9U$_|WzWt7hnJga?1Hy0Xj;+x3`OfItwKLiox8lpa zCE7IOcJ6Q6Tw*C3qGJmqbYywOFYn_H?P2x#N0>9DH`ZDVLy&bJBzpXe)agU8H)tA) zj&H%E+t-1IHF*4>4v+3v;o-w_+<#PzyMGqp{sU>>B9sXh}(qJ2z20P|$8=bBO0IG>Rn1{G&I1qlF3BNMyp7<|2Q{s-{ zM9_4koCwzr;+hegqD^znca5w&#>-UH$?(qJ39~h;(CnG#(6((yv~JUyb#yO55hLNcZY`$pw=r$|R?ObE1M|1<#Il{cv3}291oOFzJ$4*Pr%oaD!g&M{iUam! 
zBXIwItRz&g+_4L5ckRMP0&~d00|+^gi=fvKaeBZ-IAIIwnMDaAsi_d~b;!Z+0_eExL`*!#wY-1-SO%2Yp3P^ype*zrCQ{j=Z8?M`S!zptIT-Z65 znE@?V2E&T&IcjAf0eS<5xle+TNoTy?{S_>pJxf8eJV&o|ZH;*&dm+YgBr?X2LC!1( z?499=RQC~xwdqf2{sG}ben;34Li6D75Hz$GEI)k@pS{!x79YKWaO;8CG0p-9rn=(r z3>O@nX@?x|;RM40IN{U>M;w2~=_&nDyxtDw{!S<-9GCmqp)|k-WdXLRS?`Q$KWEei z5Q;atp+3Nwz%0-#azgC}2UM-ILHQa>Ws$xc?uGkNKDeWU=Gs`D;t0yIRE!VmshY6y zs3IiSh#$11As&S0u}amO#-b{S+T?+1f^&5!VLHr{=TH&jQO~^YCQhZMD2To*R_OSt zs3M401@rhI50o*V;viSvg9|DH#wuvO?CXM4i>z>Cl`GE7w^TYc*-TlQ4||NnA%gQk z_u)7w(9HTdFqY{gOU(G92bTBw20O=FAj5GGwob5u{jc9B!$cPY1FSS7{EjupcDupY z;be+zXLIaz9!+2#Nnjq?;KRIEe3+SkwhaDrBWSbZlH)oEJ6-zY=nOLyuJ^!=)pj^M zsTa08f31KyZzUmrr2{Una>B*c&bYY78JE|&;K!ma||CcM_ba%3JoG;7voy`@)G< zkB=pL#R*(LahxXoq->hFO|ht?TqlDGk7?!9%B&bsv077TZsa&EUnvuhYf?u2{i`~N zWy&(?8Bz8(Ync6nnJMs8&_+oC1b!PNR#bgKwSqqh&5f4k#=?slMe)Bc;GIq8zMNM_FitZo-;BFHxJZuJI<-Ca~xON`mYka4+ z3jf@%Q@+9{j^rv>6}JY>uUY4wQ6>Ju;*VUWTr+A3qXLuKl+t2hCSVghtIBzf4w~!4 z%3OPc(0l{hl-n}hwE*%h)YegV2+(ze;i>}s^QZ=eH!fgTRyx7`Mgug<1SbNqw@WpP zumGw=?YMO5l!E3X5)-+;n%66Bv@$mV=BBEb`!BnK%V+oDVD@IzlwO6zik8_BHE7n{ zGn!hOMH)D37H3JDPLLvTp-YSMP+FL$p64k(UaZ4|d{zm?!NEvRPe;(EV60lbmT8y* z<4Eb1C#EP1^Cp6@xMHZ3;4JPJ;+7!_3YBe9*tjWNxoxB+r(yltb$IT1nN^NZY#>(7 z=IGSg7+(>bpCP2kcA3HIrX$zsA0n~d)9n}MJC|DkuxYPmMAHA@ZjMC+_`gy z9imE9R8$a*A1Ofo=dZsBlz+31E3s?)4$PW49Y6o{BieW9tnwBawlacID+3r4&P@!P z!GvWH_YT7rTsAa72V)a-CMVN6d8@fiK^c z0e#}r++4-S77%`|j~M|sK|%B8lSi>QEC4gt`(n!Km6)``55rt0qf-wVSY(V(`kTT@ zd|~|w!J(VC@?Q$BnT%XpV-xSeXgUPJyv-4H0WH&mkOpgeNmif-51` zIVM3_o82OlVNYl_4^G0Uuw)D;Xq$?gM`#LcLQ-KBng+`d2|GvVBXHX=A4hS&5HOBP zf+OW1zR{6Pqmp&o3Ci4VBjUA0wgS&_gzng6SfwVxB{3FV-xYstc@Ef-V27!>F2tZ= zySC`>>VTQak(iN|fcY6|Se%)G#c8RS#ox^2Kz~fyunsdfZN#eN1o&rdK{D$vft}8P z1BbCRa~o#z{XaQ06rLOWF?QX0xKl1`*1>^t+Zc>->|}cL{wK$!QkyX=DFd^UwqgeF zZAMHQCi7l>LKBoGvi>HsvoI-qGbXYwCWNJ8LQDp{;xjNYAp_G1%@ZQnd0<^z@L1=h zo$yS^hEM!%c!@hk=w{fmerLm=y6V>}$=w!kTE8*Ddcz%eBgb|Hl7)nTw) 
z9g2~JW{Va68169{ty*`%8!x|tMYE=3MBjl5%3keaf>}d`{9^#FC20H8K=B^qj-%C;nE(J{H0fYqP+8}q_+2oEoN|viOy5n{rQ?;KU9PEY4API5jiONuK+>V@#ipYs5 zkD7$am?zzgcguW`enxnpo`?O0q~Y=;XAt#M|7 zB~DH?Qu5z_W(UV@6ybOzA$g>-E@zLGbcC`n?{*apMz+fk>~!v@ zrp_)}?~bCijyOMe2-3{o!{zxS2+r2HzT6sDSJ>jpDmz?VEg=%mOIDER(h*=%xIcWm_XO3nZuLv^ic&llkzd zJSc6NN0F3K56#k0qQ*vSS`V?|kvc^+(Z8WtH5~K!FKCvIi}*duz)KCB1(x;env$5W zk`AKvfUQBZC>C+*PaXtk-jhfsvxtsMRQgY7PDnz4zdzo5LmQ|xd4e+mw-^r}*5Phl zDT=NhhwboRv2v~tZeBU7B4*Up-p1nxcW~+S5u_)@qP!@dFe$#f;?u0TYRJGo2|ZV> zz_}Flb(JVCxT>JCyhJ895sRGwGhwOI3I7yvCNx@C* z%*w<>FE7lQF&%He@g|;o<~e-y(MOm$b2iqj^~1dRi?Mc{KLYrXkBE#_+C-2R%krk+ za0Sg`eHN>8z@|{O9L-Kp5(H;GM`30UX z?g;Q*iJWbzICuO2@~&P$X-OH1ORI4$zZ6$)7UJUdD>!rUG%j7ago46CwnHuNy&UJy zp2L>3bWEN&8Q*;KE!wtir>>>0*Q_Os4NPEQWP;{KhG=1IsNyJ_aJjWXOSCmHMEmxw z(WXNyLURkez!&_xtqm~z-ELTC_A~ZRHpltt7C1J3Ffwd@z?N~nam>#ZMcJEhH}5QP zPeR(2;x2Vho{366FE=jX&V|#sd+`kJT&Av^!9D81mDBi(=lqj@hB|@2ZW5RY%lEJE zp?2Xx-Y)!kBO8xy9>?E>m+-jaD)4|X{7(t+Cjq(UJn9P%GQZupe|0Y&Tq01P-G-WD zTTrn#2_<_XQM6|xigx+q=FUwx7_kt$R!zXk^%D_4Z8YW+WIlgKi__fPkoC__fq-z2 zzNT;pO@@E&acntv5-VeZF=xYS%5u<|RFox&KWGVK9bBBm@HFdUQG_NgyO(1qAXu8Cw!aXq!<5Du)1M%9oKcL6g-{JM|e?-skf5OXOe~(wc`x$Tk(hFY< z8;NgBN8_szqtx;z1Bc>+KK=1Q@BVoAx88XB=RcI*`L#Db_`M%K{(T@m{B;oCqu!+6 z_>IebM&QE%X6QS18oVP@;gPfrHnAD7i^+mh#8x=5(_v3&wqe`Zgr&n~Q!4C&xV}jX z?`O_V#29gbVSTBo(0wCd_Z zenSkQIn$e9Hr*Z>o@PjP8H!XsZ_%TEL-@d75H<7{gbw}&VZ-~vlFA!()fxhzp_Fr5BF6{Du>ry->2!3N?OqsP(hKZ9hv?1=taw z?NArwf(OB_xWCa6_XC`9YoRHM7MbDp3JcV%8bg_)c9j_c+6s5L{|*6ITrBQ{jz?Yi zcvOdtS3b*i;*vqYtqt=*)h18e-Y6Q+>o5)QU^*5xR5hXZHq*+0aTEbrrCvPN8|48L zP)5n1zp~(oDB3gu#i0|`V8EM!p2%Z9H~l?ueT_4&t+pp{I^xz+XXGz&!Ld0uxUhCS zp>qN*&a=nGxt6%P#15C{+v1p-nwzjps6Fg942KD`2VDCrXg)I45(~cTiRHaN#V&7i z*5e>#c#XmM{y!=!?=vl1V%4yL*ynD89BY|nkq~ZgiCiah$O5Z@r>C&YWzE1LHf*#OQOFe{(t7!8^?ZVbd5I%r4_-3b^-O(n$=bY&}kOuQWe!gB%$X@it@03 zPnHfO37*_9Vd|Z`b0a~1b$@(HzK?aoIXCEzS@B!6a)jS9l*l?#!Y z6oc!R&S(>@$hDbgRVnkA0f!~|%F-;BZLu^TJGNgz^O=)JaPLl4100JV^wY4cBrq+| 
zTz(lB&t_xi)+pZRNtTWI>!4Xh4S52Z1&lRm5U&BV-uf)dCywsJq1-*p^9&B>=0c*$ zEL*e~pMCs^@)_>bsSBQc_IcRz~ia*P;dfgZ2E4guYeo zBoEbBPvZ84gQ&T12=y0_5^7K5(WMLc^WsH3xON%$^RD3Db#{2J5un+I5A%*uhw%5! zgLq6BM#&+d>?lxr4yeq-<6GxYe{K(o_Qv6I{3@IboQ|XG#^XxZG~A4yfvcgDaG4+F zi~LAm4w!_Cf#Y#`lPj)=jK!tk3D~#P9*34W;`~Ziq)Z-(`3?i|#oOBNtGPi7g0q2& zN&Q;CQE&?14FCNnka7MD)+B^suKx)QqBv?eJ!X}bX9S{%eRS~dV6Gdna zf;pjih}%T8Y4;4?dbI~Uog6Ue*WX~m4(5yPjTAITJB&n@j}21CjY5LsAf&hsL&i8W zBw6-DZ|+~&U9(eqIiIS*Alrjo~0Mly|_i{h{uC}Li> zsOuXi;*9TDoLw^(=lmw%gr7G~tntDn-$}T!dOFT6nTV6V(~vpM4(s}VffJLhacz!0 zuJavK?(c;Q^K5ZYrrdTPjzco3iQ8c0vQr_EG>%NO#G)Tw$I9NHV5j#O6+I)}%M=p^ z|ER3IUHKd>8`d8=<7}|oZYcJ!b92CHG!8hJw-Q_AuVp*%R+*n+WCSXS;2%F_IAXl|;eKFz8!D0ME$y&-fO)d1#Aq})Py{1f_*8Q`ZZD z<(RMtswp%RT1C3!n%0@#Jww576KIxu)$(JVKXuO2N@ROzZX2v0o+sN>eX(xj#}LQf zql8R}Q;I`uoaUw>R%aOyD8QT$myGC0-WNeSI5-5KeEKPjjQB%eG!sAm@Htu&W-P}H z#fBBLFzDxxv2oRG)RY$}cZ-@TLT-5>P9NHX?9623T|9;Qs#49`OyCq?R^jIe!V=Q$ zeti{=AK8n&*;@(0<*1S<6Ur)GK>!sA@RksqZ=kN~3Le(qB$Qvr{klRts4vFDyJfh4 zx0LymVb_i{_^)4qEvfN1xPLcp-MFd^>f?3AH@c#@fWWMC%_wDg39O|hH*l1&D()FM zySJ%h^;TwCegc-8NTB(42lr|<7}jD_>jnZ!xI2Qf_;w30OTJnN zJk32rFLBd2d+HGO?AnS0d-vhU{=>{S1DP3F2nz{Q{=aQnwZSvbJgdOc+S(fP=FP{V zMN6@G(Q^222tr731j6{NB_<{#IXM+kd^Q&?UJA1@WAOfmAD~t1_WY3ZW6zhSamyAk z(82QSJ$qp3vZXkE_AF}e)S9r+9p%M$P*(B) z1qIc}%{_+Y%U58~kU@CyrI!?lOP&JaCPv0+W!#$2jtN>A$b=+>k(PvKalPR6JG5zy z_HEmtOP6QRsbdFpXk~(D3|r#W=FRY9ht}}^@otsdQyaHGGLB7n7?Zpif zaAt{&q92VDQ-|WxJTnxmv{8eiiZ(hUf1?9#1lZ%|Mw#Hj6$SqMUIE_z_r$z@{{TnNkp z%}#+52RH`qgy*rLgypbU!g3s(xa=%ZGlCLe77&9mnbn1o{ovDt`DK&5V44RvCLW`DdVQAbQZMr;jsmK&!bh>XO!AK-wmB#>JFoJ z1ZO^%N-cFVAneLi=0*a%IvG&BR&$o8rAa$VQcW$m-+<4!fpJ^@zMetX=X>I<55L9d zKla0muYJf);xM)ktd(fu%g_-~js_yev~Be#Ddk-(y0*@8I71YdHP-IUIie6m~y-4D0VcfaN#u!t(2PVfxvd81nur81`Wg z4Eyvo4F0?aOuu>))?d5>m#;sD+xK6>^_MU3eU~=q!}{FqG74AcIOFCD7d(icqb5Dc z9XA9A-DH~UL7Ju6Z24|WhmQr)+y^6df*E{xA7W|l$PU+{;r+17 z+ZLJ5Ly*O~5SNTxw=sleOXg>RTvrPXmR(G-TUnOb7Gn8lx{bz^ZsU5Daa75l(SKL_Vj2pf#d~dnJ>5Dhi!0ApV%@G?#Xof)Z 
z4kSf{BPEVZqKh1+hgV@Q1&L(0rx{G+E*(tEJ~EVwmhlucH~KXH7e8qXr`%t4tk-aD zajR`2?YKrg3B^z5s|RO2n>8M%sp);_)fAfl&)}@96G}m|9+>qKR{+gD<0)uXVdn(O z{_oH%nKwW)rMXZLa9MzVL$hY#)*7RUp1M)}&8x~|K5AL=lyb9i6g0EJ6g2C=StNig z;M{a{Xd;gl_lu^`EEea@O@O%(n)QvPsVOkW^1+LvVk5!`xFP}OC!skmoV~6H(f=8m zm2}Xohu>Hp-}IcgD5+;n5~fd56{b$PY;ak3o*Kw04+85+VkO427bj`}XWnBR?^~wP zPEd8kb?LRq!^e)sx8Hn=&K=uf*}_@)_@lS*%FFy185`o=9?!zj^bZ`~okEz^;x7vn zKd385{-u-Hla+$YXAa{I;ka74C6qv>iB?lZ3oPCiSS-fw?P)l9Y(HU@uq;s|2+6lA zxm+%=E08LFzvAX`4fktr;6ZHx$_YldORu4x@LE)O3B`q1us>%TvNBSTlbwZgr;c*} zEd|ZupIla)kCK8sZWH&6LKQ`$if~_6db7dOoRzUzMb8jWmUzl{sw)WUgl0l~12A)0 zMbfB|d%cEpr*n|8ISdE(q_Zpn#+py_ZRK7k61VckZlG3kyQm{Dqh`YaZ6F@k2^V z8p7F8n>%M7MvfSTci(*vty;-s8O_w>G|gqeUrXZ#%kpzwJHge_1}Bdm!sEXmp}wvT z#l`GM0QW!$zmyiCu;2!6c(nooR~Wr$7h=3 z^elU9onns}WBTFS_j)R~80kV_Q0^-WQ52;q5zXs3p3`9EGYeMJ7QlAiO4!czh0FZ) zaGtvc784g@q{nm&b)JlV)}H7~4Y2dU$Z@k_Heo(SPh13x$xC?da#+t?37hH5U^8GWyuM;}0uW{BGf*(!Mqx=w~|~gPbN|5c3}5G6h3ir(&r4bY5>J z#(2$v`GmO`<2egvW2a*@@5#((Hq0k2fW!0^@LJ%H>An&0jY`M2y@$h?b@lP5KcPG8 zskem#mZfjUl*BEV5|ILD-%waC3xaJx0<1P|hFwG^94ULYp-pHKY$KCl$Bxq&@l7W* zyR46c?dk}OTowTH<^C+wB(!D6{7n8L$K3B(=bStd*l8;=g0h+n3iPTJdWEds=GPy?7$YhQAYiuT|#)le5N_km+ za;O$bqw#+J-+)=v2+aS2W`S4D+Q}PfaLds2UjqT3kn9&=W~QR1(A*GjSV8jBk^tGd zO8gHsHkeeH*5gn&qSkfL9IXL!Dy6qJ>!sjV56hGuh?8VL*Y(h>EbdMAH*Mrb=_*?R ztH5$7V!}fj6~^_j@F$^J0ka0pP2EjI|AJkvAsBylZhj_J8|{mNka2=96yq) zB)-YT1=n$}_O`Mbi*8=Og#2rll+{-c%?etpOK|n-S!~^!jFg0EoIFg(A}EVXi3}Xn z!qgFxpOye})lFPDn~N>UVMt93W!Vc96btYxXf78pzoo!g#dsFi6uCa7zb`#F3p7`h z<|**sy`AvCbt^Kdw9T6l!{3E;Ui{Xr$ICCjOaN`6U|7Psd3uh=j2Q$G7gvlNWs07! 
z^+anOC<}-g@mK|9RLf?~(VE*Uyu6R6{sl_YkVQf*}VQsdJEBX+&gjs-g93RYBO@4# z3kK5`%?ateehc}NIInNak40NUV{|lXg%{eiM>iuwyvPsL=N$~-@X0G!JM=3g+Wv+t z*FMN}{tY|E^}?}**2oK)j@pd%sLu|^ox_RvGjA^**PX*d>UPOtR1kuz&hEhN6WdUC zW;gDh&%vED*|>jkAO0jP|8;FY{=T*ce-e)CkHnxlClFOT)}dz0GE}9^M``pl+}!Ai zE4~i6ECXnkn&bMC(I_Ak7q2iw$qG{xEgywK>ejN6C|oiex0a4T(TY(hT4jpDRc6Rv zJ(?dUbL6kHM7|%j-V%jt_>o;>i<@iBaf|z|ue8Cj*<<)oHOI-B&Pbo)g7HIs!}ssM z21EI9vr{3yy8_Lx4H&M2X7Ophh=BAsEDBzZxf|wV)|%z880UpfFN;rlbG-E7*KqTX z!#J5BBP11z5;L*&^fiLnW;kvXV2+2A#AX&>W`eX+fPk|G&<+G-Y*CnH-;lD zECHb`dsNgGBqi@b;^uuw*qn=)#2iE??m=wwUPL8kBP?zwLSlCyC^8fNVO!uAx&JToKN%hyQ{c5J4U@vRUT z&BD~UZI}|j4U?&f@mWefT%JTtjoE=Iyp~T`2GcAazYDVx_F#5G4ra&i!Gfe*EKNOv z<>|+;B!k*|6iYIXV3BC+AuN_O;}9069l&C0Zu~Ax3QNa?ury4G*@ii>yRkTNFBZh@ zLcq2o_>$1vwsUuU_T|rb_v0@xz}yPnD_7%p8)v-w#ZPGc{F`Xs^9%HuxEO;MZN%`E z5wPBn1UvskrZKSckAkUhFh;Eig5$bqxUY?Y^>VHgnk7#25H}yRZTBqRf8$NK+gW2| zuRdzZ`0niuFuniJ2pKa7VWayHn)@kBbE5MQWD=V9c$p(%_#Xu49}zV~gXYoizph|* zg~dQZvnzIwGsUj4Be0VoxWjoMwpjNjzzsnv;q42KnX z3S2+w4A+mIK`ej2$>WD(#Jew`mAre{v6kuZWk9Eip}ZH^L2PcKI@N8A*_r0MLEaC> z28J+U=h2AIwWtlZwI*;2%$hK5&G(5cOP-<*CPrw_^HqF!?l)@Lf@%pv8Q3f{hUx}5 z%esjP^ET%D$=FzR>RYvH&39F67_>5iK^tDXoe_*$v!l;_E%^R2 zvca{bgy#9qaQs}eGvJp8K)JGQ2i@xsVEp@?soxTMiQs#jC~($omM+PW~2 zu#z?quu+X5tCu!LtSL)r`6y^6r|7NR`T=)(kZ!6*uLU_r&VLe`8ztLy`8APZQ}UAj ze*RyeS=+C{asxC!eIr_ix(hUs=3~qZ{wFkR$LNn2hw)_Q^KVFQDgkCqPnY9=sVOwe zy5u9%IyRMlipP}b1RgJHv>xjv7Uh&U9;5Sb7D?U>y`ZLQTIssFh+|JJCOlZDe}l8Q zYQ#klP<7JkPOB?@qv?H`pO(P1vMB4ISuL|Jn@Xm;7RjhmIY*zbO66KoPW~9AhZM)2 zq+aTw)rAC8?T~cJ7awFVGrCb zy+}YlOW3-G+S&@`hH$^Gl;B)|Bm1}E_<@~-&_YyK$Ydk8AaQZ^X&rU$_&#je982IN zXqV-qwD>A2D{c@-c?>~PAXEUej$nDOrUZ}b%TS(w5jj~Y*uQfN&Kx_SK)9sv22LM8 zLV!M}1{2m3-tX5*=(|c~IhN3E0?h);3YaT&kt+n`3C`l8krWq+-PJKWqxumn#KA_XqFIq zB|Ntjr;g`hTUG{vc?-6rry)HpRn0mS9u|V+qy(&AzZTuPJqJU6xLWc9(XCrI^ytw; zS$g%LELIggD2p4$YcF-f2*U2-IkT~E=T2pT|Lfr$JiJ$<-D71r?p2iIQT1*7%{Kg# zWvRY)3O937u_t5=BIZrTWb;m<<>cz7}nk8cou^ADry#$lA@9mK6m*(kcO1C^I{ 
zp#Jh6{C)Wd9$!6*#{$dTe*ah^YIg^sGIco$!lvQMy0N&t+67lvI}>6Zkhjua>E=pD z6#7!D?bSynf0Y%tnNy>2gW!B~8JCw@B2R#N>1bSAKAPv7BX1SgS6e9E@Fgs-;d$#U zC`%MDzq~be6akw1t}eI1iP@uZc!UC(Bm?7uQ!pC96_3?NX2W8B`C+hhM;UqS%)wUiGy8GoSJ&ulJGXhF9L4i z2^cYbu5z#V_|qRSYP16^ZQNis+6HUa1>mp09^=uYzxn&Ez}eFmuqXQvQa5i$Y-|QX zqcaf_y$wMT83?2{5sZVQv#^P99LQyV8N5h1UdysCBY-XpPsf6=%{rxH9wB#5cqSIa z?8JOR_B?{|nyvd0viB(Bj-E!^`D@sG@j3zzoWSyJgw~ARn7=g}^F*21n4d);&f1Sf zS%)^y_jyU)XS_c^AgSobHfD*HHA?mC7QyN)m= zP;VzZXY9qiq#f{K-MXy}gZugjc(Y?NbyG4H5}ubPu`{sy2)-LQO1XCQ=6og@m0x^8CKL*cVgJBET!O1@c z9_$pktO$ekl0b}F;tz{u8&sU;cI}_X2XDR&7b^>l{G$_~%q0^pQh-(wSDIMl2U zqU;7B(Qzn}T!#~yM_b5v~fK@opb1tC*m@m4#G>d_AKdcB80 z+kP1I&P%Gp+{~~8nzOSk;R9vegg|WAtfjK_%DX`<#4` z77iVWebc66zmGQ#O__ub~p{kHiw~X z3$~+!Sf2ai!XkT=`g`HfGPDacg7xe=KEPnPDU(pS9Pj9RDU zT$$E!b7WI+j;9h7G$%JGo+2zi0nI!nPJo)_(LwXm*Hlw5Hn}ein8OeoA&|U@3eu(j z1v-V%xG@|&oWNlgfZcL|es3CgFBX5(1yb^`Ma+%CI;vZ8BB6~%d|EWVD~@_ZcF zy#@PsXDPUqkZZT{FQB~a8g5tR>tIrYOc|7SpFnq?kbCL)0YdX;9ND`ArTNzhxD_~m z>Nu`lInpFy#E5*u8n3ro6-MoU6#}28fw3T}Y z?@1dBs9BlSL@CTmbp6U%Y|Tie1elWv&M69_(^6BE4&?5`;6VcvEGa;hfl6G~t7Y@% zc&>9N{P^wHm@w8AGTr@^Q^#V(n;dppB2#=11;L+K5+`F8KhlPi5zvwjX+&qoDd8csi+6g?k zdI*p54&m|5T;Ng`{yd&QP!2#r%uM8M^uW!vPAFJmgImk2QL@q&Wxft5TkD7-Ut1Kf z;rd#Bc-ABGnI(#rTTr8MYpE#;myW`%#bzj4Y{8Tt&Lv||K!`3}VS$2` z76fdO6=khyHTSQfR@o7xZE=FDp4#dSd)`)c*45!~dM6XYJq7CcE zpt%v-Q=qvedh{DYXig%yohCG2M&K5K<|UZ3aV}=8TLib6Gw|YDT72hcdb|gx)u9-- zX$#zg5-~S19ijUV!#ywtE}X3UraAMa`K^qdHfaXxT$8xKdPv9PmugS~?r9GyMj!foyz-WbdM<2)z9YvK%y zpEwg9lcvLC@(hfd#AWa480$R~?h|IhW&C6~dw9Wff)A$6n1NNReGwEE0pEZ?Oq#a{ zp0nn|edYqV%$Nr!LcYVaIk2BLo9S%WPn``L=4VY=F#pj$(_l7XDonkmV8pmd7%|od z!(Ap|h~qd6br_3b4jvdHl65B!409gGWp@m*cg1kt!w_pn47GN`NN0CgdQ5;L%jq#= z5!~l4ffKLa{k@MNll64&@C=NM_&e$R4B9{UJUVxM1}0o@*31Ci-v0_7?39f0i^PyM zkr=!(6oZy*#NZ_xFl?zmY}bXsi=C(Ogl6ZZn_#nOBh0wZe5oG>IeDQ?yJzvi8*jtO z!W^S|_d}P~hIpxUb4>X8O9YwrLxg#M#5xQ?lG8A3b{~n%31Vp;gA~&~%F-M&;(G9F_Q7uZKFGG|g}oDPG5p4}vmNnmx99Qu 
zYuz#Qz1Ly+<%e+j@k@;V;|EM1)Cuo~gMY)^LBC+mpkI|+%C!DJVS2xxFs=9ZnAYb9>PPte_8r`rzr`o-p)bE- zpLTA8*NvK^hha0kVbTKcvE%=K*DiRab4OL)maR-+$ZapM%+vWUO*0*+e7SGOPe)z) zR8+@%V~@vg*z4FEha3kGng&&R70lw?`I!r(P7$ZJ>7iQml2FIVj#rR&o!n@BOaQy01OzQO`)>;k6M!OLRwjF^` z+u;bK!nqt_JBk{CXy*~wHPab~mySoOmnnj+en*V+U}Q|P#_riJN=H`t;DX;YTwXd3 z$7Z?1`STCaneXb3Mn(k3P*m5JAT>S-sr>Gz5|}lQ0ExZa2+o>!fu2=_3H88CKH@gn zr`$rxf=T49B=VLpRwHj|M$6>?KSJ|n0VQM6#^KFVw(U z57==Mwol3`ZNUddro_~snLi3XJn{q4Lvt#RN#!}4d4BAs5X7_W;Tr>a4x{Qw ze+6&7@hZA>Fvd?`yn~E*?h9T`s4Y^y$^y;xgwY3eMYwb}7f1GIq3FhWTs?mj7f&C; z)pJL2^U^8YB5YRNx`G`Nwc_|e)Dv*a2-5{Twyfj|Zr>I+4GAkp2q!GxVZILuq@@JV z{oB*AmpXGK7k?7?u3k8cec3w|Fh9Cqk9&313c~N|Tr-qqSNwWQ)h2<(bi1=;GwPl9I+Z8u?4)2#~DeGjf%D&A<#=b{&T;E+bePVRvu2Uvk<&`1)NQX;WvIf-+8Vro^2&oYWZ`g_yLNwn2>DP z8m(Hlg)zamRh!l@Hf{wY!#3R43MSm&s->}JxiKL85$I)RK!NP8gzQ(_wZ*H(MtI$@ zIR?Mb0sad$51-u2yh zRCXMHSDnY>>dV0GD?q_X;KDxqbs`J*_9mbv%OAI5`GMHzf@^+`gk?M2T5W~$)z+w3 zX^DyzRw!F(g;IiTF}D}3At0}{LdhB{6#G)#R?Opy3Ae>7#t@!Oacjv4<&IIbXcS5o znWA*Dq@z&0#FYC9)jYP4$L6o3R+%e+E)Zy5MYtw_E12eb;c9XD7=v>&hTz1+zPK=F zG$I}Tfb&mpq2K2^Xl`amaF&69EzqOyK;@pXJm&;b&Rj%5T09m8FTLsc*n}s-GAscWVFYHvvr759>vL2$b^--dx)~EFO$^n*(9Yb}h0nTJ$VAEqmU@}zzCs8Qw*pz0KRb)Waw*pk z>HSvqf6|w0N*QEX%Ah|+@9(Pq%unh6S|h1IlpNoQ<&*NhM9BL7jps4s%XcyC%XcvN zi`UWr(^v5OyU*j>o?Y<4i*4~jTb7r{>&x4Pzk{~?{d8*8n)_HEOr_4A|L}86jY)$~ z(k@I$-V3j!y%-m}9j-FjOh_WULz6IpZRD{&3QkLcVZAsIW-<_P=?3(7ngElw&k&m5 zg0sbFnD*&|XWJO#rB==1`rRi8GV6~x+kuF89E>EF;Yc5AhMf~El)rMyXo2P*5IdaE zJZ2#5zx+V?4Uhf#BkY;(L|`6)?XE+x-MK$@IP}FH`@T3}M?mx*gH4XZ@j>VIs;+$r zf{()H5SquKC}1*1zt#r*Uuq4T_c~%uKLX*$FQ__gA%iTLe)>{Zq|bK6&Kd61c;w9T z#QwQnI55u}#}-XdI!+y$En6Ob{* z1>2^3V%L)Cm}NOag=3SbDH0?6oi=T;ve$12@ADl_&9K7lxGAVg@uk@Hf!R$Uw7dQwDsV zTJRZGzaK8^E#lAf+5ezhXMFy0SA5^&dHhKI{8~5s{`yPk_2$deE9mo95A=QaH4J$F zbqxLZE!h0{1;!8h1CxgL$AEWw;+y9>;g7dphQ&AUV%i^HVs5{m@kftW`0TYsm$q$@ z6d#F7ezTKf#FaIM&`cmzmRo|ez$#fl0zPQQPK_rLn*g&0&EzmW(vLN-v67io|M{0{?z=bnF=iHfLU3Z8=yHxEkB*-6XO0Sp;?wSX*|niVO8BYkg9)sF&ZNqH=l 
zpg{ZIYFefSal1N)%lZbESsY{eu*3;4vo|TQti*PdaDu8w$?eJPd5ThaT!IW_4B@?m zgdi$#6VqV0*jV80*Sh1a9xvj#POb6!OI#3u;yRbVWnb5q2;C)H?9`D|^6^Hld5R$9#=iNHx%PbNX3jyXvcL;%2 z&BCmpnV@_}w(-TGNi4xK#Wqv5u7S2#vL!D)G;7PHyk0qE_P&JJaI9XjM7f6uG}jPJ z>uYW+xUMX@g|jD);`GU*sH+xNkqVYWV7UmD1m=p$8z?Kkj*9YYC@;E<;=HpsoSTi* z#02cyy&rySgYf0&Kj52he!%zNf2;iR?Mz_8mv&oz=-V)D zC6OEmK?X*IVWdHJuPOp^-C3aO1W zbNm-!%9@3kv}_^X`&_2cZiaTx^@QEhAb5mr#@LNfm`@1yOWXk~!j)S@8XPwz!ZA1r zb^)<)2#AM+KcP9004>`>Q(#FjGba>VgvP=$oN0JGED6mvL5VOABtUPBffvE;gCF{# zMcXd;q1ONm8D@b&1Bb)h!U3=J{D|e?v(TcY8t7y@;V@z+1qsEW!jY(trVY>N1%;s}3S7bgEGfW3z(ultB81OqBdjAOf-ao+l zw{KzB>jyaZ{TW_Edt;HqXoS!3MEsly*x+e{xt0UrJ@i+24gLk51Aihs`~sh$e_-;6 zKA3LWA9KwHVd?1MSYbH|zIJ9<<1`v;oXxP>#T347W?1WC4nG3f#z_tcobH0aIqnEs z;)TRD(~!Du8g}~6z}|rA*tyIV+viy$cZn@duX4nxW!5;p&=kiO48yU7195c00PLO7 z8yVieAlmg?Y_RUH<|XgO{MZakW4%uZiigYENZ2k7hQ;C#j95f)UhI#)&OR_|(*^Ip{svsejDmT; z-U`THGHD6RFWyJ6)j%Y>3__wyf26n%!`88*vD?!UIbP;SW~Vh~D4}@-Wibfu-+!vy zGt3G9J0}xP3C&r~gR$FXFrj%6_HcQ>Yo>iTYmd%Y6Jb&`svpBTK8#kAa!{vqUxVB&{t}paN z{<4X#1m|>pK!cy0wLMkilA(TT$ylXvK@|0vBxn&@=>lI zn`DdQ)Ct1y32r|%#SUkt*y8MTTb!9;gR`@2ac-_1E-Y}wrG?J8w!|GbScbf1V{vt{ z2hPuT$LU2bSZ47Dp_%Vjg0qapc(-+HtRCvm}!BU#HqL)I}Q~QV{vd=e`GrR zfP)?bamZ~Dj<^lQVUJ-r;xigCraxfSAMYX4Ya})kn$skD#=sxY$%q}b*2d`AsRP_pOanNGf`NWrx+nw!KaMn zios?bmM<=O_v+DX~3*OvmW>wCDvRGg0&la(tYFK6{=SQGz&26p;;`=Iv5sk)*$(R z1G7lqcugBoE1Mpg1)v)%Ph1}2SRX1n~Gs_{VTG#cEq{k(|EbSug7cIaXqJgt?j3NY_nNK+HH(bgX z1&a|w@otZo@p|{?@M70?c&5D(>@0?3&(3t*udC2mXSJBd_v=ehlz#^MccDtJLk)phrg45yc?*B?dR2w_IIu4pYgey< zow*&p{^(P@+NBFRv}mp@8*Q4)KpS~6w?aFkwrFeEmM>@t71v5xjb-8yBjXl)vGZkZ z&Az$Jqgz45s#(ep=AS|~;$M$W}u1#xnFg8{`g*{psVf1J3AbNs5t_Ckb zdE{~wZ`|!9R z3y&`orjG^T?w%E>-aHMrLR^tY=q>QIMbR1ul&*C^8R55loegepuw!b45?>2Kv$;~S zuV@UxdNfmRCnT4ywpA)wWrI=)9k;?7CChmpfwyEij}dF~5>rC+XntVLaC7NMbSFtz3f*oq)PNTzPCuu#kSODQ>SInU>@c?)cDaneYf@fn1ZGffd{{RbSr zdIJMK>!~dA65Yf^7k{|>cR$0*FBprn48arv<{QQ$T zw#_rI!I~iD5tarw|47UUO~C4yZLnSz1XqFPkYv~dijQ(UZ2eI-e$8B6MqVGfo@W~lfBPk5d7C5K!xY)yPy)AKevNg_4 
zr3lUE3C$Ph+T)5q^E`W8m~De|v#hw!3TLKS(*`Bqg~t1=-lyHy!h;k=+@;qv}2yeMr~lg z{0xi@2;N3$->Ma1xeY%1_yf$HJp(fC=Jcs!WGH^$37(td;;}iN%#^_81c6PB4Vpr8 zQ+E3=XjTS6{aC36F)x8;k&dmJO3zeHx5;tBeF~UG{{_u)IwPl&2F*z^qNkx*%EbJ2 zZ25l#XO{g*Xr^@K(}QwT$@XS({7<9-7f(6q=i=DKIwz;lC?h zlelE)eMjWx1m>jDw2Bjnzp`G^FnY=H`cP_mOS8Cni2qC@6pN1y6+o{Z;>4#|g_mn;W!77oMPUuV+~}4e`7wVsj#e-FdJ=0dQ(jZ+SUe5Q za(-hTjd?Y#LlKWx=j$4o_n~jk7+xb*w#hn|^;1?RvR%})=+V-)A$-W#o~m5(plT6E z(&G}5k(7wRz5hVZ7oW$Que^v(Z4B|*2XA6`Rw{0n72sZN1yu8D2+nEPlbM2jT;9Dk6~}XP6nsj=j9d8^k$3eB;kO+5*Uw=~YAn)I zqESM)tS-w(?w(BS*_MjEyynG|hk0HFP8~gj?OW3b=y`ZhUyD29mZ2NGr=L{iKcQKy z%>Q3Nb0auwR%&tO5cn<5zlxa1U#xRrN-b#$E_gFRTjcmeBjD^4Fif*omFS?8$;^$Ugl7$>~at%tHEyx#Lw zbn4U=JzskXdv|3a|I!6qIdc@(*(oVFdjf@LPvX|)b11%X2{+Ci!-<_+5wd7LYzFkj zdoRC;wz`Q-BvL~szDzstWh|r9#UHt~u|&RT3qzy!Xxr`?f_X=D5`SgBtXuM>+nli6 zQmhs{U-?lJoLdm0jSX9;ix&TEP{&;k0C;lwR#r?c&g7O~15J9Gd08>g}DbB)Q1?l+nQY7vlT!p%wb5OT+ zBJQNRqb}YSbx{_m3><+{KhbCuts8?|Yb=%3xYW-Ml>v?@^CJ+8`^8!foJ$GKB}$_; zh~_@AK$nPbvjS%xPY5nuK`m!K%Pdf~jOQ&iN5N7v+*oRg8%svwra<)4(S%`vV?y(C zTf(s&kFi4`b#s{&Zt%SPC05)vMgjMYWtO-x-xil9o8qj`5S*N0hD~FBh0|AWVc1vi z@`EOSL15>_K>JI=>)-td%f+j(BrOwByS8KXrgd2Cw*>Rn5|~#l!L;Se(09;abxx}; zJz%w9BgThiz?E=2BP0Qf!!uyFIuz40_rpFo0oEI1)I=pV8{?E@ke|#lXA_bL3re8b zQY_7(aa@j5mS(%41VXb642*@t>Hu_n`5kn8zB_*FKM4H>j6k2>12Al;DPDc$O;zUk z4kJ*pD}?R61Xp6`BQJsNoV*Yf>8ntkwH|djL8w0*j=N_h0!k*qbO#<6v8_v`%?Yi= zy9l;f_~-f-{BzzprHgH@9iZ7iICe*^Iiwp{Uxv8ns&%qBdgz>a&*M-uC6F z-?0L<+X%PYmZNUlO4M&(g}Xa^QNPm{cSJi@~$T6V;%!=Wu_U<&$q*Q-$^*XW-1E( zCZR6S2NiyvC<~s6v7f$7Xl@Pr54&Uh=g-0U>sMeb7EyMn~VC0Ely6fClov3 zq<}Iz0*Bp39C9_~x&_X7Tj4zKclK-_*(Ts7eZ+>wv9mZ#~s?iZ}2Zz^5a{GxA_sZ36oJ0?u0UlLmljny%YLi zx6AK1{f*AR=lq^e4j^?xZ&9|F0kql9!Z8E0-Rg z{}Y(?dIFlcuGs6}z^qp@{QxuZh1GyrFC8@NfH>(X*Nms7!C+)V4?({!5gDF_=ENtU znU&0nQ+_c@jo_@;(>;cG<}c8!2WN>F*bqLA()%ro8hoDr55O$a8u3X;)k~sS=%t6| zMuiZR!zj&l~kIpa4#a}3*3 zz&TolEsNDH$3?~}$<*4)2bvwActUf6NP1S%zEW3&<`*t;!F zf%BfsG~{G$!H(15Sa|gu_U=f> zo-9H$^SOEH3?ALBCP?o@?%rJl;d156EQ8y`+I>goM)4#t3qTWsb-=9rm;VErYY6ZH 
z#S)c6EW>(e7MBc(;-Nq1KJWS3#nY_QP((+DsF%^^%?a4GBagaj`?McV9f?*ofHIP4i<=pslSx}V-@2`ex!kgqk(#Zp z1z*f9#OIbT>Sl!HX7VRR=Fy%nd_#GGGtW<6c?RJ2iUphr!exFAs0eT(Jlk^{fq0#@ z2FL=?1nUxCOD+?z*ID5fRmA<`rcvxmiF=3mJd4Gd&{QfGXTozSp;`QxZ!H;(LJ`+* z5rzv1#|6u*3C4EFUuuuS#gba7scZ|G%6_pn%eK5ZHn=p=6z6<~wh(@!VG$4wtBo?iPp2TlF@f5gkgW#) z=_G>ztq9HH^K2Oy4X5yUm@iy`R?l?D3$MI}-}?Ec7IK^{Vda8 za4BdK&afkK$=3nrS6JXG>*yxyFQ2+WU1vw*>MGX7dI#ihaz|mfCyJtdkRLe#H$r7v zV^>^V=YWg81hdt4xUkj<*EV{fAY=lH!+cN@>Wy2Q#vwm|(6ZJZ72>nL%95!KQ??!R zzr_wp;Zh5VZLowXRj`!XR*1WdC1ICsPL;46x7ao}7mUKq`KBn8nd=s^P3Kb!$FRNm zoUq+W*a<3KF8=VAC|_;K)C?udhjZT$rb7tPLveG}U=;cGL&@si+}{_KEBm3E>y^HJ zaC;q}(G_Oc=h6!)W`7{b&I~gL^hVp}E%EgbbIeOS2%p$oj1SF$N6>am2+hKR_}!Qq zoD6rr2sludt3zPEBnV>`hhQY3dCX#e{BGkxXzqyjUlnK`1;-J?(4%W7yg+Ck^7bo8 z7;BDYi3`UOIFaqijcKj@Gc8&!uukymRHSB1t@<8b-7e3#PxV2#lJU{J@zR$OU z+vl%g;+HSN<%<{5UM#nKK19#AZh^!JW;o_O1_vC5V5eO_WZL#t+HT(;yPO9j+jR(X z+=pQwVS2ykNF4AQg~JmGw-c=i%eFW@$(}&$h?8Q4=DTLE%|Pt69fbXMgK@}dIF1td zPmD9gDRvZ2d5^{^o_mULe43DaV#;WWd2#*dL^CzG?+1wMT1bG-NZCwTYG5AnhKAK{I+-bJTpo>$>X+qTg{k*;003XlG}hs#&aAw4xw zeYXc(V$r@+_Fz1VCbd6a}#iuJP4b5XpUn=$MSrc zrcp*a>MQ?gN&YhMP=!=uLr6vb*KvCNmvvX9`6mkuv#uhw>B999Gz6T*0u1N!E?ZMR~>vA0P`j@ox|^wJp)1~g?Im<vJUqNxsa!XE0&Ml}cx{k|dk057TD)wzp$Kk!(QB|CWdo^X) zzh^tHT|TF-tsnF!fGbvJ0b@NZH?=fNyl4UD|AuBgEH^^4-X%j5^Wbsfb1e{j;^+bX z*!V(;*Wz7^ML$vgY{ZvPT1vc{&ilarU8t@hsI$CxDsG|Z${7?~IEAtsS8(Fseg({n z<}HTr$_-exd;{y%7vnvq!(x;@e*Nwbv^Qz5fVN%x)_A^KSF~?qf?vM*3PXSIh2eew zz|cQ_!>C?;VKHzRY=)1-Z(n?h*V`E2%XS7B{z_L^zw;8Dzjy<7U-X3SSFgjn&lgx~ z(HlYIMk8{HGoogC!Pmncecyf!zrFqn27K@qUgpd7Ieydyrqu_JsbLFa7&RwU%b!|} z`C&0>0TX@@C8X2WFL%XieyGkTtwK@8DqN49hBF&oaVcT~@{;D@YT7cC9gM;K%e(MT z0Lnl$ztI8QznP8Nt66w>BMbM=#Gq{ZB9tXhKy@4eIl>0D;TEV3Ge@0H^^sP15bub4 zan7iVaztf_ElM{MWc;mBw!s>ugy1rrD*fzm+ut6g0>cDk8SGcIMr4Cqz62%lX{KcG zpSWlUC>Ig9ZY{SaOzPxA@Q^URPxpO8hMDp))UMN0@y%SI5q z%#=%pSd;UYh=o}o*_?naP$s}A>o&MN-4f>s0O!1i36o*eHK(ek` zuhKaw7pwj%3)V5)fpA;6+8PCGtZ{?O*I8$IgxKq>qbsblON7=-^M>Q{{GqtAU<9r# 
z9EnTwwKQ+346Nk&;xfbhD|~HH$@Z*br$}^rtsTnO$X^S1-IZ)pol4{{2P^GRN&qie zBR=lp>+XPZ(R#vnzL+Xm&gY74U&0Ph=`shlJEp7D{`1?0fG!=Gr^iP|o?jd!PHq`>eg! zUOMMoYyQUAV~!~@$Kz$@IN>%5`yIw%w}%a84IivQnY|}g9K3>=8Ry`eb{aF+9mAaU z2eE9^A_b;$g!431uidB|n!Rl;(YfVk_^f(m zOz6}`d72N;vcmy_;eG|p_BiTehtvKJNFLV@DPzAzrrj_F%}b5PpkK2(2(}xIQ*+%2 zuVy$t)f}e@v8TPwlsEWBpxlQYZZENA-A!Y?O``$gL*u~%d8^!TbGa9m{?Hu`U7N!D ztCsNUQU~tc8=!BC2I$nJA==ifiFP#HGkr{PXR!-@oN0~oGgvM@lL(#@c)p2TPQ+2K z@i;Ddh`H{xuQ4w85rhf2Kh1F<06XH=TnWLj#R->jIO;G8C!LLO&V4*C5w@>PGs9)- z>U48LwI#QsW)Wolxtwi@pXS@($9V+h+19w|PZ$rd#pOVIT;%fFLPsP{9)}jSYA7$W z7=lLZ>o!`CMBK=p*kJKBN@(ak+3H1O&ke6LX5j(f8^@;+nrBSJm6@iv!sq+P`Ob(Q z*&Pe|evVyJCm?4UHOm_Q;|3{bMWs)w!i7+rn6Lq>Rwg1SBn7KNqOp9{D)jk=@9wVM z5gML=*!WEdkKBOBn9T@}T91(MWCVrAaoZ>a1&3qIC?mA}q67S91|lV89TMZ#At7oL zlDK`)@^CCzun_hxuISnKJABcuJ^PQk>fI6>7l}Xr{*KUm3%howv(MT{b|TlY-`ce! z1G@;!S((C8i4{1f;SB zoT*)EsRKm;UUf{Z@b$o~L9+(Va>*Q7Z7G(9HAk+}a>R z?yu8dx6}P5=g?U{6`>Io0Icb+(44Vl1H~oFKtFMbY^#TD?S7^45RO?vD%~5GyQExH z*%&}`nuwnjH1mE9q<&X$%tr9vpqa-KXckbGX|cBvl(%mZrDJ>A4z=8-pqbaHyv=+- zq>NaG+38u>mA*?Y_hsj(kdywsyQ6K3#^~PRb2MpK2hkD1gyw(aZ@<6c;vcww=O!XT zRv1uU&)nzC z$s^b)@n3w&NIqpH0<>7}dw0|DNZ&-TK7+UKp5vESC3sf!5QX<{Apg!!cwY1vn>Vh7 z@3d(M368*=fW>gI_rl;GCZJ2VzG&I34eC{|jXG82Z#TZgt5-p-TGd$g^P_cWUbg^G zH!eom#w94-v=UFYMdJ0|wFoo)9v)w|!M*j7h;=l@s5Y(8x83LXVPIc;(W0s9L?rWo znrxtkHfc)J>eVpUY%Ct8twL$~D%?&9z}+>0C`ez1=ldh^>`*L9560l#<<0o_{eAfN z-FeiL3DPQZ)((Rj5#94~e*!qc=VC`+}+>vcAGv(^%C*P7$EbryKL!IIEyg*R!o z_+^VTe%bG?XO!qBwab<%cJ!v+;bx0{pUdIo@W3;N6ZeyxSR$*V|Y=>*nKGQUK54 zkAj3iJdW`t>=Bp*n!`;A4wlGM1O6n2nP44iOA!(!JVPLwP?#I!gj!Hf zBSwtH_;Ds^qw_Scv>bx64S^_%bU;b81D*;rN7=9*>{vIf3)V*o0b1a>M6#3+IF&ay zi0j3(yUHF#!8AaES)M@zmX)wn6J)A<#_F8t&wS%GK+mXZsHDwzI}GM_csx;&U{w_6bIP@fmjbI^gIWCmag2 z)1aBae0Zv@n%pFAR3F5R=!u<#=AAC4SZ*>FgIYE~>NHy%^LNHk4^tfTHdWAkip%pe ztZ-+c1MV)eA@JJJK=h)#`3!i_IP^g7DtBBY)b8+@h}HHZ5i#K#EFaPqONO+^!tnzU z=3s3T$(Fd}FN5mL@#9=8 z+?a2Rn+qIpbG|)gi=P54ag~sIgUdVf?Qoyh_GpPS9xtDaM|`#(FLOoiQa81Hv}`gS 
zQ1_QR@jNcLzl!g}1$H<`7(YG30vG&kagG3ena56Xn}`;*Ybt|A2G3dc?1*jlqp{kk z7t$>IplFRZo^GF_gJwVE(eOGseH1RvoPevdOmSHNeV!AN$M?aC!5xv~GZDMI$6^3V8Nn9%wLp* zz|lQq$$k7W!NL@s zzU+g~+qOl`+7d7Q2@;cHl&AUe<2%U7ArEZZs6sT-#bK5~-rBic^=Ug~5;U%pr~_e{ z51pNc5`GgXRiP5^0kcRK&n$!X)ONyXvJZdbKIQzS*h6DGQAPHQiggv@Z#;&C;cBVu z3?;duD~|sc5w<@L%px9}eRgKL4xVXDlskikd&zc&l68b;ZpcD<&uvasgkxyRBpmMC z^l@czJ5hNmGd(!#;7|(VKSQ&2v;Ps8RpAOSYtXC>STulUbu5F5P{7P>m3R;Qu3ulV zO@-w65<;R`Rcti62J=2J>m}Y~ErUh{G>ZtJB4vPR(ER@W@;s~{378k@%S8{(>iqvJ zG>g~(J}R9@oZ2qhw2jbAKo(fmLG!ka8Q7MV&h-+uA@R-v&MYHk5lZU4EbLC-t-Q_q zcjaJXY6|-I>5Q+swZ=ESI-zOf+6ZOmEztbOA8*z2?8#%;keZBryE3t7*DgGM@DO)z z{e;^$F5+47L**epck~d>A3cJT`}g6>>66I6dlOFykIzc-RH%c%Y{A3ZIB_rsNA_mp zG1vY2tQbFDIEBlnk0I~Q4ZM9(q*C0yef>nu?pF@WdJ%3_>@uNNKO104;bX{j+HZe( z#{D!;vmrEVP%N>`e}!f(wwb@bC?PQCt3istzj=nI#kn|scprh8Wu7U(oWbP|>=0<) zu@$>^ZbSBtO(@L2gV%3L@#M*46g;|x+`B)a;QmcKxqBPG~m5nD2*SG&O$6SWFsXgl^4S zp#``18}T*nL=#@arr=407a_(A#j#!}Bq$US9Eu1QYR0HUU&828oMy)}p;jf#`m!}j zHY~y8)yr@ucrNk|Y)0ADL)f?~5N<=h!H6$fs5D5+o6Ym^YnmTk zrF!7i8b>@&wjiLI;niv@yj)|27pc~Gxxo(4*V^D|stsPOcgC~S$tX>6rU*hQ-Y7{D z2=?Ln>D+!Ap0Dx6)78`QbnPsZt|i0~i1SnD;?bJ9xSKo&*JJ%~C3+?<#m&a~xH&i; z=7;Qs9@s=!SUpWLE0`dDss$2!ED<}!9ARz~5NaNIz+ebWxBMs>!c2zwNU zTM(?Qkt^}dkp%cK0$!*c%bj2tIvIt*4qWGmM=Km~W0n=6nZO)ifx9bQk!m{}fy4X3 zbHqTjtfzUJE3tlM)-)$4I|K!-gqP=Z1O_g}!lkP)Z`mTuU9lK53Cz=$EP$P>BaN48 zXxO3+d}G#O&h{MmtWASQTrxakS7Y(6LsBcz(OM-E%tIA~ZL_EwiWJ+RX@THylTpGn?Bq3R)^`k;;P!eQ=BC3FZoX_w1 z!Lk(NLn}WCxe#x#4ALx%ATaaTF9^xcd7kHS?s!IMe#-BXhx-ZJ_Y*?#lW2DZ$l@86 zrFfj55Ryfuk?y<(8c}Lu8;aZH2?%rDV}8$LT?!26M!RrH0OM~0%em1WcvLP=Jf!s7 z3pnTVz9a;sh}Wt@L0A?=;gfM?ra2BfnPQKlF?M)c!sP2Ns=h~f`61}ob@=T#i&*Jl_`zu^DpjkC9vwTvgGSag zCwqL^zAc(msf1xIKgG_O&Nw!AvV!Klezw><)dq*BNCpjCh9kq-7(o_B z=-;Fc5a>Xg)iYFv$IXUgS(*wZ`owc7$C5uK;r>-zmZFtS?Xew20^O zHA5DmINfCgk2@3@lZRusuO*HKIO3SE70w6PA|EV{dt-wC{TmvPDG=>AFvJmkA3FW3deG;oR}EQ-dC7(b$da0!uO z-G+El9&A&rA8wl);(~|4tkW!pbb`7k3sVqGSK%EOq^tneX+7nf@yf z5}t+?!D$Ez---oGQZO?x41sgQFlRvs4eBtAooJ5FT6cxByAR@$*CHWl6H*ehu{tpm 
zF|o0jzjP+Xn2bP|UcK>It5&F6zYeNa{{+d&aro!I1e$N-!2T?xr;}X>irdIX+l77e;TSk_BHvj%_K-}F*Q6@W%@g9>Qo@*}`3`Y1H>rd9qw zel~<=LzTPxDLo_`-ngN(fr=u2-jdGcc5PoNKviJ8dBXzNx!eucy(b<=e@^}VN-U2a zelUCXY}ke^vXu486RpKP?+`1Eon%&eHg<_J zvIxz)keU>WVS{>L+~{vH-e?e7$UrY&FT(P_@aNxt$KR*}dv~j{+?%}vM-Cjsz1w$j zhtQn+kTCqJ7`J~sk8{Tl;mnc4ID7OkZeP2MN4Kw`IPV@_mKCZ=KyvTj!j z_QWBT9paa#g(!Y}7ti_mcLMM4GA%d3RmC2w$wA8SieUSS%hw8^3Ay62CgjQlCzsBj z!UMv&d{&_uzi6QudT{>F&|D#jWtQK?%lzh55&n4l9FGXThY#c+JBvof&P-b8nq+G7 z?5uRv-VW@|#IvWlc=kLG1^Ks;```v1-@Apf!d#W6?(05ZqV?ykREnXxHEW>`C4)9< z6P)VvrQNwsbxioOBbIqPVZOHun$)U}dR1#+?BG!_8ax^!zWV_q2M&kv@QE-UIu4y0 zeTpx*U!21z6voZOQ-a$|0>?8#U6DYZc=@6wB!P-^#jCY52)kC8(zXUBx2%DSORZ6y z8i1ce+;J;-3T_ZEkF0dV`Pf;w5)+8qtCwKkVpn7?aKM$-b8s(pA$|<8fNW4f7!mI5o@T;sREWh2p6tA}~!OJuO=S6r$cz%%WGn@KF6Te&Csu9WAtg!7(JUc zKwAQ)9)To3{Yx8rRPtb_Vru;4g9BZ;(#9*-M@fmIqjB-N>dMw-A zaCXYR!I1lQ;|IETd{_`nHlxtE8aOx1tdu11u%jXibl!yzwZ~3y%oYk9a)M;|RjF zY!~TC827;AFfW{CTioYtgB%wtZ1J+dkd7@?nVI@8Ld>b#@Xt7jS(^@H`PSoDx+VwS zk_{z^^$@oa=7eU8NCI={It9(OehU<$D>sqxIt(>A83sWbFFc1jup-X znBxKs#0&n`xaLPgk>GeIz!DD^+Tz-5Q~bEln$T>8>oP;;d@I~qKp6>- z3D4glKiPxOJQc;8Jydu`ev${y&KiR&0h92fzbUTqo+Y32CaXaRAK48Dr%$3GF`o6{ zfN-aYXsHJLR>J5pJG$yQGRL$jw&>Wt89UCw_}ibq;UE9_6TkiT z7CG4h+gov9cNWeZKZ83r?%?)MS5cVzGv57Di~<7VrBg?7{@5{GJ9icjZePc}TbI=D zf4+N#rzN>Kb|{D1!~GB9_VvpOre8lTQkg1#BjEn}q6lvUTnVtRpBLfPb1q+%;uU{; zMfjD8N(7iSm@ZYId|+=ju3x)|KY#xfzY?0?>VVl0oCS>kD>T27--`(91^E39kNayG zZvJ!$2lws59^P+u)=paI1m+CB3^Fs2wNs!u4d>4s=5-XHtmGc@9$se|$duy+IDY5= zdiUz7rkQS3zmCeDP``R*HQ_>CLRpUn)!{zqE2PbE#-KVAl4(I{l;8ubMZn6Wpo8p0cDeOeo5E|A_LBTp7lxzw>@uoQ_Oqn_aLum#Q$v9MYej>yAjusg35i*pVm*;qNT{B&Eej)jW}#BehUnI<2gZ+|fU%>;^Z6f# zi6*va&5mBKd6CU91e=0=ush_bZ;}}+MIf%J& z+u#wD4DTcf%~%K1s8pDRuYq;&T3D~*=jBOo2wja~0?jq*p>zB8aIv+(6eoLp(X@$z z=D{r*Vkh4j$7np}&``_rvBYjqYaH|IT) zopEHAJr281#E~h4W|^?V#{`%CtO$m-I5X1>*B02Z&UoLUG!8;MkRL(=BGgM|rnoZ4 z5?AM05FG9B(>z<;r0!7;-ePM!SZaeu%ennB z8v?T}a#ykq@L4J3^Odg*7$=r1fteCemb#O=Eo5C6Q2BhH<+2Ur>f4nvXbJgz_euCv 
zo`g`*NV-phcDKu5>~tE8BLv%Hv&?XEwk4t27_I8mQl4fRw71-PEcSViMW+2g{ABBXtH3+Bi6BWGyD^^Cp-+vxDwC#Z^RT{#_cOjv9BbEoJVpYUOETW<0H#-FW zvx5;hXQlEqo7p*|Mcb~ho$Q8`b*l;CYp^aU0~-@JV^eAvQj&c!)n_!m{k99=VU1C< zMlIB=Ru!w0V*&gPMTF)P$8*?Ukt=qR88RvHFzex2pqYH4|5yJ1Z)ny;cKIkYzt3lS zhTBOgu2cNOW9y(5Kby8IcT47fdzUC>`#2Z%ewFFP@qiHh%nU$#nWf}OV zmjH97h|rit_)`h7SzrR{a!yI3mPu*QET8`h&ZYGC>XBET&AD>Mr%XY)MSQV0JPnzcq#4$a)B9GdyNgl4FLfC9xL1o=;Ys0L!e}0z7TqH(pTxzJCkfO)DQLd? z^Ci4^nu|BOfpVoqk8ttqF0tJuns@eM_G(sPD{M&0TpBL%?mcJ2#UlMj-@^|qx zzal{YqJw7fDoZT0JOpP6>oQnS4aOrVzk5vxH-P3}^`2(xuh1;#=ki4%;kgihym_Lg z1;2dZBo6K0g}pi18Z>9J>~`$HPD1kzdFgChkK4B{60%G1qUa8u6T#KHMN#1428ZcAt;L`P^U z4Y$SsnP6@P1LVCO#+3HmmfB`rd&ZS&vT}qF3{#=+cBBTe-3V z;!h=}w|WiKtyu@P2)^}d)kpoB^;Di|nP{cDsG3Y3P*YR&n)sw@HI)aoa+S)cTCEyt z)U1WNbsC^?gC=O&xCNR}O&foP=2SBgA-d_O^-;G$Jv3?28f}_<3j02v998PnW!g;|`3^y}ZgNoYp_!a?i zn=of{4gxmqfOB{(Ocn$o_Rwh@Eq;j=dyZhm!P8hoDG*NC24_OENMKo@Sr5(*gl1bp zv&1=1UYDsn(cW9LFkGOy2BEp@mxSg?Fd9Q}9yI~RW(2j4Uuw{7H3Y?JfhdTzM^QY> zB9>rEa4r>aW_wT~uqb$@l(&~)Qy@95**0ZaNVqAcF;^_14N6WZX1gomvPeLeFjOQV zAL3OeD9db)WpR@!S3Y&+&hD?2QY}X!>3C-3M$DwiUT4-FQ5(YMFfQ;$RI27naV3u6X z8Z_^x5t`#R30Y3#uxGLncG`T8Y-bb9A2kS_S%(hYKgSV2TO9Q^QPXIjq|OnV&rKsd z&#=T%Uw)o#g-792SPlf|5Z-&RCjnZPZn(Rg?|}IhxVg{5_=RbXI%ube1h0kg4hm%xNVU9!$830=lmc$)`1IN#7)7gL|@jmC+lA7 zji9XBg4T{y8S9`qk8MMs`7!k;z9<0w}gnKqhm`z`i* zjmD9gCOAc~PjoX;o@RL#Wd_Iy7h@cpIu^T~2jjq$VJKcZ1;MB zA4^!Cgr5S8abvC(eqJ&eSxgMc;9>QbRAvkg^LiziWpajgCy9xo++y%i{6%qqG7k7Nx zq75dP8Y4b266;p4#`e@5NMB7}NnM8Zt6i{a=_Htp{}!J%;Cr!lJ>@(IU9}YW_aDf6 zdxoj7?YbRMjpjqHq56}{>pc`kVfvCo6dTy(51PU~Z{x@j;SV}mZ6cbf& zk%CyoXBsg7M`$MB2=8dTMQ|4BpxF?fwexJ}asM+k@6jgK=XeAW+fdos{w(!gR5y7d0fMIVtwI%2+jJDi%Q&2l+Jy(^9BT( zHE|ya&Cq$8B`iTeQj3EocBlm&)~JIr5>E+F9|MHC27F?e}-mR{so%Z@C=m6MpOaK73H&ylI$4@pwqSzoVQ>z z;d#@>bZkl^IHw6TXK*gj*!W1|wv}H0~3El%(Zo(^~VC96r2{ptv1h z_Uz8O+M=NO*uj0ccK$MM|9BJEE}q5fvd4Jy;xS%7eT>|@*Kzy$RTSqx!oAx+;qL8A zcu5$2`}R40`}GyEo5JL`dDfc>@mb->qVr-@Vb!^P#6%&Y{f&s0ZZ_Bpzs9=WoB1 
z;BUW`p|s#0&YwPpLwgC$yR(tCGqW6;CAK+p2VX#JZ$IC>gulHl!*9>>Q18RKz@WR z!N8hOY(*$AS1OIMMydMC+!n?0?4+o|Xba>;TH^sh<>w%4+zYc)`D%X-v&DTv%L8^) zcL;*_*`eJa6kb{7gcI{^vCY#6akj&-!gwHN4(kP{ejQ=dwI%w}xav^9Hd+vFKP9ZH z^mdi2qfX6+XwtA1T71?Kty_0NyY}7DsbhC^@6rR^J9NP}U3$S{@DB*_oPxCFtFV9d zI-JYO!P!0gG4i|bQJ;|9{L|0Ss6k^iuHOg^>eNTW`k$i7r!DYVbHZ)w_Gr_F`hsfP zmd9_0j-5K9Yq!qm)%#0)^>r`w>H9Uhbnk}dEn1+_r=L6B25|ba87|J5fEN*#1ZQ)?lw?nk$xduo*7nGkzmG$l3C--J31AY7 zd1DTtS>l=Jo8vx#dW-8wEHfR7spE&BS5!4Vons!vLBEkOi9UHV8CthqKD6u^m&lXJe#q04mjLj2_*-!lVht7&mqz zjC3;tw(Y2SnipDqPiXd6p%=yR1mIXgZ`}KmEGCk%OR5wfdXd<^N@joay~acM?9W{s_=MM{fu$S&6v<^gRQPs7}Bm48nA8IOq+)(A#32U zG9I&5Z^!&KJ77I07;e$)U>CC%7U64Q9G;4atCCb$hQ;y}7%z*5LzvFf%x6fT*@@6R z;=BH+U$rtmrTRB%sGxc83>##6PeQhj6>@m)eI8cWO=#ZbJPyaFnB$1sIAl9b!u*ln zp(E?ix=SlUvja}>w_^&LO>lOa8BP(RPfxeNF~ajHKMOnv^+s;EH(}Wuk3waJJr5e% zG!!Dd@Hp57_ZBzMT=V=|Oj#xdGI7X@rr1#B1jT;|bW%a3Ed zu?>n5R}e>Vj^mQ;GM~Raq(O9Pz8QAA55X>vVK_Wv5>C&xMv}WJTGBw5e|=iju7X5w zQyiE!mgV*Xj?Ns5lJ!1#x_ug+Z1us*ZL^Rc?}iHjMz}u56u0JD&`_|U;oyQCKHE_v zy5Xe11y1@+!pT4@B)CmfP65et-@IuveEn5l^!<7;`VAO?L43~*=syrm8hu71zb+;j zTO*Lh@vJ#ZF?&%k{O2vlw3+kZ7qAe4bCzKK!etmfaujOUZ-_5Cwa3t517U4p27m7W zgfCi-t*e)iGyIT3qjj45M6~_9C2CgH4DuOMJn;8-FY%=49uDlyz^<%q*qxJ!o!MlC zodn`+@)!YnM-Y4f(DIW-q-&Xn#B|S-=Ue?>Uc<`XQclI&6(tx9h=BG1}vu6 z$Dvs}ry(dCN?orZH1qtzkRK{rq_~mLtbmPDY1#fF?YAsmkVrso2LZ7{1Utf`#5>DU zZcs0|VFj!_kzNAA9}YUyZjh4g#oMeTpEYS-UBm4)(AM%1YsXTROMt6@;)+5y1ZC~J zw!OX*%EKK9p8BNL+JHZOT(c&vk~FF7U`0yyiE?0;GSES;NPoSO(L%0QKZ%Q++t4tP za;|`4DQkY`I{oz6D%BjJR}0mk^CB3N~Lq&r^CqfPVhuF-i;Xs}yq&?)*s5 zEyZuYy~3}A+P}Tip!ose`d0~wpk#(X@hn$J^9UR27s9ReN@Av}!Ds>lGUan&m=ytl zz#L{l2(e)2WzLSw9A%LNzi1njv2!VjCNxLcsJ}BG3LHcVDBIvplpXF94s*jT@tDWD z7v_TNEA4S|2|HzWHj5^GgTc*epfwGE#ss;>l`5lYmCA&{8fe;}2|I}|(C+hY_^RXA z_~y$Y__pt8eD~c%3>;#Hfx|2@aD){GkG8?kF;*Bh#sb~G9DpB&jmLaaP4G?&S!UR9=}ZH|WZnxK(P#!{<3S_m*VYo!KNe*Mh=eA)Y3^!%nj zdVbvxy}0b%cL4f)I{@E&{Vj&^^Ms+p2pa>@n?_&9_HEI+ z-?HpH+&omOIgy#A16sAw2DL3PAA;gdvyn^Vp*Y@^hM0JR3ATika>ejF0Y`;b5O^d^ 
zLNDc2X1Ph~zhW94A`MW0ZGybs_ZEGSZ47&mV1}Qn=RsZB|@Ce zYpwmYua|f%dCgRz#K`jZLPC0>gliDgWpJZPlP7=s^Srr);^_&sDbOoEbNx>AE?AMzakEHp(z;UG!2!jHbnPMU14u+fkC~$A~aW3vH1fU zH^lZSj@avCja^=ZWG@Ts^|Hnu4|DAHn20Q=F*qry5^Te!B6kVPX({1# zDa&n{je=&GFz5x14AC>bTVAm2o)V6qCwSszvbWN!lqqP~1m`D7lAqfXr3nOQ3IAc6Dq{T?vz&|K2+c~YOSZpaX@AldSr3J=4#-WQ!7tF9 zKtqSubB)k^a^^VfoiY-KW=w>{G$*rsTGp+t+H=sp3h z>u3{9$%G{GyniCO@2k{M>#A3)idr>mqjv2&G~AouvzD#Ul16&#w(ZfOQ#Z70*GV}Q zx^(P;_HEjtey!@LU8g2$HK>mob?c#V!{+GLp(pzG`4+=R41zh|EpARG@N}|9n>MZ3 zR%F)EO0cmo#naMRD8?%9o;tX;^?%A^RkB9#q_TtSG`=U5h2QI0>Vf;lwD zoJ!)Lwj-M;_$0fC_*wJRl2-_;3aXVfh}J-w$5N8ddhxp)N9T1`o@WDwQ#@79U>Y>* z=cEvobC#Y5brQystQ+O$`~cKHJf}#-HB$15!{4;Mz z&S7{R>RQT=C7<61R}GxyzFEk;mXATRAuww(%UrSmRoXTK8A5Y~1;k^>_B^M=XKR^B zD$4Y)(9G-OI+12s@jJDX(!ysdq+N#&m<^ygV-sOn2hUpO44tPrZ8J6zoHwUw(7Yur zQ)w%~c`LV*OeEQazs&TFNQeu<=5;YRvOg1vapCCRr6W3YXpd@q(B%cwp?w?d%5^o5x#RU)W zysUsAOK2^JW(mpA>#xxKR_BoxudzrE&X>;|!QM=^<@8NBck&4S_V%@cW|=kco#b-n zcJJOk$MdHI`X~9^M?B6?@Y{>05O99;Ob5*`Bsa6bc>zJY82OKH;3g85LyTaJeB&;csuxmkmHFda2YL9_J)OIVJb z1bb667#od8i@J3%v~?p~kMd`y;-dzmsR1(t-QsX-9W-02{K8KNdu7zq2unPTv}Px7 zOE@7kN7@k-oYlanJi_wBXnWix5I&5s;(Ed}JG`r*-Z;RHAi`!4%zCy&*V=3YjT)fy z7v0gdV?T87{4M(Q=!ZVN`s1sKz>NX%qe@^4DH9CFK7TuaQ#JDz% zaBvpOGRBi-YJ(yg9r==WOwyJ`NJa(%m&E5v_6I`S{UAr&SYVDj^NsO%u_bbYJ&^4> z9?=$~FnRD-sHyTZSEVtfLGyrN12BCdq1kmV%)EnP<{JWQ|5dP`y9$$M2f;mH1$qx2 zrnc?$-4J-hrDEpVbOiEQca2Gc$^2l%A3cXdMb8v8FUUEHxqFUd@!|6b%sB~rLbCv~ zc#_pXz_r^HG~4pCe0Sn^XUcx<7R<`b!Gsxss8qE!zU?;vp6(tnA~261XADydd$eq= z%?LQp{Ch(4T;vMG5LyZeHOd>TfLViOy;R5s0kwpHs+R(0@w=Hf_qK1Y-Bl>0oR0V>e^EXEx_FSN!PUlSaiYK)z}*0AW` z2cNcX4~uy#;1ZLn2KG6Gt%FU-df0?-gnjgS1Z458KK#N&^&2HB8+sr>PUSwlhr7#M3CiAV^HWe5 zJ(cA)l}3&?3WEsH%UI6K39YO6EG)C9G-#FqoRS{!WxNMoC3xW_^(x7ma6FZeJQZ(J zr{mYu8F;yRI(|tFz%OfOKvk3jI-&7qRWcvCSp$ zoQduzB2X6*j0?qJlYcZ+dvwC1I7d88a8}TKca(xe;>Qzy#MorYNRuj$Z zH$qdsuWEBYnRu#h&8leDupX+{sG_Fau0mM1G&9D}H_o#!&%mA?o76!6Jz2Z4dsh~A 
z?MhcE=R^vcGsqSyoPiQ%`2d<#(g6wAP+n&NQQ4k+qEq=SemZE@V49x|rogU%WT=k!xCiwr-1D1qkpU{F$r$@xMWH#WDUfG)tLk$DnvTtzx-OcYOVJa!!F{!!nlZ z(efjHuwEy_b+Szc8}whHSyIiVD`=JpO}1~^ifugK77_Q`vWd`~ww>UdL1@-%JFinK zzqIu!NQw{V=Tsctw-f1GH)7F(1(-S0PsN2+=F3*5c-~ET)O(u6%Y6Cl3Boc#l%OeI z=3ib&y0$z*^8*w-zNvs(Qq8@6O^AK_N@c*vdwfShvxHE*?`hTp^T$2S61pMBl7WMI zZ?mL?J8^KA%J;l4JDv5v5xMt&#y|gftERM8gA0GrQq<)?{+ZX6$72ziskbi(-_Oek z%mn9hXqEwia;_p26yC?aqkECHcPFy;5SX)BW_-ZX`9e5;j~s*MwQFH?$Ioypb~Yh`9a<w>NUWiuZLjpU}N;{Z;Wrg9*h3pj>0$J{D7~%9gSXn$KmU~ zMi@v9`F1>p^qYY12Ta1?0VeqVJ5$25DSr6g9K(i~Vc7S^_`25+3>`2AOMPY{W5rTj zNn4A;!@H4tWH%n1Is~7|PUzn4OME?GBzk-^5?u+|U3(5i$8O)CU8i2?+_ew-_Zxzd zLq=iD;1L-A{cxCkKLS=mM#BDwQE(nH77jy4!IIEC{+s?V>e~&b2 z9}Z{ZN$A+T8Co`Ih>k5gqI2sW=+deq`VgikwrPT#X)LqYX)G@Ze~}4JXmH5jHSsnR zn$>zW0Ld8-Wia4!8{A)NfyXPX@i@#AnI7X2Wld-v(p$x}%OF6Rbx}I(@fH){8892> zF0)}VH5BGELSPdZ1p7HF;Wjq}ZnKu5^Vj{=wp|7dgImmM_^jTFS$uXq64tr{e^Sphhu=MC9cx~RLGHGbgtO2sKc$x*83CfOZ2+ovs z@5F}3ouhU z4|4@HQ&If<5olJxOqI&g5SsbBggR(mV2bcsNGK{~Ta}nrks$Ntf%m?&A)3%*pwO|#Tqf%iPl0vP8y#RB>CfG!z5}sFA zK=TS}IiY#E4w}OV&6B6|{o4Rt+jmjWJo@`VXhguO!@B6pcg|)HTSBvyg5(_WG)s7f zCn4Hx5;n1}j<`<5d0Fpdj73HR(3X&8+2aeGp5u+n)9rE5&ziENY;b;t4KB{GCQO^* zjQ0fGm}iUP7{VigSxZ0Xjyytg-bz;#1QA$+-BBFGa$IhQ;uW?k7xy!ksRCvJI~{M=`4O1?2+je@qb#ue>-zclWus)*p*AhS^VC3;No;eX58IR$-~5Dt z{3O8x&r%4|i5|%3`>7y~1`T2QL4p&%OB^$w!&rN+qv20@e$4iGYq=FJ1WMS1F%A=) zkLaLTJk71@X_J6V?9~?g7uw=5x7$r<{wc@~FEVBjn7vW3&I3=@dnrKwX@Lc9FSf(I zWp;QN!uwn^6Q={M5HzSW_IR4$pw9#x_A^1O^BAhc}`DH=CwMu=`oHAC$>jTB&4uEKXj<*KMmXs%gFLb@ueyzVt? zRzX9;bF135(VPG+##0TMsg=Kps#L9v+O_J?D6a}fTN|7?aey3kn`9D0>A{$v4WYSwy^wZt$Mi`1>x_Lp-C&~m*V5)I(C{1{Rpx*PxK^-#(yYgjr%00V=DIT%D{n~EaYVE#F~^;EL^x4wfK-$ty&d*diTQ7g9mVE&mJ5*a1h51?!}#( zR|utLc=b#J=2y>h)v~nk4uSawo|ZgPUg3ZI@dl;Ed3bp5mYPXW4c4Rd(5!*+fA=&? 
z?DK=$KjGn>>jYv6y?Bm$H?I?|!);V7;TUh4@4&*5B zvcKLxEe1{5SUMI zz~4`<#oL^4HK6KI(hNfLc*GbFK&_>LH;vOP4cTL%+Ij3hLh6Pm4HInEUJCT18l zbU2z+uZ9WT+u&AAAYqLixel5oJVQ;q5pGSe(X*bM(8*E?9j+5bqi4Ie=-Ig^I(O`iE*-x`*DeFl zrN?*ps?T8b|7Hk=^#2}X2mgR^-wlG<_us=|#0a?aoIVpLVJ3}?081ML+Sp^Zy(1R7 zyCHbSbVSV#K+>Z5NDB=@@`}Z1-?$#jv@+VVBkk0vF*-M_hu)29Vt9i}$eC(_l6YSg z%2E(z%ks8m8PYf*RLLm*0{L4wQh?eSxdgY|FfM~_kQZ#nvh+lT=QxC0^O}ZzgF5Wg zDt%Ig9cmR+@(LZXcljCIW#-5?k02TMrhVj&gDfDqPQ$1 zG?zrX;VG}7*^odly{L+~lv zsbq`kQ@18Ix?11>&y_=H-Xor74{IDy&^!^TR|gPKr_)HC%4cp0mxSgRS3FJj#`D!v3C^A-sVaoS$QgM8ZRrf*3Fz}_gzmMS>lMp z{^PONdlc@nT|D159fj-pUQK1&TkZLGa+t*yT0> z`#s0vpzj34Pacb|jWjQI_inur5U>c|Qv%`P9RN?SnJ_o8M58+O(fYI2@b;LFY5dL2 z%@0$i&coOV_Gs9&9eVWs4t@F$!8d%rebKB7YE-L(FWP+$8=Fb6H=6*5Nyf0{d(>>A zCCtohFkrxUs8zFu@?zJoQx7$()P%FGBlcx&N7lAfoH(=#dw1l>WR5kolY5Dp0EUU2VhbP_W2CEC*m6 zSZmvAf$G|%DH3iV(pPXrMOWODRw3mohvtgP5df_KX4zl%)$PaoG2CClEJ0X+SdOC| zOM~WZngr6>7z}~D(?9a;L!?;gPi7f!C#WV{v=W*R56y@DUasFN94Hob4sbdFl_vat+ z>`4J3S*E+z(#1)5hL-A0!1&`HX1!#>5{aLdJkS69<5!h+L-ITyKahf8D33B>Sr5!_UuwCW<$C`h%ojeogvKjq4Vd>;N@V2BRT2#+uITVzv0iaJIFhp zf$Q6&acfHmO7_Izm&3{U^uxcifBd#9kj`M2`Okv6dq-bHrfP zAvUa282Zi<;HZtnw$yK#;u2mcxhO;QAz7NneX|>!NTaEe;7Y zoDpC(1~aWDVup zbu0rXgUp9u%IN-RpqunSyihW8mz{?_Y`mwy(qkU1e8XYo7Y^%yRj?0S0gt(%aGSXV zpLgn^b-+VL!Z9)t9*OHQedAVmu2~0*peRHfJBJN7?qP1$e$3N(ngyC??>&JfM=oGi z_F)a0*KUPV>Q)8K4icJ?N`PLo9g|n5!#05+zBLCEr_JTL>Y{s>Ua%rG8w)g#8IK9< zpxd5sD?Ng$Azz z$`ukrM0u7e0on2taV%%eKw%ln?|NvaG>@@BvkUSGJQ6k|;WIKgP(M&m<@Qzk7(z3* z)9=e;DrjcC=2DVU?h%joC_>_xy^t%O)+irtH--CqD)0HT7@4im0WU@3`6N8voMVTJ zGi`802KG+1#-#pVp?Rn7uv-?XpxKttY!|wg;Jh9-q3dBAp2}r1EcMX5QlL2z6P71n z?DAMxha{<(=1P_8qiw6UFd97q!v_sOV?IBUg{OC&8rV4542OMf2+YO=XLIazv%(<{ z8|1i|AZb(|?6Vt-b8hCyv7dmY;|8Kr)k-ky_&H9^a>KRhgkygj$`(J(cEVME2i%Y; ztY%VA~VfJXJ2phdH`@b;R6rL6A-i<7Wm=~~R38xJ$bK+IW| zjDThFSjgYpCI{fF-h(m0)C9{`F2=$I3ovWe0{F~Y2?yV$u=SabA>%AjvsQgnt6UjP z>o-Q-s`cP%I~jX3GnAM4;NA@E-_1UKw?K0Sp?M2-?It*jx0&Glzd^H>xr^*Zeh_I6 zPaRKf<5KqzzdkNn&wc{Ub*Zt??eN2W7r=kXQ 
zpT&u)7>wNYJ~S(EloM&v3P4V7ApP$jIibE#MDMMq^wUObF)rLj*0Dkip;<2pt&q9O z_1_Jx5`7`aRM)Da+Kn0y?Xc`RO+8c`~q zWqwQC_RDjW~*+)~h-v2g=_{^=s#{i>yAlgU6NL_<_sco!uFcU9QN z-`>8&eZunkHSsDWLq8#j@-hpws&EK_W<5A-Kx+ug`gO0K72)p9t9U{1{`9%#( zA|W4t@_JvErrS9-ZJn)eTq99KyeU ze?eG&hkyL?2zh6A;>wmt{IqTbN;1On>+UGL*%gYi%?t1#ZYq9eM|?NZ0}rF!399aR z6yuHqgysmNuMlrF60=4PQ5|BVTD4&^Y$Pm=EMYs*hR|#QM^h_|9Q*?s^5tmxb!Xg9 znh(hpn@?zd9Oi=DFgxVQ^v-I=IvW&+YM$m&ois1AKr=;XE{KuQ=G-=ckdiQ!9nUmK z-rLfRv#`u|go30x)vBRspc{S9jviYdi7o+Ah2}mx;V1Tk+_?CX^i6f+t5dO3fVWpR<9F(}OB?a#;(EM1wHnWlCgSChIJ`O( zgI5P5@#bg@em|FlS0^Iz?BG%q?VgLm-SbhfBM?uw`QycUA9mUVF7aluGcJsCL@5DI zCf6v5c0^gE9i9=KpU9NagtL+e0tCxnrjX8!o`TJ8qp{q0Ap9l_Ld%A=6f{?+YSgL@ zi^*27@bHAS_d-}r4};aLaM%R~!(sMHxX%fNyWcXj?yNy`4}s?J1h~bo!?cZC;JI!C ztRv#F^3X{nT)eKrE9UM!hQQq#GzaD!!{S5dl-$#H!aj8ioCTWKXjwD_pvBAVoRR^% z#4Uv8JvwM+-FEB&8#7CojGIJg9!F?4#~1Cos_U3x)E|ZG=V+6f(C90r$|QDK?ww$& zd7Nd*c0_1a88BF0GTDyG-z-Lj4qla)ID%!XfLQ?7AY4K(?yHv=bOO%u`+LAlK$c7y zGV7nj92cl10Z&1*2Fyx&h!&vMdzz{ING;SxgJwQUJoaN=lbShDyv#AvP!LV&p_%vk zjQ8_`_xMs`ZbO~$V2M4!lh8cF28X7YVe=GAjPKVA&Aaq~?eZu%MiH7NJVT&4SO?8v zYhV+igXSO&nvItwV8V(7j3G2z1}9>qlMg~Jgd^W2PACd-Kv^)KgH`r; zBGaS?bIJO7lIVtCHcto6h60b)0QZvd_UJM^&zOt1=}Yj()}{Dc+7kSl97rRC<;r@K zu>-|11m-ArRZj&hXJstKvTh`eTs-TEZ7PpOetx_w&V=*6LdM}rtPLKCrspDCgSv5GaR31jU#idaAcMVPWf9QnPt;T7t`!D@*5mqYLCN&<|8x4AwSs% zPd5?VSJSXcVmn^V_MG5~hoSa(z;LhiM{?b$gE?S4cW!o@g z@fJ8ukH^eqTQFnUCipK~hbc2w!p_AP&Yr$l5*&#o;VB3RPlIplE=-Qff_Kag*!zc| zVe<~CDevkAjZvdgO}N{+vrpWQ6NhrJKPR2qiM=@*G-vJF%B3Ef$sYpE8Dt6-(=05= zCE4J8VCFXEtRw$+YuhOYCAmSdT=^hDoyr}d+HpTBL!J`&7Cuu9Y6#60+Rk+%VJbZ| z>se6?&#gG#&JVsDN;pcd_n?{5L$i{w7*&y{Ssg%@+(8GD+IMYz#WsJHAu#I!T7X$n zbZ8?fE2Mr`g;xQ{3XFA@iLy>U{Eae%O8vzv@w=L8yIcm)EP4-^MH=`jz*YsR-J3`a z?kfl8520CaOvpak{eC#;P+f~OBvzKxtB`f*QNgo#owxJexA7PfwjuCb zJ_1#h9Sx4{d{79@GFxD3BCTH^Kh&#PN2RmbleGiSOAE1U$2J_y-ienI%luLUWqnwM zc$tgyZxfhv38bYerdVQ-B`5Q9LaRWtUgFu+!?GTpHJB|`(tp?g{+iG%US{zI3pk%X zb^xc2?#JmP`}u;%!r{F;urFsjP9NWgNB4fhli~+zKbhh0#nXIk_COx@-D@6~ONpr# 
zZ?{~FeEx&L{Gzk~cdlJP?(J*1fBh1YqC)X~-#$=Nu2%j8Uw3PdWpk$E=EY<9pFdv$ z|MLs}{^~IvU)Y0-n?rFfc_H$)hvH3k6kcsvhO!htcK+_TAL@vQ;e7e?bFKhFG&^>7 zyhU-II1^xv=yAOfWikW-gl36XY*?!nOn(>&>#-IDW?RZqLGvg=bAw8iVb`xa?j_Dv zlWpXOOy)~nz}a4bbAimbMtCj`vmwAqj$uM`xRvreD`@6ANlhoC&Ba4gL?|pynu6lQ zsqDB2IvZwUiS8mD{_zm_ z?IHf>`H%RY;PL7Jt9B3IDpChJW2j!@uus#-G>Mso(!Nmxw>l#N*GCQTWH{ zDExUk0)Lzi#orDu#Jk;oc%D85Pd0g=Y>jxi>`@YD!;aDhxrywMu)_;Ngf@#E*R%X(8t8mN^G4^9SUO<<0w;ZsZeKK0ovFkqR<8O9##)Vsg{L=c zycfY{MmVhfC3{9NoaP1*n#14~unO(FYvCCRn#1DZ5}OL24V&S;F%9-{DOkAY2!fAa z!0g?JF=y{_1Sn7QG0e$1&UMGIeob%7`(D9beM#S&f-&UP$$mI)Lp zqXgTKluwzYij!DpLXSS)S@RHUo@NEjI>~qCT`u?b>O^Rg*#|`e(c*cQ?J9toV8v@F z7vW2velDNq(L;EJ2r^vr@V0<1Vk6sZAi%>$d^E6bc-Uy!)n)~wQNcFk~fnxh8Iepa|V z(+1ZF%hv)3{xU1zYUU>whLvpwaGVMQeOfvVqr_%r=?AYe{DNr2+7e z@6|_PTKGW9;o-j$wjO?P zni7Z=F=<$ul!KY8kHa(VG8{Htgzxr?2uR+8R-O8($())sY)ELX4Ns@3IDhgC4)4oW z(7Z2a2Xc00VpnzsvUiiMvdL5g=AApqBIVGmV;r)97Ai_!QkI-{fEZa_%By z>7@AWBdn$ImKN5kAJ3-eD7|;tP~|-JAEe(-17_WEbdt~d@Zb++cs>6~!dj9!PH`8@ zrIPHZ2VFybcyLkqMSVmcg6@BUW|54n{BU~#G(#EwRu0QL>D}ap((*xT*P^`0`ikTB z`T{jn#Xv$<1}jb*)F-g30dfW#Nky!%0W`~b^yNVrmX}x z6g10yv7+>z=8r-%*GXgEk;dywlcgS<1^V^iu4Oz?-sWut=It9O0cV+eUQ;^Hsg;L- zGaLTa3|@zLnlrd>1~#nO2rpMpzNBm5n_k^jh{da?g*dc36E`lOC%g*u%48r#N}8AX zo=P>R-}Up*(lO4>LH&){bXi;x&&eYnKi7sZ-%kgNyia!)hQT~_oM zf4q5t8&}WcxNcU#69;qDV8H#mw&VEW-MIDBSwi{|T)T8!g@_d8-NADLzM5W}ko+5u z_1iDnw5thr+7Na0;Iv$0%;NdEQRtP~>a>24gR)0;0 z5i+FX<_lP2_I_UJfw=K~uxwaw%osLMfoh}LwO}@EBy5Zb&EsugZ)~fedBmWhXk4WV z9Qyafo%leWLuMo6dD!`=m%8L@rX)l_Jf1SKiGZ`HFvOfdZNcrW*y-75`A!oEF^MjO zW>3l+B{EWdV*r*}{h(6cNop&Zq`|!Zm+0EE9&*xx@$9Dzl$>9UX9U?7w=?nV=WM*X zyBELQ--ox458(I0qxgH-S^R^5{Lg0>@o%LI_#ZCu;*!$83XkF4y*+q$HwVAp*@?He zw&J(z8}aVy8vK4a0e@bM#@{bR;2&2b@sI0q_~*}S@Q*tixXlKqkQJ_07P~TY-n^V2=$HW&?RyJkD19!F~R?um-=K6pwQ#N^_PX zH)A%QrcKAIR4+VXXIYv^5Qvpf3>o}qfd`QmxF5&PneZ%AL_Z@KzF__R5@C;D3C*t~ z9Pm;GK}9&@d8j>}i}y0h8yg%)V9EG?gyz8*@MTBUma2YIMJ)%691L3@Us!rBfZeP} z*ak+xHXsO&^Mc_%F9P0kg3+m0U$t%5LBn7l5(}rOWK3DV30@mE!YOeLW@qom!UHE2 
zFwfqngXWxLm`hL&%sz^R`_Ew6v5P89!);x<4w|>A83XmuEIFNL5SqtL4dA(Iplhct zu(z;)xzTu-j3zXXH$~^pTApUFA9^D%H9!MWY0HxPSlTn8S<=ZpA^6Bt*JXrdnZ>U( zj=)UtQQ*vVk}ghpdq03?@d^vnN|=Pe@*@J1UXrQfLul3mvjSg2avmkQlPi`2;qu%2 z;T?KUwf-0~fKxzS;+gd^%_?&Se|sF|!SjlzS;8}P86?Cb%HN*GxbxX^#lyw+xIIt2 z%=Wk%XpghgEwOKk3D$a9VDz_Np(X3jZdDxYA|(`~0-CLZ*Xf`+iJwyxG)qjgK=Zg2 z@fbyDHV;e02p3;eso5AUo43L@Uw?%!J9VHzT|;Hh7|^ILGN;<%_zXK7^fnOij45OL>~V!@w3b;n}YX&iK3I+6-Q^&eMD~zz){}ZSfPAdT72m z&yM#?*bQ~Z!(eAMO?5%A1K%Tj$H;(K$<`C<0Du@$EmGFxSP=97UoNN_Ph zi#qbJPbK&oeT%~jEU|z3C>-+}ub{bTod*h%*tP|nli98XnrR?B5<^(h-4W2AuJggA z#nuQL(n&${LC-Nb;yWIx0?oScj9%TpAvA|$?z|xQ6PN?%u7J@*3)HIDRE69aTTek` zN)CeK_hCicAp`_v!(;9y_=oJl+{hzX7`+>AvzEZdeFhx7=3sg3cC1J}2EX;^;kx-6 z9JgPCPsRnzU4Ia5zZ$B}*{EJ!8s##0bMRNwh~9A4x0ZWS+8d@?yG}PJuIiIkkyKmy?UNe ze5P1QxT`|tJo6E$FjelS=d$-DK>ne)UUxnJSIIuIZ>5S9Ar%r}Q(*z>yY^XR2%Y-E z(8~~V73lJqySVQz7S>VuGqBS@k~mu*10x_=0n8dSE2-`fAVXl*L$fpxty0RR1LnU#vpS9j$cE6YUsoQI zp_kl~0W_C;nt6>PH6^*+FZEYw=J^buIb)-I);!JIB}Q4qMx{bU*f6&VET?Zox)K|$ z5`W`EA`M$h-6GJOjtueoXJn{Y=d`sO;B057azzdu^bIbbJBp`8_wnG?Rg~u4BlyY$ zBm&10Z(OXxGzxRY%bdsG2)cw)HNZ}Jlw~rKBFJPVYL-6<(Rd{T@nrdgpe)c#nB}&w z%L?%9Np3kbtATq2YuWbA%Q6(@-cuH|c$!5=`1$C*ESx#I7q@R*R09w%o;`@`S5K+$ zxesm-ka>-S_jd&6cRb&_UnQdk&&zeMUX-eVf|t%6L+Hu{`2L%os8dzagH=L{Piw&5 zd<1?vvm5wF8SwW~{IldI6rSCI%bP=Se$8Us-w}z|2b1tJeFbt8rsK{k>0sEQhY}EE z%4NQ6B?XrJomm)4c+d@cDu|T9Ws?a7GZ8bP52o~L1ic16TD>})2lm3f#5wHDT-CV*njaF(bNO=5S248$&N7*V=51EcEMXbq zWe$@Z&Fu8z*kL6KEW4sO*&Riz2?fcXDBZdUD{V$AXm0vRB@F4%3jN!+K;7z<5VhP5 z|18*te-&f`&-Mdl2e~}K<#FmLbqN0|J&1pn9%4Bi#6ODpxoAJP-;00e?!v$BX5#Nx zHsIGok$9C8gx?QD;E!XW1lSe$+v!#K$GK4a^Fk>8er_fHd42^yFULPFtir#pMB#sa zO2WTyq~agfQt|d|EZ&}u#kPW=!|-k|kDId+@AfRmZwFT3x1+&$ zb2J3M9t+3IL*XdexeSjs&B24UGjTV0D(<9A!QB)u6wB<2aaxX8iP3#Vz-81u`6WNUh$eJJcww!>a4eb7pU^xI zL;7mo+G>@x>>1tq^@5%63|J7F?fs)*H#-vcvqRuGKLqXzA~AJd2)gthptkMu-B1P1 zc3}zdO5Fgj_3Po1vJSI$?!%lt#|X?P;J^160`?q5pn&tPV_3NN6z1modo8h}T8{?+T;<>65nmfV4!j#ZF4yL2WDroM}y|=m! 
zx4}IXH0Me9gM>K{9Odu&BDQO3(`78Tr*Q(#GK-%RO5+_BILpj_r80QX0GbWKSA$Dz=ohn_zZBs zWiC(onjy#A7;6a4qrUBpHhsT?LuevwBM8f3+CaefpgBl`W=kD3Pgls;APFFJ@$Lr6Pm4%Xfyzs z)?;wO#SFXcCt%U&ehQkW5ABV!0iO7Arai9l`sLcM_!FE13C;l;IA0?)Ungu9L{6n4 z?ndLBWhC!10b+T-$UjwV_r)~SB|IgMz?`?7>sAq5#ZkcbPi~|w9z|NIXD2UOa-7ou zm0A0u_}=4llP~o^<1>$S`-pXvM}xhH^;gXLlT>p>Y)6IhG;$KT9hE0>%L%L_t`lSb zEP;BpI?8yq1lD`=i=hnUHe(5!Y% z)t+Zg7=R-St*~#}a02y2Li02=0I(pL;7)KZOk~>@Xr@st?IJ&(FrPG;&^!%Sme?Zl zhi)1)dyT=dnUj#}J^^2TT2F(HG+Y;@VEUp|1SRc9)cTVMP0EH(;3C+2_`-S0Yyx&RVmER9_KTR8aS5|B z&tt*P(+Jsq7+($?g(@`&%?-qZUJY)}({SQ&6z3;)=-AvtZ|dTa0M)rJ(M0sD7JBUhwNj3VA(*-n;fTY|WRW=M#} z)eEO^@76V3Ie!x8jvvIe^CwVDV3+MBp8De1L-_IX34-$(lodb3@A}yY-@L#dzdYBn zX1sce7tcy@{rY7DuUv#aUv@&xDwWjBw0Yf{upZGLht`JU(eW(YJ+K+qvsUAFRtj!y z3d61S%aFe<1h02Tp=|4NJWTM#T|)AG@yKwQM@j0d$Lwexg*xC7f%dTiXMzJkqd>eC zQS4mUY3IhxLcCdj*tf3_`@Y>(KHpCp*2jclKfrRF8LTJS!e*j1?2WaU=4Lgj!~KW8 zc#u4g(Ck8Jo{R?q%{->W8s-z)iujr2@?a=E7hLKJDNOpOpg<$ zXrAU|!Uw_eQIal;}-)qDXRH$M~R-pd|6R&)8AEAT&#E>vCw0lg?ic&65eu_7%{~ zx-ExhLfeCIPpl?1FEt*B*@WhAyMCeCR@KUyr@0G_Gs`JcVCFs_4gt}y3ygr%yl^-z z429?77)+TLiY|Th(EJ1JLt|kT6a)99)$mA7h3lI21mQiHLtqXdFwe?4f>{F10?~x# z`MZu|9<}TUq51eFOc!WQm2`94U`J?nND^&_b>b%YY|6wK?-@K-Rdnv$frg?n%*PtR zbj%n`9A}JPUw)%N!1=pw$W8Sxhvp)IX8GHm;8etW7ffN2I`3Jk5k=n^0YNMi8NypllPA2XOs;L7Sq^Lvn#TUi=#c*RtV1VG6;|B390#19sxEt zk;V+4>w*L~oF~9;S#FBcq4qScxE7j8DpL2MD+Z$jv7SWs%cA%9W@R{MkW|GdJ;yA8IRt* zd!v5sI%w9UDVluR9Nl~N!4D(HW7t?zjGkl%Q#&sO(jLBx;W2d%>>XV($=nuZwr-fa zC>%>yBq1bvD^@0M!|b?q2#i~exe@UgJjMuh8?jxeaT7J*?Bs*9XD?&-?riMN5ihgA z@(yI}6p_~m%R30mA~ic+IypvUz&GU_s0U2Nfx<*&H<_JT{+TY99+)*xv$mb^mBwLu zXx_eQy;AyS(S~wn(%@LfEDDxI@>$l){)B8rxE!q4c0^D6TJuWt|MUmi$!# z%7!8|3)C9Q5YWm&jj$=xWE(=W;Rz%hq8xx#OtN^AW#5WeTy-2>!Dn-O4VtxN*D`gp z%&xXGV~3`6T0t^9oo+Consh~{zd*BG8!JL)CDfpKBgG2XCaQ4#mrDoE8Z5IibW$s2Cc0N9(onN!3MQu!+JQ{TA^W` zsu(wNAg*0FilWCi@wzOR(40pgmTVUV3YsN6<4Mtdx{r-mFtpjXXDvvZlx7>TpeI&N|X%VkS!Y_C|GSIK2 z@D6TVJB_m^_Tv8S%YGdpuaMxG4XW9OrwS^k=g_rp5Jp|EaO2Ug#7LG{X?@P*nC 
zPVKlC4@rqJ~@8iYFub8s(6j^&PrQ64HqRvrN&pFpAFP1y-) zF{To0Dzj!ea=o+GNyW2cT0H~B>*nJ520uLCJQF2bXCpUlE=u<#BGlblrJUAJ-pcJC~t&y*J^L=Uy`v`8>{()xLO^?2V@>5)16j@3M45*;+T0uI06? zXJ@m)nR3FD4FsuFE0nFall)Z{_ z!5f10Tf)O@!oy3}|BG-ZE*NFn z-7#@;0GwvW!eLGnoacwbWpN~?ERV<31!3r{gXXRSB|IYr=F1}B9Gd`_cGHh^YAtzl?} z2F-k)gV97qbjy$#n1*2hfzWKUOb5*&I%uvVv$3{BKLco%@Qm?oTjGG98xDD! zVz0*pf}0f%O>w|MnI&+lJ>tgn#}?BcaMIHfJ8Z{d@wo3WpjkC|)9^VH;EwD5PPjPB zl5lK~%L2-Qy7Y7YHVT@r1X}Q!@xlYX-yYEL&t)6WXIobuW*XgkXqJB&?k}~$orPA& z4`O)+al0S_a;T)gBsfRfvb|7T=F*CZTby`S(m!3J+*eEF%Q1gwhcRb+O) zPf+=j>Znv%rnmkC)%YH5-k>g8Hff}$?5|aw+k8?9m8(=j&Du3lvrZkVF6z{8h=vUs zwI4t*U52IjJIx|N(#;dcm+-stW{VQnlvaCXjWMr)CqLb zeO3_2eY17P$Y$YX)6!5mkQMatxN52Yt^+|9eD-#J)`4@mwu@*5Z8r$h&{;2PUv*s{ zTqC8W2$6to4WQNZ-dv|;vtZ%sKzTc_V;iqoKv}Xzq!Wy{3mo&a{N131XRtviFx6nW z+)$8v){D@*gAGbPs|wOVu@WnI2P;~IPRKEN966>w21o+y^$klaFHPF_Z8Ts6rp1WS z8!|L%DCKEZbt3o0`;-PGjZiGFjr_ck=iR^ty@B_ZjY!D!4-oI182g#_AHWrZp$MN!^8yn0?p z7=5llSU^_*SKw9RlqLK^Vu;^K?DLBfmBr!}B?I#0^XnIyr&%9cEYn>-E6rCKG9*)m zmeoSbrL5pu2g)MxVoSWVEcF8mUp-{w<~0GF!2S1s z{Ej!jykuE!#>l~gP>YZ(9<;iAaed$O3#6=^iE|loI21Dv$5#5{QqXk#7%>w!V*GGD z+7~xtd~t`Mbw6qj!w{H{n+PjoD_B!j6HPI!-vAXdF>UN1+}*GW_u^;a zA))bM{7gLH@_u3f?k3LWbNNo3M9rSA13{Cl2CODZpt}mYLSd1s@J!v!$ zfKu35C3>JVfq=};>}e9AiJjb&)%-5doGRe#ijuVi^;GF_2=9sPEEDYUG|>@daSkj$ zdz2EaB;$fin_EiB-~7+wMG_y&?Fg|Yk?gR=J0%`_l`_nWz$5XdlCEk7VT>>w&JI?* z`C&XBJNM`8pvxj`*g@N~{2g#F)E;*T0Jp>JX=pejFP6|3M*xqNNfT&r@Z5!jwZa$z zL6q!kjpy=LG=b|j*WKiG-etn=4A?$dArK4I$xzdt9y-wg6Viopq&VEU2-Or0Nr4qtt% zw(b7?NH_+^z;a0_oTKC5x@I*duU!XsKJx*)4q*QN%7^E(0T6r>c&uWKYcv02F_BC zGH_33rsQ)$(0xn;Pan@L5{MOP(9GvXg*NH;=ecPxO4f`#0cQow+A$szlqIez?Q^WGq}FH^L!oJ?z3#VI7tN z^YGO$2~UP;SOP3U5@8XH23b@ z5e;ipLo-5_b@vW9Fnux(xKG4>uL(HlV~N9r<-^nLv45%+QpWbjCgUNzCQEFy9fM`# z24cXcRp8mb3(n2;BJkTOXuce1qo7#^1FA3$e`~_CwSwk{A+C57N`s$poJT{yK)lR^ zAFm=T6PN{(b0h3ncM`54^&rm@fm`yO3w%e= zunM1yXN2Y#5ngRrPCBM~j9s?SCUQZ`=@{HL8O~wW^^WLAgP#>gAJ?RFmno zs}QL9S^A=?y7lt8I-keJwQBNM)liM!B}Hd_)_uKd{H-bta4suRja$}1@2@{cyLO+V 
zN|h?GwRIvipHlM+?A%H2AS`DpU?!8Kv(G2PCj69(HMa4WE@16|EqED)Wh9C~^H)}#aE_q@)A+NABHm`>P6 z19Fx5LeFEuG>U7;G2}#zS$QqOI3k1VHk7u%Y-hlF2C5{UT)o7du40+rlO9CHgDWt~ zi}*kS&4wP+T>^eOAbejHz^qLGCqdZ0Dk2t)NC(dENe|BwrXk>55i6sCv$lV^1SUTe z3svHW)np^AG?k)+`)(ESnq?{91)4=_dPoJ$0=SyRV+hQK(5zJoE05MkIzLNgrSm#e z`W1m~o>LobD&Ao^CV!LlJfBF?wrGH?85kAfcV2@4@_Qb2y`;Rw7$C@sc8J9$P%BC! z1QVQhY+${x169k7T$e^Lrd20Xpr>!6Wk*oXkdO>Qb2%`p#>MM1fM%J=NA6P^WF{Xt zM+a-vu2mHi#*fCK137qf?-oJy5z2~k;pysxyFdQ~;BSQ161>%gQOMvxiE&ooOfZ&= z6%xn%T40wDte}~nWhvkF87yQ#-h;c>aqIdem370=qx>;o771LxB!Iu*@-^4XZxT9k zj(~sj$~pY;MyAnzhF__F|NBqeyn7RNcD5>|n|RJ@@a6P*!y1_FXoB1OHsS7`HMqKQ z849+nLfPh3c)Do?mrGE(c>y&SB^v`#wALS`t7oA!aXO0FvC80>{9ySzQUmAQPz{>% zLR~e55T4~bVM;-DI6HMh^W!LO@SbEAI6i9(+2rOkRgVJb1HNVSW`%m~;BfL?DSX<#nLH9S#Te7cGlL)QxIv5q6jNI3A4hBP-`B?9=F*^|3rYlLnzE;ojs%8v0ndRy}t^z!&4sj zWuzmwcgFLmX}Gq+9jQ(u5M@6a(W@Ycg)U7+=Mp$-RvlVV@x58Ow15DlsXF{`k(pEUdZ-#gB77TOr=D8}PeTUYt zG9M4Kv14F4b_Cnt81(7YOI?RW-_E$3a!e}1$bF!3YrCWY1~kc1f2B{tvtU3U;>s*ERic=NHm=C z)FdXZ>R+A0aJDTTPwGV(qXOBi%Um8SSN_ovXy!4c{phFTmJA@8G;kJ)F;u{H65{cI zZR}CBJE7TAd7AGAIpIQpG0sjmA#B><{4863x5xQFJ6s5~!db%fF|U!>?L8KeZWGbF zXJ>R7WdfJzt%PP8DWU6>x7j>`kW5*GCc!c|8RkK$Fb+;p(`TD5j;G>av?Pik9gPv* zGf<^oW3+DhIfnG>i?2F$Kz$net!h+(-Itw^Gt~k6T*nid$K&8M3j(t(j`-TD&#TAw zMVj$o9PqZlCfm_iK4B30H>#{b^E@wHB{X01w;(WEtMH5~vk1++r)#{oYyMWaGTQ=o zRyg26h=i8$`Jf63%tdVLB{V!s<=w?~GTG0=CFZ!lz#Ik3`26sFm>X=%eQa2N64y*< z=JO*{&SqPAEzC#?_@(NHy(-Lq6kh%6B48`I#ck(*)l%ZJ@l^U0S!s%K38;vUmX&2h2kF z;`wm0vOqPSyLy%C7&KrAJUynt-EA5?T&7^MqbnS2>|teQhDki9X$^9&OOGdmD7<)D-QyG(zhRHPNkGeY9`ePz_#pa(2U^BS*1Kyrtqz zPA5lE3Ys&-F-Crp$w&ShG;>=$G#d`^dp}hi;ajn!p4D{FOzshhBQZ105Sl9_0Bq=K z)*=EbQ~}RA&ohsw0kep|>r^qGO?XDfHN3|1fr18b{Ba2b>G-bvz6``N_yC#(Mm1O# zPo{pUc{K@GdYY#Up;=;T-v?rm6qo?+b_I=^DnbkNQXr@+>?{E_9W?8|YjMpVf##3s zW5?G#z7-X-jTNFz?4kEGvjRlr(5!icHNgF=^o1t{nn7)6rKC$GsY)}fbZ)1EW|4lK z{y1`N?-?4rPyJ_gKm1*vA5dQh6?LIOvw*Ptt+9QJECr5PHv-KHoOvz<&0MFTnZL8% z#M>-?lc)xzyAM4q>m{KXlEFk<&qgWr%m>HS#SwLCSBIsA33lz=j{84f$E&9$STcV$ 
znl`G3lEO#$=bvvBH2>}03;gzL86j0dFeH9imxuYa#4lF>vY`Z=U+KKh&&vvM=Hwwf zx_=XY{`N{ivmr1W!g4t@bKh44^5@TFaupdYSc;;@w{iB^K0GhZRUse0ypU`ce+AmH~R2*pbV3889Q@EMXiG+&+q(GCQ8a zNGJTb!Uj|OwZw=X?NF~uWxgD{WAxyGFdaJ%w#FuqDYPw3EYYn~chvf%I$Aewg0BhI zjjC2r@p)L(>L>b)5?D_23inpN>-hZe9H z(g)+ax5UuTt7D~=K=T3`4g{YhI|4JIS>g@}!$m4R6gyaUXt{}m8u=|r^F|kml3XD( z$rTE;5Nu>RWXattIed!yfJMF?qGj`Gf%{Bxl8-kb}9*QEZ2t^in5N1Z`w&by_DT%?9zm#Rh zL`&{(iLww&JXg?chdYFJf#%zUEP=D9)Nf$~XF~JyP+L6VelJ+n$<- zxA%zga0-co)shgn#V273jb#tMTip4q`)<#~+`XE2S$UZ$|6K<$Eh8JAo73UCZZlj6 z%I*Ya17e;>I6upCrhAsu?0*8!1aX0Hl`*3{PT9U( zg<=p%TjOLqifoV`IMopQCD)m3b=Pg=cX#a1~o6uahN)=`FT6b@cJw6WD z?>+(hyvO6fG;;;bN2lB2@N_FAj`;@b3C(*vO|ijdB$iDWjQ$M;nmgm{JWrK1Q4;Jw` zTgGx0CqfA8Q9-lz%t^gUh)llBp2>PBQqMZ!m&YyPJMak&prV^D!MI-#> zS}(lZFa@P+X`HQ=5DnH(0^1Oc^-Ga9NDCZ;?8VkNw#b&yY>mQL3;Y~ngp&ayadNsT z&a*58nvW5hkIkBh3xwt+rXx_F&tK!pl@Me<6#M;5u$T3AUZ6jb?NL10$u#r{>JlFE zB*_auE}Y2sgp5IPgwxZ<ZiA~_`!Yu038_3E8SNlwS=9pt(YZ&|D7Ba*gHJ`@gG#i?sVvyvO6H&)W?= z&H5d*nN;6g zYMyqMsRqijq`{z*KJA~1!{+Z>*(qmcrem_RBWl;G26MAX>Yg9mzKN4Z4x({=0%wmd zc>THzzyJOcfBg26z%1$82+Atkg}|~bwZVHbiO7e*EZ*gp&x%yKH-X=uZ(PRyJ(+5q zOnhuiWpSe|t;AX+7%d(hN0j(h%*x zXoZgLK0}9=^^qRA5D&H{;6d_oJWgDIM~So9X%m#AXW~(`uQpR&oHq*M_(F|!<4d0% zynwHGVAwgRnfTaws21RmnC4t|&H@1W;r6&g_?kZG3k>euO8u=j z=!4Nbs;EyRpnmo0YLH;t zn$^&}YGt&nUIiWN)kL4qo1t@)I{2(+6*c9zrYdMiHQ+XlYFEQ&^=qPCgF5Khs1bU9 z+7jIxea4Qt3I=s-j)nF=V7^%|c#UX_aK}D)nmG$4YYDRonh7ZHK{Ej-o-jjTE>I$n z5SG`=dI+1E+&u`#|U6aTx^Ak)!*rQXtveBnE<7rNa}##^^kfL z=B|KQ^FH$0Bs)k1JJCoB)|aIQ%`%!_0ke3&ED1#x+~1u0N$jSi;*-f=EC>NqxCKi1 z+tW~Uya=&k2QT3n4)~F}O(1;4>laVz8v@&};wcN!Qv1D#u*VBtLs_g3&apGw=r$G! 
zPDThIu(V{oXa>wDs4QhWW&{SCo5Ie2AzTB4;66JH-t(j3yEqQBgVw@#VHDc*(6Uza z9cv87kXV>64uwbJYWVP(_Tu~2gYTKCThkGkb5OzZEK$y3OwZZ__f6a2vM!C#yot-L z3Yy*5ZiCaBO>juw2-hTnbYeOzLQ~-oz7{{&PvN<$qfMJuFgGz$&}>O)HW@z}?b>xv z*D$rES}VH;o@mvn5ZP9w$Z%sN!wL=oYL5GsG^tFVqRwhPu* z5$mlmoX->?TFYXg0j@rag$(}F2I%p5(Lu9%cBni8mkg-Yrrl=yQXw0%lz$>*To`^;zie8^||=29!14KTyWnP!A!YX!{bXJ`ZHE(M5e z37$6mZiUlRC*in<5q5Y@!hkNV(QfDn*d=a+1q~DHFhX+ZT3Cl}fJMkgSOyWMSINwd zDVP+Tgo&#N&C3#Dyd)mumc_zsRUAf7orS9Pn-H2?;`?vELHD+8P>+Uk^Qt;%_H|J4 z%=@NHP|$p6nhlQn+TjqPIc{uUBv1GbSsun%XCtW(2clo2PvG9a6V4EtFZ9a&&3Oc6@iYrCKM^BQrL+_gq=W5Iw89QWD_OsUW;Km` zn3gp|{yC7ia`E)*vzVwdmgki9C_jN@yn*Ec)HFV z#cS9`l3D+Wd>dh^Hu!0U5sv$hA~c(^o@{WAhQYB} zeVVa)OykE=@w#qy4qf5MVIP?F+>vei>~9UizccRU`3 z@SbGQA@8j)))UtjTOz~$Tb!6~gflaYa5m5wo2Qtdj}DqYZ~7S)1uj5bXe>cD9INK7 zL_ak~p$fWo{TguzX^4tWLqz0egoSNIM0h$P!?q!cii_HTOa`wVk-+hNKzwLv7 z1@o|e^G?LXXCN{z3&GqsC?N*(qx{fy;HRinw-OoLVPql)xG|8*-Z_VH*Cf*rr0|j=*wG{NMH8pjo)I(uX@}Hz>fy4S12V%-qZ!2%XAR zPKXof;ZI3z%R=G)hD$YDoh&JR$b!o6C(?b^#{la`IBG%KJiii;Q$C}NAvnwOuhNG; z=p_&==P>*`K)lTY&Jxr7mx4C{W(}J4$E>(l!?MZ{a2Dw*$Pk*WRRdp-Iv5yr}J1DA4t}Tcew(Z zcc!QFJTx!}-I*B~7%_a9f=x5iNm!ebjO$k}W3s)i>Of5<8WEU(!Mk^_@b*^;fp~(q zZ^{%ni^o|~%n1N%v;Vyh%X*22S+*%De24>kcjCm+efa(Di*oO>-rFqDEc%Pb`KcD3 zArgR=fq~b~AIDQ3?{5TeIZjdD1FTrO5WT+ahBj?mp;xaSXw{+#>eQ)@I+ZJ7>#AAE zT_29zNWyZcFYbm+!Hp#@xVdaH?h^_MV!YYOx~Ug@F#)-R;9I~KXI`+a>PT|gaXe;c zBGCLOL@xnnC!O>lT>;JF$;uT^mURBy_gZVZOy+EVKO{F;gZXex@~2*b#1EN1i&%3Tbofv1z^|cC2tm z=1O;LTxgHfIhK?a*3PrRhQ&_Uv}y{rhEGFg^fc^`n}x&C3vetU6ya0d(6)M23~XNq zG5$XwcKX*?Y}*Y{&OPuXeJV;)2_Z=|76`)m2@)49&`daqv&UmK2?3#n5>QszIaCM- zp@x7gv7_=Ay+Cs@0ZW040%!3?8cKs^8I)Hbf9pouD0nWAa0-EDRfdywa#2f_Jw&`& z1Y;Soufj42yfV;B!0R!A>~RD^AcD|LD9)quC3m&TVQmhX0H!p|TmkcwFf%;m`scxx z1S)$%vpufyT7Hfus4EZoWW0`)j2Vsu1<6jrZP^|Kno9}IC+AyZGXW~mc`RlVUfR{G zraDvcHrJqWGjQ||7;0$_mw8JG&8y%=Xr3}ZlF%HBIU#G|Gd~q7-AGtozY%WhH(~PH&2UTA zpxJTFW;HXQ=jsehj@<_1m8)UL^9^^L#&gxc=byKPnekYdO)!Gh#4#8<;sAgRH%$FqFgg;^0hhXhB0mr6W$(&&voQMgl;xTDiyq2q(&}_Og24klBqk6;UX!%(S^!>6s+B9v1hBd07 
zRjn$p?bR83XF4IreH;$@Ov1tG7C7WjXQ5hb5ciWD{!m<4{Xzteu zr{{VQm~C*;&s<&mB|m~Q-yK)@4!JtR64z#05S~qOdxG0NN#8=ZtH+wH%}upduY&{!uP45%0TZytZyat*KKy9b z$)YhhK64CCO*bbrTjP9yK(mp8=F3Z5;Xk6E`bVv0?P>^d9L_Q{#eNz)Kds<3#qd0_ zt|*bzn@OH{nL3^C9yeT>GlAD*&o*I$dn;(XM|tAMg;v<@JP4;}jK|rTMmRUe1Y4&P znw!*9&v~;3pTgeK4l`-^O!M}~H20aXwRA*}FZ_ zzvTH0mj7?`{|%aXA$m`zeg+ujsZ<`*iW?WHljxROfog!OUoX-ugpW&Yqb`bIroh<{ znhm5XLQ{pM^S<8;*6xYSj2{hqi?TX>aYC8^XK5-|N0wVOGb;g zm4^G0HmpAdYXsa(s?p5&Lq@rTeXla0typ!weIA8FLA#q}T0zX#3wr6E8IEE|He z-dioldUW>&?q0t{VCJeXdR>`F*m>Whad zD+mBHaDTZ6&iI?*%xo*%4fRA`v^QUn1U|mpOW9F8WhYSkp2Nv~wheI%t4C3#89 zfw@8tLp0BXEH!8*7!V|+QjMAN4M?KrM0wr0CQI@j|&-Sjw z^ZhIE;vkm?R^sL1m3VVJ3hz!PUw*tU5Qd+KXc*;{}(`c;12_3lyv}!?R!;JR>|7#!kV}c~;m;fJ$*P!n}zC(Ct&U zIo6#Xn!g=37{jeB;jw5r+yho(%A8R6EC|PR0@Lh}RCvt^Li6_8^wr;+v;M=NJ4<%?kwd4Jb?t?Ijz8-~62Ie%QA?z`{q>guZM?r*JLwTkWeK_NRV1`EHqOGuqcyk_xr zeJPO4eFT(W8T2yGOTpEPe8M$>yNb{(5jdXl_-Ep_A@xmos-A>IS$qYYDX|(G>RrAj zNE?v-B?`lH)~QIdYI9u;GSfn%J(Ug=+k$+hct&u0Rz`4UXY4sUdC%pyfV66JY>P6W zuqc#Z6o88vgyU2XLbE51Cwudm;ER)nfq&62iH?M3??ubuTDBC;Dq6O_d@<50X8erA43 zY=$C#@*r#pwZSSvbH@13FuX|}1P<+nBWb}nO+Y>!@5H>j;xvz~jCIA?7{YRl6E4I$ z5}@sIb0#~OtS_nSCj?_P@Gs9pf%6OHra_4zfk3ICIiGc3KoDkKi@Vd~G6`)d;yowN z1eUebrNr9SXJeE&_Rpj)S#K{(1MsGd&)s})oQxlbOEYZo$GSMYUKxg}<$Qh+mLDu) zeijGd*enOEi}(uL3B=zePQoqb;}O%SOq+y5u~Tr2*HX!AJelH*1JkGA(DZ49<^Y5b z|3pD^^LlkK+x1I)pWuieqwR5RE;~yym3Mg%UJ5iX3d3(J>wi23u#1=7x2@n%#cN3a- z5o*PyLcy}J6f6^(4X&~(7{gZK0kVNusClf$Tt+M;P^q9<-$quWB-(~@^CcUp{XWJu zx|l|dJ_Kf6+W$4K8j@xW^JzdY=dLldNf@Yy|4V1a#N3{JtpnU&=HLIx6 z$+@oorvsy?8eR=jASt~E7KjGTTIT*FJzb?5Q2v;4HpHu%WmUu1xv}RrYSI}&n-)ajpD^On-A7ydRsCLny#DBTQ z(k$gH>Buz^f)z9qjyDjJb!`-2-bo2GYu08dLkTCdiN}AtcOPP74Cxh`YCIu_(T1l|N0x=y(Q2Zp!ttKxNd;v`**J3#=@ zfyWPSs{h2LL$@poI13z$%f^G-S5-s|eVa_tEzm6f(Bk5u{(pK0fB#(@L?~&zd-V*j zsvZ-X7ovINCNQsKhK3eaFsoNrSp+gYr{ON4`9@9zj>OpEWTrc=m&V|>M93%(Zb%xl-i=Yu}MnBha3FP^jK^zI@N``|o^Hs%vyB;ez9|#WH)jxb3D;XQ@M6bIyxKVv zf9%V}KfWo#ue);ac4s=OHpk+@+GyNZm4?NU&gj7x^We7Tn4dBl3)6;RZeVxJ^XbSm 
z2^Y)wA|y0FBn&^O2p}NyrOFrN!!mc=H@qNa(2TMyQ<_UgfC8bXd|~RJ(=})&G>PU1 z5rQc0r#_DpaK)Oa0g?txPgrixiaoe4k}{M}?<_}M1THOnmaDa(`i4LABrE6oe9mhSm8efGi`^X524wD^=l%d&nFkSa*|9P}v*vruv{ak%S8YbCW4#S< z$&S-gN&z!FNg|1^qO6|m#6D-;K4)D$pYO)>L|U9+&CSOE&g=xqx=f@|RW1V(WxD8K zT+Q;s*;H==vmef|^LR2jfWYie`QcQOFV0N&B`EvgyudR-S$u;paQ#BOH!2CsXL;{@ z8|sXKt*z1X>#q^Ka5)^Z*#R0h3e7(oiOC5WaLO-(eNhD_WaeUQ>`b^4qHX4s!*0$( z6{~qFJ5J7oXs5(1G;Y%w&6_ntn-)zKG`DV4UqN%guzvU<(H}oX+F^gZ9eySt3NRmx z^T5IB9++?UIWkB0!G_=|Smik$x%OY->o(@_9o#`db7dT%S==)S%%=&=l`$?j9qov- zQ4Tm4W6$lj3Ys6Xz8>(ICVt9K3fK|h^Gs#|l=_t@8ZTJKRjgY9=2z?tJtsUrEi_p8 zB}8Kd??nSRi@&^wC&59E&D2^V&|NIN@N* zG(xj2&SVE7nB^wW+|;Z#W)YgdW5?l#Xgl1)&++N7 zk*ec4WY`yI(y~3w8??ZP(Kc}PiiTfE2BP8$5T9Ivl#B{wW-mg{+~p`OTaDb@T=;nV z;fvv;kr+1#u@l89%j+62RQ3#+2<*@GQ$M)eq85NSj5D{~M&uELS7l-&eEnYVHxr(kqhV71+L?F->~j^ySAS z1)@v)U7atcUmf&XDpl5beyszb!>HuJV0BhtDUE?}sn%<*RReOP!dK9&t*g%ghDPw7 z42?j8c0C{3?|&8jLq2C{9S$u`9hkLsfn_Pj8fe}^Xx=Q~tXr8W1)`#K<8- zFtBf5+`fJlzyJOgZ{JkmuYVAXWmZ4JssOY2H($GQ22UT|#@`9M64^qBWn&3A3+&3= z_&c|)S2Ob|3$(HlGreE6skjBAUq63{!~4I*n<|-cP^Q%;EWdb+*DoI{OS1s9{Ql#e zMDWliRuN0{yO&Ry#tS6H$0(oDhGzBAz?2U*Q_aV8YX2@cS(t<$qaAP}(*rjb$KiZw z6mFJ95eNv)1cDbu?tJMx;a#31-pwUA&vhXnxhiY(g93NTgAYk=A?0p`4zbqWH?T4Z<^CJ)3fxUWQ;+J2*-PQ&^4o>j1cSWeH zCtOC4MrX5Hh#dbJZfz>Ujb*90M{vHoG70xqr{dxIG(6srf@fO@%L2*U1dwOp#jb3; z+A|BUsb7D{$Gh+6;+Jn{;w!%?O&$%yYw4f}zY1 zCZP6OES=2HBX*{q&T+u2IWBlznXNZR)UQe-gT-k*@L>wARqnI^y-uyO-}S8he#>ghUW9r z{nVr;=Lyee6TERg(FYf~JRR+dQ!#GX5MYbGt*r6M_=yNFUyhDnPhvfqqE@{|uOsz}!0VIu(a! z*x-D&JuYWD5SXXo2(RN9J1>>IuHz{V_&Iqh4kg*4lF;n`Ssw+MjS0>Cbi5I`HbhNS{RJC%c2SOLAaXXjC;(70Q0RZSKQ5!*w*Z*gbFk}5SpjrYzCp3 zc~+HbLK$M@K}b-U)n%=fzrZ!0`Q&Js(o3U5n}dCygp%p=l5$PtvQ zfmyVbOd^bAU=%Ut>C)NCi0i7Qvk_GT%$n48;WWj58YJ)7EXLFtXs!ll#YY;OY0??Y zz-Rvh8Rx}V#zslw>yq-zP`|As=Q)WAlbTe~)1tD$nW{4p=Q zz7L_<*n(XH&IYL-j7Y15_bcKf4aOk7p&CmcbSdlN3ohka?F-I4s_T*bR+pz%v71Fq zQ&qAK&N?)&*P)sF3pi6U)%K2cEW>qTbrx{m&UC4bA_6n7XDiEJpjmvG_wLw0?;p?-d8X! 
zKFku^Sy`MZb&Q%kMRVPdbbe?4UOjuPCTfxEu`;tjz1rryKF!kHvtd0f3bet&nZCHO zC?2QF!*IGR0@q5y6*NB~*a}d+%6G)uJO{kX6}jS7E}<(|fJva4F9N<;)ZjpY=3*}e zR(JU_zsGffXo2P$zh=IKn68>NP~11lA~9p?5Hw_dBu~9Mcf#bazJ#02H263Ynw?w` z;q42L$&=9Dv=&^4_Q18x^KpCqTs+!57f-hr;MuNuxWBy+x43?NYc8&Cnu|;8b8v1= zHjXb%$I-|q2;l#@9n71H&)h_s!Z-V{YWtcL1AtuaTfQi`) zFm>h%RYFK%XLE4YXa759hwWoMT4-V zTbfG^(5!rqHE5Q&&}xt#rF@v#rk?YB8Z^rwL-B_tII~U^EGuYs)9RRQ0|q4> z)lm!L_8{D!zoVMz@O0EPUdv=$P1mfd+Oti-*%J@*d<583G`-(XHx-vqIjd)bSPANOz-$snZw=olOD$fIzGBaS(I8-J~ z!4U%gQP#uJ2wqRD6ON|X;m4%OIGp5w%9+9N8rV}ob0a=SGn~G_UIO#a3C_5l7lK>T z2`CT5GZ}-iC|or>O# zEl`*DcU`l(yq|cl*Cou>B{Z8_!OXNC>YLV4Uux@_n8LJ9J(%%7^STXSR>u-1waf^= zrfOmoH3 z!`5wl-*44i4D>JD0-;)%VcBN<#hd?=g|rB0Tgf)t$R~tukr>y=3c`UOhi1)#;KS{T ze++yiU(-JZ&6;&uI8YzoCcN}MG;3@qjHFAB75-w{25Ic5tA@{vrRT+18ei#O@r|yD zlKd<84B1g7qE62Be*w%2niVWd4@2u^ybsM9%ozi-M(jG|Y0xYYEHkU*q>*Byz?Z{FN{)ko@ZD104KuAFf|I!{5IWWC_U< zf{y>|5@`PWU*4(^a~sz!R|5ehPO|{FfU!t-%MeJG8|lzb-zYE^SQd$8S*^c#QVq#6 z0P&As4Xz+e>z7y0Q1$peyj-0WfLPWug}Di#nLTr{G)uhV;()2RvMf~rbLGM?oGFhW z90lVZU*_V2EYnfHEFd(C2uN@92vxZRr@4G7%;ifVPhKb*6f0;}R%TD!C0u*0y;fFMj8H(JF~9@e25eqK5U z-xkJW17GF~!|jmoI|;G2BjGoG7+gmVfX%19Fs^rJ4DZ+)gIYI3@1~8=t)T_lTbQDM zo2GCXGX%r>v_@0&I%vw5uyx&9$~~k>ZN5qt~ZNmKx_R!WTKw}r0wrO-_q2v|yJ;_u5+mxOJxYQ7X} znnd1UXX-hj`CX1HUKa=AN`3(Lrnq8Tv>R6XPD9${A?Vpio1nmiFGLABXVIz!>>?r& zn^%FL>G_CGEk$%@8Da>{Df3stH$DpuTeekgc6>lEf>x}BV^JBV&CP>z@q8q0*{#CO zh4cBCwC8)sL?&t99zfLQ-3VN@2_b8?A$a*#c$KWz+%U2h6Pg!e!mNda<|UXqV>zZ~ zuE3cOdJQtX|@PEl#7=C3dA zNbLCCW!a0rxaNLS1I@~hS^Qmje~2___Qz|&l|b{Wf>69H3{lW5ZXIH2ex{(AfUM#= z*Z8~Zz^p9URF#OwydVtAQo*x;8bSFf>r`M_+KU{gzHktdpUn&9eacQ$P5`bEl&>Tb zm=jzG!0xy_-5Zxv2*fG=s3cULNcJHp`{P8KAEDU;CsW;VhWFaJBwqq@0FMb$x{w%1 zXbw>B8YiP1a3aYYg|1`Ks*yRq^zcU1(lv-)yb1wxa^V=CfKgt47(C4d14r3l;OCPu z!rl)7MT_Auy972_^I<=00jA6>h2yMJxDuM1weJqgh7HlIStGQvGDp7-ZPC4vC8m!b zNoWqn;do~pN+9^fyWn69fjQC@2MEGtuA?w>@*r#oo`!WE<5A%@9-~{-h0iB#u%CHU z&`hWlw+w;ilLY9Kv5u&WvLi50#fgYXxRmNhSmr&>dRFdbZ2R|Ff466cQH0$b-dl5d 
z-{ywk8KuCPko}zdK4*vW`2u!M7kjXS?Z)SY3!fEEcu?j{aCYOfk=uDs-p=*IN#_4p ztb@wK34;5HC>IgWz7?N$DGtN2kM+NwosLU$ z{Be_QRN9@)1o(V$5MC{jSw=%}InxO@X1c1G4+WgB@^@vzRGgk}hl}a%>}WaTG@*H^ z{{%Hi+q8}eOzN7WOV57jJ#Yj%^!N-NdkjUJ4&BkVU3;`?*9pyAcS7T49Z|ngdo-}_ z1j`2PVQSt2W){s+&$2n{)@uwCC3dV#8xWQW-U7`EoHdX(VF%38%nJ2P>%+93Kyw3h z>-i^y?aJHEpviOsC$V<2;oCj^vrP7xnq9hen=kOeeg-d3XxJNVxY@|n1F{u@iP zNWhtFqDR0G>Ch}ZBV1z)%pyszhMxqOHN6MRy11V)52T8sX+1rXeKl&=;g7x)Te!By5{7%h53DG*1%b_IIDuE zYBFy~|0B|^th(RHhf<(wv~CQ|dhFv5p?Qmdr$Dn@r@*6vX96+L_fcRLAD`-NhRUw? zonijvyhbvvOjV%F4q}CCD z6*#jT)^fXqqEi6PMsEbpTUFU^QfUe_3p`7m%}7hd(j|+qb?at)_w8P6+_**!{2TYx zNQ@XV7?sBk<8QzJf;TS(Pz7+`Qg5hN3U(jgyMZ6~?ZD+T$MMhK-{CLZ#~7G(XqG{L zH?N<^ru8duaQ`Y_Pc1QL#_1=WlIEbbmM8Hz~KZdkiEtW8Xm#nY1iTbamY zCbhY=Ah4UF3IA(uW~Qcrtxsr{SphvJbjHf;2`CO9g7nGFac6lDo-Xw_K(iknRPg%C z2+Sp}3T_oN6JnIlVv$^{_(O^aFNI=l);@`AkW3lY#qz8MdV2xzn1)Nm~Educi!q*!@@;lbG%qV!Bc{yE3z%oGdD}wc_c`m#rM?A{4#|uL9 z+u7_)6$awsTz~9N7HD?INNn>O%BN=0UeIi_T+z=Vutu$!?6 z)56nH$DCl+vN1f|Y~V&{cAYv2PK4&sUr&RzWeapOtAlUT2*`wH@oB!7FIIP7LNnX8 zf>VKJ9|cV>bYK=ASW5g}pXd4TH=$WsnnPJuR1sl|9St3tC0dI39Ba@lz$|VXVr7=J z#b??ep7#ZpFIev{xvb)EnPpCaGV4(WmuX)&0@&#ZV;x4|4q^FXiWXMwa;z(^$9dvf zf;X;Em*Tx~A%Rd#_?1v{C%NBog7S$pcN|Z1!HHB?oFz10O!C8Jg7W2r5JGb>p*aZW z68vx`p6w&n8Rs&CFmuZ1XwG|dTyQ8mE^Fblcp1W0uRtuH(^0EeBd~Y@+!C_k6q<^$ zUXkc)=Y>(x=?a>s%_za7j1o9ym%%k*7NNN(tgIWuy0I17G_b^gPVLdv(hNaE`r&87 z`Y~S1L4jtT`v5y>2P2$uFxCb097Z5x!T@X}G_Q3Vi*lE-__|qLcoUj`62KQwCNzts zSq1_gk8|X22UJFD(0n|6A}%D`*FdxKVb`Gf*6c{!%!$OU96lrGhT&mu7@p>a5}bos z$Ls(pX!a!JvM!dm@pGE9s zEeyo7g~9B^1rv-NaV5h^bI+K~c9Z3dGt;NBo!H|-nkz1*IpO>aSFDSaX|(xnkqM0I zSisXa0u{@5U~buF6qavCOlAqbatg%fHhwBDd+eOm@Jn6*-{eIIOkW1~xKd2=nTfBQ z6EV^*7Q@DeV(?c!7%<8WU)hJi);}2&-QzITb2`4V2|=qapYc4_s9o2J&lQ1cON|D2v|B01Zq<@Y7 z7tm~w1S$}4)(7#4K~zEvzYoo1Dzaq_OKGsHL9e#{Cvg6NgFebw)zJOFgJzk>LI-A@ z9kukd{M6Dwl!*u&$_1$c<8KAZ+(zvrTnRL*3u9sJAWZHwf=M~v7@FntzGnFpxFx&_ zm~wr~28okQt=AxKiwp)7>!`{gVNMEHppnOF6~{cVQ008a(5&EERg`ubL zX)?cRDgVoPrBT*sCnaH5q<1I#%2KArRv}q+m)!4Lbs!FPN2 
zVfPO4HQs`KyLVzv_6)RW+z=B+k5aJw+q;+eMcgO|wSWD?0LH%vG`~@kp!~3R2hN{7 zq$U_qv-=rCv-}olCLG_qdLDasZpM*=-{Tmi`z;GB>psmV4*!I8D;G0gJCrq9{uc=> zzp2tB7G-e*k*TzQGPRa)Lv?DaS^iq`x|(w- zYC|}d-|e_;FXGFht>J%h2N9T7KuuXw8erFfIUo80c~gco#+sR5V}A5NM1Iu(R~JbP z(*PAEQ-g}=SQ*>z@0BIKSFGoeCikLuck-h z3tdK)a$iZ45_pzrkRQq5Kf$jg$}|Po#I@rw!TC{1F!K_u+%#0w3~?Dz z0~48-QXX6C&X=JpVN8Hotdinp!8&G|GMG#(&{f5*tY<bTx9;$r zISY}5=8%;6h)G|77y@o=!4jm*TM3`&OhR)j?rVZc!4U{txgHKBOW<6#0FFh)m^dQ` zZiO;%Y#t_O6~R$vm@8cg*Rs{{C|d)U!sT$yUkU7rp{V|$+H$K zw~R>y=85S9>Wqb$5}65;hAq*db!!B;IKtD;4sMjAtv$Z}Y8;waHA5$M#`dL(k8v2` ziJe1sg6g#2|PbzN9_sEEB@~idE-%O z5c4HgaT(y}gY(G}kJy9I>p@_4qg-%}x=LugoFFsdX;Vs{NhLHBmQSR*DPTTMXgWz~ zI!lPYAeQDNf1DF3cMWesr#pe!6_xSsI5Q&*abFEYV=FU^@(V=NnoaOsv=p8t^YB%4 zB-(yC293H8ME#cCQMXY?)G}*{I!!ur-&iUN6IP-KWUSL;TMVxRd+5xAcY?+sFDze6}%fr<`-eYkS za4>p2?w83|uP7W#o`!D&zalg{;Yhf%(%}eu9FMoh5&pM7VKO^FUigWfjLBWwspneL zy0sC*&g`1diTE*Q3eL}TW_y-SIIpvu*Q}mHd|vaJa4CHn>vL)$Iakt;<+{2SUZXZ4i}0!}H3>5s|2E#Pc}Zx} zeXMs(#r*MXTA#_f7OjbU1sgh>-O1DUi#3Qt*dB~2xnD1;R$lZ8k{ zD@fE}QHN#?oVT*TD1qf_Nt!&rs!X2W0LlLo89Uf3@th)U=08p9w*WJ>TmNk&RjG1* zO3GL!Dbavg>lGP8vnB~eCt5`~7Olo+c33oPzpZ+`g=rl`g$3ik{{9<&6BmfzU*a!+eTzSSe~aG;%)e8A|NR|qT)u#xzTJ(} zM}NjY2+4mn_%VwFgmsBU`R0{#%2)Z=&)?(9*%R2ZcDZuH5LlLd#5LpC!5?@nD|x-E zQT6z4^`Ji)9QgXVEFW|K8b9bi2+M!GV}9Pg#G^a6ar_`3BCFP*znW364w^N#L=(z_ z4^FYVj_%V5-!DkT@r5xsQx<`9#gVvD7>3)0L40V6|1E()d?e>N@dfU}-?B~!5Q`RJ z;4xu9pjn#<&>%vA?h7aq3$wg*9ug#;<`WR)H{s&pf&g5Y2`*zM z!fEu^@UpXkx3e=`9i1>}z#v##TB14Yp+|iaeA?IypIX+X=aX*Ei5pm zbpuT9*cgu8o5Q0|O9TvTkFZZWB5FuS#C+Zv(L*~Ta8O6M_UiyAzC>LXx8f z>_VuMh#KyA!1}+!d>o(ajGKi4c*gpCRjBRrn9%$*&mL8C9oUf~43zuf$ZTgqvlF(& zxMI2QRLq$2DLOW&t1MjNvnbHqrtc>Rn3IRlxn+n*EkRr+p?S_C#1<|^a^VX25t7Qi`u zJ{*$^U>`phQ^RLsY+yRZMC8Dpz&x37K4HcJOwL@2$r%F83os=*8}*uZLHG7u5boj% zKejh#M_1T8IAYA#qtpof_JrndQUjIM_Q71~2ojp5b0~E~NP1GvvX!q6>i%G7MD7p% z|BUtioNeK00ow-4?Fl<1^3_1=0I`0h<0D{P>I<3VMx?^S$(I2I&AeZ#f!SZd^6TOt z!g3&9&gXJ|u$sdADLZ8njYht7+~a=N3;l3$ju%d6x}h>zChYLU6$0axIDuvdT#vP* z>>i&ikT?u>!s 
zoZyi#8&1>bz%D5dwkf%A$t-|VJUdS9yTQC+BQ#(qziaz87}2Q%2C*HL*^I_9LbJ^N zw?Ep65b1^stUFeEj>AgV;V5$(g|C~Kz_V9d97ymdkg{_S?MaEX zSuD(iXC8k#+ySQ}>5R-e+$Ydf@ePwu8kEyf17C%Jz6#!MZQ=#5LX{$Kz~pILe98>`M?1zzHr- z#I0v(yLQ`HjTsG#k8F5y3P=dCk*sAk+@WqudG1 zE;t-+hl3GQaWugON2gE2K|=BIL@(@#^TXF2TB*rRTJYH)X*UWhq~2!xXZ^+DHeebKmi2Q+Eg7L6LS-YuK+{ z(S7g)wCnQ)+EcB24nwQ%L#WTtvg<&!>oEv@KKq&)jsC+Ypx@9j=)lfbUGqkKMwqgF z)Q6RMJtV|Lu1+Hcr6f|)kHN`qS?1`#$;Jom4qjF=*2 zrPU_M+s(9g6K>@z|8Dusb)A2d`>AlEaHAoQA}z$7#+bUa*w3OGW1_KywcZ0^;WPuQ z8E=>K8UL={PE|9kAuj^WT7(fYnFh@@2k0Ow(%mw|%3KZ2Vs+O4H@cAOLNdB=b;9cH zdVvz0*K1a14VY{ET?Bv>G_#OZWpRHEnzi$3JsVySrSfIS$48-=z^6$o1PO=oo|T!v zEYc=Dsc{k6!nAc@u7PHzX{ba^ENtn$s2&jW@ITc59Gd0YjeZ-+xUOo-ndNE(&AMv9 zS(5;>xN7_vm^EnLOi)*4pz2Vrd5Z#N(eAC=v1i*3?AgI3_08^GaB;9zFZ~6j#rXgJ z-#_rTzrDlX|Naht{R@Fv{ErFDe^D+PZ*k+w1swe08&n?IkH7!+LYq?iBhW1F7dI}S zC1CHu`ICon|HdU=>l&#oGMp*#osM zE*oO$mNex!u}BL{Up{vl2Y&dL*R&OOlWmk=X_H14Xx^+L>en+-(?)k|WQjF1LvU(Y z5>Awa;$mqOE*FR5dJ*A}@TE-y>qZfhN?i#E1VTcB8ja5v|HDFUAfN(cBgxA~tQQaD zlfEu)dn~@ae6io3FVGx>bF+Pr@^x>tG1aEF?%1{ioX1Uu-RQ62GQ}2dwyyAW^u**b ze5f|C!oX%$DE7C*ijb*T6Fd$Z!zW=&oGl;P_V_lz1wW^F;@AuyRL=6lxg3AK-~w?o zKbSAKVB8U(&f+jC1h)wv_lgOp1e4nW8Ksffm*J0LEv;Zlu#>Wx*xed?(k5U@{7^!5 zZ5%5M!ONvVc(BkD_rx{B2%2T`5CLX!XA$=avB*9aYr5)K7@)bBkSx$#>Vx|N%>-kW zh5?k{6VnrDR+ejmnM@?3{Fx0BaF(ff#lk3q1fPf%Skf={r3lUnngyc8!pzPJAz2GQ zrb~VkzzOBrBpCcn*n2debxf%a0O9{G^LLr~IWpTG*V!Qw3+YRNKSK4RLT7fIT<|hq z1{F%a193Rp6+0z*MvRM!)0{PBFx!}k0h%?R=B8b`BP6!~Avq<8NGm~XWF_Cuj;Ph78?1*;!(_s=ZO&p$$y$g>sf6XU#h98_4x6~yXxO0- zdUoxJP&YROy1K#D$qkNc`AZHD!$z}N_ytJs-(&fjYGIR5vXOPRG# zwaKwoedz?;o#`$D-T#Wa?6@u-}IB#5! 
zamF=5*|iu)LbM~H*_qHxa7+O?rCDf%6sL~11GYN4s9C2Pg#q-`eOPv;ICRA4@ zIzpzMmWe@5C-`7zv=1it>yFPykHz@#WPIwM0P{hkQHy0;t4$A>wC)9q)_u{S>tM7V zJ{AMr{V_Rl20?i?>{92#KCJ*YNx5*%D1uX52I{wC9a>qz!m=I)b?t($+O@@4OEYZo zC2S`8<3O~FvN9hc^d5|K!q3!xcGhN%>5u5aov_Sp3|6^~K!rP@xoKT^_9QgN`=gS8 zc_xb2LTEl8>4=Nr&bS!nf=i)JT(ScZVS~%l?eRc;QIH5ge2(#%cAFjkt9c<*D4#2Q zM&^g%MnNcU5t{GI7!W?!BsTU-wu85eJn(9%E2(;@#RrT*>jm zf$*tJ!vn`61)5!Pi1|N6&^{J#hogx$ESo7f#SYW92saFEVQp|vtBt|kJHvb2Fl0H5 z#yp?NC=IYdd59y53Ed@OwkQvthK2r9u*iR!8Z5pz#0ENlY3^H_`es8`PdbS%v4`ePL6$Ruc96@Q@~>76)PzVS_)TL zo7XTOh8_+J(ioa`VAi2o2j^-~=Jx7!LZ2=L%{+$=%{nkEXr`2VNHsVcK{G3<#=6Yw z5~)uaOhZ0rhd&^ zlwad-zZ0Z?dxu+BFXGt2A90G%{MTPzaQkb`CF28V7SO$T<~a84+>GlNPviN6Tlit` zcIIsbzTLG2SI?iqoog3x^6*dCw__7Fu3n0x`@dB)0lt2EA8!cDVu^nB>^^>b`&5JT zUnPQuOny>@C--mT_@SQ(<$GE8>k%Ing?grS_&_p4vu4(?vampdIws2ZH`QYz&a9q+ zBgKKZSRR4%MS%q80NgJ1$Gviyu)~dVRk~m9q2dt>*a_6hNN|auA?_Axy4C7+4^0Gy zdxShC_446y^L^Bm&JPy`;q*)|%%1!iI`d^H55D%T+rnki6u3^E3=bP8c-XqZ+tCZ| zHV)|5ycwMOcg5|s#kjXP6^|Ci;!#Bu9xaF>fJfso^>kqro-T;MvxQ-Ju`m=bDnjt0 zJQ!7FL0ksm)q+sGTM~jlmImOTOZ@QvmWANImq+0r>oag~Sq8>-Xr%m^#dRfn+A!Q9 zlq?DvjB#yiVSf(OSQ5NWGEW@Jwjm%$#0EdSC?Eie6}o^OC|;L< zv`i0ue||6y@CCYq(7Z9u1xx%V5t;{~FM&lq3)ihJQE3R!)=l6uBMT9^kCScHHXME`yiD`*>aG$#bwprzvl2MLnX$vtmbs=n0 z%3zz2gC^ZRLC@~J;73pnaB+c`n}cF)2Sh>-K33Ld)(z`Ze36ygg}Bj3Cr8Tc189~>EhOf$nq-9SK?7!XOhf|EtRD@S zWomUoGuxU>)vYGQDP>!%2;;tCY5?dBwt*XL50`U2aCQb^E7cxnk_o$n+Di#;1ZHS~k=t`an_P?uu~jI67(i7pD7j}y?2Q76-!aXN$BvYc>crV}m@paq)GrFh^> zng>pEpL0nLxST|2j&sHN)Ih8Zbwi)_t?;>xJtik)VMt&ydf51(!&nD&o#2H2wm$gG zH59``;xIZX9TU^DF*$QK>mvuY8TqgwG~1@+!!^AS&as5%wp~!m(vr|*hOc_{#+c?U zVBe?#z76!m(HM5*VhM>c;uq|QLs5=6KoH#@>w=kI_eI2@4p{Cn7Hd7ezygo2c&sTr zdbGj8cpsc5V4jWgz&RdsDclj)L!5Cv$c18Og5Y>5%nsKQ9Ml&D^(^B(EuH?`?C@VM z2&O`Dh0uJpAdDT!5Zo$~u>?Wt-j>d1Rhb)JFC;iG^}?G)?8Nfei!9F*(GEBo;X<)v zALYZ}gymQ-LbE4M#Ji$WW+`Nuo|#T?HbnP0#rl!ouNOz*#PkXHDdKDVx;7OrSH$CR zye$sP08B|E($fIVVs*DCnA_t>oIT5B8crm-Vr`%u1~jqK+@uVhR~dZWn(Ag+3q37O z(6fOVIyA104h?IeYyDd2VO|Ga>(xPfs!PMV=uo#7-1@i1x%nYDH`5sxX0Y9)+OoYl 
zV;kGgfM!;NW~OUVkFZ;xfNKHs2G*!=)d($`HingXZJ3+XQeQb6H?%_Q=FQQiLkDzd z*$T}F*{vHkLYqbn(Tbf^aVKnUZOsm=6|Cx+z|yof>es8Q-eKm2UlpX^}0oi_?~WKB1y?^`Bh)R)fSMtsf)sD}`^^Pr;sMd9HzG5etRT zEP$--|2{N}zq5j5!**Q)ebvyc&Zjj1iPLNh&D+H)%cWGd7RiIh2ryG>f)rMOW?hzs zrUCQzEj)Ish-pcAvD`icqyh+3j zLjd_B@T^TVV<=x~tRGmJb!h%}&u;A6wiN^V_Tmk1itoSK4dDOczyA3<{_*!;@W<~O zF#q-UxA^;C-r=8r{S|j^T*j%R2dJO%%Zn!}jGP`lL$@%?QmnxeN^bk6HOk8T@YWR^ z_-;48Cou0}dGG&bC(ax@fYaRP&<}fApR2KL{Ytg(uP>h{aDMURE?zyqkKf-tBbsBLo?riq#8^EkyZ8g(j1Iq@ms8vS+;)H%3@$-UY zoT`Y%c|ziOLf7T_0k~P_k2@87aa0ftDqL}IfgA2tcu;(iC`&1US-@GWF$CnhgeZ{) z%ZCk)+NH5%6nTva6iG)Bxa&Z_zgenPDKGILh0thVuYD(Cb3xe=^ zK`>rb5SA-~@q+&gIKN&LiZ_Jc-xmkt?@Pk**ClcI%epMwU6F z5$jG#LXGiQ4VbyVK(hdI8G%{c37EbH%*+oV?*aD_7n583|7MvxZkGr+d#Tv~&vEo040npdu|JK_oaBrR@h({8KM^x055T|{X3BDJ zTDvxC6Pjz)CBP)5AhvihLQ;znJ+llka~2>*e42|_A~1F)8nGOhOgqYF)=j{u5&8jIAb0>W|hM~cOgOx79+Z7DYXPKMT-%hzXU!x z3*kDO`%w;A3t*qV05(Y#uuU$3UHlw0?J)q|dh~>sy)D7n0iG^yaCddXv`G`us%cZS zWL~xqZl0Bd;o)32^#$P(+xLAVOS8-X_n6Qu{^JsBS^1hv$3?ylh;ggaYytr-*;zsJi8MlY1_3TpgXYsS91YOy zj`Iqd-EbzAaGBzyL36wt{}0A|w<+k_vI)8k9gd!pTqsX`W*dNE_P+SSIS}LgVlX8l z6%(eXVRG6`IA+X+WBOb;C}7URwA6gqr4_=3(CiYIi3aVuq7K2!+QJ+Y`+kCPO`5{H zMKk;u`bnJn{~ivE25%RN=6wyBzS+Ol>n zbg5GdgB#Svr`Bc|*r+bSxi)%O)<3#A!WS;B;{?&Skmc z0#!+9K0|=sLg4S$w7&Y9DKpM~@x^dVv$e%%pN~YJzWvdqb0^fT%X^>yTUuJ6S+izn zY~2(snzcZ)#!b)s8WI<-fe zR!z`^`EJ_46ipgh!j#t$7Zt9cSqy193@d1+bZhfAVVh0jzM)HF3GK_g5j3wSuaQr5 zK>kq145&CoosT@nrEaAaDgS1pwDrl{v~hKsgzLn2`A-o%jG$S3nm-JXAgpElTemd7 zx9vYj`AZu~K{H{wdLW=$F+j5(>P?HGCt}8pWLzl!S(2u4Fl>>4vkuS3;{JqYsW1(i zsSP4#mX#!tF0_i#z_}WlDLGz;y=uuef@uvjYk4vTW~~>cfwZKp4X6`nmUzWA&@6?n zS(@396f_g;w~8dMTI2W-kUyq>1r8zba^!Sj#f0$>08X zgMa+>E&iJ@eedRFR1$Utnx8+qqoCQ?Jwu0PakKdO`#miG)p&OQCazzo#Ni+JDM0>a z2g_jdTKxR&E}T026OR4-9lqVU8Jkuw#gBWpUD8AL-?%s|KYgS_Q%BASps~hUDr{AD~6&kYlZ(he#4TS4#Q3uOXd{h*T zGo?YeKnT1%-yb*2{Rzn8e@h@PCo~hD?qoXw6um@?>4xybQYd2ok565K-K(MyeduL%PIy>E28jxK@5R{+ZM;*&C)o$B@n+` z5|7`OPsh7u)A@Tk-mFZ=s|{KBb>l4jzG)8LZpg*2JIio$O%cX)Y@w|0Vp&{hKN|m7 
zo`$8KpQ8_duaEP`-~9zLIwmC5GHd4oT)s5lMytFMe3n~ zW^vchpxIcu^;slwdv$QuEW_L;7D^qO3nj#v4_*+MMO8&I7>Q!~&&A@&GJhtn7zF00 z1Y(hLui<|Z+D--v-e;Qk6*PMinmuuY``qHXK=Z9~ADkoT9V0Z~&(j8gJ})33s;C0~ zxK|j=x(_FO$Kdw7DC|#l!;VB3tc!EP0^hNiF|jX(v@%y#t$Kv!I_#*)qz7Jv=ERbv z@K4T1OjbFfb1DeUi#2GD%R-aZ9aMYx+`$8WOE$oX(Ck^U3Z5m4F_y6Wbxbxq@|VG< zXa(%kieQ(R2j`@GIK*YcHarcJ0^${@^&Dk`HiN!Kv%zEWiCZXvEeCe#C9s)JSWYT~ zO=2-@<7cC3&;ID%yC-~{Y~f=+4KDWf@Njm96Wd~2zQ|>u>PAA$v(ivJ%4O$+9R~s7 zhw}+lGH{dk!lQC_CJ4{kERh6ecBEuxNA-oEP=jS%5=U8q^E`HX*bc-U<9$mr>y-6Y z#d>?eI;>)P0?p4Q6rFrISrof!ZKm$N~|lc z#fWv;9akc`Ey9iR&_sZ~6iw(Qe4b{Von&W6EX=0}%%^8Kw-Fs+M$+ZJJdFBjauf-P}`yfOdGU^ zRjVFo)^9Mnj+%feK5_6#V!b8Kfg^!=T3P|7QqGx$a88(khV8n+)XV}cE$hQ(z#xon z)CfMUn&I03<~81nugfn}5+d9Q&Ca+S?u6?huDBjVzzuZ6HA3^HV0)Gup?MZNuk0K?;PXwOSwd*u znl)Bm zd>Yp?NBc&W_=KQ5eN-PT@*0a}z7w#*dmPsJjmM^d$=KvS4XeDTVS$GYid?5+uJd^H zK+SN)$y7U>&T_%VI7jqqVhJ;oI%-<$kdP3p=R3J#$qEz}m%-CJ z2yHs`MC(qyFww>h;jtO;4^M_)=ydo5#lzD-3SI$`aQ6v?Z%{OBo&C_Q*I=}4-34BL zVMxo$MQU~)X6DYr%)DYmO;1MqE?rPt-e(Oh*a@hOl*BlEvuk@bBPdQVNC)Q{gWLPk zIYgv|BIdr;obbLrWDku^= zf6|}0i`VfUH187MT&}3X)UB2TNIBTJ&{V-O0b>Z&_2EecCN)r#i+dlOjUlc^+~3g4 zQ6nwcD-vklxt{9;x*BNKWDL!J0%j5Os6(?v#t;_}b(|42?~wErG}lzP7BxeIW^vCD z7ZYVQm$DHVpn3b2H5xqsFQ8eU&QY6<=Y4P%2{7x@%Ul2N!`{?;(xKV#Tfde+mocwL zm5=66uFJSwCg`5yeW0}jQ=7u|5N;$|Dpl0gpZT;|MSn^ z@aWzR1@#Ia~+Sq~F>w!z)46{y-+ zh*vAK@nTszm5GN-GI49k3|wE3hO1@MaibyyS4$Fk9r3ta6o>QqQ8+m}6bI9M@olmv zwk5h@bBr_AMLA(fh#krUY>?$R1_SvrXtERW!tPslAG_!64M8ZFSQz?ncSR^IoZSy_NJIQ~!q zy860+vl?h-(2GJ1nqLSoOVkslUu|VJz_LgNtUVUMBcMGrK(lW3zCYg^cX*E5Wn6Oo z0fFu=_dA#GgtIwz%EBtpEZ6eT0L)@tzC&2P%g)PY)*@Ylt=pww>+Qpfeg zh}Pz4R9k~)@olbURuBFu=}4Hj7=Folh@4T17(#Pw!7@VgDg+Ul8ynm+22F8-|KbgB zC|VBB(p7NJUx=?GGchb65ySoB(RRdC)M?%e^_usFd800^S%jD;RP?x zmhf0ctf^wv%CW7=u16wI>W z@xt=||Z0!-*I_<=0%9K#-qqgY!w$bUU1v za%IQhOuPrn+!Z(2`F>a$gr|#Rahsrif}JAq8+r7f1n+g&fM;y z!ti|??1cID6EM)aff@+d+R_4pS~Y}IkH*;QKOX01vI9j3zLLqhXL(&smPvA=aIYX7 zw+f?iEk6cV^J8%>FB(_pgy2%P7p~0mP{*BMJ2=f_*Tp!gIL$I~iAfz3czXF@<(f@c 
zv}_~hmn}mq?>)b;3GoYI;&`U?^v>m2Fo9p@QJb_ ze;}Q=s`=>ONI$Qv>&J<7KGPXZ<=YsVtNn8g7bO>?)&&T~t-UHKBJrI!A`7V{wB%yr{B z&^kA|q>Qz)*Zjy0&@49ze;e1Kp03L0maXqY^Y-n^(kzX$ONX{9T-&-;%klsHzklMt z|M>?&_^pbK{LjB>7H0wJ|NCEmC}=)?{2;;gfEu{>R~42{CP8^o4a>3=i9hq6Z5#00 ztEYHOz&>~45DtF7ry7#?ZQrQeG(<;!`UXGjkr>k}@!hU1c*^sLKlAIV`^?8v1c-qs#{mNXpHLI z3J1%Qake4?X9-_d<^>R-0tvY!94=9w_f^bK7 zgt_}->X`AcHZ?=f=8cdQ;EYV?(MX!~86v*yhoBLC;4`8x+=lgq-GFYG(5)>dbZvtP zom*gRyC(R$RU?dSRv#l8n`2m`y7;U?ZS*s*g&ro98D&`u-MQ>nzb<;#=S!NP)tq@r z88ryMuFAr<(YENv-}8f}p=xcM>L93rbA;ymWg0Zgz`G~n?^_grH^o7C#pN@J&r863 z#uu-e)ldA41)>P48Z`4oC{{`XFss3p1}OfRjGG zT@Z-7MIpE~FA_KBN8@g39Iod_;^zcc>>Y^#3S)#WHG}o?Y zjK%({Gv#@#=Mb^C#6*rFGjweE}d-G`v-z>yd<&K8sXBM>}m9^why!4jHoMiCrR z3*nF?z&sE3Nkwp+J_~IJ3`YNfpCHg(zHms4=82d*_A9vCPD785ozxeZrD0B}T0)p$ zIp~A?1f(QXpz4TJL$d&m7X3nlX4YMiJAqjjfx8--HTMgF<`>GEOjs6+GRs}Iy{r&` zX4{fUPGqtg*73Eup13g66(pj6v!b!q(spHp(_^b<7`%S`kakkh)zvFybM`8YxML`(oqQI)KNbF8}yNvubK zWee-fmv;ab zEnkm{<(pBqXgy-sISPrFiHyn+npBFgv;_!BTZlj^AY&ndGPxu``_CvxRL&B(hs?l$ z5q9`$iVtQMEXT~k<;ciiicBuk3zj0MY$bfd6H(W^Axur{qJd>S%$kuwXqJJ1!YkxC zofjzW3p&MZ0?M+F;sY_P8H|-O8QP!Vcs-B*7t%u(>(MQCegMrnW2!kcjCe)!ZV=wo zBtFkt8k$s?IbCG44`I1lA2fq0ScLoJVG!RgSf>0be}NGgtbcn_Kdm^BIDYL%&_ubb`v9W?9UtWDSxBe_08&{nVA{7SeRjDycaGk zP9QY<;o3ZHMmZHvxXhbN&ANP#u%$}@btzvU^JOqp4KOQMCam2d5Z;kv77(x(^5wHA z0FStQvXDTzIFz6fuHuWX^_UFXUhN^i#622YBW&U{L{D|VbbC+4+WR2fB>>|_j7LMu zCa^HGAbizUpx>}gE#;SICck+djp~`AzTrQy49Z|WSvH{rf?L-$ArPCPE%%eKOKr^S zqMcPe^{KK)lltgQ^=jG_t?P>qs|jL9_QR_cX*iMYfuR<)kn1}ERqNt-y`IYLL41Sn z6L21|b0Pk#;(GC-H~??wg(zsQD)ce<8FNVplF2G0B7%g06L1zwvy`m@W^unTa>4kR zJ_gM-;LP(7as`;5&DWq=_It`Ykils>Fl*3Es8jJ+HE5P-9*+rVGW!2Muj7GS3){f+ zLUAwf#npUgT+X#4G&|urI|2_ST0lXtvUuH`AAzf-k+@wRhikc^I1uZIy#$dBvCb&= z8in}b?bu*qx*E~>1oZ4;xMUKN(~1-<+b0!M`EX61g>FNJV#M&z5bkA%Am<72aF~pV zqlRPZ=r7T;QzvDis|c~f>(z<4lk2MDT0awrk~(8KiXXEE%)B3XU+B;*?ipff*8G}{ zpqUW;yjX{3FT7+LFIg8aODVRgmt}!$FH(;J-u{po2(M(hXds*Bh;xM7bLnn4BcPjP zk4s55xRf}Jz-+5r3$7AkYM|MJpzKa)CMXh`&vW}lu{cu~2%6^!U}p)xXHzA80cR%# 
z&F9iIU_Qt73p~#Sfo2Ik$9*oQI^qhU`C^hAD%1S1H{Kf~JG4Tx4&Csnn;#~o&Vfha zLc}atjhMwN;W4uSw$WK|j+q6w#4OmQW?{;#xtKU(E+%EnflX!s#HZOIy$~Lmh44+8 zg~lDaqE_8H7{_`@a&yJlCf0ChV2*u(1oaeu9Av%^B|76Mm&Xa6C*s_3oX}k2G#bGJ zJ7QhX6nq;y4XcC4BI1iK2>7HW_7iAN6E;sp5^`ALF&4Q|6637u1M zb*6`c=7;lm@5z@P3ClR&jrSPqP1Q{ZE@yaX{>bqXubSl(=ZTB41YM?oK8$4^&h3QR zlLE`(1le%rS7z-KKjcVXoQxJ|4kU1M|3r39*zviLNN{Fn-~xeJ{Kd~FyWqxbZ`>v@ zpU)&{vtxQfIti@L<1qyAC;?|r9AzCJWXJBOAbT7j{2vbY#LuDZ#0J`<$aXCHT3M)e z+=|b@zK!d_XFzKlA?#nul$rhugyws?;j$Tt!mSVc86xN3@IoRc;Q8d=96 z9hhro6nyIM>Id z?_e+A8t7ZaShe7YnI_e#T+Ut)#bmzdD08Jg9tg`R}w=*f0SuyIDJ zg9l=4+!5;FjnTu#pkagNs8`PlGF7XEi3zNjZ!-r7RuAvngo77S8Er8s& zNdxq-GDCNB6Lhav2R#VMeb_nZS-&1SSec?@eRdY=o1;eq3-qw$K9+UV#3`+IKIQyhYq0?lIi zC3HTY@2v)VNrVLDqb!h0s1ggWK(q2MHUP6Wc&|p)kW9&bO0v%V#eykQY^w+wJdecl zeM-QR=n78++6ZrIQWJu+z_LVl(5#_i6%}aLq1jJazLoWi>ByuuPYVO^qJY3g5Wil? zmujv(9_Ko$0gev{%=cB)fKXg33B%>GFx;w$#^oG;9Eh^TH_^@tnhV`WB6esyjB0AC zLbge?mfD17iw2F6Jf{e8c?$^51qjM4MfgmDbIt-J<}F1?d^Vc3?x>)-&sUS-J%1G( zbC<(4cNx6+e^|kKc=E;T6q5(%_QriLYBqJKOl`^I6EV-QB# z2VfF^J16JBWkw-vQu1J*Qa}~ME;$!&1h`(G55>IoDl@W4Jm@Pk39{llnre>=87>N3&xyrXlp>Rk zIO6j3$+!|f1y|y1a5dJR&`i)F9A6KZDJbiW*5LEzXW#K1(=#XS4GfpOwWf$MjrgrW}`{_E~ss0 zf&h0X6o!VuzEvx@Hn7C^fdus=e;iJ9!;xe+98Gk`F+%4tLgwKFPZZdE0snq&u|9Yz zc1KLa%AnCm7}p2>{afM(b|z$ElM^h9(=mkEI7gg_ml*+FP)P_Cm_899R$5y^vkU^{ z{mpwqLN`9+JtwitU$G-2@v(1ad*KYrtuoP-OAnk$}xD znq5_diz5;4I1)+Fjr38_Tp1mRvvFR8W|p(ev^U)r7ZV(DQGhv~=}eani&*zPa7xW+ z=!wcWt`nFgKJ)P?52izSX5AhLb-?~$dmJQ~AB^zCPlV^)fp#c%7>~Yu&Ph9N%4b{e zrWWuS+#W~MeQ;r>JFd#aCE}8k%#Ki|2W}Sx;A);1uFT=RKid`82*}qYUUL>ZTZHEe zv)Nt<%O}}31e(`G*`r$nGnkv0D`<9c_d{{TYUGu!!=lxDP_%Fp!jtn6o>GXI85Iai zD??~{1@HSs2+yLjmmrMGkeLe*J7)(HqD}BT7zY6{m&PnWH`rr z(7a(awdT*z%(SYZ*^usg;yFGB&N2|L2AbEZO46ZO{BQmQ%?g}#i?hJ94$w?TmQ3@{ zs;Pt;ePo+d#s{#hOSXRuxMiE}6D^+(_89in&-=a%<*DjYuUEO=_d!`*yZ-w>LG!k4 zYB1owT|3dgPfzxsYoRDF2g{dK5Lkaukut>E{LjDX{>`r_0`YGzaq9R196P+9(0l}E zE05y&lRFBo|1h{_2sDeO`SQ7wIDhIWem8`XyL;_|n(SmJp;sm~k=X$EY-f3HUW+p) 
z4yj`Wwq+2ZOpNlj>Iq?4o6tmuW^oOXkUBCT;`_b3m4Er_CCjj8#ZvtI!*_@b3R8nY zEKTdc(xMKm8(N@cgGOqg)5K0Ka40VV7fZr%l`n<6<)OG=9z-w{NG3GPw9MraQA3v6 zw)=eHi?vw?X3-tp%3R??Y0!L^pnRWtw1gr!KUo%r8$8EPQFfR+VKByYYK+E&<{ph2 zAbji;B-y$m*})Z&whjn(a3Q=5MIa-+4;fr=49+X7kX+<1dds`#$tM<6RB1{2pmB3Po7dsX@Gz&a= z@MY?c*YkqaU_h}lzbMh5S*A#SN`Ms!V99jZ`otzW2>&~1t_EhqzB)AP;H*N-5qzJj z&-en({ydLn0TpPj0cIVV<=oVmDkZaKNKH z2Ru~+0A(uaP(pJEu9k-4Qbh=ER)pgsAwnir_<_K-k#JJzHWE>vw#F!H6LhFk8x2iN zRfo;GX$xfJl^}9<89Y;q;GbTCkc?7<&MHUzoW%$xG&gVCSwVB}Q4`@gZzb$!FM%7u zIk0#w0_Uto&o5nI-l`uOcNqf9Hhp0(v&=Q=0MjPzQKw-W)RKu5ELxy;gVv~J-3hHf z8HMq|32@Go2o3qLB{bXd+;*wCa8DyN5B&^N$9{o$Ups`kjYpvSGYmAT8i`k%2+f3DiD2?vW@)U3X4Zv3Gs{}aRh1{9`H6%8d=Hu_f|r73Wx3R>%@SVj zRe1p3EMVJV-APQ}t679&8Q7QVfb*FI+e`wK#7j>1Pyls-ZTbRX?RT#0s7Q8F$Q1TRGpcBqSlxeKhD^U(w}?sJiyn+pkq-y{KZ9hfP?@_8Ma zQ@L+4&z+({v-mX2^vPGa|Jmv8sGJ#sxi-VmhtSl%eRr5PXNRX*2bi_#3bV#6|Ax(A z(zG3FTXS2pt}yS|4fQCCcD>PSf+O5Aa^Xy178ecsv|M=1$Y-0F1?#rjV9?Qn2O`7X z89wbh!^5H;e(+^kBnK*3K9WqyGM>;(U_MB2&6zk90iU$P#t>U<3H%ys!^R`sW)Qsl zHN|%_^IsfcnUGmY$UYhGL^*Jq3nAJQrwO1ZA_>irEW<25+vf54$9qfs{Uwaxn|T53 z5C`H;7Wa*}#VH`;rUp&2abe!;c&PQ z!P%G4?2A)T{-})cBQ(oY<}SE6-5!?+&6lK3<6Ts^K!ItA-FzyRfFJA0rMH4}f##FU z@2MCsWpzHra~_OvBq-Zsf0z>vgn8gVqzArX9_G1BL@z?Kv}0>i6ZBw5)_q_L9MAN` zMRv-rWZK|b+BBR`nu=>P9Pxmi zGusVoqHWQ=fhjCZ%~03W0&boGC@f!ztfCc|Te24OSMES^;Sz*q6cd!o5u9F%kjx5% zW)YTWE9pAk=C>w<#HrB_W zS%=+^LbGs(Vh|Z8CIKM~K_LvxQq8AEX>29P>d-6_h&Bdhkua0aMmi`POJ^lxz7nZp z4dARMMlpisJv>*f9Y(;cL$d~3Opr@WasvHgHF(vang3B5G;4*b!85nBAasbWwlJ%8 z?INl(EX(TiFbxAV?+_7eHR-@CX$sVBsjeU$#?@5NtWa618UCsz6LxG+lHS(8f@Tez zncjQQES6>+nnldJ4$Q1@@$3Fj%DVd=BugG;scn-Uo`A100Dma`e=WYCaiz+!1X_b+ zUABJ=npNFMd9s1kNQ35@*%3bu&1wowl~+RZWoT+7mGh_Y`q_QFcytHP9^Ayy1K;u5*W%!hd(~h~}*W!cJv=&yzdE$IU0?zz2wW-*#IE2;m@{DzCU$ME zEX3XTU=RFqG*WC_k>cQnXn|&G^rwTAyTxZ6n&9Y)0$f^^hifZyaC6OE+}|)CcefPc z?#_JN+A$-s-vX?RK4d9x)Izids%yKSj>yE6@c*^`C8?a0Aj zx8~!od&+Qi^(>6**$j;c@2#!sB9!obdwwJ?WVyk%qa|$mHo(p0;#NW+l7T^T-2$AH z2DgOd`9Xx{AT5GMksp8i@ntL4W$`Z-Xcp_QCSwr(9|N<<;Lh<_J{v1w*8HWl>6o>U 
zdn$SZ>qacwN(O6l4LEDSu3MW0@*eScf#xRy&C1d&>+JXt@NVQg@kQ!L;M1U4CW^T} zF9es%D7KTE5~^^98(-Q}2+a=ICYEOB;Rqkl5??f|gAQzK)^)UyQ%zfSM0$P&BC<;0 zo012=6OCVCY0U44vYDFP;4`IV@RCpfNRV4kmFagU;+I z$Fu@?q|QR0A)mo!%rGSSOhu%}I0Si4fv?MC*o+&APVHo(m0F0I{25+vNX31LyIky} z++k!kzb7oar_xCgP!^XAH7JzzP#mD5sVLv}e1dU4LD`@xrtyNRVmo@tV_r#!#L_@^ zOgsrK_BfkjgEJ}i%F26AW?)mZun`tzI_q=-459fvq50x;f^eb(F2_&8mAHwx8bjzI zz+8=V;8OgWT?x&EX9DxJFe;P~Ev^|c2E`J>1r49?FI}*`$3Qf8o0>vk}yTN}NbH$~zX zV-V7{7o4mtvCo_NN(sWTc(F1QM4A61gkD+SKiwPY?CkpYZ-@0E_E_&T3hN^$BGX|Y z+JRPU-c#C$-F>3nd^t^ zsXR8;9%oqgl}Q9wm5vuqCGdE0(}-hwQS9tQupT2Ea5~%>Ck2`b%tyiq%wfJv-w($L z&H~M+V|{Th&I=crhf6YuGm+p-NWR3*j106CXg*7bme|ded_JF&e2V{ew7|c(n0Tu7 z4I5gcQ~UN< zSXRz==N3YiWhm{ zYDp$h5^$yz>yRr9{80_PWDsqdYK>KtrJ3BcQ$U%j0cOK-65+%6c|a5%oLfopj>cU&;|VlxSMbdLxlQ1C_f~;sl{sC8O!9&>7?YFAQn%*H$pl{C zKY4W|1_jcZbcn7o-wWuet zkG)(2$eUEp(^r?IZw-q|T(Sr1AYLY5CA3*HAMfkABU@<-j_Va#!hc zUqbV4=6}!Tt=P+J*(;W3S#IABe=jc#A+%*?ro+*}0d;EE!rFB!@$}h4{6d)h$6wwl zn3buq|M{0+aOrF%e)?`7Dvt{^AH=!S$MN{V9p(2dl876c)%g|f-?@aUr?>I!@h!Z5 zehHzf>tJ^ou9= z@ZBD9@7RWKx%__j4(!{u1?!hD#hS&7@$L2<_+ig3j2JRVbJ5_#vrFq%7}~oRTAG?D z-`{|VBXMfUOk6DuC-?>th6C`RA{4jfcX>DgIT(-ll9yP^GEne#c>uR-?ihCnh7x^2 zW|6x?P`k^0ltp@hKkhB^*T8v65FRfN!PDiDsLb)j7QYE7cN~RjowZPJoh<6YZ{!!q zbo4-)od*)8x*~FlGsX=XtOjEZ@6sF>*UiJD&Bb`IAqV$2<>0}(S$Mo58xJ>RP-%F) zJ{6DGCgIVlI6PSug{qa2c)2nhFIR-%&B_qGUK5Vz>!VS%DIRY&CF9LHg7ubJxVw1{ zTn2VPOV)2|OB48w=z;UO{x~vyGTeJuVPZE+Tv$YCUMSWlPXey_|rPv(c> z)w~G2DH5ONAOaELbDn^#K$Spjjl?}etj#(Y8^f}4<1lO&NPfig2;_xPGfy~pef+%F8q4PN8rl2Af(D6Y+C2aDH!hQM_oiqITui#>eN z7dZ?;aK9E9D$vZ1NaH#Zr@0o|bnb`bJiZ_^`SP4T6G6$@2qMUarRF1=+rs0s(MBe= z;J&@Sn1sN>Rj|)q2&b$vc;!^UzhF7S_~H!7nh*c9e1v2aAuu@?!Kt|jNux6J5t>zq zu$&@<&7FsU*+p>AE@B$oo8QJ!NF z<}(F>UQ^&^PtfbEL9_SBzW8-rGM?nRvQxyeEev8E1+Y#C$tB{~&dy+QD4rEjg(0Xa z2*UF`Up&wCMAckEvK*J|jH+B0{^x<067_@~E1Bu;Ze~1edv_wZbi@!_H+<fc|hCNm`UjNYTp@S8(U$U5A&8BfRl0V zI7Qez5l48AvBwbt<~NC+NF4VGdYLXAr@KJu$ zr>IJnW2IQEW7+A5w!^td8$z>n!Ob~AB*rIH2V>NeQ`dH^+AxA 
zFoBm@wwI=}?AaN*INec2Lpd8yNGCL(iIw!7)l%|tlB#6KNCtRHM36JgyZB6hM{vHg zXeOU=d{(Y0#D?S$bgEwu<|cK~z|0i=nm5Gw)(!9j>-^j-Z}lZZ29!#SZ;8Krw>XHv z9fYg7Vqs>dY8LmK&GsS7Y zH^QJ^JwU2C^m5Qe;lkt325?-uI#It45c)pC_ygULgmxbZw@^HLf8HTrOLh*Ke7=Bq7gLkVF zP_;H0Pq)s2&)|+|&j){7i&_X9)e}e4`NB(`hQLpnU~GGHREiaMQ4k*T8t<|U1e(Ro zLPCr^EeXS`;t0GZP`<2&W&*5OsRXJG7G@Q_LM+V&F#ZslbzpuEnyJSngkB}j>SOdN zmK8MfTuRE?Ea%e(A4<&TX9CS-A3`(p!Mq79YcZoWXqG4-&j@nQ1f02iQo#T7Jn*Q1 zuwE312LwWa<}33E&AjI8gy!2t!Gs709E_fdpZJ2`M<|)+^f~-{H^*SB+Gs~;wl>kG zO7GlvC{hZSBPb;ofe9G|-Yj@0%|<{{4#Lt45E4HFZQ5y;<~}1QA}DVKoMtY7BVV>2 zgj|pG64*y%VyaIpTtm~TRCq^Zz$ZKd{?Rkw8I}sifOw2{^22~Jwix*3G<@>;Gz^;H ziE#lba7iwJYgz%EQVR)c^WmAAhaN+QV$7%ki1VL>Xo6O_4`H3q?C&`hpZ4pj^5QbA zC*H13#M3;&FJbp-5rJ8tisdUI2OqPn9v26yj>Y3bU;gifXX3-0FTUUoc#`LY$9XPz zEG`-{b0q(NSm1^0*e|L6v%1Pq751~YlnJGyI`VQD8gnGz&kky zUhFt|rspAORxx}MGhxxB1!|d?U{Hq+$R7F?+}d`;sD|d);?43&^2e!ILUW8KPQ<$6 zShPJ3#XDnfq6emr9|+$;9kIgSUP1HrglR~e*cmn*%&;}c2B(s}3Ce`dBmrg*{&wee zvm6M`XQBwr5#rbEfNL`an%OCpFEND@;lduLB3U-#Q_RlB$pn!fPA2+t>4j7AJeELv zHpT(xqHJ)1ofiS;(-8#qFh?8u8)|eB3EwNtc(~!{2`Vwe%^aw_F$$G>RfD`5{!Swv4NSVJ1 z(K!ndoKb?%8S>=;$QA=%oH+klx(5!Qf%7-S6@BSAu;u`e$x624<14nMm@cS+uqD=+^8VO8*9$gwHfSUBuyMm0 zrA?GHg6-7DpjmvHH9wyZf!P4fgh(BnwY>d1Xy*T_LJ9OdUL<)}kw}=1_Su3; zW(X9&W^vQFe&wtJX3?*&pW(MR&$xVo=Z|lxNlMDmpFdt2ORiepK{HR=o#Pd-l?E@>)O>=xp;xP z&I3Pwi}JE!v~1Y~wQARbRecNe=-dU(8(68?oEn+d!Td0L++CK8oAUz+$pOmZd}~2C zZd8QfPI<8UoG%d<1e)&>B4sAI2lL$s%d+&OYQXs(!R+1wUqUm%d2s+95}Kba3&s74 z5FAKx#kWyT*c#>vr=A@YGX~@4A_G+`Gf~BDuU5~%o3%3)OuyMM18+BGQfYX-KACWxh^p1GT#vZeTLIB`4UXGMZy=IFuJWdP8Ja?D+rFo z;=k>qVjMqGfJ@MvPbe=A$14J56`}cgkw3wc5Xya&TZMqHa*GhZS1+~gUqQ1-hi0{( zEUST8tjsbYhL1LQti-<>n#F1<(=F@JEEZ=qr8D=HC>oC|JTz#Qb;*}NAHhJZX#xlm zEklRp=i*)<&`dy-c(sp||8x)@Nz{zOU|i+$>OApH55vuZAi{$k4#(Kw=LBbbLm(+} z841rGO)$WsHrf)J8=1&7+O^Pg(3eOmTn4}CIq)Ym`%ce*`}CRcN|;HIo`Zn+bhK{Y zN$uNv#5e@cT?)613b-a0!8KtH22Bk>qmG}VRkxvN(`^`^%k%zLOB_Jp~bgP8juhf0dVM13KZ=syM>13p+$!>|98PiDk#SQQ*wq55+IMm`j$G 
z3>bY_=z{wN4wO9s*%goSS@+bvTn}8&a>j);JDd?%PIDkI3p_jFY?>yqFrQC$P|zg6 ze3>1Q%Y>>c3GRepcU&XnT_-SIB?Mn2EMFGIOv4od^EE=`^=N`$q$93HITNO3UxGH* zFGS148xq!x@JSG>oGwu~ym6X3o8rSf5#o|Oahc1@vm;SCI~oTv!>}vS2kQub%l)Td zvG)Ybu^S2h5uc!Ellri%OIR|i4|DV8gr*j-vhD!$25n*1pgl|*wMSj+cBo_35_KB2 zMs2I+sNbp!J{fHXA9j@dG7I1#zRX;@rRO6!rv#pH8LY#WFfpx%Assto_K2}?Y26+p zt;`fOpGy{q6@Ok&oFp_KC#W7~M__M)2T~>uCNy`#GG7~P@|}q7@zW4Fsx@p0%^Q5D zuyf!?V3tmT7b=z9aVC*vAffjNoRx&;zm{Fl+wJ4(znBlapvqaVg-ez}!{A@~LP-vsju1nwj@=(c)|EgNr#) z`0xE|@ynh?c(rp8wr0hmy@jcY0%b{P4xBO?-u>Glb!Z!$&LsS1^7<7tv#k-3@0Wz| zIT4Oqe1@IPCa4pRMP~@iX9(5jW_uHwy?GvQT*(i`&XfT3v9?suT-V$jwyyq|QMeK@ zgy6V2ixE3_A*L6vL_*;TgwCv>79c#Ez$^m*xr~sAIJ1{h%Md+h8OFLqqVEVBgeK-- zPU&i7<}XABl}@GRmSS$nQUpaM!m4p|^*-*`yBA7zXkNRD?DJ=67Ng@OAYafA(pBbg6jN;02F_*0w6 zO{77y?ys$L>&Kv3TvO$mjG?*OqRpk`{a?CgXfP@#V8#fi|66F*Gpflr4UznoUC>a%)@rBB7{XE*Ne)3siJ)wCWVOg|R0rO_VCko}4LwyL%>e^L9`Vg3P zXf{Zd53fzG@gvfqS%X#sG;fr~sU4@6aSi-ZTbT!OxnSc{Vg>2otnc$)gR2(PG*s+| zG6rUm2F+qE)__@;e*GT;v!2I~&EzBHw~OgTg@&STtvaxoYKyN&k5(^H)4FvL9Tkpa zhkwSA{omup#nZTdR9%CGg#}u-Y=!2H znT3k2o6V@KhBMcOa2x9^Y zIye*hB$l%ZRj0zvNdyhTD@`WhqS9_0?#joW43z$(j7dJ z?C6OU&j5@b{Fxg0J*-<>T;IL|ch;BU&e|eeUzLX&%kprkd?wD9q~cUj0uIlK#P^v2 z*q!2uwUM@1954<=ZUk|g{z#qJ6G`K`BXLYO#EkBO*il^(JG>i0KktHwu>&x^Z%4GN zZ;4J#8X?(kxU#TZ$h5_*Dg7~q(0roM3(r^{G6SM?3?33X#bW%FOw z;YkMPNwkbdr4o)z3rVKUD9H6vaj*EZ6fkqC!kHPsS<=v^!`6UWAXp~FsE)WWKS1fF zECrgy4TS&e&@9KuK($9@0?h8}+`3g+%~WU@Q23O2u7+l&^-Q@P1Q4dhlFD@E2jg*3 z7;fbU<61!ou5tb5yl`Bb>yP8{b~qAePiS_*_sO1E=s5;1T^plMy;^9+j!1(#nx(n- zkgpM+w;0}u+3-zBgGWLNT&HKkJuwr0$=UFWOF`=n8Z`GAJ_ez=%ixk)!hHz3vDxVP zr6cOM`vk4Je@Za!hDI%VplRzqXx_FD`hPYSqbIsx;LtJX(t8NnbsL0sJqDvix4~#V z@GA_q^TV|0Ot`3^2*dIK0RQw!L_t)j8S~(hG9Nza^U!P9D2)7kAR@iSBf{$|Lh}Sd z^JK(?IAiL#FH~N}^=gBv6|s0a&xK{AMbN08lAIk5i8S(rknyC@pJ1&mALV)DK`zTY zkIH37bG8p|5Kv?g-K8Wa0u)_IUCZhF|@y1NZ-1~}q! 
z1Ytae=TG*#2iPJn0;UinZ zxw{3{c}*fL`{Lwu0&bEQPAd@v#p2Gg6xWPOiK9$tz9Q}!yoW@$=6d4<%l8bSv@)8Y z9qU7I_NM|MKFMbY?`H|HXCn!>gzF0tQw*A>p!rm|1CE9{<6xL8_J_Gq9ymZ~KE$++ z^7&U8%ep2sizQhmHW8PM3j}2WXR$b|bqPtwj?OuD4s>X)j3P9%uFppDSmynF41qqu z7k5h1@&7xr8SnNj!kc}|uqi7Vt;}k{jP)fx&B0T?M!wrPlujFj<+lA)yzjG_Of%g9 zSLgEiT^fSN3nOv2BpBz#!Yt5CU_Pzni8Hgjac&MfSk#rm2y99AM^BmD#8lcsJxq1- zMn=I3#LQlT*f|Rkle-XcMJtdvZ!IEnmLim(97T1&V0OY zSjHHdKW1sx+%p84HTMin3Yvw-4AME_W6->ZjPNg^nE=m-v=di0O6<_Yz#D<1cqnb+9rCdbxbAII7wfwRSRbFL(X_)^Gnq~dpK(j#-@!@0O z{GWV(bZO~`<@kMQ-p%r_cFFia)p?`#P`kNJB!ejTY}S$vu$9kEVJ`O5U!-|pO|++a4WSf>1%x2;=)eY>{erysvTSa={zOlrf_tRAeb zo1jz2jxaYfRo67QRYM%k3&owqvA9zah`S2{3C*E|=5Ru@W@*+w>H8~ad{X2}U~o~( z$3Lo0eX18HO zFx$fqnFQu^XCFdyFh&j<0#ia)=T^-SX*U(VUk-!&&;gj-uRF$e>x?hjx52Qs%`l)v zL-gXqyR&&6Wigicz!DimTsuTEm{5G9C4RFkWi~{yaLc(h2&j$bjar~%+jfYz{}Pos zp179djCszVV{|KX9FrKmgyu&@yf#9!8XzPPD{c^kM}g+&MS%onf|Gt=*s&Hgw z9(YjhUaf~>HJ-w0)HCBp5?5&|=&V#q3JCTMd%fo8Ez z%e3DbG<&K%DQH%3$Ni);pvRR~6QoF-U`J2%BjkdgQ+%+(Zwj2cHA6SE+Nx|NzP7ZlPezPamM6Ek40y#S!appNoLlbOcf{$+M6- zrxc4cn$$9l*+Fgb?_KOj%ov+{!OvgV*)A@eQ=5$JAvj(iFl!`%miKqy&5#1 zU|C*BcVdT_ol4$M*JsOwB=!VUPvwJrI@TX&Vgm@;{y4|qV&Od($?}e{S8f>>!>8ap zbuL`oGwgAa`8XQxf$0NI zm>nH4j5IYjL-^FMu`J99C$odF-D?DP`Haw@d4>aS7RuMKU_4zIjzz}?19sS<}*3G?i@c{DvH8Vw!3!bwP9Y*3?}C03Ys$u3C#rLm^lj&leY*_dCQP6 zZw+D!RuPg3$O6g&&{Wu*WeAKEJ7^OvpiW9`IS-?^ZUm6n_L$fh3*Ff`o#y~^lG?6i* zX`CP|p}S{@_>SGQjoXA-{xviUlSp2~y+Q-w|BZwlHE7-{O9OuybEz>X8%zFE9LZ}n zRy8ymn8{exEXKV537WM*U~*Ds%y4pbL9#B8N-1AsV=1dLGh|Tpx)B^}k_yo*!CIwI zhCA^~;{+9?Ij0?@|L%t~HwR9OEOnnlL{Yn7|YL#{_E zf!86vLi+i&>(KKmKG#~qRKKgQQ7dZ=N|^@(oL0*K&H6dD%9382G(4>lRyRKB1?l_z z2Z?o9_np?ES-FTXoj*5fJzu=hs6AWPK_(H|Bj@M!Ng3}VI8PimPJ8KpG5~`HeX7<) z9ox54A>9uC_$`lFkNtbM;oPzPc=Pxs-c>zRzRY)SUQ(83u{fVObr{bG(gMxDyn3pD zS@itT9TkIl^STxIhH!iK}o@9&=D*Ef&xs_L#9JoxVQV+GOz%@T_4 zr*C(w>k&)yzU{0Jg0fheH?3U3wjkHF5kGym7Zs(&Xx_95O!)F^(4Y}|bnAgeR`u13 zv8`z>tmcdO_6o5!2PtsAwJ-!X7wMK}aY-OF6S^K1dl9$@5QPK@foFl{V#3*cBXAaA 
z7FCGlIRJMG`Qk~l#UX-&Seke6VQ8HY3Z0l1axj^+Mi@pTJx9G>lnDnj$)BEDDzh*<{WDj|_8G++)^ z!2C=?jS+e!Jl(?*nQr@iU=}_4pFp!n&OzzGYz)ofilIbUuHv$)G!V~~rCB~JdlQ<) zLd`VPAVLkA#p*3_XT>u5tRMi-^0~|>Fc(mTehQka1imF!a7mB`%}iJHbUv@WI2gBC zF4yw|DXtTm&t-bxXq+7m$2s5tU;O(CgBv59;n1Tc+KZK$Wo~Y;G!Oo2G9qV}!Z{`d z?y-q*Nth0Y1g^)Y!aE@yezD1D+flPLcOUj8d}o!zKAE7EJQq%JGvS&r8+Jjl7;W#4 zFQz)-^9i;XGFGB!Ou(R#W>_ z4j?oKq?I6QW(7VOJQ72CcSnTVI0UbH9(Y;cg_n6gcsbV}ujT~dZB96T&5gs`>?k}*3BbJsAKZ!~6eqgkLXuc` zU2!GVQ|V%|JLQQ>DFkI1;Fm;LX4zdx_Miwu1nkNL!VD!~C%(uR35b`eE3wYFlHh@B z1f>Uw5xA2Yi|d&QIFS&JO|A}@HQ*DZbnAxHKD{t=_-Du(HykNcn#}~nO&*Ug+O|>m zP+jqJH8DkfODoi=%Z@YegWBwD)~Qp*phrlH^ z1O77$;GU5S_pAcAaXmP%0`75XFtKO|s|F1)c0hj=PM!w)Hf``#!+O~3=gdxpA1Vo& zDn4Q7*`K_!4fPv_ql880?L4MTN&uq}%pJ;HRyy+HE{eDQK2*^&nun zp)$^y5bT6AkyHeskYId5rr1t%RRO=Enu6 z#cw&@S@|%ZXD9SRf&(s6mlGXTD7y26W|_tC6e0Z-@3GT_W`TWy=5qw_<8i$2SLWjX z9ovGpKdr#K1M9IYElfQ##nRlOzBwWZ%^L~VXLCYPnd(Y_pMb-O({M4z4LA8*c)TDC z&j`)W7DwV*o+r-EB*YV%B{s9NG-rFGa+VK|_rrz4D9q)3(}ZnCVkXzMw1ka|FER_3 zBWCteMCVozm^EmQEnJDXqSahqK{zHT3oy@>=Go*C`M9NF|rEEFww>dO`5hs>(;H*WL*-cSy)5yiCAJOjirQTwDDlZz^sMm zm8H&TJdVtwOJf)vn8{!UR#4DPUaAIXuFE!gx4!Rwxr&ihV5S{KTSF$YL!v5zb*-3 zVhqTd#0vbuHYqG-RJX)xc`(k8mIp0eEgw2Ci$C=H0Ix%{EGZ*kHil+p!Z#F_9&SW0 zsP}=n23$q&l?7FEz1q2r%SYQsYnav0Y;=u!nb*Lnkqi~3pI85_fvvtY0B*Haddt6& z0JHcQu`%n=Yyf9TCuDNx)GH}58=nXI|$A@aN*bi{Py}O-o1K4$bE<_7b|h}&<{9vvK+qOy&ZeDZ^rP?2CKZ+Gp&ct9Xg|H zhmJ6)3qo*5COat+Usz8(BrrcJ^2F18XTp~T%>on} zG!qs}y$#l8s=^ERD+tn)rQ~55t!}nhZzLs z8ScSI_m9S>eFiINmg!Q(YT1SoD{?cDtWz?8uOoq9)TwSQbY)t-EKJd-z8U&8sE2`# zEbv*=2Kc;H6AW+H0>j!h#juu*Fx=V_Lz`Nlf8%=SU}=gbmW|N1WlO|4kH)Dyf85D; z#|FLx$F{Uk(EM^iC?1QCufV7RS^_Aq=~)p0vyi}A#O=id=6M9`dE%}@NET=|wls^Y zgb_50dxUb4F#OiH>FYAXpTv9qv!z+HE(_#7n9Fyn3J2M|&ctxOI>#nHZvLhO{>4F9=JE38#&S=uE z8(MViiMHMPp=tZ>uxQ!_wd*&9rB!n@ZqOR_8@5Be#+_hd-2tDCbwGGVF@iEm5j}e$ z27EpmeS36-zth+7a~=Udx33WB_BCPyZ4ndVf#!`HqJIl(++7%jr>u`>5;jg*=h;Cj zU`IuiAB0!`f8PE&yo#i2`-ahwKp?ogySrWM9?uHE*97C+ zscyKFMrFC5ImUB>^T|5 
z$>K-ssv=Jm#7YDTXB5X$(S*?`4+YKnkxqOjF8s#v{UbErn&qJe2bM(n5U%|R$%JJB zZyDFi2*Xz+`Mmi1^(cE>kG4_ZED(Jqk{~Q`nhE2VdF~}1e_n~eOkH3amkiKc81JF{ zvt>%{E4r%&;khi4FfI`__+H%*n2vKnv4UpiozPqyrCG?YM6d%8BauPanf_`e{-baY zzCZH?e!RE~>yks1(LgNCE$UbxdiogbPGUzbCsMt;#aT>irVH*a^TW4%_n#4%#nSxd z;~0Fsl;u5_b#|Voip_jw4#9a2%VDlRp_yf{EE=<&#-f2hGwVgo+I3*>>QC7HjGg06 zh+e!AQ8^nFG{*`^FZ}|sOTIuPAz9opG;Koo{0)TX^$4H47E?T;F?^aQW-j?yL30w{ z)dVU&YZ>Cw7h%rgwU{{70k!MYN4FowHOd3blg5NQDwJVQ)t$}StR_% zc2~HHZKkPZR~SZ(do%EZNC7f=$#fu~#u__}%%MP8a|cy2K&sARim57qu~Cwriu?3@ zu{{^=qjrmgDJy}psf^4VE}{MR0GSWX6k^LC5@kHscwJp82D0IIl^-2^sS3yOcU3rsV2-_}7?$&xmT8%&a?y{Gb9HDosyyuq@N5ttmTu)P2WI`DDe2%W z$II_~{-x9;B|Nvh0)G&cnJ-;?n6EuXXg;uK3(w!eiX=u2}4zfh(5+q>7w zJ>wt0{iJ3H6!#8^oFV14mmd%H;n>DBHxtD7?!i%($G#n#dE7BX1P7=_(bBvQS~YEf z0X=)8Zmn9%8td4vBd%=Bz@0S_YOvqqb)o#2hTxGvlMEOm7z-djTQ1WAyHR3g7GNd} zt|Xul%4D#hnlX?7_jn!G*9RzQmXLL&vm9}jfOUqA)**iI-MX|?v&MBaua3ac!?Dmi z05jbJFxxL0nE}!0*QFO~RB4$!QWZMkG635|K?J*V` zeJA770Bd|1I33$U?XWAt5&NQ?aVXXuC)r@0P4~xH0?oO3fymAFM=rtS6k+8^wjXA> zPed#88fe+D4ia6*qA)t8bELs`1QdHg46S5 z+?D_bLHNGk1cJFwv$zYaYJSZ6AVBF%$iO>OKvpvB)3;v=pi!>~=SFDOfcce@2Fowk z%2I<|f#zpSUtBU|#yS<#Sy{ZrSCJhW*)Qo5@Lw+p#oL?^yjd*k6d~|6q4+hQvr3!k zYB7k7da^1AcXNDkXGs9=FC#QBk3tba@M3}kF0dnXKG_|)DLy!o9s<_^T@`ypmc`c0~EE6ymvm>=21drzi;=ydfa<-3(XMBwyQX;eYWjPRv zopC+Q6*mdV0?gN_vP=T=Okb2{5lm(U;^w?C+#+z_niGzjSrN!f3B}GpSFCfJhM5zG zAa-~+M33x_V9I$=R}6309Nk;gg_WfSw>4`~76j%djnSlWb2M+#8m*hRL)$i;(V9MxLLirn)TY;B{UcFdz8=prKu9Hm>pJjB5xBc z3nCo}!MtvSH|2|~;Xaf%u5n*kB%igoW-uSu<7`kyl}5WN2rr@v#7{a}pxF->3CkCw z3CmPobP)3rNO;#KC=quPi3(D}`zjUj90NdM9tfTh(#L^wd5m2FZmQPgl4fe$1eFyfpX-6j}$CN&6kNzRwI1& zD)^@?#5l)5Si6Q|;p)vu&R#)KUV_-nrHD$+K`cAZ*$Y==g0(%&%q`Wp94kvptXig7 zns;pZf=onc-nP{Mkitqksfxg?NnqJ1fmF@6S2%}UU;s-~KGFZVE*BX>1Llg*Y?KjF z6{s4xs3I7@FUg0tPVUrQM!3G4EV7HzSyPx#?oDPdl!;v`1G7jN@I&A%_oc!Clfjg$ ziF5!|CqTGs2cfwf;cG0hoo+{Qidu*v)qiqWkP|o0iJoBiVwJ5 z{w5GfI!0;0sSg~~rS*R*^XjVea;_!1O$56@{Qjl|wVHuid*;2%aM{ zREEmZ%c-JFVOT4(ir_5i%egWjP$Umrq-|>zQN;5N>*8!~YVH0IG#}c--VddLv+UdZ 
zFQvh=DRBPb_ibT$ZBy3Sb*q=7Rm*0O`R*rAo`Mb?I-x4Txt5uQ zYUDe2>WmGm*Wz;Sd6xTj?D+CiTsnVTS%trU^B9jG+{UTn2XOS@EBdDsDXfw?%&4mGC9g`-{F_s`2Ie8cx)~rO+2K7;!_fo%(CHi*jj<(I4 zD{F)Vs^32=3=cLX;UOE1Cj{olt3n9GLE3<^Wiqpw2c8j@UvMeVtgOrg=JG*+gi48J zEYoj4T5r$>LibuibG8G{CJHpW;&_4|{QGuP(A>$Q8vI8N#v(7VG6!OoPdGCC!qKmD z7nQGJU0PtrtSB5L_+B7f6s}A_*_ssGT9bxbt5b1jZ5r-yd3#kd?yN|{y_Jdhc2yF- zSrLzKS0oTvQ}AMaB3`YJ;xZI()`s8*g67lJfq1z!4W|}Ip;rTQG`FmQDEpBpTq^D) zo;Z-=j47=xa5mc>-+#<=kcmhHQpJZ@pG-uF z@GF2UR!f0nu}bSI2WSH{zgp>EJ{`9J^DACc17_w~U|GPH_famwu8QYOK$d?51ei59 z5hFCeSjjY&1rz?l@OBZ)A%|Kbld@0(&8u{1)?z;El0i`tHRRS}4??p)?ym?VIEUkM ziX$#0*yCc73(hAIVv@aaEIky~-P) z0W%kFq&8p{VK{s7Cd^u_#db|z`Z3~{Y(m1aj}eoz0bvU^B8ZK8%+k*>*xCV2S~S3n z$%9}&WiZ_AM#9f=1fsnrVrINQdUS4&ZdP^h_3AjhT^WTpi$d`8qA2`4FB;#?3dc)= z$}@Hn9;dOsqzB^utYF-n>4zc$@)e2iyC4v^a-wl_VXV@fWy!d`JQdfMrQzE03>4)g zvm+OU9g!Yb?J^OG;|IWfU{_4;(i$V$wZNzjtuVBCeN5}poSnrE2pHBAz9agmf8Pg|^z@Q<0F=+T83>`TPqeqRzq%osmJz*RsSxw@ znT~N&CZT2fwy44mX;rfts4ijSx^=^VAww`~^fkmT z+wkM%1DMOa%J);kigsW<4xcaxyZG(B$@jjDzV;4uNuct#=3oTc`Rl+j6#FzRZxrdr)KTCuy^t&G!u;Z zeueQ}ixg;HEbbSZl(ku)IcmWsMDrbtJG1(&wGkl`p8=a+qIVyQR zq6yNG$%_!lW8$-xV&RI9VdvrttA@?kS*(c$_3B~u@}-K=1e*7VwONcsiiHFucPN(H!j*d#-0h8clXwd9QvmW}?#glOKBdQb%BJqnt4Q?fKlA4FGqzbQDI zLbC#A12Bsu%$-yY)pJsngn3m3#r#Ww^7@*kSRMq=HK_!qdcS-`vVV_Z$@8TQOo3Th zZoT~AG6Z3aPiD}!tp+Q_VH?5E4_rDe|SdFeTMu?XK~`lUS&c4w_= zv6nB0fb-!4yYcnyYk2(4J$!rjhMMi~@4vpqll#|EeCZ^LE}v3i>i*z9anJZ0)A{b@ zV>N(KEX@MW0?qQm3NTAVjWef?;RJu*w)GPnI|Vo~bct{r-J>#VHL z;sP<-X$+w`m5>!iC?ph6+H7&+`|638VqF$3_aG#CDoA`{3d{to2NH5lpm{?89#c;w z#_>9_G_#RQbikPy7o1HELe%h{XvJ%GtW^d6!vlAmoMNo0ZXc zx*-YAH^k$`>QFpc?u%Ch+@A=*uf%%(aWu{^i$=djmS|eLCSqMDB7a2$zFF>xqgfuX zZfk*4GiTt(PYBI~Sh2c_6;?&GAke-Pe`cBVLj1f}ctekXp+oaimWfP)@`TVUv+qg7 zg%3fqgm#mC(m4@m7GTz)Spl;FngwVzVAi1dIrU6gnFAFtKQDL1(DI-|vuq3W%j6qx za|D{h@LIr`W%OoQsIoN60KgAHv;2GT4b!^0*d2G4`r+G^Val)hB0=GNf*mdpn$IP; zaLF=D3&FUKZB%`-Q2wt1%~dgx&>WJ!0N$}_@Q+J`XM8+d5>wS*@?PSek&uo~UAn4z z*}mr>gwNSPU|vfQTZx41HAtGh5vlVxQ6C{=;U{W6dG1Ce&EJTGc^i?){Rx!BhfQ9@ 
z-*aTbgpG(5%NG@~fDJf*i&*d}Lg#&kq!rsR-p(8K>(|AE;k_}9VCFb=D0~QAksf1^ z%}(9mzCF;%(j3*hG!g3g1Ww4`^8i=Qh|Je!kt^O&-a*-VYAFJ7XfDctB$-bhE6*cc2upobxDcFO<}ZQA zVs?%eu0rIRPcS1mh9GZ&wk_MhVfYB7jva+@jV+Ndc{uVCLr_BSl_{$Ws0&H%$c+_CupK%s(W zu{6s-Kmk~Hlq5Ui@lw7wIRwOHCtQgjG)H)#EZiN}!aOK1Tq8gWG+z^Fj&a0Q!t&J= zTa+f-5qw<YCn4vR|ggE{^lX6)r`k@xHhk z&(4=@GaZ@Gq%hV^L9<*#tmDOm?Scg6H75c8nYRc3EIokN7q{d4{6k3bbu5SGE)DA; za^hfo88nUWZkU=9`^J1Pekb{^f1ZH1n^W-pmnr!1%Ot#6AIa~OxM+p)8yAKr>{LHk z96*2$z?}vDJl7w$SH>f4#xOO$M4-8rK(nJj^A{?dTqvPAf}PgrCG4zH3Cll6Ji$4V zo!`)zD_LKbs>z803CGjDBQbJ@C;E@I#pvnoa0`q>c)}b+Coe>N=5i$P_^9;72#K4C z!K1BFuTgVUV~4z1<3`xHX0>u1m5_6~dxmCp6<(`oY1V<+$S33%6E0CwY$DvFv5D^f zX$r#{C~Lt3{>suU(5!>Af@T98|Cx;Wkae=00h)J{wL}^?llKVDiaX0Wkmqx|vUF&! z2+qP^WGVsXodV1nG;1{!odfp%h=Q51KKW7Tp5ft zSpHYgT-nmBL$kgvg{4;ty;mh5T@KZfh5=}0B9i@v3h=Kb$LX#QCi__yy7j$cR13KV z?_Ijq5AE4Y?Nea;bDPj)TjX)+6HWFw5daD)L<{0;9RqMEdoMK z#6-m6()n}P^2J7Vo`i9`j~_XR^XE>f$Qhe9t-@}049=ZBj+B&m=9SRQ509GYq<0V0t8JnF#&hZ2 z8dnINk5}-6Ng#Z-Je69s{A{_wO@qMvbR`=Yf}IwQPMZ->!1>Wyg5vr> zLUSOYIamR8$xM6XCONUObH(MEp-7!D5N()`4rW#0Go&9@`uSmjw>K691|!4A9|JnJ zQ*I;Uy0yZQ*>Si$HwJ}s!%(y!64w^S6Pn{uwlof<1m@BuF(_LWhil6T(<@>Oip2d5 ziFo!!2A+PBh9?`6@pyeKAu|SV*2UuGM{#)aMLO~~&P4xKO<`HH3ZmR5DmLu6`Y)-BD?6)=m$((J?h5uC*hgpjL4^Aj})ka=Ap{>&OQ zzcE1bYeMsDLbDb*Lj&gbq1hMrna0hvU3tV-1ym|*9Ikd%4wO~`;xd?LIOlT-u^_qbGe#ib)ypt);z4Vt_5 zMf9vS1hWl@&0L2R!gI!~jfl_QfOs~@@$;zppQuR`;wXvn8cP_Cp8v60s%;*ZJfFYM z`2=zEWcy>DD>IONiLhCpA#LebuyGDWT{8;|>Cq9BM)t*wNrT`vV+aCWMj<;X024+G zLKi}F;hYFOTo{WdvtsZhD;l>mLvdA2=^TI~1gI}O?6J(=8mW^;!=-0ujAUKv+oS%T!5E?%pY4cVgX;u!>=PpI|qE%{Y^q7Tf;E_BBu9W}0 zmGGUr0-m#%5s;U{CwmF}s6cK9XD@?q>H-ALTnhi$%i+IhEqoWRgBQ=q`fL}*IQsDU z*ML=RD~#>l3$dd|VR(HD#7!KEyrdukvrI7JiF|g>&L_L#T)YF$MmyqQqzmTyjEDQ! 
zPKch|8(U&+ae&bLY07lOj%lx;`Es-?&KsclGC^CQxq#4IM3@%KaB;LRK~x6hIpWbG z*73RCxEkk(k_aaS%w=J^2+n-wWs&Z<8s&o07<-iR9k`NgOW?LcQM^F28}bEi3A*_r zgEd(I_X>5D+ojPSC=s^`0<(m(Q!pLpr&JP4a29dBl+dgon&(NUL5JodrYrkp3hcvEl)QLZ!8dnnJjJ3D8wlZIfU=LFna8jc&Y-Enh)Kr_Eb zpT^!b0K@BGshVfcof<=YE`ac5NmlBZ~s zPt~qtL2#}M8wX!xz2l z)6r+tGz=PNgMOo?V(fG`_=YDVF+B$nadY4ml!B>_K^QdJ79D#ILxW}=(V$5y<)`1S zbt`Q7?9=keO?GcHxF#wt6JFCeNW`p{56nN?(nJjD9 zM)m47P@{TH%*;qS?_eZ>a@dOu6AIE_m zn^|GEoyO+RHe$!N&v53%K?40pXw$rj`k-2x*Fdjs zUD!}}Q1>#RNnIRH@gq1T;Q3NQ*~)OjoIly+blbYkt_oSW#FJlg?yvlER!+>COcbOMEMIpGk(1*}G9p@9prpS~7}X&aH2 zxe+sGu0hJIbx54E2?=bJ!5RsrWiGl;5@n)?57TchtmXP#0SG}{BU%tQyYuD zoUq(sIu?u?hM5EVA)rSm*tKtgu}!Sdr?wf|vUAX!Ff4yn*Q-$#^=j2b{o1u)MVP2Z z=&o1K91X1Mpb5dH1>d{YEt{exmyH@)p)TQB{4(#2oZ$MA9YAqrE*vYA81&aoq z(R=6&jI|4awMQI$V&@}k`IpG~XeSn}-i(#&x8ak|_h9|@y@*}53Qh^B2$3nwm#!l) ztwIn%IB?E#1kGNGFqz(*iXtdS&s~Y=1#5T?pT&yLmEUX1>W>kcJ_n6kYd-C@s#Hb4 zw(SuxW<0vpGDF<>5xATbh~gw46efG(a*8|7r#h-Z#i!W;KSTgt>@^l%BRU~s>HvJ1 z;(#M_-LXB+7KuaJ;VYlXxI~z}80A1{wn2Ur!8uke&2B0ZM`4UF<*g<$DNT@=$*w4g zbK-H1YQWvq2oGEh_dqG(`Km60dTEpsN@ESs%noE}q8%ZaATB<~gkteEE+C8-MH76Z z{Rzu_hX}bhqup>rB6UcdWdgKZw@CRl6P^t!F<6@=7PRQ90Jpeh#1WX|3ET06ZNh$8 zln<^a1hYe%g1?s?#xGY7u&(UDtAhRToH0?Ar&yb1Kx^`};aKT3M!5`$%f;=59=I#t zp-oZveP;%K{xSit*M<_5qww3uark9pG=AO?g`YM=6PzOzOn zmWO`-!i5?*^Pb3s>Kw!UYWKyZNj?=cE1)t!vkp}{Oo@+&)*H}I{3~cS1!n@6{!l7N z|6P-urvb7CiXYPZX~;fJI$Y~{*7xb|7^c9iOV5W6%|>u$Is%AtZC{+YG;pr{fsi`2FOm@7lGNKadv{ZX!^mxauTxR1x| zWo2YOSRNv6V4#$ZNL+TL9vGzt1*+v1eDTS8%$=16Pd7*O?$r$z=4L8flZ8cXE~}wO zm#*0S=_ci8)wX4GESfim(ENY#kKcd6@9%!Z4{s&p+-tmk_6Vns9c1rr8;=B2Z-lfN%zO4yk}_{et#y2?O5UemEpH*5@c!x}$FEcbIqhPyNRwr>k_Lf6P1 z9kF+AIu52s;VU*IpR)n^gpJRtAQvq3wndiHB*aY{iIDMw;5)J}Y`$y@MGJu`&Ga^LS-NerbSIvHS|KJ`wi_W#z31%cjx>4syS^X9zS?62(FX=GO|E zDFZYsV5VLvXeL;%(gp(R(ENH;kn%s)Vk%2sBu{c3xwcriWgw;&@=k|l>G+8MbvZN( zFfZY`%f(7gU{+Ra4VF!zS@ewC->xK7%(lTjLi3|_A-J}{i_k)7PI91Jk(=O7cqTMw zM8c&{5A~O@dCh8SKv~u5=5TcNLqhUggvDgC!OVnTVitm9GY}Y?Mr9x*HWOWX4^fT0 
zMA1l}m4l>=HAtKJ3DUDRB0X~jQnOYfb@p0n9km`Qb2cDh_6EdrJwdInLwxocB+7E` zdgTY5FmElAvsWP{do_~e7y@=2fjVpH7YGPXf_bglXx+FTtVa`4Ck}@7gduQsvPNu# zKOClyN26NR(Y3xMI`dw7vM%;vUFpR2)@H1?{H;0PnWpvYqG^)`XxyYB>NjeDdiAYP z&#E5k*0bb#eKcs)7)_hEK#LZw(5z{5Wm%SJJmxjcU|z!v7B$ULhxxFwtc&Ii8>4HR zP8i;EAnZqu!tu{n;oa@ycy{Siyv*N>f*q@nsEjj_fAdk;7TM8e)P5Thne$B0oAFn-)bj2k}zeFyeO z$F5z`d%$37D0=iAjBY&#pnH%0=-#z2dUoxNq5Xzn!tk*eJ9HEV^&N`t-3Oq3r`~AW zt}D8C>W&umtk8r2-?BzE<+5enqbEE@j7A&oiyl2xS@jE(yit(sf&5g~HFjik*;zY7 zh&)1YUE($xZiCw+!e$u0%y7n`Iqvu}Y#JiFH^nZWsmSAT7b2W+Il`VGYmfX`M*_14 zied@1GOe{(Z3RU6eJT`Sj&{T)LURG3xj4cdC4}Y@!gDFtl?0{p-eX8Jb?<=5i_aU&-Qf4_DJzmy%s_k`x>7k0vag64)N&^)A717z8a!NTc7 zQ8GVL&E|P4#|QUT1mgQo5CDTo`dkT1@I0}glj+;28|jIi~3E_sy*vLyKb;*+y>@#nh=nivcuUNmX?hm|LBUF zS8a)TQ%)&IcM-m9LA+fmUM^Q%M*(4W4z0 zZ!p2xuw5CLbt9$57G#i0(yqh&nn*!2Az2J>75*vL#oyIT)4b1eVCJ#!L9VFE&8aV$eXqNqzp;=^vu{O=#wC(u-4|Yzob+fI2J-C?DXqicBE`_O^Y-{abPL zAbXYv_h8$W&)FEop357$aAD{Jz%_im`z&Zmy<$A!~Jap}x4Ts(Um5AWYn z&@9t(i*@5@%>*gJ(yOJOc(N=A50{4$nDuFc zC3f@s(EOP2_*6lcA6~5YCT#g3FURRF$!j_gj6V(~G<(}% zsh1rxob52QO9xmGs=KsqjzH^)uHL%SxwZhRRi_SYY+;mqe(prG_W#9-8xlaX;BsRYS%<#IgiKJk}zAfG~e_n zXIqqhGza%LM&t6*KzMXE!`>Kcy!|Xz^I;~q3N*hWpo*Wd_!B?jb_Hnu05~g4b2&H@ zSS7Br0h|Sz6+9cDS%6uCW`ZuuQj4O&eN0D(=2r@unT`N+IW&tqh*+Ct@S#NO5NPJM z_--p`W|}J8T&M=kvQ21K>G2*^8U|=q;H)g^+?LrPmwDoH##G#0?v2MA!cn@w1Lu-v z;C!+pa+94^$hq@L9ypU80ry_rwHVn19`mX-QLBa}ob0`k5Hk~Dp{ekXNJU^&20|kU z%hBlwkDrN@%th$Y#{kV8IwEWCQe+Z-(+JF&vo>PR{0*2lcO4cGgy&Op=C8->`5Q5l z>skCw1_I6!=Vpg@hRpn+laY2YcXfx8nu7clFyK`U;`Gc{tVHHDXbH$ zKXq$jMDI@Ua(-BZJSu3BRg2#n_8h;%RG;K(bZrlV78nW@XYJ@s<>Z6uf9aLv0qDHlv zFso*UTGeY2oNJ+unFZ>aTk^c-=-j>=2K4NY$)iRg(0LlNBi%8V0GAmw9bxukF^zc| z-CLqzbU~k<-3g#w(YZ@Uv~AxSEn2ofv*yjvq)B5mYS0Kx>Nh}BE2?e-w6JQ3<^*#+ zo_XUMHPEsq^Urm8@8z#{k*w?S&Fk^|(3tO$bPhW5oomPUs0lj{O{-Ty3#uJE5#8B| z=xtGhs)?TFHPOeiHU?VN#YpC1a;sLb?br@py}KiP*g!;&91NenJz(FtBWyc$gd00p zA;SkD+iohBxY%Qs^(17DAA?2CPO$IW7fqRmu#p3CIXMtT$v!Ad@lrGU<)t{{Vj}BY zoF|SGfLD5qgY$rPh_o9)Xm-KLoB+(X?Te5uO|U<}7MBUC7b7Hc27x$+Ff6{z1XXc| 
zC{`1fa7j2VWQV?(oxoz&=Yj+mp67ysaCY>=39E!=0cV+1wVzxhuV%lCrc0Cue9AF7wj zg1H{VeIaUBIa#G)cn=(Ntq2#iOZcr zU?wGHLyJ}Uh zJ|<6?i1~Bp;P~M~ICt`xYTJcf#63fJO#^0~mkfNTSipn_Dy(ZVJ*P4>3)mW&$Eb?n zYyf7eoE;6S2+dl06(!fvVA)g^Ai0us*J4v?Q8qMSHbS#VptRpBo}=f%c;5RmJyt(XmKq%Yxuw~7?f-9} z*;LxKP0QoM_od(e0jY46pjp=;rmuxX`2aK#v`3%@AlVkB~vhz`x4i zEVBrTo+=T~V;nGLvNNowxL}&K3tBeUCO2u^qzNLU5;1G$3S_3QLT1J)B&RGua`Y@Dg=Hcx zBpr!S*_fHO6mw>+;&CgHmYRe3gxQE7j0XlKz|$)N&QAWYndX9tW2R!nkZ~B$e++u} z8I5i|hoft^{^;4IDBzVbXXJ~$ZZgCkMCI8LxT8RLzUguK%c zt~kx*NrL3@XfK>j@WZ9_5O%18QIr{t!pt}nW+h=)SRf{LY^!GW3>`H9`6)pPnoCj% zk*S2z6ap|iX_xpdIUVbQ6|Q68I-mo>W(>p5Y+r1RXUBU|cSQ7VgMt{GyXz8tOj9~Z`IFnyKMpxG0*l$BVv-Eob-3pigffU|<;1hF^^ zL~9agzN$dqRZXXTmFHX$i?!r|y16(J|9R&qe!sRK-=F&u_x7yA@SfTXo3&WqCbVnB zPQ*AYoH_)#bAoYuO*GH($HP_O_~GLO{7AhZKtCWTKj3%bTYek9;qMRRm?Z)H?#uM> zfp|k`e!4hV4Hgfb&`t?EX@MU5*|e)4($3h zP4iGuS~^?;~&4_?|=Ihe-N1e{`=2L5?b#3>0=6* zWmdqGgyt&+7jXMpF}}Wa6?x}RDNFO&6Niv{@-VKKT*kW}UMW`$fn^2Hgyo;Uf1$P| z?sHjjp7M*9nFN1&`+{j*QU1-6hU8nI`NYwE$R&vT`)HPM@jva+r9E71tx@01Oidb+ zFtRrRDGpCpgyN|{Ga>IO^?YeCo>7li5S|HrkEn-)xrYSjhlDc;O{T2PYkUZhn$=T4 z^Y(l{DT)K6{7r`!A=RF0BoW~($>Nt$=+67Gr zwhimmLt`$R*0V$%b2HSntc!*g4N=d$4lFIrQI}w1S-Tc2Eo!NWQ0mmGhPnj*dbO$& zuB$PP8dOac;J;;^nrL3923qpGwmh#Tk8NDr3{|RW@q`on>~V9;eB9X(hRbX`JUZ9J zw$L$n{Yey_uk<2d`r;+^dXUB}8R()R291Yk=cW6)ty&79$c*{l@(V{Vi z5jICOYl5*YTVQPS7MRq!EoOA+2;0t`;mG>v*{?Ue2la(JJDK)9I%7uHj+olH1EzFt z51VeCVB575?7MV=L#Ga~ZQmAC+PB1{4lVe5YfS6f1~a;~;qk37tz&c8b!`d9?rq^p zxbf=S8G!@3Bcy*%L=5kTkP*ERF{Ur#Ck;j#VRqJxG01V8f)yS%SnfRo>-=o7i7@yX z-Vu0Z?9it9Adet=KEWLq38;Cg($V!Nu=?Y2qA&83MBJa~NeQqLC>6N6;&MFC zBh=+m7h+v;kx+MmaC$M?372E2Xb0p)+9EG}IxdAx$3-qLPzBL;D2a2#?JQqhP4Pxq zW(cm$iXnV^V|1Hl$_gGfb|CW80{QLmL@A-UG}QyeDK6Y*o(aAeV%)IGc?{eKbw>D% zk=T{#i`8BuFwbrS`cn=h%iN%?K9^;5h5%$Q7a7H1aSf<^Uh#Ms_ekzTUm;faiz$fA3B)Wybd44pH zH^hX#%5u5R{Wrz0Im!z+2+TK__v`yNqcSvx9Io zj_b(}D&fjzi*?zDGN0vG;1yy22eItIW!ymyDLERb_LB9*h?gQWWv1_p;`IF zN&y;fkTQj4%^$_|=8Vv+S&L1@Le`;KmL>&nly=^S{t_^i9Q$WrSy}piQ%E+I(Vs`l 
zr(SWC1RQyP`h)y4Xx1uU#mZ1oS|v2Cm_`}P=R^4uN&ILvQ=t;`z%o$QW~OZv^TDN9 znss0<2WPpq0iHiBJ)bP!a%iRu7H9n)w3&Sj&is7V;>53yeAS^@*8x_@@|eydz68n_ zn!g=Ac#zOMRr^5Itc~uSx?pg>K5BdQ@+H83{|A2i^(XxC`#Xxj{Kq?j^gGtN;0=%+cVm$x;%l8VDMY^R~q~Q7ObKEE`z>{z9D1a7g z_V=%z;rf+AWq}rdW*Ojk{K#IMK5-Z;mo7rRI`S7-71Sq0P8mBK-P^WPAK59b8{qBsvkm=E}2DS!| z!s|^kxrwKOW>IBmezI0V%NZoVD%M>EvIY@C%b}U&_DsN8fLW}~N`zV&ASeK`-pHNlL}+%w*$g*=GoeW)i%Ab4NVZjf3D;!DrFxB; zsLIB%eXF+U+PoE7u%lCxol9{|FssVns+hsNdL0F^)n(>60&`W18Zcw$r%C;WXlB(2 zO{@skbsMO#j+Pen*fFXPtGe>9LKE(9$WBxP%1TWMU6*Ropdnf`Zh}UZ^-y0X%V1q; zY*Cw#Y=MTne*J1S(7Lt-x>;GESG^h--p~w=9hxGve|Myg8HV{&CSu-%ahNk|B(jGO zL*~GNNblbt>HYd4wNGy(_U(y;zTFYsvkRhnbw+fbu88T|6$v~)kxK6018D<#AahV} zWRK{BdE@$H@swd$Wj7ihxlY2T?$+4iGab8w9B?Gc9j6Iwmr{dKm=cQOv``dghT(!h zZJIBR6Pk|4`{7J{5YEO2A~(q&7gObUZ$d4XDTKx(FV!K7tu+>w3qn=qaHE=yq%jbY?pWZ?jfM+{6L%N_iU$B2|w zx|~6bEb^}`&6Q*!I0}(<|<|I1NsnP>gRt*m5()*DKu9GS`}AW`S3DN^1%L=&@6d1u2)Li zq--=90pC>O=dAnH=+Xm#YtNywq|&?(&N?9L*Vnf-ndF(zQLmutnVQ^#ehodax4`oe zmV+ijw22N$PfzQAotW!Wla`n775&bbL%SZlohF{83N7cP97#;muMhNP!`D` zKz;p>U%ywr%`(x6K(hcGLF2yoHjAZMCQ$kH$G0jb^z|$GILmbQ@7ac@peI_R}%7M>pkT*{4lQY@C&X2RDwet1VwL+etn;$P zrv&aTA)eR~>4V)--q;)Eg#$6(I2`YTqXdsrgpJdQp*YUQ@gN(`J!~v@MFnD8OaP)L z4Mr2QstWW{-N)kImzlV?HW>Mf{o&reIyMK4z^e`HG%WML%auMxOS44N;C(C0>{`vD zuKb4ypw9@RDzu!ys4NAT6*LF%w?I5wPOXr20x!W=`&~O$1{^A>bKi$$T{t;A|v6J45^qi?JF7}B;e`ZlbMZp=d;t6CV)s18QA zYJ%}?o5QA48^nwqiWMHVSY$UAOJ|J4su?4&cE%{IvnF&|kH%;0RDC*OBtGFgp)_uU z+WunF7;Ls4gKf6svD0Y^cDhW#F1M-J<24QYyr*No_Y55NwZ$<%dz=n-!TAU`LL{Lz zRwm`}Mrm3gZf1w$?%YUxJtqS9XNTkKnIX8F?vGn(ez=zAO>p$ZwR!$Hm+XjR0=R^# zGjU#oW3m1c?9yG3pCNK1JUioJvJ)=w*m5Oly2y3WH*CJ^-EoNy_QzY${35@yd5 zcqN+6c`8r*cd1JRyt8rkgmgl&05Ffe$lotf`SH#?j@L_YR8doMllXgrKs13`9j8gw z^LU+nE(@q486cPFgyJM;0;nr4CrUcBm!fy3(8p6O8Jf##Cc+^>v-7oXoVnlj#t}_z)t^Z$e7R< z6Pnh-h8g2f7R`=cgdf4xm+OAG9u+|7^j8D)ievo@^AzDjT}N%Yl#na{hJouOjpsf=k|XUFGitc!LI zVOwS&ERA%<#c(Iw|7a%u@1xWBy<``DJozOqu9|~3b+yU2>Q=1+ufAQd#>E;rGlt^y zyb$2>m-xrt#dy0m8Sg&N29B=9zaLzIe;!*d2H)#i@(-Y;0CQ~>9i~PN 
zZK`%@C#~w$#o$2$5Ec}OWjPD6Yx@=j%EBQkgxr=d33%JsKC}I0n=bA2@xA+TV&4HA z+q0i-vOqKUD`0*S6)?H8q&&mJQ8%yFDK zaacvO5J~imqKjt~G~d2ji1UQz(*|f3>+x?ty{TYjHU((eCr~YLETQKlhV#$gy;8BD z|M>Msyn6NkMftfnv|s7~KSl?3<0P-`=IW^OEg!$$-8;k0Zie#pmBuRBdKj*+h{e~0 ztVc@-$IHW&YsS-Mfq1+!j8GhgC#$(!6H3W++d-7(->krkaH%Y=1iTxw15lVA%*H1a zm)W2cWrZQfW(@k)F++cLF2*!#fVEyT2O2@Z^#UcWxXS?e7+-C@0ZU`lOdJr(Z z3C_NRRv%??Ho8OzF#8Z-WfBoB-m>V0NT8Gwkbbs8fLi3Q!oVpDG(lBYd5mWzmfeaV zgBwS%78;Hn74d1V2+cgs0L{8TwGPZO3!qq;pD*Q-_w$0!@gl0j>W7ALojRnV61VTqFjBp2HInHxC;(Od*EP# zFLtqW_9g4nW@B9{<&t`LB( zB>Cb>Dq%P?5Vx{}ad%D_Au$*?)BSLRAgE87d?Vc(w`Tj{=7J#PW$@aW-Z-B`U{3Zy ze!4G8vi(px*Nb53h1+up;IjzYSxlehUnD-bsmvdhpW=c%Lh2;~b6x`TLwL<61S_~D z_?}O6QPC&PC2EmR&hz>gsEdTNi*g?^JT98>OVGGGEEH zf+RvtA@>*Y8YR5O71oWCWDgXiNStL4!YA{|bV>;7*9p=$nde(s1X{5)vorCfzYV(7 zsixllFh1h~b~dDp%aRG4$plD3^9@3CnS_gr_rz-Fv9Rva6rMwR;Nz(ya6Hfvi6gpW zL=!VCnKFv78jNc(A`O^tLQl#kZY#BYS=?@xV+<4ZEpytW3-Vrljr-Vdw&ow3w&Dvm9R#UJN4V!taMZ!tX~v#y?Jdfqy%*8UKEIGyePRR{Y2D&+*IdHF&vo1)l9% zi-M2mp-0m?%KzBBjtpe24lBO9mKNF|KA8xtmU#`;Y1Z18s%YG(0Y;1%is+~aY*@b* z`}giv2B!mrUloRS`&Zb#c^h_b*{TMu=^%TQ0D0)EJvg>|KaTG?$n66Pnq~fm%5yjz z*d>v%D(WEN^WY9$e*w+p1m$r+F3|q%7LNF^bhv%*Hv@nQe5&-6DvpoRq%lcFXeP_( zMqcuWo~9{x=^UlT%^6s550`qtg$n#x?g{Z>Xx7u0ycqvsuK>+DSee`)RSwP4t56p( z0ail>w2MgQbx1Va2ygyWmA0snzeNGyqbuG^RJ+} za`~95V!pLBbx7XNN?1|62J@nNGSY+7CEKdP8m0FD{|uZDvzI8UykETll%AJAhh~v< zhFEC~6;eC;6 zd-PDf=?$ydQ~&25_~n-$@W&s&C@0u^>;sr>&9FB%;ooQp5oT^Vw7FE z%#YM|Y~S`d&YeApoJI3c%dDo#dy|G%aGpLHy*jj4Gx3dU*9_<9hT$86vIP8pvW#%N zEErD+Z%4+d!p1xlm)JO6&I-kR+cD_hz!HO7 zwZPCOR#@gS4POy-)(1FXZMYw%_UW!Z_~W~`#i4mg%8Gf64en7kK*u8-a5&fzyM67k z)5{inyzQ{x*8xWYoN+9O;%_HH9C4c9e1?tE*=SdsBv2gWmzZ=_+vlgViA@Az=D;tbs4QhT!#vaDuBB0n-OB37D@0j0w*&%b&PK zXqI95%^RCE5j5K6p=aYtby$gyZEp zIEQFU0qK>5-}j(d&ehV>tk%qzSh5MsG7A2A4%1ph$ufuMDQFhg4Go$lPmc|g)hK{p zqe1iAkD_pAAsg{bTioLP+$Mw<&G*EmS%k?f!X)ADe6kx(Mml1P-*k*?+CYt}w`4VzsX;)(QCf*kO&YGuC^#VuPO-)_S_)qhNm|+uER+nVABmctZ5C)Hs}C zJ;;p=z?s-koQd?}x*N_>XQ^BQZY}{im(TYCK~+M8$@IuFX~jj>5dp$;DIPe(I&+FT zK}b1Hs67@s4dD?_>qx 
zHrMZ@`r#IV_;wQL{WmhS{86SpZhQKp7Z$J@(7`q z3AC5F%$JaEERV}E&ID$bXAI$yuqII@1dt0^K1GxOqpWKZKz3#N+Hb{N7PBl%s8VsM zkTNC&mQuwmKZ%|rE(gT~)H0UKP2T%Wb{uYH_@RuQikowz@Ui<8v^Uctc?dM;rv?$Q zSw>9j7V~(MPSchfCzPvaeb}Gc$%+5n1?>n7km1@WGA~Yw;QY6sqz-^WV zpI0G4SKKrTqgd{gHb77!W{6vixM)xU*P>XK520BG1kV)*BN#Hbvx&QMj>p6P_R6ftN>j;qmUz@odi*c(C(R6m3|H z!jG2V>KCg}@X=!Au9}OJi!!h`BO0Hwtme4SK;)F+u<6$s4b7@43vw-UnINYo>eZ^j zwuINLE`zG8pjOQqXx*Y2#<0U26B&-RY!CPC*@L5p4&l)L{n)uxX7byHUE2tJ+sG6$ zbMiK}+1m-4WQBv|heOmcw$F#zz8@nXAKxSH83zb=m4R8-D*^M)edL1uOve4w51r@WPS(5NlL;QWCerNabzhSK4peI(P5SDS4`I+uFJI! za(O_QOn8#VSA=HnGvO^$M$|dcD2Y<0NhWZq7;r-4QGsPC&x2yLWZJsqzD$8x^xotq zrUWng{|%Z&ra-32NcP%!#&iE?D!)4yFpCT!+;kbQX_{Aq!w6H-TVHyoO1IG}ZgZ@;R7*vnf35!2D;>EYk1K2+gu?3e6fYmqW9`LTuVY)vWB= zi(m@P`A${DxoOeUFmDO9bYlICuIO&YU_#a6YJ_TF9WlytBt~o8Wxzn0y?zso2Xue=io} z?<-iBMfyHj{^R$b@bbwwYN~E=&-mr1H!4zxK(j>HxOe+1ZeA-wS!q5#`*Z_No;ZTN zEQ@w+TC0zoMXj1}upSF{+i7ZofmUp^z6`g=!xh#9PB;}vSSB=|3vPj~oQwu|u%H zWdg1(iNHOAX7R!R1b4H*X0Ks*vmpX63B$^QOb~rVpcVIvr>lY#oW2lXUQSR}!0d&$ zOGG|O0?w}p&I+Cdnq?-%Ro;Yc4Un~|u4O_JeVbCVEfRjk?|GRhm@*yeX3QZ2I^O>hKR}I z@lgOF*=Z8ixJ|+;mnrzj%@!NnW?+qzH8yxTA$7)7v^1}!pgC&%XdDRf!Z`x=B_CUy zWrrx&-yY}uXW$HVI$%0Z`V*}Erto=B!I@xd~Sh;I?hJf;A%Png^*fG$i156Oo(wNL=#L1l-E-PfISGp1iK8vOFBCZ z+%Dj;G7#-{CZRc-z&n%Wk>!Qkv%K*29CkWb&mV{dnDBf((G52VV7CZ$HwlV2Q@n8} z)1S~xSSGOF<@%j8Kip=0zDbC@K?u8^?8)Cka3d>>*Cbrei$)nCyOen-VL4yr^{&q& z^eqU)?F9ty`GL4a$<)SW**>^Bi;y*wfS>Nk4yp#um#Ly;XA~1OixfaR6I#UzOxPv- z74d$H)G=Jj0JsG1XU8KyMqCjba9Lb444|!GQ2<#)h$|9nay-+cikW5!&%MHZrM!3P zT$B-P%LvNXc#qe3Z)Ln@8SnEd%jGH|_KufU>xXMgA6H|7J2llK_1qmY^!o>^2U=o7IA6&ki_kHysz8XCP^C7xZgthBWbs zj$#?bgyMEMJ2~M3&4GkwKa>jW5^}XEuf1_m#Plx8J*PULKxX+%6c-aG-n(=-9NBSV z8M2-fu_IQ<=T;zqECU0XR%w)km-AHMEVK5>tbn@m3CiLNT`bTX$BvbhSFAm*M%v+4 zlnd_g_Z#d;Tub#waVk3llyugv3N(ukbF4(p5FcpP6LuPl!}%;C*jXhszumtEczyxz zigw_K+-=yuJQdCB)KZ}hC8TJ*YFdPrx>OU38W_}}1x9r4fMK0FVN}O<7}dTthPH05 zLV0$nR~y|L)kT*Eb9~2<}Y#L$YIT*OkkF2talTNcN3Il-V2FjwQCDuZ<}xc+i{tIV;kG) 
z?b`{=0-1znvdIzh$FV(oDMB}4Sw+(_X}>GLGNsSQuz$yX0waO?D<1O|*+4DX&X==? z#v~d8?!XbY|N6AjIyfI8JIb~KX9Fjc^8)iF=jz8QSe7)H=c6hu9i9o!Iy6%U4LV@( zW>BsmOs}G)*@U-vTsa?B5IY)F`4r(MOlEpq1^zPdoPfUSBvgjxy}WouXx2_BKk=`i znKvaD)G}^_GbQOEC=ja%&YJXdj3BIU|4(FUR{twt)@FS%LbGX>)Lx)`RDKO+aEvUZ-4(4|MqWx;LYo2IDhtpl0fsh(?^th#u@6=VHq8~ zQ(2tP9zDQ{y%pl8{5!$e)HTD@(kznL&0<{^{qn;r1{>$gmxQeaXJq7VVDB8IhPcu&dS)>C#GBIu1s9;v5^0i>U!POMpJbhH9^$ z9d>z5$4-x_*yA}J2Yu~vgm85%$bk?iR%U`yj0?_2JL7DqfV2%xhT7poxGPRa`(PIV zJ9=mYXNbO(r7|cos`90h-dFSztL-0kbBt zItw%h8lahATn^0w%#tqQTh0}0xC{W~Ij@%RoN{PpItrRKw~c4Z1)4QWvp};x{kI;o z`Spe{1Ow$qB1jWVQk^Lm0yE(xgJ4HMzLMgB65i+4 zG{Rp70WZr3x3YaH0`E*u+@0l(M+?}YSsIB~izD%EmLF~q97W=TTgLORF|8Z3d~tI& z>lc^eR#BGbiBhI3ka<0k*CQ0&OAjO{`4KP!aBpU~a^bkimPbIu5OR@%gMPl_$_Cg6e4#oVfNY2S1 z%<@`S34|pHZUjX>W2Pa#$Pyp9h-t~bVnVY-au8V7AY4Q>0_U z`{nYcbV!)SRf2OV^C5o8nphTmuZ!j0XGdbelp!icvvi;WM)bplcweT+@?*Yk5!`Pi z5t^C)4Z^78bA`)j3~5{)zCAnQf}IU6Im|%%u%77K%nXTR2jF5fp*cDfcL~k+!UJ$K zJP_9-1ezs89D$kOn-^mO&2cgSkdT~YuQ~>Wyw3s=?@_Zd6PkJbV&``;}m-j<0)UfeoZ-dAE=aXp$Hx=3C>j2)RU7ruK=czbYzg63a}cHxKf+i`GhCYlp= zt5&HFvzoO~yH<4qZdKG2Yp^-rOMdIktIL2qs+zb_)Zn|yW6jLevW}%CEM;<@23Dxs zus$rTEK#F2(=+G$$@FSe;rTT!QNMmmbnDg!)2CV^J~kRFSFgd2JqK~-e#)km@5339Y=K%xt%hiEEQ~ zNG6#nE{yDM>iXDTf%S?bcDScREmWzxgV?b*)!vJwig%2EnY zMu2MhVE$MDGRL~sgJI`EAdt|!&mdVhTAC?Cd8oe|c)kvpQck){iZO5cLotZgmgRnv zeX@Q)K$hp}Quk=cV`Zt)tLiv8Pb;JH2m79k(5%bURYL=4Nkht8B+ppx(*VqrK(O}W z$x@RxsfxbjZ))@|Ux2U2{ku0+^omR8PZOAr;sT-hyjYu$?!~D?On>iIiXRQbz(cVP z|Mu(mct`mCe;<6#asA^yy<1SP`K#ZV{YhS)31^BFd9XA{D)aM}a}4(kuUiNoMHpdHrwPQsNr z{wT^|gE`v^vAr$vh0icN+Ymvx6n|zPDu|#=@Kg#W4Eqv}y~{1jnh4N3EOTA-l6tz7 zV7%N@S(?k?*Z|N*E3rr{zl7oCgj*9!bNP}mExzK|ye!4X_2DoFj<0PzepN#ba%`Q{0 z$%Xp`p1qxrOlWRpSzCkVQ6q37HV~(MZIp%iOn@!UQMrCI2*cBG!g~si`Aory00ML9 z44e(K!Re6cI2$g|JOigfrs8-2L0bHAGyHLZ@OCQB87E_;POxr-+u>ZSD=w?4nb`>t zD=Q(XnBaUR-5FPTZ>1T8T>_{;pSUBGBzkaN)3tO@!YLszlMt8bhOcLN;Ng6BRu-|m zmqp;o{9xQp5sN4Rj}jl;n>_y3OhR)Op*ho!Y4{Qp3CGD?r*0E0?-0ap5~$Q{bA+Rt 
z;?hBQx$;vN<8RIX8mWk0ii{^IpPGkQk23$>F%bj_mo=AY9;eE@eoVxd1-bKwL=l!W-tn7g`+4ljBp*w@(IV4j1XMO3`A*`C`joFk1fp#K}lu^J6gO}Ry0ax zM)UYc-Z#&k8H6I{OJ)VVk{W{2^l;?!dg8a7M-|SAMcn9qu&iDMGL3%7us%4SD6xy> z9{?9zBQRe}bifV5tPF}1tMv-UQ5f8yD%^Us!*N$9oOH2A+K9gB-^>CDCg_Hx_86CAwytoZH*BBKxAiSW5fDQ*h%m_dXVkci4!<} z>^Ke{I83hCPp}mOkmi4^_(YgMIKd!+XdR^YY$F^JmNhFgVNw~~G_Vx+1|95FNLaSr zrfecyCs8u?vHjP8QI@<0Swaku8h?ol{2_cGtUyVglp6?tmunm%Gbon~gAS7qM8Za< zl04|VB+|pw%3wNyd|k)NMX=V{g`A^7Ebo!|5ym;lYir`Ra0`#uIq`$6D6C|VvNV@N zvp%_8#dD0;Gx$>*q~C+aLp+9jRK@sreZ@>D@nEBMShMcxaqa|~b@vQyN@!-zkQtRR z{lO?1Ay5N9N!MtFu4u8Ahpb)y&$f-}8q+U_=1SVXjW@6SftZd!Gb^EfEe)Fa8w+MH z3uND35%X!H3eZfc3cz#Z;p!El!rugj`+Pr-l_kf@JxPT&oLk}i_oX|r>&J`!9GY2? z`3x$nA~aWg_9~2>SdtA=P;3IsMreLt68gpn&3ly&%IIBzYL?NUUOmvaO9xCCqCvB} zs~i6F-~SW8{_>9c8Gro#EB@mjeIjN1oI*+78Js%2n*h$2 zZ$DvqFYDj#t!mKT-8)zD+iySM9bx#-EX}5p-;L0$Ny5?zIE%&k^^3F@4%3)HADzkqv9B@K@h12*86JA3Pv5K3MHXxFQ^jluL%e z!mOZKw=@eh6S`K36_e*LW#gJ1s2bbCxEBJf-$ad7xkCl$c}BXDno^LglB@Yl=b5^TI7-^ z$^y-EmJ2Y4Fdrd!MOfApsw~bDMMEsm&sGExn1k?;X*^PCXaFwKq4||S^CCACW!bPp zLAaddg8WQpc7WWGm*9%rNIr)U2Lh`bKKHc4p!#*yzZA7=v7xS31x*Of$up+nBR>zU zahihlu2Trild#E|(CjoB8{BNL+Sdu`juX+QZf*6<1KDXi8SRI2f$Z?`nVl3^_9K}4 z+7YB}aomUC>^B{!f^7-NHaHW;->K7~({M6)DozlZ&nFUo=Mc~cs;6S@aU#xvO9z}J ztehZdoC$NmrDz}IiItHsa#;qr$=~j&yw_9@)>99Hw6}`iD-%xKARyi(2;QCiaO?Dh_&xs(=M&bI* z2wY7ILs=TPGh&zqw-fzwDa;wUf%do@?oRj;3oXGjDHxZcd~qRy5F6@=WB#n)-gel> z^4{ZN%ersNj-xHMy4zs0`wV6A-{NbJ%`C4^y{EI|XM^p&4*1008ViOFLR!!6h$94r zbnOWLuI=H|xi!2xG=o>Wrf{WPIy8q<`(~KlssX0AX$Z&8&Ee6j9sK%tMo|B*2=3Pn zVf}j`RF(reBb@JX(J;h~CW`qFtx9=+?3sLI(E0g%p2C1d7rmnXcMFOT}fE;!)ofLT+9V`PWMh#4~!=VC(H841RnC_;01Ai-In z*&k)$>@-Dr;Q}G}EITV_;+@%oHc0XF;PC6Sl5bTSg-i)Dqvl_8flN4(XLE~^^MCbDTH|Cio?!CT7Vir zdY!<0li#rG0?pA5xE|$%Qs$$W%i?fH6tNBqH2-vb6Y%UJel7S4&(D5|Ouml-&DCpK zqME?5SuL2E5twTblFjO&Dgjw$L9JJ>5gIjZiB|18qFc}27&LSkCQObEja+l)M>;F-r7qy-^hy8KP9A)0orMfC=7V{b$QWD~4mix^ z5iZ3YLv++2<=adqDGwpXJXR1HQ3qNrA0~M)$drpJa1+Z_gJxk<1EW%=$7|!sn66xh 
z>Ff8QS+gl&KjvMcg8eBptHjEomIW$cTfxfw7tl;;mS$a*VNgr6f=qs|a5I(iB{HU~ zNvmM*1G9o=>A{&ovmq~%wg$~o5ML?Z)r!E(bEpqNvs4^;u&Q7Uc~a?_+=DT1+Dl-X zUrnaR{?AnIX2Sc`_sPSSo|zGr{~Ve>JSb3m{(A89%7FZ5`%HmZ$>5eD5?EFeOEt@E z_tq~lcJKg99X$*q`}9!IoRXA;|MOq}jh}z~L0Ovr_Q!AduYdmsoSm z-uaWbRhp06SMzb=&@T4aCGIm{oc*ku1m^t&=Z6pO;P>Buz|TLwDhJ{Z!Lly>7*lXo zlb;ZnWg--b6aDa;JNWG5^|*bb6kmS+5v=OfW4@~@AL3qJ+QGxw4qaQcKpSp<>Nf=s zS486L1#WmuaD2EzESdsXG85j1q4_!Y%WQc9&CdycGSie;yu=Dsm=cKd1cVJIAj zE2fScf{yiSA$06u9L$QrRyGu$rG{W^uTCmI!`rvSc0%-CHY`WkNFF8hNUT|j0B|PJ z1N%d4@fD$JN1!9Vl(2OkHdy974RdTJA$8JdM2#GX;9&#cHlQ!;dUnCo&h6m9PL6Hw zzOd;(81{pQ!KQByf7sw2f_IyU-wVzr|+HV~FK5|#y;H@Z*83XdsR?q`EcxAEx2PMSb-(5N9eLEsQb zKF4Qz%3pxl8E3p*anj2P$Gz=w(w`6ek`X64M&i!{jFQF{w#IOlsU1lN&U_r237OPx^ocmKfNuE`~H~ zfT1lKqJNWm=)-3tVf^~=UF}`B7MB+2Yem&JN52LZ=-xIf)zA`M>efUTrq$Ha9939LAt*`eT=gi*Ht@-O#pGKWC!&TU(jgpI_NU=Q1T0cQow`}UJ7G`B|q z=A(P|W1krJc#Igv^mg7<5;;Rzm<5<+X@F)KxMm8?Mj4^m$V@sk%Q~4vv6tpOps6B< z>3kvAP{7LH4{&M9XSyn0Tcq=n2F(J{A`O}~ALh!?tO2v=&!JfdXPvuDRXI=kJ;>lc zWo2f%A7Ze|=jq&A4$KCrDZB-m%l%b9lmP=U>(XKQ!_ch3ayc|Jqk5+PRHkW5#W49z zu3tG%x=e3egVGOMnMM1=aeg~NQT&8dPfj~e(idRX3c^q*<>1WU{tTLVecifinkSW> zN&cmGqXV;^N7LUmRW9H6%lOZvpQHb+Vwx9Yo!bB@I$zd2YR*B2GxDK>0ixP0uuv4gt_)(3Io*nSmL`RSv3`2E-K@#Bvb<1~LrrogOA zhh}{elpnu)iGTj>7v-Aq@y0dy`p$I>89Y$sTYOR5HfsbwR|nXS9i^tR4IADKHwn%6 z7rGJpJQVajBG5e|XgyvXM9>mx;H;o|b%=uIXRG9|e;JHM5G4HFo=0$=8HnQ409=|G zfo$iA7{x|z;>ZDLVO|x!!~5dk>^N*s_QMy1<_X<9tLPd1+qS@#>{#q#gR(y}1p5fx z$5Q=qC_NDS2_;*S0x`#F68s1Egmt^-7}B5~`VkO%TUnupRReUlYRJa85xO_!Zyh_c z!ES>-EnA>(o7UX!h@q`oVpQWgm~S%zCsRFfPK}T!G-rBa$;8fBYttPsK8k^alY70& zLxq}?kZvl%#d0@-vFCfRto)Y^u&jWYu>4ejnR>*I#}h)aO#bp{HOrC`UuIoe$TzVV z6M&cUI*SRfYS5l$9TwPqwIYal3B)Uc?n}`cLaw-0C}0-9YonF5e_Jv$%Z=1g-eOJ;D!s)8Z@5`cg49# zcYNeF9lh$+hGh+j(=5J{Rng3>22$;8u*u(>&^#6E9j9OewUGe5(RmV96P{Ol+hT#+ zWOQ#(TTSI2GGZ`}g?ZvAJ3waw+;H6A38(znN%D2U32&|kxbT`DIL+&x40BRZQI3TX zq$T2mSa|1#ACM z%RHxIq1zhVW?LzBwG*w}NAb zR&ejo1|i+MBD6;r1b6L#2zKxyx_3l)H>TaAJ>q(GKw{rcNE_G{Gl%uVyz!&3aPl~0 
zj~j{PA$<|suLt4>_D0f>en=eJ7YRf9Ac@Ci5XN)HkH+e$6Y#0a41DG5h#dhAyl;DK z3vtGlU^i?H^u#t_KWy{#!4^MPY+(mQuqhUhVJ|@8v5WXUuh(gaI8J!o6P?oJ)$t^@I=w%>v9fqI^&q;fYJUf0-QR z9N|?uROble=VRp`2zCskoKYCbx=t0bLnx6du0#@;BMHorUIb_#g0vQDE?*Swhe8F- zA_-&XrL4}PeAYKLU{Luqi{;uLC4}HBYL-8CNLeQf*s01-4rXUG90jsuSr&-<2SHn` z+$GWW%B7=3fSI~XSiUTlW&-_Br#~Y!U&apwJMs0gP1v$58I9_&^HHS+tQt1O6zj=| zi;BUL1*LX1_13D1YwR?6{;A#{Tn&rufTKap`flm@iIY89bXgH>7A z*rXyiY0#|khj9WBBa4_y#E;L9Cs(bgylG{dTmWOG)uzvCKLN$Rk8C<6Z*a^73XK5B0 zz}b+Qzm}Y@%eY^^XI6*M#b_n=uG zjwZ>+UqQ3HAev;|RE8d-+@mU2seH0Dt)NAbP=bIvUg8t zK7I56@-Lp@aVL4)VLbZwHvahS`*MF~(Z2#^Q|Z=bT{19G8yxrsCwR@g3uh1+5u~8G zYPG7UZ&3?wQzs)i&WZBK#?0B+x9D=9RuIDNA}lKjK5u)>o|ZwIXKal}erXRHhM#D*w;tcwrC(r|ZVxK2X6 z%}~ToAB6ZReUUo7KQgEH!<-odkmE314I*3PHV$h&CSax8C@ioagsf@(k!?Fn0rYZz zdu$*ieUTi3oin3xAUh5RXU1SBA!k#d{HwuprVYgGiM=s*azEru8;sRX=>Mw8HMAip*Y5C949axV@L2T-+_x+ z5y+briOaLXQ8+sSMcH8}nH`R6^P+KMP84o%d2Mb4%4UnjJ4`LF&kM(`g)$*cBo&2w zIdOQfED4WSQmeVXGJ>6?aD2Na9QRg+;_DR=_-1J=9^@qA@uFlZigh=FbuI#rSzjJ6 z2*vZo(fD>=7`_qLlch2Ec6mI$UKx*vtCI0-Wh&oqE~)3#%ax)e*1<%)S{9ENi(~PK zdHXgq7>_a|@HOw@&TMwXvO;l_fLM|kjKahK6({*blsDFS*rHRdnrKk3E}>bThc%&A zpp+0x&=gB;5usVax`_)!VZ0w!IZZ~l+Ep;VM_YUuW%&Nxlz{^{Hoz^idV>q<`5cU%61R^y^W&0`v@U{{fw{25{ z@Aeatb;+Q-gM{V11ZX*4+WFl)PL_H*tSyBh#Gs@xmkeIh2G$#)ShoQM#JXQy0IbV${GjPVg0%tXwAvv*J??E)Tc?}ugro_Bx236Jp z=6{b8d2%=Tmz@g}Xny~~QedW_rUUbbp;?y(&Hu_j_s?W{!+$3IW;I~`pF^{0L8-J1 z6-+_1^dc%jvq(X6rH7+G$oI;N`PbpNqF9cGbya38S5u#El?MYb|4*Q~Vud!OC6&|A z`!dawroVz_0c7b#o9X~xY%U4Ga{op#UA3>G6et^{wpCg2czJ1dY{#}wKE{_D)*;H@ zS6N2eG;fBdPafgF{`23JKeKLW{@ZWj7I7E3XOF9>8P~2B;96-uit}<+SUHK)d}RMl zoIiaOg?Z<2`Qm9@ICmUh-zlpQZcg++g=QltYgT62C&2vMI~h3m18!a`!O=r|@#)8F z(4k{%RH5-dluWT(pZmm0-LbwI`n0Hvp=}yqVwaY% z@7oF9!+RrQ;$WmYOjK6GPeMGfCBhq9*qD%B%OA~Zi#>)c0RR^Yr`pqT(`0?aR$Nu&;cyk1FQR?zH^H*2}Qju1`ImFON4j;A|0_55m)Rp@c9m6wh=)ake{(W_sdMvKubO z5i+9Pa7ld33C-sTFdJQ`p`*DOYVs!eFFS&bb@3#7T&_;Gy@KD;}; z`nH2-zm9NcC(5;dPq+{659a~>;W}^-yoQZ}%cxPX88;kuV}`NWofad%Vv4t?m}POV|m}q@x`@8z9?N3fT9IKC|(qSD+I*=NI|#0 
zlI3wIT^5TwT>pA;BtpbAZ492QiDCyc1}|2};^mq+E~D{$RTMjD(d-08v#v#Q8O3D;UagMc zGiDkq!|-NhIKEpRj<V;Bp<`;WZwtcV5ax=&8gXU*e0`i~aDOgiBoHj5qTg@%?;P{J>}b!+amSnd^h6 zvsr)V@tzj(9bX>F_aPQnMnK zf=sj`{=r32?%ejpDo26lsu?8O}6e?xyWxJ~Uc*zil*nS^oyM0)}@m2!t7PUOY zw)lYB7I@Y`T7nMCw1S})?H*oN;CelTzEYVnAr&;W9=p| z@7lE$RkXhf1g4A^)C)ua%@mY%SXNR1XgE$TV33%%)zT#671A`oqb_Z~Jm`uLX)3+o zwQV_G2WH-sk`a=nvWjF~(wBhX0?d2AGKFSCehd{u(p45`Wro+JpqUR@+%|M)clB^6 zHFA`N{Gt2PRrx+mWvq}!Y2_;AYf{FQ^xiX|^z%*2O7ioe3Z#Lv{4Mm?(0pi*Kr??6 zNG=CvU9!&z%(_gWS@NLrB`+91B%3#F#Frb_W9pdE>LWFB$YAA`@o)e9JE8eU{PFuc zH6!5Pe*YQw@7}SGF>+XX8l;%|KjO4_~Gp{weQ!Tzc05o7Z+Sqaj1_S*^QuJZ$gea%m~e0 zn>I$gp9kCtH8R0K(Ab{1!G{0a1zvcx&;t*bb4jq%tj&SSx-7sfk}zyfmJ`CnwSW-! zY=x%+%a;VdCkq2{Bij!b=Z3;>XlL|jZiOzbnxP(H*t&mb>|GFz{j-9wB|R9Ex^+@f z7#f<_CV16Erv~Qe(Xugmw{C@z9SByvJ0i+@5SFv?_&nJUTiGb@%M8Sc*`dgt7mmF7 zp|~_B2zl9oD3}$10yflnN$xlq?SKQ(w%D6sj{^w~ILSsVFWMgYF*9%>ehMz6OhH+u z4Q|c%!c79np{VirhEVZ}K>Je73h2lCVuxTkfpVFj0gic(ER*LvPL~2^11vvWLP%j} z<1t}cU|ZZXWQuVeoFA^1^7LW(NqPF=5jz5K{;h1GA!2BJd z_AP<=?OKA7ED5@A2*R&dh$WhUygU?d2y-&{Puw!TTSVBJ&;9eoH#$gzW)ol*Jyl@> z3Cq-zwbVL3v$Y|(G2a6v*{&#=?TLa+nf$>Oc?lkD=-IFnUL+3o1+H&&n~F}g2+e$+ z;+|23@YAYpZKS(7BFDoHiDUaCVrUom_Gts}?ky46wJrR*bU;Yoo`@Si0uCdFqGPir z7}CBu0ta-1Tchp({Hr%<~fNU+l(?k}N&&mU;h<`F*pL z^>vLO9oLkLtU-R>kn{EwC}z6Q2b*W4_M}4DHq&p2K_L(!6Ap zriS27j34e2Zm$dM^1E>%%w9#a$d7YDKB2rM(g|0&|0+SbG}swesG?AN6jDVzwm95D zsU%7U;}L+P+z8Dc1b*>rHbQe@EIR>l8Z^uF+-e!CL$i~D=EA5M%>N8RGdt}}Qvmp4 zq$kdYyW>J6-z#=VWq|4>0`mpJ^#wxnrAPs6N6HD8A_&^yt~f=oewVuycvFnGCA(3w zYbg${&qR+_nx#2Yp!tvho4~O^8ljAUcxdNV93ccB`bv`m=C3wu+Oe51MzKAXe^XUk ztlDTJi>S7qETHp&u^na`svyn49U?wicZG*boueoy}bnss1S&}`tbaxVJ-@923{ ze%U7FCY=rGcvLJ0nRSuk^)EMZG&Hk z(Zns4!uU}1=?wKL4YmDJ;C~LyrjqoG7Uqi3ynCAwn#;kNzwcGYYI@&QLLR&(@oml=NR|j9y8Z&3KM2j#jxAqc>t~-}VfIY4YuQ45q+FdG@n8S- zAB5%K2+QvX&@veCXTt4|%70h}02UXVSGstPfP66*CHWWBBq>Ms@51HtrwP&LafLv8 zi2z-6ITt^D_gulRK=7NFkIDgBw?Kagn&lYzT?YT%C@aKwub&w9%W-nv_qbMCsP6sn z!ChFjdMTQ>XbtmfHPMLR<83nqk%3-l-OvhM>r}_SWEb4a3B>)mZg`-4a|w4V#nK$8 
zVEK_i^Gd>qg6051vshyZl>!9>NNMDsi;Kp5f1IBi0?&c%(W_-6LQ+#SF|US+ecNHj zyeRC-3dFXIP+0fuq%8Ln`}M?fHb`ru-LWMz9DC=b;KZU-Tv!x`q9xHN%85du%%GMX zfFc5UK{~-Mg$-e%0~;zkHdN9uvayM=MsDOJoQ;}-({a;rGTsKKy{`xuFBKfige@UzvX`eaP_F_si@Yffnk8Ds69Ua+ z>WNsKm-}im%nB&bECc^Ei?e2B7S{}M;gB+ZxWt#>Oz0t;%T(MisaHJjHNpA~CBXc~ zpzjENZwR~M>-=VAFhy7Xi7T_+Q9R27mnCXOk}E+#V%vHs5+IvE{m)_`U8YN%1ODpduI z%xYkCi$<8z%o293E#cR@C89_6z^tjmvCM89HhI|NivV|Qj}An{l<{a)y(*%nj6lJn zcwR3IW%FZkXL%y)QiPg_o+F)}*DUk5Ec5Tzc;Vd|Z~V@(|AX7V5oF)3bSJQS5LkT)tAQ*lUNeVv zoqED+J zG<>(zfoZ$q$5oQP2Y!(25LAC$=7D!BobdBX!YcLSis|@%r47DYX@|F~9QeGQ2-!|} zvDz8W*0}H)Nk>%b9_u1IVozEBp0Y#tjOjdI%HLSmp37{H?5I9xJyp+nO&~!zlpVA% z*0XRPABLB#8*g&h`C`Z54LbvGd0%gqPQ%;f6Y<@$$@qTRRQ#~ihWF-x@0K_d&fV~g z@4{307lifa0qfiSbu8QU{4T5y!q@CzeZ4je_g07F{>n&vyCNF*3DI9KioiGQz&_0K zVP}G!jtno{=W~(S5G7<>KEd&<-((#0w?XXa{;IRwsD53>* z9bJi+To5au>r80&z(&W(=w(p_BfGc4$Dv;M!ruw23B2Qbw}IctKFD8`jw|UQxE;&R zPm~X?$rS0zQma9;fUNimmk~yuSNAN#-c>2+ z(Y(G28=9G(hLcD35=^(?2>IX;+vS71w%`z<_%K0P0W;fP9hem?Q~Rh2hAtCeR_vkL zdJQb~Hd=#Rktx?0+i@LAwdfckwzppiFdLy+e$!Zk?YT)>I)IlCS}T{vWaaL&WS0su zK%xBauPn_Xfo3ffonhM`ef|GEXckar-gazP<)=flxN1mT z>iu8I;7C*HWkOEnJ(~ivl*#`Xns>888KJpi!RbIOl98h70abWt+HXScrqHaWHB=8u zt2o9=^k>o@oDr6_nRE0?Y6{Fo=`bzF=`w|WDPXy8$v5+(pqY6h7z#A23TL=aUE&)p zl~_PoVj1ea71Oe*EbW?1+aN71{-y^1Rn#87q*}SlvZ73nD~E1F`IuHv(_r7K17lcM zz1&?=$xJ2BUu1-5wav2lFf@yJUIk$0N5=%3B|nn4ZC`wft)G8_tn?JrR}mM^RdI8P`e*aQx6dH6?dmu0ZnzlojWz zDYVa>Jglb1mKp!VT73WRH3h^vD4WU@nnnNo+s}CU>|15oe*5aNngm5G%>vFc7*O&f zmS&0By>;ss7%_Y#>dH({ypQoi`XD(X5Mu}QP_7VJGlt^s@)+D-;Eitu1O=K`5MT&t zj|j^mfo7S^iq1`&qD37ujP2bT z+vgCPX9i>2tZ+>4+gbTE2TmS=!gVuIm=lSsOJh*FI0jc11QX(Xab+eOsx-n}x*JNU z;#5~$p-NNSD0f^;~|O~tO@k+@CxeMu-4_!o<>%yOupSy`IB{uG)Y=Xfe;7CmQqs41(LX%ml# zOu_lFvNVeo+7}PYp_%){IxXp~|=7KCX=&2N; zStiL4>scPb^gQ9=VuCx?I!{C!La!A;+03jOYSyTNhSjUUzIzAEcbJ3?es=gc$N}48 zys(9Eup`nHJ0m=?CD0k$2{JL0N23X!dCIgAxVJI^-zuwk6aiHte(>H2r7sAlZwQ-j z2)M6TyRkFlLJ{cJI4QV%L%m+&is!Rk@nohGWshgG37!Pmw+kHzfR6ZSnKOPSyuMrI 
zieH(=Z)@D~$68ss;N5Bmf-3=*VEc|8lb=?xY-J`$)`3@iCa($00>?ivA3qW#e_rB* zcT1h{E7KJHzLZN5uk%Ze4Srf+jqm18#+&(*@!g{7giw3@w4CY7^;fv#w-uiFjo|$2 z3TOPX+y*}{pFxne#ShEvd0l(nt0SMcGoOnK>wqie#tsDQ6gzLy33$Z%@rbbeXt_Jz zM>oEEo?K^Vi@!f%r|HRxa6DKQhHnYW->#0}dN|+Ra6FgDDhmVgb^$wFe5c>=8NXi0 zj=(Y-yk23=>rUtOZSZy(^TGRjwbYdm&JKfg23HC+2QWP!+*{|3uh;wH-g!zCIAwcbAu-1tnWme2zN&C@ZKs` zche>fF)un4yOSeuQrt6Q36ODi$V+fUzPM=+nhRu5T)aCzbDo0!=2bAZM{9f$;f?L$ z|4wk7(Z3@CNAyMMifmlX3d7BKcCI45Q5HeS4R=v49|Z|6YH;CIegm!(hOdP?;3`4+ ziddEvFx&9h8A^o_RHO}Im(VNF9K}3{i%1k9Tn76oU}ndi>-o{##}1{EVZDd|tt{lc zR-s4+1q2e&D=xP?Z(hLjm?#c3yuKXnu7W z-xcpb+3sc7z9J+TD(T>?L9>(td5o~E>wgE$ ztUUh;nguY`Of5`D08|Rm_~3Z)a#+?S8PhT+6-qC>4@-M^%HnDQ&6=2&0h-?f=&j6q zIY1kK*X}{?Rqk2)>q6yQK{N9t;=_3#nx*m@q1gz`tZY(dhV+fs(etcbPgi-_nUv>W zN}X?1d4BZDEoCGX&_tT0-*68G$@4X>V49@T>ENu3`}eY9DY4T12{h{#W(lPu()ROr zxjvuQ7DDruFF(cHxwFuyQ3LgnN=`{8G=tFmlXB1a?bjdiv%wwW>pRzQG53Um*z;$P z;aX_{VfHy5-M@#E1m>d$_TcjQGbk;{!_}fZ!gH?jb-q$?LCx|fkucd>4lscXG@{(#5+s0ILvKl8u+{a4%(Xy(032kOB7{Rj&QL9LqAAz{BdG;f5Y z5N`z9PgApVO>WZ!1q&kZaA`2^=Xk4z`Oyl3mjY%DoHb~c#@r8&mkQ|lLBgQD-|R4KpBsVc13D>a4jDfP1#ASa zWP0KHOfOuOWtJ<-X1d|ZEE%9iDCYjsZ2p$*&80U=vb<2tl-pkHsH+1=AoXKVRiVa3;JFvR|$xIEyc{LEo(-EHjU<*9KCA<250KNP_YTYIzvm zP_J{ipBd#b^fZ&_|H35L&?x#RUJPj(Qf zRqpI0xZy1!SRnK5QbO`VJG`7X9WSz{;A!RrJj)n|H?t<=$JtZy>jHcHEyo%ETK|8&dxQ%C(h)y0?Vp!8;2rfF0r+?7 zZv@MK5ODvo!U6xr>;D^}`=31bpKI*#_ti7-i&$)zOu_d{r{eo%1m~4@gmI=p7=A}s zen;4TM+pCAv9|nK0lC0AMNoaq`t^!>xm3hDCLJ;vSjrCDLzcrs-opce_(SIL;YxQr zU*&K3p=#jY*Q=v&cTFU|UK57xkTNJLCpW#W}ypI1}Ry-@(0=g|umt#z^+{!q(VOoFV|ryP2Qh zj7##KN=$8Wb&$b)JQ&t~1EM42-)AGsD2(A@1(3B)AN&!DVpQ z;O=pV0l}S+KyVw~XYcc#_niB?pQ`S)Rsys4KJWGYabMl5S9f()b@i%WJ?p8bdf>}? 
zRnY5$SFo1nZVZr#mp*X)?HQ8FRWpWg{I`%x?( zq4u~EzgObEVQtRg06jG39AY_-$^exT zS14JVfp9?3p59Kd12o+_FJn~C?Dpn0>B%r&}?g6 zwrV9gK)Ysa;WcWq;2QR?YRkUZkU*M-HHki9jR3KYOtAF@GNIWjTh6dO&ig1Z)O=(z zz>8$^Sw(0TSl+C(QM6uU!lKCwMjFBx=6z*gV8b6KoSK|g8HjCVLbEM6Tlr4vggn7= ziY548L-SwzG>go9X@jAZ!Gvc094bTe)-_}oLmHfQeJiWGz_KZ(^Z%=*SwWdVvsjy1 zFe|*tgCj&f2hGMq(cZSRS)&WVYaJP7c+;%!a1KYbwCAz_AsYw{I3`*4AcuDP+JT z&wTaDrC7S;XQWP_f>%1eqJE@?j~bz%S%6ur%(8tP*E`in$Mpp#N+$7 z@Y~Zz_~p@k6dcP^_ZMguEAxr`9Oc)%Z|4?d9NL2$m(Qwb8-;}CNB7I{2Z30?_^&L@ zzdh66ohr&vOLdoAJCDEp{+QoA;PxqQmtKP`u(E3#Un0Jk+qdq(q;X@>yjc@eCG0e) zSp`9kJuooH8*emif>&6!bx|&at!P}DM3^IZU858 z&rmcu5yis-aA9~5iiZc_^eDpdC<62N02EB{N6|!X$9d!U7*7;U;CExW9qWzaiGh?r zpFd8F@WQd7?4$;%sz%)Tg)dXB}uS&^!>=2wP>7{Ryn%6VJg3Ax`{n?=tV@tUK8XM2jW(!Snz^ zZU7lsjtBlW zms{$Od0zP2e14wi#(e_uY*##->4N(N<$DC>dra@%G!NWon-Va-&-U_wVESN+ACGZx zz;<$%`zk7n47$|7i*1hEo74@0^EKAV)hX_Lr(AG_dA%~-6<0WbxXgjoB@SLMPY_#d9d6HP(9q?!Z2LcmaaepF#S^g0r7~d3^AWGa(l*BKZ@30Jp z)nJkDxk%iF?(%t*t9%jSCwt?nw0jxwF^|{yS$z4gPx8c#iOlz;Fq|7oXbkI#?9jf* z5?@|IYEDFNWcqbOMyxmdKYm;N(^aotJ#=c&5Q*P>fkTNQ$ciUG#u1tmIe3d_d*op4 znEW$DfZXV1haVc(M9+8Ia3JP}jXw6+5a9#=&)-2Pbz|vtoEx8jlSu(MAyF^F-H}b; z%}$h6Ed`t%k)L3XqF6f=M)g4+l^xTYU`kjfBE*<& z(5$$Vuq=$FpqbC8iY(>0T|kKsWt+0N%E2qs}xmzeqaIX1-PZ@Z~Xm$16+uL^EG1WSy!bsWaw@jYro%8<9Y-@mX)>*)Y#6 zpF|4L4OTu&1D5af_1h#Ij*>t#8<$~iR6*NDSNq~gYCr|F<(L;%od=Eu=S;#uNUxkr-@~B*ek$XgN{{5H7xPPZq zO%go3Zx_xK<>5@>apWmzPDA?s9VpB>g5%kTak3x_<)xSLAAb;lpAd|nJ`c>Ye70DX zf8}?Dyyv0)J8|XWXV=$vgu4MQ$V#S?2!V-C*q`8lEwTNvF@``F>w-;T&RFWz8?(K7 zBL4e#;Q2-)6cO}q%??xGR-WojVD^Ui9V;ubfb?X7^ArLd;raGd@!=*!5EM$rd!UTa ze4CJbi@MD}hoE$4Mkv9V`%{Bi2Eu=;{s-XZ%m5uFLl#Gw$!DcpGzidYB499X z%necSrLD{aoD^r)9rp>%xde_J!b>)RIi1QH;Du>UT~)}x2DNJvn(Lwt_up#V0I9yN z*cumz&0#Ls7|9|3wbFyP?gflP{I8@tbez+D-bux*yI?UfL)l?2!(xTAzB6@TQh z?zs7r8)2G|IG%7iQGANUpV$$ zO{gZgmL>Iav1JEK;$s%Y1?1KL!t zfnhzqMn-ZFGGyXil7OWAyCHtU;=bU)_UD0}K`wA;(GcC*Hp48Bepu~mkF-c{#C-NP zB0hN&rK@M-(v%cJvmf%K#mX!$5biimV9iV7HbLSgJ0dUE0mq|dlAZ&#cs~vre3_0f2Myjh5G!$-7ZIA%aW8iTiq=lT 
z>;zxD)v%8Gr_RSoL-Xd<%Pp=L0?Y!(CNzt6Sp)M*_WL3MW&)TJ`$Ylf)!Zs3V82X> zJB0#W)mIaS1)gO~nSdNHdfK+9)=YWA@f=re$s(yS!uiTIuL-v(dFG2sAb}itXhK=0cL2GjpkOI|2mE55-hT$|j>xSvR^h%8&O zg+v^&$N*zg{|cJ5>jcHLWiqB-d07LrUOt=qWR(}m{9Y}&ZAj`+@~!Gc>cj9>P|rg> z8-6F{Qs68iyC}=E)Dg?6>P|S73}>S%J2s@jl=zfvrA*7|MkXxUK;YY81A)ql%EqB5 z*KJS|fR*dCtAoyrN$Zut(9DckVXE@w8YOE*TBV`c0?y2vM8nWB0ot@aOWrgvSC$4> z4a=emz)Yz|DOOub`&jErw#~c#3r56z8cSNrYCZCux#XoOGcO|BH2#&;JuL%-7kpo# z{&d}KWxZSdkt=Hx&k-=@`3ihxOG%?uVtLr`w0N!nx9MgnTK$Q(44Vv8?p}WMD~&0Oyf98 zuAIgnzud*&etC$$5sqyw&3Y>?8v@M_aXcpjhY##gtN#f!i^R%&aNkbs*|`PVxA4WJ zHg93O*|7;de(a6vRcfI&!MXcKZ(w44FkHX?9I`aB*9V>1=?_xAl2-|QmkDoI2v}DM zt5;Ja6+GV{$cUA>Y@$EzO_9lHgbE%LtLimE&Gk7+SQuo7Zm+(C&))8Ymzp(1o5r>9 zZKu}QFd`fqhWKOEm}qqW>cn+Xf0gtQxDT?x%z$|84LCZH*+s4wA8K-!1k%zBz`TI984GN7p&1Zs7h--|U` ziFJ94y2IltSEmq~r#N!oiD2u>chQT3*Z=}_D6a6m5NN(U zEf&|9U$Ib^Om*eJ&w=gA9wn)cY|pM7^s&tnZcC|Qivu08kzhQKz#R6$tGKytF6(3v3S`nDAy_8S<;G|$a1kN4KwKjP z%Bft#r&yw59Ek^i5So2>u3i;TpgBjyX*O2|G+Z+T znknVqEK6?_f{)4KB_e@k8|5l!=HNiA>-_#G;r=K;XGnOWp9^QdcaS>&3Q$wgoL9<9eR6(=hvY}v5TTeAO+k&XvRPR3zeb&ra zE1j9PsT3tvgp6ElD^*M2S027CG;8U+Xlt$Ld1#jJb;Ggd^`AhqaZlU)TcMfxGhP}K znw5pwMrJ-lwrTuJvElr0p;?~WUqQ3^(y4PzV77r~J)&gU<@p=Xtf2S>fNlQmQ{m-R z0ByVk+*j@`JV!=l>sKtnqB&DpA4}1r$M*`FCr=u$p!w;O`vm8QICZK3moJ^gy*nki zee*J&J-ChEez}jwPwwE+Lk-P3jnOc)5hWMTBIn3H9NxVJnfrGjgGxWR z1E){qpycWqJbhS(Kb}3npS*`e$*_i=lkGz_*-xy>Mfu{Mu^R=)v(#sSX0bAhYer#S zwz6(-+06E}nPuF$330J;s8zk5D$Bd=n&GE#AH=)bp?&?D_^feFWD;nv3m_A^uFmiw zD6^xSE|%s9DvXdP5e~djI+0K(&^(pUHa$?SRCgma7*}Q|V!Zv==<{|5eEim{XxXd* zS~skTuR65ArXdm7k{p7y!(!0?)A!Ziw9$RLqhQ7eoST?{;!y-(f_2_7f1Dg0g#3~0 zct*OSXuKElesaZ$vEDd0ArxoFMdJ7{2{#vp0|e*QF%FpF*A;PH--i8LZPB$uOMKj{ zJ~~#fijLK);mta=@os|#c)w9SeDZQ@eE<48__EU*c)wi>yv#fd=>8FIEgOomnIVLK z50=A?pz8@)7F$+Hyh({=`KDOw#P5`_aC5u|ZjO@`1AS0VV3yd;cc}XYG?%{snkxV^ zWrDOR3pi7^XA#6_5uPhRbFhMCS?ok_wG=e-8JN&4TV*MgV+J&ri416#MNQO_+JxqF z6MYEE4mdi%3F*nM+`1Vd=R^dM1UDRt6$`!}rU^7xtAcv9YM`M&GY206%~O0_aAb5M z_74ogRs!<&a8K+E_r}&RFKmhO!Dhn9)&wt1_I1K*wQIujyKnIL*nZ$zA^zuDK5+gJ 
z{`<@>{Ab>J{P(fdc(i#U%9f_!?lKN!7A0^%LoFGAdnumvn zQAIT&6cT!`&yK-`*&Lt|ev4-klsQN&;#PpTh{v^JC<61GSe%$mkTx|dT5D!BuZ_j& z88J9Rus$;_hJ!u*4~bZwiwUl$S?6ab5sWAM;yiVcx;#aeqxZuV?q6Y>y23Vgh3)R@ zREcKe!q1M}Iuo2-2*(`M@%dg$4Z)SvFkDWJ!PRL|3T$PNcXJA%nSfoA>cqC}LdbRJ zJ8M{(1)AB$O=uQ(kP^0Ak@m4?JCp$z+q5i^ul}{5Zm?Zlmw%s_&NZFHqj()5IXhN>o&z2N?J*Id{J2=P1)2>@vjS%uXcmii1!!hI zv*SGo%$~>~I3E>Bv>JkPma;VaDQK2F?u~ZB-Cc8lTSsvxa|Lo1jm0q6UU;)s6?Cj! z6~kk~v6s-STrSoUoC(PS%MwjP#b6d_77MciW>f5cRsV100SRrZ{bU8ggntx|n9 zWedv|P-J>q&qK4-MZ>1gHDrRaDa~HOIKm3T9j2_@qb2O4C2kv=EDXRhRe)w~`TcjG z*{Zb!S?x|y@wf(O+t23pQa2UqL*18Ro+`7UhGre_xdJqcDgm?3uds;8kA_%BsvP`Y zKQE?j$gD%*ER(}3K=aOxn|V|5VSO<)E6`-38~G3v0KO2K^?B9@A`i^`Ec;fFRhhAc zW>&DOKyf3m$gCi#Fe$WgjT$*gditST@292nQ1N~yFmF&30?qtNg;*QSUqQ23!DgBw z+n3}yXlC9j$jF1pHjU@RMp97*FpDaks{*exZ8LAmGA|Ebf$=8Rmn}F;x%4wt&@AUk zL#-g|3ut{g^$S}0Tq`h}l4JT>{o=?q8k!|AxD0SM5Skaxo`UtO7o$(_?g})gO%-VV zA3S<+3#DaO5grkUqQYYkU*bo1%kbNi2YB-64j$aUg-7C|L7Cy<@8izR zYsfgT3kL|YnFn?t^WaWo9@&lT^u4%p;RJqpd>aq%+$4x!Aq;D9HZ9Ep%_4ztSy1J` z-W_V?zsC>C@#NthWE|O#;uHC}arJ@FZd*gb%Sci?1wU zyQsdhAv`sUt8W4{4$zfPCA_;vz{Qwej?yHXNG0mqx z)<(Dzto*QhXaq7Q#v^y?0AzBzXJinz4fV#Zk^a~+*ar)O`e1bbZxH<18*qB39lCUA zh7Sq*AJwdaj|j=%5RUt{ZiB#g-b3Qo-(rlN1LlSVBW>&mY@R+Aqr!a9^TUtOpf7!S!P?H*%q8{r^>#-bg=qfbc^STWcln{gyxc&ya%EA)M$4cjqi_)0WQkQEEE2+ zlB7c?G>erv-W>;IWxxTxnB>$Atqk`JTWFr(VUJ@|h9PH69Ci{Ib`wH&hI(OphzB+X zyJCH?6IS^1xw-Vl`%N3e^V_fSAbTHh={Wx9QZD|;-iY#z6LDkdK-^z95_eV(#@*$~ zxV1PAw-?9a_L5lKS{jG4W%0PRA_2EnaerkD6>V%|cq|s>JYK#s4tJJwyOh^0=J7?m zX7K>rA#~qm8s!W5jOVis=F6ZVK(+B}b3$=rZWON0jmE|KQ8+g*0;lJMp?FS+(kXsl zOr7Tb8PV)e%G`1gH%sdr_b+gJVS1=?mAKCLrgT;WN(tVjvqEr_&+jG&LnQ?Dl9@iZ zF_VKbB~R{CGdN(I;m-HWi*QVECNy7};-+Bv$`pZSCk4&dQ@tp*BQ+T?gs>Y<7?u!; z1YPB~Ebc@u%EBxIz#D{YS#;-yK=CwhO2W#?1XmkqR^MsYi_%nIwp~9$v$S94PyOqn zzrW7wuQ6XG+?L2^1>QlpF(H^R+8@V4d*NhcKb%c;$LVA*6jSF0c;WOwKO9I2#^oHS%BGv$C!?SW=di!6Ntq^tYR_~=9S+yw=&7^IP)nwCRS|qT}5bC z5@_aD<_cuXd;x)G9!pR3qP&$`MrHyB4cs0hm}l`faUXx1?rfh4lnFGi#Nk;(5&G@> 
z=)~Wyk6Jdy+%Y4un|+vCw1jM9%3Qc)>pGEkr%+q=!|LySVE_eM0?Pv3?0?q~*vKLR z&zdcyZ$S7nXYad-vX8M)yquI<>wgyAl+DOh9 z#!_kOzTJdb4X@Vxh?L8N+|TAbTM!nB)p?uAcK<~Bdd)IsU8wp{_tp2d1!j>z^L86x zt_;l;^2+i_om-)q<&)>YVG((lYNYs(3$JOS3@p#??AsR%n*;3Rtr2(r7Hu z%)GLa6~x-4XZ{rZ&qUcY=Xrj8qqjcb;|p?`1fo-u9| zq4_`Y8v%CBn#Fkc-Of05vOq!Xll$fPl|cOTp-c?CgNOHTTNWUBsw~d;)c3zWet^gK zZlN$Y9S3%$;mF>t1ZaZu;a$i&Dk1BRpuF^wvRt1lF0ipOo9-E6d49<4edYR*lX+N$ zx+}SM5w}aP;&{$coaJ>_FP$YgpU06Shp=AS>9%!P^z&?Ve60hj)To9=wQ3^Pu@~mV zg~I*I57b}hfnUFak|iT>g@Am0W}tS*5NMt$)@K`No-F>m66!1z#v|e&FXlmUM+mlvJO1n zeF>`uhaxq?1A*VX3$G75!}ax!@cp0@oL+Bzy`meXS*&JGO#Lr&r+qR%e8M z^96?W?}O=X&R81ci!JdH*p(EGqeGKXFySYhnL7>nb0%Z$h*S0a68}1TL`%j&{MN(ayLw z&W!^I@rCxseG%)%);*(?dJ!}$a5j#a&@5Z=MONTUuqq?ai3FZa2}IxKa}ju!t=hMN zX0b2}EGs|f86g&Ewt(|&=ASC#VCeRoAQTR9L}skL3eA;CNX|?o7_dW^C5JNz%<1tS zI27lOjAVa|v-@8CCETc9T{LJ=A9XlLdAo5VO!aZa(Xq+c%X8KT_QwW)M{Mw>eC)B> z%MNRN?6B11M@(?*hmV^zh5g4L;6c`ILUS(uQM3o88>galaT4wmjvr9v3nK7vNemT@ zdweE$7e^3!!%@D3ZEP{y+Rq^TEtY zLa!2^&t(ogL|6DMFH^F;&d(*>iZ8qPwU_hx-w_dnWx6=e2VuEvh9~nufSpCC zolaO5z?~uT&~Q3ce452I!^r?<6PhLb96^>~t9+2B1rl5XaBGqe$|v$3liUf;Vr^zx z9Qa*lI~HGOSyV^%1)SA8EF$!pR%Wpt%XeiwN44K+9&EpC-%LaN{7VSh;@VL%&0V=L zmC7-uAu*+|jSVDV+My`Q4yWVnaUt0qXOcZ|ngg*50}0JT{IDa@5AQavgLmG32mN~6 zp>xC97;oPNxf~43#Je00&aw!d+2Tqekt-sckwb7?#=mXYK|5t0eavb?rT1QaWKMwl~>@Vt9FX9G8m;7OA!v^?B-C%6wKYVlxHJ0?Z~fo51=aXci!S9-6JH{2=~) zXqF0Q#w!4GMVa7d1zlU|M)L2;S|0258k&^^%$X-G4WyMp+}4N57MfMLjD{>yf5%uV zUm9`6`m)sH3sm7j8TU2Q(&A?WnpH!vyfpHH8Up|&v+m6Ad99^8*K8$DU+;7}v>QLt;;^S@JVt!~C#v$voKG^-|FM)3AYr=6?`QFT&5)6|cV9 z26@M`@aL~j@$|u6JbQRgZ68q&ABdINT<%&{@YB%zj8Ogi(}yS}#zU6Q z7D&E1D@?_Ly*@Kke`Y5uv+r+D0A)d%z6-;B33bNpjcEk5`N`NA+6M{Wyp5QC-SB3c z=6Jb96SQhr2X8fNh}1ZLOl3zD()%;K`EnyPs8t26>sH6>b*mt%$43bI{sVNXSrzX! 
ztcD)#TOsiM_c5qj7mRo4j#*xPusYNgJ7WWJC?OojQW8;+oP?s}0Vo)lfb1dB$Q%}h z%+awZoHYbFGX`Pv@DL2O`x1RRv_|LZ)l_J`FIzXl@ZKL_Taq(QOp3&%sXENwtvQ6u zg<-h2oPf2M0X-E0$)$ zhgg|S_YAQ#n?U?m&}=IO&iwA?EMJro&P|~f!Tp6n zxVJEX--Y7dJi_$cNZgSv0r}p7Snfx2pP)TIM)^NKm>)+)|^fd%WnoPb;|+ZlYehGbu2YD;YGTYSc9E5``p<+Ac1 zb#I0j?s4#SpFn(nx(DT{kInE_Y24;Dcho>AKs^)X#-qgdYq~d&$%Ir_?V2%F`!rvh z>a4BD@-GP6mqdBFCGBl8)12smyRv`@bz2rHnWE!5Uu9ZXm`Bl7w##b*$>OSENUX)m z)q)c1v9#NAUUz${3>;kv(S&e9a+yStnd-q~5@p7SD#Ns7lHirmeuPYWoJw%QnIuPC z9O#a-g9y$8oN;Lgp?P=!HV|ZAuU`XQzWN&dyZwl7S~bCn06S#HI1)}J%7z0nBV{oV z2STj_a>M!~lYqR&qX+EU)NZPBZ+{VL{LtYiGV)Hh#^RGu#g$Xfy6*x zJlr)OxOxz`kF3Dv(J|=XsWm$AJ?!?zYuL4HDfY7Olo-uB*}tkKow=1r4%!;YzRiSY z_W34UX<#O!h{z8WpqbZ7G>EOr{Zkmj3T7e`jw>Ix$kJzh=F@|Ym40QxR2pD)6`U7K)t|2759ri(^i&LN!0%fi)*r!8=-<2K90KmlhRNkc&T z$-~gSAYqhgUhZXgQAkHqnxQgD1^9EyI5Mc&Xb92*phoRkn`CkNwb zoG&ty0&r|dIP%9Npm53{>>n9}R05pmTQB3gMm5p5bpwR+ISu;eV@&Av4ORx)VSlm{ zjtzFgxe*>H8BMSq@1y)T%jOX@m&X&F6LD)n4B<7LKrO2rdZBPge;iBfMab-hlY{!< z>NrA!fC>i@HlVpo&Yz=U`7Q?uVqrXy&>trW2n7TK727tF z@GMrfNLQpsxZrS@6ZR2Y4zLrC|Lh$#VYFerx~SKn0qRt(hIg7Y#*jYUkm~pYR`3~a z6*rp@H*62~Bs6p6pqG>q~~9d|@0OE{>ui zaDP!G9xM{jjiRCn%oO(@ER5h*Kw8B81q9`J5d`H3f-=R=cX*x>_wUXZm==f*=XZqd z`N7<$<^|(Ebx*eQ11Wz>pqY?6&l~sWa^OR7R+{an;P)=!_1-L5Wsu+VdEV!9y2tIK zRBt?<>V<~{$%ll@hXmLM1m62ID;KE360;D4f5*%@Um|>iSW&K8>4|%L$f!5*$4UI{EdhpFjF~^0ZiYQ z?VRu$A@W6TxWKcAvM6gv7HAe==I=z5v$8mg^;w0)lSNq6q($C`_sWgo`C?(Vf#y7A zeI_)=*+R1hXBBde1Jqmsb1s27JBjIvJCuTE9X%vn?!|X6C(;@D!vgVa|6=^t#l5(f zu@>t`MBv9)nxQ@4!(Jb}g?;3jz3lULk_UDQJS!2(_*sE7`)@4`RaydF0+1rY^197b z8n&Ic_cC0AUe%PivgotTkrd?q#M*X?ny8lp}br5tBH@Us%HlZL8ScH}*|wS{F=5N^8V~`Qy7TUn*#hiU`7g{rw51P8o?;Isz zpUd#k!%(l84vzn3tLB&#=!JRVf$013y9$Z|zkC~K7YxP4sj@PjKT79@j-rtzq_9?C*e4Zg%*nT`RoQxE@+HtcQ+`>SJ1% z82VcYMs_wjjyI{2h|HTb;N9@Fi*;Xr%{ za+yv#JFDygUTO*E?GZh(H~dFrNA*Ebyd6#tcE;JU-h>MIt3MGX^Wt!JygxfE7aU8r zqxw+2kftR?=$qRPN5U~Nh! 
zsF{)#1qGN@T7;i6UMm1xrhr*Q5GHt)r3y3?!1$cXm1~9x%vP5S*~+oIvpEnN?}_4g zdz>R|oDg7$a3id^s=tNP3Co8=>~S#I9{XfV7>@h+Ei`2NXi#56bFC`k(@bdY+a0r9 zdt!A!e{2nR!*uzYV3L0Ps73B(Iy2+DB;^c=Oc;JlD1N+<=M$pE8hvj82M9lh5|;Hp z1xf_#yB3L+S)f^i^Zj{21Y?TO{DAvnX?{Rxen4n`NIjkJgQs&n@pzWI(z7`pN>69I z;TiRGmMcHI;}>p!pW&z$Ir(F{Bc4rjz>`!LJRulAm_cxz?n>ZxAyA5(3EqwbWk=jk zbtW{6)!7S=_*@xq5CcY`}Q<9+??Wqk|}I=+}XSqG`Qy}IB^ua6jtD{f1AJOOScj4LbW$Xy-PpGv+ zen>AI3+qi-q@sEvBccbgxSt-{A6tC-@_lZBzHhd})DR~u2(!a_!tCfT-$Tf2FX769 z$vE>3&`c;6OY^Y^e*&@}VOdAZ$Wgw_9z5=YW8t3M3NV{iW`Si{ zO54!_%{lx|fH@=Hgl1V{+ZD3P-|=V<6}S1g0%(C|cV%fl9_OQAS)yQAz*+pD3C+h6 z+>lFP&Y`jg$ZwO6`iDnC?+G;La+@3JgcGAe@a*6s{O9>SxSzEFD~5#PhmI}Kn(txv z_us;P0@$9kO$6nQ*ug$fL$ei#tYWeG{A>RPVV`M2=R3!qu} zt%?d6W|Mid03vlm1B3(+{}P%-tbA+Yd`R-}m?7;iqh+zcO2w)Mz>1Z^O66GYFM0VZ zXco8?$d$rrXtXWRKa&REe-6z8ik3V|ek)w7_cb*0;j)Y`vNTsHuW^l;hDEF|6O^si zWmC)5aRtm$M&rR(l(qckh)y5Z<&wJ6TfMKIr#erb<@5Dxx^Y-P%QXfx8}%X2N*a=y zK`QlbBl*spWh?j6^{_@k^U}F9(XM$@^!)xSeE8lw3NrnD*=b!r4R?1twCT_ijaxRv z%sJEX%P)^nT5=U-*RLqwW|^e->k}O&&UDWZOS1)<`T6mKTm1em9^AcwV>ySge|K61 zYxA+pgE(1`qx_-;mQD8zfo9p4wB=PiKvg-V`UW6Tr?f0rVPRHQE@n&5{{#TV{vRq6tV^eBWqA7 zJCh)GF2RK52xKKiBXe*Ha>tLs;fX(CK}<0GzjzxTH?D(MneX=q%t2p#h=qPmI7q-g zJ|LJKqgaXTkrVHL!*RWFV1OO=rg-2$qJ*<@L1wf)j_I4Z#?gd6 z$Vlo#Xs42TAuol%Jc{5lS^f?ue6mhdJZ1&V@>jb*#m|IgJ%P#qW|0BOCNRrq(RBjw zH5*w%(Ahw<0JD<#k17d3i{G>SrB864DUnNpcux*sMAInwEFdl4iRABpu~yF!*!D-c zioy|!^P^mFBE}A9*-4*@bXQRZvI)(ZgyxJ;M?$k5q1g`mLmhBPEX|+3jRt%N8XC}C zlhFK5lO`Bu*8_9i`(Ry=0|z7?*c$AP?F8nnK_1FIV|$=0wg)+50pFd^8a2ib9BiIj zH3@e&O(rzQ;a$ieZug4 zZf_A-OGPpvRVJWLVVjdSD61(>k^B3f+uc_bw}w~!t!{L#JncJbFh3{EXy(| z6kun2m4$jr3C+c0eE7~fqwpuf{QP)akuZIOeQ^h16y-?PwONauhyU7$I#hXUv$Y=hf;`yu%Cm*MbsdrS*b_lohRYw1e&RQ zLbDZ|WkMjYNp!=p0fc5HzIXgxR3Z1o%FO%8VkiR5r^iJqX#V@TU3i$AhJ`7C=<;eS zW!(7U%{Q=j^Cs+(#kt4=Dv}ucO$~=)K^8WU{?_bcE6d~vk%lmVF{^=4r29^@&z8Pj zEuf+pf!_;PP$q~fz%!Ua`qoVwwM=F*rTS%cAKmX;fw{5-mbEslT4RCc4XgBAFYV`S zGLf(X@6Ys=gdG$|7>po;f;Hqco4kr_>%_tZEVsVCqHI}HWciNws|d^@=F8v^TWD6{ 
za4VhrSI@|_G|K?SHXkDGU0}^~WmV=lk%HzeVsVs=*c3o+CYDnRrAx(95>`!va%I>P z(Bq9%!hBHj0QmqaK%~l?$ok+lG(TUFS}y=O)j)VH3uHDxCGkJ0R5(*sVCB942{fm1 zU$0O|@jhmQmTQ%z+^C0t7n;>#E|z;+Wy!Mv%^H$rE&CNLlYBOSS;Tao7w>5{YBS15 zh3~B|v3yp~RGlmP6@mFhdNDN1KJzd6mUl6I`R3`k1 zTgHo_S-Wb8&$EQ1D@S?RWgObK6$f^2M#kY?O4&#EBQNJDN^e}mKmPVi!SOSuDN!;6 zq6N71#6a!7asN(lFGJ$D=j4s65fg)6XX=~}E@xe6}s&I+1quuL9Z zzQ)gCff(!Oinp58S1uYe{QKbI%vfAtXL)v#4~p3-%R&uT#s}c~cz;}(q!(?tI6*>Y zaXTeULG#XFdu#}B!7OhlbnWmG+BK+&){W|-Rh^oc80v%4h%#_asU;K%%M@p9y<_6CyvDCp9W#1mos|4`5M|X zpC7fT4WCasVoLw6>_|PaKiLll6I_uVYtK%oACAS_AtSag4#oG!p@FX0#ZGE>vNsM7 z@I_Xl7cvuEa5&M45KYid)SY5>yc0VyUPn;QP40zL!|ZWoqQHuuTAo5&! z6;oF%%&DO$B^Z?wn$?PKa?VViGmD@;n`u(A5J$;OKed{m0Cw?UM-)Z($7y!dCkYXG z?DVrE-3Tks2=c_XU^i?fFz*m(4iISez^6@`;;S~zk-vC6%FtIQhjAN<5)Kw_&XN@oDOnkDre64^R1GjV@R{k2@);huJHr)^2)e(`cg3Ic9P#&g z{qTPm_Qn7GxhMX6VR!uZ&p+UQe(r|7h6gy7p#LwWBof^j&`PKZG9 z_$U;P4oCi|P!x;^LBa4KDggN-{M3^5g~Q!ZIK&x6$#yuA&9O9YwL- zc&rCbru0&w80DW130+t=Unb7R;KGbB6pZ)9iK&4oUp)lZ=OyCgP=A~s8i?~Ff{{Bi z5JT-gL8qp5@NTD0__lp}_`lu(o4g3kVGbw=wL^AT9~_PBjr3@N=3dAo93KgGz)nAV z#JtxL?jLr-^iXFkj_8k#5%w7KX=g-tYK3d_$K&KM{$|K3e(@3-PTViN3~1&yTmYF6 z$~K-&AeDtoWI+>w&)tQSF91G9!^tvtefo-B020sHYJ1DZKN&l$kMIzc*zpqw4cdx=jp2LRdO z4mdY42G0&H#^28D#*>0=n8)A5FJ5bfx_l4c>hKEtskPX%eFytT{ja%-iQ5mnhDJkq=26_ zFiYCf&+=SeW2R#TWn1PDI2Ks0C>>2qq_5Y^VDLf(2^VT6Frm3JIGg2Hyu>;%%l>>? 
zS~@CPC9zzULD;mYn+#%sW?{w(=Na{7{$)xxs{Hoe||TVp|BFWSZXv5M`AcdC(>_Z(JkGQ?F3_6``3APsHmi04XEAvI*tRPOLDnVC}37|Aax+OmZ&)Rqmal=Mq!SdrxcTOPh4-seToEOl?&KrGPAtsXrxzyAg_ zi%e*?l{sRz$owIoWRirMr7HO_o^cx6#h$bcNRA0br?xGXAE!XGCxLD8(wXSqs|#Ac z(h_x>)k9=r6mFK4;O?E<1lwnX=EwNu(H;D%fLMUp3eBcpv+`{w^gerBPTeN>-oe>Z z$FO(TMg`9@K``scUbVvD>66D59RK~#rwWQ?kraVsiT^Cet@jYC^-TrLcW+-;&@2-K zbC2!C!2?^cW+4Xx%eP?d+Km`Lel!}jY_7`m%^RIDJHQ9CLcH_%EMJ)HPk8pn*@+>@AL56-A8-%^1Q!q2q55C>L#JjDUDi?v*nBD}pzSuc18o49lkT*C0=>*^dgyq9=+IpN3 z+n=3VfAzOoX8NnZqG8!3RcK(x03 z%U&ujuRyZ`XMU%EczPgi@|l!Qm)O#j0%igsL8(-Z3)IXaGz%!(Nb+%;;CN$l01A@& z5r!O4Opquf6sQORQ3QiXR~!{+CY&6KaKN6B{;W@TwNOkG)_sF|G7+!=Y7?5@Y2J)5 z)B`_z_s153$<8P*>WygPgD}zzI949n?a9AAHul1wL=n8poEW;>MP_C|f-q zx91NacqO7_VG>Fg4#4e2Nw~W>8TS^aa61sUmkdVP(xE6_J_02xe!|U_V^FestkTUj z<8gDHXaa7m8HbX!0}8;0S}HA(pUs!06T>LC2z72f#AqW<_li~Hce zm-Zn<_rrfJaG+f9w|VZkKiLUaN7&)qU=BnEIiZ+ss(7F~PNcY@Xpk$41`|jJQYkJd z8sLOe$xgU5+yi&`9zC5EqG0(y7KRd-L-8MTqVe~+@%VK{4DL<{!L^ZoxH!}s7l!!Y z+z?-!9psBMqJbRH8rus4J#m@$x;&5?;(-eU;ZwtWP(XM+HaG~m9GGSe6@?&&?_tiM zAY>2nM^=hA2dR7?2D0s^IHF*H9Zn7Ck25KDOt&Am{ctj=FN%`-Fzn})dv~j;@51CbPl|B`#B@|(>LMw#XFcDCeUn0 zXzqtmAH9m0&Mk3e&QB=dVEhONfU<&MmQ3`cjz@SaU>3KdT(!cV6G4{);uzTyiV4K| zgj@mUW8uy`&xr#^N96GwanaCLXR$yNWch9=XpWZ&vO4@+w!~wO^5*xxI;0&3i~`R2 zu_h#IX=wIFo-BkCPhd_U#3#BUH^~{NRC&QwSU~i zetiQ0StQV`w4SWAp3J0-V?0kWhKSeNLbG{K_VE@=vk}?L3eN0jtj5RWwHff4Z|i+Xql{Px@3s-zOWMOLe*Vmv247Tmh2npkvApE-nD-K z0RQw!L_t(OwI0;`D`?i~NPeq`m02%NVaTW`1*|;DvLvBgphgh_s8&n!#x*vUW;JpW zSRy=90-n}NG#|1^8iW9!t`rT;rd|xq0()Edz{J10A~c)%p(+3>?`auz$v)3Fp_va` zr(;UMnZV3_ONC1QZDc+GTWA))W&vh_Qu$8Grk2yL0L{ALno=)H1z;9PHo;k-S@x}N z7xFBu58ucG^Jji+^tIQPz0fYUxe`m~}j7d4|lttpu7S z?>5k^@5g;U6Nz`ZYtw4Xn>HSHJ-fow%^qvkFT&C_bI{wVJDRp@jyla7!h^v1@aY5m z`R6nICX10gy^r4%Bx{hB<+N>~S^GLaFph~e`Y!(X zU74oV>R+{DIX?aBD+SH%n>NA-m;RU&?i;$=*0Kg>8DbMKM80=-mxnUU?a< zYgAKK@fuYLZQOp`tPu{69Zo1nLSdXAve*e8jS{$X!a)J&ct>O=x)7Y5*ulx{UVge_WD$b z*DGPiyl{P*05rc-E+29aZDlT<9*hz~^GySoOQwb~{a}7C%TZIa1FgV3m)A)|h}j{y 
zJWi}&eNhnOh$4bP0d<^kpe6uDxv(ysa5Pd&+%qx~eK4xmR}f2c-P(0fr*3`Js$Ly$ zH*bm|c0IAk*Ad$Z411!zu``U|9477=j@TCHfL#L3L9SRDL}+f+5+4$pPpqAbCx=$! zw>?Ynj~(;y$L3l1C2cl--8>t=rOlwG6BK+*&;u zr7I_*Yz4J^0&XrPa;C6Ee6adI$DrugF|!PcT=FPu&x z@N)2Wb8Il4&WvGtk$5yU5Vyy8FGHRv9s_hS}rda7SDm>5PkG+;C|U+a%lca*f zN8rrdFkD@mg5QqLQG=V)qhnAqCK2aHL}3^682I%Y`2OA3(6(VC^!@mA1by-mmhm?= zGu#tdY-5=b9PCCBlp_hy;RN6?Cu9V3AQs?)l&{}I(6=9B76ErL;d*1Z9Y%fJ0g;_r z;KJ;YgndsOjh09pY{P`u+-N^SETJ|+tjvU8wqtSA%8ho=5-T#>xj=I+)65Q+Wxt&W z(gM@A(5xo`3N&Ym`-B2!N-e&^^x58JWkDseZYyZ!^RU3O6_}635vo-L4ROy9Xm&+5 z!CAHf(qd^Ai}tY?g1+3B;Cx|PB7Qx(9DknPg3!;CGY;UG5vlL|Z9| zpkak(e#g2p`J;l2>$R9qQ#@Z5VN;+iz+CD1iL7NY(<3x*+O&xeL`!AHfK_Wn$v%Nn z9*6}}H6&M%0f$z2lLlbjH)Wn98L)jA(hxL2^Rtl;HAzj4ZqlscC`Vn-0;xu7-mz7F#s|2qR1%~7e+Bhg{&{h6C0Sm z=kw)|24?2P>e9oq@MV=%miBB}gYBD^V(p6AShHz9R_|Vde!g~S-0>yUYTg8WTpV%d z;cfi>2SNDhE&M6|#*ZHnVjrpq8fx`G1DI_k;pb#A6p`%z$Fm1Gc`O6_ccdX>|1M-4 z+@oAHL{~1H!p-X!uxHn1T)1!&|M*iv+}&1|X4BeiLi3Fq7jXUhd0e}84tw`*#r7Q= zv0>vXtXQ=SD^{-H4eGOQvF;oqR)?bw|)Z^%XpH1Ph6ND zfy)!vQBD>}4#R~B!8k{Vyg11RS0@DG{KNp9A0L2A4t%XH#p~y%MV&~$Dj7WDJ4R^<(aAzEfw#T9P zemI<9hm3fEWp;4vsL}`U_<%k*Jirm@DXz#OAZN+nqXg;WDULWj+ymEHrdv}(2*m_a z0*ts~+?wQ#TP(9g%P5s4v?;aN$uzMr6J9ChM?78pi+PR6bfFL_ST>X@v2JzLfl_h# zuu&N$a4r_@8@#71S8hrGTEd5kU$gj0m(GxA8u}h3(`5N)YKlN}IKL-A&ydysCSikx=pH^uy?W z-=iL%MeUlkP_KSH)T~|=Z?Z>VPXo330y@Z!cj0Z1bKsl zkUhX##f+9Ehtre1kU7AM&(4Pf9zRN~rQ8ni<+Job4goEP=jDq3C_(bLil*R7Xd(Q> zaR3#|XD*@12u7KaYzyHWNU_|Rgr`L6#3yqk}nF9eNo7=7qQ$Y z6TEqj56>Z($MZ7>J4MNUC>j`m!odXO5rH^1G6?zNnlU5@CwSkJ$sQ^++&R7<7X}cL zsmmNpUK-d3mk0O6rNKRLacD1G97Tv8<49? 
zpK)#7MBJD&5~bq?;q=frtcvu6=XW3Ci`P4%eVvAI>GC6jzvzO+E(BJIXc5Z6EkQP& zgSGSsXM!}}Pqv@TFgFguTru#6&k)}2EBws&d|9wP(s=%;PdXs@)n+(HXwKy`$%>JG zmc-|nZ6n58tuQFQ%R26|GeJ~bF6;@n4mc)OW&T!)AAgSc@yk8=y|{Q}aUhw?!S?YO zK6|nF5@02EbXE-0Wtv&h9)@4EyhGxiAzQt#meh{*L>|x0lYK*KwLq~VvkmK&0SW6# z{4BoBB0lqMK5tnxB~QSc`N|~hUzj-v&yKFZ?6uJPY%U^C>oE!ldj=rp>~s(VSGg zAuBX$Xg0xl>sk>jlMTdDz`ulMruk2x*%o#!QZEV{vqe&A*27o)PjPlX3vK~k@BPd!?-m9Arv?fH_PhG5y64Ncdv3C$ui&a+wn z#(9>P!;q1_JU`vg^k?;A@CQg~wXCYNlKV@rV$pQ`JbN5gZJdiWdzQmJ!V67bZHqds zTB1wO?zmQZ6@UN7GXk=#3Mi2=9^uKu`*`vKST-dq1Den*`p0iiaQ%ES_HNsV%!9j; zPGub0qb$z3S%;JkAJ~cf{7n4qx5p~_##3YQ6w`NFLfqXfxro!J^YQTEEnK;BhLF4x zYY6{qH>@N$FTr}g3j)p4rcK97ZC_T_<}Y4rgGt`b81K**U05cu=DK{)0cU2!;_^f< zLa|tvLzK==Vn;dA7gxpw5^w@=aeN^6H8k(xkNt)a56ty*#fPmMqh8G_XxgZ@itjth ztrzx=j>f!zzL@OV702d`#@12sSP&b4L5{s)|Is^myQcG`(paxP=Y;7Q1U!jeA=Y1Nh6ZPWX62(SQqU}u`2@Nt1Cnil*;EPdd0ni}Qw5j_(6S(f zTxU!!)X*%}Ykxv>pn~QSu{Ng?nrDRYo`mRG!6=y}&@2mgFdwre4za*}2#N=ZJBGc6 zW_IXB?C|r#Tr@PtxZp6M`6%l$li+eB%mbN;ff(KY2lZS8nk7!NK=WG$G%qJK?~U-m zAwu&WLi2WBBR20&)Q%M69nx z1d`ZjNq|hyx$Ol`al7HX9Bz(!MzXF3l|27dgJ0?5n*`r| zKRMvaB)%_%=8J5XSJ)0q=7r+=&k?w^G#*zsjKKAsQ}E=-5~5NlgGAm zoX~unkgTAY>E#m2Wq_#e&GwbSHkGBq$+7K;m05hDB`w0Q$b@DM%vNYN0a+vy1J&d} z*%!AA{su|7IVIEDEG`*r+qrC?xoqbG&9W$qSidD-S&XNsa<$>d>iH@ z1mWWj?UW&*Rij3jF?9+dbR*W09jwqS{h8)~3VohMmHSphFFe;O)lUjAD@p$?#x@&Z zrUc9?LvtEsx@n}5%{0JmAlUIZ@2j9$aaKiWt_;o6-WwN8>-z>vF zkvvDEe5Q0hWMCs(v#zZoG;cu~!Fe;abrYdf{+>|gZx)ab%9QnQ3jM%sK~Rry%!l@3 znILHk%_3Del^LB;Gvii3m4?LaTJfO~{coX}=^LsD%`Xts+q8k{QE8@(`)HUKNuERk zlJfAZV5uLJEi_yI9+7<7LbFawWNm!bRdjUSs|LUUgc^BT9<1zouWD;Lee z$|bX~YW)JN+qDXzNfCJY&CaOXq6NP0-VKYEFT&Mp=kOnYKE|_$chstY&mKR(QzKGF zWnli*29{-^6WRX9FAwqH)^%-ZKClbvlmK%Eb@b3~17bdva!{>vy(7t*VeAKuO_6-v-4#MSez6vZajSnU?^VkFdW!d`U;yQ1BE`&8V>|zJF zBfGm8E&AA7R-BnpMcR7eliGXB%iPnJQ@sh-+xJ!18mz ztaT&R2PF!aeHDP4ZXg<(0~9oii$>Y>2nEezX_m=uC9|y7=8{?dD4!dOk|}{GNOWLF z?}+?3!eR_zF_M7DI?N_G9425Kh!me@SEO?wa+m|e^Z_9lG>gKIc4w 
zhbW)>h4C&*MLhR}f@FbaSHd$tCpr_DsRa2<$$7CnpTLzL=SbjoB;*jXcs!4AlTT8bVke5~Nf-I0&bef^m9KD9#KDAuNmd znL3k9h#o+QP7cKRlt2PA0XmVu%X9^rPf|q$>O$5_A>q0(h2pkIwsKGANllgLGz8sjZzBn_aH_i^} ziL*n0#MxmzaeioTTpHRB7l+y5(g=H88tI6OqnvP2CNfTNCp>%N3cqC&5 zfSRSCRs8AQkj^%g5$=VINFVHt_QS~DU6Ig>zZq%*Oq=5L1)@3uln`$otgKLD8= z^3c$Vb1Is>Yu|6?5}Js&B8ju0%o6Gq2HG8ZTeM< zY|j1Hl50$8HVtpmm-0CK|L4G2U|A&l%&!G3DdmEoc!2Ct37Rbu1D+W2Y=BuY+-hP>T;QRw@P5y)br51p3uB;E1`KSwh*2LnpMnZ@fR^1t!2Ay6Ei@VvsRA4 zsySMsDrCZ@5^RoFR)zCa<^(8B83nKuSj5U;qml=z+$Aje_*c+uRYhp7m1VG8N<;Ane7MKlamV24s ztA}sOe96SRtR)Wpw$jind9mh4N!6M7PFowhZDV^Lnyuihp_%D1FShy8*U3xE`jeMN zUJ3=wvgP@!SY}zmeC2YiT09TS7R|xpWplA%=X%5qNy2OIzlVA)TcK;8zG(AG8#p=j z#e;h#`2E+11mgz;*ark>BgC8$F+%~f;nQpavkA@r`TG+*zH<`?c5cR@eLIwuIXnG; z2Ia$haO5CCn%nfFdy#o$AI_gHP?H1??cbq7%gF>l0p}+K=i9fhDJ%1X2W86Qe1z9* z-?<4J3D4{JW3zUp46xD=7oULIwQ49}_4?uyqJ!!OG}T}+F}P`Vt=#~_7L)RvE$v&4tHy~BesY2!OpPW>N{BmdVj1V_9yhg z5km8^LH2}YPrbkc0rENr0XHW46O;+J+=`XCgs@yXRiK$*%+IC#Y-*b6d)%u4eDf~ zBjuoUifPCScBc}YaWanQFs(wuvZPbMGzt?rph)smuv|cJEnr>?3AII(viK653sZb4 z9~2Gbaq7e%fo2WNCNyh_bzh*_s?&t$)2!Fx0sMYIAVFE{G}AuKx+v!0;Ura*EQ1KS zm$+G&64wuL`S9g6Vx9KFS?2XD>+UoM2*s?U69b)bVkpZolI0l9!NgeB-DuX~Pwprl z<)vaP7Y}pCsi96N9%@I3?t^m#=W|1QQ9W^CNFQ7rY=;Yj`*FWN;n^M+hB@HENGJXt zxZ@JjyDXvTW`*F+(j;74Hx$1gTZ(_2O~Zd)*pB}?k%k9bW?{$RK#cJ2gUT8Ki|vjI^WuPwgRs~jjPBbL$^Cj`NrWdh z^EYOl$B!8DdJ}}UZ-~4x@yO<2C@adIuuR}4jAq7*i-y21K~yZw93UQ#wpW(sJPxY! 
z!ZkD(aB!CsLP!p0dna5UAS@rGj>IrO0^I~-v36%idn*XfBp{p6tX2>dy#SmgQikcv zoJU|*A>^ov&}_I6$#H_ZShn++PN9NkXJkbl5`vS>d0eDZgZ0<#87tE}f* zuD7w+{%@gK`7WCc!D!&Ba;daU99eE!VTg@v6Vq0sDrq2m*h)-CS>nYJ-FU$&Jv3`x zP0O!QX1OQ7vwf%{4asH$w90x8ud&p-&c9A~12(Z-8&_@A;=TrH6Rh?9lq_X6BX#H( zSKXg4V#P*i!n2LFS$)=dPz}?NF1G-RL3O@F9+BrFg9GMQpqcf*P3nCWj~n2;Vfiww zS+)d=7S6@`v~?IU_9wjm#g}N+@ilz%_1CCLh>#z$#f#_SKmX%bLb8OH(~vBYGE8tb zfmuQGGs3cs{_)2zxOL+Sj_lu!tRn}IbMz3wc^|UU49oEzoEdYZaqRUw_&}zV8@g}STZyQk@ns2cB{r{L`be%qXuf% zse@Ye>Y-|_+Ni1^xhksHs>1#1+^>OZwW<-83C=aEpmvRF%BSz^*3Gb%pu9WE5xc_e z2+U5{9O;5B;SSgl(wp$s3wuL*V|Q3z92USD?uxVH#X=@vDiiU1aa}<$!IN-YGC{!D zDh1Ay15l95{Fr!T|;vSZl{L-B{WM!4T+p_d7MNS=ubd& zCtR|VCM=#EAS<}}E1%K>1n7h8%n!$S<4BYbA;J$w28Cl3p;_EBYE@Ktyp{u1eyhCS&kDaf!c>z0NEnL+B}4iJUC2&vyuR_Se#AEb8;}t#A`)N_Y`4S zRGjR~dJ*fk57XEC3Y2-zbJTg+if?p^H;;Momdlk547uxrG)4-zVplM zzrld^^$^^qK5|FJsaVG{88|zhu*)``m7p!dIRdmYxlTc|wlr&Kc2>|_NNCOt6=-(G zL4xanSb{Ub`w&4kov@oh=+0u^vZ6c*%sRS^NFZOV<9g|B4b3Jvo1knfy{HLgxM<`W zTbcYPD+=aCOCAZ!g!Dq*FP~7K!QZs9#l!Ji`cnL@C=I_J-;7Cw`%bNN)T55Cw8hGm zOR;$~`!HoJB>S*mS0V$cKHt(0+p1zeXz33LT9zfZ|3Z58T%O1Ct#D{f!?uq#4R+$w zEU;|4Czvh^mdIZ!4b2@Yl9)N!P)$5EWDCtUbI>ZnGSjEh)Z8&sI)4J}sy<9;He@mI zN!iI9TK|vGES#c@eY#%Ef4a3aUuotLUNKq4U@ys^<@tz&-GmXPoF?~~(o80@-d{7) z3%|3;lZNI^0?k|5NVX7|Hz{b|o@PR`R0=)OwN!{c zYO7kt(U81s#t79~PE)#~&2sARxwUz?8k()L4pPiImuq$1nGfBBW&vdrn61`kIZy6s z)VI+XL{?Zf(pGv=L)^p{dov|l9%qA9f!mp$to&yY%O=rP^vIiO%1dED^H%0%>ni3! zz?siv6D6^}*RSSF%O9m#bLXPV58d(JC!eEzr_QKSy_)(V>)!1YZKOUFtv8gazgWRLi1{@X1V(G z?V~Lh_3B`VQ$Nh|afI*ZAEFJ9ecGfx_79E1)-1KY@0n1!F_%}>)JI?uNvW( z&|I@t9fC1oxsCuc0l5z0xDMg0c2)JcM(x_D&26n31aQK1-Ktg5nsD9owYFFs=!hMW zj@Ta33tPkO*`YdNOIUyG4Ezy$e80od@Lnhy>dg*56xSyE1$$wQ_%yd|uH%3^c1O|haGVksg+xLnf%DWrKY}4)j_{|Td4MbT#oFwobRt=- z%`QCG9j6Bnj1$EL!x;qx-NFPn!m@z2hTW5d-cw2L1Wy7X0lS#j7Bh`v0`uu4JKnP& z&Lr3on(YY9{Rz+nasswQvpB_TPbPaRa4rzpSepwI{dgY&Gq?Fkl(g;72$m z2xk&L1)7h>unk3OXg*44KEQo}=AZiXMoRzwSW3X&6wdU!eu$WtY9qLP6C59tggmxi zS)N+_f+hZXwu;y61-1By3C-futo)j}&%u2@(;;Rf6_s0>Mng4?XXR$V? 
z6J||NmX-R{;v)t$>q&+hnr*GjN(LxviRD=&dExnnl+UyPGvQl&nDe68)(Fi-Oj9h) zhdJ=SGB+7d4=utU1)K0BYdwZ|^gxGZjZls6P{&u=sXl(&_RZLmCft*T9fpL3q<=U2 zWov&eTeB~(DCr+%5}nAp{5I3K{Vd-b@Tjeqx}P>J%vy3kk=aMfcP2cFkt$6~0kc?R z#ZV^wzChxOpqa-^W|DK1*dJ3SFe^TK4wxyXudLZ*1hqP$u%H+d)mG!80^R|Bh z&1P8@qsV7VIW;UxnJNRblBMo7EL$btne`#S1{{f#nj*vdE|x zQ4Y}$He-q%4n8TSFqWj&9dr;Z9I7OVA;@&m(r9iH0zhurmk4`w$Ne=}lNFq|^QBbl&9l+UOj;FchIM9@{5Y_ksK(2*jF(3FGMmsW zVjlG}>{>h0wqV!R?bu3r`vu^u@4rW@mtRuQDQecF5ppw+;O~F_s+QCi>+;iwcX0W9 zG49?ju|Ts4%O)_39^Ji#>zB`=kZ_!FWIvAQq~q{`-8e*W&dxZ5{9~CY%Fou;W_~Xd z1;u6Kz}{_25?$lzqjKfb{OHkbJbHK=kDuJZgL|dO&&$N-?d!2>-BK)Dwh(KUEydDB zi?No^chHc*s9(2^@)P#`_Djt2cSHP--{PGHbrm>Ibm)nzQwQSG2yw+AG>;{CjS9ns zQK1CpKwKE^kMkpgkVRM#XxDDDND-WG zju&~NR5ZcMDg&BJCz;Sp2qiR|ZWJ1t{S-LsMNbH(+?Grg0OolFSzBlp_X@Ebi!2M7 z5S&eTwnDQvu2B+`S%EeoSue9q;5We8);*(?U|pK(M_>-X?P+?2II(t1h&#QwiMBE; z*yiUubHZ?HFhP^BDbW?gHRDtwp;;hnkS{U`9S0M3#SfQH8t;@%)u)fDj)CbT9K=#yOu%mim{Po%gKL|qAn?kG+X zuyscvA-90QTp%tTyrz)AEiN7c&LRQeV#2oo^XUPO1ZI1jN$Q8Q3Yz;8nEO!#=R_x* z;d!Td&1t4_Dur;F?7{m|+?R9Em6btkj0`wU*S&;QW$QCQJSltDj#XUnN2%hEr&MR2< z!zJGL5}{nSR|w4l&l0D(cvt`-nduDk!x;jkK=Wx17S9lx&k>H#kMYAf>ilRgoF_n^ z=X-HRfO!akdXNXs40Y#w?2gMLJaCoUtHa!JZipia2KHf_=z%?9-(iFAS6Jr$8J2i{ zjYU4+VWHoTSRC9B%fg+oB*GOlIe;DM(hHIOzDKVwKR~C}P0@w}@t11UL=(1|W>u@; ztB!3j(%SBbtz`B+%@t{H-fOb45#Y1!y)V{;3rJDFJ4^)V5fb zwIu%Yai%TMT)_86B4!jsvE4GQ0z&h?fFE&jPBI?sTY%s4({S&|8ie=w6t6U`uZ$5L z+r6y9!R^=~(7XXVHm6~y1)9ZOmHna7FPr_f*>_6+YU$4nsW5DmGB6pC_#)Y^?I5!B z@4`Zr_`tm1R`$apTlYz;m09{$;UnQCf#gciOjgp+Y(k<)v(RQTgG}7h{j+eAEii9l z{wT76Sh>ZgTK)TOBgvm)3ma(G3!tzZVn7ofFpjITn`NyeRZgS)R(=wmk}^yEnsvf^ zsq;*k^>2Q!rJ09i7Izggp{;De*;*ejlH^UZqnQ^%^ESev4K%CKmVltZn=FNF1I;R9 zCIm`C27p$Qr4o$7*h2G*q~jt=g=w%;;`Nf29t~JW1uFir`4@u{8-{4jO2Y9;DEVF*$c#T~57m^3B_GJWRoNujrvpgD>RSarXPT5+Yfs$>#Dqyze!+b8X)qhKA zaF#H1+Z8yoJrIhw^BPkoG+T}dG@I$kOTqfJSep3~s`dE!;wkGje~5O7E71DQgxSLw zJAMMX_V^J^TQpO^T&Her!bDZ1PMU!K{QDnx^5`CZp~PSL^0{JMy>LcD^HW(4TZ6Jl zj)~Rz{+&{kmtDu@3#SzrAIm<9^uzls&@2#r?sO3f@(9i{LGaKX96q=c`}b_c*0eRs 
zMMENJJT&~61)Lv0DQ7N9}*={8|STj!9u>8_rIb&^$U6mq$n9{D=@-91+aUHV9`&1R*2I z6MNXH>|!UlDy&KT>^72DY~XeT@)&o$xEN1!sv3EsLw%p5~8og0n2Cd`sLz2*}DJO^KCRt|=uz-DpaS$Z1EY@bRc74&I zJ!TM^HwF9Ph)moHb;F@x2ST&G0_I)8{jrxYr7X?CuK2E93l$w@x=Rlfj0{83Ab-N5 zSX~Ltg94Plu1o+dPykGb5T9m3vZ#;{E!&d=3Cyx3&=wMkB^GhMxKSi?5MW3q5{kuI zTs_2kT%6=gSa!s@BnQgQ0?nrr?Fgn0IFsz6+#1gEn$s!XC>}`Q=6wXvPpEwG{;abi zwsD#KSI9a#AwJG=euOaIGtvh|5&RrZFqZuoADkdi7jf_*6A8^pglZxo!CA_CRsnOc zvNE4x9h^xCCa`k=L7kGxfz&wxW8UW~_b)TO%Y^02$pN@PfIgqXLF`ZtS_!}BNBC$B z^TLIpp13%S8csmw_5!yTMiQR+PFx(t{SlrxJB))Q4tURw4aeE>k;vocBZ>BWe}BY| zknfS^_Z3!oeu~8&l-Czn=>0Y3d4GqQo?S7?y(@-0c0od~&k@+=ee{2)Exsnac5dDf zt@xhG;JYT%&n6&0feP zEN6!KA)R%3kaf8)J_N&ibw@%!LNft+MT8dye)1+FUu_Ma4lm)@l#w_&Hd-w{a*WR< zm+e}K1DD)rFAcR(5_6fbthO%5AuyZDppFn+4-jAv5@rujhqy16<@9J56-^@}hFb|| z$E`$Y$zmGW1nrzi*9z8V?V4eYmQewk#rn*&^8}h%R<-Q5SeGL>FylZVpWA{+fo8TB zg8V-Jp13-H2=48jk3R~x;pW~IaQ^5uv~5ra_4zy6;pLWCwqyZzZQo4(*{Fbd7yGDP z?4x#U)Qg>%tIyd2nTBR9voAGOq3`_*DF%_P;t;FU_w1)D-iwkxSNc(Et1v|+U{+}w z21X6adI6P+&|DdqO@^ra*r9g=Q^_tAHw$Ei_97OTrazR=E;PXf|aYx0U&qhE%+Hy$Q-DEQ>aXFU~)KX6--A zH2A=@TZT@@S_wuz%|@z=vM-z}=xWu>d5=p)2`q7edSSy#KGmg|Yua$-^ z4P3sn){A9dTqBI@Es{JK^`h#_=CgU7DP1N?y%5q+bYpo@BT|h{?y0QdGJ2Ep$f#A9 z%i_yL$zM}D_@XGWydn+B0?Q^aTlaO4bLN+MQIeO0&p}=k6Pi^_Xb#MFZ{Ci*X*+OW z*B-3n^X}QV7i!kg{>;sqHA6i@UEkh4aqsSJJQcs@hj-OPzq@5Olm%H_G%CAh2t-Si zjYs!xsU@-noCT1T?=mIOEZe-?36QyJTjD>bSR-Y6qg7;=lD4*#RmuC+_5j#4|@hi zD;JL@)vKU!Jra?pW{roHRe7z$E_WBB&lYFr*(jMD~2jlRx0VtT6giBLHQBH`xE8*G1 zy@7C9M$o-Oa4wsu;aDup8l3rgoHwD_N6UcbTNY@h6f_&aJjL6Ho*}--R&c&aiM9Bq zfU{VT1%?ePu}tPOB@lgs$CUM1?d$WzeM2nH*C%`7%0v&fRS`G%oh&9HGND=dH}kkG z*DcF<-zKD&Q+K8X; z$Ol;jJb~u@NdX#~1(;*}us6XE)7U0jvaJg=SFfg_`TdryFxAHsTO$K;G}0I8VT6wm z2kZ&9!+rwuz94P`2+@HaSSQglI&kpB>t?$5Lg6Sv^I$&(jbeF~=o3W)wWV3C#saD) zG%H{hcMXB%6p528$KBNN0)bkw2FrC~rKThtoC*m?z&%Z~?aU=%-a;S%p9z#Up}00W5SK?1qKEO^;RNdu z{`?$(D?bJEeWgarf;JI2Hz^XulVVUjB?&p>;;|~s86$dph|rH-LgW{1FtqnOnCAU0 zX83ePs$X|Z^y!YF_Fp0P#}DBB`D^I)Mk{pb&#*Tl69MruFV*B>N&+%bc$ZL}L+v4Az(ER#q*fo@885)XAHMx$U9P5o^ 
z9JCyd_9Y;D6HIxWX`2g`1Hu9OkoW?v*tOt>4X|4>-N;c4J{?l9;nniyF&Cgrs$tBE><+G&Ed~Q{IQ(3y+ zbRC%GH2G0LS=?6t3YyLHO__D0WvypnrZhq`fmxt=3t?s(8_YH~oQl9KVqw*T)37B8 zmWzkzLD=6DYT_?{^9=`k-=!t*wOtp2{DuL#m{B9=) z4jP)*8Cq*2eQdjkZ9}jGbrz;>9>#_;X0?f|@nxEXq z6Y+l*cqTXt9H$@IuVDGe!98j#KF#OPoWO&-WjL9ijU)Sas!4$RcW*)3hLu>oVj;gT z#M7sDm8DrG13rFyhxdDo1ABI03BO;ydNGzSUx6Z`Fr_oYLy$Aj z4~N)s?T_=r!6ESo>+(68@W;Jjqq-bS)I(hYSIt`0QA1$3W>wUvQxjEdRYSGf)dumZtfm zkg;eq3KkB*wfQl)Gba$`v%PU`mX88X@%z0i7UYS9+=&EU@n06}vMn$xNtii8^CZKk zc}f7`*nnmOm?h4##A;UHOjwpk7$ve02|o)sE6Xu~S6PV-Shhm5Sf457t4vs?)arp^ zSr#j_to}F21J|e$iRa8?S9rdZ;Tokun@~>KLh~K!ZfXE-2_Vnpca#QZ88~ntFed;P z$GM?+hzknhoKO_Y4q8F8BhC;!PZ1b13C!Zpyf48Y2Vw(oh#Y%3Fbk$LktirvV@v*C1^S;sR@1JrZIr~1DpuRJf6(5 z@K}L>wS=T2FrT1KDS0bbi_=M-1Z8(zAYfkgzbhsvD_BkmBP@sFY!W{w@^ezCg64Bcfw-7R z7^bdr|57sBz{qf1o*0Y#(K7MQ4~GfM+1xLf5Qh^4;ev^Y$Q>Jp^ifebGBOeeM?_%v zkZ^1xm~ToB#7d?yn}9mdsVkg6?2ND4HpNFTHAL69+rhERJMjMTQ@DKnHtarn72kJ$ z3GcURh|Z0wqkY{fc&SzuH02<;McwLXLdpDs8Zvm|Z$?!PhGkxdL|1vaMm2ofp(R4U z{Sc#jeT(V6x?*OZ?wIY=8}nTIVxfm4=5wI6#M_n7ES6?(4QU%#k$UoCnU$Ro;lDv zE=y^%O&*W(A~1{pv394(j&wJG*_&YMgToQNs-24tQ%9A12=hdsmi%P!8s*z8&}_tT zR?sXE{sL%LR%SkL1Aux8m}?A^JQz`OxF(>5w--f4km=_A?ynti+>1*w%- z$5zUUTmi(MmqceURT-K^mix&*nZ?RTT6%Fd;R4BzaDXs?K(fe$W>IBmwgRCE&H8#N z57QHa8ri@K%^H~hnWb6h(}ZSjUjWS}{Qi|svjvVVl5(2R%(x-nJQ{hnxFk?7hGvnrFe_+YBhW1Sa-8>k(Fm0f z+mvc}ypOJz=b_p9aH$G)VD4+#rfHG7pFBS)ll&OS%73!2p;<*uk!QuNo)AiC=KfB$ z2MaWBTEia}0yDQ2z?t=D0cSoh6Po2^QOBel(gq|w{uu4uv<-Wga=xpT2#{sINfKTRBmR_(NJ_II5-VX}KKO!D|1?jLu=t31|? 
zgMxztBXDMX7%mL=#l_))1m$2{92SnVLqiBZ0mw{d2Sn)F%TDXSpcq7b_ocEbHmOG# zC;ZkTl-H<35N4-ZL*SW^toOwYLjalLG3Ce1PFUhISFKfx&@3wfRztITb?|b-26(Go zdvyEs6BX0h@uSyqVCp~|oiz*xejbOti^t>S#;GV-ITW|&1*3ePFRsrcsB*t-W(e*O zf(3?k2suJ9!CBzBYqTA91eBDfe?G_B{1*z>x+ZFB75$; z<3K_fJU@I>LGyfXJDeO9p)AX%B}^L!T*b1Io&}l>%d$XoqQp#gRN#D4S)2*XNe(DV zcBBNLos~scTcJG&qvB$rbv9Xm*#j2{&6oJyMV@y$u0M(i!DkZfIaqKeD7*9CUP@;N zdJ<#_(BhuKbC1W{5uAGBXm}4~MD#*-bU)#pbh(4Gd<4BP7Mk>=CALNFiUj32e z+zZkDyCJY=7ubLPK7M@vb$tK!Yxtsb8+`a`3%u649$Gf2hQ@VjpdkmMb>+PjOEI5| zxKv1FntFsX0bp?hXi%dn>Z(b&9L&ml%XhXK2TfH7%vJb(b$N#g&P}=Bt7BUv|L`Ry z_x~QLc3m*L$CsGX?Muw<^#f))^u}xtC(QA1#e83HEcJH79Dzm3^}UPvDzmS(Z=`VyE$ ze#ndFmcX2^EX~@7SfE)&>L8>Z759&D4)i1ZEjq#hPdb&w{VWdhvY0msF_%f$%_PKX za2C1pxk`*_S^UHb%@+NIrCGUVFb@LEvWSU5v$$mx3NUjk&|D3dtlpu|-hPAJumZccvacgF?>5{sDnhgL|3)8g$b{y~ydzv; z3&a1eG%(B7+Fz^t$o<%#i+Q-& z1R7G^2v+I5nr;c()cH0t$}JP#q%q5Oi%fuH{m9m=4;c|jJ#8Vxi&R69MnNgK zWZnMNy6rnJkB){Zm{&^uWV zq8)6Yrna+;>Ms;+rbGr%5yQsZ8cVWWVafptH!VlkluPkqM z2@%rybm-6y#it4g&X4ir;a!66JzT$h2E~QB_~Y3F{AMh8^4l+u3Ca%%$^_|WkMJu$ z|3=t;Ovo0?vRIjAE736uk7eN2jZ1{^yO0G`ax)I&&|X6GPJ;Zljaa{OF*dAOrhJ>9 zJd_E6<#_m@9M=ih$8$4qH2nbf?AwWLJ2s<#|2`O;oP-qw<^>CW#=Hgd@yQpTtGst= z(ikcI*y(xo#0YkNUpH@t4m>{Ip(mj^7Uu?f;nGl9?Tj72S_#buIRiX!G|5w0ns>8v z*f%f~L%My1cD$|y0jqIUcB}+piML!`;w%dkSF4FyHEN@F%{qkQ+G@gIRdID8SXW`^ zT%G$8a;|1=Sx}=2nl)&Ewv8I%i#K1x2c0{pynfTUJ@(HWiNmu;VCUi~*tU8O9$n1F zqXSEDV@@RQ&JDz^xdcaUOJ@fWE(w(qDo)%g2&OkDd*J3InH=bDb=wdNv_P=9SWq{n z8P;Q7W6J8wELLWLW?nnli=a&iCp=5|IPp)G34K?ldE=@8G+|dw3M5EhC+rF^Yl*d) zpi9`jJx%q?w6945n0Vtcp z^X4$m%K99P^P>n62@Wa_u}q*Vlo-YX9B@*iMi8R&<9u)i|cxW6ON z9S38>5cuhPYBJ$`!rj?nF@!>yjOLAEwjr5_cq-Y8KunNeeq>>jeC}t*I3P!&WDsO! 
zWx(Sq%$z;*Z;um+jyRR%q(aJ_9N?njILjoy6XM<>KFAytTugMq6$14oes`8-E{=1< z=|oSQmR0wXy$S8UC>}y+rz8(rIs{&Jj57{}^~b?5dmNMnUkE)Jtf!o0f8-|yGaq3n zjE}&Hq$nzyupiAlN8(swIF2VrAddshyg`x3NeV}1d^nEAhGKtoAa+OkVOLB54h)RK z{z0)=A02>M?sgc=fu{evufwr(M>uq71OHdsV%P`oVAeNZVOp0jk@Wd{@cOhf`hL<0 z-9LO4KYsWIdVKgcet7>)e9@@`-fi0&Z?V7Y}BkL?o`zj zEY~7vHy}I z0*vwE{l(fW@XR(G@5Oc^-!TsaceUalL0&A&d4%`l1m!$|WrDJVp37%C5_&G1?ezZo z@pyV}cAQu@3tc(z>CmJBny}u#`rtkGxoZ_H?+{qtEEZ>pA|`Ws^n4x@3PrjP z*Q{eoxWn?fQa^0_*;djQTlS4KMS7CHDeFB2G)+!0rT;PKJtaSsfbj-Gq)5ZFA<3i4 zpDhslumJNW=0RJX(>9W+EWD&8*Xemqk}qvElZk*^D@_E{e4~wP75$1gyDpD881Ed6+*RTSW1eyg*_03fwR6(uR{%@e!Qc+apLRwTAlx?NXk;;-0#s+Lu zc{5e9(Q50mK(kE)ut2k}Adv=5oxiP89-~~=hQqR(<(BgVd{u)XAg-olUmk$*P}PV? z>Pn!Qd9l6-(trq>gkKdo-^2IYaNXRWd zk%#M7&Y^(N{P>Yh(|Pz|~fdzEED}SFcU;8 zL31guEtNRVgl37ke2u`YM0mD^W;F@W2AXAJpTu1*pXQIdQ~Ysnk}vKOns1xX%zIuU z9AD=3S1C22P(d^A&*QfgEC=EaRX&r00}c>w&EY_S1BQ~>JWnPDriS1|iZeTO2V+4H zcF;-U`rx2^oQo5DP(bKNj}|MlH@36m-%3E(8s?6@A)Z(t;)~DQ=zk-saquJ7#dh`U zA&JnuA<`ea!#uGo%nkbp*!u%qurJUN2mBpyD98i5c5&{C_mnv(Ch&T^>#eo6-UGEkRC}uP9|LOdx>Bmv27Kk#JUljWn!MhFP3m? 
z5=nxCxxpbgIV21xheqPmun3$T8G$n+B2hFX9LG|^2)q%fIGXx_80g zAG#pryRVS=?dOR7=2Jv{`5}TndK`o_e$}WE>|Sbvo~>Hr-A45Ys#VlG zQH{`Azgi76tX&U{YRd|pRV}bwLk6mN3g6A@HPN_n6Ev#d7>(*R;`_+~t^Bs}zHMG^fzE9>nC5`?hqkR) z58q&cdmjSx_n1pyUf8<}7XJ8^hUT8!+I7Vum)=<7(I1OF9kJNQ6{`r(2ZIQW1jyV7 zM`TCYAuFmMaw6=I8|H|dP&XWvHpRB(|5itMyx$3vow{P0VFN*ihE4<(Xv9bh}@t=_EX%%JQ+XjU>{xiU1{R#fGN_MBc&I8jAtmdcXY#d@TnKkJGVse7r0 z!F)1bssX5qG4rIsQbV&eE+dbY7l>QAhSCk!NK>81vZ)5cv`kC0x<)@#{Z|6_S!d=; zprx(UWGGJ+ECUBKGKhj^g0TSebI`2X2OGQ%H1DVY&FcuwEQ5fv zyZ|hV{PnX%09{#{wFH{CsDC#I&fB&z%`I5^^AaqYzX)??&4rVL6KV?Zv9lAf_~MIC zP*uHnCR80arQIN}X zpP(o&OF{FWzdXc$@bmRcXK`Tn7VO)#1$(w_z@DxA0o^J;Ldzhl2uj@M^4m91n4g23 z%p-*6bR0Uom)ZwkUk@~G)BuymkHeD13$b*`5)2tJ47KalRx{qazR?cjJ$qw@mmNm9 z^ub518lfY9sF(Wo$K|0>xHc+GO+v~~_Qz3n1{o@@uZNED8|RK)Q7%Xeal%qRM@)6> zjTC}Kz=!YQ$2P6-2J5FK?;+vnWNGZ`1c2%_s-hNEzeY74=a!vwt?E2hry9SL$$|2B za(%QVr1kvx4Sdi+L$fT9vTgbZ?EHB&)~}e3f}(W%&woC|->+rj&YF>Uyfg;)77!xm z1>%-~B%!%nz?q;~I$f;E8k}ztjBgWiZxMQL87>wYmPG_$3p5jO39x0dgtjRK%>v6_ zD4E3cxD{Yl3zbaO&@2|`3eZfI%wU=`1)d4vGYG!Z18`?r0Paur$NfouxGNJ91)8Z# zy!S#)z=3oPw18{Q=%R5Jw*bXO%_Q$apvC_IC zkD#1Cfbg8?h@u2{oZ>(yKP~_n(SFz$<3nin!d7$UTAduN(hGS1m?)L-6_958*_zeb9ao>JHko^$e@4N$-H#(#L zYaP+MLp%J~t~I*0X@T!vZi;VTZjLU5*l*f2#}{l{?>DZ4H|o{EYXp@JwW{G20&RP$ z9pCd-Y^%+wW*qD_r(`P@z{YIX5@kf(8yfNZ7IkZ(F|V&Dkzv@D8*m_4zb*%db!wmv zrJ}~vu8qb`nxc8jmT1@d@=8)oOpMS0O(a@^c}9xQO>CA~Y9@`-u23^FAjezl7#v%>OZ-BN01p zuNs9vbC%%0PHsiss#JW?x&a#TcUatJx_$Kpc8I}H{t2ZNFe`wTeoM1}g=?gre-U$3 zfaXe!@$ZP~nymC3cX3-8nk8JFx`*7~MjOc%|7&PgjHAnG-D*{YW}a)=%6FQ5M3PSp z%?2Qg$Un;3Y~;B**9-w>@zoW`voPFuyi6Q1Rm>AxcPu-u!#s2YN5G}4GfguM0Q#*oHyrHn>7&HS+v%~78A zMKbfwG%e69Vw%rEvsOiD*1-J#3p5)q*^6bCMc2J)ZI{L`(7ZK`c`;-Q&ANS<(5%7P zjGm#a%q+j<1&}hR!4zLIv7B$%OzFv$0?Vd$Zr#Q$Te)M%q`eKRm*I*Le(sN72+Gd{lm(n0yOp04p{DJk7+;tfVdCeh26`o@o~cjXva=KENlYNb!F*n!bM%yoh)Kf zoyTg|uBv<~YgMg*&P^M`{o9Yw<*jyVhJNQJb+K;pU~F4F4!gE5N9o-w!2kS$KQ0}` z`9(=6ofm?;^R=sjK=Ym1AwuEIPbc%a}fOFY2Pu!g3hMN=Jl-0OoD#2HQvsDV3 
zRZQiI&@8SV%H3nKH`5?A6P7Kj`-zK&T+e&i%5<#|a8^+;2*U#N_ofBm!Q?2(YG^ifYZf&-5M?r0m>bMGh-98ZksE8r&Rs0c zgo-!;W)~DB>gBYH68v#IE(qz-0oWVui|x_g*v3wMd#F412{Z@$;FH$cJwqmF)~ekA zZCVnj+O>xZVKljKPYiRn$6%*^i2va`gmw7>pnXD&nF)s?5i*F{o9?;q;Yiw ze*ZP*2l!#GzZ+%~jAjM9U{;_Lru#c!idSDuaPNVM9zBuj-yfr$x-q>^5ct^}aC@sQ zdUk4!E}dS+R~=j8i+0WNd7EbV?B!;7uXzK!)wnKRtzRAOYE?lS5drn(dbROVy*g-F zw+@=us*A>i#s<}Ep<)98fl> zR}T&A*QXkwPQ3=GUB40PHY98|ZGk4uTcJsdR%pb1QPY+$p<$EeXwbMhw}jCqP0*Y` z+=5`-g!>H}HX=0FL!DandH?#TU4yFCfB@S7RSCFNs#I5yss<@lWrA!SRH;@2P20A| zyB~dtZe9A~yN`cFrw;Go(@(yIyL$jaLt_yZ5r@d=M8qeiAT%r*ojbpcs#R;E7Vp!j zA;G%C8+hlvPx1CUpW*d4KE|7Geu}r=`V4P#`#z7o^ZEy9)4aXPlia(1e+PW<{wHex zrDm=0+3RoNtws$I^5rL(9AKVYTH zkNgeji8XEnU#G6v?r*0os`;UgD2R{+M>wDhw?lqde-sj$kMnqDxD)nqaOB&z89Y0` zgtb9-*cj-5brF78J0uq0wrz#Pt{)>S*%z5Id5(Z;`WxpGl#dfc3luPGw+NXOD3)ef zqWe&YtYYbb1L2-Z2MOngIrtK2R!eZldf-^B8}eeA4%5r#U^0{MMrM>hw=#iImg@E* zG&4=UE4kEh6|F;j*9Dk8RHTk$ksc^udIbW{gys{}NdojKg7yhsTR@=Bk8$I1C*;LA z@%SESyY+5cwhGCw(c2bL=KVoDOUhz$^y+c$Ccl- zac^t?Z?aBh*|Nb4#QxueW%aK+SwNuKG?>})jL8JHplmA(Gf-yTR21vdrv59|l__hT zS)tjiBk?#eq1jq*O01U`$+q51#{zDrQn!nWb=qWo2nL zC%DPOH#5(zY5+Px#l8rV7Avo^{sNk9WmZ^)0#=q$z~{v8DnK(~S;uTksMmUD8Z7T7me&MkQ~w`pZyg`Uk*@pO0*hs4W@cNmn5EIoXlBIB zk}b0n2bMRxS;ri+9mg?~z#=m<$AP7@0aFZN&f>Y}{+{3SR!?h2vYb8l-t))v>F$~C z>Z|rFrmq(CXaZ+`Cwa32v!fBPu6VvPF#mU)0YQiDmShqIo&|F=~FKV@J5yT{PliBk|y)J6dE6>TrEE_H26{=T08Q-S_{Dzkd4_ z!I}E@Yl>R|W-f7db{u?&^za1c<$P>3H{7Gw}86Kf}kXevXsn={P+nOT{?V zpxH`jw%};FnXqL+RYp8^+hVcPB1*t+s}XyXE!bf)VO?|#em->y@`nsUNCzJba%aa( zb!Vs0$I}y?e7qGnyAf>M2@wH<`XWDS3M|t`po^y)2KcnYw%q}@BNEO)Bq50nO4BT0ij61)x;ei}4 zBlrp^D_GVgz|3ow@%p8NaPEI308NnAm&Dd0YiRBr3c5v-#&UwP_(!*ZW-VeyKJ!z6 z$IA)KON;Q?VrmhOf0v7gYPG@){AEcV{z9l0XnvpPe#mQX6P!h|T(^Q`>fx$VLUXZp z%<|qRFh5vZipT3_;!nTI!=+hfT;cP%GCNs8^VNCD1ZaWXAXS2xV43Udk=W^@<(TaY;SjWHIc9@!M3Xh#b?}&K+u)fb?3I1m$)dn zqKhl9XB$wHO4%;j%fvJSt|#TieSupqo}&g@3V_|<>EQ-%Z!bckFFO1Cqoc1s+WWLe zdtb4jc7!kG!_T~Zec{b>Jeg+?UvISY@j^Raidzr1Jr7?WcoPtPJ9dCihxT@HzkLVz 
z`V07WV3|6hW2Y|Y+^MTlM}l&D|Bg-j9oiG9J9OfCe((|CCM>u2Ymbh+wrhv3=-$37 zdUfc5KAn4`XZs%LO0ezV+aBJY5_QH4?yllCArtjjR^D^mqyShFQ(=ovMhu}j*_w~? z8PB1pbRO~wXQHrd7K*5}%sh1K(U;fBGW9H{cYE~fKLR7iOvCue5g2FJ#Hmr3FeMU` zxg9oqBJV7ad35WSNRVdH40N%);WRGup!w>SdPUeV>Di~#9@Ou z0qc3Z(GX2o4x@sx#Tfzgql0e?OC2H6aT~w=<;l)A<#^yy+B|- zUs1$+5S(WUI7_I!9D;K$!I_{e;4JAAye|nb3pguiW*G^{Hwf-Gbzmk$zr}mJHH$zk z&`dZNXudX!@1-4@&(BH6{cXR*_vc^3f4)_RM*h}H6cazb6Y>s5hXtdqn*9|4xvG-C z_w4iJ-%uqxG|Ru{VrkU-P(5l@Yk&ERq#tf#3rene8tW)Y-`gYy0P3IVmS&|E{k=eX zOK@&o0{nV9k{2D2b;c1RVQbLL^Cb`VZPV((Ex1nQwFNY5!5y9s%)0)&(CnzAmU6B$ z`+160g^}bRZ`o?krI}bSt;xCG9MDYJCqA|j%W`vQwl8F|k7}|MOGQZmq+l8l>6T{g z0Z64eXIRdcv>cV%Bs(lSt62k*hvighPZU&_G+fmXS`=D+FTgA#45sA-%u0k(Jyc7} z`!;VxP7*7r^5dI%zU2+X_wGN^L}mhE}e z(va`$l5|ztPM_JO)q|$izJF4BI$A!}zBDj-ZhSsX;4FY_2WH2T;Tkk6aAuoOq4#*5 zj5f76&3v9MfLY|IL)NeOHOn*Swu-8)sYP8)J-2mA^|g&?tUrjDm>30~ZNZjWJECM9&Nrdb8iQ?w=54lk zY&XYZmmwDW4GE|rI3GwdW53nHPBsp|2%LoC!M$PW(*sH4#$a%_&dO>p6OVj7ypS+) zG)kim3$gXZSod|xcic1gTub)AB5 zXK2o|FQrYu7U(4aKUiLfkC&C;(=l9LLIuqq{Wg=(oQe0Bij}zl9}t*7;Q4B$ zLAzuj6M^Iht5{BfW^vD0LvUWLLGwpzIdE7#6K}sL{*{CicFb1^%va_STILff<|IRw z`@KwPK0C7zM`e=Qj9fI#$VLO9xv?Z2$I3FXJ1-5B36J8w1l)E$@bUD4uP0&F%MX6u{>txGwthbT@a4Xb55d-- zph|Tlpmz3S{@Qm&=MG)ak=qV_od~W3U*_M3VC+k9_7*@U7>c}j>?OzC`tsOUz?oY= ze;)I*_U#G2RLA!1(S;D*y;Dc@?96)LwmXmecI%2B96a|F%Q7L^+p`^f#Nyq(3r3C@ zj!6?HAYkGoOc^&36GxB3WNPBbF_<`FEXI!*jgdnKW9;aW7}T#XJnWN;-P!(!jTnXK zsCXosQV|mvjnsrxl$S3=PQh&Cu=5&8}o%IjRwj`ik2_31l6S(HbQ z3B!a*(HK89783*GF+MOBQ$pe~bW8wzJM=_bS6}{ic@z43qgRhXaN~Vtwa#AMJ7I*s z2WCthhYiL=ydE2k?Ir_WGZUEY(7eGIhs`Dd=P<%DAvrw?TT{cZm5{tGIh5cWh8>n* z>`4nFXr|*P0rE{*Lb{yokh(_QEKS4pl2lwQN#=k&8P$a5fI5$i;Or>Xh9{R zx$ea|NFLS)^8!cU{M;;@C!n6An!>>mQpMFlg?J;hYN6+{sT=}s4k0*OS(L>ULqPeg zxLa^5zd|p$`^mS`<9?#8df@Voi1LsTy%mTg=twSdDDc20< z?N$>si_-D-Y=LH$zX_V<{bZiS+C7KAOFLe|cNgBkf4*}NRZCw)SBYZ6cShPnq*$5{ zRAE2+y-N0L)dJ82<|=_^_IVmK+vNmy!Wpdrvq-*c&K1H#t+S4j^u0}L4u1+5?fta| zf7-ouRVwkmKMKzBoiGCXcf~!D59ycfeY27d&9c?<=>TTC#GuN05?^l#xvPcU)uduc 
z^B&?lqKRb|Bbq?*zZKa*?q{#t=CaoT)x4gZrPr53nt@m9O48NRcCI%i)=g`2uDd2^ zwga<*W@lg)>8k)XRfv6Lp%p;39FeKW8C8Tq&2_;Unzd^PTGB}LG@NB`JkC-NsHu|v zy`;cR$5JSJVU<*cXq6?_=4QZbhgh-ZI*+IvY7Wg!ag6s9{uJyvRLS%Avr&nh!P${d z?LO*Wd}vQR4CjZZMnjquG&BEtLvd~_&Y%5vH`FyMi}R5~NAZ&v zUQ|D7F7_Wcn=MJL4)`B`cn5oSyn*`aeK>xo9;c2T#9P;|;PFTIaN@`z1;+pQ>o>}h zEU>IYvu=6Tfm(y+FO{WP;xr#_5NNLCmf&2sPwfjdAFQv$=@SR>!C(G_cW=FkE0-_e z%$ei3bm=T%`UnhB5h}dYkiNaK?3G2>xnl=@^^0GklfS>3MAYBU8`BfR@SD^aEVjks zH&!#DsUs$`<6IOUfIloO#8+#7!On#pPDLtC%}K@4Sr!~7G}q5G;NUDX4wg&gffQ6G z$FZ}F!*-j2+j#6`N4v`uhrLuK_ba)rHpinb*@S9K0$xu^Qa-2R7b+9Z1e!Y$KGMP_ zV^-`$%u1Mo$pibKFFW%sFFlJto~**(KE8|ZKKm5^>)X%puX}Iez1_dYoek6RXkCed z<~wVOaC=Q5?#oJfD+>tBvcQPIYZf6n9S>Fzn3vmdcaa76mShlmH5UubmsywiG#3zZ ziwMh3;4Dj7FW2^+T{oIt%X37xMNQBwmS&j%ScJz*3C)X3@%f?>e6gq)k6&gQzhOQW zW#L2K>wN{yTG%>$d2Y2bAO`{h$l}MWe3=EHWlP-?cwRG|gMt}qCB5r&(r}dkAn{`_ z&n2jcZ?gb2q2toLLY$gez`;&FVVTf8J)6%j6NgLEaH1>&yK-!pL{RkNGm$vj5}()2 z-Cem2c)EGOoxm##)wFfv=i*L4xOH{)R1?bD@jUN#?FeulaPt;-1aGF}0WTMKeos)g zPe^kSaOPGZxgBBI!_AXh0;i`he7yYO<3T|7>VS^E9nsmp3%a)NhHf2upgYyGOK+ua zU3#HQ=bq@oZP%_n(W6Uu^z7D)&_@XD#&f9dU3&7o?gT?ZbjMCSuPfCB-MV#0_Z~gb zvu7{#?B0{m*b9BQKcG(^4DQz-L;CmUU~>@03>%Jdqeo-Rs1X=9k`O$06avPNMd0`` z2$?(q(qq$QhwMtqzZMRQS5Je%8@C@!hM z$dTjJb>h41(aux3g^VMhj~mN8O^L^}5EG^ZB_b%&g1{&{##bYhCp6~BQ0p`RAY)Oj5 zj?^gZ&WdKciNf~GaBNKr!H)DW>`DvA-qdiMEVkm-oIKo;5OIWAacj8D_IABA3s)uN zTp0&zgvZ^b88G+nipZY6*kOm}s-kq%KR*j4lSkvlka2|EOk60J7|i0&?1-IwS$vIW z>aeVf$FtbpXHx{zi*tA#f5R>iUe9yzDBBAH&9j&e-;K*MsGLm@pP5S)5RP*R&Du(a z0?g-UNt6`=Gq)FcF9|OwKFtEm&7j#1%(rHWg;@h;1RWG%IKpmk_3Rj^BN> z^JV<)>Tdk=ox|Ann`hL&E#B_pw$lb7)25>KKqdC>+lhSyqAE2Bp8XerStPCqIv+Ut z_T~WA5|~B$HLdZ^6Kd9Vp6&sWea_ZRlkEvJtAi?q;n%^AZX?RyQHrBBxSXq|Jk zeI1$|T+|YnMPfWv7Gl}jrE`D|s+xE`&ug{3Pmx|{>fTN1NV*!&=&bRi9ChaGo}pbY z>5BC9oa&Bs)2evAUUyB;7k;*|||Pmwvuhq0I{Hbe;yyOkYjL6R|wJ)&aAQ2BJxhb!e6{@K`1U*uhaO zq?H29)PC;k(5$O>z1wPX`e79)@Cm+5A5E#4VTZK!S**c;b2`gP9AB*v4eHEarr#{ z@YYQfe5L1y^$u{BT#7MlFk%r_+w4iZV5I+%f4U;jxH-X8u4c7d0F5WwP0RMP=2S0rBG5+(b 
zPw>AUzJm{|euvweD)4x{2F-V=57rQvSBYx|p;=jK37#r8avFizN_b5oq?&MVaf%8_ zC%(xJD|11U1e!G!HbL`!>YjkJ6Euq?(uQ`u2F;oTnl<+fam{$RtOOq~DZ^)rsYRt! z2_FBB0RFQ0TFXR40zT6g;1*ccCGnkQscaP)L}Ul&`{JI#?cG(y95l?r`%LfVGwh)G ztgaIbB!28A6+t6a`8QvgmxT+2<`c6D3C)EHni~nthsv{Xq=W+^0?f{AD<=5+Y0&IK zAQQJ4u_P04-6eb=;aG(XlnI>5I!r)z@r1iTGtcpH_fU)6czF1#32eTu1Yvj0a_sIV z(Hhz)R~-3gL9EqocFBZLZ$h`fmp37~6XCZH`t%)y0sV(6_kqEKMqtR`(HJsh42BFH zgTcc_VbJi=7(8+ehK?GC5u+zy^tdV1WQ-g$38ThplKo*L$71mCQ5Z5}G=`5Jk4X~) zF?Dh<0;Yx{a7rkG1HutLEfSGI(Mr+5F-QnYfH5)=agp%|4GKf>)M*6dP{f6YBRVvM z+XxtzW9BBCuwO)f+XK{qSA1k-0eOHiJT!ot}oDVdG3vRRlG z7^40K@NnlKh4tp^?TcYUMzBml2o8!xXs8ikgyo268v?=;F=+G@wDax6cTA#VxWd0< z2XyP+jlTh!<-apO3+>$<^TVfLNkRzL8A9;7IUHNek$8=eyn*laI%;E5453**3&*}p z0~JSTj>6tN4vq^rNY0BPK*wNjPCTk}6L2^`3Fp|ZuFV!-dcx?eJOXY$uCx7Jo}P~j zbJz}NWMXT1HWCK(L`ZiZY%emQp~yyXPDkDRGCUt0gqLHd5o8Ie)5RSkUAaUk*9gL@ zSboK-tO2hEwOUMNo=-TwL>Rur!M<3PFElAdNrPr_V-k=}#wGDzCUBn@E3oos7E85V z1n`SWyr&Y=mkE918bUz6&imaUEWask89eW8o+H5g7T-mAAH~vqYi2U8D_0P)jAyAy zhL83x#ow=2;_q)C#p>s#tA7B+4`2G0sT`c`CkXA@y$!p?54ehbGWkbYoe9m(tMQ8T z{!--NjOM_s^{3kT&Ybg9HSv-3r5gO{Jf%QTIL8jnA_psI_iL#q0b2iz`6gdzX*D-! zX>ck0<5X7O|GxvxBFU?kPo05yy%SrtTJ9EdzPCsNZ{}I?j1y}-AxE9*^TwQ^Su3yO zxv4x#UD>&)rJUKRiKm>Nhk|A$1iByh?~Q*F0D*W4~%-Up=feG$63yGq9!Ux#14rKy_46m zUX)ZlI;7Ww^6_T*WUDn=ey^8Hj`@-_d!bZ5q@3DItqram<ip*1VNc7D9|kM{NVm=g7N`09N13~uf&mtDjccX zhl4y8-|715{kVAcBtE+R0si#v?{VS$S%ULf12G&q~1dOcS=`rXzZ=2JmeO@X}e>ri{eT%z=15X&NRHnk9VM%X3TcALkF?hesda z`%fR^A74Jk|9<=~?$j+MG|$4LwS;Cuj1Kr6s3lKqy@tkU>zL36nV&94xe z7crgRTU! 
z!c7A5)p-O4yL*O;q_IF&11!L?nFTmBqkzDis{wOa29B1d;B;9UcIGByVtYT;amyr9 z0a|gr5KE?~yB9oMJ>lx5QVnb3C)|sOT&2iY26pKv+>DQA8%o*`GSZGFF z9tX*pk=RF2t}01Hby*TBOOsGjX2ZeqG@fH4INNZ%G);whJ4N_BBmTzac{o*?gJTtW z*qUp`#?mar5A2PgF5cK)V8Wqd8|n&DaA0O0enE&|5+6jE)qIBqnl;x5iC`g+oJXM6 zq(QS-o+YNT=5`@Kd|7~4Aeq|>PP#aYpDSnIScoiGQ~!0q3`P-7SG;ekaiU*31;bwcSEKU7$P%pH{BK zzuv6Hzy5FkZS0JT1z{keisG6MTXy^qzHp>H16R2g=u(>Ht$R8_nX(BL#@_@U5@iKApdbW*BSF6 zl02#Bq?JwEZ!Tv>a+V$rSc&`Eii~7iUL)miBM+;Vz~>Ci;=ih_tO_1AXqE*rw7=Y) zEX@k0d9Hfk0?r}<0u?}JzJC;&?Jt^h9-Bk6-ms;t&J9_tv$B=OC(taC2B4|6 z2IKs_v-G-XHoI3Ef%5TY{?tT8X;AX=XmUJnkvu~lGmok)_HrpJy4u%Xrl!9ySRSpc z&M&ntc}9-BYAYPs%g4|4dmNDavu^h@pNHz}uyEd7c3k$E@9cyXHFc&(Svo$ zHRJTr!?=9zGy(Sj#*G|?S>;9e>eC1K;**aE%%3Y)4c$HCJ8lJ*wdfjO;ctKa4)^YS zfW}(FF(J9Jc0Uf)R^pHVb6vGcv!SLE*DjpHgFAN=G@n09U_O5uS1z5y<@2YockAn@ zD9A&fP94$S(-X${1f1hJ&px|AfvTH}i&_~hBQy}dPEEp#iBb4@mIYyh`zUL3X8(RT zJUs)qUMPjkBtKpzQ8Q9dKhuobIcC(%HllVGp?QW4^@SGfuoSPU|&`ec4VhuR%|4ERsD(2Ya5ux4aUo9VfckDSS@TK zR?p{(v+(x$N__d~Kk)76kMYBo5AnZ0`vdMAT7eHX&BOzVEJ5Iu74be?Ex<{DRnS~W za3w@5XimnXWoCT3!i0}rG2+f|Ex513#}Qf=6HE!Nnjf=3GPRP>Ox-6q3-s#1+yu@} z5jLEm*#XWvG>cC&^S7*&&^(>6Of4$Mr{bQmh%hcoe#>>dhxlMSfSKtjU}nBn^FC_? 
zoO!--KPlpOg}A$>gvZlyf8`9k{bDwuITJVM=Hkuyd4y$IC?pNnpGn1y`DwT+(FYb3 z;s`tDgBAG%=NvxUY#b{~$MKR>4vGlPIaZAE_fwXCH@CLR8Z1#D1`isF0euIdYsc>B z+@&kJb{79;5BSK0B^S*fy{*I*mW660>V~T~+$dK!FT$E9+}zutt;Fp1ZU--aZ+Q9= zzCBzNe79vA6gLom-gnrr;Ru*A4dDdTxbS!w2%ZK)ZCt1kF~J1VK*H#>SVRO9ltW_( zy>W!&1jI(fE5$`5AfC`_h)yK<8WB%uHAEyLKFomd$q^htMi6Kd5gKkpm?$(6VIhQD zrXLw*B4j5ap5Pi6VL?=g5fM|O5fc~%1K~L_IEIQsVptTCNo#KHdtGDHVo9E4p>-&3CXJ`t{K)%z+8X2(x4% zDYXch)&is_WMFO)K{>x1C51CFWBPnpEotc3UJJ$O+m7|<%DQ%G2VbxD=+db-x^?Y~ zzWs)w?|>2L)U6-dxOnnh)-wk;@~#Vz$ts%?Pko4=7ZQf{#+=XzSVT}>M=;)E=DSI4 z79ZxM7_2o5G)G}0^_n>XubG0eDDQCrP&6FZ7!2%#R4rB>GN~a6gXcX_+FTsfs527^$!lg*M9EhIUJbsdx_J0 zgZF-u1A{jS%(v`%b9TD2G+*PjZxgU@@L2q>FZ10yKRX{^)~v^Wz1@g^{qY>0&r7pI zvu2113Y>yH1m;~kw_?w(ZS2?CpRu1;eR!+T?2t33I7|9L`MxFVJXKA6Bz?LA{+jxE z>2u|NPe8N1Z*3(H&5d&+GD;IPw`OH_Dw}ROcGkavW&umht%3I!HRV^BKvT1O*wt!% zG?V1VUJp-|bDi1i?f(Iq^@Zn}FF;3VW?k2^{Bx3pSWmp?P<86D;e}zT>&- zfy)HKe-F(qE6TY7n+l^p_*Q^fBoE%XBA$TeCQCCPv@GTL-+*QTw&u_*`DDH%&y=jW zE-#a;h_j!7EOG{CE#S99&gBtl4Y*}Pesc7uNgAG37H1HCy4XOPqCilYkSX^*f9kk$%{Y1FAg-K0gB@>d#^8Rv5j=GQuADo8&mZ5zhwr_M z?-hMi z2l)Nl@8IJ3^SF57jFLceedTVfeEByRJ9scUd3(dh%^lA#T&Vu)^Y`~xc@{vL(6<+! 
zO)+6XVgz1Dj#uHrdh+w&PMxs95{;WLPRGfzOdKyw#o=-r8Y?WQ6KnJAL^Kkb56`fo zv6v8?Y#=b3@H(OSHEX=m>tbn6j=^T8vn@FRJ5!9#vf%&gI6*musACQ*1++~*Rmun5jQVZ;?qZe#8(97Z@+$s|L?0mjl)jJVLXqW|xI)&#*IpmL0Xk zYbG?GT_B<6WD;PWg61Pl(42y^<*C>~XddC`r=EkGnh4ki{rU|+RAd~cP7Fei&b`p1 zM=wkmH4>8t55h>c={`L=!{5(?u-q0dyhiyqxAlNmJ3n;p+ymXk&$wqF^yO} zfBVkL@+=eBTnK#bgz9#LYk%KP7&34qrcDY#cwjUls4xNDi4h2$6ov39F$fPJFi(p^ zbSObN(uhQYs4>BUnCN&!M8qH}Dh}~+1nqbe5@He&9Ttm(NCV7qNd)gWf^G~#2*9C) z=uiT6C;?g^T3MbW%>-}@)3qpo4hu>^6!Vb4Ym9{S#E?iB!y;jgjYn#t8GX8Ii@^B$ zbwmIGI)?X^`$tB`BAVbVZW%Fz_!!>DNN6@hB_b&y38{qU38N=)py7fZz58QwNCW~6 zW=u=XKvZrKta+u#Ny|rZS}rPb3Q>}ikJ6%V5Ij|9b*8#0}|>;?MwP#ob^D=X)B1 zElC{Unh4Cs1Z*-I)P%qFrf6&+EN?J{Vq@Yo>`sp&@Fw9nb&}wGGT($Fd5JiX6N{Qc z0yLqvmOxlPI~Rv$=W&3Y%io3!)R!isrp$tx=>*|o3kTR7WEZDn58K_VS-ckq#Yu$b zuueYMos)oL`9>VfHln^f4L=PZkL8I0xG>9#bJGdA1oF!jgie|4$HCvl3IS$kXeLn4 z63ekHC2ygU2+m1_TPp$C4$jS>+2(-edC4l;%;k9k%{H8yovgt50zvuWJZe6HnYuhL zjY=gv^IUO(;`aj1*9p$I=H}o{rEJ`qLulq_vOv#`d4%RU>9{EXO|ZT)J(p?c?dgL*uI7EOlTG>bCuS|DL#=tiG7pe1H}an zJ!u?r^!XY;I5_7=WoIMBN6z4}va|N@WZ|67bw~qc=g=YoiB)24mb@xp=DnYyTAowKMkFp8 z8Ze5uuktGSXPt=43XfGN8YTPpcIjZL!=pix_&jEQwUBxG z=h}BomRvg&H;La%{>9ZqD~lXUBjNS-!Gs3Q8a!9C{$z=3-M3kMuJtf6YM=SjB_U&^ z4G7RGHx_=TtimGRTZ3OsT7H;E*)}Vav+U1CzSr}wi|tQ^?~`)c>#w%5QcbuF4Gq=c z)Bcy5&1_QE=8LCK;MC!RxJGDRzhWtd59otFJ-gufXXfC;zx)B`P9MVu|M3U>>#yJ9 z@5-n7E9J-h?YE!d8|rU={R$ua<@czq-mO*uY^>Rf2L53F<=vZj|BvsWfiPWPvmbwY z_ZB{V{19)ybqg0S%Bp~NXci0ifojz3*nzMK6VO>gO0{W=Za!WpFUrHnp+i)DWo5C> z-d@O0jKTb*ILwI+!*k{YjOo+?16*l@7Kj6>}F8fPS-VWx%P zWg$qLP&31bZD|H^m!@ycX;vciNfS6T7NG7IkfCJ7(OlGw`1Ebv=^ z2TKbf6ZS-pRtPv35yT110*0cR4teN5uyI^+6p{gGaI*mSB{$tG6~DMgywt&&Nm3n*Pltpt!GjR)2W2! 
zY(jG$8fK*9;LL2oOcstwD6XA3kb`qi4M_K$`WhrGhR}2_5k|1isw18-I z?c4`Fx^_drn30H{I2z#-Mk9FqNK70#7=!!wgdf|HJi{&>dLTR`f$(TXf&$xQm`oWk zn$nSwU_%&za?F^CDny!CLS0?l3D9no_~-f&4*O!vsEG&(ibNP8FgAkV8Dk-QCczwQ zLsD!CEU{_GFy<0OGY}SSLTHo;5&S+TPA1SLBaY`qhQ=e2-x&$4=BNZJ5wQf~@DL+m z*^H0~3xXnS2#QKpt`T9;R)jOpVPcsMi9syyZ3q!Xzz`n6bRv;%F(Y8&cvVix@37&c z5fw}5jy4dQ;}Okm4DS<7SdUdA#4`^DLVS|JiYTV(0vB)Sa>WZjGv6Yz51d{`}WGhJf&M#R0K@HFC#;+#1eyzX>ojCqp;Z=ts-P> zGRG@e7I%yd#wfgIjKD@iFg7PjG>j-5DmLLnnU#ZQGmaM7SsKAP4F^inaiEBB&FyYN^YXN4yk3?CE1@~GlOML{ zBodlUgysa)@m>oDErr2Lx2r&%Nu^~6`%VPzH|7C62* zSJMUVYu09Q*+_vFb}kv$=Ue#B@S6GBIp+!Anh$h_=9)1#3s>w*XUm*{%aj~no0E+j zvkA&`L|KGozIP&m_ASEloAc!zOvYOryuK+bT=Lvo({p$&p}FA=;N8>s_VO_l#D}Zj zL0>mF^y9NC&Cg@sxC=XXY{fqILo%sfP5xtlE(RP4g;r0=!d)U+^-dx^NdM@7FG~7( zVHtrzVJXE$JWuhDY+H(-v!8Q-p)iJUnDmEwe=5@MA<`I1-|`wtS)loy1CA+{M|eY| z@rCde&!xmzsKdQV!=BG4ybXA)c&(YF{QOQys|)=*jp0O+mwnWJ$`0!Kmgh-5sXDSt z=RpO_yst#+Qd_AbZk54O09e%xB`n!gch0icv%Nkwky+$El26uytjesc#&*fEE^)=j}20cj6Sq*bT}VFH_kLqc^a@pqap|Sywe+*7vpi z@?84^c1X@~9RD~p3oP$#+G-6-hi2zIOIc+r$5JPff30q`m&XpKlzMK;KUvBq0|VBr zUFsS>XdRkGRRYay3?jX8I5#4Ztm>mikIbJe>8(lH0L#cUo78lTq~ZLM>v^*S@)KgY zC2s=Aa-Ln{hotSx=klUxbyQzlht02TM*sf(RVUhx;4GoF@-owvCHUfrCrRy!WSfaOvDheD~!i_|JcQr>xH3fByx({qA!@^QZXx-@n3p ze}0S5yh|-)(s+RPtK5ZmZd}6GpFYB$fBzO5YAbR3gFoZbPaopl-xHcIpHga`-siSf2WEH-;|9kxX>&N)!bAjgjczkjb{sE+AAhttA4@#rSw}Aw^(bl#7Rpvhnbh z96aVVABpetN)4DFP!9>s57j>NFCpjbp#0IQa-Js^aDw;hnfPSY9K83dV%(I@*a?~i zqHiup!!35$x46B!Fbij&%|YGtWE`x>#1V;GOHe*VXg*b%s-XFeEE9%F9Ajw{?Of5u z-2;P$jwghsA&j8eyY~=u>(m8PMh-#z)UikioWN%@9+6YWVcNto=+#vsL9{{7ZUbOS z%s@QhHZFz`8)GBHrZ62UJ{hsGNthZC0zZF=ckZSp(@G>7*{b)bohN$r9)Q4Up@eKB ztY#aMO$6lxE3AZW3)4%AvLPcOhY+2PkSGg61bCxU5zXV6xOBwDr6N4oKxigt6Izp^ z47^SN+KR|vBf`Uy5FC+=fbe8YjY>s8Oe%t72(XbR%_li58u6jAFowmz7#R(7Obmh9 z1dAyi?frdKKD&17iKuA8YOEPiOgl<}vsj!7&6N06i=TB2;aQZJU_tMm{o(7=6_ckK z5SoyNU~@WR)ANu~P=VsS>6npOj*8S`RAiN*ga9qTJZ$JFg0hEN-A|&G`1*EIzRL+5 z7{oBY0?g3_=BRMyh5K?G&U}v>JrVtT_T`}17rxA|dDw6)N{Gkm*a&Q}L|~JJz-o%Y 
zW@8LC2`Ez=jj>pt7>P}Z;n*Y-0u4dfn;eZp1!f#8vEY~hGvD`PrHLFw8wk<{G!_|9 zlN*ov5&|`W@c7(ZLNf|wiGL1^Ad2u>N= zA0eIFV{48Hhx3d$lpTlaqD0II7=mS%aGak@Fs?|Wa&TU($||BoCM7HO5tJ1$>oCpp z2+Wt~nhDOjh50GaOu)X(^Cf!5)p=%IdxqB#p0#tt;+(ErK4cQ$Wy0~5xjDE>Futn5 zIfK9~OVj5lXuiqs#ghCc%dS9~kbINdn+t5X`K*OX#f@jgg(?egvR*E7VEbj=2H@>u z`0{K$G9#y`e|h{}UD2;gCzKJI_wCq$UE2uF?E3_o1(+qQnws#&KB$5HZaw=hkyu#k z*hi^3IjTR_`gcw2dz;`+m`LZJ*2M4G#|bwmzEJm-zEk>AT|8#`I$&z&IEkMLRLi}c zp;-n2`TICWRe9Cc>1YPbx)cLET?&>(ir;k0xAgA>W-?iwb{Ru zkj5_ru5@WJgqbk~N&;gVG&i?2+Y4^5c!}$LVE+zEwjz7uX_EGkHA!K#iq+x9h#fT zsmv|Q(@cC$>bdP?gWAb^E3tvuRVCof?{#2SvIFyvL9;^w&SGWOfmtMN!meiLwg%0u z;CXOKk=WK}OucL}G}2BU@yL~(W|rsw5g z%%A}XpE?Os#*an^KTksNUi|B?UlV+9;4goC6Mz5VbNulA=Y-}j@!j`d5t={8-~RRu z{`iNRkXX$1)%;=PHPUv^pE{FTcU*es&uHi#M^XH#G#-INDE-qa@r^3x$y?h>* z&k>sI58&*PLs(E+ik{40cS3V_4-a%C+;{WxM*l9|(VklYfRTN=V_u30)5E8sJTwps ztce&-IIs@sjqOG0*qv!YO`ZjH>=+Lal&edV)V97jiJzHKGt-J4*@*<^c(o9MxMpa; z?0{wk&V=UI(+JH>Z&#)U&9kG!nJ3M~WlXQmSX!8j)wu?&Doa81h~5Ys-UsIo?Zv0J z-^O3RdZZ>&{_n3p!Y608;%$QKAAe`T9fIW@!sK1T?wu84ZPq3M-XlN@fPTI@4d1Lu z#&>IN__jo`( zBs^=-Osy2?XCB2(WMwJ0Wg0XSq93iAflpV@#k((Lvx8;%nnd~-ntfmxtA9oGoW zXP(K%fpQz_3C!}hxh(h~695I8&y=TOb4DVDdV6UT0X^IlG!Gjw2}!0L#6~A$z`&94 z_w&d2VS`{GFq=ZAAbuLbnTp_Xzn+@EZJ%C42#AEiXq#PD1sKtz$z#W2 znc0jr929S|5RA={*kX!R(7c(j{F+SoON_w=gB_aVgRsLKhQ?eYPL$XP$`%6iQ=mCN zfiTR^36Ce({*KR0!)XHNxrz*&QLFN0p@G|a!g3vdPwEKIdyCVsDlHy+<`f}w)DVPs z>wxXKNy^H6D2vcsV8E0kgeNW%`bDoJ4+5%)~S*d98r5&JeP{m;JH~ z0`^N^t3=LXAFe^OTr25l`DzuI#q!K^eoSJ))#^a_?cYPQNcHVb=*Iv<@^v~tt>TuN@Af17^$#Saa*XbJl!KsI*thLSIQ$nq2t_jSK9iVG> z6PEW>p?+TtUzX}7)f1q_`mErQ&@B79V^`d%=5I)3=(6T;tJ{N+#A@%Z6i z@Q;6dMPU8{-~aFxzLh8%gy%PJTvm}YWFnw!Wz>D7;ec8U<)451R>gb1ef!V&@{5lN z&;Oy8(Ut{Hu3ou_^Jh=u*r5iTA~e6c=w){PVdz0{?$WLuy30y@uI|iR2ZRR(pj+pT z=-|;7C8iimj}Aq_)Crhxj)!H^Sd8xAhqdLo*qD`s>ilF>6>>}P+9xXk5|}HC%&06j z5yq_8ont06$6});8k=mfSZ|F{z%1arAvvDV9FI+@25e3@Vq1n;LGzYO8=hx}-NQ?p z{4%CT2fSL6iZwa$SY2jCX3!Xn@7E0{4(!1FKVMg_8Q*;QDgN`TkMPCCy|_g%{o|rk 
z++AHlXf7orm*Dn_LZv(6hrBW$k5^^m%Qflveq9RwO8u}d1CJM3@aUBcJS5->G(R9L zH`8*l<`z*pH0Rn~G;*DwS==+S@qp0$kb1C`0KJ6Y5yWpZ%@39dJTou6$Gs(m3Y;G( zV9r<2{E_%lFCi!snjfs-mQXIh{FwK7%k;$+P~crad4&M;bmoWoS|v+{&)|8p z@us+8h(9x-`IZ3l+#K9g5KU;FpQfPs>cSjCbB==Mx*6#>JTpW6b$)8PxM$c1&nb9~ zgBgKl6>`o~pxF~6M^8nHwGat$X&5?e9HGw_BM0?E93j~pItAwNsfZ7qjOgIW7}U3? zdWHk~jeyaZiCD2T6JF!vx#fPWf@T6H0WTmR41WHdAfeL)oLybr(5Fuy^y}A8t*GZt zVAotXT-B<1-MesNJ4Zp?-diDrtTdYOo)sU z_mC9Avk4YM3QPp<2ytNuH!2HtV5FJAY(+?%4Z%DoG$KLyaK}XvYzfc?0q(so5bW5`+N-WF<8$v^4;O*TJ$~1SQ3DQC2t;WjW<2%PK)xb_t4eid7!`{W_s-8+W)!ENkBm3cM2t;8Dto zZQ@|WqQcXOrCC|FC3*<&IV~^@L;4RvkB(j7!@8A787bq&VNFUhHYUX4b-tUMlcE$f zzpkJ;2CpSXVIl} zadVbfnhDS1*Gw3`v4Ehw(1vTz0I)$%zS{`NRswSh-kLA(D53hFI*=G^P?7-Z>KE82p6&lGpIy8&4m2b&2 zl=Op&X`K4~=G;TsEzVDTPEJzpjP`U|&PQtMOYQe)Dg8R>C!4szi6Qo~kCpJJ0?le= zWG85rzFbd7xaA38)G#*w5{!RL&eu8X{{@#Ft60728jNGJ%=!tUUyN&V-}^$isC&v(_NgmK7v& z_RDz^n00C2w}dQ#zLr*I4g9t{;IG*=&d^Nh0Ln6IVidknDhF+1HK* zs5OhLGc=2wACAcR<D-lwBDMgyEj{}%R`&2&l=OEJZ>?ohqktPM73X++A zGic^{YQP}Uuh%Ton#+pZL;t*m_<586bknmDNg3obyY!d(Df%CSW&-nG)_rwl4YqFC zhWT^np|q$F3+K+oidPn?7?Ja*m%-1qEjqMwL%{ek=-R;_qlWg!@87zF|N7^bcyRCc z_~5-a@y$06@xu>dZ5F@gZ}9CmU*ht`v#6H|e>EC7%T^`?zW1kh@%O)ejlX^WCBFFL z0lxb3F+RHYA+BG&i0juhojP?4CypLc{?+?;Zo|vJ_$dO%jYVg6`aM0o(XE}Q`hlK0 zW;8O>ZSeDSMa-npcqYY+EW&bb;AFgznTp}QKFAB2g4arNus6$s14YTGEU{n@L3w|n z345r$%G#WaeYr_$QqhJacD#0IUMG@$TfEYSmreR#qPN?0n0Ux}31>b)82;YAF1^)Tv1AO~t1Af1DCjNua zd1v)>LUK9ouGFDfEY1W~Lg%Nea`5%qH2iga3jVe(8UNgng->1)XimqYWdu+{@<+?5 zWu@FwJic#-=KB)b?NtrF6*LoO#ZOwB;FpK{%d!+OKU|uDM@tCMi*s>*F@c?c{Qj~6 ze6X|#A5wCBkDuQoG>c2fN7Ow6vxL3-Xc>W-&@90Gs0o_$C;~r$`4RK+P+6I$Q!{XX z74s(Xnb%a{&MV~vWOm4e0Yy@Y*RJs8}2+cD2(!;|Yu52fxM+YJ`xtO4qjxn;LoUcCy_UVC`!0|{T zFsDQZ!yFNSIPMSb-&;MyVME6u$&y2OO+{>irg(wpI00vyiXssb5<_V23^xg5M<5hH zmIX&9Oqhgm<0oR+@KNZ|qc88*9&T>#aHqVzebBj6XN(v+48Z|Ggjm9DtU>cBjxi%D z&dPi79tJ`)?;Rd4A?1=_HfF)d&&)<~8A(#1;KX-1gs?d+lAz4|gvHxnd%NJKQEEf48AWk@TSh2)Gf6y(mt%%Zud$g9Bg>@t+*mY^V~5XqJlbn7aU4JEor 
zJGG>__)klWXM;?BWL^c9#o8RrGQ@IUEX`4b{%C^y_;E6b8HkQP{_t|s7B4XjABJ`5 zX?V?;fGyT&Y&L7qyqU1P$q+@Dj>LxeFl>ko!Y00VyQ~p7n4ie@A<%5b3FVq0&}_m{ z!t&t~0}fL4R9&$V`}5)zG#{UrgX6Ph6+Ue-5?RUb)XY>Iok?)!KeO?_D;k50+b_aA{!yWOctwO#1?LaW3I_ZUNzz zpggOD&`ebbG)p{a@j*@@EQ?>WNrPs>vj)t%6**0VYo2>)W~PdrEQ!6RyR|e4WsJV}S+N7g})bISZlLf}78z;T^t{w+P$U z=M%8!@!YxTIJY1T5BDv>cW3wFPdnBkyiZT{o9xTy)wf$W%qlBU{nhU6TiLIP6?u=c zFbgmrA{QJa^Jv!Qy##5^`mFuaDr{p1SRI(voF|QWT9y4f&BapdV>MRcR{CSHF3Z;5 z_p82C-ACZhp=L%#rl*6lvlKLwZ}tlVk=GQnDFdin>qtwurBz_2Wbp7bVAi2o`Ezq$ z^3=p7+}9=0uB$a@ZU)W*!!4u$xi#t6w9wO_xt4V);H&;6U>)n*|CZ!j`+X%yMw@+h zOS2~35-DJ%LylC0REReDPb4rZ-^l~kMogL<(5wUVldz=&ZF3+KIiI87yE!yh+FeN; z&@7`v=e&xv^hK(%C}`F#tFmRIahJCws#pq$$q_+igO7Q(R4+y>w@%~?K;-mY2#$SKXVm1HpJ)xOkc;VbB1-r_!ym1NFu3o^&6Gw3T*dd%d za{_hM`&9&wiqaws=++%Q+Ige52LZ&@UF9n=Aqqo>^g+K)-Y89sLt$tTQpS!#g&_{v zp`jQ-fLfEEiM<(C)aF^RkDb|`{CMooH)3~zK(hr6#mVe6ld#Dah4toeLURn(TL{dS zSghrCttF0%$68wgHl`Ysi^h&j6Sifg;OFLe4Diz?kxlB?1Cn(%jzzjaA`UoFUi2iiwVuI$Y>ah7Q_*1W8#yPt3*_sjr%Exh_WD((41t>QA#vqC`)%3 zK{!+_%n=k}Gb|w)5j;OkCe#r?WddW2SaBmQFcJ1G%)d+w6u)9wwtH$oFhM;5;n8sf zX2LN+R=`=d0?iS!z=`-eQ*m)-^y@baUOt_9O%l>Fi;$ULfecv=yI?jdisoZ(KDU`= zD9QV%&kE4krIigoQV2j4VvruS!I5L1De^k&JvQ(&k%q4 zRGgYcXr7&l!~CqVRNOSO`MZ&g*9$Um^w-a#Bw!?=*&j92({WtfL9!G1TVqD?@ZMO) zyk1&ZgmZ+`OB@(pAb4Jwm#-Gk)W(woGYPkvZI(zF8Z>9(Ceyt+ z&x#xKlL*d9xcRJz(7Yg(?;_9P`+1e`sfwk}^Uv|UytjKXzBsiHH{Mu>2|c?jpJs_y zKcH7{%$r%EcxCU7tqP9n*zYv#+mD0ffy28A&I+3M6QV_xtwM8CkN}bX*;C})hdS7b zecluOzTS^&9HqO6YVNF#be;~)`>>b%vu~$xkj6q)cIJ^j-Pz^z$&kYoKl^vkTRuMlqU}>pV0l5`P`;dF;b<;*G zkaMN$(5zR2Yz5MEXx4$v@u1bieFB)JLL3#O$pL!~Ak*bIuc`9w?gh=Q%+jC)eg&9+ z+|sP_tMbV6^m5r7rQAnz&CsmK8W=lVG#t>}9F%n__Y9|pr(TFAV3x+rvf3rfZcF;Q zZ?IlwEsu6!w%5PxH-k?l9b%CaNHA5B`s4oBJ!Ij+_3 z-%OG(=1(;$ZkshWUL!A7Qhud2jJLi6F8YQ%+v5JuW!_`m_^(WMjoeLb-H)tB(MA3njSAHRX-)3#}DJ^;YRG;xec4vtwB;;0=jwnqr1BgdboQj*O6{rI%3#}0q}BZ zgUGQXQ5YSGw8;~ZGI2cSo0Blc&kr*LCZQ%b4f|6QP?ep4y;*VC&+VQ90yI0-+5!t9 z+M=L&qa_NP#5IG!ypFKEE=d%RHQcV@@%rQhY+^dwGK|=oVZ%$-MD+KOzvJ5=U|@GF 
zD=;f)USFDoim1sL)xitv7X1?My?q&9een=q%i=GD=kMM=f%mrj6#qe(d~f9p+@bCg zb~R`g&@IHHRfYI`O)kD$mx&+Nij_GP|J;~`FIS}E;UbFX-djcpB{(bBj-{GAg`^`B z?;floOe<@%MB*SUugvCsatP$y7k_3Qnio5u`TnAO+A1ztTZ#K`Q*X{o!;R;2an5dOJ}6+bKm(g&1e&8|1n1IJtV<;{+bzwq%B7p9 z2F)3%(_ys~VM<^uI(F)gE*4*K^SPGI-M z=rI$SMjXN-W0gf&S()QRgl&R*G$A=WDnVJG4a|@DZo9a7WBeozL^BGInpuJjLUV4( zT$Gl~$LxZ+n44RHiu4kcWfdbcJqN=_ir;iwxVyDOzux^Yd2%506%VT|4e<$tE_v6r-#pu_igL;3w+PPzJ|9*IJ;j`>xb`dVOYtT%XYTRF`p!v|AYNZDDKVo?n zmko)ap`el5Cg77uTK2wQ`1CectT4f1_GH7w59bGj3j{C)LC?_WNm*C8_GCK(hj8Ug-S51h%vhi~bPw z2g>slgbF~hGE}8VB|a%y)4`IH@0ER>syqzZ{LpU%1SiU*5P1nEsh^NsD2bCj2fe?2x4va zaC2pc*aaKbF2TnSKfs+2-o+=6KE%KN_6`2?@899mhxc*%*da6$n8hulv7XQUfLNRN zGTnW+arK;9UGSd*&fk7cX#N225}MzA>pCu7I8A6iic=?#Dr>UMjy?pJHt5@-1JXi5ksTTW7j_ex_oNt5 zoso#j^mtTe8n7of5qtBE1Zop13#{0j9HaR(6OK0$ls8%uC<1U&B32t?@v3yv{BA=^ z99|bUjZ7PUV>MxruQnlZ+MpiF(!4I8kX>j*S=a;&Z0Ck0Kb?>F-nxWuzI=$UzWZEF z2>kx{r|{?3e}+FTn}H8k&cuhy%Lujwgl0mk@@Fm}G#B7YLi6{8=D&(-#>Q0qV`C=1 zAT)pU3dQqe@}R_07Eu0hX#qv3B|zUJG>b*|oLyhR(vp+hu z_s8TBLlDb>R8m9`l48PO<+B|+Xb_*F3#Ns{z?N1FlO-RC=3FEalFdoE1n+z*7a1AF z1mj5Mtj;mflFH-bne_+`6dU4L?V>;2#+=(LfkQ8 z39rh+orG9%oghFb7_o?iYKJaiPy)+}V^mQ&M_l|KXd#j`NIbRMC3HfCm*p(48! 
zd1<*AHEOK#?{;->hk*l!5P-u8`w2)*BfQ(vSr&t`UdIrcWziG?X5Ft@EYLAg@dykE zLZ9A!)dDE4d>)>j+`77=zqb#5ktiV{%>-w@r-bG;36WSC8-kTl)9`BKRIG`df?XW& z9>_{W1HrW+&wz$}gL2b2!uR|zb*R9IgM2sZBsMeQxrX2@R^|r6@litaQ3B*~g6WwG zyQO(X3Qo@Cz-$)V8Qa`JJ2dYs%)t5_8;<;BCZs-P*k5GDQU0a~IMzrxb*>fei68ZL!z*l)8BcJLb+iQ3EW_LA53se6c+ z9=8I!dSC7g^v=?tnOi$#HUmt1zpkg#8W%_(?pTk);oGbm%k=a99W?8f-Ikv#v96n4 z`y&!))MCTwaRyv z;~~n+qP;X4G_zqVVCFsnTw6JjFPOc&+TSeF@TIOC<(83~l7J}l=4@epLIRa?9?R^2 zW`~^9b%@W6b+4YE>^FO!b~SrWt)$UfUZ^J;AJ1(G%$n@aTk_uo%Fg0F)OSqRUYBzI zem=`x1m*2pw_+=`lkH%yd|zFI`nr159w0a$IfhMZ*P(a!?&#LJ6GjXZz-WU$Jv(E~ zt4r{Qx3A*XwX^u@V{uJMT^$9-wkKe0BO)gRA&K$>yCZguU5DREqKingy0)2+aw2Ey;k*7L(RVQ)^8LSSg)!VkA~ucutxDuV)A}n=!nD zpQ;bBG%qeNVNEUpJ1+rcA>+{Btu220>jI(qfr92A2+iNTa{_v<*pouJ$B2hC4%O9=GA{YxeJ416mfw_R672v-3P|E~FekOT( zxNauiTU3f`3$k&2ZZ4r&Rw7F!C|d~4X1q0vTf)OTGn4VoEW$&D701LMl~8g1C&f51 zKO090HAl)*3C!teq+U%;!~ky%nmxQ^$!vd&936zTl<7#dl_P?nDJ#yk_aRIV?1%7) zSx)Lw5zYDH$b5vKA_EPO{|@x{F}57QjMy&dZyDDO17;a~akkynPYoMHq`4pP=-0p|XL zh9EdNoMlc#e1Zwq5?r;}> zXu@+_4%RzxfIKH60-IBl)w0;@5+krGE)1_m2V;5YWULCGifzUSRB`ZBn_)mbRhtoq zx~w?VXUF4UHsLwffW|xmHNm;AAQ81iW>j%o!|g!Uc#e&PcS2>2~)F?jN0* ziX#>LetIVA%QCU6fCDxD1~kquz|4t55z*d54Zsic9tR1{)f~_jPZ)$%ITl=eu^boY z6;OoZ1^Ku*zff74FU~C_2+O2Efm(^FY{R8lq7=%C%d<`V+>G-xE&Pr!&4Hy3%$j5( zAVqM#G+TfMo@AJ^yen4o-h zj>LwRdnXgx<$UI60ipSs6rAUElGkf|XOA%5zwBIwFE2IX=b0Auub+yK&h6w$6R>{m zYX0u;!=4>)5U>bR?8EEX=ZICgey>=W1)AAEsQoI`+Shwf#vkd^m0`!B)}T$7I@jJ` zN?)omitI~YD_iz|>^n7J7D=D$+#ffGX05-bnBJ4nEI!Su*_0e4Vs88hZH+oXqIdXpM(!o|&%fR9PQ_!p?E66@2KMK2d?Z8fMcQ6lo z_%iG#G}qNP;y`UZs;X*m^w43vxbS&+w)4Qi0RzybQ)l%9IehQ{?AW{>=Z@9m?JK8n z_4Gk3d+8_q5!r%6wR>@>ZZE-kkJ6#~eK^|4A4$F-mrftWKfnC~|N8Dr{EP4`D-Hhf zotp}n&r<@-M-MgPEwKPSs2w{_!>bvsYAZR3i5 z9lOI48Ub5i5R8LIqA*|@;+V(4&i>etZoxJaf!UgbeQ6f#%}An**q=pc&b6wgvo{f{ zHxrn*B$=?qV!`WHE4J9I1Zg8y8)L96J`Bs*Ij^-5uv0Dgt<8clUA4LppXOHx&9CP0 zyzCeS&4b+9;MeoY@Rv6^sQCOYzWL#E{QcXH@%3BB6*RxIh*0`!1tGRrEjDtSz$sGD zygDCWuFJ)Dgy!!F&41gNg6}t^;e8%|xRMYppAo_Z 
zm_JxTP$m?AxJ;m#TY=^kSp@D(++UuK2L$H_Oi!X=++D)^FD_KjEV@l-zC&mh_l!H@ zrXf~sv4qRQCj?=!G~XvQ-+z_x+!S(7wqk+icXt12%{?R+cV(fIH5K^%FAH&1CKk=j z!_7H4xHTsgZ_P=<+jC5Ko1pOaECTZk3;slixK?4osp(0${6Y~f{glu=H;d4mfy3nl z=F)UD6lY)s2SWYr(Cp>y&4H9Zh7O;K6iX@6lBW~WQZRV%7`0m2us*#IIA%1Wrv)M* zB3v!%)4OM1xVU-~9+Q!hUJ6Tc5mg9VN&!++OOV3jWLp7BO6DOTAPU~zozbCv59OxO ztLFfCwDZTHfg=gVDX=7FA~7xn34~cgbP5tFBkvzi*p)~aCPJezIuUUZ@d%s}tYR6< zq9}fTom410u|UhVLq`I1?|~RifDR_`hDQ*dC4xr`VVR&A8fm1&$615v&1H(Woo?Ve-Izf*7I@Ai#AYX=F^hnk!>w3> z^G&FeNIKkB7no5;z-^e3hT5_e9O7W?#PnpGD^JGdvQ%6uOT!t0^KpKEd`2dYmS?HB z&HD;d@M>~2>SktP{-j}u^mD@=0{Njl1L|@N*i&Lf#)#fnnag(j(-{Qh0s<%@nUH;P zelekynwzhzzZVIs5}R2j{9WdN@iL+LGTZ%Sam(O9@;m|hqWC%UTs^+?d4lo?{@I!@38y^cI?mr z^XASW4DBOWZig%pT}4iizFe%V+CR?PJg55I)o9#Jc;;5tbZCNR)t5`3?~u+7Ev18^ zI-h;G>WkS=JNQWZ>`7_Nq)CTn;VTjQX@0Kt<<8K&qZu@7YjJ2Usx?$I!y)tHoHt#P z=hmRo8Je4Mik_b8!!-#@HZve9E@Jvl^(0npeP3rOXJCG+>@Flt*FHsZPJJZ@L43{v z%ev~A2j}uR$|>NlmB#_iWGX_lNY->{qn3gese)-B^E7CF;^EpWOgol+rlHQ!W@|ds z0-o*g=#bWs9BDOIYmls~wey?QR1Sw64_EdzHAVgqIJ4(NZ#a&MbC!J8{PQQFnX+4% zm3XW-o+fB!U9^NUnA|tvq^Lds2>c6*Ti)`O8e|SYkKtr}l7vPt|_x zsjkAl0|&6L`T(kG8&FkKi`tq4XsByILYzTO@EJOI7`zCo0x3Zg$MMD3ijxg{@$Tj0 zSn={tv2o=hoM^1X(fYkO(y$Ll8mn;P$N`)^-hlHb4&p>Z4etNt_xMi&^FO}*9N&EY zh~Ru5*Ds&P*;7Yx{`3i+e@I!G&mKRHy2?sy-MS60@7Rr>zxXqZlD}|yZw~?HCTQl5 zx2q?jCQe0mbR4YXCc`jf7;++lG04jW^MeAhD}(VE=2-Ds zQXF10#S@ql@kUY--muuXPnZ^8N}CxQk`l1e5Q(J*g0m?K>ue^x%ycI9(!!OE?b!h@ zvtwS76@^#QBQZU60)~6G#V=+TvFc z8L)_8UqlerEzLSKXKNN`!m-vc!|w^reL9+#CaKzKL-z26879Wn;pyY)kR|8DT}?E-gqUj$8! 
zfHfhFP;5nPs0oH}3k+d|W$q`0B*GACfHB+vldKdNWk90DN>*+O!5A}o0tO5kjP5;p zp?!x=@bdD3hldZmz5L+g)gE2D_QxQC|HLUF2nm;Pab`rtCvlsE@MyDg-3W`aAvz%g zafJ5ve%;ZbLk~;~j8L!~5D>1wIW#nukZMpa5^-@>f-K>lurJU&WXLG^`gCA^LXnb^ z$+G7lJ);OYc{7odKO4nGb1;*@Tv1qtvZ6AkA(2G1e_DF?>4&h;D8$DZ6)>C3OxI*Z zOuSKp=6DU7Gp%@Eg>ph%1yK89uGprLI$c((OGJAr1t&n^-No9RdP?1goF z-(;S_S_22JF`;@hszptr={d~)H9z^P&X1nA%;h^SR zBwOeHT9=)fc)zOMRg?pqRT}Ic6*RLfPeQYH-1HASw>5jVspF?orw zS|eM%JUaX7z^tsz0?o1!F+rAe${`)6YISAQ3< zVpU@hr}LeZ+UGeUsccrdT9%m)zD9g~DQ9q2X)z6LUm6bY&xfyM51A&Tv?m)_OWE^o zm;Seea?2oiIvzB)0;2p*%A&-3tG_MmKrCCW0!2JekH07tX@_WKsaEH+e5#QMAZt=E zEnEAszfo9amA-!2|IL$hkktQ)ahiw~2kbG0uosJy5G z$=VCMUun16s!=cNtZ7T>fAy#rlJ%sv_I=eZM9hm;udGA18SU>4*)ngE7v{N1@?+!l ztmV4q(w>nTjLD14I@6XiWx1s8_ySb&1=v?vg}pV^gytGTb1nAO)S>bKq4_{PYO1SI zUwc4J7VO`rKRS2pjDh|7!;`@3=k9`}xNy|%c^y|zHQ?g$8Z3WlA$GmC9OsS@mK*os z^x+zuIZ}tSN9%F^_+gwsc1Y>&dw;+`e)tUEe)Ry~eDM+P-2MYDoI8dy0?y}8qp_i0 z`B_|-U^TF1vEbK5QVV~88eaTkrNlwC!WTRRZ`!#a{HYFJd$wq9kmk%$jV&ElHz_DnNn#Z`9tmlx*X+Am6RVs@6YG9M(+)Dmp!OH%kOqA}2022N}b z?9|(|^(5Gig~^yt;4OzSF+=$bdwF=GSC>v~Bf}BE!Btpr2oeoObdZ?9zMWx9EkIgk z8G*SBsf6d$^b!Tj>1n0N$}C4g!7RkaS}|=}6auD(5p)US-W}D#AL63XseN}$n-qfh za3i9E;|a}#<FcO@TA_$cP*Mt~;7E7pXj%lO283Y>w}GM22n~-{ZXfZoOn71{(s;haj2<^;GRxqD!GlH-m~&Zg zc}PpiM`lJbazRi-U=1#1W38V_80HDvXvC z#3YyzpJ+wAQ9|m8C0qQa#T|t3OlkhjCQO+eLdfsL_sE5Vaze8Up_%WqEcfl==B{GI zkM#G)E7l}JbAnpLWScDxH95&Rn4gA*ObZ$b%!iUq6yY`1h(l>6984!XXA+i0Sp@1F zGaB*;%LP_c7F$tOmQ2~W&(BJ&I5IsM=gU%XwKx^mi_&ntm{41mfwL7^I5{&5$7ZnY zP0vP6aRvv`F{mpv<0q4cAkNPV`wPrCQj~y$MTyv)YeH~GKmI;gaQ)|V70_N;P@sI8 zFA`MGFUZ5W1q9&v1ljp!DuDoPz?C_PxKd%p)e0M~3J}joAw;LEmHe)--HR?NU*!}9 z!siLX=VlY?=j1W1LR^^3v}b0jaCx$H_c;y#&dYX2I?pF`&*LEV#WGxZp%mv|D8mtg z{I2p0tYW?V%oK{^z>zQw>y4mp{ut%yf^PhM6)V3i8YC_loq61c+ezKLv7pd`|G0G# z)AI{e8|c)zGnTyaJ2H>NGTwnbySAZH`u{5SNeYhG?~sdX1a<|SCAyZ9-KUx0++=Z< ze$C#eOW)qy)j@#HAq|?fbEV(c7)Dq}>09|;`b>=n6fCpP))p{hI;y{B zzbue!mwv2qkASlV&(-z-3K}%amgN@1A@3#s=*lwDYH4h4^`){^|I2*TQeuhK&((~K 
zl1KYixI{ivOdwmSL-zN=I#PD20|mL7YWK4~?NZ0hpje={g=8?J_T?J6o^`A|0PJ$A zdmWfHXs%S?>@0EFXy$4wWw)0}ahwzL9U${*B2-uHMjJg;iat`(R$^nJS(&<;th7xP z#02=TWQj?UK$}#goU4y?q(SJ>9BTN0oh_-_NQH&lEsan@x>YevfkG{9UD6=czU0IH z;KheogJF9Edy4clnvQv2{WFnNiVn@XRAW&2lY3LrcoYOP@AgV`9(6dYrR7nF_X?3FINz1d;zjH_!1e-)v*n$TR0z10V(It9%84%A^kp}CUKTw7O*sw%#egz8`X{HO5q^+x}` zz0jje2Y9=;g^ybs6lK|1Ut4jcY76$h_6mNzuoMS&ti`#*RXBC93TKbh(WZ?nI<)VK(vn%oV`m>aIsg&F#vwT{5JTPFP%&vd z_7j|U5tQ~M5r&Nk*v9SaCIeoxB;qxzrq`_|0yDu`L}=bjy~Yl39l?3IDF%xZ!|^h= zFWW3Ij~t~eoc%j^H6fVieMoQ?U=~aB9YXV60<&1H?<^N+mRQSqgl2*>;aLFsw!rgp$7Dcp(fEkl zhlJ_JgzAqMXF9^qy^@1Fa@`7E&wIVUR>Xbk!?lF<)p>+x!ZGusL9Y0ke*S7w2X2E&FPubk-_84j8YU8 z&P7i4bOp^(Q4%K2z&03z0sV(4zhoc6tgL+3xkGnMo)Cy=0bv4a0?#u<8VS(ch9|-t zVOBClnP9NHX(U7>z)FbBA;1=<7a~86uxiOhVq_AgO^m>p5mPW^&~WtZ*+;o;$l@vj z(Cs{Y;Ogdy&Rx2rSMRCPo4^|c3v3JX8^)OqcDX~J#|_nrUpeLC@dbq%!9;w zHnC1*agr1(2Nc7_b;Tduy7qz9k_lUKK2mH2$V%s6A*~3-Ib|r%E9YRf7$y0|u$Zjq z#(OGQ>eLO>g2L29zjzKv6A8-+W*eg7O^8cOK^#FoHZd771_`HUBiw6~4#lz^$9mWTU7i{Bgyn{O3+nT2s3}aw{t_GZ5}5au6P`;5-PB0}^MxWp zGlBVPX$CG6C@;#&fV0wZY*s1`&Pqc)p|q0s-H;TAgL(1z)uaJ1_<5tcz=RXU(KuF~ zfX#%^DZW1VxzR{yo`Xwl>sJWMve1d>{QNAOpO-<1CKxZU5oV19>qK0dX(B9Jah)K1 zk)V5tZT%`?_u6bx7B0_B<7eXY%zVzY;R3hk1)2qx7nI=atQ>w$xaMH=%52_?$H%8< zqLy{_MurLNk`wTXDGCcCrlMft2*ma8feD@c(AV1oeynHt=THm)@_Qx&6|wNk;xaw> z`#zPw^)seSz=pI0G(MM&$4B14!z(9X2@hB8pm*1u%_q*#XE+gYed{9NZ1hx9&QuQ!LBt?#w$&mH1%GiY`| zuS5DZ+PTf@xv9Rjx^})!{{3w!{W}fdwP&L(?@fME2GHiJW!frw2DgM}j;8E#fU{UU z`A}I&8Z?UpgqQ#;rL~Hc&t-HW7HP3Yie8Lizglr)+tY87p8s7uPl@Gq0_WDCxyfIA*Y@VJ1G6(U>uL$j&4F7YoA7f!Z>8yr;Sry9vsBM70fs=X&g`u0d5z4fYdUYicS{R#t?* zy}M(`fIjeWZ42LaF6ij%iCJZN{Nem1R=xZ@ezu?tzg$>>y4|ni{Lu!2b1k7+Tr=tj z%m)e12XXosLHcw9?%jD8|NQ%B`1{{J#ryBSjU$KaaQ@sGyz}#%KaMST6}F8=Y=XZWvg zAK*Je^M{o`!+)$u#ofh5JYJrPd&|UvOz0G7CKNv)C_h@8flmaUS7+i=IbM~8j|4~s zmhI5|fey_onudg&({yK90UCHK3Gq!e6NJCO0z?=z_L}jX1q-pcx#pw@655{U4DLZF2DOl5zfC*fMXQ| zm>HRbX2KGoxi%*juf_*pe0zV*((L9+`0~b(ArsYPz7#^U&5}#7i-L!jSc5xZQ2${V 
zGiss=6Bis31~)=x_g({#kuBEcQU%RQgyzi5azZnqvGiF2bcurI*jR#JtW01tAYf_; zdiCrNZ!cL5&|9^?_WqqQdh`T@1Vth?G6C^~<9M;|Mw(!ZBCtjiE-CG^MDCkmGo&ER zoPqSDEW2`$Dsh?hngm43bgCa0#VjxD07)#I`f}TD4p+kqR@M!15_rQg4*^c*@ zs1v@JEU+9Hj>!R$2nsP^T1X-SgN>LnEdhy1c}U4DCERAAZ@;1F(!Mjgb?$-K7$edN z>?tV)NFg|I2V zRv0*mz*>xh<@q==LvxilSdgkL&Go4!G^QAEC^eBHG^d+zFq^PUn5`}`qNc=z+F~;S zG6~g17F0@n_L3A-6(yrK--;sz$v8t;K3AHoCIFr#D4(92fwRwK;rN^s9G#tvL)_LA zQuh&NUrmX`p)LxoHfOY@C6rFe60 zDcgKLZp_Tatyvi=Qp`=Z_nR`wZgw8d^EdPm^CKbg&JwOKh?tKHg!c2(lW}QADq)_` zT&lT|T%Ml6_oEE&JU<)P_?vrrb^!;IX;_yUgXcoVAai&>ME2;2@!swj#KA`o)}w&1 zZb6o1%4J`+9jNYY+hCYiJ521{0meSvF@4Mcth7br$bxddgU{d(Kd->szb?cl2Ug<4 z<9iV?PWuf`O-ex3&aJB7l73G0*OahM&3^Wo&4AfS>|Yi8ILW~Z&Ck{0O_%d`(zogT zpUy2UCDz)OeA1es>1k@2cJs7!a29Q6{&%zr%@U_slanihoogJ@%HG6QYF~JT;V=MsN_eCqit&U*YLt> z9#HD<2sRi6Zh9p+Rk$6RRfEwX%`V+4EMP6*EHf(|7G^fa-J0|VBjBvXM`i)`>Z6aA z4NNbHmT^5F@>x^O6HlAFe6` z_jT!c6WL!*Ehe!@1Lo$?EXP_y)apdikq1dJJ#k&o#B|lOu$NP=l{5&?ntR5NK{MgF zHE6bPe+-%hl$%}0w3OvqrSE)S%{`>GJUFZA8L>QCdtmwb;#9SO=G`@f=9)T!a|3o0 zngyWu)z)J_p}DH28vEs0RPV*lURZ$q>{N^$r7e2m(asf}I{2a>%Zj;Wxp;Bj45|XZ zetrRt)>Prbu_HJ^@I7;=5hoh!l+K?#f>X!qaryjFeDIey@z1|~j;}s{fa6CR(8zOd zT$T9EZxO6dp!Pr&cCrpuu3SN_#A~m;hF|{jR~R;IxE-9?>8d4++n`skz9=rOfQ6m% z#33UQFmyO34eGBtgp4sGuqQtcZ^Xu9XQBaHEC#$TuxtnCO*|H>^Xr7=H_Vi>G@GzV zL30vT+YDG@io{Ea;dqJXW(Q7He_eL+aL4S#5G=BV<2SYtykMM$37uS#$d2pQg?fDT z=siO7WBl)L9^$WmK92j9FXGNhGd^Ati!Ya1@$pgu@ya~HC82o@wJw|DXKOR@5uy1b z0<(N~cNJk+Trvov0?r>0qCezyGVxEMY233z^F2cIT@k_gj#8c-nsb|=S*+XlSLo23 zt$pHHd6*C$CjP`(ELR1J%3nZARylk+_l1vN7j!24#sCpP?!H9E zqaeQwX=!;ZbD?s}$l^9Hy9imS*=ohV)TC79|1GhYeZ4zi(j)>iVL2o$2GQ|mL`kgY zXd^*66{eJI7?X35Xv=}Yk_7|d*277Y=|mYqKj{ zsI~+>SNFEc@)z>%3HI9i^KLq+Lm%(bFE(}X(0a(${n`7t*Vm>aW`aFFm> zUy_8XawDpzC!(gzM0hr%s@OtUwklvgP>@3KPQ&S`jfst`sv4MQbaW}di}4X#}{jXiI?&b~$K*EL{fpQGf=APz~N zr1ODHzHed$%DK;SmNV1n=ZG}06`*Y$gf(}E=6vFOy`IL?B>kxKar3mCq1g$T|1C80 zJJywWB4}bhoCX%cA99_9U(=#h@7FdNjfOo;ncxXytZk-wPPp!NcvBM zW|1Rr%qz>*Qj#CZml!JB)a|cB-Tni-u!a{1IO~xf><`%qngzx*UzaB(mD2Pu?fZ^K 
zCBVE>NqaaNoJz&2n8$Wgy>2RhLIR-9g>ZTh&iB=&e=gTLpQlTm!*rTAq*kC=Pm7I1 zyEi|xOTAoDiL$lJ@$l@|IN$dv`PZRY182SNT0*n-nP!dWmghEWDAI`JGpF)ub)j7^ zY7Vb0Ky{}boSnf}B(SWu0?NDW()M+qX5H7^aUB7gX*trD7ufmUk{9izZcXY1cdmQA zJkHQuCH~ABH19f4hn+Fk?Di{>e{pwDtflojif#^|d(OP>WLs>v6RH08Slk zP;MGmFPy+lVcbKzgPXwAD;{W^l5&r&%lX$rIdE8wbjW1t`!1qfm z_;N`u9L?jY60%7DkdlwIYF}kuz)aun&2$3e19pm zMBCn5lHUZ*0?jf(P&?lN&Elf*U}=tu*DT=t=#@-7SVBmbC>yH&LF#GD4p$xYDw&dgN2OT9BY z4Y!`p!Ht(nab$rkXPkl~1m}Zg791%vqR|?F%~2B(+PRYgD`|&fc^x%65Q$c?sFoqc zmaky5Tc_^m;OUE@{YPQMu<^=DEQ@$ZxVhn@CLt@Q9LX~A&+gBhmRU-GELCn9IXTl| zkg#!tSDD17LZ?vz&S8Y&F{3Ak<;qCXIvWf2p1PaiD?^Eg>zdkQ8@3c#ev(-064 zq9*7CO^ZcXumPb#@d%z8MMWYcAPOPe22PH|)JYKp<8Vxs{mD@Xm?EErVbVAb$i_^; zxY3g^lJGAPI0g?M!NFobj2bzfFgpb!+2)508-oG;Mq$u^@htB&Ud#Ig88KyQ3$vn1XBGD}2=TZpFgyw`K^yxiN z%|&SE?uh~Y2eJM%A88jC7rw{hZYA>-+A7I_Xs}m1Y%D6k(fJk1%6zym9SvDYs7p^o zU7CR+I1`{7bCS@QZ$V>WG8&4ls4X+#Kv^OVlo)ZKm;hZwkS-E%wxPPfhT8mOH0Gt^ zNM1T+L7l{I&oin}beDfxiW3#G&`ByzPfu00!5=?~j>MH;%*WXU#W?d!F3vociL=k7Iy4PK9i+D{o1p6c>B2`yh%vDF<%x#5qA&^uJb*7drqc`=X~Kg3%)qM35VWT0#Dwf zeK$XxyL24q`5RrieQPtzX=@$=mOe?bffxeWPYEnLvqJMe>q+*|&k<=bD_iNG z^?e8ADt~6hB~DwX>zk+Z<5K538$X*tvo;S&G1*h0nPR<2#0i~kw7Qah*-^gM*ibVZ zYSLU+1ZZ1|;41#g&45`X`~QcsGov#QJ43QFG`9xNnxAwl>Dr-LwuI(-LUSD(Pc1JH zXm$qY=9Mk0;Iyn1tzsNfuqO?MpDO@W4@8G%8P#aVBDvNXn)Qqy5ZMc_)Up8f&zs*@ zYef2SYv<@v=gC8FYDAh^gJ$P6)wTA9(ge;euqwS;DEGN9u; zt?@b4he#f1)4iLg_iv$D7oU%_WHxToGV3hwKDGs2YGNo~Nc~#PD%}#81%gHPJW%4Y zBa;*L7rObmPA@s7mqV3Bxo1=onkyAF>y~DL=3NKsv4hYo5@@dE=W?D{`|J1bK<)0= zl@8VH!Qz*HiO!w;*_nF5jleo=-~hx&Mj|mf7OyT@iuEgAB|OhW1Ft`QxRJnJjiU#u zafsW4b=3stMjWiI#1TUAw_kpOMnZhm4&I0F%!T78@y@lYxOw#gPMti2hPo;&Te=9p zed#w?!E{%zUXS0rvd^y{aq*ZoEg0iQj>ec_L)2f9(Y*mCL^Ia5gVyZ;*KHEY|<>w^7#gf5$hE+ClZ>Y@mo_QJDvoTMunr3 zXFIhjVOHn_{4O;HFQ-P}*VZstM)t(G-d?EMvKZfe@)!L5%ZI>MpW^TDoWX+~&*An< zVfgChQ2ceV1z)|Ak588u;GwuOtj@!2vF0k5h&+6-R@^c8*{VX^d9@TDtSr&oG6b4e z5^4#-%F-;0ju5`@6O`{Sc1XZkw=`>uoj5Gb_jG7pmV<|c=0}S&@$i*26~Fl-aj_t< 
z-&vncSf)10HV5yo%fWku=J)x%_%16@reqSMxMSR1#pBf)NJ}J;2dm2Q<_pOA>_Qg{o(1;LCM>d?)n^R)_8mn?9f3|Ax+zzcT*7i*ZYhDe6nXh&gyvGE z+}t7*a+{u(iQe6NY3~>7cEZF!4j`;*;1C{Vpp1w$q`+*=)j(N?Wy+jPS#w~rX2F=0 z4wE?rN#8$1dwtampT4-6nY2M2~>;DDiU^YCOjH~k4hCFSPJDk>Yn$ZIKBF2+cL(qETqY zK0@feyd><+O~U?c4yLIZg7f}tlZwuA{UZBtF?S8DYj{M#0lq>7Zzry*v{fkb$xyY0o#TLhc{!% zvolni=+(D3{_uxe${3?|#9 z`K~EV#V5iHPtVJb$#MRZ%p+x&0f5duBAx4+L9<;JL3|-BkS|(CmENq$l#F@m5`xmPb1@iv*g*+AMOm zHajZ1B{VB{03~T4a=zAJRHI=7P!GWgl67d-LwraY+Uk1x3Vi#UNqMnw^2Ub!c`-|4hH9{)3?TSZi&gd7d2C z=+~(_YpNTqPUS_dBslNq0Bc`m4fa$Wz)r&R4iRB_XH7l!R@L(RYE;X}l`mVvo^3d| z=MB{D+>FM3+i|dFKc<%!sW{H<{n`_pT`_+2IHV*c3 zhZ+<(AFiuGLroRycsEuqS;X^=*tPWy?AfvnyEkuRTdpKP*Wmj3vv~XZWn4Ia9OuuU z#9sa={rqP?!z+sxW9i}*gy%JQ{`nWsOTxdg^YA3Z`g;>vygV@~FhEUE3YZXx$;?xa zj_uKjKlHPwPC<2E0k$R>u+<>G%>-pavczOwZ%n{z1Z9C{9i#=CCCr>ynwMD!#)RhI zS`3&T6Nzqq1Q(`hk-s<7ly|18|ko>BIlPe+c5}H@? zI|8#3p;w|~++CWBdrJwbi@B8*{+hs<>G3-y1be4$b^bp!oxdia}7mr(jvO1b6Xu7FQC&wm|cvHKn-yN|Cav zU7Bgbm020MIy;lloUK9gY*}TDa6DW5n3I)T#@mGDcV?t3iTlQzzbwVMUl!ruf^@aW z$+1~!I5ZL?MIFY)vk*yJ(cCg+0W=oR?PtqfraxCeSQC%?g|&64i3h zQQU@yYTzs`84^vy)s^7v?xQ9Ly0rCRyVRhzEg{i``D@G1-8}pW*M4ww^Hw42TwOew zf9|VendHZ7UBypYBE_&hx8ZRci4x+btg`|vVvTLKb6>Y@Q>>?8ZWn!SKvNXF< zGT==fKNbg{o2wSJsm~!4XPHqaF`2WBXe=Qxmz?Bzf;lfX6lXI&WE=Jq%5o~4BfKEiMkl6l?nia$ebkeZ z^W|ENi5zO4j@IvKeWSBM%Nd;Yb2S5-qc3bJJ-w$%Pp3(Co)c;Do4Id4&pBP;J7>lc zHSv#=)Vcgl7JzdG>{?03iF;b(mF8*VRUpht5-?K_Kq}1{m}Q^W^WkgI%mOxnv*wl`9ba1T@>XDs6QvTl>DM zU*?l_p!p*+ukx}ftFjhl!Wo*iG$`iX@laV0P5YWu*?69FzMWqnExcTFNL3QpZ3fK$ zchJ1gzLn3kiGa)x>r<@D;^Qv9>`ngfPWjT)(T}zAYW1h;P}M7;neC!-UnTYtoc9op zcU4wnXH|^?=Uvo3w&N-l8+$M6+145rG;hTbnI*jKbu_Y$s(0^1cyNdc8Re%eT|P)k zG-J)m)wp!-BF>#WjiTIKo->W*+<}va8`ze#HrG&7iRTu~QI_Cg0|#RMjJeppX&d%# zVbSlFVEOWu$j;6ofQT<- zTeSD^f`_jU{QLAmyoI0>5r@FBlQ3%VVED6B8tLhQ4Qc7vV@t>D@d?->mS%%2a3T`{ zP1q!s}r$t~^ z!~_iTXoH2tDfs-}y9%2Bi_rYP|NIsnZG9Qc_%?y}_7Xxaq50log0mAe-&>NWKEJb!QcG%Q z@|sNC5lit(u{KK-4GBG$fd@;{2+nC*{AO{nXa>#H`W(DRAbyX){J|O0pPfT!&Q#F+rYxV$b2Mm9!8_A4@b-)h 
z+?bz<8{*IWvwR$SCIctt$i%En9On5)r<-szX)@jj8H3Oc1W-N$7nindgPs^ZbRwn( z$7AZW7z9oZL(r5UgiZ;-ke^G4k`+Ur}7p z>4sk23Cy9e7^Xe`S_ zU7i(n*+v}7HRCV=xq-l3TS6!n$-cN|6eOcM-=$Dukq0X69fIGAI`t>>oW?95CYAq*d6`)!z+L1klKK?+_NI}q7j{1C%@ zZA^+pO?ei!6{llkQ8Ip+mV`xxnfR488uKG2a{xUQ2@=k&XJ?G(0B@Lo2MnV6`uU?L z2i;wLJ4q3I2Zd~d=>|&x@YI3VeV`knp1%zv&(UGRxt;gg?x7kAdxoC5q>YtFTmvmdAKq! z8<*#1sqGcQzD!uWWM8%Lye#=mXukZ6SeSG0);vP<{9K%SArF7vvl@x^Nr2f_3%-7I zA5{{cd8foI78eD9W$DM+_elR>kDk%g_lV)}--&&qwy!Zz4VmW0p;`Jn>FZj9X0313 z&(-tsG->HM`qI{-Z_+;Gjp|M zl_w?VwFb;0M>;>6R#SePOIYxK44RuJ0@{IDRgV23Ymy3Prm0#!Xpul=^M@b}McfG- zz$Y=7bpUM+&f=D#0kcd56kv8pj+>8GoS|6<=jIQydBwE+?0*iLrBP|FCz@oT4MHd% zsJ0cDX5)QA+I5cl)TPz4HW^Zr7Ewg2LoL#TNNdb0e@^*#G?3=ttjZ=o0``16zc^a# zWo^-t)}_Ppe;=Avy3Wwd^YP((x9rM#b=xx}wmEgP+hgdgtY=4JqsB?t%%Qk|}k zy9c~{3A0{a7%^@lvQx4VJ|zT`#*IVwt^^m}Cu!hd?8z>|8%8rW8w}WLP10ale3+9o zcMS#2gyRiX1J)-SuqHVXtE>siMPms+o0AZOUhVx=zJmtzz)Q9`yqp}4r72`U*08%zCvJ3!q<~lU|o}g+XABkruOi1VrhO~pm{|WK3ql!mIX`}r{nGtUTa?xTii6nHA7rB?ky!O zFO!8$2+J!nn3h9#SES<}A^HBYG~8cGXy*38$_(6JO{f;1YJp|ZCcEXEP%pqN;C`22 zd{a*OcPkvQoUYFddgGl5v&5d}US^p*fq%RoiQX=9_cUSQp87 zbC$IUn%}KR!>x*R+zY&k_gRWWi}%`%!u$% zBf;5#(BK%Qv7;uVU!S3vJSi9v;c+U`Mr?ES!^WWhfT8H#t1mk7{$0BEK!=W< z(YbS1B^MVrbn4t4LBZh&3y(omtPydsX2iyr)a1Fi*hD16n-Clt2^S6!`V1b8!r3pP zVD8URIR7P-EnJTB7uKNkXX}yo^L5Do?Isj0-hw%6_F(3cEf_E{7JgkvA~kCcGP7n7 zlq)zeATSf0bMuQ)SX9Q(3JJ`ERlAmDzGbqbA;C&uPU3(;CKV>DdN3wsAc=r%vSbpF zDO(my)=cG!VJ0jmC8fc};{;wiWbi1$w}b|CLBHOE2+Ij7G@ne6j1HxEJaXh%g0ee2 z+}pujtj)@b?h3bdZVH4}Nm{V}3v7xeAy zkG?)GlqP@q-tHLa>xMzzt{Cd=f-(N>Fr|wpB71uw zX=q0R@({e75`^s)Noag73uk{(f*Ze`iJLD`zo@{~pI6}ci__8gOfe45Ex^gSML0XB z7#C(0;^M3#idzEn`PuS$5l+o5#;N(GIJ=-sg{jk)^v=hXIeCyJvTw-J_Ix)*SLYL` z7ZRwSO~;J|8EW$4wfPyi^y?Db+_@AJ`)W~C=Fcw0myd5_=a!ACpHyK|DT&uyMIh3| zenz<%?6#VE2R z+p=WI7Be$cl~~3$Tg;Lfy3Nqtd++IU@64V#Gjr~J?t9-aB4e#wrLx<5pYzB2WUgG9 zkr9y*@%$m+RA9UbOKIOCwZ?E zmgL^bkD76jGAodLiFu*7Cn+g^C-Lg}vsm-|YC=E{1uyN}dC<`N;ZJYAuBMQC@7=d? 
z)IPUXQICUqVbo1iE$U4v*Fy5fS3RrkA#3|GvXGeCV(m zsd)3%E4Y9AChpz3j%)9~qh{3Duznry`8?LGTaV|SU5mQ9l^8N)qEp5Fx7zIZuyAzn%f!~A}|;g~-SKYVr_ z|MkCojsN*yzQO;_p>%VR&!QVFJDc6Rd3CuqeRAttT?{;fg6@5!^{Z4#~ z1)R5Ma4QR%q!UhMgf|`E2nR)KPHI`>`Be5W)XcVue0k{CR2f^$6XQTLZ; z;o9?gxUw`EZ`7pYy-I>Xbv9nF%OE7g<5J8#?D3k0nS2H^$85{iEzqk+AIzIgP!?#O z=MV3BzVMpwNidra|M_z#%b>SbzPV``g-A&wL}ryBGrI)Yc_qkS zRF1@iY=nm=S}W&iSoHOcBn-<)Ga-`l@rr;yfpQ_?Z{dO^llWbIe?-U4`Au;dMw{tk0tAOVb}~` zwC*wx0nwRA%dSLn8i6^p7-^Y{k(#jxSvlgKk%!S^^eCc?WKW+ykI?H-t0DSITD@hI zoe&@7aIPpAMumns5E{WWM#d4E`8zZUUJLy(W%3O4=-!v%$s$DUFm%W$ULV4CAPfP% zVes=NJo8xva)08ar+ClyXz$^Hj)c*+tv&cm+n_and$ehV&MjMFSwtXSt;oh%Lh~C1 ziFhs7f!7wr;Br9}&gMs=J~sl5*^#)A5ruQ<(Ws|RXGG&fMkEdqmJi1I;zV2kPCJ58 z&yH|?LOAM^qi`lI8t1YTa3P;SUXY2C*~vJSpN5^u;aC^xg%{WfULEd@W&WP1^j&}n zJ^K*k+n{gzZW!66D`a{9o@{UW6M*}6>wrGg;BM_OsEY@Nb!&?;JzL|cejPAxU?+qO z?~dg0eXwY9Ka@=AgB5dzLgrSmV}#i@wi>hI$D*84=ZAEV?#0C zKClLZd+IlA&#hUBUw{4zCl2pZFeZaTGC#9?;4Z-2pcXix&88X%bq&oxvn+(72g~$t z*Z*R>gcYoTLwVy#&e7~3Tbs?JebxNL`}`&}Yg^dV1~!!n&t^aWne6e<(tM-2gkjp4 zNq;9LOeN4P%;Fm8R5N7oTDe9u7xyg|G14v=|U&Z$Y!$1F&TO`cqBaW*v7GWL6>` zZ(y40xz)2txS&}on838;hj!y|L9?kiy7#Gdo|Q@gvn=pJoj<`Xfw?I(t3u8DNX4|r zU!_spLaeFp?9i;iR|ByDS%GD}?52U_rfLqF<$7sUm|mvKNTrd_(n!M9$9W%yrB^eO z{}?yxxvOVyZB0e^J(>KM=V6|WSx`s(>(#3zrnR1Nm1iN1n8aH(NR-MiBj$@tOn#At z{v7|4Rgf;6uSaEdDSGshSFmDq+Yn@{aGS6zb1uLA_M5nPu@TdrnuKvRLtv)9-$@7-R|#&v*-02CnEtpU2R{%_ zf80uV-IR(iUWmb;*C!I13A==5HCu*)<}^aHxNgX(vOsepp_!1pO~9Gr{&$<=3C;2N zk@|_}I9)R=X#VpahP5XPUu{pv7ve*`C6i%f5%zO=eV&@t;syC_-}-q1l0lMX@yU@p!N@2OmD0 zgV$H2t322+08iHW~R1g1=_~ zycIOhMX29Gj2|-!o}TkCj{x1JYd2ok8h(Kh1ji!6=n`aQml7UJkXKMfXf8ucOez63 z8Ih4m3Yz`=1dc8Tl(-LTo_|LU{gs0(u7?r z?j*=eBI4sMBgvs*G4Kxz!|b^WR7XH&$B;!|%xaw-+I7LG5#yQu5Iu6v{0NrizQ0#QxkwoGnVh8%yHw z&f*ZfpYMftv*+OboO$>#-xs%vBk{O89gix-b&23w7>%1n5x7&v`cV;wd)0|lD(SGuEkl?5N%-&BQg< z!K$SZk9?i+mc9tnRcR{wpHx93yLYRuY@N7ZIQCM&i@5UY3EN0 zENgIfL9nE3QvWlq4>AtHjw2Ot^l(h71zIA%%nNW zKy#xS=)1&pi432~HBJ^04j~YW6f~>r<$s$9rUIuLQ1$=zdn!q~c+BLNmYgSX6s~if 
zYlmEYzvg9F@jnC2@)4jwvusJ$fSb*vTRVPFa6 zaMprm?bpmmSfHffiPxEtW?AX&0)hF0f=9+d?jhTzGVxH?O2$hj@VLr&omf}qGy!PF z&&%w#JK%}L)=u9JeK3xvHOs}2Isk*y2X2$sBu0%vYDfbx8~mf@)7$Ccc{**?-N z3%r<-W>t}xZ>kq6_vO9pl~(&kGj2?u8fiB3MXL(VO2P9q`2OF4W;vf}WJOlub!RNe zz7EIA7ujcg>Q&zG{*)=`vMvLbZruEDu0f#&6nFX7g$8+hxjH>uZf;NTv3 z&YOk#vu9w|Q&W(eoq-MO*Q2g>Im*jwFmd7xbm`I?Ik}5*1@@Zes|nEn(}Q_|G6%j%xN)6c9SOoXTpin`F~cInuK@ZJIQ7A{0wauViw zd1A`cN$AzNlbW+MbL==AOHRT5=olP`jKYEFDC~|3!>;IX?2(lMsXYz?Gog7`TsRIS z$6{YcHtuY`0_G*zjZOb-9fMwXx>Gb-4l;*_r?>N6Sz&rcY9L_&jjH; z1Zf$8-kpx`c4qK=nVv2kzwF9YqsH7=-5U zw$YqyMcGrZkd_+fV*zTB9OPuB@F6Ogyb2(u;b zf3r&yE0W3A^A>m25!re!cJ(GmCIzbSw^7=$pX$xqLpv+ zgC(&9>3G~o)NxhXhpJzNZ2lK_>m|*MY7oq%_eZ1KL;J&}F zUKw%zd>?j#XR~8E7QK4)Q;Wz*o=Rs#X2=kA@7haEZ5I>}4u8K8)qxPnoYMl(5zJT5 z+1?mEay)wX>WlV-=B|WiRd?C>mDLUV9Vx=!8;cU~TAoAAL~$_H z8;8>aa6B~#r&B@*#NjwkXugyfj!S9LIFS&{POvYIrbOa&ZZeK#r(jod9M*(~V9|_e z2pKXEQ#*CWfR?S4`<7ZPhmtoK-3W+ddUl1+DXXA7axQzDSp_W8II%PGE$L&Kh@;n+R6&tUA+vCo~g#CD@*WsT_!#%3B|`nUWD2O z_;?AoWuf?@HVI#`oPSxn7@t<><6%W6ZnM*QyNm$M?VYL&+++E_U0Z;gbxUw#=@Q(o zAvjmEPF1iju}<78XQ#7*z+4fDPfEk^c}W<)ToQ^ei^2)dN$ikjutS=Go5h*9&boM` zEDP7mGH|V2mibP@)f(-${6SqBx5;>KMZB7A<3>#q-YreQ;~lH8kN1`ZXQVT9c<*6+ z_3_6zbNr|Vyhd^Jqb(rhogo}GG|={1qs)!8t!}ngYzogcz${Yk8$4!4L}d`MQGs$( zU=~?+O>GS{Tij%=UtnjAC!yKC?>eUMD=_SWV)uQQ)CYf*daKUx^+>U>j&O+qQr<_| zOaW4!moDjBEwZ{zw>;ZDE?? 
zGCd}(rr{XbN3)HjK*(D2=Gvx(-Cp=KklGZ=G8N4k*Au+f>czS$rS+eiSPpXYITlX8(^ zOL>#ymoJ<}Rb@GisGfVZSFc`paot+nyL$_-y?RMa9rxmz)mUCzjmVG?bn4I^0|xXX zB*b9N3+t3$Wq-m?hmPG*Qc;5wC+cxv|4|${d;&+0uq|A*a~_GBBtMW&nEM2O{9=4bx#WuE&I&wW|kF9^$oVsX6?snKL^W#n04`dfi%`6zG`&*y*N z?bN{j!|n|HyekiX+qDE=tW}8thxeX}GmSJ_?lOf(g#WaRh6^v!z=lak&3XE^e&O#al}= z@m5_Pfm!^U7vc3~`MAXM>tj8!#eW8#>d{R-gO+Vtp-=yz@L4FMz+q~sYj1BK1bBNP zke#KduwXSk;hebw>8lIztvW@%0ZuO8O$@o+0iE0?vz<)F2~cG2$HQh!Kd5N+BS} zU;%;Ib6$XwuebOZ#}fV=@FO&Pc|~Bsf)K)RAm+~XRx@A`GIPr@e9UxAp6QFS zrEB0wSp@IkI82=rfC0m&px>a8=s$26diNQK&Ru(=OSj$_G;}nA!yHJ@A{ZxSA~7Y0 z%4M1f%v5?-AyQIvF_-a_`r4ZHq7C7(Q~M6=gbc!{kz>@{&y%K3$M{K8FmA#mj2<&i z{nxp3H?(OZvw5^sZc!~;=@0nCH@m$@2MlCAo;%AEAu_rw)6-EQ0g>u=Sa1v?LSx~( zFbD(t4adUyewZ@pDZ(njwS6as-5H%a$%m4JX4c_WE!$#H`_|aKI2G^IF2S3*2{`HS z!l4u&?9UFs!Sq0!NDRiAgixF$EFVvb!0E&&oXbkW>AX}N$VtJ5WCwER&w}spVHn$` zGkP*G!nL|= zT(6N4VsWoX#Dns9JT7w(m}QhXg3ug}FA2_{FA2jZh5WyWAkGf#^-`)ljp4HXl_wCm z8SknDyjPcmw^pX({bfwcvTVGzG8u0^orL#l67lwuM0~k_BVJ%WN@t>Nn>J{yKZ}na zeoQ+;j%aK|Lt_K#snd=1I7MJSL!grBN*e^^#FmK-v+RksDdLt%J4h+_4N60wW`!r; zF1y<^HLWde)Op%hJ@u*!l8x-=t80vHq`hh?r%g0=+mhHqS(({sm&N7^JJFuYz?FGp zdr`%QJY|8UhFqR&XFUrk3oU2@6 zu52DH*ERludGk6$#m)K+uR1W-@FWjJxX9{cwl!Lg&KxX(OjJcsw*`#||KPn$Xu!9n3jPD)3?q9v%T zu0`&m0)j(7v}xM`UAy!^Yr@fp5u=cnoCfa&Ug+1SFFLkujUF_bQ+eOb(J?sUNWk&v zI2??MQVW{w61NO-$%u=>zStg}r6*vntQ$KFzf3gJj4&GB%CES&bzwRj_+!x^EXHxOyIvGta)X+>o zHo*M-&OC0j@zbt+e6ukV?=SJkjglxltjxeYLb5wFXV7qKXuh>12{)IB-*X&pFL4l@ zsUx!4*s12cPdS8&sc&$I7< z5%43-2Kq;+73+L_{SfHmjbQJEgys-J^CZlgJx}?!cO;MrG|NIA@d=qoNXkc2YJplE zFek4ZDM@*75SAU$sfdk9Lr9QALGye9^ZdE~@FtuG1w<=o78ebXmo*Z-U||5kd?BVz zn~8~&rYV;N(X^*#DR}nr3RdnKW5-NE`}W;2kKu_cNDS{2h>A8-77C2#u5v^AdS|0~0tDsos zP!{(D0o*qHEl{gg_T#>6J9g-Z5yOVF^Wx9?EhF+_YEctWa9}v1!eiC`l!?>PyT?Eb z=sS!69fO6QK4?eSmYH}2m__Z{$=uf+)T*E2qEQm*i&qKFr&#|Q9f4>_4aKq45FAeq z!|C{NG^E7hbapaMEXl$FYF%117W>V^oPoX2zik_Wu^vH|kz{Fi+A!QMJ$hm2uu+&c zZ5Djwh{;m8&$bg*d-r87`C*;q|H#yh3n(V`VumK2w3L8RKy@P3ATx;Ibok 
zvy|nQuzZi@{&p?PeH}ZEgwOlS3UFV6a{)dg^ggOg!lSZCJT48vX-9ACGG_3HWen65e0I@Cf?1c%QfBEzi)Xw3&%;|sj98RuYAbj8yVZ*}5X{9x(w9oyiQyDgX+c z`9Ksv$(GREc$|fRpw*n8jkDlb(qVIJ%P{2%VoS70VENK1f-@hMI!7gj|1qIYLbJdw z&r@;MnI=-bBP9WEmoD;prFvd3i>$bXBhu+NaW-kuFsr5Dk?GZ)1tV8_m5YMJL*iqF zXO|Q<$x9Qi3!2S6SYd78?1EIehg2TDHF0wRv+Ofo{}h@js{$&iuw0!WxlVP8l&t4% z?Q1s_iHnwmWxt2oQvXjw^OL|VlDx70W4RV!7Bvbu^BU7BQx%crM_#v`Wra9?>@XYA zv)Hp|H=?7XXz&bhzOUFz92+L#v?8+1qFr0sH$0t zP=+JWt)}^E*BNbEx5M1I^AQsjgR!H>plhcN7~8)$#h&c^{#9S9%IoI>Ioue>Y`V<&|#2?5Wt85{k|7 zUf7-!&flXjv~z15-ns$*Yh0KaKM4@_Qg}FRGJ!T{&sHyp*dNN{C>Ynpg9FU@5xdh1Kz9+#HY{m|Bbl{n!kUE z5H2eNz9iGlr4pKxoFmQOZ_%)OK9Cu8PdTgXhz>fJ*>;`!9stQn+yf1j<_5J zs>uS+Dfvi<%T%Mx5n%}mkj3r8j5f=DfM2A#SD-c8EVE09@3W5&%f{4M7&UqVh728r z;UmUi?D)xQg}^y;7qF8T318n3!eAHWjuaLikLZ{bM8~F48HnM1W0P_S^0|ml&Q|jw z$0cPUJ|!D*soC%iiNv%y3ovEoJa`9$AtELLkuiyI#HSFj(~+8%g_P7x)%ncI&PN8f ziHRw!6OmZ3zy}j2Ou?YOebJ{=JM?JR0t32wV9dZCm@;x8hW74Ca28nBP$|%?A-NS= zwQ5aB7N6%fYEE{UPg%}s$#Z0CI+^`KI=@4Q48*vxqwv(!DVQ>8GRBP=k1?ahW5}Rk z?6Au8d418nD?2-Cl)42*44r@&LVskK1HICcM_IL-Dh4~@VXEr=X55d4rZ7p{K#*@G-OLMpH z+zCCH{y`&0VdC`JnC}~i0M`8gwi$lG1bagL=&=*jD0JM6r_fMdhS!P<@z#<>c&l^~ z-dS3N%gdJ_W71F@Opa1hwcV*m#vMZIeU{IMHCcq@94Zrcmu2A&JAJp7=HX6F9v;-N zqbui>M&ofw1U@N_B-F;?&xF#yR2AT>x*~j9yBPPYvT&n38Q01Za8ukg2)P2ycd7}^ zwTp1Qt^n5v;MZz06#(Dnx%bNinqw3+e@CJ2n)cufIjCOQ7y(kThhlSRTWH_x;4HId zHIN}(GWJl8bL(cy#J2GmZJ+?No;}LQWxLIX?Dl~yJ0IEQw28ELPk^&90~tVv&wC3r zlcB_wLf)z?R|erI3nWE$XjW@bh!ksy1eyi3d7ao#OUmt%Tjv{f$rB5NMU9l2l2^i4 z(!3+*i3^N5&z`P70?kHtb}(+BcA0pYb0nQl!m`mHfM!)FB;(})*yKup>t^y1pj#p& zqbT(nf)7NwWSk^q60+>8zKKi%?fZ5$y`Om?My7Z3$Do<#SaEiW5650OjGUR(AVc;9 zX0FB@M3W+gxfEPObE*g2iJdJt`~{rYbzVh80qvuW7gn$_+F(0cU$2 zn!Iifn00)N+@FaSe_It$iNDSBbc(MsV&@7F~lb~#u zeQaR?b5m&6&w%Bbjioev(f}Sic0|2Sx^(FRYN{*IvuAfT!jlviqZYE+w|_U5 zF0IABeS2{3Tq8<~OVGJ<7lLwU#5xjDSGNL=n0WOu(SQL%QCqVDdv+hh?p+6PV7~@v zaoJduU#z~;Pf5;FGZaJ-AhL58qp+}yfZ~Pr9lO#1cUR!tk^4b`G7XqNMhv4t?cEKd zdvwNh0{x14o;VeofYVV?I22U1zkGiKr?xCd;k>EX8t#vWgyt_^ zE+@>E5KM~*uIU8f1pIU$PDAtl6a~$CD_6+>IJ(t(y;mhZ< 
z@bS}W`1l$5z%UQrY$Gu5%u;hmf44J7f%A7e^6O{Ozoq)#LM4VcdiqmzeI9HyC#+(qG zh?#}0VY4x(kA~(}ZMA!bzjp+oG#36r^08VN!l+QfbBKSSg10GCr(x!tS?JtF{O((; zGU9Ng5?TqXu~~>q$k6~DlZL2>B!q>;Ayj70@Ryk~LKQI2o8zap@>~3S1Nfh;8W<6t zq-K?vK79d3jhcutV<%zK7l3KA=3>h9S(r-jo5ApB%<;s` z+2Ve(P`QH4C7@58HUq3GW{)2X zzuA)!>^&DxO&Z7gDAUAgaF$W$)&kG`EiN1$3`hQ(HGKxwuU(5-GiIva?b@|NJCC;T zXiKmre2c%i_7RqA<##K_!3EpBdklb=XD}i{nGaEk?1Y4&fB(S*+%^Q|js)h81obuq z?AEHDP3+zYhn6hHmEvNY%g)51^dxLx9j)|U2nRdE)4Fv*Z{AnA>`+oRIup42_UVKE z0|sK)$kCWIZ6@X~@PmH6O*DYwiRnEFZkQQjZ zS(A_JHH2o_GCYCiJ7vjuP#TYqOJne<0`X7<%U=Nl@88_n}wwpQ$^$k<_*oph73UD{s#QJHJ(YB(X{e`)|zOAi7H*9bF_ZG@vN z#u2H};06(S$u7lP7Bn**hGUw8Q^f$5O%)h6zkf^0cbqbSDw0{!WTvZ24DUYz&FZ%; zPAXna&Xsh!Ai1fmJZT~~_Y4IWlJSIz=Aqdoar%}77@3S~I?u4I{HA+Rrm(nV3P#i8 z_JkX$(P6n43*!^eESanlFAv)e%>uX<`(Doj&&ENE6E)+$WBZLV^e>zhjdFTiY%gN}!5 z-*vq`UCv4RWS6;>DjhirO!XykHI0Sjwd3b)+8ia>L(fE2`(4b+cs$7blJN9AgmR;Dj z`vA6X+l}?>w_@zrX#@lxG{adFGW^X5r&T#uSWE`bndC>C7cPcMueJr3i>jU&`fN4IV=dqxX9_0$~1#AsjQ_&CBTp*c)u z%Lt57qsuayEWj*VSw34}dH!5qmLETOdj-MYFN__rD1-&aX}@RB0L-1^1FwaF@bw8M zP!q=d#2q74?R(DmCQMI6*RH(~86AiC0TN!ID>vFhIGX|f^w+mROBay;rZGns6V&^Z(li&cV4@KBM0{(DzWh)YJ?nLPA+_4LK_2`X$ zefqP0Pr-z76Vbo#0M&^J4RRnhIvv5x>yY4RL`NmS$2)+KJBVS(=yn^OK87Xk8Pf*z z#csyo>4*qqPM?Cg{d-|x+m`6Yv~}YDoe9@nyLCeEKD{w^_!vwbKNYj4vU4@l6Vqj3 zn^_ChRChsqmOk@>;70{63}f2D5zMj^>=Vs6#qzub=*)Z188iTgmn_2h!YsUANWff@ zia!-4655kdG-VLh2h75)nmjdq;=^hU&5x=w@tA;ozd8fAYuG^~INz#c*{>u-R|zy{ z;eJ^P9uSBh6vyH2Vh8RPu)Y)|;(7r=xhM(OSud^$G?ymf2Bn~x*U5|-x2p3A&IRmL zF2eQ7Y}_oDd6tv#urv-Il|cjGE zyhG@Iqb?1v6PRBoF#l;~I^KFZ6K`<)+A}G5ds#9*s7}S#yPn6x`rR1PSI<`C@4pZ~ zeDOKndF54{6=*&$(0pEj^OV#(MpFy*z>2U#NXS}Rq<~&KK zo$uTwdEknNUCw!$H8hKuxrEH-pjj?B#|#r_7T;ej0cYNvU}a>WS%I^9K(5FdHGR11 z0rR+7CC*4?f@Bn>Af5L#Qi+gEc0Oza%o>{Ye^z2;s~!k5o*AN6c2oNl3y0{Us8L3m zMRsu3@NDj59*T~)0IdM?8NxCFS42=%GffzHmh)74cu$^he1AleHs^z8{ETaZlC5Id zKPG6JZz6l1x!f~!7#g}=a)s-<&cs2TeQW%fx}Qy zUV|6cZN#coFJSfR7qI4qmk>lS>fEIlVL1UJ4 zrM0$gJMi9P5gr}}&)Ksue&}Ef?%EEc+qOXFxG^{qACF^>C>)B5#!iQJ&)DyX!G7yw 
zz&%Nc*qM@uwTV&i9XnKya#Hh#4dgL`c4{a#r-mVC?qm#W*9sAyv+&n1Z{dIc+XMW| z!z*}v;AI?3io|P6lJV8cb@*mO4SspK7C*kc2*1k5eFx(3Bf{z@>=_ns1k6I)V9iiF~=Q7dg38Ch%;)If2^*{^mXn z|DDP>JgA7povL`eTbF<{OC316G7-m?C8EAO7H2X8aojN*+oR@Tem{1GnE$eJT)+N9 z2|tmD3{OT#cml%0V-X$hfd2x2gb+f<64LtjAAm`dCZP|(Q_A7QiPI1roeYN~LjiL_ zLN+^91ZG(-I)I=o^9lQC`4F}QmgOVA`SW}f6bA(gq{kpM*rA|L6v~dCzh5X8^1kwc zoh%bQYvw!*9Wp}s=gNIM5+r3h#Qp;Zqeq{9=+dJPI&|#;k4|0DrekNc=Krl^+d=mA zwoUs^Xe|Iqi7#=hR&5nnDi;bRfmQjf!Bs|gWmLDd0A;&27}T#9=FFIanD8K!Ey}`% z)yuGd>jpF)-G?{NpTT>C<+m=K!)s?7@H*k=!?)hXf!%u%9U2DT1)f;7d^z5F^;Im+ z%0^UB5cX}`hBvQV!RwbVH}N3Z|hbq)rj`k(XyD#Ty`=RF#p2|tASWB&x?5# zfv5-vB0{4TEC={8ucdP$9|)WA0Y<& zM)sg~GZJu)<@ao^1D6-Y;EjS9yjm2G z@@Yfx^t^GnTA7J^6^VFImBbETit^=uT#?EBG__81hA1#i@oh2y*@+`bpmV%r027CmZmF=fZ76~vbasLc$iwwZXEKW_KS%2%~ zZrk0US>6oOt~LE2|BpbkD-7OCxnpR^KSy8|2{fN2pPUsB0NQZ{&B8O5y>^3UGEj5S zETgOf&Eh(1A22mGUDE8{UT^sT^apyp8RZ7ewglSc ze85d)|F7va>H#sazYWa-wgSu+;3_cIaTeG&`)*Rb23;5~G%{=!5Dm~Sa5fof?jsrQ zil>063!1ewFxRu7*0azmxMRLb-a30eR(hPEDq&J~Xx5M{1>DGf-zR1AN&l9u3z&6$ zT;QzX+5I|aT;$oiRZ{!?xpf0(9%I3FNq$?h!nMn-=SMYrj^sr%$3-U2(kMv2{ysQs zDZel)Eza`H{MBh?9*=%}X# z^uhB9A=s84fi0;KC|)=d<2tp++|k4E=evKx|L-4<@vnC;;M4sZa3wbxSMuZW`T9Ei z<;7+A^`#Z~>7^nj0tY*=8S>|FUGz&P3JI3Z@e7QCjU#y8EG^gTQ zS?J{DRD8WU6<;preOBh-^JQ81g!|vUycplebaS$R$*ydCvm*=N@5saVTNmN;H3Wca zf^wOIFifLd8c$$OCL|M#i!&$zXPGrameo#CZX5S1GSoQ+ngtGIne7|0AWCH{!NNiK zcHsS`@i@1P(7Y@b$GCr*$4+Gg;8^q=?2K9f&;GsjXg{G@W?d0zj*d)KE7nCuC%_RI z2Y>Nfj!D3T@sre>hGD~oW8}yY%7=dN;1OyCJDFN8F)>%UWdsEgmI=KA#ok_l@LCw4 zX1iG6>8m6o$MW&1tkNgt(1$QMXZ8Y2oidAHJ_%z+PsI3fdeM?GqbI2SfdhsppJ4F~ z)+4}8{D#HdL5_=Bw?QicTe~jZ(78u%co3jlwrR(61eDtmNIletah(X1-B_Mw1b4*H zk$7s#bjH^g@s0!(pnihC~g zKs-YHW8v)+ik@A&V|bVLcro39^QBohmlKIs7DeHW{0O|sGFvxu1S$#6?^P0rE0giC zItBL$$PZbzKPu0_1NA!@59B`r^h2I^UjcT80_fXS8LY!8_>lU5dapDI9~LL$>JkF8 zfHT2dT$rR@++-(Hp!sf9j)GFf_`GV$K>9K5$Q71x#~s=1o4*CgUjZ5Dn# zz8xzP!=3)&J-g7XFHY<&Y7XwQs~va-Ll7ixx!6H>0Mk!H(wsWmDr z=(UeDpXTq=v@551-I-Ieny$=V(Zn9{Ja=doxon$L3`0Y+%@)%ZHD?z^<~o;KCeOEk 
z`5f)P3z})WPg&n;Ic>O|qsTZi(%KZ7$x9j>1)9Z0R|E5JLbK!-rGQo9D0yn#%Yd?z zb>#kXPr1LvW^%9Qpjie`O`6^AYaO%8nU^+6JjC|vd;K;z3mZB)v572bW`NEFFknkg z$e_#xvhaxlW@fyar_Tar1>$=+L0*@M_<2ig!QG-$L^?iRy`jYd##xcd6NEq}{=v!$g=)rxLQ zmeioAs9M2uU|=LX+I2w(0*FjECh}R}g&;p)wctwsKHbp2T`SD**&S;`!wJlZYWlgI zG2z$|7fw))!0tp*9QGwAVsk1D@AUC%mJ9Up?NX;Ijk9D$`l3o)~IPmJr? z6?fk_1N_%d@sB$f@x}i2c(*tkm$MxBWKA``dGTrdx?vT5-cXFcDrkD6b5|wMt(aTz`D=e5!KI_-S*IHQJny?>A~_{(;c^1EKl5 z4T<=610nn+JsaZ!=shq`0Uv@e6N=2PQhQ+rVx~q@!ivT`18^{d_rUY z-3tZydR;aitRl?sA}Gsf^Y(1~xP39cctL&hXr%XnilX#raETd|?=7uc5AoJ47YG?w@l`;6RE)K6PkH@Jx2Tm+az-a>Y z@pOM2i<-mdLpbU$tFsZB+h}MG@{2}HbQ+>!Q`CoX0?k1_fk;V8#q_6UV{0Ypy1Uw&+xLH{HgkxEPd*Z|?7&>$~JAHl8sbg1` z!w!%s(PZU2k+=d4=s#4=8$EE~5C!Tog<30uu}IB4VaWx~&4_^AjnFI~6wW7nhJ{8V zB{>}hxrj|p&Rss!$lu3t z`VjBI`gxq!%JwoL>6ON_$~eCJ<{OMtk&1VRb{#RG-ylqyFb%V&&m&9+z>hE-=qI2o z?hi2p=Lq-{O#OXB)d>DV&tUa&r7TJ_f36?X>&xH%gl$=FI|w6(kKwZ#z)s0T%%9_l zh4VbIVD3E3n=>DCXM0iJm^X`%Ji|u;v-eyd{?AShHEV{a>iEcWmN}wBLmh~ABvUB_ zXLhP&p%r$N1gOJ86A>I7&)?A)G++=0v~Pv2d8v4XVEI~3G~UUJ!duy4c&jK8&&(f> ztP$Pur}8x1t4YVhT3LJ~UAbp`T9Hm@CNNVU6Nn#++XlA}WEPCdGy-oru2b)`e!V9v zUWy-aO$H%Wz?s0z`f#&|^@F-C9nEq*(tNi%hv1w`<>7X9CT=p^ThdXL*=kC{3C$t+ zoUkmT%YP;;|D}lVToi&wgzD=RNqCEZeYq?HSIV;SroeJ(9&Rb+FsxjBSe?swu|vyn z^~@b?Z&noJ+k-oh;W-oSt+KhQss>-(y^Bk<7nk__WwtCCj8pc-lI%NIKc=k_Xm-M% z{3io~CwPqoz2ZuFhTAh_AenaeG|v%mHWChSeTSq$TO{*28~g48Xwj3l{g23Q9}P60 zbpkW(@NwQtCTUwiZt_GT&E)y2Gs(3IC3YIA~ zGB5ikj>4FY_hqx;WJ^bOSHdsXqNlQ+hG%)s}p10U-FE3Y|lR)gbHBTqX9_b}4OHWO}xd}8of!UV0KO!ltt^%ch z%W)@2F|*ZIo^~;=&djz;{V(wrP-4L`>EUsuCcjPLFo3N6R$RZ${q$tpRs}N;!-7&5 zFtc|ltHDvSJg4+FT_gEAkDYmKOP}ernckH4$<~GYpFy*|_v(UVBROUVW)02ae=XNa zm?|C;zQoDE@^4Ay9nZ7!&`ejTq>fAZ(P?sqZKY8Pyfcl~vEP!pR%Fs?ZxA#nOCC25 z&GtMtQ_P7}*doSJK45HMqj`qlY=>rX*HH6Da{ppuBQBpkPZ*{lKYJFhzj6hqj~_#7 zVgkxbiq$=2b`u$CHf7-Sk>kAPEDb+<^bF(pp+m^e$-$5TgV9>OnilA2g+YUcVB9#p z{I4tnoSjpM!omvV<}M*H*C0Q?1dA7!WBQD_XzS6*3C_KH_M#E>B~bfg+O$a+*sD7R z622lvjKJ3T1ni25!OrL~Y>N&eG>2hlBH=k97JHN8u_YxQ@pGo2T`LXE!@IV}3P&Ke 
zWyD|`q50`BFSV5R(DrTd+KFAjfBO>uuX|VU#lDwlycXd?Ml?QF(EJR3eQ_0jSzn63 z?IbiGj3hLN6Ply(-GO*L(kvec?nzY8{P|k>_|G3#EB$b@DhOY%&9s2I2{em)hKw?= zPr`RECg9ukgl09;%y2d*0f&<~!_MVt>LfdzCkVJFSYJ*aIgC@>KY8E) zftlLBPi>DMI)+mV&)_uSx#1}5Ji%GsdY(PTun!+b{ZSc- zK8n)^4=IQi8z$f^ZxUa-bP?OOY(gidN0z|u-Lo&ojhTemGv+Jzi1~8}w3O_trOxO1 zsnr`7dUC5q@?CRa%M2yn3xWvRVenZPL~ss;7tfnLZ5|d(_ku6O@FEC%&s@Om0xX#6 zg$1+y;5o;Sof?ME4u`SgYGB7_GAM5?8`1=hp)oE_0gh>J}_OjI($!r~AZ z&SxLyKu9RdL3lEP!s0P{)EM+_-2xjk67i?Xe7uwEz=t_ecsDy7ZxtrsS+A*x>DvL9 zS)T9J<={bGI_@vcz+=MfX9VL1HLMraN%)wc{J0`nO*JPU3*Mn_Rj1$s9)E{=yPD9v zJQr72=Ho+x^wm<<1p(zEc0QM|u1K9JBWPD-;!Y(ysTJ9{SCK_%W~a0&QN5A5UmAhO zC87ANBp6?aA9ImEzA7X*7lz>LqHxwRc0O50uasxtQh5%pROIvbBD_stmX8}nA2N;a zS1-bQ1oG=k3-La6YgIYE-m?+6c5cM1!Tr&$WeapRr)f1jingXw<)nq|Pq;v>aMlwu(ARCCbmbbp|X zONdCl{jtDVbxMp~R`yr#i%phUJd6i{;VuQuw4Y9Bmj6|r>GT>{Hlx}G)(tN-mEl|` zqZnv@LOShE#*y?}d7;$shoG5>x1d=~QDUH3wzkn`@oCn6%vOe2iTk56)0vrB7&Jgx z57P#+24a5?noYdV$&3k-9+Ti^WT07zzb{BSo7_u>=ZdR-5ZsqaLkQKtB;X{#e9Y2O zR+eM>Lo~)iEw61!<+T--Dl}FBG==fE=80}v5KKIphh~YB#Lc*l$p7XXLcDdoiHA~i zpzM@}e^Wj*EHi(N)N!jos%PgeH)y7sJIBbx$HY<30%9bPEDe!M`lAfVS1CKD4C$~8 zFdJyrm6!R!^XlsxDFJ8M=P^rX8=3FybzY!hy>j*fE;n9Kqsa1+#Fg{s@cQLTgy=@* ziQJcQWO!#-2J{DNR z=S^rX$D&0g$je)zJ`gM@C__eOK8DDMbbEqxr!Mel-yXw83`Jl-0DOJCF=f&ObZgfZ zgIl&j_S9+Eo0y2*vMOMFD0U@=U`IljhGuGeY67zT=A)B`{@7|jyVj@*^~SF3cmtnzQl! 
z%Nh7?eF}j&Q9<)}>*Mg$!60TPz4(U9Ar`MQGUWZbAEOjpO^hCp+*10U2S;Z*{2{nA7n zsffV=Li35pc{misJhq^@4GsA~f^Bd>6k=mi)r=HL1kSh^2jR;LS(#acW&-fU8R*+* z0NS<{7YFe#ZiA5{#vv>;MlC!tVd7Mb8Z#CH*fHzYt(O`_ZmC9)HR#GnaqHHt74Qly zx8hdr-KtefhSx^@FEc*$=+TSUcPD6f#lV3B*(sZXg$w5+GAbM?DM?tApQ{!Ddj7f9 z*!=QK*t2T~jvhLomWO0oSMDvoJ|UEhJV&SxDg%5Y;W;+|{yt&w_YP5zyo~RGop(D_AF3X7AS~simm)@9?pr8#(KX&ydYp%*NJ@=y!ts^jTq+N*l;q+~!t>iT z1m)$0xVoy8D#M#AE7&JH5=myI*57j`!d&04aR`ogeI*_YfF zX^7S`(5&O}dlC+?+sQ`qNF(ckO#f>}m<=$WJ1q;iHLxC#mw2sS%}>h?jv~W?nuUY` z1!VQeuh??qlA*-=y4*r6$ua(?Mw%%*Z;F(S=6#!kW_dqkn__Vm7SwApI zqyDgs@pZzgen@(xSp#!Dd$bLwk1m)-mUO#8vvfl2qsvOi^j6L_h0!Y9cDX@wlmD8^#K*+( ziF7f|M&=ELz5EzSSZZl@rc3TCzh%@w{!{+U%;$O_3^`y zL4(vQ$Z_pFQ0Y03kQ{?uF~QhPaNd~^L3oZOM8{%xMlvd+g3-I99!2a)V^ZWlAG`CD zuqV@j7vlmEG-fz@@V{-Vs)7IU9sY6eHGFm8W!$MN##uu1?Pp5ywSwm7@T-h8?@G0x zIUGM8a_Euf{R#MSFSRospR5VR)ry6LNq<}|^`(K1A_&VzfJGKG7ZREYzZ-KbXigC#G}FlJS>UD z!x{&^T9<)e_7oAA7vtNl`FQknGL3&6?o}q?L0LNP7c)EpvMkm6fWUm0M*p^qE)$q< zsclneCKwmX2T%#PL1>ndHCbBwMj7`j;t0zG>M9322o7A670H$+<1FF%XjwE4WcuTH z)I1!F_9Zm;QPAABZEFk~EYKVYhkV!ebL(#AA zAhd1Mkzm{jJ-YWr`}SQ3!LszU84cE>$KpRM0Nk%%KQ-UDhG+TskiblNu;bK@oh?y^ z4jnLM$`pizgb|KYP*+=rZQHh~4`LhH?uyQ_<9F%8IkmWt{#c0ZlGqJZKlOlt0B`-V zV>osEr~+v@rj01KgyKd5@>vDO0?7i);yNKv%`Ja7^7vUR9NE?rbOo9l2*9TfGu(p= z_mKD}vu!;hU@a~gx=lF6bEO?LZG`O0D4U!kaAUr;F>Zv?uF4?IxyG}E)%nb`78p8Y z7$!}ejHjl}#Iz|h3C?pcbNXBb(eq|6gpU`2*jHv12}59DID&)3hdBn}VY2W^90Gl# z)as7n!XduTGU7aErWfYU^1&Q&$C%@VIkOhG(~+(I(zyYHErY~&zZPV znvFNJBk_J#2;Lzyzs2&rK4?C?I=9BYbapVGCN$Tk;$B@E9#*I0Q|WBh3N#azt5R{l zDv@<77I)bRyu$%MD5t{E5%9|fVW|r+M=s`)G6g% z>I0u^>vnC_2cP8yd3b}+e7^oDF4A79nHj`3Q07|~c@wF5NVfX7nS)u{cMYgKPJ47# zpqbLMOvz|7+kgJ=CWfWl{hZ(?()Ln)07rOr2W62QlY6;KxrT_iFOp-L59I#t44`>L z+fEZd4b4*a#Vth!Kgn^7qq1cdPr3L=Gn3*YN^_SDnki*d6@O?uEgv%)Xg+^TZv|%6 ze`3#>7k1th$-Nac^ZthKl+9!u{v*(=N7gM+w#ypjx8Kun%5O_p=C{>C9;`URHxeMs z2VRnK${F0sFs+jW24)y#e2WE06yT^#kwPJvq9H~WI?=r`{jmR;&YdJAiqu1ukMHz{ zeR{?VX`}^EwCn(Fmx-$j_yzQNjOo&na2t7!EVnHZFl8K>W_7Ksir8>miy)}3m+-8I 
zB~o#9RuY^0$O|8&C4IdVa_QxrWj;%O%L9^vX?k)du99~9a^7;S1(IUO z)JFoA1RUKeiyfNf^`{s#4a^#vr4YN~>#9%;+Z2AMtYf^VJuL#zE?LJUy;2rzvgg|$ zk%nqpnUH*@;S<9sYOoBXmNwKXe)h^>YfMZzl6*JE?UKpo)dwIJFxRuJ8EBT#GI^bR z>C8F2QZL|q9#tstCnSEJXm&v2!YIDM3c{_qhRZW%RW=Rztqjr%16 zwv22?k%bpznksSMm_ehLkdQ)fO2EQ}KA1jz24>Ejh0$ZiphL%wXw`=2w)H>< zj}Gwl^hIQ7Bs}MPs#nb7Cp)ogC#;MJ!jAYD?2L=V_Jkw@P zL46F^iEx$ZIR{&E6L26q8k-VB5jANfy79kf3iI&)`>#LZUmw1OFZZv<-MYm%MZ~$Ka=f4*Ym9SFA|(zBsgzK#kcF_gTNI0xG^1n z-I|9lR?+Ad1rlV#xQ)Qqt8(!3i-mYxnSh%GQMj`t7WWCv4+yOvEl(h9=it{}OYp}-+#*He0=R~|$lY}ePi8xgfg#&3`I26jQ=JIEUD35$Z}jaqNck_19Xn3BPY4*x$Z|WvLwoLb>>w^49nqs(542-h%FWJU znQz2}b1dVRF5|T;ujpA}c&wh~THMXf5O!sB_!J@cv?XW#vt;VM8YDO=5UblL`I6o_ z@^nl9n#W}HnLvB?sMKrSW*Vv2*|Oz5&k&~NzT##fl9B1tM~*Na1ZTDfXOFYK9%H+9 zRN4vFJFBe_saMm=2GM2+BuG1I8Cfy*;uazuS@n@G;cDdQQJBom!=%Ym@zhhZSoa9c zd=9hcEWm;Ve(>`TL2z)SnrS3NrUg{c9F53`SRRWeOozkAJ4lT}>yLf|AyfU$nXyo5 z-Yjp-lMnvp_`++RuYzSIPeSwpU*&tgQ2dxf9S91Fg1NSSJMr~ z#ib!mrX6Iti6Bfzhb1GH(Cjxq06p4xAbQexTqrNVtJzU_H#ZD_%89_6gxMXA0L<#x z0vjCO`1qN8d|aJ?kHn9;mf*ZJ1s~U>;!!mLw_02>;&F!^!P_;lxLq4ZXl9w`IX9N2 z5u7t{b!jF(tj=JEF%vfxG^gv4X5Qx^JCyfWNA9wdd8arAHx`HDT2U~rl?Cy406tz4 zg3p$O6PQEssVr#1j^&@rkupu}NaTjC5TxlD8xmPSEz8pzR=2oue(6a;^52 z9cMhG{;=+-`a>Hj2`YirzsMXKXVK2U?TRz5V^_{Uw6C-RKUb76boe9ldt zhm!ne{FInqsuN*Xy@6&Lej0KaEt3&8^<|+NP2hvFGUx&Wk=~h+EE1uX)27VBQqzGk zyn1Gg0GD}C^1y#z0!b_kBK7(}!;blyu_-h+g=Hi2K%azW{%3P`I5(U;sw)XA#*?6|w|Zax*OG$d4$b;m{WdgP$9X?p(3`+A zrT%LMnnfn>{)p_*Z1VY^LNhC=y#naGvVhj6=AhZW8j&7lW_~E?$|3%jmj#&XFW{Ai zbGS^1kk`BNwncZq<+2?#cp$pBpeYA8qhe*m^FjO zq6NA#te6R-v5_FKJ(l1c8;#wB=G}?0*q5D(_2~&1*Q2uCE=YF~U<>5FRu%~k%mzao;rya+#UBP?$rIBza+PC57EOET?T zx&q}NUQQ(}r{M?cdqVTi8?*6fX$(GC9DqCJvJzj6nlttxLHK@Y9PX9G;V!rL%LvU? 
z4t%mQk>M8LhfVqTh_HR9#=&!w@VJa{P@0AZ2AZjR#j3=KG9fwf z31~LDP2J)BZ=c>?n}fnC-UWkcj1meE&+ zU*6W8I7(=jQD{9feVSYC2hBLBc7gTPO}ecRpXTPESw^hn)wRUu6yaO!LTT|5$O@pt zh7VVrQ>k|&M~&fcse_WQlQCt=49uSG32y?ibaDdxL*eJEXC#s5C-XEf@btk#!mk{c z(P&xq)0@w0;e0~1dJPw5#73|Hp>pk zZQFZb?Bt1viI2t7WlOPT%U0~&cK|2(ybrS7-LiEPrcIly&Yd)FtZMf!DW7^{cU<;D z+$CJLgk@6ZY9`d#?Tg3-%gsstcb#istm$tfyZsZ{ZKqwrNdn9Ynvc`=u(Rm2k*a?3 z-eMEg_f(8C?Y>C+HS6-O7HMHUb4yD`RfAnKtZ!Gg2gsXrQ#rM80)^f7$57IJo#MYx9gZ5 zd1o0RAB$RufmgsxAZR2Ih%cwSdQ-4UfRNE;J+dsXG1x1xBp_%SM$_QwMo*mI84)Y< z%rN@zix15S&AI^p-+^ZJP%T*1t|k0WL9-55WGafTG|Ut^-*Pu-Jad$3BRCs4v=v?h zax#TWbI|NOPlaiViz=XUKdG=55Hn8dfjEI#WI?k!$AG<#tAuMHRt!pmd}zo#wga;0 zG!2sdx5&Vz`$)46OM$plG+W`;GRLffeu{ZRnS9a5tgUs8Nq=*4Rn+FCp;^j;ZV*gQ z`JX|vy#l!M>Pbj`Ld<`6Xl59ctdlQgZRx4H6Luriemlx^~G=+@*#rEbZp?6W;q}wHcg3p6aBPZ+!Y*nL0eXL8 z0uE-U5t`#MzK>ocLo{pTP`sEDh5eb)*pr`xB{Wdto{=2ri+}&ym-xryckt!jH8ijQ zQ9-W0g*cHFjQ5r=#-GS*)QV0gtL;@x?Q#_+{f_eD!=f z?$+|0>Nx(Fh)3l#(q*XxZUN^E$_35>&9YRsmW(u~C}@@$0d5ci1e%)z=Ude6V(#<) zHwezR2=6lI^M_^0c&j=U7x?>7iZ>2L&Bm^X`3M-$OD%WYk-8ax|+6Xn$%(|dQnkfU$tk36U#tYRhSz+>A10c$_sCJHR4m+62twUVj6twF$ z%?`|JS}m*15E-Zt$*X92&1-CqjCRY2wY0%iWu?yg;I5YHEs!jo-Oi&UI<)W1{Of@p z-Fu@?um0%MdjNX&=!5RvdZBBVp6J-V3p*n{(ZAmyHABwWF%wuPCt~u%X_!83HfBtn zjag64!Su;9F?;$vwHS$fv?A78a|(S*Dm12|Ua3h=_PKYl^sML{K4w z=z8;g^fv6$1{1BG+O_22OIcR%A_x&QF-Ba9j~Qa#Vf8Pi+t?N@SNZ%c+0S=}F_)C@zsRzEcA0u)my9M~IBtMh%am;a zMq?9=Ta@_nH`3OKo0A(fOIU9AW1X;==|8OBfo3&x24z+y)NO@f1}$kHG^4TYq5XXl zm_-83jc(AamI7y8H^3})U46%(+7TBp3*V@8bL$Sw^4oe=w&!Ig3207k&@3N8H8eD! 
zv5^Id4W{u|6Hryd<&;44C1%h?Ci1*Ws7|bD6jd*W8KQuU53ye0(Q2Ijm;{uV88YI` zMp_E6hFgh;<(|QK{1!An@i3nd&r#59Du@A5#?urc3z`L(pMYk`P*ZrVg6eY5kgb-6 zF)4^zZgEiemo%MY9#U#Fxp`EEg*vougm-*Z-*PmrQ(9o>Qj|-SZ zY6?4Er(8bxpL`5*lFvh4CCjLmNM38pR`Nv}xKqcD;>yKK*t2W5dM!U=`V35)I)w%> z3>D?2Shlnl&pf>XyLN6@ex*6N*%&cmIQsSK4UZOW(6v)ngoZ^RJ|zi(LH?LEaU3QQ ziV~+z#q$K`JqfYc6H920Cp2fKVRLp0rVs6}p@7C{B8|}6#7IJOEOzE4;>E%Y4DHYs z0n;brU;pv||Ni)0e7<89p?NV*WCq~vx?Fs{ZWX>+y9PgOtR*yO6PgnU&4l3t@%WYC z{L9`r{B3U%K6y3_*GdDl%KdSpG7w*^;s1o@U$+ySx8>vKtvLkeECt3tZO!Ddbo{z4 zMMLwpH2k_Fi|6JlpXZNOB;rnaj9L^yd?fFbCE;#G9AQ}$j|b&6@M`p#z`imS-#nLv zkLw81HT*79l*7;G01J7T1jCq4|1YoD-Z~A~Y8z^86&_ z_bjk{hX8$-`_~!A_o~uynQ(t3*$;;zW@Ag>O!)Tcj*iU#E?v8;Ig*2dq7f05irBbp zB*&&tKO)Ohuzv7cbWQb zujj7U+qzv*#*}r|`P$ZU&p6LE%L&b5P$eA8m}!HiuNMh4H(CI#?#(zgNc+GxL0mKp zFzfbNw;y(iv6WZ7u8z4FRte_>wr}2q_&5jr{JbHnZcdsw0fPq(QZv&1$s!MLvR<4?48WzVNL;VV!A*9!uhit@=;Ac24);Ua%t`PWHw;ro z48YJnJ)J~n64C~RC+i`_4;#ep4L@anln+`jrDzWe%1{P+LkANb$@ z^MB&s|NVdA$6x*h55N2df4cPm=l=8o&b;y#j-P!6Z(qHIHS0H^SMMHZ+qwl7%$AfV0Gp za^8nJtLm3kXDylftfcr#@`;Scc(}u|9-wkb;Q1`;G1-aF=seSIX6H}~!%$|F-QG5- zJfci{rls36yPSC~48i#(c}AosKKN;%Fq1I;R4HfWa7PX)@#h2yuN*(xN2E?r4Z6O7%VS^FIxA()Bu z2#vS{s0yW?lQBrLHTxn1#Z3yhb<7leRSBHE$R`V7Q`z(9x1_%0WnLPi_lKaFVH+bW za)l?)#QnJ#AaH^KpNSe3=5x}sn(#iVOiS73^WgQe>ZKT5Q+ZWS*J^0hGhsP3-vXFX zYmrP}r&bnZ9-X4pG=$76@eP#@f@}p4WTZzR*qxGseMwYm3U=mZAZW@sHRYW6agONG70<Sg#t=^FcCkVw5ihR6-AuIYV@bo7z%Sf{)N?WsnaCrp?`NS0CNcFHZILx*;1w7Dx`s$bvUSiklKy!q-YIB|sF%<`z}DeH)| zTT)l#w-_X8o9!R_7=vVFj*Cn^vqQ774>Iaxpjkc=QXdMj&63d~dA)4Hu-9dKyCjBR zR%mk?UI|m;X2G%@n03cnq-}~E7o%(ZBL$jeEA6k;NtqW~>eyx0H+l0TbL$>CZ~*&u z@506n>#?+^7Wvt^NKZ~hMoJpu9dYpY^~0Q5v+>l_r!aB+M75IOupz_Hw@+V!aaVL` z*Iv!fZr&WpDu-P=bs>oNQB&J>?jUb_+F;V88F0j;swLC|*m(-!Rumqdpg;VJN#Qn8 z9SaPUQRgTEwm@~9DjPxJaR`WvhrjZJ7MPC1u)#wxti1=er^Vs*f>gXkXnij`0#`D^ zaUdfKGY59Zg2CPJ+R|Lyt;@m3mAUv>=2EUn(yIYhCgT=6pto38ZdbDIRmsPJ1m~s6 zxVJO~cbBK(&dN;OST1umC#w8wI(3z5BBUG4RIUm)T*uWRUST|FSc*mfNLMTkH-%m;qj{5U&VAqLr*m%4F8xPlG%aH~gIr}<}H(o~H0YlkIY>z%Y 
zx+9P6<{NBZrA%K?WnFA70a^jGj6R>`zFjgy674VBc4Nnl4EsDGs}6{q96%{sugaVT z)(d1UJL5E!Jse#(q+)>m8fA7#>g8G5dy!hvQ0flrlGrr~+YZe#f~q;h&Psnw z_WW>XIg0h&16z--4To-;V4bK{yB`z{*qYLS8$(}#IC0Q*| zL$j70nBAdS8Va3$NspVMmCgXwypQ#7Kr_pOzFxx8>6YtN8Rj$Bv&Gn64_VJcL$d(0 z=t*ccOLCuLWj-z7ER~w!3P4MlmNKaFNA4|oB)_?Tp;1OTT+c8IFkXP(%j?Vcytr7}e1OF;mCjxrBIZPf5ov8vR|_ znaEr)7oBO8#9$34NUchU!lA`!xbkc@wwL5$YzGfa?$aG#Tzv)q`@!3|vw8_`6I4$p zd(%i}sCk;dC3Fci|FU}#{&pxCzwD33-wwnPnB(#HJt_Fx4nl2p2yPaK;s(L_R(UWU z)9d^BT&|&6 z&7eVOzDM0H&L%Wx5}0i>XZ#K{i{#kN#RQAO6x=D&t{r!Z1(+EZ8Lci(#d{@bxWsdg zB>H2g-*mj}JsmNl`=c{EEFC*_B{WY)Xjm+w99f8nOh2D1 z_^if5R``=8vHg5yb-q>4nnJTWF8@n-mYmQm zZWVtRnsvLQMyU9|glpOf(`FcBYKLZ#)LS#sEWoS>F~rtMJHYmf;YwSrj4tn`?G5Wb z?c$a5=T&F+)r%ML+U3h?p`%MYcH!(Br~L9j#h=VA8}H3V;KH3BVz72qqkdhQ%v)ix@{5 zq6yUUpRo&BD;(W6`NYN3`epJ$rUX-HMgib>JX&95{qs z2ajOyp~KV>Y}~#BYd3Ghx}E#+;=bc}@yHphCpf=+;sRbe)QEj&uHfmlo6x>P7eaFf zbm`auD{HIp2A`V%oqRi^+i(Ii0a@!DbzbQ#_qAl99pMZzgp>)VO=tNt<=aiJdf?<2 zO4)iTZx%3XXcl;t<2Kf@WWo@sy2UcBfmy_JDC5W5JTxEUH3VlV*Om=&L9>pBnxX4I z3C(W#AX`bVs&_8QeH%~Nf!UoYT&HH4>Z!n&m@4G3b&A zGfReY&tS%w1xB7IG%Ddb@kULjQInpXJwuzyocH7$zSjNZzLuIkz~7UeK2GAx%u&EB z4@^e;thiI=Uh0cU<=3pw)zWEogJw$tIHs{T1>6~L>pBw;18FUz8l*Bmb=jbZT%@=U{ad&8z zcxY(08_=dQ@%}wYp2~-#@;XHd`)@(B{eDVJi_6g3;`~G$T)?cMS^SU*xrA)};gR%M zb=t&8UtoArCiV0>A`Q>(&}=G}<(6T|4Vo$E3Q?4*khXL?%Zx~#v9ny7vd4I^gZSDd z8t?OsxbeZe*tc^V=1iZC?%jHzeaB8{(Xu7_3>*MQd@N!jA`lZ4gejwjptna01W%rb zm5z99PENzN^fc5)gkfmM_G;R*Av7i{;-hi0vH(|BmSbyv8fNt9tbqIW#bdyy*YN21 za=e=tg)^}W@L^d7zJGBwzFqe`;dUARy1Nj6+n<8J?vKIW55)3!0{(4p7Jk@}irZy` z<|Sd=Mi87s@#!;3`0M5+_?I2BD9RE7Gr@UB0e;+)haWcQ;M+}E`26_{Jg6glRwXFI zeM>$~6mFeQBswCX2uF5t<(n zlpmDGQgOt%jg#&t;|?iVMkkD~rWom(7_eepin>OBR!!@ZC_V;s5=AUdz{fXGeZVH2 zDVYac+%rTn+AMBA7cQLB9YrbgQfJ)ih&Iw9V@$MfqE**^L*`m_y|gDbsR0W4Utn3n z6hoy(n_2c*$Nva4YY1?)NxD6~nJ~>A%tsW7btR(va$+GfaP%z;- z$e|!RFf#{FM4Gp~lw-uND{JLS{TpCj?u9k|bWbekQ{TQ#Y;zBCITEYDZ-Bfqtx z7_TiW#iga?II<)UFD1vK%zq(5#|+1KcFrUYYT7qTt=Ng{)TI{&4I7Jb(`I81%WQC5 
z24b>GkXTfQ%*tnwR9J>?0|uz`WguYb3+u3(fb{Z#!+817QEm?rlJ{cGo?TeAa~D?b z-jCIXPT<*N4Onyh9Krbl*6pvyu9KIMQCQA!TBDszzu2)ORxhi=>*r~ES>B{PXxl0y z%IDBP7;7Lb8_BVAwA1<=mQ$-8Zce7Xc1hO-d%Lb|eG_O_Zl~(FOZIC_eNtgjx+BQ{ z7=J1A%>lEPt}l(Y5nlb)obi>gh4B;s(w33?gl!Ce{82rTeu<;9-#kVsTdwNw5oa9S zCGim($MdZ8ik&qwux!2~(gRnrJnoIgm3A>ErxVvb1}iz^Y&*Y+ z+@P5iQVM|h%r=E)9T$Nz7no_;DiPyt08gf6GNZ0WE?_>akxM?{J9pmlF4;FnJ9u(5(IU`TXR$^ZW~^_{=B+%{mNMCDXvnu&C4gZHHz9&H^jiPmUFk*GnaC zE-w@ms4}a|kCcte=Njt zyyl{R-!AAxp!Oa$hOiuoO_`Z^HaQ71XplPb{64K)p&}|6XO@=W^3o+ZxFi?BqlRK+ z$JTi7#BSiD5AfBd6?lJ93{FLQ;)B9;{P^PY_>SQG%hu)i>(0ga+rCu%eSaMOzBd;C zvO5X?Wp^&Vcv@z&2* zJ|Sp-RGCC*CMY|hnUKxv@A3S*r5PHUO9;zqVG|i;7HHOUG~bY^=0rt_&JpGti{l8( zB7!}^`ECILynu0&nKp_Ur{Z`u4c)s0?AMBua5_I4I~@zKJ7hMFCx@VD{uK0VE705p z6DLeVSa>`_qS6o=kqSpt3NkXX5l2Xl4ka)Vs3Rl9MI#B(QHg3vW$mKj8fiB2^z_9- zFMoJ0@Il|c1Jnqc%+V~q%rX!0tXVVE!ab^7icz)I8zXxebgw)7-&xn6lsQMPlWDs2 zR9#xCz2Se#Fe$NIvmTr2qWAiJPh=3+l922s^B z-S)ZK4)^v$(x%#Ho`05Yw%7|vryAj+&WPBC$P6E93GgEaacJKj=D}&?WoIcr-S!@B z)M%?rhbGII4;U~E3#1d|AB}K!m?C3R;7=%a#AhHbF$+snGU^;gXpV|V!0Z`w(S`R&e`*Hyr=?=Ozb9S}n1}6Q{y3D8fwXy^7{dQIC5GVE z(}gVmIe5VGFZJn81v{lOEhFpGeW_EdgZFE)al0-TcUG6;=GvurZ|w@4s4l|Fm|$cq zoQ+wdhhlKI4(Qrizr8Y$+`4sJ^yt+e!$(iTtOfoEj!H#ZW({(RRw1|IS!7i|kIdS& zNUK|qxazftuUm`Mx@R$P{8V1w674&8L)D6BvFBJlUfy>M8xNkqh6Bg3;owoM-G2b9 zckjcBUHh?o-(jraf6wx~7mn6r?Sa#T=8Fhso=H49w(EdC-MZn$)lcK~a|W7a@WTep zHeeQjR-?=cn&pje6BcM9cf0A5w&hxSR9Q;{vkaWcZFLfO+VD3Mf1krWgw z{?@Z}x~|CmRN<4#Y77>|d&^eO-eKedX06|cX1g(R8YNct|7W0C;wM`X%MPJf-@`o~ z=1W@bZ)}HVJ<=?dUOzuR8=h~6W@i`-S3$RoH1WBdVVTmB{U#&LJXX)V6=*iVEYi=x z{yel?^-GrznV(#M`4w(2H=M!imoDJg;X{}*Winc|X^nOr+AAZ`vu96?8$AkrdUr=R z8nbq7TN995V(yTk$oKQbiWI`vgz;)5s27zJ;Em&TCAe5qfYVir5jS-lMs#V7D+e~? 
z|GNJ+zTEOOu9c>sA=VG?DWguX{Ifng!xu@MoOM~txM#TPHrok?Wk%hMF*L?c%T>po zfM!Y-Dl*WljH0xCl>W9s?r+6M?!~-N%MBAWF31S7K!>uEavsyKI^MedmG)45l*Ikh z#|5H~VcoOOq9{KXxf!WgFn11Gw`|4d(i;8x4%Q#{g(VV@(-FgGEWn(Unya<~%?6fb zE5E}c6A>5`gAhWpj691%3C$6~ajLv{&i6%^R&B9h;83g#4a3vHet0<{5?hnvu+%#M z^M{PV(AI76LUIJ|?RXxypI?F7tLpIn)5Umu*%Gz<_SGBq0r{g_X#wUWKCNFQRD01{Bq9 zL2l&+q*XnS)S5L&U-}}_S8PPWQfkE}(<~3pSO(G>b6{fSOc;hm^C!ZeJeb$#9mD z{Xe_Ze@0eYAyTdymYnxxyq}cVS8ZdJePcdqiCZx1sZDyM*%EE1aGUCslj~&S1_9>! zGl$i_o-UYiqCHjh-{#U{hi1D>n<4Qx_1~EXmiK`g!KPZ6Q7_qQ@gh|aq@XZBlSvIs zoXT8AfYJ}wdcanq#eKt-@FyjZY~OzpoOustCNo2<22<@@k~kT_{9|GoELe7j<|CB+ z=Kod!)dfhWxB0LvXqJZ|1;QGc&<|hk^V?F_oV1{sJy;FR`ku{0vz50-48z<@3Y;G8 zbwTm($zFL3Fv~}b8vJxY5nyxPi~FwQmaO}VT!zb~=0Mpl1DPTn2Np~*NH(L}6q=iY z^FN1XQ)X=GRpN1VKlixEbXu;FW}eS{5@PGz&y)5MlihL(a-9GGS?1`R$ZEd#mg zo<^=?2LYva>SM3r0|%gMhjwU5NsxW@UYy0d!_QKIhKl}WDZ$t296R!c(H4AS==&2vPAYZCp0He+AZT+A)&c2 zk+4jNUKEF$iwNM02;liVuaM^ulCSYT@0B_5I`4aqX+KsJf&Cf&I27-NlWAe7@tuKQ zty-XS$Bvl()NJL`92}W~z_4WHzZ{!DfQpVoOk^^FIYEK4DAtjpz&R|`q1-YAmd$9h z*MdO!`UPX=toi8Ie=yp%ZLgrYb?a8>*RKyYZhQ%s*vXP|ctU)c+3pxuBE3?nmbCja zf4TFwDzC2k+O*C-A=U!{Wm5+T%+?A^Hy1N-VJCS`@kh{!nQvmD0fE%S97Kz77u5~9-)5uM^T z+AIo-Oj0fx0?YvlnjM7Zc!U$aef`4;iXAboXKz$QM4&X-7fJJ`Dz}O09lK*pzmXW) zvkztu>5Fvl8OWSNu{}tdJ`u5#Cm@O)tSFuvI${vK`}e}Mt{pMHQ+xDj*;0L=C!@(7 zI&?eeD}={n@ptwVOr zdZgE^MefrtBAe%BRy;$P-h`rOcO&EJZ7A7r0AA4<%$xRT)uJ5|64S8%XahFyJBf|^ z&tN^ZVc#iiAUMCc_b6W2eF)F*JcL#I4r0~eBUp3fIMyFJfo+FQVakm8x=oZhxLF5w zZ+clhe<=?aPtyLL)GinXn(GP6q6Px3`UsF^M7>e8m5tw%)EfoMmK5KxY{;8;*GQ?j z8=R*EoEZlxgAym3oOakIm;DufaoIO*kENbzNY?Ny_0VPi^tCR2ax$GTM}uqM$O(%! 
zH9g0Ep0bZ7p2}wNfAw3}VV87#?b3sp%r7#KofXZ@ZD#g1Jp!#ey~0fU#foGasTldyW7;Cibf9`|aJalbZ&&`f|{nuUjT zIe5G*ACH$V!h^bOLUStaR1;`xQ*orI9$zl;Cep8%1^}IJRVz2U@nctRf6(6Wzl%OG8X4+;&Hqz3j1>i%?S%}A}s{9 zeuU=ME!14evP5%8m;*tCX1|aGL=&1F3GsxdI7AboWQ19OS+?TWoG7pDV^h_9yguHR zdxn?gp0U6e!$*ukr_S97%pD2N9+)<58uskjg>&Z`*?D7GW%&_MmIZvo@7{p3ksAb9 zP24j~yW_kk%ej2?Cw172G^;me z{QY~-e2)L=kBV7GEV-b$-U-ci_l)M!>@NMkvVCInthQObk+tHj;%?hl|BIdAeI@%Ku2eD$`UOad32wpyX5-;yQjG-gOS9a@#I-|Xtsd4o-o@$m_2(&T=4j;t>^wnrQVpct#?8*%Z}K71I;pJsrjhQ_%)kO z*zZBJvi~x(ms)s@wn@RWvBQ>~Q}x3oWB-(Wwj>vj1+&G?uXBHSr zdQEC>WoIGaafYk^=Wn&|I<8(nn+QunENPbjEw@C;`RY1%$!icD zx%Nc@tPFz>q*z`W2sRxL@Ze6T3-GE$>TyuO~#TR=e@c3!U_ zf%#{328(msq*)pfo-d8RltBq!eAuPK@Z2*i@zj(_XxpkK z1`X_oVX}f?hmM#sc?vw*wO4+v0?uLpM-A?ep}l)5Xr4P_Ft(TG;qtOVTzt9|c>(j# zoyIgPF$DkXtE;%Zz6@8(l5mO8{Cai@e%`beKX0hTUtccAPaE>_*DX2tm+e`E=nQ=R zOf(+VMB-6Z0v=b%%3>LW)Kpw8jK%wlq6wpf%t}J8f@WE|H=Up?@SMSY8BI>XjcP(F z!S_~mvV!Itgk@PQ;Z9YC@^`*fA^TK?e9V`i=4ifOl7L5q=f{NMk1CSzv8?`AF4N2< z;J$LOCGmvq1Y9qQ!@Jynh4(vN5skfh{y3bx5GON2Q0_AYJzKRv zmoDwq63t=Zi3Fe&_!D}jL>bR;KdsZnQdnXB11 z2wvU+2nY4zBH3s8q*VLL`f*M=m6oKPQSA!hS7fip#>ZC~Zc4d4sq0vln@C_+8!%}ixvzQ?r1o`Pa&67NTqEs~w7I%HGWAV^3%5+uMcS~71ZDviNuPeruIrlt z3>RqV<8qFGg4i}S;w5c@Y8QB1#5!oCU)3|-a*f1=$HX>>zw;{>&mlG{Qaz(~?K;7S z&oP3}I6NwafSiHgPyy&<<IIVLD3zwao1u#%n^t3EN?b2Zf{@m|}QNxE|^oZdYIdV8g zjU2^tG#VqvOn`S_C}NY-(4_~7*}3aA7#;hJK>vxJ z2+CWAB`+UC=}QNZzw`xk=qhu`x59*JGqI6iyy@^6tlKZJEI!Mp6fm#be;jM~AH^DO zSM58Dm3t3h_1=TncJwsXytDp~(D;xwG_&AIW?Rykr!&c9rYlq3|7&`V{5BCbz}!@BiK0w+b}49PLe1#49uby| zF$tFaC$H0mNFJX119Vf9_}Z^)Dh0Z{uM?JyYz?@1&F_+cw5}|su-MaK!Y}|RGSJ*8 zhD43Tx(rxzQWc0s#SXDe^~76;rupucR`Z+TQ&K5qJ$Ol9({POhnvEZ^JXgD+Rk93i zQwcOb3Cu?Ny5@md#C&1i=}Kcyk3h4G7B}#E6E9cX)abMuG_xGAzuJc@dcygR&RHoSCRz{blUa0&~TZZ$5q+;uGYcHG9u!4QJn%u7!*5{|A zv3e0MudYC8ls|fPXo+M;0RG!2@8Ih5`2@vSTuKbayM-C}aocnFWm64)Si2aX)F$Bb zx;T8jBA$xH$5oN|q}qW`WsYP4;+hOzn~Znz37Yv10%;;aHx&;F(~qk&2-umpuRb=) z#63zr^1DsYm62(gEA}?w_m)x$?$!{LtHou55G|w7gzr0QF_UE6BQ)P5s6HYze_Wod 
z;Q3*x0CNH!R3_sQulbnq`D9r-;hEv`e1T?(Pg7u)V|T1o{;riK<3ob;jUs~dVwo`` zhG0x!E|0;R#nE_|zwhKZ@Gwt*9C)o*mfL2$3KDURVZBkEf|Hdo*pnZC!ztc4l@*F& z?@8#veDB(|1Ln^6MR*v&CoC2Ip^1bb!cuIy@@#-fEuUzCC+k_xA00{mLbl50)dg(;9;HB0OUBwGq}v zUx6|Wx|y3+?JIB=L(lTYI&t0tX4xtb{talh*IfY;O_w*(dv*Q$&pg=e{G@c+Fk7!vcJ3)XsLqT&)Y&N1D@^PRZ zg^uP{R+5Z}OhH&gA|fIZ)CZ#>L6K^u!j^aZ$~4C|b4_i}-(L*-FIaEP-ESJZ8-I!N{@G(7oqiv~J%Utvvdo zb(`K8G)z7Y3_)a82{NDAg#3-ikiFpuieEZ{>KArmz)*e90YgUO`ImQK$FWAdNN8Rw z;C$eu0_Ju5j}e}a;e`Xo@yx!Xc$#{4_d#qqbOOs(KaY;A2W?ulMq5Jj?3pug@~F6F z@>$CO7R!Q~9plsm9uvt_b27rL>xGjUEIG@Df@ZeQQlA8xdA_P2P8%+E@Px%t!c_V{ zr(NZ_>cdlBe^I?TblH3*6_$ZzBkNifwZ^wZvpAZveGRdBK zCPVEq$Ltv;zfFVPV-{;jJLhfay4E$Q%xp4DXAbjV}28yo5a`1{O`(KUN84j zjdl}RjkzI2!JJV>- z8zqUSuG}Jfn06UxRwK=Prf!C|iLtfI1jeT+*5(FOR6W_ zTz)QvRpi@&c(gTY}_oB5Au?5kAQxsL?p9N5L8R!mH+asDu%Z-4Ubl3@^k(!#XMxAB!dG1^vbn4U-UApunH1|U57A>)~ zx*Bi4@w!@ZQh_1cYGY(IG)q}w8CJ*SKMlxDfmtMFO$`JHFk9`JhGA0|e!CtUXf`9w zB0DtO-!9sL*==98>OR(ex&MC~nq`#l4?=TOaF#bc>Z3w2-c(a))+5bYVh_%<4G}45 zK6w;ZE}TOxpO1#-_Lxpcm1zZ~LzI%9M_`U)hbaM$_!NR_22#@U3D4Qe$2mSJ8?gx) z+{)}4>_AcR@hmT~$?RYyv3w*EoZ~QL$S~!iB}=FG?9mNK5%v~1HI zZ3xl*M$be@ZVhtR9YFSrhfuQdD3)(Igh{h~)IB?Q?}Mt9Yp~<^S-hx5m``KvK|(V@ zSp)Mitl|DDLi98HkE#y?H*#B2wG=H{d7!;Vd$eiU8UcR(tOE^dP(#hS!cOA3Glb?d z=lM*{2Y`)g6j`3JX?uZ2?j-5emgVA)99 zW^=!$($MSzW|6%fiBvhIob1BuT((5V&HQ$gWp@Oal^AdP|0cZVCCtG3;3ij|+U=Rh zJulqz#0{8LS{R1dXx1rXr=4|`GWSsYWV5AiVjP=tL34wl5>LG^Huj_*Y1YteJs@7B zdd5=u&4qGNGcpfN-(2JZB-Nv`{#WNvCX=17x2#Ms)6AL1Q-hFe>l{}99sM>myX1P9 zcJW%1PRbM#H`yMnOH7aTzx{dzSC(OMlUu{Cq1gq_u94=Zz^tBwl@4d&u*+tQnv*s# z)^C;1wrBK&bmejVZj#5$FMGO8MN<3xPp8AxK)8qPg67{`me*!%ouZM{(9BK()6gt5 zGan2fH(NpF8C%cVUQQ)G&Nx|(jmSNZ^@z2aw#PQorKHyP=lN>HnZT?@n(GxfOWc%i zwtzDKQ?P7ABQCE9#V6YBzKov&W>sgJFPBn}ob^gQ7yZ9H2l17>L`atH<+C!1aSCq{ zpc_vd#pKbWF?`@a^zGeK8*BNfheowq>y{YSr2}f?qHwmR5U;GR#51YU=tpP{T`(2D zUT?&Q&*k8HSu9>j3FI~nKW|-*?_Vmx*DvJZlND+BY*_|AsZPZwRjK&ACLNzv5mJ|C z;)~^33Vz=cU?!Bmmz#{MOVV+(BEy;)LtHInCJY(%l{t{33A3?kRlS=fiG=86+^$T+ 
zz1mFNTbhNN)oHk1or2p0=Q}IXaeH|>u9hd_9fIq-1yQ(0puJug4O#3&MwP#QHV5CV z&Bs@3vhn$|S!(24mf*f#=-|BsfO!vrVex4ea8?VMhM#N#S{Z2z6mtlK50;iaSv7hskJPNZ#c z<%QHkZG7#r|F7H_S$~~2lQPiU^qU+PH0%Gwl}uJ3eDQ^~@MtTu-L%AniPQL;li}+d ziusug{49#9S0jOtKVtk8bnZ16 zeTPm$-x1R>aNInMnjVCyUa^?&6OZY>@d!*)W7X`_63ZS|c$&foG%{hpMBFLlv%Z@UyUlNIc?#K5uvW|1A5jU?_SZqBsWC2dCo!*JSqeT}GTSk21H z%jTrQw%yB~7fvw@XT3G)u+k;2C#Kynv)m~AX|ak(UBhFh4YW)CxATqtI+49&VZW}0 zJVfPZEE%dYl?h_yHlXH~P<4?`tcGSeZf@ibNTy(#9$9nHtTKcVl?)J=5uZ{GLd*cC z1hDk~F1ZrvOso_X#<{8N4VZ=}iG##ZXG~LQZcfgOYbq-f-OlGdpD-?znGHgJH0{49UXKIqiD!;74#OJDf*HtBwcuV}X zOdgwjHWHsoHJWQL?3SU`1>fD+I^{ITx;(f`u9f`J`LD~TkvtzcZsKk$pOp3`w&J83 z8{UI@GM^1Ji>zts2=V6>H1m7|&Bx9VoDDQH&E}c&e}QK;>dfQjIW&Ex_D7)E-cYI! z8JKQ)1$;?f^--4_>v4s8m3oEXe1#3(+gC2(=$`iy?UxSW7~;~K_f$$S_C?IcCa+0&Y2Fy@3!aY4nIU+qAO?S0 z9Eo>JVsNd3AWZPSE+6<+Cg6)_^6>R@i}3j~S@>k7Ok?AUhFnRVL$fX%zP6_~SsT4~}F-A#K`NzvbR@)Qa7%!;=4_VvntS!^kM`}lqCoqA*_X7Nyx{$%t!+l;e|LXVmzn~z zNRFwzv)XZKPcAezs8s`d_v)=Wu(FhTQc@NsOqh-yJ^G?oufFhT*A6ZDEL*l}jg~Fj zqHWtwXw#-M_r=A^10Ejj(axhCI&|oWj-9&j`FBRAPF<;P=+wC@k99`db{;B>5hI7g z%X=YVdma`p@PyYwFL-(`VEI`{d12PfIfUiem^pnmW=)@iiQ}drA|eH=*6vi0oL^k4 z79|-xY#hp$twUznYNVGugC%v(BOoG{;M*1>Mo&hF_*chd!7DTa-cbeci!DV^ay7!U zmLVp8B_eX_5Szag8AZ$C6Bvi?J%^)3Yj*bfPJ*L!Es6-u`7a*8(p@JJQ&@!-s#DP# z0Z|DGn%5mTg%=KLXnx@kp_zJK=4U1}KX>@F8f{*=We>I3$sXFU>TA!PurmFnXUt7@I`&h%`&5=AC!)l zl9VycGnaUte5|d@g&dc+`8H@)*Yh~TkdNM6_Wrk_S%bH}ueN0_i65@=X=dFqwpDD5 zOpR*~r#WcW`aNjowH7cdsrsVTd{}n5+c3L~J+t4-&ON`EC%Wxm_^R!(q-?2(`9T@} zPQhrH;w<3FvqYZovIMIOB)&4GK~Ui;zvv4(KO%wBXU)SC-cJ? zBT0k(Jp1*kM=WqCvMRSRv}(VJv31HAW2UKT*)Z3t4@4#ZZO_ChDSs+H^4!g{7ig9^ zF-&Q2lz5M3@>oGL<6+kGS8MtU1Snu;I-JnV2KczV0&D`!l!j*OdGPt@N^ZY5!%^|! 
z`HXKud4X=PL2tvI2ZwosQ3!rsJd9WISG$N=PNBuE@v7%ZhNTBp)B9bv7PUW#PWKW0Ym#?Zv5ROm$$hzZYt!jYHMc5vY1<7)mD&z>=|jQ8{T4o}Mup zoBd|ubW#xBWmwnC(h2dh*hxBW)+FMlwPN5W&t&6E!tz(E^VO*Gjp7(wEsP{A6PSw~ zYF_96KW+a3oyC>zd82@GKmv)JbIt-Gp`3Hhfh0gkAVf9>8-u|S8=NC1BZNe@-FCa% zZMQRy-F;4XpE-T{^a*{!%sDf6=H4~meD}NSTlate&$Da4Rqrd|cF)|=TK`q0daHKr zz3X|N{i|KOc6}y8GGrXHe27H8;MdVa&~3&STbo?9%a~*W@D>8L^f1e?jcxV#gk_Ky z86~~XI?B19^fEipys1i$`HH~&%@&GF>9PK4cRj9dE5qsaIe4Qv3ug$;Rdc85XUyYy z^OBoiL1=D9VPQS;imFjwS&zEfW>i+xp}4df1^H#j$}GT|hmP9$@y_ zq9>;LVk>P{{o3(mpFaosFC{~$eDLduQ-RQIeck(xhVM9T|6$N9d6l@xIHk~R8e461 zSTN+$xt>X<-lg^$}Qd~*s&BDZqQ!shrWXzl~ z3sa^{$As}yFmb|EOq#^?iBmB=Hco+C(w{Po(43i$m8(`_RbnDCGqaJAk&X2995gkp zMNwfnGSYK3-IU~XELo6%%=A(me)MJRK710p4nB$rljk5RW-PXLKY-1sa`-MAoaHpVZNZz39 z6Swn3=AKc@H7vO+$yOC6utpi_W2mJ51jcLfay^V`DHgb`jATQ8tfjJPG zrCtPrY;*Zu0dvmd7?W#MJf3yv-=Ij@Z9V*H4OY)0TQ4Htoi?{w!}ivnSGEzh zW10gsSYs>FGRyBQlWA04+c5J@+jE#};U_1b=64sE)sDDLz%vk9z4h!9^Wj3Xth4^L z@n)S|kZUWTECG3to@Ph3@z=6A=QO&Y@^73?JqJqb&v|+ylAeH@3}!L~-5oXn0?cm_ zn19rN6>n40<1Fj9S?51{^A1iw{~Xpe)M5Fe`G}GaqOehl9ySckS*f^qUni~|+J-mw zY(?_qF<1~EiLWj{kMEv6hm2Szj|N`etrK&{N~^W{N{l+ z{Q9BI_|-%0`07wQ-rv)PJKNUd*0v3}*4Bs%4P|()I0J{%mZ4+uEHuoRfU*g3$Q~Ji zq_82#ijBa!g)?w8cO_n{&A^rQCAis9gZFl?#YgvV#^=;$`#0j#-K_-XMtmwmIJ)K2 zzJ%3%HTccZ4fw~WcHq~C1j-xm-EP7#Az8*P%L{#I)Oq(nZ5VLl`=Y*;X&B4ZFcVkFATE6~F@`_xlU>For%3^`o%KD=9P3xbslpb~)>onf+ z99kCf-9I52o*{mF{_Hu-o+Y0;mnRP6(b%vSyLRr!gAW|T-aQ8}cI*U#a~vKxbOet+ zaso#mdIHBEehQC0dI~3>egT`>wjw(t4`qbtg1i#s<(42PTZU;ABZD_tOO~!gc!Z26 zCX`Q~iHxjlB(7eKq-67%_3WHNB=bhBmN#gnrM29akMxXOq!OH0EKA1f#9SOY{u1^) z^c?m*bP|h~rxL=W(7Lf3-48s4P5T}}_tB@ZzH1N0OrDDAv*#f+JQkTbm8f602U!){ zklT0wxoZz2Z{txEY&wF%%@3e>%ORAn-;J``%~-W67jf}35jA=~)>L+&^GQPU<7d!u z>MVAgd=;Z6PuKj9n>HH{vdmAO>GeUg^e`J>Hqd;8(0qi@{2-zE=uJ7@DrD;13S&HtrKM#%5s1OW|3h=$)DUHvs_$yubI_g{%M}v4%}C7 zAoOE2S6yiKHsab7sG@^2A+2OC1<>?uL`U{Q=*fH+Oft+RD@4bOQnHku3!4EmwTiq@ zAr)buSs}(&fFBf@3a~QUOrEXeeD~^U_J?Nww2W+i-Tk~5Rq|^usyE1b|M%M_brP5d 
z3F2~*eE{V`v*~eWy|7%yI81%3G*7PddT7vg<##T}po~$8SO%Y14#|Vo!w=J7_3TQF z(io<>qw_Q(p8E63f}T$6!aN41?+2-aiglp-2HJ+_n%TPZNw&NBbgz3a?UH%m_43(0 z*L551(N8?De^0Zl6G-I6Bo8=?-I^Bx4M2DJ*GpYN!`w|g4#?QX(vcO$;+B0TE{Od1uM?G$qY&0i3j zza%t&DWJTy5ns1A5_p^N^%j9<0&{ygA(%_~^50eg<|=%=trA}loWI&C(9HGCRrpLK zz0z9*v+?KvDq1qBD|2W%oEBe`vuei8ERx8KIzz58_kRVx#W zZP8}6mznS^&}_h&Qd=W{I#V?HY);~s7nVJ;FzM%8^cA&98!KpEGPFW`!#iiJzwf&C zZ*NlNk+PX9eV{ZwpMEGbyYbM*Bo~+s)C9(ETRto6i-G&1~ z$8li)F&sO30>>VH5~YPz$jd3wkKN>Emr!Md<}zet6k_GdR16<3A5a;FiG^O1?T@M_`{1q!PXTf5O89x~_=PbecE%zb6dK>av4j_NS5#+TU zMgFFT2+fC4wB<0Wwj4xJRU5LhDll=v^~O*s8^tw*ZnHDo9jlq5ZJvWpW0>{O}nwzTRVto zODcs+1C}z6z7Lp1dJKPapjp~^o$sHvBHIP9{XmiD)N?p(xGULa+PNPh@k#%5jVv^K zZC_znfw|8qOMo0`_8p_`j%iD!U9}A5wz0DGuFG%VBj(4-FTeB|!!=wEs1M`w_k?Dr zeV05aG;0sB%b!->bNgw~e9JMCjzY6IvTeW_ykseXj2Ty0paJQ$IkIHHkfkeAFcT1* z;NVDJax!!BI?&7l=-6q=uo#C_j07(ptK+?7f2RTh<$})O&@7U3`ry$cH*aE4ZaV(r zZ@&DyhUXK@?=>`Axr1(#a#7wq3>3hXG(0v2#!gC(yIvB}y z6j>)0XtuhlcIl33HZr~7gNY6G0BGh?!I||*Nl&;u`tw4w^f(JJt9*56{cT{QtO1*O z|H07ArTr3TAT-}*uzTxz9|OlFyvglval1S^R2$`Quf6yjRxUDAJJqBm;oR=c=-aym zy$5%qE^!5tW=+AnXP!gf(cO5qr2?ndH>08m6e%hXo54IKI=Ut*ge6ph$pY1BcS9@#m zmxtHlmj|2iSyvVA%Dl{b2*NUlb9WQI+R?1wY@zvc8OyxAQUAZ&$$hppfF@yZI#+Xb2{3C5KQ$e*^?;-d}%b%%T{P(FSlW2U*@Rf7+A z)$?YcfzVun3ys;h(vXMKys5~V7?0tspJA+@VIg4{HdMxLI=#rOzu=%?eW`E6$k7-( zb|R)uosK!P=V9*L1(-i?0T#$hTT7M_mKS5@%vm}%_W1GR$jz0{4-Y}&%2f)Xy<)Sp zD}5S8NVecC|MzhzpPs#{@XY#Xx z`)S)??B55?T)!7Ii%c)G=!ZeG%;~K2<$7Y051MKJqQKZ~%Wq|UG0;C{De+6vw2!;} zJ6_CjD(S(uv5r0`(i?OBf4#3C1$hN}y<%hIQC?P~x9lJ%WcGe3E(`k3Mn|O${6L!#24D=gjmX0(1qkGE0!2UVy|^=}dR5+ST|8 zQx%$%l2ehAl8LPB0{seQVp29r%UiI$`yjG&E0IQUPL|KHr{-h++*R1R`#26hc?R7N zoI-6=J3_saBe84PX`hn1Fh6)+`@%Bm&Y`pxB4wC z`$IGH0%f zDOjJJh{h!gP%&pRQl`gadh{@imX`^o;isks4Z-S>!_hi#1`ekt;B;jk-rih?&vp__ z_Y+JHti|Vh>+#8+T6}h(9$y?JyxvbheXs>zQC~mMg1@-G9>3XJg}sS-CYe70lAy$@8ml5#SVdG zGcU81fo4K=Cm~tHn}F}O5}FCj-)^nP*97BlI?C|f=5lhQxVKKvk zgD_-R2q8HN(NQCHIL4g03$biT0#>g|MQU0uar3pqok;uDkCTr 
z5T07FdE*w09ARcgS-mO|mw7WM{vxr&#BP7d&<%m+zAGZyv5a|^SSF8P9RN`E(GI!I z3(dNpNE=Wi^-J2f9UdXD?)6>UAz?w;!m<~d6`Vx@(Co?!%{*U#3^X%eOi$w?ec8w# zm_>oH`_F}DuMgVzrSaO4O~>50JNr5DPs)4e)-7z>xJj>9XlNwza*K6tWC3P*;d9F5 z*$5@{RaVsF(81#h&G+p)h6fKluKz#&*eP`H*pK9-3=|erAe*o(Ja6w`VwDiPAQTy3Xx7NAj~#zIe^2b zE~D$vNo;K2hmj*EVaDtP>_7TEVfhv8d-62)Jn{lsI(K30^qE+dyap2{PDK0mUD&+y zLF6~@Bs4#$;9SCG*_H=U+;J3@TMwgd!)|2fRby#F7NW<_#mv-7v^*#?Xk0}5iE}vc z;uWOjnh(GX35q~wSsk9aa1F;~1`Pu8BWEt-81=A1^A*DLWgL0=4IF>zG|K8*=vQHc z<_O62i<>uWBpmjm=lmt~oaJ?5n``1alZ#dW14>+G@G{DD+7s4o5%X>EI?jcw2}@IxuzSLNon~vRq=@Rf#9wI?)@OL4f&tp*c{V{K|2D z(0qj%6=?Q?vwRprh73rc)i2$MY{7Ip4NQ#8kpyEE_cKQ61sq!#djaGa2|{9|0g&Q% zoRR%a0;oU8F`qo27bLBGUmH{nOv?`tGryXPYifUcJyP!O%c6gC@&l z#BGB~?`#91nRYt>nz>{hxzH@4KXIFNY$LZZw(@UqUHkLOThAgH;$&^@7VB1^Sit#? zKy*I?AAwn4SYf>hOv@d-Jf6RF?sZ%^`x(<4H#JU$9}lg6NS&U7>_oQ00ni*dX(4X4)?;}+rcrv%kc zb~oa)9ku%Py)U?6(4rj;AaHdpKq(i$Andx3;FZzW_-D4Ek5gR zz~{T<6Wq=E;gipI)u}$)DMKZy^+iBCwpoU3d`W%1ji9X1EU?VHQ1TMrmz~V())M?; zQyG5I#&trn^dNuQS%ELMR_IHD(#!n*wmSTHM;+eVQG@pg?myjLg`ad+;T^*MTLijh=uxGZ!OiRl4>778I1Cw6Frjh1DpKFT)j9 zp`@^quw0EwfwtW?O#e80}$p z(y~2O68G#_aVhhy8#i$Hfx`$3jli%W!AMEYK-*u&ntBSyk3EHxCtt$)bz3oN)Fdoeu!5jmi2OX>MCHo(=3=C*$;7Og^Aws# zj*LTgb`CPqvyhgWjUw8V%>SL7T7)%eB{=xd36$4vKw5q!lCw*Yky(sIi;_@Ow-ra9 z?M3(D=g_(9FlNkHf{3UI*tYL69C+dl?0w=ic0cwKI+(w4v*sWrGaVBrk4Mw`4LC%o z&ui+`m;Z_h%;lROMCF#lDBk)oDmxxROUFUv7S&-T!98Z;0*qdgiR!&4(EijVY1RiJ5sABD0LJ zUMGwhi%~u=`tZv4uAWN%m-I}Ua-0s&us~!Q^UJ1zm+j271On-;(6^`g zyFjQBGo}CYSlQ26TF^0%qFD$nm>8f5Eav3A0FtI&1Or0~*lHL8HU?Az;mCrn239A} z+5^a?eJrPQhjOU^%BB2fE7r}2fk(Tv<(F}`S}t8mK262z8=lS`mpX>6Q0ISB(LA5` zqEp5Z%*EFsC~{}SV6S6a8QkUWQG4K7COx<2!38R5tdEOQ zCvsCr*pNnC{udFN1*|m~Q__g{X@E98%`&OJ?P>0%?f3F}st7N=^Z{AF%Iz|y&NR4g zy=l3ace7+Y+d#m)>i>?NX>g>S3Ooxm%leHz271={Z92P|t%hk6Jg%I316R(U!P>?; z1QFm$RxQJY?#(!}r2%Jmv|)W(66TDJ$IP)Kuz2D~q)d%N-SXKuQeT9NJKOO2u><&v zCmtZscH@itH{;6#8}Ko~@n>5K@w3f&_;5=;KItqbpqAj99ToU$dl|kZO#Wh5jalxg zqZ)KR3Cayzm!*N`&v%*W=sxWtI1`#>x;X>PE%>&xh0xqeSY{qN 
zDCuo(ug90|rTB775x(A3jIY{?@#QA|-(H5h?IrkfGnbny@Y$9cd`QUt>9z(1<{yi? zYVdA%E#6`I-r3uLpX_hJP43^@kcVq^1$ePM8;Nl-`Xb=?F{811O@_{(A)i4mEvZ9A zaSbYqYYD^kC@ZhmSu`rkYgN^ib*QawKpoXs--70*Rx~xX;Dr}n#*G`d5fKs1YdH+5 z$?`aw@k#xJOUleSOVN(aY(?}j+M9U^P&xo;O9H{Vq}|93)>ZzmKtS8mbyxbE>@STE zZBXN-q;2Cc?OEFliSr`yYl$&dVu#k(H6Dro@PCEldx<_m+r28oG}PyKoWxQEXqHK0 zD*rca0d1W=VkNyzI$MDwDWk+8^)rw9M9iN^+a>1D*A_}%rA#cV4mEWAPlstp`Slns znTEt0>&G%umTX|zup-&;kjG2s~L&FgfIRc{zmXjvU!kpQQk(MU2W8@ONmmwrH z9MRD+1mSFC5+>7Ab5LAVgX)@fNKG$BLQ)RyJNyJ%+q#jNTZxREa%5x?nwPFYdVU>_ zKXVcLkH3uV_dSXQOC(QWXxgwH`yYQ5T~A)b_7fMd?dU1YUYUaV%a>vCvPH-#$j2k6 zUO-9fHk1*XOWGbl+2+I411N2O5LH``qIuhK6jpD+s?>Z;n6&`0(-xtGzioZ;GCH5W zf-R4{gdO`2;*$?Q!0b8mF?(?W4nO@8o_M22hh_*c%Zq@=&ziB!kGy^XkG^^a&z!x8 zdCOO7or+ppT5#jqbzC^Zn0KBrSf-rgG1t$tUDRQc4qNa#YW$@Pc+qz*_49Yi>Dam< zZ7KKF>&~UVU?y=`d_e7sV%wv=yi8ARkLl>RT5XGY9PLv4P=T8J>Cjg$Z#YY}C;3g% zm36f(V_)Xx(<|CKkMqj3dy<}{FL{*oWGUOZF8=K-^~FAcXJ1dV7oKgo%>Lds)Z6CD z7+;Zho4*2oV_H_eeAA~S?<|kXS=Xmzx=rdOK=ehCGAQd*8H#BdNghq!wXJlnk5}e* zx87aJ{};_t_FycdZV&j+v~9w+;2FMuBBO2&73|F`Lz zpy^3d)0K22J<|hdd-v?Q^n9!oA2c(60wBJg4GEHt?EhBYb88^8`EcaTqYlz%KJ+d| z4Ar@~TG85=aQi(Zzj=Y#D;c*fHwW@kOfNx8A=q3BZ>rM}^zs&VAjh<@{CXl#wW~Bj$#_JEw#`wJQ zzb4-6BI}YLvy^j*^V##-|9uWgW51UI?M}+)d(t7&GO_-dXEQ#Efz**ad6An*0cZ1q z6B}=ptaBS}O?sF6&(nrQ`b2?wrp)jRW3vLyS9nfW=gpFi>Kjuhd8KUK1=f|`bOdad zpy=&ckf9pV>+IXc-vydQ@@2Qa^D?8tMeNwR1z|x$kh5$dUfZ@2FSpgQ_5z@hM^QgY`N1`Nmv) z(q4#9J4*?)mAJdR24C&0#xHi);Tr<;*M!*b_7Pb3Hsag;&G?4v-x8X?-Aj1g(}FK| z6M(zx2*%a;aC;?H&GlNJJ|r}YWc?%RQv&qo-2`O9vHj2*;)PY-17bwiObfiwMvK2BbHa<8$u! 
zX?qhs=%~k!x7FYs*2CL7>+sg@2K;Dm6W-gm79Z?gk6Rlm(AQ9aYqfcV=1e5U$Lf%l zxznd2r>GPK6%8mVZA3*$6Kac_P*Y5JCL~u>)}pGShN>no*ASR%^dk)ob@ga$Xhdsk zD;k@c@Y?Cqc>lu>5gQYuPYp6t)6jE{(CYY!&p)_s;sLj*FEM_I92@dBDk5L669M`f zZ@#W^DeEHpn>0xBp$~-tN7_U50~Hvrt-McgtmGzDK0_^E)pPyM{oOsDWn!$8IB0=6 zuxtVnesPJiv+3C0DwW0^6Bk4hhXzbJC)+5mKZ+EXSx53EI)P?ieTuGL#TDjr(SpSa zce7_NK>OA%Y~H*b-QD*iKfjXOLy?%6h26U!&@VyGnX?SBvEvXLIvl|=W;a5{y^ld$ z{5Vx~bUb2WMq~2SnMhonf)xp?5j}h)h71kTSy{5P3y_hPhw@V1bTw|`wjwNAk%~jd zp297*5u>plds`{C*MH#L(gL3TA4)b7n%j>Om=B|3>jTtr)OMag z`MUeDY)uYkFItS~v6GSCx*J_5deOmjn;v))51%-JfBTnz!15K#FlN#eJov;ZoMdc$ zgwT8Z+*LeDsC};I8eXJMNl){s=aFAkhu|=IM_>r%&0mD)o_`*fcx^7c!B{U({|MSQ z&-LQgdD<-j`UdSr>`>yr?X(a(5DvM<^sf_|ZzwbyaQ3!4D*BZjx}jg2)OhOvv&sw2 zVmB0-K=ugx5~hj@0(JueMdL3~UZ2rX|Pv!K_KkNYXYuW1!f8uPf%& zlLvp<^nR$!ibd3vu5 zCHWV*X?o^qmWxAiXCFlPtqW8caB=0pz@X6Vzs+|Z9^)DS_Yn}&eFTgJmc4*%4AX3L z(;5t#MS;+4;CEm!6WRQD)AfgD_c(JmDMrS9XlR2;3{V`wR=fcfxwt%r7ulB8_cpob zyhgUZ1k4OHi(FWBk8yj}d0xsdfcqrnFtS5Kq%rY3pEqq9pFO`Bhs$-p-znd*UfF9p zxO|Ne>(6Sio@T#e20f3F{2sW!xdV363UDqfuN_+r2F-E~`^3c$nz`Lv7k_A$G_Fby z80*VGGoAK=3(eBI9|+BI1LOzI3>bbdp-6tI!J%2oV*;bM4gw!xX~3p~t4(rW3(VTv z%yRUfKZjdadvN67eTaz)N6NgJc)4R8o?lmm=Qh^iXmb@3CXPk&+!<&{TZI=ooAJw+ zkKwx~4&YbE_u^L%Y{8c@1Y&!Y&L;86<|2I5md9lwKJP5U-5mtxomGU{8inU?_tq1R z8wktP{w932zY%x$5MHUfyUqCHFS_d$m_P0oKo(dQ89=tMZ1vHOI@KqH=1+O70cL+_ z)-(ysU+k*Y>FGY{uHwE`W~#a!^|;&Bs2^JSdYilu*nn@g5$ro_@TI)yM`-?fBg?UY z@V}u1Uu=}#<|2HtDIa$?v5frv<1JPAkl-x9{N}D|+}cxv>-%eQ{Xji#9%#Z__qXCL z-W2qRMfB6r+;C__!HXGlw@`q*@oK3nuf6(~t zFDa`xCaJ{VoI0`c#vS{=bPo9ZjFSJghdH1g&2S8QHUKe8e_*z z=5?Qdk#S>H@uS9L^q7fQuy84sEKNW}WVEKgIw=)}MdirJE=EmtD;gTwke*eB`Abu9 z{Hd4l{OgyHCeWN)fouYMVoEON@wdGXox(%UT*CcNpTo=*=?cx$<}SdVho45rV`tEI z{5(32zJTh@I}kl)9Fo&gbX@b1N1npwT}M#dx)-Hwhf&`CFv_+bLHSmKbH}5o>^Omj z&d0DMIS=!fC1CjYX;@XV5jzOYU8nj`xBn45dg>(p`~UOb3G(R}J#GSyK5-JS_1whq z7f<8R(=TG@Lyw_(%XXC2G-G9QI%39*M|jLgE$hQa9>FEX;PYqB++E*ZD%QI6Bu6&Gz%d2(SHceGQ3e)giInlLVL0jo3Pj1*qXHe z-Zoiajy9leG3|o(WBEXOHf{Ul17^`6wyGU|ww-chj~yt6L$eU73A_wd{Vf 
zJun~I<2-1(KU{w2_sg>b&4d0nP?ob)PCd)b$qxg6Q$D{m{3OsU*N{HzK=bu0BJp8? zW~qGcXb9iEh(>Z@fR0&|0v%H#CbR$PD~#0I=!)29hl_E41mpl7 zSSGm1PBX_}zS{dqnamT~2ZS$7`|G9Ae;CG|F-p5%9r%y1d8 zTXAv|piKag9u8TGJ;)7Q4{lQm&5{Q~m3%qyrDsoK{Fr#Go;4LOY;DD}t>t)beKj6i zTZ4pgqmVd#3J$ET$5U;!_{B@d@tade@E3&UulKJd3=cg@$~buaU0;mZEaOOzml&JZ20EORG^*yB4KYYf)9X4z*=1 zs26}PZ$MQ=Jz=>Hb>+3Hdal=3)uExL0S$Em&8=uAIG;ZA27dilzr}>{<8-J^VQvmC z@uo=pOzqO~4?jN=Ioo}{6ChtBBado5zvoHKK&`l@U}k<%_5Vo@d+*m zDVxX(%=$ll!&@gJx4xYJ>2K20tm9hgU;1Xg0|cg_7MVud05jKBroJpR-|81=zJax^ z>lH+0rj&-Jwb;_W9eelPkB-i6goZ|A@!~`st|8DYz0EQ-W66?4L=TVGUgpuGClH|H z6`@?aRdJe5!_ak=f zL@ZBO$vln4X6Ctl&v6tr??D-%xuoqOlx%(orCX)P`BB322{d*+fz*O}%$UCvG2>=n z`pP17J$4DZpXo>a?xQ&M(o4Yq{eLjOwFnOl$I8SMq+}Ih#=->5UAh{JmM3A#)LDoK zi9&Q_49io3=bwKG@BjD%eDL0fxZQVKztDQ&^f_E2IQMdS{oHxnICB=a&YmMQ^ZyHH z*_KKir+R2pYy(XEmzaH(ZM1;1bmo~jZrX**koUiOX)m&_F<9Dp%GdiTcA_>ec4*pS z3(fkZg8NEmhHam`9N~pKk&$~I`CowA0l~S^Sv*E$)M?tGjMB-$H)Snfeo)*_>$jc-Rx&%Ktm_IS7?)MvacK~ zOE+EL{yzP$L$d}yPhPEboUs#ZbqtaB0y`Jc3(X>H=)S@K0g`buwmxlLi|jf5%PIf6 zHzV4!B1>B?)>q(5fCHh~duQv)Hau>51-NYeJJS5v`X9WGy!myn-=BbHku*rQfpQut zuXMjb&Sxa&vXb+Bk7)4smMyb$4FaLrN#6_20?b!f_g7fndqJ}tC-q@^#0)gs`qpve zPO#8MnNl#8^U=4iI`iaurMAWV212t*!yId5$|63a<4NUO%e7ZW4+i`MB1Acq@ zDg5r)WBB#MdvwgOOz$Sp{Hc6WdTR;3ATWQ~RfR7Jymu*uYC`judkL+(Yw*j%>+uh# zcHy^=Y{e&gxQzh)**?NE_37S5d@3&wdcfKCI14EMth366=52O(Mjd`GLqq(anE);? 
z1a=EJJF{oVHiELuqH$MXxvL4^Y;VSQ+su%RUv@O&m+kcwZwxk9;oG)yd`DRRc5NxX zroLKVin|+2@Ohg6b3Q)al8+DC3-OcAGW?kNc!%}y=AIfta}93nt;MZ<4R~vRGv3>` z9zWZ?5pQm&!L_;)^i>q#d}9?hq$FZAL1xmZ(I_aXLTTN4l+~<9b@h6zYidVh#acpg zGwLduxU8q@2-5=36v4Tnx&igIjfCd4gyvSf{>Ewi{_p+{Q>RWw=&+%vEG@xh0)g01 zFJq?l!@+#a`5=6ns!_dv7v!$vcW7uQ%~xmH}4?n69Ja67IbZp&?(vo@|^DZy(rL4(8T-*eVARxz%h(`p0QNTGS zc9bqBOq_;k(`O?>KW;Kqr!ic%oX|}0E-kLZy0slBBpl~+ec_5UWER)r_{mpM#2dGi z>~aEgIWqDqF?-1xl(i6!pSXbir+TsLu~!f`VWxrbLP-Q;z@##ncQ_IX30_sW^`=zZf1Zpg5-H(tlBb7yet{2RD= z;SBdZ!+3a>*OoTrfU|a#(1!FyZnBQN(_-2LGhspx<8Kf5lYMO)D4ih^SAF2DHZJx~ zJ5d?iS3jUFc{|j69HB4fxptzyr|?Oeb{N7Qb5F^>_k?Db-9!U{S!AH{-jeixh}<;I zIgF3_(ixC%?>XGxZQZzEqbBosIZJ&}rc)gpntk=?k)%H;EK_n_OxxMV>v_$nN5I*n za}Q{i{|8D690<({846JT!PtdnuUz2rNx(~?;~u|@{6Iy9Eyy++5M?1*^#h>U2E&2* zWf}C?KM$H^EYdxJncH>vhU_Q%J9RAzJijXg%>vBchCmww0cAFVRx*=`KQ!B(MkCi* zjb!+VdmfLoTRG>qy4Us7{Q{2lH7wt!fJVwI`Q6Q%Bi4hSFYsJ?6KCpifCA4MAYa2W47f6?7SXl}-r+gk|D zErjM4LURj#wUuJof3>LrzuZ_$V6Mb>>x%L1x?oXl}xFF8in(`_|&E1Do*v{hj#ezU_E>Qw#2_ZN%-yI=oU{f$F6T z5l`@#I(9rt3BhHA=3)X%ZG9VdbRIwx;kl-iz+BNxSgu!K_KDzJU)6w`%6c?5Hlv}T z8Lzx{8vppu|BP8PXCNXZ7>%_x=#iJ@c#XwoeYWF~&kv}adZVdX< zvawlpf&c$N>5J43q!|DE-wVy|eoU8sDY0JGr7f`K9u$@-U*k<^mX|1fF9JHp$vLn0 z_2IEc9!1RXSPUH&f`YZSD?u#X_U`Oq`i za^ItvI%hHBM~}zU$+M77_!el+uj@wH`Ug?A;XzbvGBaqDZ$3(Zei)6NN6@f-2WHM$ ziiqK35I1EpHtl~B`=5LrkH2~eKl<_8gy#Q&Q;#1xnz?_VWV=W5I8BybDC@m&!C&(72jznmff^^^TS!*< zf+ox21Z8^xY-Pzs^EMb>*`8)O-dakS zG<4}nTh8s)ov8;eH2ccufo9+Ie>~PgvW)4mFCFNelFw01U5FGGwY;YBv9b-2KO~#! z(q!Fx&3$!7d-0{g`jJLjdz4uwTQ}Zv4krKoOuBAe08>G%zBuAF_vSTF3XghYH^_B! 
z9JVxQu1yIv#%kztU@fbTi!}K`I@C>oN!Sf*UT_?36N34Gd9M=APFJSlu0`tH3 z2FnD#LF&)Rt4}u1zH27e$h=5vdK%36txYO8b16M#KVoC^qdPY-XXZ34nKBMf5gcD` ztHH}HWq56K6RH=_)gIBO_wU4uJ2&CuC+@@Fy?6|F_iWH!=Fhsy@o{H~!t&j&8hk0* zK@oHXR0UY?>d=T91?Mkhj^>>;1mS9Yx~CqW5|}?DEGs1MX;xtVY`63{%TNxt$60!n zeZ9;MG^_Z(^fZ4$Al7ls-8@Emoe9fdbX#yHv^xMTU@iN7(JfyKY{6ICoAAvxGu_-T zL|Yp0&BkinU0Z^$Hx_bRF@DAJ{EB({1@kFA=AZAW!OwTq;3vFcc$;b6=xV^5oh^80 z+d90%I(lp0R@^$!i3>Y7;L(O6Y|l(Y#oVbVoG~65lSW~2bSOr#PGw%}rE?ddys`{;o3@Fl|kTY4f*dpLrIOCQeqsoib$xMvjcv^`K#4R5-#yW3YJ9DpZ!! zPI&#xX%B_@RRrp(2n~xyMC5RUhDRejA_kFBBeb_Uits#aSSW@J4JI&$AUGrxi>$hWdR+&O`RzVf!EK5OBK{MJ9zl^rS zucGtVt0-z{Lr7Q@(+)xG=*d{$br{``zJ~S#k74nuR74ThCr+Ay6|1t*-hB+Y)!R|F z?f`Wks@e{qe(NE$bUlm)0`uDK52AhBeMnBuL{LaHLWhq-ZtX@qbn10{_|30z_sh@l z|M~C#hF${hmeyvxd;1P<^bvxu5@!1QRsDUmIqG^pnS-Vc=6p6zKi(CUJ*ZOgs? zdw=%=uKI$LPou%}VcR3AJDn9r|L1Q3X?dk(cgm(>+vL~F&2-)0ZCh#Nh2~qVHv#6` zy#|b{n?&ou_x!LkwUY1qmvn9aJB1R zXg1)?O5pF_iuMDj011E&3e7q_5Yq~fY48m&_40pzXqLC82`n^T?Ufd&an9olj3ORq z*L~1zFOE}Ta77*Tp)jN#B;=+JifsNAm2*t~*O}JqA0Ek@q?q$ua)W?1OeG z3y<+avjLDl1@#lQlo%})v2 zpGpt)t~z|Oo3Kj>Aj|y9B8BEXgkR4v4GYbF!0hPf+htY^rlGyegkwpA>tF0_!WX-m zxg?D1lyq{ATHKZ4BFxuU%-7dl_4sZ(L3wMPe$eEr%`C@;3VgP%3||wLe{rxCUmskD zFAwo1;qWHBzpn{zc2(lWwkq^(tHsr>7WD4ki0ivM(6hA-FSOQSZ(b^j=S;`q_~95c zbO@qZeqq>a1 zT;7bDiY8Q*)Kd-Izlz%$P+2BZ($%24vL4md^?2c>SMhg${3DhvUqYCQM^`7o{dzxo z7&Cj$%WM+@(8gEH6A6*njo6j(yTK%WX~yVM(wKYXwP)K1X*|^9#EvEI1}r`6rV~Qq zG>`d#l1JbEK4;3|ZAXmn@f{k>0W$VfV z%mOX~0t(HtkCa_}n`tq9`Jl$>*H2@?f<*-7p_njXD&phD>1VaajGoAAIT;ft%tXTS zRFsu8QVqQBvaH7vg7hT9>CBn)FnRI}Oq@6!>)((t_ABQjk`>tiHXM`ZE=Ko}NARQ1zQynU`QLH(>(B82 z`fvY?JJ);A-M$I$+`369?!%3KLd-S7^K~w-vyQK`j<2%~;eP$vQ!BlU{H=$;OxWq= zbuizckf+pV*>;_QjAg!lk+95|uAPF6spd%n<18h!CHbJ4{&IyrbcOcT!+2{Y|C=X| zOq0;uclk8aK1=p7;OvgCH}+_6{=Dt6wMDN#DJ!U*2_U*Q15^ zWL>t|-vcGTnKsAD=F^mQLOzPn7SRT8I8d zJ`kF}PqME^%)5b2_xfm^ddlg_wIkys{&&Ovbs#`!&tvOopd{VzyQ>{=j5>!uH2VNl z%fNFpJtOm&+rWo^oq1))nqT9-e-1Qz1Axy)XkP=$Kp#}Q_ilH5(7dcDVtxtC`jHle 
zW_kNppxM}l5$l=9ynXW~ZeP2KnbW6W_V`ga(cXgBHrL|J+7k5dSdYV%g$m7k*Vf_s zQxD+Pjm7xI!yWiqdTV!7;j`@(I(x=l?OC?4EW;tpkcck{$nx7=nXQ7*tRETSy7V?1 z$Fwc#h5HLnWUQp1)Ak@!`XrxVgI(R|wr#cCE#Q zu156iS&J+C+Hh&_CcMD{)~t_gd(IirF7Zu)KW{_kAKOIU-&CGBH@J+yiF6KW+U*aKK2`Dfn}uc@^Idu3 zxQNFN$Rp(gX2xe@&2v@3Of@I0nBcUdyZ8+gOEejAbq z%~^#tNXstKIh+&I%d!1|=TXsq6e-o6s9M*B$1h`Avu=AC`+9f{dKf#e(B5P|eCg!rm#?|f55%T)u3yI<*rp(S>HlW2PPWf~wD-{OrWYcG3mP^+D0V z-+gU|`nzpM<$w9j95>MK-Q%p>Gz0C}<}c7UO+CA9xA9exH%*&=BWbT>Xrbi6p3{3i z)~7rXpyVY=U%SaN2sGdFL9=LZXcpPBC_C}`du8U6r!_UmcA{9T~=%6Z=1dE|v=Z%?LQ zV5M>T{ZHBbyc&>KM;4mpIE7}Xa`iZY7%>c~P%#p@i2Ti4KY`F}f~8R)G>b6|o*#i8 z3qS#IDh;1Hsocdn;LKxOM|5R?+3f+hd9&x#Uey32H|Mdi8gOpUcK61{RUkB5x##oB zIIPHpX0J`L4m{`c&xh-LUJZVp?~}Qne+rsi82$60S;`=ixpB?)@LUTIB;TYhc>k9h zRIv}Ww}3oLzHCo{ufCk~xcM~s7Rel<2AUZ_R0f*mQMr`!O*T5W`g<{V&P>c0GXhU{ ztW{_}zrGANcdo~Y)@qDpW8PYrj*l)pi%W#pZx3(8Hv*mVvY*UEK`9&yNIP<&xlVuo zA^?~_=KepYKH61>kGS8*yBeLL87?#{IPb3K?*pOv=N;7sn5j><5xfb%(v$p|NQY+- zl)D9#TkzSg7JSO($1*&Fzun!_iZA!B$K8D!@b&%e_~yaw`0UVjd~n}3yw%-?%Wd^I zRhftGtR$2#nu%4TqOl+<2unsrA!*WRWX+z0@?~>SvUCoz=gmasoLR`4Hy1?-E6`Yw zho;hE)D-2RBqtL|%a>r@^y!F?9**H*k%*0sL3CskLV|*^VEz)qP6H|{)>DM$N&)6J zR9ChTjA~I-RE6A}QlzF7ASo#qiL0_OXXa9j88ro&nFR_+0#^df&k>rx`sNpy!@NwK zFa{4FeGotY@TWL?`c+&$HxQal@0i$<^|^b?`loGZtOjd@eDcP*0NFTd{LS^fA1dvz zaQy9m6`H;A*~V@C7zE2?fmvafklatn%1jKM3M+N$xxBnG6!2V2R;KHajOxbCSd&?f^qdNv zvpFF#lmAy>U`u^wV`R>qv(406!so{3=^i#L{v;X z7A#3ZQhF(7EJ(qerD;gcF6B8&keZ&4)XaR8)i>bC<4>Tk|0cfu`d9d?zxg|S^ZVc9 z^S}K)zWRqh;&=b$zv0K9{|vwW{df5Km!IRI!}sIOo7ZvM1I>1N+q51oI zn+E{1(|#)x2p{koNyk$^+f4Oo=LVY4ERZF=lctB#H1N{6nsqNgo4Xk=ns`hsnM^x`-HR4ZJMF@~&SnuF)Z0FMi7;;=>*nhiM1 ziw6SE0?jlE#}VCY=z%m!^F+DlWLW|mAl8=!RRewg^&@rff~ zaZrDIGSWKmokt}3@|S@}SH>3qOlctTG~7}LET<`hlqnFJ4S2Y9<}bO4G2;K4FWL>u zGvInyXQv$u2F(i266`$nX7gpPi;=g!+vFB5Uys{Px{T_~7v_e08W5-*i{u zE5fhoT_zY4d@V2wJl`b*TWFRUH1vyn0?Xai1m+riNPWQVAM79mcN2?R#Z6Rc%vYnxen3gP=pT|rt9YaAsEhWvYZkh zhnm6?Y~QdE)rAE}SiBGu$BscnL@0(1HD4qXMTUlB!kCFzvS=BWEnkV4xKT*W$U}AY 
z1_IJr!q7U@R<{v|nvu9N6N?rm5P%n9#;k>yHG4Vc&0odi(y%%)3$eq;VfL)W1nmYC z=9i(kxI&@%g%@ANh}alRoiah8`NI!>g43_Pgp23T&?XEl`$_%T^|^b=`lmM*`WhhG zvUj^r8snsq^OyT%NBl+JQ=lXU{XqHrRQ_if?hVb%gUb1zJ~7xdm)`hn$NC%pJ#z2KAHaosQuyDts`kCe4Q*F$;)F`xx7gbJbd^tVn@i!e53S5zX}JE zE6Q596liW%+c1*<8&m?=MMago$s%YInB`crooNuL<#XDR5wWzvVVE<2DT+$#ke^qM zeBO-Z3N+`JB9F%>r{yCdDH}!QO-RY)jT)6rh+dPNgEgteD6Q*6cJ+1?H}661`klz& z@$zNBiIe7|rSlN#c0Y`D`ya;rk3Eg#j4X_uFa;?Y1*l)!iOpRPV&{FwvHSibIQ;PA zcCR-;;X;>3;gyEe~aJ$;UDlffAjbFD=vTium6H?|I@$Vr@#FxT>A0P@X&LQ z<42$V2!Hvve}TuJdK9;A^b?w|Gj3hSP1Oxu-tLo|e*!e)nT$Kt>!i#NYsY+t9 zwkHH%+I}yYN5#0<#~9hy+s9>}{%?n7*x?xln#m->Co0BT>F|@@5f^F?cKvYq92eb6rLx}+m*i`%}q zKJL$T_flXUtn6`?6@<~eom~c;z2^FXD&}KwXqHA#8XX2~9wS2l?6>h1j8vup&}-=AYcObb z8z4zj44dnk51TLlylMWNz+=Qab6Lvo(uirgJXVa|p2sJ-HgZjD9!xzLN#2Z2`Jd-r zGO*~?Kzp=l`b$7o*ClvJ1MaH_39OVXdAt`y^@S>RDDLZ#8P6@(fllo>-8HVe&@6TD zG{#oe*#jhjK;GWfH+VI$nR4iqeKucCy5{=0bzt+U(<5=aNXtrpwG*>TP?EgZ`WKtN zeZ3E>S1!lI*hoCOsTt?CHluG-E#B^0hc^%HLdN7#SUPS5{=-kM;ENXy;iEnE_-5#jL*6K)9!kFvZI#ZObG6-#rs`V`04gaF01h~ z{{M3V^2aiR24VTrJ#_@=I(#DIo@IXv&0A~n!B#>ymmhT!thYBa{k6Eerw!lSw-sL> z?8Fy`w&CXoI`HnE&A73BEiP=V!L!x5IGmS+w$)2ePB2c4i^PoJp%_D;mHCpxC;?&t zT!GzTK|>J|9EPxvD18iwmP;A{0RQw!L_t(NgbE6h7qfy9JS+$yLxV7s;2athirDBl z43CUOct{vRLkY{nhGGa6KY9!nE?SC}3CYMv%SCya^aNI6`O0L>n7bI+`4t3S0i(6l zI@DCIM@D)n=FVM#1q%`|Z@~&IS-u9#S7jq9r3lGu3XwsWnK)rK#*dq(P*hk@hSJh1 zyngx|_U}Kad73t5B91?NSfTms=~rkICQj(h9&Oyr2X5?GY|7Y}UHbcr+esozCl*Q^ z)CP{*xbDVCSNa)Tr=hk>`6On*?9&ZI+kJq{q3B^}`0UUza?;1I`wzeeHrTO%WSgp2tG7 z3(n%x%pcEji_m=Pbs#*!hGE}1mWdvb? 
zWtm+=pjvvN1)M7@niQIc(>~?J!dbMr653#XemU|8y@j;D+?*oh5}Y##&`T52keydW z*fr3cnO%zHlsv4VO_tVeMM?8+l(y_fL)#vdR<6dZy&U^Uw+rPx`{`&X$-QWHJzx(Gu;y?WJKj9Dm z{$KG||N39>`S1S`XMg@N4)MBedFBPQJ^2ipjvYeJC-2~||LJe>%(GA7`ZdNdncuqa z7T&yi8@I3AbO4(CqGO%=eSjq6T%8N!dTve63(X>vufb%_ zEAhZ9lOG*!qWdv@C#IO*WdqEHTf}drJSGpyRZiaBJUI2Dd80J%QU>~)Lh}V}D}8Mz z^KdU{zD3>a6_Jg!4V3ekbiKXH_FW`t$IW|M5@VQurrpP5`dM%C(y7FOA+k|E^Lbt0 zIA!MLE?K@wt`*LPoVq2>P05MPNlhhr<*is#Spr#(v1vxk|A8_FDV54bpI~eu*((do zR)Oad4VD)({?j%B1CO!BXk{MH%kid+_G0=;0;41EIjsgghO)nZpFvs;s3)zTf%5Jn z$Eg8ppqA@o#?+a+w}5=o;QY(&G(alT$QnbJv|T9mmwDODi1o`nsPXYw`?$kvlxF<3 zvrW&-px~1@gsD%r{1%$!+SA!ob~uOxtzNU_|GqkqvIr2+sYF^nPrd4OWx6I{$=E4z z3YL?3GoUQJ3HBz%^eiZd@|p>(%DD1tJfBw*H2oWFZliqHgwijiF%1jN@*+( zZBre7)X{G74Y!y^c?xc>wPde1A#E{f3aNpU|ehCKP`qz0QPOoh3sg z^DqlE6Ncr*!OwOOh`R{JJL+|Q<)3s^;=Qfqc(0=ZKOsE-lpy~8PU(5B!bd9Ul{UT5 zAMR}AvV~y10iW*Mf=}=3#3%bZ@!{T1ywkk}{q5`UMr$1&FE7B(^dvMaU5K0+Q?PVY z9HvEtV+3X1AI8jML}$Ku4JkK&`B{R9`!yzbbN>l5ymC;2~p-TI%E zG}5L~kyu8Fy@+2b{Q1k*Dd6m5rS{BptPDJeZ&+Vp`i}hlQ)KNwP|UB&{BPrxtos^1 zM|SoC-#(PYANR#})8GnN-Im6d;G}bydu07Zz5r&*DdnXP5}@UUKmld9Es%QR{*-y) zwvTBN?yg+pZ;UhC#|3{L%k;#rW%=gK8#w*St5~^w1%?rB*Cc15y1G^Ero7zPhum0I ziU`m(RqF`OGK8m20oh1;o|{$LGfm*Gu53m@ei>q8WHyc=m^FL8_DbjGmLZ2gEIrP7 zIVB3tSy_cxPH0X}DWt8@-cpN^m0PZ3npg4vx(&Ngv33`#*6l&v`dz4N-GIB1t*`tb07W$fA`P- ziZ@<-5jU=}-mXip@O9j|a);o22REL5aH2$Ll9IJkmx>Ol4hbJl3J1 zY%^$gJ@f_X>6h)-xa~Ht)ygCjP^A)B1u1U?cA5*s6*) zsBNi2rguM&0{qnr&0c8rwl(s<*;meE*}}XpBisG`oxVm13s@gCk`BLfw0#?QxR2(O zWn|tBr!i{_db>yvZZ^aL`T_g}_ zXU({B{=84}za7tP6?iVI!Sf^Q1M{b1rP~XmVnv$#cn!qX$v}C}_ow7NuYvO)E(4CH z?7sSN>crg1nf(mx`#XR&YM(sLW6SMt*se_alD1s8KsXZ#pfQMXDTb#rdpKZhWq>(g z8`HK<<2sMA53L_Pr-f#gn`yYvEH_sI%Vzw$sToS6WBDdU9p!PM%(f6q82Mg$GP{Y}Lkv+r&c9j|C8Qj`)TaYy6f;M;rF9mwfJC14L*>M zmF%d)Pq$U!o%V9PyR}mFo=hLNtC>LDhEEP`#)s4g_ix3I@7s)}hWkME` zR_7rryMlmRj5UPhWJ0rmv-CEnXO%Kfb1{Cxbd*)pD3n)J)}yEA297-R1j`nRG4Z4D 
z#3RS?(fjWiXr@hBA5(A^d#8+#3s_16=J<*u3(a2L3z~Hqu&>O(5M6{-i5Xaykb%`}ifC(v$l!0w6SJ^7vmEuC_M&RTE>sIJbJ}oa?1iZ7+KbxGeQ501kNWmKD6C(Pg9PSp{^oD-_y6s`;`jglpYex(|9{}O z|MGA6`S1P~y?4LFqt|a?^9!${`mv``_UK7eJn;f5PQHTDQ!k_B*;i2}(ERiX9KUf5 zfAfbw;>`0e;MVm%T)%w_w{O0QANBu;;Cvf5`fupW8P~6~&ii>S`?-{t0Tr6XMtOZ$ z-(rhuZ`x*D#uc$2+Q@ZoyK#YSB-<1DT(-1l@|5nHJe_0P*>_%SN$g7Y1J@QTY$b9;@pY&7OL7(#;p5YGhS6*A5@9I_V zM_`uT<{^4FrO@o@X?B8vg=Wc|5;}v2UKn3xyRrqfvX$-Cy|#-alLkZv0<(n^EAMgM za}J&-{|lj#3nBa13+6}+-N2VIdUZB2N(Yino!Iqzop0cFkvGqOT2e+w%!?T|5eRLv z&4n|ODW{azU*3F}AaCV5qA?=(oW8oywB@=P<1*=4=aQuil&1qpxp~|VlH)M0xJ~Nc zsXuehKLyPq`8^PtDOpllE+6_$CvE3DrutY3N2!)m}Z zh>ixcN!=zNm$3Co=_~LtXhkAF40YmzdNx+WVlrp-fIYPLf2q{-6|Jvub_@1KvOxgPLM7vZa|4ZGl8dsptF2A&zY2qtegs@W;oECTCBjlCank= zgq`u@XAqocp{lw;zm8T_-Gm!A-^D{moA9j!eUH5HMUSBkio89_{$T5F_xh)iiNiKO{^EAEaj|(< z=3Nv|n(_ole9}`!X}|-a`TKgBy&Xi}vdZiV29oK!To-A>F6E}oJ3y|at?6ZK>*YD+ zMKqBaW6SL<@0+)7;o8+cG}hIteJxn99M#n=va!bBt_ z=F#Q|&AG)`O#4r!EjG08)lXGq0oSsh3cB@?}&Jlxt4D zhVmz0MCFMWQ1ZmHC_DKgs-AxprH?;`$NT&6`Iq0KFe8ik&O~u}86G%v7}qXc!`uCD z5r(egmh>`T<8^bO`BwiGf^!dUDkuri@mjI2jcw3o2)o*IEN#demw3~34R7_`;{Mmk zLH)cg@?I6~MnAJnuwyKj7w`4^2y6##aNBkAiAu&V_nf0`oJD`{S%qi$%BvX$Wv?}3 zw@>_^_RuHo9qmZl0G>~VSK6<+$`eAJbxFm1(`L*NBeoMf-VA?YyCG$xjmoeCv!o>E zI}E2c4BRhJGQQtTUuxt9VQJTmUvWR~W7`?)pT@^!f06it@rQdub6~7-f!W*kYre!^ zwC!@zR`HzryNyjoc6df0G`o`jjW6F6Zq?BfZd!6)|Ms(&Ho+Sv$_(M*bMr54EoX?cULNjgS2SBs@?yD914^H(Cn+@0rfjLBoC&6 z$MW14?AHQ~Y#9fW0P{VeS?l7@g=Tw=jK6wiuA{l;GA>@GoO|TRVT=qOg8Gz2IK6v4 zE_77mM}*&>Y_Gvvo62x&T`8_N<>OQS|Epa!_=~P`{E8s_OM>+;xb5360yEb?=W(BQ z*5b46&G>kG3qIMg0iW#JgrD!*jQ8)`f}iZ)jJK)2?hSZtb0Z$DE5gpaRMajdbWR$L zMZ+U8IW!m}3AwhnSYTd5Q2=}-yKfQ)j9W z01gcmXeP7|kHd=P$pq>&tXi3hWP{W) zsNMKAW=1}Hv@y?XH=f>Bg-2;{opm=pP_$XwQzm}qigAQ~>Xr8XdHmB)K0okC8ZgJk z-T3F&JY~;i9>Yud+%ih6@%A=bXqLuRdW3BQYvuJ9Z@F!Gy>(;v)9v(8`wof7K=WlY zPq0V5Y-caBt-E>s1{>l&Y+S#AH+R99F>@X&D;rT>-hisAwWzP#gvR>KXlmG^b9mR+ zZd7P?VcE(+b9pI`Evq-sTvCNGW5z2q%gh{wh2;d~5`|{@s$h0ju>t3tLZqeTW8Q*Q 
zShgZlhXJMMm15z_G^7+%qjBp#RBqTs5t=u2D>N5XH6cvrgbqbgZaLcSKaSc152NhR zqi8ts5+451&v5+hpJ2zii`ekOOQ?GC1Zqw_kLqX7qvrW5sCw=Kq4^Ee5}2ExAv8bq zIP&*Ci0Y$Hpy|Z(Xgu*8Djz(8r&+f*Zr(;1eObyOPlcjmMqpj@M%?MUjW=a@Mjvl} z7{^o!&V*(H^DWx2JjE0H*I_$i6Vh|cw&Utq#?5nA@OIxF!f-#{cgy5G1do1=DzdHGjFWS5DhM6Umh#hEC8CXYOjJfaakIocQVL|(u)!h7)0)41|vJJ(<^Vq z`~OB`x@f-o=au{7o4@>zb!F4`1uzX}5&+#G=qJxb7l{E&W%@TB?u81JJ&(6s->1Ry zJg5R1G~3f`B#ninWg85Wt~rO~#Tvf90h;nkdCYay{PP-{vRb2+H0-fbHmMUH8S>3V=G>a@Wi!3yoeZ13asdG5zv}w9uedPIM zo2_HtIs7#kB+J*<0_7drA;;FCY#w48H(~h2<>u|fP5pQp)#I?Fy^p>UL-Ih{(*3p2wgyXNf zYVl1+J-+R1!gt;4@XhWGxJ!LTApZFNPQvk4ytTI-{oR{zdQ%fl)K%dAyezaOEJO8z z*~pqa5etV$V;n(vgagIFR1m>-7@>JsxB=myuuueth3Por(4a7c4WovJB7A5Vp_=P# zutSH0AcV^h{vR?-_6yfua<)k=WSS+8J-avJBq-ZiiCs|f^~+1_QZ)Zkxpn% zNz2kt7|W-OQ&MuYr?9ZF5e0?K$S-U_Zhj52a}6}-7jb(rRaDR4YWaUP^RgN%S7+*@ z@l*nGN(R9>qZrAA=2QZ7YHA+h;>YXE8dZcAnGK<`dM&Qq`Y8@Sd-HG6D>k;~g(tQr zTv}hVFzW?oZ6_T698^AEFtYEQ$omC04*JJJ=Ufgn>kYJ1Hrq(cM=WlN0-)Io9bRAY zNkN3$oQ@-r^hVnrX|q42@lv41K!k;6D;J!5a7FUWcDI*lY~QvGA-pjgH+~9%xgIq& zt!Qi{FcXv;2+lGcq4YG%vW77256l*tWm&H;iB?wBp|qqD9G&i<&qkeq{;>Jy7 z+QFE=G6$WYk(c=7Xjq}{RjdR zm*C*b7z=MRXQ5eV)L@1FDPWedKY`HfSMiSA-zC5WLh?Xp_5n4)#X?PB+FAxDh#ASS zlmA(0zPI#urXlC^p5HCgpC`XGJkYFD6G`xw@=86iUI&9_lZNEQYy8Fl-SSE~<+{4y z?3UZ+!5Xa`W9vtjj<`)92kCGgN!!RaF5Ys82HR#H!HSc}IqiAFQOqp!CyfQZ7UUPFJWBtMJeFUD*LX!2Z)TK)? 
zBO(G1G}Yn!zE1S++NEFu7^hvoWoQ>A?8KAu+T{E6Q*N!qqu(vfjErEhVlOh0(9t*Ag&8AM<^%< z2{<#&sNwO5A2pGU;S9`~y96uPIIc)Y!o2y5F=_G)g7X}V8aoLSCKGzIiU_YY$jmNB zYDOtiQj3w6R)WHUDkQB=Q*c-Kh#Nq}cJSEV36egaa~WFsj#U7;r~ejFjX zkkH(M;^G!I!Y#-vFwmTnN98vtG#8Z$OxN*uLUTbS7B6M~2;^A=_0+6lg=XntPDw97 z8rPF~z7esbv0wpjKB}8hDqjbzU5873@8ICklNe5*kBkV%GpC-$w_koqJC$JxPVW@$ zNnS`*XqDj$Vna@hu`+gQ?9m$&RkAKN%XN_#ngi|Dg=X5I2b%TeS@l0Fo=;*H?ev}v zwZtVSZS9q2I{NaJ`n{1i-s$iS*+>1`mdlm+p|ouRwPsc*0S*Jr`e@t6O6x0Dvduy> zeaTE~=0LM-X9I3!AD@e)e*7QL6P78R|Ci?zU84UUIBTT^4q$QTG4=Mi&tPceI_V021AC*&1o3IM~ug$1xwI<;1C{tC&Phi=A^IO*lO|;e9lz^4oFiIyz?^(j^`HQ;$iDSo+ zk+cRek_gm=5JNkars*BSv89+~sI%+<>PZeoO)T4$s$rj`kz=cjhdv5}suU zhXA=kx%v_R7q}8Q7JFkGtF|ZqYa3ve3YZQ@7@HJ36PuG4)Mbc-Y5Pp)mDr=OmXw9C zO#87?Xr}$$;_o*_+~z{FS5d7 zOY!E_fo6FE(J!;3FkWffD*N&nA2iF(z6qS=;xHqUDGOEdR=*FL<$~}+3K-uLm?;T} zz8M5WgH^sG)8GrVIB@7R;{NMKUTF4$vrWqeIB($lUx8+k3zz=V-}Ws;@HEw_PY+v9BhAH1j8t!r|W~MTrYF2^m;IT@gupGS1z2@$C~@Px^%4Y;_>6qkg^J0`Kj2OyBbFd(r`3C0}tk= z<3MgII?|S-IblAk7tcY_tSLyJJO-=A$70@yP>c&6s!%NBjm?k>u=*nK=ZREA*;9jgBa$&d}5 z>MaOExlcG7y^--_F>U5NELgk}tCBL2kyV0}w0taDvI3(HjZ@=;hKP+Ns`f^&LS1p&H(KwC|iNYN?i1twBO!8gdEHGDIUI zTc-aiQl(`UYR`5IZv+-CTA@A7CGr@)ZavOkc@ujMJ%-^UMk9jIeCp&=`0lGOX`eDZ z6YYyOs8g?reTdylqwnd771_qi#sZNyHrNJO8)5!04T(G@QF!)BZ}u1qoL**mvSa1K zvg{)^&ht6(#>A_^Vg;pX^B8Gw8~~Rx$P{7*EW9z#iodDvSxMO>wi}rn2>}o`tn?La z%bXZCnDi;E_>VNsT$)}i(@<+qmaV4%>G)9|+sE^ml@RosQhK>Q!{Uaupo}=FX?jp!4J#*z)u%*znkM zDA}?T^VVb`f;X2#qhk=ve1-A`IU{`Zd$*Y-rBtj2!OXoz%6_tEh5Q-N}EcoF=dnU!E3HA$c`4Yve6+lYJD zz1z&A??EcvBX5K60^z-7VZ}v`%IQ6iN!JOs&b5~S z;@W|^=29+!XS2M7+-&nX-)I79{IDUI96SWmLWW>wD3_r_F)qkJv7OgghExb3+nFi^ zlI5j5dEsuD%tS#L4kZA~3wJszMbvN%j~T%RNk0A|FX{>K1`%Qj%rdiuyu8QbLU~+R zFdI?s7a2*2j)*{LcsNFkibwRwSj3DPi4kMssW>*eA#AjVV0hd}OrAas^A|6{>ZCM6 zYk|%Vkdsl2CG%Ec%*e?sTM!%Ia7>y!6%!{<(ln*KadD#w3i&9JxsOX~kwtLM$f-nj zel_w6tF(98_B6|MbJ-dBNJ%D)r{!SD;uVM+Iaa^Ewkk0NqsL67CLxDVB0bF|1m@zB zwa&QaT7q-6g0ld09@mSDrKg$a%P1h=EJot$bnZuJmM^FgT+#^qX<5aD_+o|T*qG5+ 
zx^xvP3C)t%%KD8s-E$iqdmlnn>?lMP*a|-OgB%jQDADFf2lPa(ROr7KWkWAqWZMjcy2Utm#`ZVG$V58{WkWm$Uqh z%xf{iqX^dA9?TdLJS-St07OB%zPv#m!q~KR+g8?ZFRr{!faOikt*eCQKAE5QD%+-B z?Ff*zN@hRM8!leEXP-Wa^fkNz=8boVybr;6H8OT2I=61ekKg+Mzy0gK$6x;SKce^g zJJ`4XVfuJJCXStr=upO*h|!1*ABAb-XCZk-3R>&d;(`4S;N|CD#>F#?`2^@I7ep6v z>FgW0eD(}3pF5*3;~L*O$LoET*WcX8TBZ=zGIm0H)Tg(mr&(VHbUMAX4W}$Ld%@WY z%_<)>v&<6bJ#zh)ZLCLuzVp4%d@s>IOnQ>O=R69{;-@ASn08CrR5#|>cFc^cHtm+- z7MXLK*Hf>l3(ZEtAEy44OPo$TSNz{H3+u(yx7dO4e-o!99tl(Q5SHZzh|qjq#x&D6 z86%}l>}9OFLL0hrg>mvK*LyD*XqHL6U1&Cqst>BT^9}t1iwy2&W6qOU=(v}3A8OFb z^~e`QoeKR^@&dD!Y?tG0nl`vux$xk>UFEAxBZU{e<6${?!F>T!jodMGu4_P)yC_c? zY(BjI-%H;8Y`*A@63^QJMd0WrqsEwfu*D=m*t-rr1_t#J-CTw|Cf z4W8R^9CHU}WF6B_%&S|rfbtD08KPl;*~r?0EWO)l=yX1jo}0(|*DLdRFZ-2VVbMi` zgUWmgRn$jaBQ(pfqiYU08+bO?)1K2Sn;*%$To>QVQT*S2h(_L&mPT7_Oy6W?{#-z2 z-HX4e&zbS4A`2TnIoID?R-fHke|F>pXUFd4cnwjWu$7AZa zX_!5E7UoWyiAA&LV)48Mm^*V8rc9cM$cRvc5XQqIf-#hU9vvHjG2?jL#3`6DXC4+U zS%xJ`S0G_!B37K5>W^ZV*|{_dSu)veHp5wpG6=cH0Kb^7cE$VF)Xh@^YRsmh-0Hao?xAqSD`)4rKM|8Oqm%p zY6!^nC@F44NokY5{3ie-^W?5vnTnAk#$#nd8jr2ywsK@rX<20i<`QHPZsg@df#w7P zNL5WU3QB5F-Mk5}Ub=}*I}h>3VHDGd!HFjy$9q3|8?U|eg2oCN)}>P@(jI%*K=hoy z5t<0*~gF|qVqyJ=eg zqgWn0R7GT8xRPmJ-A^}P#q-ZTgITj=?&Bd?o{*?h2^Pzj{s_e~Jj3)dZ%}BKG0k-~ z0?+GFsb2^*W#A+NL+ncRP6a=H$ok{e!mDK}J} z;EbM#2>SleFv79K2;Kw;B+JvK@Q_F@!!RtEH=y!?Ai`_7+ziLWV)6V1$j!__8S}qk z{T37v{>P4;gs{kH3=0qAjdc{l>4RZG!3ZX7?>(>wZ{NO+{RyH+$ehgr%)57VW74>Z2;zA|SQpZ}oVX$h&!2o5Kl|~=_~|U_pv#$=M{33}9EQb%8X+zS>Eb3Kg=FKdB zmrgL5)4PWZ+(Tf#+)`2oZFm0^?lNP0xC67 zCc~~wrGCE*B#Hc?*@CE~>oq8mh33HRDyJfS70L^4493VBs0Knpg?$8}6)%B!ka7gd z=5H{`@7_9bC59^oBnBt&Y|mqi&OG{%Mn{(3Gy;v^l;6M1_WBN#l*@k`o!JM>rgvH1 zOg7W{7~?gD$#V-dGktLyGew%e)RC;znF1wcyjOjCq1n`_d+qIZIhOm;MirPHm<*IQ zyi8+&-Lg@)3WVnVi)Xoyfo3a#XOX>@{?IIWbn_&Ss4O(!U=X<};@6kap29l3rQv{=So+Eov)w@l=I>D+0YedPtb z{M<>r`0Od{=-RH3DPxP}gC5J5tis~OiCDTU83_q%boj!eMawZ~_5v(hnt+wd6S+?U zR#1t{S7UWTG7=M3vyn=~>Xk`^@?<0@t-*xx6BH6;ri-D&hG5FnDag$$M6P_bEWeC^ zDPPU2Bn;GI70)w&-cn?y<>}nW$q6ZJ1gBxdh%tzdABTktmlHDck)Dx 
zymDj^z!xn`FvCBX*ZH&NqaeEoS!p?JEQ>LJ?gC6AjLVdBixw}(2&Oe<>MRr#R1shr zbWC$`sSM9((DBQ8GOjtR5}6sLNLrnPSu+=58uPX~F%6^QCbDj_k&{he=I=5*MCN8r z&z8>`ml2lNAS!YMRxD3Kbxn&xb4|-;yl}1$>$dGj^vHPD$8a2b_z2#*eGBK$oTg7+ z);Vdf(4Kl|i~dYC04?J0J|J~upjaA0=|MVc zY|D|~qgZEu1DIm}1|%3$p zZ|+|*>+~6UEN;^Dwj1<2LcooSjtI>+E)#|X3MiSXv!7|o^7U6<=Jj2!{xF|6Vigs2 zC@ZT&C2!)Y3ChybT*{lRV%m^QDOXn`;Jg9Vm96|udZXpVziL8psX}vpem>IDQjwUr z5;JE`(+~5=EHI^|ym8B~LP4%bdYenB5~>)b^fyUEhHA(S;R^b3UO^d3n3t8wS(v>r z0TZUpLHzhhh#NBwaihoT%eq5Ch7y<~5E2rhU(pO28ipX9%8@t4VdjZZM38xbH*9DG zVx!|RRc@r`F41940_s(S{`#tB)Kxe01~d;N;wEUkiHMFy&~Sn?;aomP9~Bl(=pKg5 z+;lv0^dX#o@ikmJL%U)fzIF2!-nsJ@ZeP1jP%?913%pjAl_64&BfQHDB{9))Xs&M~ z*Sv+F-2ORk^}K_(`rgC0pZ^B``9J*+{MY~bf8mdR{CE8A@BfJ3|L%|Y_M6|)ckZBT z+X4D%CL+Q|VEFJ+2n%E0hJ_<|SOmgDV=!UtRHUS2pmptf+;`vr4nK4ld-v|f<}Dks zd+$!1JaqzBXft>2+{RmPzlpc*+`{b}eHw@5!#UD+%1xBCk#d9ED^F5rL$b}jbLM*@ z!le#hkYNeh2K#F;Xci5$QAsb*_uTdiEHjFTbe?9pVUZZb_@U#irJa_#zTkqh zK(v&hhu1`)`HJ*1_g^&7Y(F0<&}<*In+j&a3@+N!%%CZshBSeSClzqu(yj+ef8zL^ZslAVv@XAXsWBmBZh>shG+R8?>)@?*n z<7PB9Z$>kMi3MT%~S*%+!f1`bbMw>VJ%gS3c^5oN-kzhorm-_xhTpmLs3pSl2@c-=H&Sp zJ8CN8M@}THC+YBo$x~({VmM(rG8Vz%(S(IyHt^$7QeLG{8_DexCQimgDom%58%DsM zpks;~nl_`ZVGBW_4r#fSNXaQBG#3$?S14S|>>i8fEkYpyJ1?^UCHbY8Gix5EPMLv> zj9i7C5pkn2lVDv`Ttz@>(kbPNxh#>HG>U4Gl$3)hlNMn7m>C!oKLw$|(QIs|AR{#g zaj`PYgEs{^mB`8{LsnLWzLX{4oSjpKrArbK5*&pki&tr{cwtFBYFpd!%$gedzzkRf2j2VvlZKe zz?lFp%MFpKy0e+kT?(FhNT))#UGn&l~! 
z+-ycg#bDyt$(S>3E>_H2i45K>7UY&ApEsr@EL&B13**W<zo_Y$& zt5$2B%FqsZNq8=A?2kP7817tqA2)k`jBDrK#I-YbaN)%(IQQ}u{Op}i@n8S_|G@wF z@BcUTf8k&M>A&FJw?4wtPriaf2OmLVQW}EfWn_6DBbdM(63H@#GJZ*~y7S%z<3uF$ zJTiU+!Xvm{hJ&!KMvohd`HL6vro0H7J34Xf*h4sb`X%)BUBa7pZsLRYe~O#^*C6wD zzjf;l;r|9>Gvl<_2<=a7%C$?0;T|vykhd)tx!a5_TbVQieed4>AZ@MRRFAr^w6_M9 ztqe5FK4b#v#kFmv2bAqQYW9CGG#hTQ@a*`WjQdt#mZb~L${@Dv3e5h{Ebk!L;Vd%5 z*i4JZIN-G5=E*V3CsPqhUM0_z+;Cl?Pxml(ToLv8pqUp*Kle#+)-g5yP-Oy)xk&ng z2LrATG`wPPw$NdE$Q*bMkiGE!V06g=4F)*MKK{0k!Yza)P_}Cc})AqET9- z6NB{VrUhn4YQT=&c%3>>IxIq@_9Ers{#;s_|K&HwZlr8#Po{j{`t&xyR)OI7!{n(q z4FcZlB{DD}>)sa#z4GSE+Nnr}ZWuc@`>E|a=hZrQdz$9cgwxM}#yH1fMvpXzVx{8f+vi63qSV1jcty(F)%_~!M zh(QwJ*1kwLew@6F7ldJK)TF_gG-;|%mnCoimz5G62m;AT=~zYBmKTZy{pi;xUJ?QA9{DCh(V(nU9yqSK5jTs<3d*QjCa*M@;w##744F zC16aQFqMsdGRBRWgpBk8eK9aAvkcjU=5&HYdWL-NxQsUjs|gP=m^XJBDk@u%U(|r+ z4ZHB<3ztwz*bj?{Ls&=zy1RDa4jbKzGG>|fWybj%U8YTnu3TV4b>R%Io}j~H zKdF5gnJ?KH+cGvMa(z!^k2kXE@i*GN`iE~{&kzvWtPWM;J|guI$FD@*cQ&jB`tUu( zy!yp!E7{Jncw@Z+fR@e3n~sfoPIxXues-~@QCTMQJ6B@rv}wNg3Bn@? ztWgA4E`tew!J#2W^q=5RQ4oTHhv^HuI{ad2C?dj#W9*oTJm+Hi{2D@dKJv1PP>@;7 zo8B_y(vPJ_TjrA%=wCd46>os2V-#;Bq$HWN}Fl)w4#6-q0FT)VQJjtv(RpoWK(sK>BZ@!D3 zGdFPQ^=o+T*$a5=!IRj({XuMQ>B6?QJ$UlrXK?X!FD|}u1t*T3!r|_t=-#vosmU3< zS%}4u@DL0MW9$f$uZRvoOw4HB_|8D=@Ue&)J_cdDSsymcjG30-@1;;4KW20+W-pnG z?4n%UfAAn0>gsrtoQ@qkw&T*7H}LMw+XS~O=pwW>Ivvk^Y ze@R>IO5v1{>JA^eN}Dvj!?tZSLySy&E$bCo_N)5n>)m)$S@ncjKZktcF{nR!#nrIyvixRnc1u4MRxSGE`4+|3!8n+n#C7$BWbzTxi$=7G@6 z?F2EOQ95!Ch*(ODVKdg(lnT^jC~rLm1}Cs&f{ozGFyiEiXzjZd_8=! 
zu8q7j_#(N+vg9@k&AfJf0?Q6K_X#jl@+n{o&R$sXOVgCD zY(#DSCheJRXxfD4md(@_HqeboO3EZ8C+n1KGMr##!WtwxT1{wPy)u>1EYr1R>X3%z z%T^I)W3&;GF^&;zm^0F|QCHhUaF&?>s*s;gc-6tLPec?!RDO$!jKzo%qcE2cn^#aoxGX{Zs4*BtO_?|ub7s%R=+Wl$!qO8h9}6id zYe4I|Z94CJ0r6?#V!-B<&bzgzzrHfY}FS}3yK!%A-BXG~0F$d|)^W1q0 z6q@HPT7u$=8Wa&wN@RwE@@9fe12R(bFqL2*9WtEYJOc5<<8)SlDHEq)!MsKKlA+9( zo1b5$(43XYeB@LU$XWjE5*^DN9yXl$UxKoV7M+2krEM>sdZ`EbRqGKF${U8@NOW!Q zCKy;~Hc#fPdKgPZG96vtd4Xo?!dYHx7kYiZMSCzE60&p~5;OeC*~cdyE4ELY)kaX( zY4^6l^dDMg{Z(Qd@S~hSx~lewg>L7cJ7o*JLg^pRGoMM1bsdWefFsS1L16)JhtR$fMUE|afFIv+UU4P#MJDS^3^z$`On zIO5G=F8#|4(Wu12`AZ-l1qur#Y(_@uxZ@B4vW#0E88;F$X3fOh1@j4q(z_~Oyc8=5ai4l4A(gQH&qv zGx zFeb$!Enyi7Rwp87=?W~JJ_BQ-%}u~a#=7Z*?+Nk(wALZx3-cVwG=@fmBbfV5n>rN_ z-gf}kZ{5ZvwzC?W9e;D7nYKBItbO@Q+IW9pRyZXC-jrcOWDYwl)JRyu*swB)aErq) zro9t?G=8m;wp#sH+CBPF!23onI7@7hIALRsg=Xt};-i+;jD$%nF#AI@mu%w;U#t!+4qkgc)ya>q3AwZfJ6IelVhoVC>{3Rzc z@WzZd0nSgXFfXurdw0FE!N?l~MH+~u(j+Yb4FL`YeVvfpfo2(4FVK96zg^D<-br15Fp&;hR#wB$vFdqJ~0 zsu-xG>$%2*N^OO9Ch`EY+7#=gT%j%U($0aW&TIcr351LsQKH!z>YSz_{Y0uiz z%xmWbW-9?<(<5yq$C>oZxH;~}x{;YkZuIvdCo3DlghuJ1k|7!O4VzF;U~X(7H1q$p zYuWIu+ls=%N~~IyrqH}HAys>p1(Xdi8zm>E5t=i!r#Uktj}SPYK*~m$4Yf4BV@8ih zRb?G&svESoO~5&i4f^tBiI6#+b(%PtuXz4a#Kn%&@q}YWO+-vo97d0tfTc?k6b9<+ zTTolqgoTTiAvQLSP&$dAJsXP`EkZmSTW#Lx+={4w- zVMT@2Sh8>>M#qjtEa5&rIv(TV#_IHX)2GeAl*!XEX~Hz-yG&=^&=&zS%8+4unsZB8 zr>Q!oS@K(2)`UDl^LjS=CtvA7MsdB)b1ff%+19b0(0oh3I446iTxjP17tX1qce(HU zS%v04LbD9Xv5lCW-&kazS>m%lSo+zdmHH}u!UxUbAI7h=K~+EXLbKQy?M?hb5w^w^c#n$A>hJ1x`c+?1tiXV#^(`REDZEw}eHAr0}PieCBrc?YUCo3O$ zIfW?5D|jm5lq^K`gJQ*$#mZP~27qIC`RwD~!hFkap%Scux%dfIKS`d@ll z4naAe`76+;Cj!a2ImO7zF6FXR?Yye0S*L>xjUI#1ljdN=sHp^9LMNeAdXT4w2BBr; zG90Zc#j*TcJf5A2N7It=XvP{mnwx>cxmnnqk%IbpbFgG&JYs^F*RTkk&spEW2n%EV z#b8AA2;WPHGNaGDxl2%0T91vb?P#o7hgQb2jjf$%-_VUMR7cw`ba(DYTWbf_tj@;B zkrR0>Mj|pe4&g&bV(yfcywPaDfdh}>{JB0XTO?1Sh9GU;G(5X|2da{j5FI=W@gri< z-n<@t&%BDWM<2y=_wB=$k{o1DAA|KPW?_56B5Ys15>0DXBY$NA=1iK3;Sn(ij~Rhr zo!Mt7^BsyK4pj|7C*IgjNx!GX^k3l}uU}c;68S^3z_JBq 
zBU4r>v($sB3-vpRJJNPKd6sj#&}{N+X0^GbA0e_IC6af2H zVhu1(aAV`Fa|znUoWbQ!LNk}Lul9(t(1Eg*5*PqA5X(9x>Dia9{GeHYnbL91&NdyM z;R*7R2P5W#7s=$^g=m?rz=dXyL46N2Yww8z0UGE@VHqPW^tDI#8Q6fx^l!&3j1Fy{AJ5 z$UAJv%KqjBW>FwCi^QgF@0pz2n^&KIQBo#r%T{*ykv>iFgJu!aq)m8bpqcd@0Lr4P z46JsXvz7Wj1F-f^3qXr~IM+*hnr~h2M^!}y8}uMH!bNCZyBQ4)8??t+fVrh*i!NJR zx1zDJjeuLhMmZa+S7#|auU2q&pm|ja8=`civC+>;%T@49P0hrF36r$p8A@o7p&AS3 zFF|8NtAexiILjmFyxby8o-{*WzKM+)MUa@Q^A?8$M`F~d@e0+_Q`T7Disq*E+A~*D zT**d#D&h&KQzy@0gT4$aRxHN|HfZL2p;)?XHR|g&njsH`wa6=|CXiO@+^rRrHJCnq zy6!6wvtsEg?KKmK&CJLr;7`QD1mTbYEiO8I=YOea^5Qi5}4dLgDT z?=g`h6`DuIjK-uflMy^L1k^RU)HRjS6MB3{tF_Sj$^IbVk{Z(S96GtT;&=+jF(%5=^Zh+)> zyDw$!-Men>U*|&CxL{#@f4|^-H_A z+$7AHJQXviPQ$`^^RaB{GR&VpkN#wPf`c)A<`iz5LkOOY`3vUgcxRcGZUOgOKro&; zV>%+GXPGy#@gw7~VD3DW7MEa4`xbO|c4EtxE$Hg%!mgb=aggAA^ypDM^w2|i?6D{C z@WYSLkDB#kKa8tRkI}Bc1EndF8b8O8Ryyrp{c9;iINu)D-43VkClw z1Yrc@%v7GgGjSyzE6l|c*%^2;H5sQ;lJP8;Pp1-^x&Ba6A|6Uh!O^S?RIiqUg;v(>PYcYQL z!U^1Z;2;)_8LhqClgCcOsRtg#TQ8r+jgya~=b@cAR+@sRvRB|#ZaR(^7N9eO@o?r$ zESNAE^<`Cf>7|$O#EBDF!B{2p-LF}bg16s%%lIMvKCW&_TB!iEw{SieJr zuB9NR()fcyhzV}e+hxBaZlKeJ<^kX=L6egG4A2b@%>sGmbErnPg3N{Cx;IT0#!l;H zdrNwB$%1g3J_?s~oC*-R&`fE*49siZm}fgLr%w5$2IB-V0S)Fs^WwYkjy&m`!0(9V zvBoCr)+qdCVa!|4VnFVB{L*B-UuPa{9{uaj!m_;)V7VmF`|{v1Oz!`El9%zU!}y?C zgS`vQH|*3mypuEV%skLYEi_wjzHwDPej|+r_mg_1jSPIT$3K5YPWf2|9V6}-v@8Cv z^(Qh8X>8QowD{|P4w}9CQ_!qWKg1>jezW>MXg0^$@&@F^Q)aKN`N6XM=Go6Dz5f2` zN%s0SC17kG9WdU-@Ae)i_A>uB4!L3^6z?eMkhHtW~Q1dv4n%vKvVY(vZ1 zcEV>Bl2US!n3Snwn3H83vnrFooUJdu;}sJQXk6Fqar zTqGtYV?{!O_Cm=^fbv35Qc?!0tJmo;iu{5a6cBa`2+h?5_PV-8Oq(`Mp(89T0tqWt zD+uHfnp0DA71m|mB}VlUt5qA@bpvU~JvG6?bmk(&1}8gkUc;%g1a)G9*KXW_WsE^*&}URuGG=d|wyczsJH+hiY)Y~ya%MS;-l zcRp_!RHkftyf%#f@x#Uwd9mF;77qmG!88z>O+#*d)aR=_*X?W9=zmvr9J)MBef8y6 zaPq0Aaqz%FY~9k1hPry%Ujn91nToMv#u5TYAT(G$1~OD7HacwRP~M0QMOa8MZ}JG( z;W8(>3`62?!9x)l9fp_@Q5YT*f#Jgix+4%B9jW_8M1<-wl8(GMH#}-MM#qmPxKB`c zFDxj+w$AO?xpNn_8+-Te!@hm{@xTKQa(Muq9Xs?!Re(7yB@bB{GOVhEH;hGGH(w*n 
z%qvD_PBDrZA4-dAFn;1(jF~zg(eg>|kVqY$JdWpYUAPFR@(b}yN*bPBlZNM0(h1FJ zcseZ&kEN#JIG4xLQgJLb6_2H7VdL^um=GL>*kM6B-*b3qm=5;}V*RXKnW%Y}GK}I) zZ4B!zJUmk8ULVJs)hgcL9y$CNjz9b~+SYH!qWP;3JA52Q#!etKkK+yP9Hb-_VQu3M zJbd^?oIKTo_Kt^mle}D^c12tio~+41Ut1a8?r6a8-gp6P3HKv;6B`p0i=tIoxb@6S z`0%wCaQQ$d4(1S;OEa)9b2W12OvB{pV5F^FglA72$FIKq9RKk@{w@CLkAH*LUwHxH zAt5L(FUPGn-=hDDABfG0O^O`}EQ)An7M96wMwj@zY%}&{W$oE;kFiJBFT|JZ&;&9U zeO2Fe5vflzMyQ{=?B{m_P5<=wD{1d7H2Z_Imw)VWHXqjaygnzsWT9DaNXa91D!RdY znjNO`z_XNlAT&$NGcQ9cI5S@^H21I`m{*A@(l)YAd2xgumQIN6K&ab?d=b=Z^FxF@aB&tczIRtXCu1hvkt5 zNP@lAtN-~217)wIx7j+4H7+qWe`xL}B=5LadvMG5XoUakO9&Tz(5!VU(0mVY zcJpY3Jbz)z@3al%0oTOpW5*yY!kh)+nHNDGbDG(h9%K9`x z-eu6=c)V=adX%RI($jnaZ{5CuT|0NMp$SvyT(@4Jc^#UX+O)UX_Bgk;wxey+4s6-F z8&$RI2+es&OwK`aaxT_zxh5$aDaqM78%B0UfsUh;`C;?r^S%V<*|X;<$O<$IJj+<; zoSb}AlvksqxSS1NsSeYSDek6Eor47nmJ(9OC}58rGXa$qwP(4S7oUCwh5GI*Y-=n(|eLR40^Dl``mM2kvfj@)Y0);1Bs*J8$u z8441S5mEYhJd67aG_N82%gh1^3CS#97A8)bf%y3GEN?ani))adUyUO9Om==17B5IZ zG=VuXJPJ`^!}SHgprN4%2@cnMO`9@H$1DppXJQhnErcLtgTn5|?V6Zid~+0o$8?EWst1um`b z1Ue?MV*3mWU{P@JES)w~&;H)|F~kBp^V z@rE!YNMTrBNE2|58Xie|k3x8)Oa~W)Fa>4#I%kLukCHD33N*{CGC{+*jo>Wfm1U0e zxDoO6!7}XV>c+NhUFh!Kg*|)rVeg*(*u8rnc6RS2H18xduO&3+AtS9&CE%PRv%h2( z)0T^nNng**EoyrPVO_Cz8uT=5zTlI780f(FOvD4qoZT#PuZxi zZNRKqb1*V49wSGNz=#neS!VeFQ=|?b*|2UaCQq6};2uf9o{4c|rW2qOQCqeid%BL| zxo6Jf(1TB572{cS)F}NROVWg~c)FnsZ?)Is$LosmqfKS_^-GW7%AtLjJvh_B2jxY{E9ea^upum=VqT4;g~ms(ifJe-{7rS9kGmfAu-uyYMU?>)wE4 zJKJ&o)fe$MfA@E|`}H@tboPw$kvvJ(p{3H^(yq)Cb7}vi9i%^6XjYJ>?b5E*zQmp! 
zdp6u-WPCvULwt;|Ouz2wVcejf%KKve&`cjEFS-m!AM^e0__WWjByBG=i@fcv*9RqD z*kjGQ^)#nbfjDJa42ZLt$ilc#MxmmyDgJ$E0JVxSz#uGDJ$B--M zB^Wv|qm8;$-gyEiB}*qD8fa$4b3eDjJYvvsftm6Y+7?{otrR~=p!58%`!hcR&h{89 zFGPu?68wgdsF)wkm#o|T`$Mx|Pc!8`zk4pXf<-m}OXW)-^fv-lveUo>ZuiPGBJQ~~ z|4x9{yF|^83AEn*%x5uKUI9vJc(^n@Is?x!7$}SMhJ(w&pjqTMrkTcPX5A2Re{n!{ zM3Fe9i1kXD$Cr*1OWmri*ap*;=FL|}tYhZi)Mr40Dw2kb&IQ&(rHLNBrH|vWj)l z$9j<$|6~la9ctqOvuUt>4YxezlYN|^BWcL>xpT804;(sx*q9jo*vR?~TNIi_jg6+q zxv6OrS_sV>Hti&|?L|RxJys-UA(;(l!fFC@Y9S%BfDLM)4pqp>EJ9v(F`>CsB|XXV zqEI{`SO7IBNWggr#*d$fl9Do%mde}yG8U3BnUSZnam*)d%3QJX`Rs9HC!nUP0h`us z#=6G!SWC6A0iQo}F~*FXi0M=2V(F5VJib^PPDxv5*BCJhg@qNUs#?o)H0lRON(jv| zN3ML8teMbkdzxd1kHG506v9BB4x?C|l!54&c!gc*O&>pg21bpVq&>qj14dC%Jt4Rj zx!L8IIenf!UJsG^YD1&gs7E3+SiW=?u8)kTO`WX`x4?K#j`TDOG@D*Jfo7S5TBZyO z35&+`*^7{wSAksCargeCc>I~yu{rv-L;oOM~?^4<=uu&RBFyVxlLu zx;XVEui^X7G~ zPRS@Q_06QOR8&;ar#lQJ@7<66`wyt@yYGJOVcxZKKelh%iH#e!Vr}bYh2->9`Z{HL znTyruyFPJLZuzfW=L#`3#qMgh1!16IVv3qG3eM2`K8}qddl(lgS z&D!fKv4YYYZl-0S*|yE{WZS>vl>gKBO}ixRnMv2Qr_!$aL-Szq#Lb7}=gv0s#NCut z{fl|_-H-H3;?+MK!Cmxu0zqQ5eJU|6OE2%~M8RL?8J0pxK)r zbD@nS4>G>mk>2DCJcqsTJ_BI6-M8PFFV?lMSCWQd4b2**+{xPBN>@Ba0aYYU=}H?3 zeW8HHdk<*#!8`Nka~fVB>yU1PD(l=X<(9l^nasN8*QqNPm_=?m+Nn=z8>M`*DC!RzORW}V|RFyL5d z=AF9l+_?Xz>tlMyc8(l zEMuJK@SHOBmVO>s-t^BY(qS4ho5pB1lrm-A_;Hib+_+YuxuvcZ8=KqE%;nTcGch`D z0_M(Mg4L_jQOtBkj2y)VT3)yc!i0&_%ts_DTYW<`%H?AbpGx>I(vMIS71SXwrwVDjVVFE&mO^qU z8`1DkE(yyrtVHIsmHBOF&swO!EHewpOc`c~MltKEK!<+Fi&e{(ufi}MH*xA*WaN~i zsG=2n4?TwCPrr<%$^Spz{yMylBun>&O|oRMWM*btwq?m6$zo<^mMqDV%yzlBTufcu z1!e}Bp{iVVag~d!yQjNndKm9e4@1wKIWu$Sz3*C)v3Kq*xw`wF@A>|CpUj;*Au@8U z6~Bm#j7Za#W*On5wY3e`u3gp9hHRf^J0_}j&p3L7uzXmdS#+MzeBS5n2<`G1ZA}LI zmm!#(e*e}U)n;vsr93g8_9^jIECs<^W1;clMn5C}$;5 zR8(Anipn}V z<-^1E>o({xkTUv>>>smyH5JFoV=60=30Su#mC!5|S{ zEY!Znqh!2vR(vI$6UGD~dDaXR&6|UU*d^GKu?`RBr{jt2Gy-!1o?gEW&m<(^N$Qbx zRN@9ak(!R(Ir&&RakBO=j$q|5YF??j1EQ2*wJW>->dlXH2? 
zEMA8(!2EAwgRxza8=7r7EAfQa^Z1{`R71!=#>iOcR(GBWQSQ^_Ps(<(lzWwXw(-F5 zlY7WJtu*MHHWTjznk6>ab4ZK5wltqQOu#zI8<7Bcls9~oH-1zCCxaSqN*<16%tdXI z=Yf=eEAmBVhBt6pnSI-2aPAA>z6}8k`q?q`;Bw-gMa7ku}_32F-F! zol%H+GZ>z^KhLMH`sn~7GY&QLR*aF?zC@{mQ6^wpaeq%bjr5fe4OO~lT-Jg^@|I#c zq=ID%B`$d%+Fy$KsWawjTBOjE^iXv4a~|;?Wt*f^@`2CW=7l=Erp-SnNFcC|;>x=r zAUqTxTZw%do8`4-f4?-yy2p-XU;dW;JzYU$KRJfaM3&l`?A*Us z)_$D-H7L@y?*PsIEzF(*T!M}R%~HUNPl)ZwtX0oG^EeBhahN%CHtHK%6`EzpD`{=k zE*kY6Xlm_6Tjw@(_3lPT&n~PZASa|1qoleANm;dsODsTAS}9Ueijb8quv~@$?b}#} z;(~G%N#EuCBFvdHTVE$@7m8uS^%ZIyVOsh&=jF;rL#Ft;{I|jpSVBQ=YLQ4vKD0(GRaOk%A~J;b%RzQ7cGv#@|ak}#H`j2TIB;=nF(+a;dyjW2m+{a6Q}4* zm!nxJ3kZ-dKr)l2K(4$EGL=thtreddHcY144?$Rne4ID~b7n8*d1Li0&%y=EAv1MK zB{4wHVa6*q$C*Caft0iskENm+1E!1_iOktkuv13NNKeI6iR_z$kVp2jnR;^g09}!E}FX^(8k)DaB#%63t6ri5Q zifkCdndUWd8&J(yuzf3mnU&GzhK-CZvoIkn5=+NNV^#P_?8#V-8{1oPv%4DawpZZ& zwsL&XQHfifWw^DmRJVV6-wyo7;eDt~O+Zk{NJNZ}Kt%8ujHmAxrKaJ+xik3pzx+e| z?Z?;g&VyTVvb6xWwwB@3tws2JQ$F77EymS*+VIbR_!a*9zyAsUga5+c|NPHzCgDAbFKam{{wwjv z+_Q34Xx0Gcw~sq$8F-haoKvoCu4yyaNPdefFw4w_ zlKBFKLqRj8zfES_>zZI~9-^-#;Cdn1y>5TsNxux5O@MU&Z-Sx;j$*_{2AoYgMDFVk z1go&uAtU)!w zw7jg&fOA!gLUUbHH#)j^P&?7H`CdX@8!F6qnX1V90EUQlO6?Kal{OOn}WqZjMYg5{ux_5Ug37fcQ0uI!ed9xl56< zfpDIXi==gVNKGuls^uFnWx{Ms7&{r`$4%5cls?!qXUrv75J)1%YZs67w0zULoXq36 z%*c?B0dt+<`w}osUo($HG+|^zdJ$?GdvW(e-@`*sy@KW3X8dEwF+3}Xdv6n!LSRXqJ> z3C4s(u%a2HGvJL37^9WX(W65UJSJSnFBTsad!8_UI_-Np{b82+gG_WJ-vUT$^ul>7 z3DQe2d-ej_^DIoBIt^2%PDMmyG$O*IFqX>+)XWJJ5FH+d=%64>VH{W+9*+LFI6Rt> ziM{C=*q*Q+&8t=-XWl$42n|Cd?^S?1Z1fmRVH(8m7tULbY*s?ouSr3CY$8^yFfHie z5z#sdiA;fCT3m_L4Vl_~YsQp$SR0p&ZJY1LBM&`=l%y<79ybkhr!sy_9EbR*U_4Qj zhWB=~<7Q_$-fJtt`yJ)D*;dc-E@-x`+;Tmw)XF)zG|(*9k+R#y50T@4(%9$N_nqWnIeTpv zG~0VqNLI=HGOgNx$$h=xLuYUZYnfLH(K{KC`T-W0`B2!RHZE*1O zlYDm^njotEg>~Kcd3_f&+k5ce?hngWZkxAxVO!EPP0sPMuKTFCpL2|DZMNsq-`v;O zCzm$wouJ_j6ta&1i$F8;O?*p^k;4Bwpjlt-o~2L8u!8dOk$eQo?b03R;<;1W$8GsC z7M#JsC@Lx=C^zU#WEJK0glIx%Z5!&FdeGLj9j#qEu(AIhv~}-B?D~AfCRd=mbq_kX 
zK8o_Xexznp5`1e=m|u;e{0a{=O956^QjYxmTo$S`^!4(nQ2|=;4<8VnbrOwjK=|7Qg~;HrksyO}C7S26QxSM7cuq6#YVedCY2L@|dywt*v9jNAh06 zke-pJGpp6t%dmM}s3$nfjEBV~^(ZNA&}oM!PBPFuapGhoCI~cVY5(eF%VHoirpYKD z(h6+gd?k_+valvLl?8An64vHo$-;QdnYk28`JYU?9TT$>D_6x5h7+)0{!)eQl;kW= zOLJCMF~Ky)I zkdR2UwRPgHx8BAvo!rFnGuoQ{ux~(Oy93Q43(aP982%@%EEbqqHW+9&HmlIgb82f3 z{Zk4xjjOzdX=xT{rVTTeULY`9XcoIPu}gs23(fi#Q_e|Qe^Gm9TBX8h{lp{dgIro) z8d|cCJ6;=TmVEF%r|fH>*-rQ~Bs7a0KemwUFE=#1Wm}kS8RK(q8?*Gf65DKib<*yn zQ(vD8XgF!sCI6TElBM;1@%5W;-@wKW1I;oGb$vrK%B$*;S6GJ9idu!{y1Hf*(boi) zWqNt(-@KBNk7Q%x*6P&v0m0!IPB@-Ln_aLZ2Ga?;0f92vNjPTAoX_KA%;ywVX46=? z%hEB`#Wy7;$P9+^mP9_<6<}T+YruJJdiKx z%Gh|sEMjABJHaKtEtjvkGP(K0*V zbc~CdjKvF9AU%nd*LeEk>LeYKa^*^%m-jq}=L#p}hlfP*zeH4))l&^V4e2Mov3^0GDVd3Xik{^%?r1 zr-4wRnM*Ajq^vM9cE)w_4{Jl>XBrQ1j@&c^SETfWsp3m@`6JwZm4>XItD=5o7%DsvYSV>uE%0m+iOnIp5 zJdc;3+|cZRvoszY;eJOL3&_G|9tmMZ8wUu@XATpl1eWC$9Ctb^AAyLR43o^YP5OGE zF8w4ALT-Z6o8e8`#Cj=TmfXe%=XupHHy}$16rNxq;yU*=tq?pvN#8&+udmcEH(}t& zyEG5aU$({6rcImmB{(^m;1vTOx0#?X*W~p@0xn`Wk{8SqmDI}As7yW?NicIujKs)1 zEPHQ5$^D@DTk}dDvdH9{0Z3k#{|~UxIV-hIN)5~h{CweArRlK$OWvCNHIjUn=V7lQ zOOafQ=T^xzOR}-C|_B^DM;PZA+WTw1PS7`i#8yX?;>(LunB4&l8g` z5>&MR7j^0EDKqV_tjjj(2j&82`Ep;r*1s@dMoqbN?mR)-d<1&^>SY#;sgRkg($g{s z&GjfNlYYN-D6ecnRc!|vTl>-6ww2Jl8||C!M$eW9QB=`^H5-bMkXDX_loDiSS0N|2 z8pVb6C@SExpo(x?qto}67IM9`0;^Z8L?8=5nH_7&#PDJAC4T_I!lDq*!Zt5Y zCIQLSev2cxeb)5(EbzL~*3gMs7TRr%?L23NcFmYLeumCcC?7xt>qHOcm9mW8oReLI zs>)_G6Zq>JyR^l;qN-W@IG2<(pt!gi6A0Y`Ad@CfMMBaBBqgOFH@5&wmd0pnbu0^c z>CY+snHMi!fmH;66|2@^b^Hb$9&X00#aOT)2CJ9`8Sx`#`3l4mP!b6MD_6v01>r*` zdXW~nWdwyevln3;;a-N(8y7VdD`VE{uyvBhU9603+5I@Ww(LdE)<>}Y?x(Tmp=U8; z{&GU|XdO;#Q+F?}T)v7Egl1ca*@8-5yPATMF-KwfsKWAjEjSr#6qaS__^9{?r4tX) z4sENCO8s97YNsWN|BG+R8xEOWR_sP>)Y`Dvyon*+*rA|J-?Ff5<@PhT?74oKWM7Rh z5`P@&a9J7y9KY50L*Hh4ZROJKKLX00h9O%vNc!X+#rI_CNXio_P%oV!P&lp0@-el{ zWM)1Lq>u4@ZlAT~peG(Uvav(&!Bf6CV*2gl;y7RZT#S^T6`z)$z2!c&uBqjyAm-BUo9L*5;6?iI}q}1~cX? 
z!uW|ZFn?YQvNKCjRo;R+nHjWNhMsKG59O+=WQu*$ZEHhPCYCRc$5Pt#vZbpLvuq7* zeLa`!S&1c>$E@SE*HZ~vL0q+h>oIG%T+0|hh^9ZRWo27UL@>>@(-in)sj?2*WrJ{GSxmgAkiHr#Bgzz6N+_^_t} zH#ZjH-Oduc(^`agT8eP1gCN~dg7-V?aj3ZxISZy^T2v_dH}~N8e)}iD|M^$^;8&dh_vNS3chDDZs7nT)e+28=q_~!JGXh__Hf-;BS8NAr3vh8-Mg0 zAL9>x_BEgJLD~?@FWRm)YD(EC{z;!!fYx>6a|*k(O|e}q4|QMKDs}!i*Tv7Zq0~su zrxj{l^8C)Z#O8g=cYpC1@h5-Dc3#&jlP>eWv2Vj7w0ABgzIfscWzVgm-&kMN)^)86 zSf4T6r0nGQj;|yg=Gq?7Pb3Zq2b|X>C9TXyC~axx|02l;O{Y0du19F5W0I^j;}n)9 zP&&@4L4^lM!i4At70QnIpZ5m1KND&u(O|8JH)$lld0|(uPzfMe5gv`WE(w$j)F8@h zs0c&y;CU`HYZ>?9QgkP1cBf0^sd>pU{`bLc))4f%jxlPhhql7_%0Hm_7>rA56#DS& z{oPpxn#I860~V{9IfisYUwG)o$^h1sS{?Mv?IxI(i9XE!u^zt&};FZ+?UrhHYE1M^Le0|_odHGY+as{r0-7h=H0I%2>|xK+%h&G&h2ZH zHt&BG+57aDj~zR@NPoS|XL*@6Z(-Spfrq*%Ez1rtOAGY*vjaFsS!lN4EXy~qT*ckH z@6z#L7cN*psH{X;1!0%qTt;ZFtZ7F>OFx=gx1yzUC$)=!cR!lAo+G13)ch(|8^Cq3;tBmJrZfHYkafQy7&PTed7# zKLivwkOF!6%GEk6;DRN~FmdWkghq_Td_u?CwW)-YwffOjEF~Xc$?NcWESv?Bqar8j zG~lB~1?%|JGI5K%&Mqr%KwJA3v~};o#?AMmgTUOq;|Xk{w%_v%x(U+J6J}!A@BoAa zhoP&po0Xq)km<5bs}Ft9f|k?bYM@zwS(da}EhwbmWW4c#X4)tBmssc*L>|5bBzUi+!`IQ!SviuI2)@ZqJSvU9XI~=LRB_sauozT3DP@9=q#L8_0s>=!E6|JZ!r({{ttltAvR5U0& z%aDvQ+&_kZyn;X+Lr7l53ZQ6ZOakpciIR#VE0-&i9H3r2Eqg^xNsMWHJ z+kn{Bynd|sU_92dQoC|V9ASGFeQrD^Pnn5jikD;UcN1PLD?sLq@mMq|0#7}1FaG79eh&QN zZ}7VpkKp=_X51hkf3dv?pLS*7lg?a%cOh?b~N{N74{*+8?Gi|!7CHKPp_3ajY)CZcqEt1YnNq|r%H*yCA zcW`l2zXbzHloyy~-N+p*+`xQZq0u?kIesuS8@X>>_mhCdbNdE$BR8D*$Q|gcm{$9M znQ=Z#_X7{sm%RaaNFY@Fs(?3XR15!Bp+M@WF+eb{k_-?bZ&&^*x zc1($n3K;vsvXR+GeAwm3-2YC{EJF@ny?9CCV(DU;@_Hna5>xdxZ&7h2O3Ld|Q`gBt zxEu8?y=ZRRhL(;UXzkjC_MY7+ENw()b_EJ$I_ZKMqY~C7psJ!8mAps#UA?!=(9h7ZaL$(AM0o zAL(jA3#!PoRD!`0bF>l@y6c<&ay|qurn69ns(3a*Zg=QI2 zuLZ>=)jENR2bzSu}#2HNY3M^i<9E%n%!?bC$bl-`SrywjM8X+MO zm^yWqwjj@#K2N)pOqw*^^Tqzyano2}PeNGuL`<8$08?iyz~t$35gV6;hWZ|C+_(!J z-Mi7*e=mA=K0;uAjLRnp&5vRG-A|!);|_#J%LkDo3EjbHYiYv(pV=AOk-pZK7~#l5 z^MJIlC^XX-q=n@uw@E=E1*p@~Vtkd`xc^Yl?3LX{AEPa5%Z{IYdhFB{8&odeH2!H~ z?ohsE`|#dL_8vX=Dt^SJKQODDYiL0(kZ;px{eik8G*jkSuRpqv^BgZeNT1Q6=O|O5 
z7#tf+9B@IioY%RACtf4|0DNob9>f_nTKP*q3;wG6l2}mH5fg53`Vj7Cqo%dpS4h1n%A;& zC{SEhT1#LSKqiE~MFN;T_mX=yHJOjx@%MWJ~GAvlJh?1tu5E2L$a&@8YVLvSTT zui*bHmapOeYp{s^JRxclCbKd*oxTz(eWwFusKzl4EDzUy)G{lmbODhM86%@6==TBR z$4*8>=r}B0v`YIWOAB)x;W#dK1DDB&l@{l?wOB)VUb`|8>toZfeoZDT4Fd1+m_KC- z76k{Qbk!MCqJiL2cec%N|m<<=5>)Rl&hdol>hnYh`Nhg;o+{EyJw zlY`H;7UR~gTDc`0bM?OW0@hrY?G7mWNVfR_v&bB87|hzg^#`|4h=y!wrrecT>o0CuzZ1F7_g(V(l97RN z(ctrcmu#%DQl^uJ2Ii-E8}7On?l1l(J~m8%S-NM)&_Xh3w!20k2@;}LC}v^nGrWfs zz6#P51BAH&Z-Ts;=1M4?+Y1XONcaQ`SKzbfkl^8lCXvrMDNQG@?HkmMyw~%Sli@bN z+2102UV9IoK;d4Ee`D6L7J`{wy9G`okN z(R1<{`9QO+NO_Kxg4O%%jhKGdJ^0A;jD7#N_igfa@UwEuKDS>*-gA0?D>N_-gXG+k zj~$BK>C?Qrv+RB%_x*|7g|(-^_Ll@4ePblR?CsAlV0`Y3fo6ec3(cbQvYq>1yL<&( zH*aO3Wk$4EykrFm3S|-y7MujkI)Z0iLk}9I#kqY8n%Xv_ZR2)S)^;KzyBwLh70A#j zs>=z_mB`7jKt3fO=w)RVAu%ySVNynym^^U`vNLm0R#J(Aykf$$d;nOC<;zz=X7d_8 zVgyEv9Dy0LW+5av9P{TbMoUvCWH!KMOxL1?OHfl&r=UG?;&dJBc>Y3}JVhock&l+5 z^{f7dx^}cSZ9)TqzlQ0orYdXNP+29TX|(7NU1o%iVT56U<_$YtgnuFQ1=3>ee!Z8c-u;3^Jj$)xdDilG1kqD&13CVNj#ULRm z54o&JlvFgMp}7}rUE5gM*oKat9q8`87hQe#YYTJl?nklduE){6>j`Yy^%OSmehS-m zKaSol_h8Z#>Dw(J5scR67MvS6OWzg0cKq0ht0HZwq-^EULi5=p^u;4w5}3`jvJyME zjXt5>^nC2r8@s)BY;4lsHcfmr#}9^PZ%dE602|U5xxdE1Us2{5##L%4Xcm1JG+QYo za6c)qC9MV!BsLj;ywh;)nqqeaH)%8uNX=p|c0eMNxDy$?##t;|_ z3oEd7%MR4lHRv?mGDW?7Gca-DOw66Tls=lSK2cg$ts`)hRW>-F+-%|ifm)!stgIIC z@f+x`JTD=61tD1^U5yMhC#dwdtgqy9CHGwxs~`0WEKAoaQSj(6j2=mNXXRMBX+_bW zWVp#+yCk{cTG-x{_C6YUpT`73Jecx8DW+@lWv2U%!J79`3`N9fkO+uMA&z=i!U) zY(jGeKI+ZFyPGoc&L%>0ZxKG|$-`%aX6fVn!JZ}@-_eKz5BA}w9}nQK|L`mP{%=Y5 z3@P(jR?c@r5j}L z2+hhhrrh(F^*MW9_xXOA+}#+B#CM!)I(}*8Zs>E%KdsVj&H9!47Pna*l5$y=(ikvo zKw(+B0Z8DG;AWRjAP_j#S>Kpgmps7i1I^4lCeS}IdJ}{r%_vWRa0gv)M%uvcK9BYp zb3v2ux$VRJ)u7p2$9tT=6nOmZ!3HU}%nULN&O`j?mKvR0kH+JBzXp=6Y{ztw_MveM z)+>2vye9uYbArGuQfQVmNE*Le#`w+ivG;C*_t4<1_wGCsl^)0E?15&HJQtA|p%i{n zD9ZD49jC{d>kkFZwy(CdFb`N@cA#0hqI?%LyYIzH?9uzYz5khg<@zRHz4>C~`^>!C z?<9Ne!E&xAw&pJz7UK)Y#hn~30b 
zj_134{yby~*Fy&m;H4K|Ae=U1$&zI{1eMHym!6)-0<;;8O6d^UU7#X=`1ZP4rmpS?6$R;pnWELPbEf+B>;t@&s4CHy|&0UDRoI(^6%4PiK zqWn^AJuaZuu3eANuy6(0sOTs}Mnz-MqUC69*@(KDCM=x4Sf>Ea$<4*GWh*dc>P)Op zlF}BIq|b9|6`9m<>305E|N0SKo@-x@J9It~Go1T%;tYBbCa@ z%4eaw9+9Er5E(WOQzp$IAkWZN=82Q05;CS?!UQgS%zK{3p8KA`UH3eSnX?u$jWP?|XhL%{E}S3G%BaRuyL6<`Y=PMW&D?&3@r61q z9}sGM^ZJC^n~&XkW0~FNF06cQQ;ubfps)Kwv&K65p^1534egJNyNn5U6z@%EK-3t? zdy-?g?Yp2^YvQWcHjpUP)-)_ldCnuS2^XJXiAqaDGi?D6y-P*5NRM*so>bfRY za$8YURz+xz#j5!ASiF2S3X7|-=iY}{fvdsrk%2mn_{b4K2&J9Rp1nlJ5U-SxbM#vV z8C^z(W#oCKixy$ocFmA)0i>VuvSrde!$7mNDhniAtysQ}%XNh2bqcbxXDvWzaHLi~ z=gyjksPG9GH8PZ-EPa;4wfj>1>O}3*RaxGMlHywAvNE2VB_9u$BPltJzBdgK5%kH( z30SybnRc6sTb+b8ak5NCT%3%gk&Ja*mu>NDGZ4Qn6YJv%)pHhO5#vL{f;l)^UxMrH zB?RVN+^8>5z1Lcdo9%_TNdSJQvl#F6RO7~$4jk%eM9%D~SUYzro_g#a{P`dJ2>6FT z#vfcbfLlA7@NQQjVYmRFZp@}~aJ##J&|H8Idj*!WaI-HT@AntsliqxM*_(^cWc}VY z?CYt*%e&k0vrjMJFMt1Y{N8U8n&snv`7XiJvP^%`b=oE4a!y;(7UpzAu}CYc zLu?lb6I$OJ>e}`{k?E!)`QVlE1DBFU5A!%n8Rg4v)8C9+^43n4XPMq%+ILpJaJFAL z4~FLN0%i%8vY!nYk`dk@XXNdk;06YF(DnZB3GVK5Fu}Hej>!F=1ZqtyZ)PZHc7F&c zvez*W%%s^ZYas4xxC6Mz0<%9f54BAnAk$&)-F?5-fQ&3KOMzxOrJL)Tez%VJpB>m< zC9TcU$IZRZ5c1}gxi`U92|QQVv=BBsP}?9M)HM-~ zO9;$)Q~@>+jMK7;kVZ(($Sy>7UMW&|Oj1fF5(%R77c50sSTun)l!a`Zj;LW&2^q&( z)&-n%vh!IW#~~~{0-@ny1n3AXSg-`mO&usNt-`!H^A&QJE?t6z#3U?UvK*ONc|0bb zz#NH?;0OhQ^wa|V08qMSSZJ1^=j4OCdZxdju?_7VJv!W$oNvx-nYAwmS($mr$;?OG zs&yC_F&R-26A8#O5IvTFJarD{EsViR0(DYK4)XFVP+C%lTAsJHsTW-vccP1s+|j)U z9fah@)*Yy)nm6u2W5;eZbnZgq#$5#Fd(pf5F>HI_d2E011#Bi9_uu;@w%q?Tw%`9O z?%w+%?tb7o-21>YxclBm3G4>!M~xhX=Ef#M^Erh`i4*!EuZ^peX@!+IV>H0$d-f=G zS1bjLEaH6Ocm$1}6-j*Sug&wc^#juRq(8yDGYT6wl}f9W>%U6N5=%=F<- z8N%f+gJ$#Lu-AWvlv8+1>@h9DZW-To-?R5PIlqlRW*-}O-CbBL4A^sOoH~An&&3P} z>zPZz-QuhIC)4HLPF;g+*axKrluOmvNWJAUG%zGiEH-Y35~I_VV%u1?Q@21IiYdEi}uw z0P=yKv^3jR<{0V6>;mSPW&AH@4OXpOhiOyi5{^eBB5bTeb2$BU`0zkpKLlf=rfF+* zWqA{-tD0GfuGNo-b;!#68Z(JdRxz@33RroZ$9NEh2v)wO>&DtO(z2XLKu*CL2b9;w zrXe9Vg8-d|xP&yUPfEeE*|V@DI0&ta=HpCbC2n+7;(Bu)-fGFm+l@tdr@0Kb+RE@w 
zM+t6rm*Yl%Ew1fs#r?U7SU)xtHR&7h(VLfn|Mge+uRs0@KX`R7-rZD*FT2a|O-~U% z?asx=eT8^WTa-)jad#z`g?P6w3-9*l?QqjL;-T}RfJ zOrxvhhmj5n;9T3?ntUgDyQ~tN$w~^%0?ZdqpQCKclN6?o||*Q6VnObwZB_@HVWJN;eyi-i_WPvJsx)_ zXg;rno#chDgR0>>X|(raZBX*r_u9_AP;N&u!NEKuuVkA9j=?tQE|`85+17g>F(33? z?)^N!xA0cQ)%1GEYIai|0hf#zd} z4&v~^0|uC><4oJR^A~WEcCfjByB1IaqSErbaN%++T^5Uli`=FMM>5Ehm}W5TqcFDHDdx^uqO<5@ZVRgmq@@*EOpXbt00C>{1k$H=??s z3(Xz!G2}M15|CRv3B+yN2q@doENb13`j+jeYu=8Awq0oI+=G^`d#L-+(6I*%UHsj5 zANucp3fu1g9wGUA*i3DI@HyP~^!M@1t4HzNYbWsZi-)lH@t5)3OZ)NSEBmyCThbU5 zGzJ~5Z8$f;=Sg4CVVvksGUATL7V$OumG*6xiuZBGk>iJTrodB258;%8vx#eFUE+(e zPw_2ph146r%x{seKdQLh#$sP!7I0-6euYZEM=T$P0%d=l;=Sthzv45pwDQCh1I@Nu zna)AL7^XvJvXJ%wFC~8C{%zhwdgF+Z6MKfZcRTc|^-q!bl{Qwmj?obVxSxq{^iN7Z z^g1?x^G63DL;son=8hb|@q&)`TyhQhaJQCXD`Cq zcmlJ)GQoLWd@9z*XJDOl$sjZ*CnRCntZ9ge3Pex*Vw`R!5Vu$1W?Lz4c9!CLYawp6 zm*Hl61>Pn|%LjpPZSTOLjWwuRG7SmSBkci%gl{o(BZv5WuOZdk>`4&I=;saKm4$v-GRywWA1C#)>_>%e+?O9ry&0Al`*Tgm) zp3oL)@o8lWYdf?jA9>3q>wDIhq#U&6u=&2oXz)424tb6tp;^jirpMa8`j&90`RyZyCp5n;;W_5DD|z_NTmbWvwuk-#aEbtXW9 zC2ztT6ikBkYeQGU)MwP2f6f&3yJ;+u28CpN9vGJrxn=ALUcHs>D{U? 
zm(=R8bVd2)I_EQ{oWCNt4VJDQuj2+kW>30SH36Brndb%bTO(bES>K1+)@`V1-iqo*s%ablCoH$_R8X$tvaWSE>e_ar zzWpvVcHW1kjrXE?(*x+e>nZHs`!crQ|C~be&WB&d!_ObW_m5t{smt%+%$0X=;re^H z{>~>j&)+Y;ykB8w#0Z%(Sze`&M1OA|u3f#%AWz?6j8eN#nXmZbpm$-@V5 z^2kBz5Kgdybn@_F#uwVXR@y1ru8CpZSmu@8=J+F*_RXKq5OpH@n$>w;U!d9cA#}?> zmKk4_*QEUaJ9L`o6`Aqi56#`sY~=yxGhRQk(tg7l`+Pyc zg0n}wSIrYH-P?WRfhjAcdx#wp%q!EtAkchHq4}%>%sh|2mY4V-txARHyb zxeXtN(e$B^ks}cr5P~U_<{~FwM&Rg1X=w|pD@4*Q!@#m>eU{y zhkeEPfck{`jK4qYE5hw9h4^$!DZbcJf*))x!H>3=;RpA$;H}-AXj!uqC!cu$fBvIe z_@_VpA-??R22LN|PrIf45`0aU3*mz!v_UgAv)U(Pvkx?D*~J(@fAD#8?cpqI$Ffda z(xs90AJ0T5v|%?aTZzrN{lRlSjW=#+_Hu#s33nO(UGniGP3N8E^+&UmbX#d-o!lGq zWhmLox5Q8Rtz9!5iQl@Q*|(+H1__y^$d2!NN&7ZiaApFyl$&?okj*fC{mz7G3*9e~ z0Ghitq~Pp4v|py7pxG@AkTzHj@xX?X1%5Lj0E4j~7|T?&L$xrAyap*oYSQKp&Hmu* z56zy|5X!zn6%Bbk2mUm@uIpO)-M7(_NFb)j(td>!hJl@EaI_dXe!AAE*)t0wm3V+Px(8R6ximR zl3wS2MNV36<;_$!OlzB!1&X0$?%$lx%5C@l@)V4c*F07wg}Ceac%GLOdUufVBd@&o zFnM3`67+I&)AnKmJ|BI7yB2n_>_7pM44&Px0^ ze+5U6oWM&jzJi@QccHndg#}$P;^N{kbH+?WghwD`Ob7xAcVmLaATlD7FgpR0CrgXa zY|NTDOD8gkiXM-YlnfLUl%lAx0#z)GYpR+F&UGkcp_!Xiq|@Wd^wQFbHez@HrcIfJ z8W!B?+_s)DyQ1(-Q&36?BfjdknOkd&N_JQ-)Ystx5e8&O)l5#_bL zsBGAR3aW;1T-$aR!T4^}cifGJ&U?7r!)?2DyZ*=RZM}Q3aqDB)dhhetMs2zKS!|Z^ zmmhfrFCDyq(^uZd+aG>~4?g=ApM3QLynX9KyztT+C@QJaRy7@lFHoiiAEv_r?tA?; zoIlTJ%eXC|C>2rL(yVgqU8dJQA&~3NsyKUU#OD+v{ywz#BPd{lTndg#nMq<-Y(5#hBj}Lied`y6OsMa;{ zK}Q+~MP9#j#{`kQ%@NqJ;9+8$Z5(jLO95vYM$9!Fpn+z2V@AK_azNsvfcL4hyuVX; z>*_T;`S_EFU-svFrC(B^AVt^8C6v+I$0llSIm@y9#5#uqI=?x-~%Ooco=&$QnCt-bT zA~KTFk-BCrR*f5r?D0`}IC(WLwH4z=R|#%zD#fkMmAKhkjki0iakIM~@ATH-#;z8; z+FF8wxs$Lca3mh;XvRPP$?Cg*f}l zllb}Xe1?Dev){mb*BJ{4%{u=9p;^Fq;242a-aIRqioJ=g(S}?M>((#+zsNtWR{r~GB|`kowavqKy!XLk z++g|?nz?Sb>oHt^JKE^_M1KH3I%pK4qqdb9^fqjq&toj<$A7WjB-k#SMDi)aiq1g<#=aoDC zUU>Gsj$8J6_S)`y(n8FU-KO_P3H+R;5kWlQ%-`O7=hDYcq(G752H&6FqrCRy()}#;et8}1 z6#7A>6AU~-K?+jmUi@Y5#rr(n*L81m%X>YSn0MB0Z9dBLvxVSLa^Jhi?GyGnc%O&M z<|KW5rq)+|x>&jTD_fnT+|X?D(VKVT2LjCcs!HvJ=Qwv>!TAge>Jv=s@uMdQ*0h}q 
z7x5+w?YFL9$E}<1;M$eTdi|U%*q3KZfr;^Arp9N0G825tS8XShQe1Ca@3- z2?@rGnX{3ena#p7UuXW4skRGP2E6xn%|2nicUNF9x+@iVb9E(N&-^{A}dL~!m=Xs&JD zMQFYYH68b$ZsUDu=zai=J@=!ro9kWop|yW6I=4NFo?TC(|L*6p^}ZLd^&SHAUC&_a zo@a31qp#xd=^MEI?w5G)(;wp9kH5z0^Ovz>_a4N>uSKxT)<#&CnGK~yUx9hiBE0t6 zt9bMJRX(TVxOk4fa+a}14`s z5^&}@DFtT2^jU(mj?pW}IRJC*(q+8y>g$*^k>D5xB8B7?^v zDs%#-PnnB#YtmRru0alcJvFrunORk=ls4$dGTAcDyTkxOa#3*=vU7@%m0f^>d_r?x z3G#F4ml=huNN4J(I0DS$$4x`P$Pg@?vjnLLtmMXTK-&6rWW=pS+{ADcO^?R+@>B3u zTP1GyRO5^OT6{pry+O!*ds7wO?Q6uXz7|~UZ^d2NYq2zV7}6F`#@qXz1ODa@@gINq zEq-wHS)Ax9#+&VV_+d{Oe$-uquQt};>rHj|ytfvgZXys9!oTb*$5*{&_^PjjpiFHo z!Pi^M@%6SUe7U_EU+t*I*So9m%{}$F{qQF2?{7rms>L{W=oS3KpZy3w|D$hlS>{Zj z-%5j)G7^1ATA9z&re$_u!xLgtv?G>VUeOMnvdoC~ujQaqE`5ijVy$H*_t)-Cv|WE- z7AZ6j3C_O$tG+-Rw?6Uf>X((f_J6l*S3|eX(l6cVa?8Gz^2(%J&LuIx7n;usFiWd5 zGlIZuJ|G$l&F10oL{4DPgz=>2q9(JXRZRMl_!s&{3_eZ(kfl!5{VPJV+<^Z!y&-P5 zpy`%17Iz@iv^$yXBb%X)!T1Mqk%eYysk~FL7TI)(0c#5kujz(n3(j`wmL31peIMc= z`h~lKGSkAKY2>}0J3p*^jK|q#jK&Mi7Mx|fd=O)y`J%$BnSxwK#88K0I`kd|G}T#6 z-YW1aEW5?DYat-d*9*w$ zfn;cT?Z%t0KG$=$xh1tyYeR0ipOMJg%22Ru<$v$~=Ww6L{+2Xo`qb`>jd_9DD;Y-8 z4b7ep0p&OSUI55S?n%EQ<~aqdwa+YLgbY!4n)V{yHO^_v^BMhpn&~(pAC)oa3bbFn zFo5fq&g0F?7jfg-Wqfe+1`GD{*tD@7uf6;N3UhN16)wZEjm84PUs`$wQc}{9mYR)> zlstlR0RcIm>jY*M3(fRG9bsen;+2>XIT^({Whfv>=ceT%Jt-9_35iJGkcwOxx~{mA z(98n9yb<+vo#^S=g^rEe%@A`0MG(SgZehPOz^b+1UdKG6ce}q%#-@y~l zd>^G1b(lPDhPEmTAdeau;7rmo3}NA6g!weQ{>JNg`>i(#hR1N}BIEi1pYs6ykCM1; zWA~Y3(z<-m1I-8D_&)Z%_7Z;|;B%I48;7;)h7=H&PP0;Ul<;}fTX3qxPk#+)zC>N- z@os27M*xs+6sNh}KhFBwv=M#FDPQg+PuC{yEB2~=oOw?t8DCTEqpEE!_5r3b}T z1aE0!W+k+=tRAJM^(c|4=SyVj`9@V@Vzv(Pwrp7(rcIxNkkBXu21Fu^&^$I`3L?WM zVA_OPh+mnY730#PI^<+m(&vkjn3Ruo2?f}YT#BrmDpsP)kd{$IA1mXzYml6hp&$3= z7nBgBS+UM9L+XZHB&A?+rN<2{X^j?tq+Fmvi$LUTH@2-NFi)?(GQ<}hnt)5O#M^@o9ppze*-?)(u^ZD`KX>Z3v)&dLw9v9{@~lUfPefg z{L@#L@Zn>daI_&6mm4$iO?Mf7v$GW6ZmPy-n`-f8FF}~V{Pm_Xd@I1*Uru-~#}`{F z@cFiCst#XmtHC$h3D`U92+Vc(_MSR?^H3|^eW(iuc6T8?W+BcVeHH)q55JE;_{~po z_V7NI%k&%iqJ?G!XBkmr5Ht%nc)?jXNM?|EZJ@+Z4>VC_@Pg1v0~iOd@&LR(6?-o 
z28{_WXqN677iD}_7JR1vBLgONnT4oy&oI;O(xF|z=Y}Ki>ubq8N4$u)>7W1U>t<(Z z#or3ok|_C)2{cP1Qycf5N9Qzm);<5}b;VGaJ_&S^v2s4$E;1QsUJ00W_jdDJ{x_K_ z*AVHH&#J*eU1VNg_y=~cvNbr*+=sp*Jrta@$4b=cThORL;sA>s)88Qnndz zrLl^0$hDlpMEWDyey{$8i@7&m%P}yIp*h>F^j^h~r~z>{{+9jp6%5yOeZGUCS zKe^vCa($-9yyld&@m^_MGQFpftQ*j_&}?4m(7sNvf=1gg&z3T*Y4;ZlGqy0wtEIms{+AV|kY$~DRS9d;-=X~5|(kcGHXCU@Pds82lR%efx zmwfJKx@}&A5|GwmCY^Ny-n=K5j4hl0hAka&33z!?b#fP7%@$ zqNlSBJGO4d+W0sYaH9|u6r_EdH*84L2~<*&Gm(~(jr3H4aB4m>(+iB!inL{^Q2IA- z$V7BVG}2ix=BDK-H0Myc>3M|lQbKPvD$5#BS>AxEie@y{Z^Y*QJ?LoPjGCHuR8;%Q7%MvH5*N9b7coXpcw_ljc91ujK`mS1NT1iJa*pyB(~i1C~7+Tk(^bA^(^EI zN?K4!V3wBV>gJuOYukhR_ItF?vV7>*)_p%(yB|O^)kKhP+ISyYH$8xj{d>{A={|H& z^{rdc(6$9#eRrdxSr?5R8%2O9!uOti5*IHLtS+C&Y38d8 z8+hUTSzJ1IhLx4GxN`m!uADoLYlLS3*MS2EaPqZ%IP(2{IIwR&jsP1$>38}DgGy>$y#bpJP0d3bM8^iv$?7Tb9Hw@e}bY+O5Z5C@o7UZE0p+x+EXXo@Xpm5oj$e zSbx)U(UG0`&%_|6ll$1EEypCrm=OsqB>PKX*|a!I3^ff7%x8%S+R|*GS!Q&T(Ke*q zm3)x+caCX|k6Vq=0RgPkO~A@kYfxBRq1`m%*Cb(aCfQNZY6goK3ac;;dB)v=Kiv3UM+WTzIPvZxUSIn~HcFGp5J88#&4(f@O? 
zft6)}?)dm*WM&p1JtGJ84NWL1tw2&z8j_PUX#YGetC;Xyg@lA$tc%OQs-=mTIAIR& zc_xOB492XfvrtON&Re<^nX{&2OHwQbT8eSArwSkU*Arfw@FjuyOM>zzz4iEHTN`dt z&*ZE_TJUhJof?6oFFc6<`s<$n|Ib(W%XbdptzGrF(3FKYo3rt8XEC9<5MUrh1}e~1b*6j{H%z|-422xY9Op9}RE|o{+Qw4-(f+`EsJZ_5`6Nx3W79m@PdE z>c&3Yv-d@O@6{7{;OW<~>(Ljn<-upM^`572@7@eh#D~r!GS>p-_Zo%NGy{X1bH6m1`)qHg}$1DOjcm?^UF{~ zC@&$%mX^xLwsnNwBpw%vU;=Y^SOje>5_6c2WI}IgKKGR=_EXD|lg?#&6_OM4btuTR z^c<{Rlc;GPKYk)1ycA7MO=xRt<+*vC!cwe`U5A4Fa^z-LATy^5Yk6!!d=}!CZ@`3c za}X9el@-n~ESo(SZK)fub^R(llb3+E`)cq}Z#8cB*W!z9jreR!J-*n{j8C>V<3ry2 zg-vzniCc)+z+vdfSdAZka0>Y6-^Ty?-4F2x=U>6wyIb()#u9w6u?!!!m*Ufn<@l_- z3ZM1V5SFX)C4u=XF2Cxn#aG(uO!d_f%Bd~fwzU>t3pnrK`fdXAjtcyAH=+B!Mtu5M zFRnjz7mhx1H)1A@#i=)5#((|W&vEiA0*x4 zPuj(delj5IA~F$|q6_pfBU7%pWy>n@V{J&21~qLJH|IBbXv-{~lah7GFDbXIUtUr% zb}@~Y87rJ2=bXUc0cPHq3zm%}A-o9X2@IZxV;_(O4`vt>_Ul2jfdY5Nn{)XG8UY3K z@ZHB)V77Aq=Z0ne`fh0uqd}!B{PVP3mb>DgmmQdzk;k{6z7|9T+2!tjR?%;1ZNpy zZtK?j&?SAE1&nJtJVVaa)onyWeYdtoHxis{s@hOS;4Ug@LS?OtlCcH-yB^2G&mF=O zubjZ6FCD|9-#dtoP4^%pzY+CqyU?=nUUc?8g0}95P*uMJB^8@cRJsv`#qB66>p*E` zH$izLx3wXkFup!17jx&YL}=J}!i)g&DAh<8ERP8aBAg`Q>8GB+*I#{(fBc8P!N2~~ z-{J57<}dK`zx-4D#t**4yYIb?Yu7L0+Es${S;p~!GkBBhH{Q918}DAjn}q9ggzA?M z?8DaGJCT}}f|-*hBV_mpZDoucF&q&A1SCp8D==s{0>=>CxjusFh+h?t7oL8WmHGjk zWdS~L^a!6bpF90d3eL;4Pc!2r{ZQUM2sCSqGZGt-)@D12huEle&Cr%+O5>>53uA`| z2BgrHxaXG3{s+tJYwp*q3eALQcLRsVXVpHv4M3)Rb8MYHuHf&9om`g|U&b;mh;1Rv zds4erShnK%xXsf|jMwr_tGQQOIIEa$uLMZ=9Q|Zkn|WOanol2PK2kEK^Qj~JE!!k6 zv*IM*3j_rQ=(hqPVW9{J2%iJ~Oo*PQ6@qEwCZIVZ z9nUw`;dDm>u57Hr&Aw{9-$%&Z+JH~{YVm1*BR(N4ze7D+lZmt`j6uZ92A;BG944#mKM@8f^|_ut{8 z_pjm9!9(;fu>sne#8=vj`U!2w8($^1nt1EcAiEQ2mTlHojK6sPcYM$-J-_$@ZQUQ3 zMIPqzIVUCOqMb{tt_5Yc44cVyWj`t33^YqR^!orGXy(0gDZdRgbK7@7v$qTr-?f!q z%ZA4HZ5e4~^3rz8kk}*J1(q!|OIbNg07gC@xX6>~xUPg|d5wEXf}My7lx_o;oJ45w z(1OqPP&~4M$iRf{J7QtW9Ta`VV;v|kU?7?11T;sUYL4bA@0>DEwr=hjg9XYa%JTtnH>(AU*G6lhj4k3IP+LBzYC z@hM+B_BqE8$4M~K(My=Gzf3mZihda|i-tT0uO)rn#a6^twRJ+a(S}5FUiVOYK_mav?uAN3?}$kH$3Qrhfu$3nJAp;;tTW^dbezZaUT+fgIkG8|}@ 
z5jCVWx~X1Tnp+6mP1;4Hs5h1vwss|0NyU^Bk z9~#^5L0Qda6bLMrb)uk{fLz#&f}%#0lr^Keb|W%#DzJEYJi;R;D##2U5uiiv4Rg9+ zj3(?XUbGm!goUe@ui!8K{Qn^+|0DkC@BbEm``3So-}~85@f+U~mfv|3H?Cd6O;$qQ zzkU@TzV#;FyG}r6MNImVA3X3X9)9$G)HYTlZrvJ;pE4DrLPqmC!!VN9iQ;ksmvcsq zL~K+TissEh%bHclSuhJTg9$BxqYyquh7lZw855^sOXnt>+<%w_xU{yi06ub*etiO0 z3Ca2avG^j_^;v72V;rCzsAvPmzG%NJ8>HXyDM~&t^~O@h0@|@iyJuKQIl*&iVaol6 z_C1l~bB^74KYEh$*nQ+!mrp4$`woreg=a~#_`k#li9Nh;jWM>oapD5^)xO;>aCWye zn?@P8NKagcSMS-0Q@c0g(vBXS>8&R)*WlEyPP8V) zB8tyEoPHS?OyDLcj}8q&IAJa-I-JlQitxw?+HbhFK!-)hWJBB@7KyRZlQ3`Y5-gk- zgV7_y5E?uVE0?cFL2fPbva56^zpRWBWTwlv2_?wMEJsFKF;=cvj|r2eV+1RVGR&h4 zUn$eo_w{Y1-!`#wTZiJJGUVmb?@L%IE|3Y2s*#>qiuH-v*pQHgMe~+n;@F9-M2BL- zq6K)ZuN&t!b>M18E#4tCf6!lz54Y6d)2;Qmy}1dW?dZalP0iS{eko#thoK;D7T!Pi z68`m1zruh2$v61t4=>=W=kLW^yIXK`TO&T)R*#SSYj9gyt9vR5&!Q@PLQwuxTAWqY z_?-FpSx>F@alS2moe9pL^%I~8++S=ZL=&37*dfqdgCFgx#t$BB!+VeQ;>!~+;>Ei+ zV{UXXE}z&B{MX;%i`#GGBrCh}jhS?vRG6b}%TgL9yl+Uvu4%Jw(blya0?TSEol#KB zZm~nZ@`>j&T;{}a>kG8;p`qEAxx5ma7oX#?yoMW=txScJX<|AJG@CNo__lLj3eEJ5 z0a+Re6kil^-7E1^^PjS+BdOG@->BbOpS5K-&nNfiNE&!dyfF7}Tbxa|GJ)n37-nj- z(of}-^U)v0%)d-$_h|tYl0WSN2kv8_(Yj1_ z1ff}LXVRra(oHda7Md@eI>+DV_`d+HlV0Al1u-pjT+hPZr$02y*sA}(hGuu(xz9J0 z?0)thtW2S3pN|bhR=(G>(meD)Go^WI;Mu*OxrW?>-!)zP_{-Y2JAXy)fMUdZ=F#cFdTmLA{!e?7%J=}|%Q;q{xRjRWr=EBWIXPJ_8WO>RbB)4s zQeuV!&DoS~X_h|C0?jfrU@`LYiV4J31npG_8554gwMod!%tuLqd{|VC@{)QK7uI@O zn&m?!nf*_oc_X1&TA5V>&V=S#h2}0a5||t7J9HMZ(qe*csX%i#8r!y`NoMYA--V`) zdr;r83)M|qQP<>5Skkf+6}9~+t=feAk`AgJc}3DEx(S6mPjQ*37Hij~VBGj=7(QI! 
zU?fJ49Odb+92_zR85zkqdh{Uv_z(X8|M|cE1OM>1e~aJw*>B>L4?n<_3+HvJWXW&& z&{FdH^1vBfKYtc)o*%&LFFc2?))vGpU5Giery??9jE=c0=N65i0;q^l!!Ro-5NpSe zL&J*YxF;(EuT@pxM0FLewzT4UUmu?3@1iA(FlEdbgpVAFF+9&0f>&C?2Hdyj9vs10`HYfHX-%*_AHj(`hSz@VqyJTd>>=!#1d*?Q_ zDT#+%=e~C7>lfmi#y>qVnzH?f&AHrhf&02zn%%PCZ2iQPB__U@vc;A!Jg3+{(`H^P zGggaq+iCrNWU!BlpZG$vCvI|oQ(iC+8Ed@IER7EiH1j^>HM?}rP-x~-fcZ54-}lA= zRFqXuYw>077rywLc3N^*Kc)q6%7y3JJrK=8?348-P+VE6+ zEf$21K^VbW`qT%6j>Yica4sVdOfVKW4hfTWLUU9kBBG*H;So_fbe&965*;~FKlTe9 zGY&CJ)*v@iM!=~-W?BU+zA{Ns8R5B9{aIjn(uCO<6BI>Y9>a4-VcOK$Sh9E}5))F< z-PMQsx@J^X)DVLfW+@F14MFILVOTq9EDm%u;YvpX zZnRb6*2Z$&>?y~4eKq)COB3GR+J@IFa!@=u3NazWu(PoMzx~N2;IDpy|NO}Z_|x-m z;P!)?xnC1L+0}}Vw>IFD&9(SkfV)?Ql%qCPnO0|lv9vY|IDgVrO>x;>qxzWJJ|;+i z(o5Lp@tN zP?POO_F5u)Z&E?f-+uYvKF-I`n73YNmi#t(F1pHfg=H15O(PJ4mGo&|eFrrA-=ki*3Jo%2 zZObmyRuOg!rM0Xy(GbbHOm@;xzYz^}?IwiC_mglGbDZPPYXH*7&=T_3@?Q@dXjmbBP-73Du>h*%@#TR5!s~ff5b_8|-u}-5$bme6IG0mL496k@dGgfh zXjmSDdo$ATQfUQ_6GqNAwc&hyJuXyL;u>|Mj!@Cvg^zdcz~vp=aDQbvl4nlCWS%3G z=Nd(YhlL;}W;ye}6bBCO$D7x$Xkl|v-dOTk8(5ZDC$T`b8{5+lqA3~L@C@VJ8OAsZ z%{G>L3Ut?NWEn}vLV}N6an@?6Mh^dY;1%sw%L3ysUTF6Ej`0z~s`hOj3YwWlc_uuE z2b@(>_DNSoU*9z1|4zKLvU#TBb!|CuiT>%C4xiVP7-(QeJ_MAh?j2x0aR!GD97acb zC#FxIsoi_z4Mt#SI2J8gfr9LE`cD~_#wB6)(j}P03f0cs6x`g|gSWc{nk#T}XD13{ zm+Ny5501u^Suu#7whY1J<{~I^I!2F~fWY8!2n?Bs04{?<9dSLFiVUBKaZ!^H$!#;I z&BfYPNyylckBsCJR(7iiyEVv?p&JRj@$re6G--xT95j4*2>pKo)4fQCsavyV16HkE z&&qQ;`ukWZt!+X@c@6!sT&J?nl8*z^@;sI3)RcUrq-10Mf<*`l9Hre#R!0S6e`6W0 zwN~MJQ!(D`EW!2OO1!nD5y#ug(YtyP789;h=1s!2kI>eE#uO zoH@E*?ZY%mS(~B#&{jrWoPazXQ;a*^jXT=vVL*;L@}1kJL)`V(#c zJ6oEaa!Njil}1aw7kf=g8OTDj6FY4AWx?3WZ1>%!16mN84LBSB5r203t9+YbWk(b{ z%WLbmIC@S=qkL?6)IhV=X1S9NE9Y@i6YMUXAuwy-T0S7Z2V;ZTpiI|{mjqc@CS36W z42EX+#~pIK?|B56G}uYTyX6UjM#f-Ffo<2lzinXiZnysp10^WeSfy}6!)u?KSv89`zYH1k>f za%eUm0{YtM5O(Uw=9xW@X)SY0YN)oLG}pGrS*Q1kX?FH;%RU?5Ycn7HVA;szft5XW zD7p9Z*77H|YD>S#XZ9!Fy!16{e3D1P(x`rN= zv^Lk)5Sq)I38FHjRtt~oQ1m7O|_ot(E^ zCVMETz}$t)oslpI%fqE**}(0D@#E3n+NQ%IeR=y+{Kvoj0{{9?e~;h&t)Jrko9}3R 
zIez$%##5=V$_N$ai+4UV+M_J5p1+90uf2h(HexKQ7Mx2l`)VO7*Foh^8`rxDk>Y6%$iEZlqW`L?aN zwR=0xZS6yUdLouaNekpK1Q5m}Lj#%bBQbZ*Y&`SSlX&N?xA^So^L*BCUAlxTeAZHq z$oR4{X0#S;GF;(V-UDs$j1_HF3o+iS*slkEDUUtT&n#T}$l2FlZd)|+Ez}3c1tZh_ z!)w0+#rn;Ujxyz~lzYn-mWT38Jn_+#cV<>G`8^n#O?r%9x$nh5vyr=u&~k$Lr0YzR zlqr&K>8g0<*h!o`!gHNDhrNr8+v-i%yU z;?`$oVG^HTfASiW*jC=T86P>7a%-z8iGbgV#>rBNKej0VNNMh6S9$+QcSq5 zL}o@QGBXRYVr3kHSxFu~JOH6#<1lyba)NRO*00OKnz(et$0ZY**J&Ksv3-wDlV4lY zq#p*S@Vuh5)Erixi;$mNu6>(RQ}Zx?(K7lOp_%p>6Fvs}T1s)by#kk-3vr>V1gCl` zus1IOi4j4VH)7>Z(qW#y_@jCj(U7a$?$NWZ)?D9EdbJ z203G=UU(hEFAg9NCEL>MZPYPku*P{O{%e^b<%F_~l;gfK*EZMJn9nrHn^!F_#FzNn zH%&(B2U<>9R&eA~b~)E~)fk!gT1P?DT`$Dq? zW(&>oz~oq28kwLngf!rIWrLf0-TpQYaWEk2e>|5p4h>$s{!q|tha@t`n5C@{*jZzA zn-dK6{M69>fA}Dori*-7B56U;KIHO%oux4jV<-wN`oJjX+HQFRu&*QyVo>gTb{m@i zfA0T1zj<%ArPd%h$y=xy z`xLu$%jB8A+<7K4pzM|#noasW6&STkC!Z*NR(fB)vbH1gxxT$;e|dc2&d*quKId|z z`)FE*4m^XQnR4vSU+#Oe(w1pn-{zTJ_fMZbYwjxnO7^>aPDXV&t6emtWmIMvEG;QU zdU`rSL&B*jtXrFmL0y8uCf!Tc`^ zFTeP*!o$8d4(kknWdh8_^#pS+OPWwq+l5WNccZDfUne%HsUaZOcB8(26B-(N3Cx?c zn?^-t3-@o(VC-E%kIedi|r@$dc~|M~C#j^F#)&v4`Vb>`b? 
z$du40jvZ0>l403S%7jYvi!)-!v~T%%Tb9=^UdD^hy@0thW+Q|^J|{E^B}-z^6Bmzr z*RR8qDG7KXI~gw(q~qNOw&SPAU&l|6y^7y_{YCu7;}7EpyLaLTTYB-?=1zQeUoYOf zw;OM7ZpO{#T70{y8^3e^UHIzm?f7W#9(=#M0rl%wVL?O)rUV6GZgjMMNH~s_o?ZQ0 z@#%*j;`Nta!nVFXy!_m=ER-2LWD*A2p!mwA^Yl;J!C4k&=eYEY6EAk6K0=%2J(>?n zt^Ls!2@7t(^oaX-Mca1d8A{HI{%=~gME1*J#BnJY*6w^V(3?@iU$j7fz?Tx5Um-M~Z7;{!mO`BFF2)gpZSBJ8SROPA2lwv6fByXUfPeo>0`s5Z zKY#BteDeGryt%6$@9(O@Cp#+f*^XKT=1;cQ;$uSdM}*vusZR;dpKh+kZ9=p5ZSE#C z6N;rpSpZo;`Tb5=n)MGhR`S0}d?c;S1ZRQf+XU~gx5@-3wfOvzO}M@PQG7hG4;>Zx zm^U#BzyHI}@Rxt^4bC2Yfi`i3%s^mfJoiAeQ_ht!?l?iuP2E>L!BP`OAuv~gbI zzo*Qg-O0CaZdkT5*R{T7;=kHCeOBRF(m<~AZ*i9W)yH@p!wS3~xhLM69WK?U?2>Y4 zC}%a&o7-HHnvH%?;wP!C*|fOtV<@?e%3toh zxTAOut?fC^Ba(aaA97Ca)fbwb;4jB}?w9xIl8(`+g%Phqgo&<-YIi z10uH%NE+SiBL904dDBF(!t3>8@oSNP-tjlL4^Ff9`TNMGU#{b%$G&l~vgy+q2%UVq zJa7(I&JF1G-@bMgS1w&-p|?(_(H=D_fY2PNg`#%J&=E6obX{7TrhnKi0aCX=pyts zNULy_jK@r1uG{2MU2PAls-=5H6Y>gbQ79v4aG%Det!R^0=JuUv?bxAeZres}AvkX# zOm9I}ZW*Rbn~%WJ5tu%09wtqm;Xv{T0&{@Ea=@rj1cG=x^VHKiqoRy-@Ux%%6o35} ze}OM=e?~Z#mgOTjK|nr!%t%I;a(9)9={g7|m^p84Tp zQ8IrCw#LTe@x&xNm%I)yWhLUJqGbHuiI;%C|6TmgKm7sz=P!SXfBp6r{_^?>{PwF) z5SaV%$$dTe$@BN&?@zvpKYj5@eAVBD+wG0`38DGxhj!zuXZPXzTWGc8Pn0-(Tdk!ev#=us(rC95}IW!>GK4=Gc4fFGUn-o zIRedMbF_IAPbm6@?Q1N33T>s_D**?WJTZr2KDpzsNaC8Y&%qUH0Z<*gmOkm*J%h)p zkGLz&29&)rEzLS(rb}mdtzQ?K?Yb{C%PVGYYne#WeC@&|+_-uj-=i&Mre)IZf(Xnb zwQ{zpe;W=SIfZqJnJ6xAMM*_Biba*(D6j3n^2G~~GkQ>QLNRnt8vX?O&Mbx)wQ_DPggKTcqN1V!b0Q9$LDJ&5e$J;*EpP)(dcP)HI@*#<5sUzn9j{E(lMLoEr`LWfFS+I zby4IPJWpso*-?&Dgyv)Ig*dsT0ln*1p?T#Zd~@t|y#Mm!_`%Va@W+=A;wRsK2=Cw1 ziVyB;z(;r0;zNS-M>}fpu>dpmApu&~3C|yIt-&W-Oc#yYo8)7_3VhUEp>XVlX0JZj zSfSAT@g~AJCEYtd+gyRKWX3?A_shroar?mI`0(5TROO^%-h?Rp?H~RYe*Ed1ICJnh z+JMa6;Pe2X+;(qbt(2>#Y*KrtjY&D|v3=UzuL8~P^2}RaP~rnTzkhiyl5Q3D95(*z z`J@aJe-cJ;HGFyaioPK%AZ3&KtOLvVBXidp+~&kU@|SQ-dKTuzW{orrd^U zuW8d@2F_=?7!3V2IRE9h9B0m9&tuo!vgiN5m7Gf_ArL@vWGf9rLo=PsDd;r#>+w=} z$&yCx<6t)JTrwZcxy^I%bWd>3FY=ikj}GoB{G7D9oyQlN2N#S(L9@*}_jyEK2b15X 
zpmfU~XKTtv4ydZ^HecD}L|z-Tw(rdgkX=8}2TiKW*fl1DP;p_l|ml@Jh`rlr^bV)xTt1K&L zK_5t9He(XYhk)|ok9_DOLrUdj7iu9Z1*eRbk(*Ph{TTCdOSw*HmKnl|Dp6cmhH(*N z5wmy+_Uzh?r=ENYhYlXYV~;(HJeh`@@SBxYiR|nu6c;z4wPh=+t2d&eyd5<)-3rPg z0cio}vWj*>fK1_AhvKp}RMl=mW6L&z@-71Mc7pO&?cUMexgG5rx1zkd88NZzF(x>Q z_aXN(Oh*w|zB~pI5z@*`SRNgKg^L!TyR$=MhUB^SQRcId5hMhT4;|c3*gvX(eB$^q zZZkSTSU!G~(9BpOvoFe0U`!|5AUMB$?F#xgZ9)j4DUjEiI3fU*OIF~a^ejA^oQl`e zQ*fjp69j;6L8Eh(CJn3BtoBeD&}i{P?*?SinDkckbGb1Ijn-t_b$&< zCe@JjprFyn&(6ZW@4td`XIY3{Fw?JJxNs2z11vwd&CEn;Y*xmGm5+*5rw-GOjL!QE zXDMLmg$5sK+>rv9g)R575GsbY}kOz^lT(2W@-0~Jo>S`6n7g!*hXmXSU4N!HnrkRTNSQ$*WkU) z_4s&4BR=2Lj4$qK!N)u6@WIw91?7(%Xcm1UT{K+KEd855=@l3*$A<)GZA~U1OP}Ts zI|$vtJKLi;v<`P3GCe)wrZ^C9FU#AE67DfpK^`vZJ= zV*uw4zeGE*!0g*?$J?jbD=jZHmW!Qv%M4@ZzGb4foD}Fx%|NpkmPMY50W}z!P36^B(x^2Uo((h)-YyL}Qa%lWW)Xc( zC;IYSPwv~q6v^9R2GY3GAo0fiWFLZgFaiz&CJr!rWLlo@+|uk1$yP@wnGRBxW}7cG zGffJxBD>AVV<@h5``hgAzl{=j()4(xt*cDC7=iY?6$uCBUjnzhDz*6|U2Wuk?KI-{ zeq|bTy;m1Bdxx9jdFYVtiio?#+(5H)ehbVlXm+mqv}Zpl7os$&t4zaupBURV$tXy!I2jYFkDQGi#gK|Q1K~5p2Pn?ED^X8$iXA>TO>~ZXS;~<`Y{uLAyRuh<|&m)05r4%_? 
zRoc2-Ro;Q>icYR~p^m@nYI@Mr(1+^EPUIIgAuGQgIfacVs^~yfU9Wxw*w(odErjOo z-aXjZy$eMp4OkVIgvfDInEpWC&j{YjDD9uUaKQosK?369S7YSJ5g0dqEM9u)1-y6j zE&BNh96Ini_PzcJ_V0TY$BrJt(IW>LSB~k@{H6%X#|THqjvnXIz_Uo8`84f#fX_{I z{pw}(bagT>M<8GX#eAI~6^WjdBs^DEg#AT%IGmr2V+FZf=HPH{HumLa;@X}*{OfOi zg#Y=wALHNu;&c4VkKe>U5uE>uK>g+3o%r4T&jUaB82HKO_{GO>;*U@7!yC0lxF>co zb}yQT`&Y-}X_XeA*6bV3kpxKE#UKu}g zr_U=w1ARz=neqVhnRC9pa-p^`+wK@sCeLJy2&S(Ea5>skxMfrqRsdG4ia`M@*b8PaB`^nJ41IBNR0zJu ziouEYG7K~n;A~wkPPdg~d%_CTPm9K-jkS1N`XFzq#qC}7`1*lXe6qV9U*6M(&vrE9 zvu(}zbcf6mXnQJX8Y&q*!;!$U=#y?QG~ebi z0?prSCOj)NZ^c)~p2Kwl^72_zk+5v3Lh~nAXbVTbk1JY%^2$Q98DrnKOzaY@#F%Op9@!m_>r=k>Hqq@7=4+3G)ywp`O5?@foCi~c6tXsbg(v$7e} z?uKRy%wBLd{~KtQ@^~m{7Mb$eEoo)GC~rE4kcYo0H#D1aQOZMy16bCZ`{Zv*q1pC1 z{4SXX;bf-G6v-^fbb_YH+tMsG7lBe~i89g_GznA;OoL^i*#fj|lg#!HGE9pu-P^?| z|`CHm)l)I~n8yM@}AS=JLEcd=SsAt!fT14<+p{E`UdQ(!qRl z=c_%h``j|ri2-ISo{vl4dvu7{LL)gPKg+Sf_f1)MG1%DP$lLbJI)n7M6z=&!Zm!Id;Kq_Szb-tZxV~d!SjC z@}EndG|KgzH1PiKEP3AIbM{z~cU_Jj()M_6wHpOzp5udH*IvWKabp#l0|;3oM+IQis6a$V zO~BabiI_NHDke{w&fgPRFq)6&MvWYUNfW1Qca7LsGr~quZaEsNn@~|)ftgchVDf}X zSi5E|iVKVJ=p#?yjW-UXqN0(7ZYtKV%T^^N<{>{%J`ikDaIUE`&|FVwuH|}uPA$?h z%8`**gMzXaR5ffuD?xc<&n~odY}S!55|XkobM|744w6>n5t=?}os)YQH+~`(EnbQ> zYt|v1z>t?$h{cPSAv`=9bLY)PLqipweDYxe@qWT0;fk^E*wF*3;{@iDM-Sq}kpnnE zaF*6)0cL^c-Q4JK>?+8@8x6HM zP*#G&1$j7AT7vV%MYvR6h?lcA;4;DapMLZ{@VCE(KltPV{^9fMz;Attf4*`GH+S~p zXZv2n|NP`j{QEZ_;li$NJe(efr?b}K>GT9Vm79(SiwaR3y9%==PQaW6^ARytTH9F} zQg9AJV89rR95xCeql59#0}tcP>u=-iz(ovPyn=HVFJM6WH*0INjQ((l_rIUt-}n0KtVC9#v8f5OX3oTj;jAzP1fZ*@55MtSzl-NzJ&XOPY-V0+lluHgYRsu!dnEzC+hNbZ06Y5MBKmkW$e7~ zb?m(VD7M{m3|sdc!Is_ov32(w=->T1x_7;T&K<8{yySd)N-a~C6MxPj(nQDJzl zu@t9TO9{>SI7Mha(NTr=74wlFJsJZY<@jWKBW|qryoMZ$l*v|zZQS*lSJxhGR&XYe|Gp0)l93}gd%_pKY~Xl^y*1+&l#2?@iTxeF1qG7jt4 zCnGs21#9CIbZEAKQ6W?)rc9nqfS#;GqekIDv8j+Zot;1`L3K*%gs>Q5Wil*i|Jn{Hm949be9N=@KkDn$mpFXl5 zCl2k$(L)63V@Cq*x(Qh6JQ?2w2d^w&CL%E z!lvXzyx!7){nh0-Qd)$QWw|(7oPrbOxp*Zn9cOyG@K^8N!2kZEpW(lM{zt%H{4V~_ zmmlCmLi3T*d|cboPw=S66KQdHCU*^uW 
zq@9v`1V#=YgOGqQg7a`JAt;@cX`rvZi4%mr(-$t{{KfMaIDZDGSiw6^U_Np5b?Q~# z>#I0*>^0u6K=T3Gv$QnJRKhYIJYn|uVO*ju%4}U;Szj@Qu$EH_p(0l3HHMnFYWhlx zXv2i%fzyW+n2nB5{7<_#@ffF@lkpEeCkK`lnCZh-=1q$?uKCA)kFUCX+)B1<8O4|? zO9Rc`R%QduMza0wtJm@L)6dW@SE8oAme3i4sK7u(1qC1}X+3&2Z$?o?9TJjrQC`)7 z>V_VyNleG0=@amLZw=miqzCWrZp8Zp!dukacX#5+#&U%5x#bsE<=6Q=2C zGt%`WkQIVpR>q|3&zPWKj2=A(fq|hIGbWsIVKigL9BMX3kD!n7K4J;dF9|fam*8Ym z9u79<;*GXi)GSzt4Uqx3xV0W%?QX?wg7XK2;}5sj;gcPN=3UiX*5a!z4ftGU4wM-H zogwE$vi?bbtxm`N5kXlb!^sICe@;ODoN)YkH=&r@6_9%f$~}bRO(OnB!2Z0i7GKCD zDLuvbXh#J;duAKHI`Jy@cC~A4%+Jlh&wu`hxN+eWuG4lEUZvI7C3j4>@mnO&Y(5rr zDtd0)lQP2EtkaFrQxxM}rYCrmtE^{!a0HW|9)ZG6CLxpnHL3%*8Y3n0AWG3un%&&YwMt0U5D} z5_q;c!}3pf$NI1=wOn@^m^?mDX+xN^U7^_&{t^g$B{%Lf0)!KUyfV&h<>`(f>y*vN zp=1HbBY)VUOzp@yMjqz>2ie-Y``RM!wdJ=MjgkAI4<-NeSb5JkwA}mN>3p93e+e|3 z)<-$sK-ZAxQbY9ya4XNpiQGq?p#Ts;@4i1YpQYglG)s$#8o7uDZh_ei&USrBXqJ26 zy;$It@tk#k0d3jtq~{FRe^qD}**voM{C^{xSN69r;QGmdW>Ziaf3bcu7@C>C%)`Ow z@j|nCc8-|GT4{3p&OSSlq)#fY)(2%)GWSP(?)wm#ilDJ&eN#XX(bi_$>L6*9rOE*d zxrU~}>o>+19eZ?1-#W;%kmuq+vj?0BAZJB9R^Kj3`guQ>w1ruq`MiF(B=)pq{sJu+ zf<^}+CT11#3rbN?T!EsZa_yfiANrq0r#-{RJ$AU#7SN2x+S zm9ilpu`83XWYKCYUa$)D=Eq?2)VaLI1fDk>!$-)t%F?22`g@KV8Hj*@K)vReF)~Bi z1k9bch#h+OYY3m|TZHDjdp2QaSOms~ zjKSpbQHU55sCg`-2#gy!9EHmk;JIpR@WRJ>G}isy?r;K`C?++UK8l$jHFZeC;faINIT z$8SJ-S`HSo;&Ar^4`ch?_al7l1m;%&!bS&ZdJ5As@b2}uaq-+GT)aqVz98K+P7|09 zvXbyR@9}k38jQ{yBQPJ^hk@eqzhn(o%!)zeh%u;7io>lZ zci_FdoAANTI=r{F3b(e`;H~>R@jzX!j?Y|O*N$hNJAl29y^aSS*+*#JPgvgP(OnN6 zpa{$l9>!e{AH_Y79Kl0R9LAq||fKiAJ z4M!NEdE`h|qL~*{r%%NwLbHtM5Go1Wb1sfF@h6l_ zsV%U4yN3`ga$1$GK9|;I0`nJrmH1M|Y9>^F(O-^tHy7ip7k1;bBQK+3UA%U$s;Vl+ zFaG`K`0&PMT%qq@Vx`&*&8EefFnfEXHzlLTl zaGjta@<6jbobQ%Tfh>izf|dm%*@x+HGG77ODbRma$pe!wu1$Y69&fMBbGxst4_ts* zhO z_T27*)a`=?n&tey_G$CONPaUYh}~OgrtO$u=zZoQg$d@Tejw%Y9rYO}ul=PJAkVXQ zb*thvhJt2krI%M_SI=L-^^2FVwYLw!1Wp0a$&;t*jD8iB^(ZZ`MP+3j%1f)Y5A&#z z!342|*!$p9c<_P8(ciZdTet2(U;j3=cWxqt)*>Z212P-nSVHS)0=V>x4hkCMC>SG0 z7^ogKG6;d(790|W;Lr$!hD9MHB#OWrN$?G0AsdWrR_uT6HJm3*%5X>m%%WpQ2+v0ao~1%5 
zd2D?5`sE9_f9H10jE=ySl=nmC%!!MqNpV#kfegSkaGTGx!@73Dad zUx0H(ML3<4jgy7hI8m90*Gtp!d|o2<6=mQ^P6iI=W#jdNY`j*IjpquI@l?SEJX)5C zzU(B-2^p<$639GTv@8y**QaCsx-`t2yA;KxwaDPLMhAytB;h7l#?BozTp_-_sS)qq zyon1kk_P=qDgeh0?qh}FptiIR96f-u)W9(=j~(RlkoI#PI6=52I1kY7&e8rPzUxd~ zv^Os}vv8$sIpq({UYnJr^kL?@^QOeWvWPz5fo8E|E9;x$r{e!sX1nZTeOIJi{un#8 zOce3AitF-Gp!yWktnt|a=Aoe34a^G7-1hEUZ{zN}b|E}C5c4L|Ze~qG&B`TciHkvf z+;UXL#~^*)Y^<3$5y`9oCeK)m8LaSaOOX>Hz*d-gieyjP(40QT%Xf<2EM!`*uialeD;-TE*> z!lz(VKm?X8U9Bz6GV;yp)rkb;bZ30^QY=}r8c~sxFn{h6tX{PS3+67y9Qu6B(pW5B zv;tG6&Orz(TLGaV1n?2sYCI=0994;N=*&w-eaaeaSTq}xg98vcVmKlK18HAD2neE` zMTH?CkQHYhlQ3xlUTvwuiMC>#sL#Ra`eN*FszKs}@yMGs79T#`OGvK6ZE5x0+Jw)y z)Z>dSH3VZqEy4KPKKU)MEZdAe=l0uu)e5}Yf-I2SRe{?Z3B-hEnMPZ-D>(ZAGXc5J z=u7zskkI}4rW&r7;bw0!e(>`Bxb@N#h?~HQy7)+I9scFtevX@OUcklUMmub!gK(pZbMUCb_}+EF~E@-k2tB=#p}q*OanV zm;8^?vfJ)U8EAIOQeMXm%pyk)OGtG)8vue zW(sy+sQfi#ujMB>uYRPaneLSvO1?{j?qvx1-j9)tuj+J0p`*PI&ph=CIy?Hf-iyx8esp#B z6P&lBo69YmcW`?T3+NSu*BOY6B-nT>+pNr;LVN4Sl~IIc_A5BXqm?wo~Kx^yLh zc@2R%35iK*NKVNlFlQq(vp`4v$jvR)@uUTq3%M-FD?w3VIm*jwP+3t&)uN)T8fhsR zSU7(PuRRXop-}{ig{ZD7!?VvkPIx|}A0Hn+bU??pmhTEAPfhoXQ+V_0B|QA-{e+LH zm@sk}`qMITcFPW&-?{^P%S*9l#uS7&t#so@jY9Fl#dxx;6mL{k;ZR`#juVbg7o_1- zVKUB?rQ-y_`6QwFSauqY=4Ilw{B*pOn?h(#!{gk3Uuh0XK~t)0ZN{nC<52&PQA z!a|>wW*Nrv9Am?P#)!i>MR-1Oc)#i-;rbLITeb-ddkS9K-DN$8k=Mp)qufur5c$MZ z`h>?%JfJM`(8~IypKm(bze_&8t8v#z;;@+_oU*Zva?jLeq1krHa6_|tfKob}4#N;!wV14wA=5p>*LARK>($K5e~z(M)`BZx`O* zT#1jiR;zAqCp6#Jgzd%awGvcb-Gs-U{66+R{u;q~A0BvkAMV@x2BG?Szx ze((U7hp5B2>wyE<`@|u%ckMyI=xA0rqp-$#>mVQa#m8;H`t_L#&H1?%C@HK(9@mq& zZRN7HShhG0E0(Wg<#Q#*O`L_W=&2YP9)aOw%$VluW=+MZJzMe7;}78Nd$!@?u3jA5 zwF$jNX_ynu$}sl{rCpC0H4LL8f-q_7M1*i##;h6G*HMS#t;IOmkcab)W%zz=8J32H zpl-z+eDYuqA-EP__SWH>z9xL#ThC=R0lE@j5srT#FukRkuxupjU-VZi0Dn#}{zBAU zfiDToUv?6Ny9mqz%?>yZhGrg5eI>_ls^)d&+N`wnmE*^+J%VF*++~A* zSz#n)kQm56I)nsH{blmR7fObP<~wW1G*Cl9vo9nuU6e_Ozr5%1 z2XfDA`ynLIds-%Ckk&DUW-eW@{C@+@BH!ob?K5oT`@F0aj2$?3!?Ig;Uj#wr%X&eLkOpnfGUXPzq6jW_f1z{GMQ@=k(@@N@0MuBQT%= 
z&|ic7*!uclF;8qiHmP9JhJ0G5opXwf8fdnXbZ|fF^7*sa+SiNGBMi9Cm^Mo%EU2w% zAuzX~uC4<$T+W)gkPsV=1@mL@&;w7Qf733swroO2$7Zy)_MoG)7p-kw=<4cIXkNN_ z1;&n@fSQ_GRZdP03+Z^wn=_Z78-)=B=aIulU~E(*CQTfViPZRU0>a}kHaZ%UCQZWR z$&(Ql7OG$!L~xD>4a3Cfafl9$L_~174reDzf!WA#f^^gbjE|m-sZ(ZYw}gcYmSF{< zM85Ku;jvPZGm)8*huoYZ+{$%2c+gD8iOaJ7#O$)!T z-wqbjNS~E{*4o#aK4Zs@wqPufEVA*<{+1D!hJxnn=gzC-gW5N5y@lqUHjD`!iP;mv zkv?ZCHm+Tc%}FuXoUt0!tLGzs$xI~An1H+`3z4^cITnV7AZ1(_uI_HfdpnzPb4vr> z+unq?w$|X@Rb6B~1_J zd-fjY@BMi2@dGHY>c+6)VXRzEbS5S;s*Wy%r4PNi6 zM_+CtR*W|j?2Kf^FpS46TfRuaGjq-?yxLxkgEB^QV;;^nmg0%REX*7+487}P@X?M| ze9}>hFE&=;+nxsepr?-DT!k;U5Sq8t;G4~L_!9&6SRCJneSZ30+Tfm_iy)hD|;Un>iofOC}0-(1G9ludk&j^e`pr@p5I^Y zbXe)D5RXh@q9a|{bLz2<5jlqBD{~$%EHl3yxvygj04oJ$AIY#%hX={R@tw|N|MS0B z-{0!|Ei2JzHU~EQ~MiOk~JTk1I z8<_o}S%2D`8E8Lo#w6||Yb8`>XO@Qu3OKUf}x;N{zy3$o*&YT6~_I}Bq+jYc5BdGW#p1mB~$dikP$<$n1Bf$;n}oi$LV+qPf4%b4C5`TLdU zpGALnC+ce}RehT}vH$g#6`FMxTiT%v&2@n>-fi0!n#ESN^~+CgpKt@SjfZB00VDNG z?&n)(4iyW|^4g3g^dmPoOF5&y!!k!9`6%O_6B}hIlCByO7j&3LRwk|xlCPdShij~0 zTy>!Nof~hWrL7TRtRyX*7=_#gGtrX}i(T34QMqakVkV46(z2OYw|Ev(md(S4r3)}S zA`A;gkH8BprMUI*4!m{mX54yU8zH(27w_rDmV#963b7$I7Y{!2JRW)cHEJIo5^#R> zbwcy2gyioNl3&AJ59}j2OE-;!x_%#FIyJMBb~OeQ#?R0e=8Uu)WTqE#Js-&%^0li{ zNl^_7Xe(=1reN{BI4oMQ2Ggc5)xOXn1m?(y$%qUYi)kUDXiQ1KiF-F3q7<6b=rgZ$)ZkznD~0tr zI9gYXd$W=-h5tRAoopr^sm;X~E#>&8qYmG8)#0nYYJzelJ`+GzXy$JM^A|F6AN3WX z`Kum+vqEzPz9uk#Lty^8gJ8||PrFRtW;ZlTtMr$IW-9~EwFK!J?c02d*ZJXVPqWgJ zgSZJ(Fn&}3jvRayzw-w_#wpsijFutsUT3(YuF$>&l>Nck4VV@(tv&mc4g9U6c3E5J zzAF9wU9x4D@eli3$|x^eIQGr5TKvge)B2KO3ERC>%X<2x_@H4DNi+RKW!BBK@y<5> zuluQNWmt~mwJ5_yaz1BWfAc>{i@%ODO)kFC^zj-pC37$gIEao>y`T0RewG7rQJ%vN@PbBqP6q2Njaj@eEzc)Eev z7U+Y3nX&`V%XNGKXR!SLv+1{KvcEmsd0&GikStK)1!e=yuKT+qG*e+ji7`cpf(_Tgm--p;`8&tqLgno!@gFPo7ZX%fB)-4+YF3 z2be{KWgRxi1-P^ITb1Pa}`q9+52~ABD!MTax+}P5Eu1#C9c5M>jERbNCf(IUW z2>0E0FCKaLAw2QeBY5_iC-KsY&*S;$p22_O7{1gwpX!@9UNNQhq#8FeFm^*XE} zuu3-z>8m_-%1p$q=JwclmidYDSqWx|OOm^niLdI`b_-Xlkh<~A8dFAzb2p{mG;ahOerUbbu{ 
zVkHf0)?wYcwMb7-LvnHwA$tnO1dhSD=y6!SbU9%p6HWE?*tLBd4!-^x&h9&atqqNc zoi-UQ8`j`(Z#$0G%4it{c)d6m&lKn2k%~e*SY3*mv;@qZJ^>+4f6-`y#_I87(Xuuk zkCvC<1s?Z&c`jZm%fjgFz& zJh9FlKV5SBgpXf3@z4OXj5HvxwZ&g~9glqDf;SEf0nL<@HR3n29FUmjf@XKafm~l> z7NJ=n`3j-=Dxvv`R4VxU*4uBPxxN`m zR#XUPhL6US(9wt*HA0_V&irY3rmY5#HdbR#X)d}`*Ptme7AcG7Y5(`wxDD8S_u~r9 zk3O*vdkN0>Kl&P>nZW$e%Y^0c6O><9Sl)5}8`|=`>)sc!oFF`GSRf|THc|=AnQ6HM z=K`dq7)js$g8T|(q!wc7{CHMQS8?CPh#ET^5fOyNs7aU@Iu7&4O~%%Oe4N}x1OZ{li9FEf3=b89ToUYpjrAj`$4nrE8Q_%(ELrGv})Juh#Mc= z)q#(me;Cb4iHI3H2@^(-#?=d_@y!oD)y4*yI$cJ|@j$b5&!CODfmw8!wkY;~iR%*6 z6(*f>!CiLxlrh%sMIO8NmMMe9V;t#kUdyLEa7q15IECw$BP=i*v5asU2d!TWE`z-C zoR`P=&I@5Qq;HEa@!W&O`;>38oOFqw3zu1Fw(|NokKuBd4t!1Nn69$UKTPLsb3qHu zMh2J-JbRnXz5jdVJFe`VWgm`dm6mk@itmDE^YHH^6L3t>k)=YN)6Y^Kj1Gw<(ClQm zWaq|eNK9VZ{$#$gT3 zSKRJ0HkED{ATrXS104;9X30w`^pKJIg9&EI7gR&swn(Cnna|Jh$M)^_p|!P7VY#kuBO1AEZ0<#UQxBSDdutD(qbKRe7>$iBc<7FtDKaS_0eG0F<@EqZ{5bNXCB2hlpTeku0;*+p`O)?VJreNZP=?ESZ$?YlH zbs{a51%7%C3iC@*Sze3ksyeD3HB?by84Larow~cU#B{Zgt{CzvTBZPBxguV>YAju{ z3bSX;*Z$8k_VjpyfOPGMiik!uB>+4^pnAkeovcBI(UXrB1)!rM$7Af+iP}e7TEHVC zqX-p|JVrilG!+k#ObR2*anTXj(cO)Edp2Ri!nr72u?YLSTClIW7;lv2;g!-HLUR@# zCip&@NdPV_MPFqZ*36%S31fl~9W+KkLngLZ8WDk@LVb z&b&F8GJFJPFnw96`N+&JM_OhXQqziAiHOnG_YuQG`3%PpzJ_B$;4mEARf^A^XvWv~ z*WlX+Tk*+bTXE>FO;|N^G6F}M^ylW~;nL;HxJ+<9di)q(eCZ|Z-n|PI6(v|Wf3~(B z>itr(lu$?IJPs907vO>HRP4>(fR^QRQN4T~niArv1Xfg5BY(|GWW}vU+VW+Hn>rEm!-6n@ zmDmW{khIFm$7dn`FK>SxUPqF(`JyIC7F#kiGc$uckax{h{zSOei0cNL0Gf1 zcfy$QGf-Z!78^G1$JU)E(X{m_)^9nC#?6ONub}xLYS$k|ErEGO<1sbeTxr!73?u+| zW20&4kg=*f1p0-lp5{RR<@_CirQvZ14UJ}l)fXd&O=aVF8v6Ac1NS~dFsT194EN}V zCF3WcDJu)7D~fPBKLzLW5^=jS3$IqoaFR^CSdoU#%1m4=%f!yKSk%TY#fq3nR7NdF zO+pL;r%l2T=iVw!{$qw?e^xv$H$1xYI_}(i4VTZH zVS|gdp!^o?ff9HYaF!1N&A@CLo}p}>cBa?OV&6ueto*AS6aOMM%kvdn8f;%H7h?aG zGNQzDjeMwg!kGPlN!lt}}m2U}6kiIyCFcXzMaJgiYnKn{sl`k^Co1G+HVjTWGe1XQO^DG+WCE%=)Ts z)`QL^6SM?(EwfQ5(MSZ6b?6i)&W9+_+;LHpYN_y{nW352>6tBY(zix+1wFd-Yi;#x z{xW{tTHsUPm#qM^4%Bk&`Os|aZ8qkE(MW8KN)wNp9En-#xWJI6XQ7$*;QjP_w-J`F 
zGA<%*0TsP>T8o2TNYxnT3l?Z@BQUqi_WT)&Tkf~>e{mv~VKdr&r6p)p*iAC#t>nAK z!N^>V+I)@AK(q8dnZ|`FXf{EzK4wq{=dOC2ue0D-fvWsA-P3Y!y@y-wWjNPL*4OEF zqeHT};bi(P>9asH)5`KD6_s%sAYEpsFH+(1dhUy?LEo}(=}GR&p7nBPlm-vx>-fBt z-7wAq&B|U`zI14oN>O^8cWm8Az||JL7&ve!q6nK25wVDjOdxzFs6*6{D!679nAAYAU#ilcr(S>N*1RdaU2D9-B9B zz}78Igyv0X-nj+MJGS8P{uUG!fZ9vpO2R|AGP#$&zZFVGpEl}J=jwx&BTOpQ!skuc*6V;0&`#03obqC(sM1n z*4<_P;~qWKe6^CNKHiIPv$G9b8rC3m{tWmJ8;C>cDL9=Jk0a5`uzPt3HirjcE5Ub5 zP%!EOLy@|82^NhRgIVLoVa~Ma7&~Gl2D%Z_8Lsrw4d~ttllnNpd)#n@O`n8_IWsYJ z&;SgjL-$?cqh7W91+jq<9K*&&5Imgw5tQxFy_YtNiMxy&4V{Hw9bAF0c9-CX&3X8C zb18ngxd!+5Y(Yg#1e|z|qoelG;KYd&F=x&kjA5Mm_U+3sEFWz;JJ@0Rgwa?Y?1$v& za7<^zz>e{g|0Z$&)QKZn-E!$1eUw&KEPl&U*BO`5*6R+XXUdL9%7pHJWZOZtvpn%y z9h$Acne~-&usomjNQXc(%edadtiOHLXU))imt`q+c>`y?Xp|NY0cIKNEz*HmhG^XC zyoMJE&9R9wY>4&7)S;e8nKKp5@yoC$ItY6c!*HQ84=-+7i_W!mxYW1`M;lgRcU>(u zS686EAPZB54_5itiwzh1UNY{#8|)mNDK`ux^yU@RpuTYzw(pXjW`X8|gysW;<^zOg zHfq)#!s^B&sA)K&X4S~dtAn%aaE9qbXdZ(g|1boo-sT9^v+V1$41r-{FOirqeilZJ zn8vzb90m>=tA;ZT>DwPs3+H23K@N@-WaCs`GR|a0qctZEua{@xtP6Xf7x43Mf~|bZ|N}Q-o#pQD7ydX3vm$nJs!y zl#GYPmfu}MGeP;?l4Jrip_$wFOVjYEJQWW&uf*B_$V+oO2A16DE`*tuLHBmXfEjp5l!&vSVu+RzIfnngOq z>d;KPG(xkfr`ZV1KO35D;8(l1tVksH)b90M8Lks)_Mty-i~cb*i!9;VI)3eyk|sT$ zG>QFc(&1P1>^XY5qnH;04%*O{4xZm;l7O?sk@uvnw#)N1L9;ZLuAGy`(mCX2XPA2l zXUv>~=$J$V(|Lx4MI$6+xte8S%&5tPKo`s>NS2n?AV0qX+1X{t$S6S~fjK3$2swE* z$jq)pQE43}O_`5gcFqV43L`9U!kRU8Shuc`z`T*#pcV~j-mw|GcW%akJ-e`?riuU? 
zMF3AmYDy*nI9I{?L^^Q0p4yO*rJ-@^W2F#+a{z&QAwfQv@e5rVji8Wlge{F^yk+j$ zU<3t+VCmAOgvSUh<8jf_un3cwwoo~)eF!MM(ILU%+J}MM50!oXzf{0Hi0APh%a_F> zhK-1r$V5a(Bp{Md8X6LzdhC3>gVd7ZvXrn4C6QS?WMP<|J+)rvWlMwb-usWSb892w zgT3K3xF5DehT~*%GWIN6id|6=SRWorEkk2q2y$l4!@Pk5F>}N)ESfhHGv>~~lvz_T zedbh596cJtJUlVb*##biav8oM(|XCG83Wu2Cle>Y$1e~*f#F!>vlMd|1~5)jVBg&V zj@`OrWUn6Zk)H1*^YC(29`06T;I-@sT#j9W?E%x(YHWTZhGMA9aME3dsPtr`z#g)i zouoyVi<2X~7cM}$efZxM9AWvT0o&}?PzR%OTdpI#0Gnl*q{fNTZLA|A7WW+kb67>@Bxj?`Ofu6h9m z`W}I0142fbF$a2=tn-^0~Y8*b|s#4&)y8PUzv_1~-qv zNJz@V>NQ)jd7FF;c$m<90P6_NYd7v!JQ4q{l&7&>?i zg8i2&Xb$vS27dzck|k^`hAxLUkB=NR1tUjH#ehNM&~M-fj2bltnQ_tByP_P&in4K~ zC<9kAV{m%8Kh8w>;kBG(JYl@=mgeK-ibA}yp&nUY3*g51JID%{v?J*$UovbEHsxny z&WIuEckq<4IG7%bwvEc^ct0fu?`Fl|ac%;kIfKxgP9WyJ zWC#Y~`CZv3EGux9t@J!o4++ip%aTlsn~0Kew_1W?3cB7Mj)Q0I@fdFIDztvMsX|8;PHxok_h-DF{|&Ll{QuQY(=AjXsjnFKs5&lDHRvBWhe5|2afcgLb(5zKdnoNLUm2gWXXnXuwXy(=i znngM^ON&mX{V|5wmAa0*wQR3Zd8z}iL3-S*q>R}B%_1JRx>mxYwDCf=={6abjUic! z|C=0%S?2%v>?k9+N_Z-4-cP@`#6y5t;8{zjR={lIA|GB#p6in7qjYH2)HO6Ka96EC z>+n>5n)dbl)}4zsJVWbg)|b-Mpqb~;R=a{`iv*lywXJ7|Xc#AuH_K$-mU|nanfYsb zzAnRcB9$KV^S0<&Xl7bj?nGUNZ~9Mj{93-49l7bZCQCWffmvQ7>*s3o&I%kD#Qc#7 z9}9CROd4}^!u5JICAH2c$m z1_y=1$14zhT>Hb_y+4*m#iO{S8tIuu$jT{0T6!^33C?+iEBU($c?BzxnOV+rhr)&M zn3kT6)wOk~U9$%3);FMOLj$&`-sTP1wS5y>c5TJZ?VC|jQjENO>A~hUuMnjrRfvmC zg=e231hoE`G+_>c0-_KU7!ALG2sK7`3}2=r0KPs!RG@-g`M58TX$%RD;IUx%$bU?$_hNr|E%L{bh5q~DdRD2)C9FSjO6Rc;Y0A+Yq!vT z^(^uU0<-$Lqjbq498b@{_OPYcurw46%fnGi$S+#B7)u5W#0*}uX!bPBm@^e)XH3AD z>EkhR+5}9UItfz=#uLVl$EcB`F?{$iczX7Mv$G={oqNNruLt@KVcz!}1lRt9(WmcV zHOIAGx1N~XcL3@c{;}?N=w5fd7Vw5P+!@zp0a{wsWbUo8;@y_vFsSik8OETK8XKYK{LyuH8hKZ<+&o3 zCF`)HN6S*iTUlNdFbhDNqye*5zsbH_BjcJ?FS7}nU%b|Vw{O3IyyPSda<<2`elEye zG84Nqmg7iqEY8$tAjNA8hW6}+B%g^mxGD$xRuy4WbtzUA=V47%2^(wyD$je!RBZ%i z87ia3*E@PZmWP)&IV)?opmD<|qR!t))T|2Dz*jtsLm=A*BpJzRU)A$nODzW(wPbaoJg=>uPC zzk!=KIyGGOZ(qyHVqN~vy}yy#rMj2+-F()DQFgd-AzxpO1vug4AAu3cw8QY*6aN2m}Vv3 zOa}}NG6YlvSd7rywe<6T&bgZ8p;|KK`C@w1!=OP>mT}A`F~jvKel+o16G$t^Z!!$2 
z+|9jsiU|CaF;TomGrx9E=cok6AVA=cuoDQ2j|0d75MYy);n$k@|;Z4HfP1>4#P@yU# z1*USXQ4Cw%TeD&9z5?vDW&UplbdzL7C)rx!YO-bCLu^b{V$63*zd3F4j7-l=JvUW= zwXLzdsq$wyPh_+~ql`9cbuPoSRXgvcR#7r7J*us!)J~hGm}Z&rL-n-Lo~3?}awTzY zKX(pS&z|P~IW*Q+VuVaXMkgj8kHto3z&CI?0W=CBLCXk;%P?c&RP^fB3!_GjCm0JL zS0gE{5Xl(?W|=i3uM*j!oC>U3xn9lj+M}BTfp#3qs#l<*x(2J)tVUh^YJzheHg1$2 z=MC7rX&rX%pyO|-M^0`&3JQynn^#157H}@X{CVDL<_ph0L)D6a0e+G2@m>mFg6{NL z3o&xcbPOCa0sRM$$Iy{8Fm57Yd6q9`&+%8&WX+uJg_+Y9VHP!)*DsjsgT)K{)HrDw zZldi6!e?;^e+R=$7XKhzPn|N84T%02KW+;Ayh8}iAqbFVhJ#}e9vF)-zbGsVh(Tlk z;W?1U0-_PY<53}rh+US7<)MjKGS?qN`wv%Rhh?ZjbMqEFd3q1)>sDjVh!I#u=xs?& zN7J%UtPKxB^)i3tcrAcWzkZnL;f5)bM>8H1Flpj=Oqn!^jfRPg&sa>DI2Pk3jKN51 z$ha{WIAS7stqdOdW$OnZz2rK;&Hfk)ExVmGy zogK#X?15qScIexyJ6vStKv~s|&}@cehu)Ypemrd;1a*}a=sbH0AHVYm|JN6P!GHVw z&-nMx|AfE(<#+hmSyQT}c%<_MQf?$`A8+g~FTt+L609%HLEyq^7~tuQ=~IRy+hf2%hd+dHZ)^{^fVKen>HUr zW79sYS+@tP*X<=Z@59Ra{ptfjzu-7@>!D3AIB4K-`1uCI*EbA){^1PEA9E@1pygOV zfF3+}BB6N-&!e6C`QgZhI-FToiPp7+IGGfJvwjP3BXkk22hB!%;A|XQGzr`1O~pE< zu{JUWYqJaBJADR*x;h|m-UO^o@WF}frP!7jj+&G>^l|E~fI7!_KF$;*<3h@EoJ&}a z?QxNqN;_G_@ZVisf&1z4c#<23CxqE2#YuR#M4Qp#31L`12z*a%;|b0@Pr)-mxPs7J zk%D^!>O0(jn|h1K-zrZeNXO%Tsiub|+5#tPk&_}qcOk8!2nXY|ty-{ebl|DukGpHXqp z_@ToD%_7?m{EhzZc`}|O)5xjsbyOK7dsxXf4U9V(LU>A#?=upgtBYk)z*#S+dbw5K zC(!R*Z|l&YS!9IdHj&LSiQu!)YyxNA#NxcAcaA{7O6H#dVGTTFUs~q>NR|imlVS#_ z6{Bbr&j7Q?#-Oedny;~fGD5Qk&XTc~aOGi0Wvca9SWD}*(C)2E9z-kit(Ez8m6ESB z)oT%fV@lsji%WVQ^wyZZf}VL5J(H2k`MNqNTiAUbG>ddd(hJ2;LbC~$&5}Z3&Rfkm zl@v5Ht%f|()uoY28qL6LlBw`mWPxVux(hM(TUZM6uANjg01@fGQ?DmJDXOE&3~{5C%`o~y1=uZaZc zC4O4DyD3|qV+3d8E8*u!g=yM1!m>z!xy=C0vUYzP_uFMzQs$kk+uzRoRC6>-g>~T^ z^NKC?GiOc1`Hdmriz<9W)k|l52B-X z#)1WbC@x-uwDdBhW|kpc#xQ4>A-8x1YSwJPn#Rr8*}M-+yrh?`I|2g3QCqhjW#v_< zsHnoK)hp4^DC3*!(X?S5p?QOvMPt>%!IG z!`~+YO9;42mIPwrl-Y#lsf6Vz7&3esH52^?O~&w%voT|)AE9$GX3g-%yxD$OKrJN1 zE}kDiNES&?asWkOULsH&2Jc0o@F5&8TI8cv5tMPtlO|3lEQc`+Zw21oOn<1~a)kI& zK9K|p0p?gl5TL^e&k@1#1nUHZ_(v1S1J&4PM<-{MzsYIQc>kjZxZHZ0*Dk_%XGiSH z%0^R66sp4l5I=7^<_#Z=VGa%$H)Jp-PaRM2pMY^=$6>hp>s=1*@ 
zjv9hNg9pH~Zy)p@*auUlO+;`g<4@?CFmW0|dJuZ5@yopw2zT#A2=3hzo&@2(4o+~l zchFX9>)BHcqY$v~=kA6X1oZfb<=C*MUfIagJ8$5#-+YK~fBzBw^1%cA>FHbe-NQHV z^ZlP6%&>={@xYdWS3_rNGuJB)UA!ytS4@KdJz>%nJY z&xVH`#b@Kz8?z=(#^_;Vn7=(Re!@%?l&ogC+lIzX`>|2RG;e4@!}?}4aJ#y(nb6#V z+Q$7@xA_PH!{X7?&Xw2NsgGWLyaUynB$>W$78{du7O+eN$WWuD7&2fI1`e2vL4(F2 zD?S0OyPD9}P=X_g{y4X6AzqCPz^!l}Tnt%=qd^O>KQa)j1H2J6Z8|1;_J=#m{-l1M zC|c%+mZC(QE{Vq8*d=I+3PJ$WF81b5dn*oHgmcBoIG-7XQ%O;1hz!AW`)=4C;e*F3 zN^mceAe&DhE=$7GvP8UVT5v>G>yt%H-j#991m?$;ad=F4)}T2B56T6ala%gCZ!?eI zB{bhFizP6N;_y&Z5~mh9k)CFmj!x!-7GRcP8Sj;*64q1khi&U{|40kQ^mj!c(+Zmn zD{Jt>ci-Te3>&=p67>phcD{%k*KgxS=XG4yp}B*;iO}5HM&C}`6aOUzz_i?HfDYehQj@mh>={ui`#^uzJ(e z1vp!vndhnw$mg?&r5tP5YKs_Y(pUV{-yB_UZ?o=cHmWOV)?4h)LbDN=+m)`HAVODH zplp;8B7TmH59z0LjhXUup;>yGjnHfcIz9kBleI7@319N%XGro-Rg%oBHcAb?w|a15 zM5dlbk?bpI)+OMq^~P#4#E0kG!tQf>n&ny1=yj-4;<>tP!?6NpF?vJ3TBni5X8g|} zV;aq3*t(%AiQztT9*tJ1>oD~2#TInic@~;^oUqzeKFER)Hwd??M_EZg*&@x3O;Ae5 zrwx6eeXwG(hGzNC0?n4`?s(h)%NA(10JBKhGS5{uVX`x0{)q%?wP$6zPNma&+_gZn z4$k_s)nWMO$_UIN0cKlhHcO82Kb4<+{<3=8bvEd((>_&&&F~UqW6+b%*tw4<7JAWv z`i5i7sOfNU>4)z2PUxn_uy%*Nog17S2f@K<1V)ctL?DhuP)G`**VUu3VJ$W_ZN$#)J5X98 zpj@h8IXAbAU|x-YfN;3F4nQBb!5A@oA{G%sy_bZkkJ{!FdSzMYG2>@p@bJkPFnA&c z4V{X?!=_>ExOtd0%NH{RnCJLo(Y!!SgaYYR7HPm7Kwy^P7@?{s*@plMLG`|2%d6@+ zvlpnjrDb}$IkOibnDLl5+Y8gD%qR5tBG4xsf!^T=_KqYx#~{o<7E1}tOM{}YG&lyK zGR2#J1m@23!szjn(PzK_bhGbHs2z#d-|ocyr>~)=rV#z?dmuN!A3L+NkUD=ZW)B^N z!EUY?;L%sX@%T~0FlzWv3>h>Kg9eBOU>Ns@^zVnk9&S9>1p^%&;pyav{sa1=Uw`=^ zu{Z2{I1);McS>ToQJz--uT`uHn~@ z9^&&~eTX0a{CoWP=ilMa?>xkt9hY$az%JBRF45LR3!pNb0F?JXmI3qkUZ`436 z8rl#3!}=p=q#L5g_DAZN;Yb@h97$8hV8Mt1aOo~@DSE(}wlZ#5Uo7=rf(0{Xt8n`c z7>cy)QdF(ph_xGcW8KCUtmn3IJ)cwKcGRxhj@9dSVO2wm8nzL#G!8v_IiUwZd^j70 z3Yz`G;NugD8Eh2JnePK{e?NG$E*LscfO!T64xE6>>>RwhcPq|kN1`J+5U(VJ;N|c@ zoLuIQqv`Qz$w)z-pDz~BW{0}CVjN+4`TW_~n3aV6xe3@A7l^IVUTBK-L(=Ri=p(~T zXe<3$CaRWs;e1IVuH?nyd{!J*M+9KDn;pZA#N(<0Jj@{|7s?8M1Z2Xme4wX6Ga;Ii z>E&eRjE98ghlJ*b1Y_xG*1$Ol_svS6;&I;$&9Q{$7(604OHZ?c<|0Bf0bhEv-zPA? 
zSC&dpPsf-0cjBcT8(G$Ruxz?0Xx_VH8@~PeO9jm@+=)Y+z!eg4f(|)bEhW_DBYd&~DtIxt(H*(l{ZO#4P`O3p{H>Hbka}Epg%27IuFUn)$5Ez^sdL z>I$6ID;XVLRbesZJI|NasPx{b9vn$4kL$)}lr-dc6OKCHqbvyCzgJ$xC~p_dVhc#LV(-hLSEoA+iR(|eZ9 zQe&blwywgJvnuss}?;_(652B%rR|zwsOr+NrfxtnYMEOcBVa9VjU*R)95d$S@Su z)M3l+V_4g;6ZQ35P`+Xf!s6qwATSV9XU)gx(K9e;$P74pPKUkw6gap~gy+y%7(QVU z#!sWOT|%czAa(N`LeQ4k8642ljIG0;O^Wp{4!z{oY1nhKP8y35z3mY*cNS*K%7UKm z=W)Qb(ZDEduYAI z&iw5{_2Iq@yBN*$N7>n7SkGP<#%CgnXG|iTEel5l{`sMHNkL+%BfkF^4oFc zoKkzMNOQRQlynN3<=QTwS>U#n|C>8G=udQbR@YMMzA7Bk^(>#(;LLJmDqluPBUM2= zr5f7p{LS*-+0n_yz$KhGv>#`W>_<~=rJ8wU+NiwxO|U2Uf4$hSd$b(6r?M!ouTNK6}B@ z(FHRI-ZHjdRt5B-?a!IxgIROE;N|P1hHZ={&u}FmW{n-b7rAFB^Jl3@^Ppj30qDBi6+S<3e!)+Kb}RT9k^Ka6ioRbin!CSUj%G!8--Xc)uhWk7X6Wa?`99 zCTM=be0y3>Sgs@xSH|N3q4_}tp}8^%k0=Gu)FT4(BLedyp8KFInu;PYM=L!lAvl-B zDsX(UleqmMI%qd~JAw?6OZ%5q;RBiv^h(aMUH zS*`q1SoiG*brtBRzj&>#oF1GeNcLZPwvg8JBfttMIKJ zl(qB)Ni^w0JS5&m1LA`q7^~KoJXC_Q8Z2)HW}{fCT7k1XBgTR8(v3rJA$AQ-{GVIt zjnu(Pm$uptuQNci$^(IBN(RfD^OI87nIKyvQ{2dLTVPg>f$6rD-lJ+O{T{Y@HZ59w z^nD|Q8lhK>|5fKmIGQc1d{E~KFzXUyHQ-W5%C#o5ExgEnL#NSGd~Q2Cojf;F|*+4 zIUIH_1JKLP4R&@euv5KR4r+O3=iYtSB9Fi1hO&k}*=Kn2Zy1l&Nsa4JIm9ynkqx=U}| z(9!Vs307fD7&{HKrY}UG7tiIr0=NzEAw2uVASfUPVZqTW$b zGIYWLy?ga$o;sjMFMBpf?9ol+;DBztt{cHjdX3c-bktZn^m$HpSY+P~p{@={95x6U zlP99o+XvgT^Kf~`4!nNjGJg5wP5kBkhxqKB`}p&nH}HfG{>9@*aA17{iqev>VBQS$ z>+eo5@2?9lkhSi~Dtn zcqQ8lZzfE`dzoIiw0sf9@Y?w^XW$BLSAFo!x>0>xq}x30T8+P#-YIT1ST;a2rI&B< zbiOyGmp>UE>6a3LND9TvdCPHsMJj%= zu>|io72-jCCSflb?`B2fy<~sfj$MS^(MvFaU@48E{;p0~?z;d*$x%p&j)04c8^gE5 z{Kft#tyzoJjoS##yHMY-85;@94ePdG#p-ofUAG0b_1m#w^BzRRB&r^2PtU&aTH>pE zngyDDeZnwThG)$2#=<25n9p)HYRq&vx%I{5p@VU;vJk(hDa2iwCXeYk#61Fp293#UQvkFiVU30jlqtDU~Eo_MEa6hnB1QYXr@CNbv`jE7|jY7DdNzh9a{SeB0g z6V)sl_X)~(3C{N_6A90W>Lb91rjG;V|EC1zCuMOI;aI`4K(hwU8Z>LeG#(O~AE}Q3 z`5fc}K;GwJc?P~Xv>S)Y@>oXs96Vj&>DXI+NAuNZpD2Iy;;ol)^ZEArRX6dlgKB!|@2Hm(sUqi_%u;v+-7><-B zqrWoy8D67@t(65SKRUeYTRG2i&OamlzMAi3IO2nqFV_9C4$tD3HI7mDGxJhc;aZ^C 
z!kAiEmN&{4NnUr;9kMPx1B?M{6%H?w%AiYVw(e!NhGu~fBPbh>8=#rtNu(?h?@FeG z=@04WNa8P*=g*L_S6ZN1J|0xVG<4%KzIw||T3>`^9h$Fk ztB?6n(9HjJU~c2S=vipyHQegPDAEgvZp=oh*Zcx>jE9VO)g{MdZc{b4wg59FuWB@q zl3r$0e)G3FU+yIih0=SQ1xBBRu;)OtF&_ob-9a6*Mz_Iy5UWZ`$d&+X>9=tgvPO zS_hwfJD)u(5m}k)1;*_*?|dMpReTqPVCWxdkOi z&dEo1Nd;D|k@=JfQ%Blx_}mNFb>=4O4qQOlru|5-YD7d@5tal+!L8p&hUrA$9E}mf zr=VvKH+1hN0Nn@fu7lO=7tT(e{C^mN1DC34-olr~Au&E1X(>fWrLwatk(E=4qTvG=KEuq=JbCh5Q%`dcmMjUz)ai>cX8df7n>ZKKXL@1bq7W=v9713YRAZkP&iBKj z`94@U&sz<{5OCHcz03m5gaxl))x+#hXbAETC-g>P{+uOncI*TDUe1^{Wfp?`!WfP( zrcR!%!U^&Xhu@MAf-~XSC!D|>O+~8v%$vOk3+FDz65e<59AC_tvQ& zpWsgq-@$uty@>~}ynvhM&)`tg2Gr(cA)P~gv~SkKYPwI_wm@=wE9+)mhO#X_Mn0PE8k*Y( z&8=wVe=-JH2WBOK5B}CW9#rRpDsytZlyg&k!m_H3m1H?JNrpP9k8UaXh){r8W=D{> zN}bo(P~~xf=A(O>acKJnMEK5Ez3~~5!Pu0QjDxB1xSSS`7c%2*uL`s))Am{a!QewR-hI@Ss}wW)^9^}Y!a{OiT(rn!`C-}z-;Pi zrajM7(9H6(Fc`C0c1BN}f)T^|qbST9FPG-v^(X=}q5X1R3O44ZV$!gI>H|mb@k7y4 zoQ5~IufoHPrFgwM1=ovWaV#Se8xxixX~9ekanOci^k5^em%NFh|V_wb;lbLhw74GHgP|8xxvUZ*w9Z6N>LqcVxB< zGc?N(jRZBuS!UUIrz{@t5}4ni9@{{(8lq7Wi-*N=ctB{DCAQ^*!>7E5%&u|2EFGU8 zZo&4r!#B~DZaW?N{M@QuK1%uA!XdM5f@nfmN5 zIy9TdHOp`e))gJ?9cp}YXU7FXGwTB8+e_Cvag#QE>G(;ksj7mzi!1CLoH2$VSiiXi zTbuWyd3OsouG@&JiW-zutUzvQ4Kj;rkeyeJ{DM`eUcCt$_Z-FUv+X$8`Vx+JyoO^} zZsXX6YiK@r1_OsqM7Qql2nH|A} znT=v;B4VSnk(yM3?5qm4I7)VI6($mfot+1wA7OmZpfRc!OaU{sXrUh_Oq`9egyu<; z=VR_%0`j6@%$e;?VD_VYuwbsYT2XMxLg{UmRR#SC%K>Uwh6c?7%hJQVOnvAV5*UGH zq0tyOdNS;KIKtC?03q3r|A(otW=&t9;97d51H8i!;H7D4Kopkne(ug5YFamYdndRu zo$l^326+@l3>k-!gV~T6G!o-Rj=|J%<1ugQBrF&|8nXs^Vw#&XMmyTWnGn&N(CpH) z2PX9Ihavnu&7~K72fHD1L_egB8-lV)qtGyY680>bi&H_1(HiNCS2C8;;m6>km6`bC z#wvVvcq_hXKZC!%{W89Ld>_BNa|d^>-@;3+*U)zABKB`>L49Qn62c=fbJBDS@E8dD zp1l>!$dq&fww~Py2iy*p-awDuh@U$F+f##avo;AIHRs{m(-rvkOewxRQ-Y6Al;D%o zHTczmGPJK;j>Cl^*iYbT%}&6pX_0s>IRKp{Q3#$ai)s)eQ`1zPBr~@NG^=T?X!EK^ zn0BrYrOizi({j)JwF=< zvr}De-=AJB5{ov;ps7<}-6{5CuE0_D>$*x-bcJ~T=)R2z%R>k37WgOZGz&n$p5HW2M`qQ3e z46viC6S~{WID1*1-VP1~?!}`9D`>t@m`G@jM_XYga^}v!A{Pf-&nM(oq~WPREWuZX 
zTs*1JX0-UAG>yknwU7SzpM2!^HYMBpgk=F{H3c1E`DrPExipT7B{;{KpxG!H+iZg7 zLXm(o!JE2A-7U+&r-bHJ%R}KxXzu6f!UnJ>K78^JfBfAqX-^mN!nGT?-qA@=zJ_b8 zf91na@dL_ESU0QrnPp2KrFxku@iRIX=+ckb`UA7g$~m&tA)1o8mK8wrTGbgOK2-6T zp{&pc7`Z{00#`%Wx{T0l%T^*=|7SdA_T%Oqly>|LJQ_e9wQ{^DCg=RC@h|Ex8$_$g9$t1%uqpafC z<^LL7T4a5Vm6*XYg_|ujtAX^C{(x1_16 z>7cY{V4Ww{WkOo&`hON+R&O16ZI_K4lO#S;E|pU?^){P_W|)9k#x+aWIy9?>i!nb< zvOuyX>kpAMnR}Qian|ZpGVd+A#>!egin!5n9=AF!5|~+uGYv=g9>o4#yAc`~K)CA# zXIBqQnz0Zy^;>cB>;=62=4-sJ4SQSmV@3T2}el4cVnvWj6=-{R2)XovT z9o-1f&gj|GNlha+cJvH{hovApqYByC6(}j;^$9r`Ja{}@3C(iO@L>~G@3D_}2)w;D zXr4}pojP?sCQqJ+*|WSfn4a&4g#_4zb2VsQFyBipo4t6UK(nvvX*NQ$pTMuLfb&wu zA%?$$F|glI^z7z<(Idw5zfkoNp!77$5DkH58Ojkr*p_R9e3oGu!GHR=>2T>Sughf- z3X{}+PxS8H8%~Z67&gcafeVHsC3pH^6>HgbbNfd0)KzJAOG$B z3%Iv;6}E;h!1l-mXo{JKb>U0Uo|Ayv@qxIN;)~r`fpE3&277{2W5arugX_3*`T{Pq zF1^h9QKTT3b&8suNa}9A3`-ruv^@jOJZ=lk?F42Gny)fE*3tY=q}OYD8B^mad5`Bo zvq&|9&5qwxwgrURO$%3aT*a+xZECsf8y)nUJbvNSQ5@LWggqNpA$aaY4C-x<>@Z*K z%um6Iq9k0(kHV|6@W$#4JX)DX5KJX7XAl50ahK5iur?oWRTI>TLU1Q90M~MsAz{*J z^=4t3XMdDLE=5yCA(9Etj@`TSxyiyg_82mJA_7BWv7DeBLnXv!A~m@HX(@%s$}C55 z=^CtAvk{TYWlVEVczAfi-!BM$KJvCDM13H;fY9u{BnSZkVVF5{25p0&?%{~l-V1Rs zAQY{M>8KA6#VF5SNQv`B+rCD8aega4-=l5IACldOOj_ zp%*;d?BML~00*bu80gj)&b{o_+mpp32BSGW3a4^oa6Ts%SMt)4GHEOV`Z(ehSI1)f1Y>Px4B1aqAN8HK&yugmOTtaG4<((|#23(R^tCd63YvA_quH}2-CpIbvNjH1v)St`Bl1DF+RAVG z1d#;`Gd>!E^=9#!12TUgV!88nj5_WNg_9D9ZqR zIi}^g+(*Ee6|s^A$+j}bnftZ{Lu5>sMRI>j1+MA&c`tD?>UsZZ$85sobCuyJUCQ(f zjQ6&RC-YzOOe=UaWMg6Z38{D)09pHRLALT++t+)Y^#XsbO)C)ULqC(`wU2s*s;+5g zTx6;#HeeVZqq9}dmGSNB8Q4A#O#;p$0bG^=!lo?Vq8d&Tj!L_h2YS9}X=3_J6@-*2 z{lh<_bSPJ4m)Gb}tim+N2+aDFSGumyK3jRMd`Kl<@#{-@tNW`mO;{E&-)^>_#Y@-O z7+^U+wEFQuP6xuS=i17^(eMs33`Y~HdPH?QBsy*m$a`^DFB_+lH% zHnkvg&30t3+>ZQ}+fcA-6S4@>nU!miSFr)*E4QF=<6i7&K88_aX3}{(VzFNo@~bx@ zwY&*Q63Md8qoXGr9r|KG-_e*qHxS85d8n$a zM_@oCTnNm4J%_7>O(u++p`5$F%qdH7R$CvL8Y@KgD$ge%3)C*0>qY19g@rOKV}XwX zW|=;2F}JqREFb?#PqWs0yj1lv&zZRpu1+3s=;eYLQ|A(nwGRYkScVMc@aJziU*t<@ 
z4)G6%4`E~6@NtZ1fAr|t3*CBkSL2NB?YqN{TY-ZKo(@<^c)YSM9DKu@l(ei2vGEgMVvTfp2%O#wX1U`1{rK z_}k4J`1XYt@bS&-czpgCUf90_S9WZ|f%+BLR9T3soMa>~4~Ea8S(rF>1O|Ebft$Sp z!MG<}_zdJjLFrMJp&sK1vjIbUBX7|8z#*DE6MPE#KK#Cv{zpblT`uD~}t3-RNQ zLj16^0Ds((g-@^6J{3)8i&k!Tg`<|d$*_yXciy9 zG9u+z>N5dmGc>c@wq9(dFOXq$(m2)qidM!{?-j#TlJd**^?K1#KS>>=m1$M3O+X}z znCQ?fbMXo^i=?4?;nXo4A~f%AtU`5m9QwJ~!Dq$<>?p~^>GE{67RKVu>J;3oNy7aq z`FM>0U6XD7}k+S?sdXDmQ)SS+GrQVz`8`ckD16|m7NXJuxv&^6TP@tIk_JEDccgoW7ZgDyu z5T0eFKUu&;iTn3U3C-MpM0kFpl)x|&2+s0xUlO5NRsdvpgk%}NERrD^_lvpZ@w-KF zxI<{ZE3iyxezPDRZ}48fZP|w8IaA?AX!dY-#LV$y@wacj#CuO3pzZt_+6wF0HW|{& zI$k7oy-8C4sybO`FWR2=_SfQvZ2gkawx-cQkWaqd!yFcQy#v#t0) zZ=zlyNQ;OMR(tg=l6|Q>B=OdXu$2Z@&zEhccP+g_vz7IEDoj3DGc+3{=@&54dYDbX zp&q&sj5UZ7gOD(6r9-n0&YCf6c`N(!U@gNI^f*cywXM0ATS;I|GrH$Nv-O~SeP8Zv zslY9=z^g8KK6<#iG~;`ox(fH_$mUs?@=@g_^GzADUQnJRV?4A*m@a`{qqO^p-N_hw z0cMf@de$u7Q+=RA=O$wq+b?$zAn7D&*E%rkU(3tf%JK~K=P0kT)G!Iz=hj+Mwk*J` zpjpPVo30lb^TM1i#zn6nWNv2Tj2T8~R?mo%Ys5yyMr^};9x!Xre1*^~<24Bb3{Tms zBd-MV!cHF^ZWt{cIw7ZQ?lv8KKmRn_bH z-!`;fy@eOvdWhW@+K{!e8Oe>ikygJK8LJ4;t9BtnAi8clvT8OXx4H?H{2fKdEWOOG zu0xTSQi8maMii~siL!=6C|bQ6m9^VZ&GS-oDluf#6rSsV?jo5MttYQ@=?}YJZm_p= zNB_Pf;6s=dNVexOnQE?&>p(R$A~Z09PCWvF1m+-rS@t(nt>h;VEwfvBJLZdKl^d$7QbAhM(U^SdW?jvDqvutQ%pr!X& zU8{z6gk#Q(g%~|@9Km3a2F(I9vQUT&ZE$x&UHBqA+*ye~A6CUG(Krc!e`qG@z>K^@L$(Y;6JV&!{@s<;*a&!_;pnw9+juzwW<`fRwUzcLnXd` z=MjGV(qz>_+272A8*-$J?pB_R8zo)p94k@9*FIw1vpcdi}r#Jidc0hjhC4Y?cqJ2WoaoJ zoNROwnnV0#cv2+1yce^~Il_&$Gu6Wb)d2x$ULJ`nB{|p>=Zk9_i}3x8VMp&!eE-<(M(V*DEuqcN+$C*}-r#@_N& z+}u`;S9jJh-K%lAA`@XFJPkpA+ujm61%79_XIpO$kiFo#Cbd}^FV98uJi|TzDYVX>oWc)GD5R11AFgNMJ;uR)-D^N^GwG38@8Rm@leV zgVI*#Q?dn5&7h>f z*-Fm`1?4JTB8g*H;r<-iK1)-9qO@LSOMJSPDUZ~%lX%E)J^!>`W{c#Qc8w||EM(6B zGo^ts<7ETQ`t!4bX0am!^r|solx;<&$r_r?$Y{S7Sk`)(uP}@&O!F1SQ+k}`-r8#}mT5khTRcxzWP9bB)Db5M%-gVI z<7VvK+>G7L`%qg~2hV|?3hdlH`k`_~BdTk*pmt3&s#k8onsv=M+3^zUj-Elnh8>7q zzX$Pa_ak}DL8R91M^ar2)l85kL=%Qf*6zTtG180ZfT>fLptx`iRf80RCQq2n`6Mma1dE-ZCwnA0a_LY?5)Brr{Zj 
z{Rqqg&N?*9`AZS8G==~is)lAb5}HR09SeW&V0B+F!n5|lAn(QNq~}?NaR^X*FAm1U zi8C>B)OZXWG!*s@GB#L)(`6G!;zn%&KH6N4k5(n(y@Fso$&AFq>^R&li6sQY5DcR6 zMpZ0cDNn==Lcpc;cwC5&Mq6wQI%5)XJs}OPX~{U1lYlMhkytCUFx0I;aaJb$7A?lq zv7<3+fI9{_^@68^J>2Yi!J&H(HE*&#CF7)r^z4S29*zhfGY}O^#$sp8T(lJ|!<&rL z<8|ry)1Fd%e{waxIa-A;50>JK17-M(`eREWe%D-xKkhBV@A&(xwnqHdy9a^4T)@{) zx8rVGA+}ae!+}OG{POkn`2T#-2K@a+{LjzZaiMA{s%P}U(Kv7Hj15t9$2u4(N z0>L>+8>$hXi;T1q=@L9hhkrH6fR{)<91adUaW}2!|F`@vM3Yp=EvfGSpr%Km*xI*G1f)iX0U-~ z*Imu3G1S8yGe(R+UUV#WGW{nivv8p@A8QEL&1^tiUSEc{4>jQJgKO~Q_y)AE$w$P* zen^=y6mM3i5q6Uy^E2NqPhi6$6?ZC9@u(yNPXwAv(iAjHFY~<;Ha-M~3C)jWS!{x` z0%rcUNT8Y56Pz_Ao8kFEF~OYhe5W8DZ%a=z!CCZLUJPC@&BVQRYv40<0Q$Px!^z1G zap7V3-~aXRc zq4*m4t+Vw!0sK5}wvqMP$Ui1^^0xwO1JiZIc;+}ziWBKKjPNW&V2#kM$Hn5)nI0QR zwv~CAdio>Fz2&}I8MZ*Pp5|K}ER_1VWBy-8Z&)>1PE7&6nmv?u{Ujgb@(;6v3jPU|_5Y{VE zk%DrQbWqkIU7fGO(*V!1)$XlH3{H!OJR?2cvQ-901JNsJU6uxeFOd1d$XX& z$s%)JNSsVkW1m&na;+GsCb_rP`>Ulv!k5`NY{x$wUPSS{xQ2tsU$+O*X+`L1=K&Y@VF(UOL23DVY~6MQyZ4;J&fO=mZu4Q3RBuCS z@p@!bY(xRUJTrea7Ay>ghsO{)dYMK|R;=rdp51#hUIadS2e`TQgNNr}I;g&wJaG<^ z6S5H>orSeid>mq*~?nmBy8 zJ_#Q-r7&JO_|3j*{7c(T{Kw4}e157Pzu#SkKkO{R@3-aQPdf_n=iSBlcwZTQwYLbL zx7Oi*KG}!={fqthU%zOvNZK&gXFi&wJi2;F2{rgwX&&R zr|>`J^Gx-pe84SJglf}Qnd)JdCrzw(^sNE{OC4$U5z0?UIW^S<@;#4o`m7(!(A-8p z+)ilLdYW&ThG*O&Jm2n+p6zqEa`q_BwX8*9#9TIJx?|MP5!g^(h4a( zuPg)iOEU<`X_};mxhz=?)3_%+(xvfuP#T8^WpTVN9*+t18bIq1En}J)CL#L~A^Q>or+8iqB7dOm3?4%`-CzW6>WPeOa6MGD?F-jnM=KrDI8ao3NPp0{JblDa>}0 zcBh8kQR+93t9{uLnr|q0W>_Xrwv`25Ez;t~{|r*TisesZN`vT=E!Q$kdFx^&{rfm> z&G(gawKP~7y_zKL+CIyzumt&5lU@$RH!}>Cr&=BfSLoj^DS+m+ie*It&6M^bV7o*@ zizFY;vo5?g<*kZk##!sFm1ySNjQ3ZM3!e`xwjF%jA5D`gNlb{ zPpU`QlvWFn>au}n%3|aeAk^cf3|oaIb|MmIb=IUV?1%TWy51zMaPWUh|C0V$XlA^W z-SK=0$0}StHyeA?^UHD{y?`1%Dpb!@@?P@NlAm44Hr_^PmVNbYfL@kFCKyo9MDjr# zIh}0>C0lX4(&$oS^OdCXG=9*eU(a|+Uh@3wmsxRNI*Z#^&f@x+qd2^EBU-j@#s1v~ zu&?Pa+V#ySU%3nAwfj-BdLIhw_9KUIoV(>XQZ^ky zeA6MsP|*bE=*Ins;eU~9d93j;@*8(!@+>cg)f+>HPe)vG8EWcwVMoge?Ad<`TX!GE 
z+D-dVT(cD^CF_t}Cg9wJg7OUr2}!`vq2uA;?4dpq?A5~=y?eP~1|f9#Sm_Pxj$Zbz za3aL^=`#qkXD>ncvREXnrfdtxs#e`)7Z7{(%G-x>;e-t8vW3W6V7SX}6 zhz*WIj4W^My%fPR#A2b=%PfmIDEWpVAYd7SsEK2zz=ikpaPh?K8FLBB0q|QK0N+L2 z3N$YXg)hUA>->EM)`PHs>7P1rF~*LbhGB#wXIFPMot8{BmN{n<-q~D$j_9lg3e@ZBh(K%nJ%*6G^8r18%+T;ohqU2H5q)P-;2>W%nD0~J}gytW80ZY;uE>of77J`wM& ziNP<|MB}&h(fD{{0zTfBiBAr%!rwY~s>|o6QTLjUB&pkxddPA zDZ{7xO7WM&W%%}d4Zgorjqk5i;y+($#Q%7_8~CgZ|NEO4@jpMgj{p6a7xAx;&*0Z* z)?s77WK_-=g#EF;xLlnLui=9r%V`%C7vQBEFW|b&JVY31m2xfRhxMMU5O`Tc+twv- zSmbSy)_K9L)Byx%HJgT%ZI%J84mH(b3`h4f`rAHDEKig)*!68!(5%nVuhj+I((&6u zbEg5CW%dl2YVHR0fv>$-yB4bx=3``^ZWuOb5E?3~aehTH+6ofzG6C+5 z$|T&Wio;z(^Sx?9Uqv$RuCK*MtsVH_#x1Vs0>Imo7Byl1n4I1j^y^hY-~oF+`3j%AVYh>6KSd|VDvlkmf7HSS;DqR3ko8JaPB)_gcPxxtOlJj25UHNn2vm6(9T8JUFUEOh2% z;0@|tRwnLb=HR8mY-CIur53rdbF@P*2M0K^(L0g=U*NkCH>wNqT3NbE^TSoCc)vCk zAJk^!gSEwYt2zfSuFS>V-7E1*OEq3Sum-1AW+9E?t(ZHJc|%BMzT7WQ!Nbxt+%J(a z%&E9bh<;mom`h|>hBh4I9^v;M?{SZ?{Gc>OEnxDnEQa7rc;<06T!Y)k3aXPSZ8|v} znq_E)=stB%!E+oQ@>$)>6=+Vyv9e5zb#lN6cUQRf=!SLmEAZd{+rOdpEN$@&A(r;4 zjWad@9_>K&xSIG%F_wU#0hq70u@OhfZ>`6bqQ8?mT{uktK1Xa={FV7*Lc*#h4W7;W zgk=TJ0<*NmXT;;$S{pXfS?*HI8^xSDQ&{*! 
z{HNrp#;}@Cmp96K{sK+E!UiXiBnnlLzwPON5 zBI}251Vp_@SA4CNTwp`w!Wq@me3QC*@-R;B-Gw9j_u}Y*BRGEO6b>Ia zi8CiJA~Q2r0dsFxR}@vQ!iu#!6*O1W9zYo-(41Aj580cJA!W-k#BDr+*o}u3G)D#?G4H|jU+MdPLx)HOAux_$@p%bSo=tV8p9 z6jpA4w|4~G+y=wF?@)O3m#N4c;p#XTr_bHS#rD^clv#!W!^WXoFGv2@6Fqv`!_#9h zo%&)#geDM*6A>g!UM~tluy+`i1w#7OKH8@a|kJS<3h@Unci~IG%7<)SmBrMC=g-^@ZHH;e0{18zatd?dQTz#xW5FS>?y*REk*cxcL7z1uX+5decW%!#SeS( z@z;HY_>lnpS< zRgr=5EQr%G~1Sc zRR<_&78%N|seI}BNoc05>k`T|_)rqrdB|lJ=g`C zQo?YrsR9oxQ}CpO;98Y{cUC6hy_zKaVpSGCsxQHtY~bA>06*Nl2Cwc~fmip|u?!?2 zVZuzNbDA|`|4@v}f6A9G>Uzr($&}^Pcj>i-n zt8varssJ8A;l8(7gXUy2G%Ijcz0?G60dWzJJt*RHk>2L2Ol(bBh9OMb2oE=Su;F_A z=sx`L-B-AH`Yf(rq&?|qWGypeale%fz^n51gm$e@;GjV>e-k*b z3M})wwhPjsa-KFO6G2?TZJFkf=SeTKG~TRriH*9hV3xLPjMqOY-G>?BR6(&RY;A1< z*~*BB4z;!PG;0zt*8TG{B2Vb|u=dB=F$)Js*=J)z?>rOnxC%q!M5!^&n)B8QxcRk{ zjm#B*R0a&hC^fc|TLlodku|AHXf}bC=IHfETgx1Vs=P!RH0vN{jJqX{#@A$&?f)kI zpFp!wlG(Q4Vw5`1kQQT}8IS9dirWmRhRWKQR*@cmOFW)qEatdqMrnJ!Z8)})Yg7x2 z`67~I+BHmrDQ^{2$yVhvkDJ3Y#$AiQf@V50NvqcQVt8H20L#yZW@FxIhOW!}+$qaB z&rhe4snDo}51zTsl83f=Ea#X(*(hFTK96Zq*QzvF(xKbDimTyybS3>72}AWXs5t3i z>fvcpz0JHv17}H(4$ZPXub|nwe6!+I5^xrXmc|gbre0}D&oyqZ5ts#>+s~Z9RqEWy zV>o^61kM~ki&MwW;?nu6NQh5X&^&V7c+{@nii%ZRP`!y z=ChTR(*m%BaJ`t&ym(0zX3q}Cq^ZFeJ;e(XC(Xfxkz+B4uq)6k!x5&r_rU9o`FL|h z2CfvP{0)2Dp2osj?CmYD~~v7>8F0dv6o&-WGT!FyAdt z!5f6;cNoS;uf2-9EgSF(f%CP>B2wAgUg@zoL~z~`8-ntn#R#7< z0b}~PsRo;iqYE2^VTg;)M0`xP8k&)un2kvjr?DLMz`%in+2~sWSrJp_C=X?0QoaN5 zboD@gdnZhEbwRf8A~ffuVPAGOj%F6%LRtZ?C1>GQVmfXlrQsT(d3#(U#yi-n`Md+h z4#u{WrA$W=KH5@(U#(8XFN>Gs{ep13Qyq_Y3C$l6ntw%T{>|DF+^^0?XCdKqS1n#| zuElG68qg5shv0rLSTTPx9`HO_wB&&-kxc-;Te6&>OqIzt3ik-c_e*5zIGG7UX2g)8 z7X);*!XHJb)?hi=Bn_M&3NV{~%UEX_1N~57TE<2b(C?P&*SE$cJ;Pge5dj!2Ak1DyOD+${iHspejASqG`E_- zPyD6o$fRw{HDdPynOD!B!KD+&aryK~ywusDfLSD9sDtwD_EshRx9Vv&z_PBcU|96e zoNIw*>w9R>Y%Q&ao9CG%*O`6pPs!vrHQy=H{IDhyLl~i1<)3z~+(Ys}L9^N6s=}?w z+OcYtfMoVn271-BwFN;HxJ2HRc>QKZXch?40ZHzufJDH^bgm`i44I%3DDY(q&6=#^ zO^6doFSAr?hX0JR1!e`!=8DZ)+Wjs2B7H23yfV?H|7SU$=`~3MMNP)zN&;EjYRh2D 
zF(Yv5X|;}%NZm`du(Z6_#`fs>Cx%3spi`Hge*Kwt6|P9m)d5Y1W^G0e&A23Q#U2!x z$!}A*+Vxtz_1+bUzYh1Bamtqei4CZ&CM`WC>HDVR#&{_JH`$!6EY3`mcD(C!N@L9Y z>;w))Jm0*pe#<@eg00=#Y?qX(jCh_d^%_m3&)|^Nb2i*t{cnh~Des;qb9^Q3Dz1jG zY-DvU<08umUzHwa0cOz!I(K>YA_B4+Pc6N1O7fferp7&MQY&I!ID?LhX9>-OXI8Ww z=g;EG`O~;~MsyyR&R)XhbC(H<^7VW-EcOe++D*+UtKE+B`u(U_a|ETeM^I3A6ltpu zAf@3jk~SVg{Klh*B|JwHlB3oiMpWZrgs(Y(#Kwat+qf6Q$I=ORcS1yT7D0InvWwRv zr=*dpM@C5<(hKX5mS2b1+?7b=v8<{l6qYw)`g9*SIS%CYGmw^Bji|T+Oq%SA-fknX zJh1@Hhc09Do^#lBqz&tKo<HXc*Q&w%$l zA1w7-hA8P}BRI!K#3P1^<9<9LTA(>Gg5WIUqWzbtvCskjp_n*f8hXpeh>mU;I(#f< z&-Eo3YrV??6S4?|k52%+d;_uAD;V?U2Vw4{I}Q7bmhYeBq!^RkVx+|G5 zoPy8yX5#PXs)1K`;(v5*#Fqz)`Ckq`-<69`cM_&|W#Wt7SyVPYYvw+o_^W+6_(sJ2 z@A%&j%^6DHH>cx=-5CVsH2igEI=W!X|cOMPPjW?N`h-}*?oHP6DfeQ%A@!Hkg;IxzpKIe* zK$#A?oiL!@(DHXX0p^-%Xojr1d6Q|Bp5~52P1su=fv|aF(ci@t8!IYtX+@a^%?0sz zh2Z>VQ8eBZcrGENl@X8|Yw(2edjHlfys@hhuhgXDwVE{SjPu0^nd_MOm_2VQE)kl~ zXT;z_dJN8_N8?CZ6t*XYqcp?^(*}B|@$`M1T+}p)39;FTB{(M~@__w zXz&m$THvKVij_ru?C}8cnALV#5R1^RDh4`lJRj(EI!N+!-o}dc)TJR@2yV5 zM>WX`nt#Q{-;))&xSGR;!{!>izN-$ewyZc!&1(cUD zpOzAAm+^Qc9+XGnVM#O|5t<(tCz=$GCnd7#pDbb`Q_iJwo313|m^E0|fcXi5S){<3 zzZEp|IovImSvgX1uOuCJS$^-WE=1wt*%-ibI?&k>Bl~;eH@|!rzxwC{w4SFwk)CFO zVm6|5wz7z}!2ddEAF`5U8+}w8e_!Q(>(z@2oMoti8Jfj6@jCI@x7ipwbNC?kY~F;U zd-kX@qlN(~aJE3QQQG)s0bz-gwYr*CSc6`HTe;?0fPMy=>0dQqwm$cH&@4lkehxIN zZ|_)MJDCsKFbtk6Od(}m%CQp5vAnGo$qJzY%-k10u6NQ&Sd#v3R!-I|YhYI5F@fbP ztQ1-qkjOZjfVl#oEa0X|Zf2<@1e!&flUJF=h}d4EU#Fy?N#M%}%|=;Z!5C*%p;@KN zqJI{ewK!Pztyjj=r2nU%Ym~Vs)s${q$-T|-P?feZeWrUG&sX>3R^^e&Abv9cwCg18 zk~h+;sPc%v_5X~LYxH!Pp;<{YG>fsBWCeznAt9Fdo8rQFY9MNzU;jvY*y^__JUt#p zSsa-~&x7Vp+LN|2AJfTm^!X`u2{fw~KJTd)Y^{-J$y;@u6hzZCYG3YWgl3WP-p1on zSPgdBm8`D)2QpqKWmYwG*??5QEIr8jbEhoh`gsomhT11ETS2ox0JkzGx|7GRGpvpa zrlB4e&f@BM0r87Ce)uRRO_1Rf-LNz!0jt(Gqjb$Klr<25*B(dFnq$bTJBGA1hmf-F z7?L-gKpeq2cGGdha2vJZ2$nYG#jY5KSF0YGAs>J9Cv(e4Y z0}FhjQNQ^(8h2d4hL)??cIXD0kKe-j<}*kuS_LmU`H8a^W59@s=h`oX!J0 z3DE;()x8;5#Q%c`y%8bH5iiSiFN;@-4U0j1WCC(C@(~>ohp^xXj2k;yExO|3>WML< 
zCSdU_ANVZ{hTmct%S>?g35E}W+0Q>1^XJUN{HZhGGkq5Prq0A7!p5S>(=coNcns_} z5WSq;(cMl~KI?|rBL^aI@+b@@2*}iLYPG$d-7t#KJado}LM9DFuJ06VO!C3`m2~ol zSK{NVoABN39r%}*_TXPL95l$ z5Sp*+tF8((i&%fCAxx^Q((YN7wfb4Fvn;@D4b7&y$W(qU^^9I#WS_@X8MJ|B{a&W~ znSG{$X4%)ESvo&D_*;i&0TzK~`B3n-fWnbYI9MHx?1+Wv=hPcb<>k1tssin~@pv&W z4zCcN-zbbBI49svF=4Hg(7dJ`k58SzQ>O3L&9#K)47|1?13TgZFpSVFGod8SoQQLU zX}FXfi;L;Y39`{ROMpHo;G7YMWiuuy8*_DZ#^S~PYM$nps7y8gxj^&Kp`&P*Ju!Ij zPzBAh+GS8cxPs;(1BS!3mm|g#xPu1|z@gLxT*yqr#msD6$S%Oy%zT{B%E7s;ES$~G z#p!}#>`jZs+Z(FzpJ(^uzgpJghq^p`Q=5jb>QnLCiYUBS5{ai(iE60D)0L_Ci17T& zRq6O}RW{xuIA4`F5*w@VdUFF_+0%q9HVURP{`G$IOwb&UN7eCo$gu7UFjs_8OL1R8 zGl97@22TjhPm2=ql+gUNB!P;1i| znS4~3LZwkMrCkP}O9tK~G{;XJry4CXL(SYt6Y%+8{(wgh?yz2^?OthDjiokeNQtdj ziGEgqnG#rTBd96~G}E?ZT)!}y*qi?LQ)h!0J3DZ!WjE?8%CTq57MwkD6fbtPD`=LX z8qWf=rW*=mWn8hf^abm5X~#9V)y5t_3(X=6Fk58&zbjx?k`HC6p9RfKzj<7@o)=Qi zxv%p}8v$8Ydn^5$Y*hnTlLpU{cZ6n(bJPpJwe!Bz`f;tI z(12NoM;(f6uTj^T60HG&R90O>v$1kgpgrrFAS>++)s^3S_Zksp8co=28m-)EFU!J!-cRB}i zZ?h>ZO&TzF4b2jtuJhD*@1HGGTz*>CY1Hp0Q@_dD`q~%C766uI!S!qP{OeNu&+0m! zuS+!!_`f+0a&MK7a!dzm+1lovwT$upXJvUlGG3VtB~#f@;ZpM2T7zbuuR}8>K>ci; zAm0gSk}2p^+@(S}e^%LG%g*f>(7!*r^{_)qPBAJQcA}(VKZ*&$MQcx>aP3KC)g47j z{UIc+JBlPDG&d2Nc}##gy5TTV8V;bMX*WiUoq_=aMk1$ZH8M-q6O<_eb4KABq)9Jx zK|SI#sZ>K}aG@m#yB8l9jtqRKAViT!?;y#u4oMVDw~P zRMa0pQ%f7_wqHQ~&WqS~tP?HgZlUG;P3$?}ffZYJBO$*6qo*x^lgBW)Q=U{`_hA^| zF%(1lj=+>L)8ReW7a`tZhz^NGL|~*EqES^`jg^(F5f_z&VM9j4$+-_4sj;Ib!fVzN z1S||9G|M6;;qdhhhrfRq=FOUo$wLO9F*_L>Q)Nfd6Q# z!+*A}!hc+RU$52Sf4sZ}f4jH}AMZ@Z7dun&&CXNMJ%D}g~GVvY% z`<~GJ{ci44vi*U_e&qk(>>*?~C-a&Ve6cx+u$+O98}snHP1V>J>4WIOE=Zp|3{{J! 
zs@WlAwt?4cORy~?79IrV0sUODbIVq|aQ&tltK2R_8ffP)oWF=$Y#`ngXr`UZFpW;> zZKj>e7-y}1(#n^)UZ!u-hJ#u6GV9Q+*T>Ruwyp;a<%IXve37N@)5@}(FJ<3UFDS7t zk~-Q9&Ag6qVeDz{kk{ipCc`siVWSs22nvTc;Xq{+DpNzy*OAa%T856=YINkp+&ORnudl1Yt2NBW)j8N68-ziGHTjq_V(NID zEKJ9x+yuhxa$L%Y!bL*!32qM+BqKXup<0y0$-x0Wej$j9&qiEqj+(DIEAPD;p~D`tJ33b?+9W%$|fvjDJJWLOfcNhkNDH)0}`u4C_Hf z1YucvnZtQJl3*+2mSeQuWr5~GS-?cVnb2HDcrH&SIHz(y1CI#K4++AL1)@u3>bV30 zeIni$fabm|gz|{c{6IcBtVpHO88%g!gO^tpB5=e&^y}3Vu04BTsh>Cg<%civ@{6~W z{oK0RNhoZiUC<8`4xeEuf~*>nL0Fc#^)+2l(<{pG46Ub`_N2}yFq*=U;TUHR9YkqH z2DeABk~Z|_3oj}#)`n?q&z-+lr_p3h( zo=woKgR{;%nzZ!l68~iO&GdFP&xL0FxKRSl zttQF78JJ7~T1VU>+XtN||LfEQ(iX1y}DmHxj`MrhU`*-}YA7n*rL^{T-D%vzp_vFP!%+@q^}5E=^$uhrxSV=Z~$UHU9Uk^mwb^ zvNiqwDQR{n*IUYfZJAK7ne}5V2O3aV&QoREECMsP+PvbFN+&CnOJ{Mj{W3PKTMI`z zaA)^^$SJQSC^sX&@euOYpFsY)Q^;vJiS&kJNG2F3uBSF1L)@mL8aOu{N9?-eh+caH zvGs?M(RcuPE1TfvIT%x>%tcP&Dx?>zB_yv!dj1+@6x0%$>yci#me5><(BvY7rI#SN zXchc|W6`r`9}F5a0eR(XQN3<2N>=Sb>B_w*s@RMr0dcT%mX8BH;67jiGD_BB)4q#X zzvm)W?>vW9y9m>e0rYC2T*_hJ)=dVfBuKh)gfSjKv`sID88F4IGPpeMiER@Z8sZ zCF>&+^OdCHN3uk*Fl@L+4VlAR0V$s)QARK$U66D?0)N%7?Er7qw%j~@j z-oDG>;~#|p|8NZP?1#C-2IJa>8obh2Ooy6*mud@fx;P0{K?^X|p%-Smx*&D#98@h0 zL}PLYcIAcRXi*f-5t6Q!#N$R)B3@gQihJwmyqgm78-d@WIrx`T`S{!EJp9MGLj2do zQv6$M34Z7(!jGM0_*-WM{_BN0{BWTZe}` z3t#Tb#bf~>=>ZaaPY_6ZAJZa@i)ypQ|q(QSt z>Mm`oL9^7u{NGf@^!moSEKB*4vTCXOR9Q1#BU`B-q~4SIT8Y28)q9%db-V`61Q^yq z($g$}@dBaw=JBm)DT_c&dI$zMJ78^oAv)I7;Cfy%p*ayR<`J6nWAQqn`7Ki~=iQnN zyxq70uWo3-8+FBab4427sLerh;!+G@xs`G9KI29bS~GAVKM5DIqHrZ68m(C|IG0Iq zE=)yrR1o^Hq2|=v0p5PWh)>Q#Tw)#)lX8)ioP*&b#-c}0I}95>0^Upf)bOO3sANo= zI1~N)3_wv<9wHXaN5X_rxK)&mdj(l|J2xA*3vzLxI2XrCvapBH-jtev>hK8U`_DjY zc|1OAsK6IRw2=xns8}X`NpOCkrqq+68S!{p9gBCXM({VZ9@m@`D=}R2f0``!U@N)z^-DBPS@1^EppNr=lMz`%voWuArzH24>txF~SwXY4G&||~KVzJJD!xhv zBe%%<8a;31xyc-u>iMbXXu6+>*Qqg%%-42S0{;v&i&Qw=YWI}9(DUE$4D|A5&bO|m z$6NOG*UI{_uB6+NX3xfZYd~xaTR$f1xFBUnFGq48o~xH}K5K#G>xAd){4FhZSvjin z@>#rm{VG;gma3r{gGWw4aqTAL)HWk$-4SGMJcaCy=a9YO4AKeE$?K0Hq3I|RHXlX& 
zW|=+X2;w*J_c}r}fjMUNe&lXEim0q|bnEGakdR1Z<*h(+_9`Uh)FL%!HPZ8_f;EKZ zdc+Z$m!_5=G_4r%xs{kc#~0nZyJ7YmKUCLmLrGmT3Re@F*BnCes(r|<*oLX|m!W&_ zfsC&s99;UrFFXN_JC9=Xp{rQ4>mt^)T*dk$*HC}p0_vzuC)==U7> z5gk<-c&)Jjch~3OZ9>Dn(s10VjKG`KOL2E)7#^(&!v{@~`2CJpe0CriUmZ@vcPBFO zx3hWp*9(RC*GqZ$+m&qmYg+;S{aP9Ry`utO9nZ#R`%>}6zDz=KI-xiXpYPAWH~TX1 zx8_Xz3-v<_;g_)dH39gWUD*_YnQ;8o&J1m6MzbgrU-H;DJ2Ub94uUby`;Kt^T}v*$ zp=AGuUDQqj^mc~1jrZ8d`)$g_{klw4FPVUl!7hjyIT!`w$KY5*DDKx3;;rHoyit{j z3)R_}F~Czzy_b@dgj=1r&`P`2Ruh*61-kNOOUz3J=YjbccCk`WBTog3VA2ARIb5n3$pgEhsoEeRdtXN#mjK!J4 z6f~tpVW_hm?0faV{6*e~Ps~L^QUMZE@{yF3trp>t*)v9t9Ic=^JS+;~p)r^>Z7w3Y zUz%48@0l~3e1SQ~{ND*ua&hOS|W=5NhuchGp#Wqawnw^fnWs1%PF0xhDi} z4VoDjN~W26q<-@pfo6vJKJ)#l8vo3=60{#MJ`XCA3F!oN{{L`wK8~vz6lbnX;O%~0LF>uv_Ga4of*iWR-CT^XsZ5$nbct|6fD1@i`l1 zIY)o~TAY3=zDlOJS?hVvPzPR-sU@lk7t_FFGQLqCCnnEOdyORaD&qBo{H~x`dSTUl z)%`4aVJ*ogt;`wYW>i;cHCB}7k0(u1zj;5I1Yh2MXwOS~?$-D1N&?C(OG+$Dyk3p@ zG=Z}Q&F7fDb9Cb8Y4aLD-?)4RFL$;gGbKSm^OQMDP}bOtth&9(XgYzk&1aFd`64nm zokJ=CTE;RbNG~&iIerttxry7xBS=_#7zuTU5nFoz`J0bpPCyi#UHc+6H3#Y0l}N~} zMpD))q-3u|dd>5T<_T`}9G7X~?K zmGKZN{$RFvR&yA#B;BDkU8^8|rW^ zHyyWfk_pX8ct!7NCJa8PjK{kxmgC8qB;0Gr$CHLsJZ|9eb=kN`IGog1Tg7w6pg!1_ z8HWq`$+$q6zDl6$Brspih{gH5WbDdL#CQ)^_3`DjsWTB5n~B84d?cq8sxi%@$4o@` z9=$MT%s2%21tTI%KA4q{W#f<(lZ^C)bj+JR3p3nZc@J4kgpH*947^d0g_|WgxLT2i z!-d&c6Q78p@K_XkEy9td{`g}_AwDn2#GlL3@!Qpzc&|cwa+#*`IJ{FGLq+1S38TM(3$WS_A8koo4qZ%AUo3sh@;qqb&DO(5%C<8JbNX zZIab}O|tk!Dc2gSn#-+{elIOey3}<%rgw~4ScPek;ThJ2UX!Xo_|X25^z*e<04>0* zpjl;@34l!M3YxV9N`@LT)-2QYl5mwc$z&5WD{0`YL$gH&BYFl7G}!{Ps4HkTW~MPy z{}D86X%r)UR>oHn`goq}SNQNvQbQuF#4t^9{3&SGq=9nR(v0!vq*0_V`7E-8&;QJJ zVt{6WD|46@r(`io#@}Fonh{!TM1SVSW9D;tUx|;|!Tm&>^|)Ef`g}btM%dMWUWew3 z>ebd&K0^(f+j*?rTxqFRt?Kzwsbk>K zWynxLh%M-LVYUy09m(tgmS>|uu|+ILm)dZZ(0o+@g!k5tOIoBynUE~7OlZEz?Jeey z!1K*ksbJ3{$ak@-^cM$&qio$C0*CpaNO>y|+)>wSH zI~v~}iO1hhWl;tA;Y6+)zVVlX8TjIGF8+F|1pj%V3V%Dvu=f+1_oq+-&*}JLe>%QA zAagMjlJ}(H>*jQVafX8CFA2!nT+P`Enq`dhw*t-5lg#aRJEUhi2j3E!zi-aRj|Aqw zZppos>4UhuYzWlPZKoDYi|J-0cQbb 
znTGB*>qY5pzCp;j(RK+hpKrmQ%1DIH9SPT-Jy94FhnwrxqBBo}<`)ULuNK7N^!;1dV8;1SW)j}6$j80HOxz(Tzfzow8)XGJS5}O5$r&hE zmWXVhMc5X$1dod{@XOL<{Bm^~-X-|IQyzm4OA_z_^+aaCsENmuYJxO@^aFzF`!$5; z+BCdOFnzLP4cJ4#^p(s{EpRzepJQc-imnK zA>6-R8Hf8-gyyERls{G}?Vk|&n(t^B8cAIjn*8DHj=g_lfz*Tf-aXjUJR@qp?X=Dt4JyVbN?N>W+y zrmDgaNkt)nF*8{B<$BekFm2UP0TF*2$KfbvFNtBq^^DP`7@^usl1jvMOu>nQJ;?xF zG!`W-EHylXnWN3WsXKfVT$nOVWt@t?RBnv7sj^f?VBU%WYSQkbU8@_V{MIs2GdgL- zsn`1=-b;as{I5$MoIo^dTkkoMkGP8{!<)Z{AzCB{|QOCS58oWXpVdX3KOvEBRlG zgKa$2_+!a$b*+f!aI3Y}MU215^xU-P$9OYdYAm@FCZ4a#5W~`@oZ~%JJS872Hl?i~ zX~`F}ok#_uerxv@>3O90w2SnLK$qEOczsvEEYhAQwp@AFB_oHU-5eh-AiW};}*QRHknjl!*`QP6l0 zc^eKPtLZ3Gn`9+G>1{rSq>Tp&(FYJO3!BLN%&Ybze$_z~Za9I=s?F%-J{&%Qk;u)j zL`p^}QnIU%oK?kbB~o*!Tmk1*NXe~5R7M5T%4?C3TZ}&ahrzASU@VPD#)@^jQB>cI z?D{=w$VdL}^T=<$fOJB0#_BytF5QfIzDaQHI~Cn~_D4^9Pi}i-`rHMms@sO9Jr~i? zavlwbTCwKf6|6me4Qo$yqV~vD!t-U+9l3mwDu^gZ8PQa%vS{Q%XlZsCX#orzw&>qXj zzn(6|-wEK~9L&Pk2MEa2=LF`@_od-0p7-@0!g5OnA($c@%kYfPcThXE>E^y*m|r!g z;u{|Oen%dmxd1;Bl)v4PjjsvOUvJ?#>$C7N!RC*v5(yYdIG-Dj_(`J?GI9_?2fJZs zv>)E8%g5anS$J5Hfp@Ah@Md``&J$i{_jQ0Cb+MsNQDsBBPOD!eECFRHTT<2- zmy3h}1x}al>(VTBOa|;@n~0y!-c$994$$K&$yv#Sj@=bV-XXT zN@&hRa!L-;)AKNmjoh9+9WZ6`bVMwT#&R_+Vh1eLp0I#zF@T4LRzi43Nsy3O>9FGsm zWC(9`?S=++DS;+NvV z%6vj{x`N`nCAkD-k$^Je!(;OM4&!yVI2(7Ehj(S%a|K0UmZh{G%5V#U^8@M0r|xrK zrm(wH5ySIh@Pu)Zp5})e*3HSFZV;NQ~pGgwlpnVs4&BS2G9&qXHpc>0( zPaMUce*Y_6pI=mbF1}oYR^sf(*> zmb!||PutToVm@m&EYC;FOJnb}{FcU(THc*`C)XHlN0%O+ZXcTTJh6nW7f|(Ef7W74 z+!}37m(d1wiEXI2JCX;c=V6J9t*u*2OS9H6lC7jsi<9)GpU3r!r+Dl%Zro_a@xyyD ztgj~=db%JXe>DnsUO>){bJ%>m1DRQ+NXjWk*_K1dX*z+74X2R4;TX~h&B+_~A$G%l zgx6E6xW95ADmEN}Pe>A+J;xw6xd>?lt?FbOB8VE}(JeF@(ouz}c-odiQpQi;D{a0|M1pPkVbuxDllLxb?%RkrUwM6@&o4 zNceeEUXcj!UQUP(#S)(Y%%498i^dN@Rb()lVk5CTW*Lq~``}@1EIw&T#-DaX;`8Pt zd`V#bYI`xkHHYAufiDQJUvdAd?YV^1Tzs`H3tw(e$7kD8@#oEP_|v8s{9$t}{=6d@ zpX^M*XN2c3_Y#gN8J6+m;e7mXr~qH@&*kxKrO)@KtKQ{rThj4evo1{2z-b3!J42MczaET8Uy-XWjfxi$W&uSUtX1smElV;gm5v+(-qq`6BG#+R|(D6 
zTCcN=w5wOuS6H_xSmsuMSw37>c6>>C&8+JYUA9i!O6Gb*lcm>6uCtt{zeN&YzD8gc z2{>zSn6$S{mitI~f3_aHK_KdsSqTIPSP$M}eRETOx8Fot+jZP-zl>MTHRE_g8p0P$ zL~rIrxYr_F-LeC1rDY15U&&3tn|Xw|d;%RC84s!mm@A_3D?;O5^ z)_DA4e+A0J=BugXhS=L7@W-w$Ek`GES)({0rsFlBM=>x zh~&g{q$Fn|Gb0aPOZ?#MbY#4O=J?1IBu1nmg$?RCbLXlVI97**;?AlX zyjYxz*GdZUdQlNN%d>EyA{B>9(onuE3`I*8pviA8|BJh;}Jf%gf`hs$#@YmhUhI`zN~mZ$rK=C{i- z@ph>O$ZrWW3m_L~5tK7=w}{&!f-|>omB-<&sua* znki{jJ`2raSeAQg`C)yXF0*s7iKo`HY6(xzo99Ba!7im2-!_f^??W@sXIunAH9OK_ z*&@kf?PDR!byhY+$uZqFo)K-tScXLEUIu73+SoG{k>Nj)>Y*2V*VAAXAET`sWr(lU zbCr22wRB3a^9|aLK=ey5-Nc55HE?&ZN8dh!kiTLRN_U?}@$NG?bMsB)Wfx+`JRg*9 z*o*w8m{mq# zu0V2b6}Qz$$z6$*Jn3bwLuz&-vWgl|UB3m31D2wPgFD8Kn}fpQMyw!=7uD}W?uJ9i z+i?=vEfQoaXqnd>lh_A-L*aJab-hO_ek*xC2Pv^fD- zxnU2spT4GgmTQlu-u%3uXp9*+nsql zp2scscV`o1v+(KmWc+1oB0ix$-JYsxM=CzsmCmp-@ue(CvOkZ?Rg05+-japS_hhLL z13#y}Bs723Oi3@ZNUkA7e<|adcW}Es4WIEIKkUxKmm9M1VQCB=6owN>mJ&=taj!T8 z5Av7dDM9RBMhspj3?EDmMd%O@Ebi-yym^!H5@F&&Wis9)K)qLyj>nYr_TA#~&~by* z>UF+zXX4D6bGY1k4XuR9j%soqz_-S{c*t(KR%S{}X7|mNgb=zAe4a9XD~U;|8HwhG*=; zsSVjk@SUolIe>Z5x_K*Z*3{ruE}@ygtU+^<8de~^o$u8w$FEk0;J1zb_}!);{C-c~<(vcs&8K+I*_uqm%%7@yLS^stPfEfFmY3bi?-LA-KD;2yd1Vh}BSx ze7q?`GfJ|lEJAdqg6BI$Ie5E>U|klkmL8Hy^Qq)P_zz z@9UQciL?>%gSW_CHwnKtq_>!URM{K<)8SC+Y~2<$o0i|SH?cEb*CEh+^%~29teh$K z){d)Z&Lbi?6p^7zF=Nsc>}}qSTb(x;9_^Bha`E(Otf{WXh2tmCexATgNe89e!|1yN zf^Geg660-qOyfwoPAfAO>E(!0zJ|WkWtAVoxVl0*QKM&0pq2?qsg=PHEQ#qE44DTZIN~PC2uWj>0d)=wW zNt1Fc&j7O~EkCs9sb1?Co_AN!tV`l^U)#33Po$|HT z^QNuDO@9W~_tEo9|82AlqjX!fwgs^@?HZL|hRUZa8EsG-lB3BOA9H1DRbh2u_g2r= zvO1DTD)K8AFXHyCTPV!QR^uGUj-8B^oA;q|-#Jw7J%#IU-bdZa8jKz{8O3!wP`KeJ zvNxVamh?C`9zf=%!$@9#NcH+gmTyDB+5<=_sDq>XNG$YSZXS}6kXeqToGK*cu0S$j zIXQ1NlJaW_%?(J-Y(Pd{11eT+#+2D!=-$g6^X3L&<;rFr+l$h*2T`)|81lEAK=#gy z$Zffbf<5i1IB*jMTdyFdav!1#n~~0KcKI&&hUdX!@H9B}9*n-O0J%U$za!wn?eI~v zQChzP>yKYT&5^UHIe8H)PF=?86IW1sKj8zL!- zmS7L_cYkUe&ZaKI<9b5ro_u^Ja~ro1m}RBCJ!SZ=r5fM1l;PVwh4^kSwTB?xl8^6O z^6>qhT>j2g(EQmBnG!A&Ul49T-<*Qaw-B1QrV*Yq`5*t6g-Qs#GL@V*)FKOCwD5ew 
z@)s=`{B06J`3sq5j$x^pGq$GUo9*dpin(u^^Y9sg`FDhgUlhdRexa;Rw;XSlFUMO& z5qMA#gNKB%`+4cOQk;V5i38y`*d58!#^Odz0RbZu?-4eBQM$*hkBhAd2bM~CG=j`rt z&+h%*-Fv_H`$kk&Wi^dv&h9?<`Tg;IvN9_(G9vPQBR&xs8Sx}FjPMhN`s@(wi}J;U z9_`d>hmpZ{oMWYbh7fR$&|Fbbi^_^RRFqevl;C`pK2u76EHy#1NM1P`WrRGdwXz!^gGnJ7C-2;XO8{BbyZ0Z!*c!#ZWW z8hhKP7opi2gt(|gM1>{5W(kGWKL`uw`(fb%Kh+Cu@pHiD9}EX!InXzhzeDiOJCoJ$ zj_}Fv;d((H&L_v>N(!MlIT@|dXevC%Q$92 zv&^26K?u#FEv4J>FoV#X$@KD^2ig93nj<~CcHCT@hL)Q#k5Dw6OVPKc(G!*L@ig7zALh7>}v^e_{6?<5D{Zju0VppqJzLETOx z6f>XeWtax{A0!6hQKEck7|j2I2;B~K-hDRyZYA2#kQIhANkQ-$GZ;PFw^n&NVeEMP z{V#vSv&Z)pG|O21>NDg7mc>%<$(jOn1ZU}KR?tk_F`)Sswo ziT-XyhIh#Dgo~9mSU7tw-k&rX{$Ac#wRAZ?y?6vbMJv5tpnpItC`u@)+gan#-Og%V8&S(ual>oU`!ZILA)MWUE z1VS6_Ye?t$EHt$kh5)KF<2~dI*SNoqmu!vSCb=MpTNV8{26&7GAfH-gJ#{^tYO*M{?9_QsByghHt|}9Weja`19M|&*3ZIxZR7izZNo!M zXJb26w&u(?X{@>mqhFC>gZiTeEpvRdpz{j{Uyt2#d{vJt7?rfoB4lMLvSuZBB42LS2Z#S&golg z5ms~zmgH>+%i4qJ+`aG)Da7y*9}`;p;>~Vvp>z9w=<()AIFj?R{8$MJj-N%=!ILOF zT!Nx~XVkdnoC7D2yZa~#nf}6!dogzMD=S2Md&IJ79t_1wku6X>z&RF~-;piWB z#_=2iFt7dN#xVSmu>AA7Q2e+)48LC=fgg*b@%!Q^{J0?sf7lR-pNpmEnCB6+Wx`{PJV5N_tg;T}Qn>+~>ul@f**8Ih>x^`GU1 zB7eaY_0iPe9$m0{`BIcsSL1AjKywwLxrWe8cqTNTE3HHc?NgxnmD!%Crog!|(3*Y0 zvn*ja+`6Gz`?i*|{vagl#exVc@=-1Kb=Z1q_*`RX7D+i+UG9Wt)(-;B@*$vn2v%KH zh4WQsaQ@U599kWN)PQMfW|fIU`lEEidejzVqb5EWm*Z`?L2$l9aK0Z;CEM^ceIZ`v z%)oCJPs0x@X5ja0=i#^Oyz%YMD3q><#-NTZ)ew!)_us*(lt}d<;OQ6#f!U5zk-j(_ zwg6{yW8gPQmdb91o;~^!k{yVNNI1AmYZ4#ky(l5{N?y z&4lHg00qr5j`;zh`EG_kZWEg2qdFPad_T*9d$~5;&$i%^d~iq5y;q!z@5&D2XkIGb z?$H(_Xg?c#7vN?#p*cAU*OFrJSrXxyfP6hEl;BJoOeSm-c5l-DZpm`jgkv=vLwc5z z1eyhy?MioKN(gWfM%uiZrY=O!_#&XJ#y>ND$vltOHRMELZlb4#lw??ayjA^Ne)5#+ zNRv6xjXK3FxwjfeP1~09r2cUY&rtQZK2|N~nXb{&{@~uH{#T)yVKfEJ#{d6rF%3q1 z6g2;>;B0`7sL6m|Bb5MG!QxJUS#qAa@DMe1FLlhBkxI^U)Ok)2RbL@jR8kd;H^Y*+ zo3C+RXT(j^IDTe%7MlO-(5!)($4z{&i@6Rr*}hF-fzBt>`iKDo@Xx zW?@>1bEnVZ!>RA1MT=Hg5Dbu5S_gWG0S!#Z25kK ztvH}Q3=AVUhpgC-kfnPOn!6E+`D-!s?aAobYbe4ZQxP7Mp~fpM34iCbS z_)wIw@_o1>0Y7ce#7~=(@bl&b{IWHf0Gxt9Y)K|abGs>lAS+-Thd*zR!++SFh=1Cf 
z$Zb6SX?JIhql9IA8C9WFNob}`p3~4QGru&3 zWee}nALh7$dO6p?) z=1Zz>W}Q)0NuW84+B4g*cU25>Lgrym=QbGLt1HSjt;fZs`KU<@!ev6;b%JvP8z*;T z0tn88$fP-Vk^V8hDfkFKET4ftte%IT*8AeS9g(=OAqgYjXs>K8VDea;OpU}DLi3qe z2hI?hPlS2naQGse%a4QaME%iVm(D%m?`uPBWFn$Nl3@1>gT>21aJIqUCm7y~_`H0A z2+d)z+alEmgO0!mOd3B~HLw@H{WhvHQ&E>5hx5ttxELRgnz#^Dg!`d1!j3JL03?hV zjm@*B;#p3lvZ;r(tvhPjYrVHm3IPiS{j9C}_^j&cna`$A7`)^U?q$7!#UnN-9)YTOoET&|D(WEJGY=TTW;; zz}cwp#7_0MECS3WYO#}Y8Hz!bvJRLS(wM*S@^>SMDXh}_WcR3m5> zC{`cFieCsXIDy#(&Cc`8Vf^oeX7y2`vp#k~b7Ocm0NVYV7db(vv6{MxS>}YiPI6vT zVKKY9oT@_4l*uAB9Kv~yNEe=e zJ2b02py9cDnt5MS8r0XPk`C2c(!gwnW}~22aWobD>X@nE_kd=P_%K{&fvt?lWk@=$ zMqw`!2dz9K@XU&)aXvWro04%(Q)+yU20;BRL`F}u;RLS$Gt(gPmsYCr+x#)9hfL4O zCFY4auCDkv8yt;g5UFb%0WIpu%TF(V#L=7TeS`W)Hvm^1GTJ2N+|L{$Xniv*R! zd-h`F@L_1xzCD6cGO+A;Igy3_Y^oOq_7AXbANL#iCi7WQ1**juZ??()wIZmK?O9`R56sbEakhZg$ z(0m!Wd#)mJeFYr32N9UH3mKaa<4E0S*t-8TX3q4*n>~i2JHfeq=e`*I{wK&-y%lQ? zp2hN=$C0<|2(tGcLFU%I$lJ0P_M|Md>D~v;TD3#J0YlV>QL|^xM~EW^QK5-gILiyY zyE3k=+o2sB8*N&&MC+EVl-g45nm1D)OD&u<4trBWaV#kS=W{~w?Y2Dpv^58R*qni% zHl-7i)9{C_srYGI3YA2d7HE#AVtGz9{=AE@d>|G7v_BR9xIYE|uqO$Bw@rGcObGrhA^3Oe`J1r()4Fi{%lSjK^sd!TD)Y2%aW}<4$S}N@59AZ})}w z=pjg%@-FV>X5wZjfg&ydpBF{pIn(=PS{R`@gwPz015pbwv1e;^Ytalrgr1W}kDyd4 zMS*6P6BR5!%UMJH zM^$f}v*STeW2(b5WcZ{={M20z>YiokRjy*XYFQu480JeACAe6{I;^?^7i#&td@r^v zi9|t^H-Wh`hW6;Jp!wRGrKlmOc|dbC;WEaT;QR@}`2&LURD$z#{J3&HpR+%{+ZKsW zx1?iy|E_A9L$7h8a3U>=&>Vo%(RS72d?M5f$HKjEE;knblg21$?$n_x{CoqD7@LBG z@Dw=w!(mxuhlQZ*=i`8{w}bx&>lrp|;YyBxFg6O`QPAzA3+e@$*%-Z<5Qdv<#N8q=HzX34 z6A9r-zPL@mmEPsssz+J(5UcUw1ZI)IvjDIF_3b3-J(k{PHt-Zc+ci`t3RK%rpW;t& z_QUN&A6~=w(l@TYPW-38{|}18YAP$p2$ihsXbVyvi@nm8 zTxC0DW(oz#P8oKpr%lz+Tvk@Dpt+)=NZ+IDjIc3{nJZJ8kU7lPNj)TI25Ha~N&GYzLSsOT6A`1KQ5y9648@SzgP@wQ`N=`X=PQhn9w60fSfb!5vTieAIJt=AGHr z+|bNM!s%nG2W;z_wHWYbU$pP>1`^j4qu@j(k`7iN_gFQO)^9?)K0WZ}$gzmbTaPe8 zvwg`}b}y6I!(Gh_}Z~z>FDlv2dXU z4qG(AISyg=DA;(;NAG=vq5X&Ajjlb>u5AajY1(N|-a|ZsrJrRG}mWW?=#NekL;rNl@{Nv_O{9byG*UR|kF#JeR7X7j|0)Jc+iJw>V 
zoV8*2qx4|!Ou^TyBk;F6zDRcvn1k>lD-2IF3C5XrrN>zgJeJ?-f%rUy z|Hp^mnS5-O8IRPdZxfjNA!71qltqW*`+{s-^O}tpX`y(wI0_GPgYhiYfhUO$+|7wa z!Q6MzuT?Yk(NtDaJj*AsN!ogOrJALwg1+MF{iVOEjt-OUipprW#wwS44?7ZYeOE5t8T^KV=0&4NsA=EML|bqw%7)XC zR-7a>p9=TJxtwS?-k+d)q}#S`i`lb2K{^}CDfCmB8N=d5X!Z_*Kfzha3C%J@Bfvix zHcJpj4Zs5Ae5vgsA=YE6QUI~*GAeV(3dn0M|FdX)qi5I!FsSIC2 zvxT-mV9g}BW(J@k%MUkm{n3yuvr+I^wv7PoN3iC(IRUsv_eF2dbJZ$h#kMF>~pmo+rYOe6}LnZI`k%nk7t z0(PK+=G$zn+)ic#FIn~p?rIo^tkg+hPW2^Bd*fF8LSEyAo9RqvsueL)Mqqfi_Gm58 z+`J_&oWG2}|M_Ry1mUzwKH8%V2sA6(p{)uu8|6BsdYNf2BC&r1nhhDytkxPJB$o>? zQ`MEylP!Kzj`LO3C^>Z+Uh@}V&dgZ|B{Z*JyAFc~4nj(DB2E)}&z?A;eS-C_@g`I1 zYhgNpc==CkxsDCJgs3RQMMNljK6T^>N{+K}d-{Z`H(d2G)8m>?+6l~m9hx;axOjnS zHspe4@lU7Eiaep2;{B8l^Z2jJ?Vm0w-`8>ZRcJQLJSCcrpniDj0XA_H^Pq48ctjo< z(@5qpUniAm3X~g*|C?ah)vMdoG0JRQ?!uP84Tx|(;8&%EE|%_ z$bd%$NJb^Y>wZm5g(Y&IBk^UpOsg26`FS=T(`>xb`Tsv?7V#XF29W{H`j~rcxzmPB zYRX4kGCf~+*dp$$bNCDuFl%Uj<@(o3&eb;PUVTX#PoDXpO;Wz;>zYTq#w?mU33_1lrZeg{TOd>_qPbwvBlJ@DStk1%(h7kqpjuv#Nw z@sA)pgdxOEfVM>-G%$jIVZ*Fh^DuYrLcBF_pqgD^Qr})!&&vD=f%AAgE8JxX_+`%u z{Bc`8;WY)nY}7Mh{In%l&1&%nf!|F6%JPw50zo()zug#%?>9u_cbnqyj|X${Pls~x z#~n%dg^>NrjyU|ZondU(&@3}({Ir4LtqsB-2+lvPkHqiS#^A?waroVaB>Z-3CZ4Z~ z#ie8~oC}?X3z2j2Br{mS@|WqMc$pbSXb#7dj1WA}48jxna4T29IS?-h39>lF^Q2Hb z%8JKUuNm+k)(c_pkHrqV7rtFxh)3ZzJP2Hbud<`@d}%BmW6y{`RMJmmd?#(VjJB%Y1aT{nL0dJ(;)}{kPNIyBTfG>QRwbp+ zGXPkZbxr%M_>-{_K_k0U{~;1dgi+%N2y?YJ$^+ zigUP3U1a?x(0snO0@p5;!8-pvOdC50v)&(%o*mkvWMeTJHm^fnawx743is15Z)rOymhTIOTzCo>Q?av)9x z&cO>d&YooW;DG>fj!lhWmZ2C8gy@?&7Tn5`V-(@}HX-^8rt8JxAY3K<-rt-@Xg-R4 z=`k46t{L9x+8#$E!_bf#&wPm>$O<$QbYuN-mwtAaU@Nn|+)MJqUG>qQbJ{*78KR-b zHQz}jFemsElx>7$>7|xo93i;HYi=j`^Lhe4!Mj0#In|rzEx?_)`Mk!P@eW2wydOd) z4aJC_omDS$>kgf8=gtFs|K&^C%4xNbO|^4)hJt3=kNQAg>P<7p&|b>RS^tt7%uJ(f zls-`|=Mau71(vI8*%+%vO%=CnY+k6V!~WfSRBv-mMyBpTTv$eC2X2%hBXyd^mYBN(kBKPM*Y><41As^l_A(anAfD zK0q1#qb0ud*P&Tt?rf{&Z-Ws$IEg-Ggkgj$_h>4-*Q$|^D)BzX|IOqTkXbbZglE>z>y!Lw^mL&5@FQBSU^os(|r;Ww-P=Vcy?lR85QbW8z)LWjsIw zcKG}Cl84U%nGfQB1)7Zl*A0?%iq}B1A>*N$bSNHg 
z47ujv9=alMR}4lkj9W$+jpc@DeV&n40|fQ6cRf=|wO~z-AkyQYnD(NjuysrIGBe!9b}lm8`YUoi*H^>kKL6Lf#Gc^$ zw}nqzH`$^2T6aEh-&v^qnhYB-z`5+q30$eJQp>M)>Chd+CQiYsW97&>T7%4k7qRmA zr?AE4phf!*m@?0b@S>dvmXH3H?n1+-NC{72e|z7J5<))#q1dt4Cp%wefkeahhF{h-h5vaZrO|F+Ych8 za1}cD9)#v?I^&Jr{qga%S(v}jmkohX_*APRxjC^)PUgpEi921Ft#CN?`;>D^sJXn#4>Wokv2%V4W>@aLzI2q+(3-KsB0#A~I2*e@y zLiIEU;~DiL(}AZ7nr(QRW#xZ1e4b2z&WuE9OaMYh^+n8-u_&556OUHqs=1gSMf>4# zv;|+~#p3fN@pzaUj0Z9^LXHE6Lg!)5;9lrTX#Qm8N36tU70Mc1D7!%4tff6x5`?8U zjb)E9551JDv{SX3nJULLG`m+Y)pA3#S=#1xuax+a;X}@bE6XW!8Ls`p{aXXOE=SY? z5hg!!`8mUN)n`sAXs%%SA<$gK#()!=FA<6^^Vs>SD%90h;qI*~2=e~~{knI?oM}_g zvr{J=UAYo>w{ApjVkoYfpt&I~05=l|mhp>lCt)6LCwziC1h~)h9r$idJbpg13^(_# z#9Q6kD!U8&;9ZoYgyU?4uL9KS1x|dm?nZO)Kg|mU_wZKrDLh~Zr-I$3VDh^{W;as435~2ARq4{=NGV>$`H)8~v9c)y{&>$b&j$6b$@*))b zO&~5sj6Y4t4{SYR}-3R3D4I*y^7eVXzbs$8+&%_M87xt!pF-S-iy32 zZ0K7!dgPGmX)YlEt0Ar;-d~_uEx01R&8+J`JAVOdmM+E60Ryma#|~xNXOACu@q=b2 zZTrd(O!3q_BwKe~reyXNL&5@5rzvJ|bAsw&cAfLT2+cfK`+Osr?(w%nvjC|H;Iy0% z(IvJC*D02UQ7Mi87 zHKYn|39m6MIn_AN&HpKYV|bo`XF#)vd8@>{P?PKvoauOFd?o_V8gSjfEHcBRs}fLY zrvw~%odI00!ZO2EYSi=WavmdX?q{!_DW##=a0G#6Lt;0cHY6?X|IeYBagmu>q(@7q z!FX1Nq!Fd#C$Fs<$IGyBBi)A7|BU0S;fZ)pb-kP?&ri?w`nt3?4!bclYdP;pseDjr zd%X-BXspsRr}^9}VfQBJw(p4#=lEg$saoV8xrlHEzI!>MVC(9FxMJ`sO9?+ zN`MWKkMan#j?9&aTe1y_D-R;Fa2JvY(^Ee2!;o%hcAE zv;?+qzN?&Zzk)SaKEvXM8(4a`0ZVV) z!rGe+*mvVTO0GP>`r<=)XTmfL971p&Ivg!~yoqts7hv(mJqSm2yQ`e6KiVE-CDOmN@@b@qAFINtoYAeLgnf6Y`|5c^jrl>S>BfsFiQ!} zWdhDDcN#;pp>mOY*h*EYo-nhFvdZu&@f+#!tK@yFSg)`FN=VUlMk8Nw0)kO)m_Jz7 z<%UxX*HxcUwZ;M#6|7e(=;s8TDnhb`<`P0P0lKAB}(Hjj33kiw~a5vS0mrLUC+l|@y)5#4u zloN?=&T;hd)89u)QV3z$2j{~5Q66PQO^gNQF}^s>22wtOwp*)~Xhj=s*RCak99Cqd zX29Ra0*kMWkW6Uyb0}a|B0vYw_HDjFa9D!z@dq=}yhU>i>DB>9;=^&Cwp|+)jLU@P zTKfW2%Zir)bCK}&AO!YpkFxN2gl1`Qd9wlKre&^|nD?XKnT$&pKgH$hN>%@<2@PaUW`SnK zE3_YBDca0mg=X5n;sqJDN@%W8Lw#yg4|5G|sIIDx$E$GT>NP}#gkgEn5(UwtN4<@y z?@hssY11)k#7G?2zZd&<~?QRE|F#;*?QJxW`A+{56>~`HsJzeAwKgtYTUBamr~C$kCfCo?(mHB z&HKhN9fppBaW5rTz5g0$Hil#vzZEzOkn%!L@fa&+18B|Atb3ZJw~*&Z1O#?WmAyxV 
zoN|X_k|}(FE15I8mIX;;U{>){GJsiL*%4O7xSZ1CiADN>HGX&k*eWjE*O@4CW~7n< zkc2CFU@mAjwr=Tymhn)(rDwu05M8*sdnugI%;WN4s~bIBH4LN?IID5cN@7Tqvw(IL z)B=PAO9jj_JcF>Mp;<$h1}`lQkc^Xv*Q#;5DxErh=Je|i67>1<3R1@IX;_x6Stejs z(9EzXfmfZcjGGv@>6z>Khs9oG2!?hBCI0pRX2Xs|PFwPjDZYkOx>PznW$sn4Vw#Mt z*sID%%Bc9%D>7w6Zu@XMBGV)OAZb!@lYH`wm!wtFEzYcMOmFou;~o;8Npeqhy*wuw zJ0_C2>-4B06&{lNFdph2^1CrK%RcYP^IZ8L&zNa!oM(oVtvVY$hCOH-K8Mm1r!n`V zPtd+?4=i%TqxkF=!PeI2Z6YhX!T1D}ND@Qq&zpO`{e zSUNtHIWDSFz#hO%xECGwt_+|A+1m`jM`>mt#m#vfddj@`4=ZEhJ%%3j~!sC2a+60ZeIjrn6S3w5DBxEmXY7tWsM&v~EE3FRL*mT$a4#0x{cxY4@p*0x6^Cazp?H$-KxNWG#CNjZy-Hf@ZN_1Dc&Z$O6h9;OvCv z$}_Tvlp*=3SKGN8nx%(Xdi?Y&bLq@67Vpx2CO#_uCqBb1rORTL4JO}mmm3trRpp)u zn$@dssmq*WnyV=pGI@dZ?}hR*T&!V3p{fE`FIVHtvE6v1V+(vVWh{Dj?uf-vQFyRt zJ1(V1;VOaeW?~@f3C=eY18|*ycQc8NrUV&R>#L?dd%idhU#(Bace|EirDcwqqqz$e z_2GLs9UDSm_Qts=Ka|BI|I1z&cnwgK=2%dlX<&QXhu_8?>EbPB)wtrM0hT|w&y?T(aZWZaTU zTvz6eB@iOL)v6a#Nx#0YlZQw!cENH5Gm+A-B%~E4J}r@e)1f5$lsW&kG%$)(8dVtD7PaKJ z&X=Zaz}<*pyqz#Aa>wQMQt<7pbXCRcN*BX*<+GLwi~kvo9X7MosxpXXmIF4 z!tl=~fVr_uVK$ZmXFhlNU3TUa!!F0s-Fq;3?0B^6&=VnP%do!WIu;+li0zfvaIoS6 zhK`7vR9a zfdkdd0v{3>HWMWFMLKXiJ`{DC33#wL1>X{4e_H8{-xZC+U)GJnf7m(}f7v`0e_Fo) zzpM?xH%lY%G(Q{g9qupgb-hXsSg3wA20K4 z_%c5bUl!T%`I11qToH_yD?|A^43Dw{@mZ7)%0v9IBie>Eua9vuCl=?Ef^aq37gr;^ zaU&`Kj|do_6VN}Gp5=62pY8iDedTLTSa!<{$8MFMJ?{Lifmt_N#n0)J%6Hs#o2PVR)LmY<%OaQGiVvIp(xA^Qq37$JAVq{{vTuEjCatbZEM6@ zym9B?PSj>b5}E_l)M+;eyVnz}1ZFF)C;8%L0-t|^09`Qd<%i+k;s`ujnNENUz<`!5 z)R<=LgmE|(8;0^If0QNopoG9(O8Bmc3r1yJ5O&4dFsxT+)jKNCZ1q}zwJVpxZt;ho zx1UIyQ69N z=aaekZf!I{+zVG0$Ku|uwb;zY=a4qd@$MV#vDrQ!SCVDb$t2XrN8v7^`7WXTZj=>w z3C?#FGzSolc`pL99N#nU` zJWs@i%43-VFWG{7xzX4jxDbIKOhTW|ofI?+)cyFwZ*c0^VRwT}I{&y;b|UqR)HhC> zX)MFORA-v%b>_JQX6aF`t*Bw+>pbn}0)N*jXqHv~WV$(7z(hK3F4fdv{mPY?`p!Fe zXUrHlEEY8jjA5J7%Pf*!ZfVrXP$AjA$MZ)GA4Y#Wi&CC1i=xPCg5~rHy*jfYNt4*V z#F1iL72mKvsbRe$YFzg+WYGKS#< zW*Hl37?sEzx7S0nTLNK?_ZvTZ&%D<0X-ay0wx^oxX=XY+o~`_L%E)`S+>of!sKOBY zq%FECLAMh&X#+C;oVMnX4rg2%gR)tTZQUcirhFDj91Lhyz4$yP@%LD;!5K#tPs*?x 
z^=7~vH>W&6*(_&xO-VPzwB$n%@$*fq)?n1oVd&m_AQG2v!`hP1P;{&gW%s|p)>9>D z*Ru~sO`M9vC0h|r;0<4P5Ls*X!JFVXVAxoMX0Al|@_h(fx)&iC>oE2MZ+JUWF>#s& z1IB!e0b@VL(8-_R?T@`NajqR7`bA-;H4gsa83<1(Ky*SrynI5@rF(C*Y}+0KhmS^3 zOdisBujCa+5JymtU3pM#<5!+S{OVG~ud6`f`bwm1s6j??9kL0|`I|2we_I{$chzFm zsq5HGI9~Hv1G4YkKn( zrDI36!kh0qW3eqV2nQmAaV9Ammva*cH%a(*MF@UVI1AtAj>F$?n2Nt_{0M(4o`av) z`Qs03!|;+2^f)h)uo;fKdEvN~6GE^cGz&au>7g2TW$LaBe*&vNUgTLQE5SJcpUY4R zZl4$0@Hn3UQXo^mg%D=_aVc~@j`@Frz+rvyZl~7R5w-}=*JR;&Q8ZyC2-l*mxDoDy zd-2|CmV?I(U-XFgdy*f51_JQ1kH#WwL_e%v^a-vMrr_DKWIRc?<7rwTA>O7ItN4Py zKhFx~{h5xUDAZ>LW5c2;c&BGObY(+iS#A+NJ%52^nM|W~4rghzvhrUEedrwJhGr@M z#hx2OvmqIR(HNeMjt6I5pnRPEsOmP)`pvVR6KMAQUO@T3%dc4e(0@F9RKjNbJ>Sw& zgcH56iEO#AI)qp^6IP^$?_yOME)$xsC}=*3>f^uS=`VZZ|5^5#IZqgD?b$Xmn5SmHw{zz_EzvVePDkai3mn@j1`p$z9?sd zriwsV8y|wo=s+Av4#T@|^+EF%%?Q7(5a6{C8`dmGaG(wT-WD}}+2RwddYUb&*O}1l z$Nzioj2&6%@!9Z?lYlVUMyJH#n=Tmw3dNrXr#s?P*qS3H@ zB{oHbU~uc^nB2P)wm25xJR1RbGZNUih+;z`68B<5aW9792s9^9YIp{rnYyK*IS{u6 zniI^>Oh9I1QK0#@0nN^14YIs8LHRE8`d&K2rk^}Wp1;( z*9Nz*U&VJ{eSwol4=GrddO^0H_F=Y1vv_W!dP&tQCTOm$tW~gFQ(nXEd2Xwet<_b@ z+YEVwQi>~eb;wFd!N5L!kdc&xs~0XPU>4hx|7ELek=I;4Ux%ra-$jouow0A%P8C;y zW&vdl%krk*EUD`xeyrD|j#BlT6PV8sm}SN;XQzxI;f!h%OdIu@+Besks@~Mc&3>kR zP3|Y1Z_Foo>#pi`x1SomDs`&C3g-V*ooUKjQ8VR~PBjV5B7tTWAZmsK$_&i1uOHlB zhi0SF_LM+}WGXXTNrQyze|h*KF&Ij~+4!xEh+%Up_t4$`(#s%XScGPoi?E#G${e0% zh%y7S&fM3;Pma6N;3@S{+Fy||zSRtDX2US_>MR&;BWVoFhLizOMo+U*DahBVWv2z4 zP0)Or>38-pH-={Q+JosadfrtpGo@a4IOQ2Pp0B{w6O4^sWdoAU|H;1D81;Rfc}&Uv zD~>=lhGlhM5&vr(o(D9`)){x%n)6?SryCMgqd|os@i0TP4pZBQv$AU}QyR>2t%Ug+ zsWDg1b3M}Rk%`JxtZB`vrk|2duucbcb>qSbJwu!)J1&y%}>bPxE0MhcENiy<|1j?R)j6y zt$K=c*Y3jy)8}L8sPTx&TZ?F!O`~WBEHQ<6Z*~CUQkLS)0b|gtMQ1ec&;u>Iy@?jx z`=M2jfoR|7E%Y5W0fR@pgOIRzWM?nMdsAnrRrlI;>Woq2CnG9t1=5!vB*e?m3>ni* zXeL0%FF(osvxr-D4)JTskg%o#DeJ0`wyqjkn^;+FtwGM-I;<|cgAJD-V)?b3$hdnQ zF;8zG;bA?NUB8We=O5t6xw|-c;x=|4zlAmXFJtYg8`yB_5*7xpEfO!Jbxy35||H! 
zJ5V8C`7a@4u8hK0%US8BzJuQ{n~uM%UxYud^TAK+Z1{0q2!6LF49^O}@PN>KkGexp zZO98ly}%LSxj~Hs4Z!_OD;`tNa;zGd^KJMdkFc6c$+p0T=LF_wJpL@t!T&??d`TQ` z6eT0*tv8VM@pxpvI~1n_r>l?kzE~QK7X>kREDJis`w++$;X&F$Jjz^zhuOZkR~Uk$ z;a+eI?SrB@AEGKf3eQ%h;bEp7&ncPL`C%rZS!UVD|&mF#&3@cRrYSeG#vOR}d*1;Dl&*4(#8A9`E zT&q2XI~NXMMNYWtA)YaA3~ub*f{VF{gl0PdS@$wuCjj0cFyBb@LVdg+?j_0+)S(KR zZ{~%fp)e8kYnQ?A-H8gCM-zhf1j#sN!g7K)DpUBIKv+k}tRhsNNR7mtiEpc(<~A)` zDrnxgZWSUzLg4Qu!!z{E8CGsB{&pp+e;|(qsGjC`-<^uK?b@qp6;tOgL`_NpYQh6i z9pNA#+i;A~eAN00KI+;LQ(8B}N!x6EofoV=D19I+{$(=mnSpEs1`?X>xRXv;O?TjS zD(#xk{13;o@Xx0U@N#9O6PjaibMp#9b0`2jLBqZVwP}v2eY;|djGc}qaA(F7n&~HT zVYm~^aAN5ru>rWp28WDk*1*gLrxF1=(T>{*K`vl+L9-K_bx*V2-WE7cv*TWBFoBup zWyr^j0eHycw{oJe!8Q+b#tv5>Uk+qh@$DDS@%-@voH%ljHclH-Hej+1`K@@zbj&Gb zFQz)l1DeY#$#&IbIf64K;9Ms|H3-w^t7{Yh%SVT5ymDnF4({HK;e!TY&9Y^!!v5CF=yrs^e0?z-@HlP>zwcg?Ol4Ag}sbbnKdvAL`#~SaU`o$(I?bw zQv$^VX8kd)XT76_PjV}7;Z>)iq)q%&e)C?Q^`<^<_A~8kJeR&!EpORnn5xTHolOX%;f7{`E}3%90A@WMonaU{J*v?muw2f4kt?0N zk26m04-IK#lmW%MXPfCWL2_dmd7vq+%%?8)Ph<%%OCqS3Y68yNNO$5?-&7OP9HVPnlr z-1zQiSmKfh&7JUxpAAXNw<2QcF2pR`gN&tHG2xvL@y>f6BQAd(!tysEaoH|R_m04f zMPXR6ZZAHZwh)s(m<^xcNGu4Bz{fr|jGaCgBj1~jVdJJ?*yu_4=;PUN*uyb+@JKbs zX_uZoF>&g2#OJI-Dxo=U*+ByHA+6;{xqpiAd%5t^Z-k)-ay9fYlwez9mx;sv0R|J_AXAA+{4im_psw=1J<0pjjUtWv9|IK z?CHxH4}yNPwrJC~9r_I&f{*9Uhj&0Q;W!KyTNL~W$3Bd+#U2eyV3b<{VHh%Sn414~ zLZ4n(V)Mn$FakYFV}|SLC!S1Ech6|to6ko*9YOJH8L$+EPlH_2H&m@ z!$U&zz5E#5&5y?IyeQnxkHoFKaQ+U*og7x)Id(jx9#hY93CDQ>3Y1^u5@0C}&NhN_ z0A3anU<*Rour+b*|Ajmt^_|-282^$3t z91NKY>)641d1redF4Wecg#KT~GW`sF;~YUkfcbw8nhlBHu-xVOjeS?@rpAC`uIt>; z?CIyO@@knLk*TG=G#9iaXh%s4ob|z2#}R zvuP!gy*^R!-M?*HtnrzLIvJ9Y;DgFUKSHxZL32&41ILoXvFN?=t})HN^XH*>%?iXu zutDf$b!6=PfvGs@ZRw@O)Vq zF2?%eQhqG17cax6$WRPqlh_%ff7mc|7cpqK$gK5tn+3_C=o% zXqMh&8HORCTwPX$3)S)wU?ncf2Yh??s&ZeZi<7VfjAg30&_I?whYsP>^XFCXvH{Kl z&DSqqMvyH4Z*=R5BM0{5>ZOaSPSQh4G26hkmC-@Kqf|l5Xd@?rV51eM)8_QPcOi-U?}X4tta`!?S*N`6zkI$m3>> zf!UCNvavOOt0k$Kaj!f$B{MY3gJeP)L#-+h8cSz%8N)TeSx?5zgitbZy)pE{ECZU| 
z_uy9DS6b@Mhjy-lTLEVSnnhxejUI@RF~25Lrn&E@3r)9Pz0Sz^zY5Ki0nMJ^{O^Hg z4LAZe2E=O7AUvR%@i!TU5m#xUdgQYyy(&Ju#te8e=E)4qhCC__lZ^YB?`h5>vs~9u z1}y8^QNC*e2 zhuPWq(a@}=8z=6xsqiQrj@+C6BLxHf!jK+s)|42A$sP@vfmz1M%a_q=?Ap0oNs;T6b?LLHpUHee9?+|jg??cL}O$bjb#4PVX zj3#)G9``QVckF`JtmwP<>V*$yFGOPQ1|%=rPf$LHn3dEj8KQB5fGoYtgyvPJ5w+qZ zA^sR5mmEjT(i4bYb^;0O&S3Get0+GI7;7)y$MP#TkzaouX?JcS^L9N}U%iC`Rkv}b zqyfi{-@=aL^;mYc0ZC_XA>-6lL@e8eoB`Eva zBj6hp30r6aLZVX<9GQY3{$4QOA3d3m@-f!@QKPUfC;)pyttd|l$F-a|JYSoL=PL-8 z`LprkdJBG9Z>OU0+ck*<=2ZMpOsHNJN?0Z|7bf6NVH_Ir3C#sDR3z@?N8k>@<9;qH zZY2kv5>8(bkYD6G2+tIceV%8>OV#Tv{}E{Og7GXr98XuLBKL#gI33}Or;AhYC^L*e z9gfF5_enm_Cxm}VfcYYw;7wTgMiw(kr5KkpA^u1nGYSVn2@)%F@nmTt!wli|VR#}i zL{NUh`#;PhL=)V<%(LRFA}g*Dytmk9;@#d|(5q7iY}&A1t&%6ee2zY&dYD-^C}`$y z>3J4l78}FYY6uKWxxl%K@K%47JnYCj%a-h~kD=uv<#?>WR_>@3* zBi^o{S!T|-p0tS2?2S9gGOL3O6Ch|3nj03!;_>=CJl?q;YhxVf)~1D8raXQ6R8$Kz zC;Fl?!H=?`n!sEh6@cR@p;-9tXa&up`LkzY-OA-ih>nHdBC8tDZ1D@Fg4LL28J;2K zL;$xoiyhOZe}cB{I;$Zi3&)Pfnb-(aIs8yZU@i*_z@CNg<7miyEF9Pe!+6dvzZtaA zP(04G;9*`?f@y71$+wd7dSi(|cmTB`c{7yJeAX<(IRLgzpKfQ|d#3XcT*ACl?ixpH$T-Aqx zJYS8;FO#7hPTycyZfF+HV_co_)1nW^mi{0%ul(XQQhvkpWtNa~{+7`Pu1;I=UHXR# z&p6j91Da(Pl3MwgSIvT>RVgD8)W`MidQd)aH+<8O**`@Jnq9rgCN(~;pqUBRnbTBZ z;)Z5dVdH#|dhDmZxo%0DB?X;^W-~Zfl(T{tXy$!oy1PoYvPH&6e`>s!7>R@}M#Hew zOdZYwPb9!BUx`*ST-DoVK(i{q1XP)^EL?TQhyj|_IG!T?V6@~K_r0`Ua}DpQhXiP8 z(D9U7(wQ)#WCCXa;~L)2fMl0=OhL04hd7C|As}#L^fVj2zD7@_0nKW52UWPcjYHz9 zzs_ZxjfRRVt-Ke{F~(>s3BXbs=sdyMxUa52Jn!?j%Hvw6%*V`9!!?{TS5itUUp(W| zl$wgetCI65^-P^o&e}-ozgL0zb@z1TsfckkOU~0Xk{CVB^09>(nsr#lZyB~?gr$1G zXqRSaejPX)eqq3c*pcjOSeAR~`^m5$nXd5k@iX{j#$0r0-yI7q5jc4M4t8IBfYSS4 z;oSAx7&YZXbnQC`L9rP~7HBTqOz7Q%koX)78#x|6mSBV=7a%012p`S2p>>!3aKsfL zXXS2u5)gw0k;zEhxD63&Hz2xrD-w!#B4WvU_{U|zDgU@a4x4zT_|>iw+}d@gXFy zKZ{kTZemmIBZBh-tiRNN<)7B0@OlFlQyVVU<4Dyll$^PalP9lY+o?}caPBJNDz3q? 
z`wW7YY{i6GzG&B_H#&6ghH;Z8VWBmE&}@faK$z-X_6v%FUq}=J*pLW~OGju_3PK|i z@xhd7=+LYM`nGC?xal8ZYp@l^2{ARP;drn#9$#-r!jr`T_|57-{J0?)KdcSL?>5BX z`}OhozBn1b*%*(_`AKddp;#nv9E2A< z{<#dt$O$C4I`Fa}1P`-<@a2jWl*jlX=e;4gTM&mYvZC>M3L#s1i4|67Vg%Fy9@Rav^ zCcWIbKKP>07ge!ykUjey3?RsT^zLNHLJcMK2kA`~XqLL7lu|v+vQNAAgl5{mt205{ zvQb8AXPnB}Ld8)+^FG|Wv=h}Q*I>{a9nhyoqj#170RQw!L_t(-Yt(F9 zi~7}zaXB#**W&E>EY6CXDKb2R(46RvyU7+pT_7G1^6u~%-CfQG%;q9I-MJ1&3R5t+ zXBXAm96D(XDhSsVGNw7+2jvW>G&+E?;1EI4XYyDzPqRSt%xNEC)sn?Xh>3+S{ZFQt z^YaabpPzgP=ukb)7JnP8el`SJf-rx+54y53EwDJHZ(keG+VKA;ba^O zn}>km1MwEm*&Q$oPuQ?}knX3T|32e&pWB913vSZpZl>9BgK&B+%@_9yEcmD6+4zqq z3-F*Y72lfOnL`{?rZfArMh(mFQjnlgcA-EgwpzLb- zYYok^ymqJpXDxwd<{c%#EJIxsEGO}NC0Q-djytI~LcK-JnsJ{{{~#j-_p%(^x8M=| zv6}adnK>E#I<-}Kn30r-|Ms8%6Ct%qm9M&kMl*)X=46b!u#{r0#*%ZS?lJ0XVFH%@ za$I$EmZOd^d%3zs+bnHN#w1S~KMreFEJyvdPgVGa>SRuEH7~eKb11XL5SXuiiY58E z=-#O#(h?HX_-DDd25r4HL$e_>G;44+;;5z4>Z(`iJB=iOEHmMhGfheY%>u$qo6O*$ zd)g^g_v$bV>0uqGmA}@OR4G-kzZG`7uT2!`%OepRnRvj*nB z4$V3v)k7xe%C+j@t7DQtNvPhJLP`ocy*4_*xd~{ltdK1uFJV=(a52DH!lzu_^h%6} z6Jm|m1Nv2ghGuz4uENX65MF17Z7m(gS7bbJwcq%fM(2v04^!@4E~0T555+72KBGsq zN&wo`+iU{mMp6Z<7>5a(MGABXIBt1Dv&Vf+M&bT2h4EHSsN*h`iaC#^V3&AGcm~Wh z24+!H_xbn8SfQ~gneS6cXg6f!frnJXq^WS4auE6bx5ZaYn`hXAQ*MX-b!avpHz3zs zVX9}VdYUyf8$HeEn8z}HOrTlv+X&CdZzDWCmR@ZB*I`+t|Cjuh{BlEv$be?!9@5h+ zzH(^yAxs=I2^~B1Kxj-BPF#M7LsuT5=Fzt}R(lx(M~y|FA)^qPQi$Z`+Yz~V6Vg}j zhL1f4V<${OP*?)O2+fv=6bu?U5zX56Kt$R~WD%OD`G#S3SR9hJ?LgF)EePMZ32~dY zBeZB0<_1N>m(V$7);zTC)D_K}H%G_L9q`7ReX+nl1ZhilAiii1VwUYg%qm&fVGVROv`g7bZBWFuhx`39`J zbO)<1Hegfr4IHVsj&o-(R3GzRv_MA*aP5onLVsJGr$uh>U-=zv16c{mUmi1Or6 zT+NBV*Bg`Z&H7lpToH!ft&gFi@ZGuy!g4gem4!?;$Kl(JgybB5HO5mux|3lUck-hY zFy9tnrXJ*n;!$3R3!0w_EEm}EG+z{iXVe#j;4h`eIopnx(!)&Xe!4sbYo?9CmIYG? 
zvx)dRC4{o$3j(rCDfdNIAYnNG-=^6p2figBe3KoHZ&v5v*3vAjpZ^igXN2PUsyMt{ z5`xbOGXlzw3WM-?v4Ar{KA$42e_j-bua^bjcG?`A44(!6xBID{&$PHiT&%4{3GLnJ zQC9T?>kHM%VjNR;{AY= zzKvZ0A0TMNV7$|&84iXoP}9lXN+$Rx$dqk_+9WF)Qv7g(P+FfBh?{J*T}}7KgCZ;b z)5%=?%ZYs2yaN~FEx1f*zP`Kw#ld#;B{a_(^ac)uEJ9s0^CL41w~~T!J24!0WO#-^ zGxJSDvy5FPBon6H;4IKg_!U@Al3r!TC7GbCrko2@OK#tlDd|!P^$MCTctEHZXuh9K zIM1*V>?3i8`R_Mr6neF9tr{siHf_Rx`R9LDT&3Qu@>#3iSJo#kn-hkTPD52cHR{7`hBl_yuDy3=}qG5t&z5an`-DuRQX5TdbZl;zk79f6&3}7}E4iTAK4kG>F zJfT_tXI#gy7>*ue<9WRRt?FrJI zwJa!W*=jJ)lPtY#8kjW{NTQ7~W=1B6YSeXn(9A?#aGK?OD6fd|SDD}Vd^zqs$7viQ z1to@@55+7GXg0|WRLnyWAFBHpMR+zFi%~eMDd$XJ*_70Mop~VfRTYHG5w(o;7?p(U z{?cPj89*+*%}s^xN}nmeUnh?|qVzZ;&wMgu+`|mauRwDX*J&|)Rk6A&Sbf}S8c;1|`1E`1JaG;^2Moc0 z5fc!TwHgV_w;_7TX0^2S2eZ8K{zr2VotTa2^koFeICSdP2c5h1M_kSZyC3`7`LSuFzP+D zXwdh5M1ahY5e5GcDl`fKA#tz<6PyYE^B4Hy%`R`C2P@B+{ocd| zn>S7;1fe=L2)CC;y%@52|-LmcSf=Z#PBYyDc&NKN2_7d~r3I zAR(`;Wrl&=5ZuWP)59|IrI(qYYVpnUVO0u<$h@;d6q(7lh%j(h17x0r*XNAin3Z-)2SP+q`i6Zc71b)5EaG z{t43&#jpt5i-YiGk<9WDjTejM7-3#!{8$`LVCMZ6TW~LXKCWka<7n(6yw|HeCJY&b zGl!1S_Da-S7Go+aB~`Y&%Xn3n$X1oZjqTa+Ywdfwe0IaK(W|V>L8JVmA334LEe##! 
zve5Y0Tkc_o<;KA5ibE6SgNl#bn_3pTrs3A1hRUE;U>Vv4S*@g?HTQPC8 zUV8o9x)pe`XCtm=M5q}wt|!{?S-dY{(~D3?;FO-`6hbqh@LpO39<9vAGeYyj&5KdJ zDjOe-7@$D-gZ^)_;pT_(1VS_6@@%9p&PH1(UmT-7M9iFmjxF^nc2nM)sGvDDE*{SZwzrs*!B`Bo|$iG=1`Y^>E2s;{Ma}G53Z2`v#rf(g{N6lYy7pIW-~M!(!*t?4&s0OZMJtaJez^}uR*i9 z4i#wDZ^`wTX3yan6)aa+j+lmL7(LBR`KDHxhGqrMriWve0lm`7F%sWcI`bMqvm}UN zy07OsCg?UEsuXs*r=T3Agk3#FoU@l%GJ+4zfMx~G1~fY%T8;JM`E|TjpxFRt{gtF7 zn)fhefEk*rm{~FnNeyB7_0TLbXPO}?oSE@TMsK!Jc=KM3j6>3Dz@UnU0Xj~-4w^*< z1T`f+t(e=;l(9(xZ3^2AvSz9Knktbd-xfG!R1&(P;4w;nc}hc5;OxGS`QC<%{Qor> z`D51Wpjir-#=xv3fJdNwZGNhJbH?8U!{!fm+%nQ4>1;aAo`seBS-kR`M6bl73AvnI zL$lXo$C}*-(Xmf|3>h~S(Yfmpvuqn;S8PM_@+}xW z>0>PP4n%ZP9%3_>!x9{;#yIvLI1)*P+mOHR06v)Oiv^K!NZqvq;afH!dfP_C7Z+nu zWGog1gurGG!{||C(Xx4Sv}xTEUAlEe@4f>O7@mfN+%1SI*o{a+bL8s1h+cCLN$X3H zvFSWgx7Q(QTNUEAloP^B5VQUiqE-`}R~|yf&N8el`3!3b%d6`eP<;M2Hdo!kmeO0; zT6PCJDsE$c*)^OjyMV*Yui~>OP;l-zf_7|zZ^2Ud@ZMc|3?>Y;LC>zeFmHi3=KI;; z9S{cpfGAk(6k*>#Bmw~h<^V@Df(XqYelP>wTD3*rX3gOkKL-0FWCg%rTxJFP)s{5; zyeAbeR|Vnw4RHkKBz(V4pg9g-Z-`XD{LO}F_0f~8kai=*A2-r0xG9i9khm?cgtJ*` z6ObEtgoKx=gl+=I7ir=6IwhFU9EfkT0`Wsu7=E8axL*>FKkijE%vk zm9;p1`~>R<`l;Bg@>$j|O_tS0Im~kuG`r>USyg@;plp=+^gRW|EZ=Gaj{}ICXOAddWes$$j6t*cHnwmEH1?Q<1?9VF2N7C5`1ts(T|{~mv?>`FAHz5 z!LmFHk2et^DTuQXC0TG4zrFqz7x1tXl*zw?Gk@8O59zeYLB^YesE zfm)g7S55Wn?D-WL(CpDMXUL--qOFjz6hCoa`0hOYp_X;Zc^*5@@GsX1G!IeBpc~LE zplnF)_vwXNoH>33Zx0&+p9ORA*=6ZrrXSF6oLiu_mBd6}G0Z6;L%H{lCOVr^n;@DH+f9bmL6{3M-~JhxJ|&O*T~=eYR%JLIh0h&Da@VEB785Lr-+C_;1m zsvQLA)p&d2hX{>KLQHZ25_8vJkzWv6wrIm}CL(?LUKDINig6z=z$cOMNZ!30p<6d0 zcK1$1uiuE7f#I0v8v-9z#QpmYAt_rtz+1swTBn6@AQZCiIo+h*-B zeBdwwLjV@|2f-^K0)93VGzUk(78FZG6aK?7dgM5CYu*Y&TeLv>j1O=kHW)RjL8vc^ zz#k44;fJm9_;O7MekhJ5G{@ol^$GaR#zew0_qQbB=e`^W9AI%X`YSOSqaPmcvKLGCwVf1hC?ld{XD~gFAC#P7Z(UyuXfn! 
zJqu5>WAP#}0AHj8;Hi9&l@WpG$GIh@e!z&AOB=DcwHJ~tkJ+L(_o)}&##y~S$$~iyoVW@OL>0*XV?9m{-k7-AEr3T@HqXJls=}) zK>D+i{N{P$lZ57KLi72`gSc9`i+;Hcqu(B=0CrnWBECMo3)c%`aapFDQ;VEf(2(Fu z;FDfjU;g&NBR<#LN#STHOvAl3d1zRXj@l*3SU6^=8a6Sib4P5l%q7HHP#)%s$_Q>F zEvSelJSPTWWuOm+cI~3>`QF60v3|v3q$b2;{wE7CcaAsa&b44J{b9i(8+^R%un?F7 zyn|u)3nMfKsB%M~xo@kMSUGn(Y8@8T23T+=%8rP!J(2y%7=(@+sXheUuxJMDIs`Wf z{5NE=k<<{vse`~wkfxogg+sWl&+^9~_od?hE-k`e_9dYq*AJHow$}*F4NLN{nb6$3 zMKdfK_9iN1R*@td9%hCUd^I%RB{V-w48?9vmBzV!1RmUjpNOZ{=_T}0tj2b=^ zK|#z@fo7gtM_9gCQ>ma?wsODsCXU0552q@~78@2gl{iS9qlS|(&N4isRysr*Ga%(^ zz`Dfvxg`6BzcPJHi?KAe8k$0>^f+aPW?@l}ykNZKTyy=YWrk+P$>j68-eX$m%c_T& z;iw*O{x*D4q!~z;7m_9qXcn2FSzuMeuKq2++!&g5fI83`2B;)~%YQCd=Cur4GRc5u zRS-x-S?H9p;w~e^lo4R{>o-PRq^@DcNj#i|mh^a6Rv z?Q(C1>8#kyJwr}{J zOfN)2#xkVlugA1cywJRPYfO4?I$`@D@{3Pm)CXRe6`73WJ%+N6>o?_c(#cY+EWL~!75A{c z^ggzi-^0$zn>bK@0ek2R8_%6Y*1lbE6f8zW_F{~l@IEWSwuI)km^gYOeEb5jz!D5^ zLbI=$Zq5nKK@o5en(dAl`1sh-ulE3SBm7V3+zD$fUO1Z=g7cXUJR}VN{qaJ4yD=2M zAy9t5DGuKii{kO!#&|+A;dyfgo-dEXqug+OSs10}JHDIgk9(O`f`nAsRE8Dz)jY^H zJj%8yXnvYwBQyu7<+f!OjVHN;X+pDn9QcCJ{30t9PYdF(aw_4ae-E4sx8g~9Bwi3S zo~H)jVX7VX((gMNQi^`?B&)<+t=nW%E*b>Ru;R;Tm;}s&9CX)2EGn z9%g7Rb8hv2QZ9%;iFA2qmQknZI!NNK%RmpQ*X}1-mmDLc9KnUs-8g=5BSwwX(3~2u z5Dhz5;$n6a!Ejurv+V^J8&7FAkRR z!_*kt0j*nM^`cp*inF02)CZN}{-}-$Ky@_XIWZW!qk=KMcQ19%_r{IDmK6oaPff(4 zdGy=a-k3GVpFV2Ag85c>Ees?m2gB+e3cEj{+1rM`z4|L?exrGFES^0B)xmaD+XHbV zEC7MSyCHSvSOiZPjln!;<(!XjJ3U18GT%rE!mZSB+)fQ9l*)8-A-I0AmGKFL~g4j6e?i}7Sd)7?+*Z=aLar?#%+GGhXmJ?bDqXM?FK87lLO_I9U%^*^C zi~We5n(HC{=C#5hGEu;J+N}Vy^fF(qsp36Kafwj;wm`Fuajw6vhGbl*)~ikmFw6ez zX&++TsNrfx58cx(VKYp|LH980__!s^;09(j34`H_F46v7GLC8fsN-*AxpHg^d7&^d3~4CM+8jxeI2Tg^3haN<3c%!>bIF zRtYnjI?u4`Scui0C9o5kr9d=~Q8C8Gx+I|K4BwqOMy6CcWpv?7<{B9-ay}&9SGxM; zVYqrLD0BYk#)5v}?le$R_(=iogcFhU80(&IhAr`O9yeqZ%CA%7^Hthdplj%Q6_`cx zpMpYpKFa9ad23FG$N**y&APHMBuibJfzX_{DtzNN@1vgq@6R|lmbq8OC7y4@Q{;}n zp5ftj^2{IR2jeJlP#^C(pP?ELF8}d4$bu;4=T4!NDx=C}hyo>(u2-F=ct3GKH&kjF 
zi{yAn<-gdlIHiE8XBy1U%k|t92=KSl*(F{^{Ka04Ik(l6Zzh?CXuK*nAiquqG|Q@- zuGhe{1MRDP_98xinbcn9XTMf=9;L^QV`z^aXxqFcMiH71S6#!ohL<>d;|1#Oeg$h} zCR+7)3uCAIA#UjoL=&9T*6hZDury5b2_r18Msn6tq~$KfsIl*%S?eyUr?#;85Oow| zK4PUCo`H;=N0GSW05bL-M%a?o_}JHuPgv1f`(6}oomf_?*s;=@@B5uLUa z;i=0Jp1vB9nQIZ3w;9RH_aS5bS>$aykGvh9B9+_t9aRY1aR&BX$6?=l6ajk?SntsJMx}gy#Ka&|uoENLYenU<;0gBQP2vj(E(NzEBP2kdL@#y!j?} zg*Z?`2tA+cz}MTN@vkS7@cnvU{AOb~zA8?_*Bevu%|`hEFpR*Qh%bsWaknrDPx4~% zwVDCLhKE_K%+vi8INy_10I9nK=KItGLi3}nKs@30VV)I_2+m*0{K|w5f$4|&7ChsB zpXW#8LYxBu-CASG__uI9mXMtmLTDyXCk5hJGC`Zr@jN9MUsAsz6nvi*i608$@!Qqe z_-Su3_6B~4kV$=LBLc^*(V|6Xv~1ZKty;E3^A^p~s#S9}N9_l_+Tr@rAT|nu@ztsT zd{!6&pSOCUPrGJFh>OB0+W7H9hfs3zAj(f4r60=7LgFLzT~}G`mbvpn*|YL}V>TkE zczq51Lzic|Y}0ToVwonkufmhE-+%z4YRkmmOcrt3&=HonJ@Kaev zJI|#CsE(y>Khm7q{X9aVt$$jf^0ChT;l;N54B-J(F%NnjwGQG*rb{ zQ5)ukT0(P8xDC}YHdMtsa56p&?+@s!?m20Ae;g=E!p59r_%8Ipw7FJHpKrswxjyik z=Z%F61F*n599~}Z0UrmqR=nAZb|Lqn(mwnM=ORK;5gCf@HXm39_dx6iZ^Jfm6#Dc3 z#dD{kAvaWgTvt!ntxpeCb2G~v&9?}l4VhttRe@%I{L|SS{GT;T@W)+gc)*6IEQ4L2 z8iMN_Wr~!to?0N`2J#AUO>86T@+z;QWB#{2)PwX#^`^7Rk&S zYTPpc`*sq;l`+h!K`OB9L;v!l=x_37C(Wv6&yYn|?#NVg^qa>7=ldxH?3@ssrTO30LO7s{&$%oVx^*V)sop_#TM{NQ05hOJV<874cH zxuN)Mf&Ez=(q?7(DDRkk@&nov(dn&zp8lad+Om&Zb_q7g~NN8`N!~mCp7B^@v2^pkXm_0IlYXJ z(UM3s1tJXjXAj571KvL^UOka~YPAE;nSpDpwvDhGJA2hD>Q`3YrC$4H?Hos^_JV zBrf7S1~i*LG*dD?6D^+alKZ%dy8tAexa{mnHN9R_asjlukK~E*ob~fl&rm8}XWq%w zbn+@yUa5)%;$`k^DJ-f^pH$E+`>(`XUHhus(9A-kQ5tlwlUustm3(FXJDuGVnyJ46 z%}qU5qlfy}u%x9 ztuJu)<_lEbe1=)Rp=jM>2;N^9g2ZLp5w~P3p?MGHg`~naA_Hkf>yc5o3JEDW=-q!9 zT6cL9KEa7twd)k})*Zm4*$ym<%|+I(V@Te90NHzvz@A+Q-^c{a^$ozg?@vRER&CXy zCcS(2#=xN?F~>I;Fcs}Y*K5~1lEkh11DvbR+ucgJNya~)!LmLp{M zDZ=tm`0qaqzy14R*|!(Aefto&?*JTx^pIT#k#?jM#g(_P<7@+Vmfyj)ifhP%8fAL?XUj<%i#Hio-XXQ}Okt6ns-0 zf$uhl5uoGnXhjn4n$dUnL#nN#sFB@B;c95W&0MPi`paekW0ZJHfl zrUv1=^l|ynZ!KoIHt>M~|WO_yPLjG3}2|U({tCZPi)7 zh&(E6v1ciF=ILHy2HMyoQ>Es1l}<&`dfRcDb@>Jo3pdG4H9DxbZkPDIW6 zBGhF?;i?BT+i)u>05=l|%L(2Jns3JNxB#F`Jx9Q+&x}O&^eJi?cLCarIUk`a)`~j$ 
zXfA^A9BxNtOaN-)>?lbH$47(YO;0n79?%o}v%|2rAPHes3ueu;V&*~z=Fanj*8*QP zL}Q+J1QrQ2dj(?AA`ANUepB5?8jI1B-@}QBFr11A#uh@eSHI2(pY#@jr;Now{+~Z* z3U1|v;#RW1>K(pCz!zw4kdNwQftIXrG)OOQoA<^%Rm0TvwW4gk!$#D$xpH<3CmuOUS`E)M!liyRI`2P&Pgq?dHGIF;PV2v zWk(NV@~DycVEkB=9y_9jXM8aE9asp>_18beg_;UnI$w>?u3SJO%OL^gRZEKq%+>1q zA-zTeCHCuNBQp=_IzyO8>Js{mNPLj{?s~-;KgB>MDZkU-r+EUiNW$0mG3q5%H#H8! zDK{)@sRiPc-?)KUPkYSxIO|`LtKK%kadMAYIv%C3hh}p`jDYn6(V1j8u;Ii4%`)k` ztEZU}6ks-5C#s-eX3I<&YUTnKA~O4e24*HeK{GR$&@A)7>dM@CKZy^eGF{@w1jrnH zMo)8{#DU=%Lo@Uk6(=2{ zrl8pf*DQ&TyHaSJ7I$3S(v2rA_i-cc3YwdM<;K$RES0A5jNQ*rL$gl1K(f3}^%#dJ z*C}XbCC#l1#3>{0?p|`H!6hAk1DZ8B>-bB9LdS($$#ZUXzKX;Sw6nhg&5~vX&5WZ9 znig}`!NR73J}reh_itYL6x-IXL(g_?(57W; zOq#wB$3K0HQ}r)#rr`z7-MB~SU4T|S`eANhEK-;4K=O)RNL;ZKpEyzwk+}-#Ma3vs zu@PaBDQMHV7rOKrikOV$Shf2M3N{?V1z?3v;qN1D-lt&8!4MgkhA+DQg_xM zYDX!8_MAk(K7r^OkT;{Axs+=`Ds2}JXj(sR@tZ;cpDV76nve*mG`4sV-=W@}KCf@Z5D7J=Lb+M+RX z!~}F|+YY^(H$(X3i8!7VhjW>6s9hX^pU)KH&j;f1-O5n>t~eRrY)Ha48x!!|#wfyN zG+wNZ!rjF&xK|KI*p0+Xg7FuGYD zf1eSq^h04B{<3c!Rxg~1@OS!S(fB@USVniIZ*;p>nDtgaB-s~W_J`wik0!Ok`vd%L zcM85-Ysd5T(MX&;1cPMk>9naRIdK@rjvT_76ZGQ~M^SzHL?dW6e97(Gp3p2dEgdLw zo>7q+b&~j+vn-H$#aRxk->Q5NzhyY8L&OuBRk<(VY=UO}(XF}c(|)C)S$x~GesG_s z%S4x)od-sG%u;2oDu-FuiI1`zlCpZyg1HJ{XODjyHR~1=nxpY4LHR~3p*hZq`h-9N zAHg}%4-GP|InEb1qWy6r!A|H4#XVVyIwuxueCDECE8VzBnN4Vp3q(yAp*b=Db&(E2 zv&{+35%_qJo@rx9ug=((9*7gmlaU=^$D)NknClaac?9GIUN*{s`KqTm1eQgCSh&C+ z1N#j?2l}o|v*W%$#cOx#bg6M6~FwATmuF<86sV>Lrfz}UgKkP?8$ zc~SVBzH>J@1osHU_hg7hLKyB*(%Y=(VV0p7cHH4Px2apnK?L_;+-7?3WLj}MQ=r+G zz$}ZF@Z59_&G!k-cNH`fzEeZ=aHbS1?&gJKYrt&0-AR8$`R?RN_|O0PkNDm9U*puV z!vsaz3MC%{*3y2}G;Yoz3R;>SjE=UZY{bnU0-m(9(l z;W;v7wSqpPrgWpUu~pSVlUqtpD$R!~ z6}K9D$-+%ZSMsXRVIiTery;pAOXA6WnVmxtqwXzna7sZl)f6-vz-&OX$SAlZZPNM| zWBB(#vnmWJRY0oq7*D4$n?c$0p-2I(Wej$g-gwm#QiUz!V#;^zC~g@rWtRDWx!zSE zn;=SqlCf`2yS}dRvp1(#!Zd^NugM6vvCNf2Q~#?kSD6P|&fj7qrgMzAzYdz!YKW%& zzXr|XWb$kjG_x==dYKg%Q?G+&Lq`64z%#>Dnp@{p=%3VRT z3X5UVc2$}Q$TEY5S&boB{72`RTgHEK?rWi0@>+nI`65ehmzQ&2#>Ekut11c2Rh6hE 
zJl9{nirlnRbZOlJZQ8WOY`<`vy!HY|uYZA4x1ZzC#VZ*8!TV_0vk$yNlaR7>C(>6R zKt$dq%n!*#Qqg8Y^F}OPvmL(Ha5QVx6+QZoME0s}ShMdO*6u682MZmrXDmg*{*%Zd z*rsgS1xs=kQi@i=;)uY2LBm+-$e6$uXy3If296pBTWl60v(_Utbv?q~%r2jnEvppWwV_4-yU>Lvbac zx#AXq`5Ly=T*Th8YV1378XLDCLtgO_gl25Rgb%z_CD^P58xd1Kf`4Er7Wf6iJ0J)? zfkE(dgs9;e)}R;!1jfK&k41oYC`Ju@8$DaM#Snr@&g>Z|Nl8LQRx&Owi^oqVi}2em zk@#U%EPlT}72j=$$8R^w%o#EGYIQW8EQ!Ls!e|0PG#=-K;3Z-B3s&CGv!%C~P>?C` zEWIfbm_HK(lZJQyV4TxIW|0!1XXHT6( zrBvSB$}kI&`E8nqob`lEM_1;myaj&6#>Kz1uer)KC6)`S%yX83>X@AGF8|GSg6j>V z6Pl$?G{-^8DIGukp8>nN^T>H$=Xt73E-c$9ti&zxPKB zMx{V=b_}7}PH46gnga;S4m9wYH6;5J010~W{-}?(5T*%)>4e5K0wS;3737azZCWd6 zj+^lTsuMy`9p;1ckv59IV;zKM2hPTaVVV<~`*dlCwQ*iJw>lB)2$eRU1z2baVWTk| z3w*<{&?giNy~E(+6$I-d8y3yCU||12Xxp-ddiyZzts&SNYR7@75UdT6LBt=P#Ua5ImfoGSBo2jM2|O_t5RoxwC^5}eZs%`!$g*AM?vU4;L3VLAS8UpDS1 z+weeU31L3m&xyeDS?{7J;~Ow`ATBU0f##P*iMW>{fb1OCEU=uQSNc;6m+;y<3|q!f z*E4N5WlXgUyCAUNq5s^@vfy^M1$Ws1z0U^e-LybLvK99j{@qkr%!GN%Yaa+S(_d~9 z@>ec=A0s=qQhDcZ@xlN2@BasW|HIdO_9tCKF+8D}Hmmqd08@9end^DYC2l*_qU|bH zqK)v{I_mhoJs8Wp?%uvFwya-`OLbN7SvU{&KpX0>U&ZLe)@ zzhhX^yDUtn>a#}oaoMvpRC$j0TSZBwQ{sEvS3_n^*BR_aa5hVT*_i&=&82Spci)R! 
zGpCs30%eA$hB8Umyhb?DSRcd{j?>4KBtLn*<}=A_eQz^38*1j>k$^m%RLc~wNyh(D z5HtnNs;8OGtus~%8z(G#NW-m11uv3gsuJf`;^YC%+&VK9m{E?;b9o4aOzsef7O{xUZcm%_LX2 zjpV8vc#gbfFg{o_WB{gFlD8gVIOG1e^)wrB+cY$*Ss|EiGxSPtbD06e)L-psHtnl0 zRAI)v`;|1kB2%37bC5=mTYn2Qn|(;6Aegpf?8`ZFZYga+R3`g;4(hEAZCQ(XL?07q zR*MTcRY7pBt*TJaeB<&J1pE7;W6Ne}+qymMQCT?s*%vrWa4vuRHFi{0qvzPS(Q(jQ zu*T*hdC6Wv^I_OiR>B%rgsc_2kh6T7g68)>norQ|gtx}MkCi))W8I+&6d$gDJuwHi zv;r(ScoO;hk0H5u8^UrIBX9Lac=_AWrAtpWp1FC;R_NIC4UC%nF&s%n2+J%+Nctv( zrEjDN&)FLgoU;KIQNb3(Y&edDU8M-!aS{PL55ceu6)x^%~!g}(5!g~EsO3k-vw zoxtn}h1C(MrkN994iAWhb*2Rq-W-O(t=eKzhj!Rv@k32&I;zuCaBW#CzTKULuh#_Q z_v>QuV{t5gyD0*{+Z2VbS4QCl!S-oU5*`-B5PC!LEXSdM`3qLypXXQz&K82Rg>bBg zV@MCPz_au+KguQ`bN{j0*T)2!A7uvNMP4+n#{?pxXGbKC9)g{gxpjE6zz*Ym2Jk4 zW!sVA`>}dWF5^oV4eyI@b|mAs8$$3+aS}GzW~yoChV|=>eVd9=OCJ^=ET#QdpJMrV zivCJJkX2KSAz+4I8*b)ZI%VAZAA5Xru%E+qO|} z0fMKzgUSSga)br7k%6cq)Kt_tg%?+7lB0tXElq)BCDFmd0_v+XxXAU+O%kicl-ClYC`iihQBPp2O)&oV*ih^ z#%CJdY2N~lQGM|l!@ZRvb2B?opBAd%`F6UU04pB`%KXVp8=?7M>z3f(2+e;cG~Y?E z;y%-OpZ_(aIZ*K77}e8k9XkLQ)7YTQjl`Er5(u+0rJPJ9=LTh&gIP~0XHlyV%CL~@ z%)cA5Ja!fVTjoEf&)v-p#63!eK*_=-ciHHaUS=0Gr!q|HKChJz1g|h2+0({h2yI;Q zE;}a!!2gBceEp2Jc$(!apR3G~%rdu5hGqyfyLy{xmt_L6BGyN$o{{=YY+Bo;hZHl> zM_7Ldh#%j-7vqKxRg2QBTv~)H7itk0;E%M_WL&;bhoamZESx(BiP5pDzL0PgFNn?2 z_VxIEVLLV!T>xy@viN}1H!_Mqd`*0=N^L98XlORx*vKRi9^b;z2i3T1s_}=9ZfJg$ zzdZcaBr}8?s^L8h&UJ@jG^E|WsNXp0aP+-RVi?A?&19H^5+f!~EikCJo)YJ8Tu2## z$h9?$gw7B{%w$zaN#+VL8{PG~w^@nTmGgQRB&#j=dCZvp!zctj#e^yFlmdzI<28mP zUh1{9grg*d(W~Ix7#wwaB)+Hi;1el zmGr&=%;l#&p;>`5^F)C&ZQKmaGBm@0W&!4Ms#u}d^@%4i1@VS zICJF%j$iu%<&VBW;<}A!)~y>lzcm`R#HGksbpjcLW=m`VA~M$?chxQ|UcC!N%Qxe# zk(1D@RTsQBV-dC-ufp1+RoHgA4uzX`!;w{p6-Un?ckf}uui1duqUFe6RgCE}^rLk< zR=Tawx@~)O?Kcqb%veB3UWJg1jnr0zW^5)vZ$xMop}Ala0*ki5H-94nm+eB#j#CKT zc>>m*2MEds;YVopXN4lr93apvpTBeqU9BeftlBe*htwu2mwffnEbiy_(f8EP$!{hV-JW03V3&P7cgn_R!g7F)f z!DDG0{71flkQrm}{)pacc*goDAN*3e4L7&t;hkO`(6e=O^y2Ztg_H2d18MlpYR02D z4US1~p=;}AnDOz4s5*HP7wAtHXoKoi=?T^|r)bx-VVN06##tI=g4w4H2v$AKthmda 
zJzi4ZXlT~{X3WlHobT#oR^^_Q4Y~~1;qqMPwX#)H-6_d2qmIzwy2UUl1#HV z5upBf?Y2yjMsSuX)9y+yvp_R}S$ab65=KwQ1mmqv?Nv{+*Vs`gjSog;6rnlVuAsR} zpgBGWr3q1(HFU5Vmn$E7#(2MvQ!ApeCodGSfnHc-3B`OzA{N-tv)r3pzlBf(^F4-tY;eTBoJ8ht@uv_=Kp$eC4SkRfm?A*r!)+tx1A82Gj)V& zs09$3uVe<|LAI<;8HM`<@VkiwUE1|Mo~QCAiTRYw21c?k^T?m~48&*2HU-WNnX*_= z1nw7v;%=S>XPMgZHbML@A^tualWI&e^Hg+KwkZJwd^@UnpQQIkV`w|3opIT_cL$;Q zzvIQz+qA_~IA29uqW!s`SytLpJl+ z|8j4qZ)gdKI(wIu-|?EWp3tm2RU1LGArmm`G}GP{G&dniV3=`?08Raa@bRkj3h?eoe z8i3qbuL84(57a41p!zbiQQSo4a13DkYta1HdYF~uzKpYxFUEK_bx#$ZfQ}QwTrkD| zo$=QXMha;|`hiQ~s={mPo|Ms2a^J^L<9&fk1C-4YXJCAsX?LZcP$u!!@iNJIUn6ZI zW8X-7Q)(PP<6bJ?|F=Lh;ql)I&E_y%lIP-->sh(3p%{jY2`d*#m^^0mLN}Hz;;jf;e*hudkHWU=5G;ES zz@Lz8*|!Ha!gHYd5O5zHgk}e!Ip*L|tgXC(U6r@7z4B9REw94Pqi1pA_$3r?uS9ea zfjemp`n)w>LCc6yV-R2uhn0XV(5%KW2S*}+vhlY)C{8fyW|nkHGve!>)&95LTYb>umz_W5U8S!ot@XLHH^&2ru)( zaBp!m{6=&^$h47|{8mquFNGoA_}%$KD00j}uU2eabZLvBZJXhEqBs7$HyPhaLSnpnMP0`R{zX=r=UML6o@TjBXcm7mpxG$DjIv3UJ4TsLzbZdlNlB&c z?6Bc6qfB$v3;ZpMX{onErm|1_ooCph#&w%fUue-+DCvZ%b5+{zSH{|(xouRY>bOXp zbXj0l6L}3$xp(?#2{RYwcrLN9|GPc^xc->848*S zb_D5L@*yCh?qLeyj__F;AA$*QbWuZUW(*&K)7&qQvMOkGt zp2+9p@i@T7(ze_v#01;0&=QV?jwCD$jEA>165cY^oR0$*?*PKH9fJqHMgMJ%7KG*@ zJ$oQGC=e?n!%$$iB74qz*yi^E4%z2m?wj2(r%y*z%g2i;R@{&wC&|I6XZ&v_2t?b^ zkQ#*Bgy#E&HvE?hEAf9`UW4ClOT~?NKQ;^kaWlprHwnAB?~hc&Gwfpr;xidak->&! zrmWUS;7+#UzAQ1GBzYrCb@N&R^W9`0g0C-b@|+uKcHAVa-;$vr($g#-CuS1d(*>$! 
zj%KUs{l2H5S$diouJk&~M}y2`f$fqgKg3KLfngn5p<}C7IDfuMLG$hUE2{A$JBJse5ES$dj{H|%PhFzXK8Ks4@c z;yeY=dW3;=mI?=IXqHh4PTmwB;q@XJ%lt}DGygYy)-3tY?5`p@W`<=$t})UoFE!h_ zL0?PfqYhKo!Hpz!qx0ULi<>x|+mJE7(j_{p0f-(6a7IRCe1zf7M9+-WnaKi;VU+P& zf#y=)tBjFVOC?IrS`#vZvdH54#qJ zJE23XPMA1hIznR#k-mHfmhU`{jYmqb<8&p~9W24(EvJ#Q_9W7l?m%4D3T)nc8e4Xs z#GV6Z5FVb0R;|0B_kiJ;9~gn+W0hEYq87W#uVB(#A1sPX!HQ#NvE=Y6*t2EmMFFBx zaxq}Y2m(S2v~JxR9lLf%pP}R66`X?bOiyTbWN<&X7=cBb;k#rjd>IGJk{$3_ycq$- zd*Rr97y)|@=yA+@_rlJtgImYGy$IR2A3=L}BXZwitS+x7$lu4Vl25Un;CA3-6^@+z z6h#})B6P`K_{J|q_kJ?Qx*6VicM9wTWPe)(p;_i-jzK_3Gy+4S2sSYY364ipSTZd0 ztr*bp4GbVO^y76I3#OwaCmrXqvQS!(i`tdxs9zU}FN*!~ZRyvOt<2Z zKr`X_C1K%nS#~=&6gToiuy9CwI6iy}69($pGa|fZChVQ_vwI} ztD^DawrG66I}M9{r>WT#`u6FGy?b_{w!A`tp+NHmmOnM8SstGj+ZG?7FPMDJ>__f; zfek?g%#}PYb&sy-_594vGSq4J8k+U>s*IJgPkh!amJzQ1O)}~|qu$W3o(U?dPL-Z7 zE%}Py_xO4V2%T#MJdqx8rLJwv9FQK_MB@!PD(DN@(9XtRh1e#+4P!VZE zO;j+dVr{654nSpcG!~8>t$R_CVD#v`yGfd2=^!6z^pzSanM6QaF+ z9q{uBpn~w$kl|?2lF;0?EqZn8fV98>EDj4rZlD$U3qHa&->EneJO@65dt+MHwkVC9 zk9tDw^|%0BjSs|UGAl?T(?!^qp&AYGw8z3g{Fn2q@ZT=4!&e&;a6Lihrw$}M2jDu> zSTt>{>S+!dGYIu;EK1`+KH9sRtRY!+UzA8#P9iK*0?xNn{M1t2YA6VGllPNh8rKQL z*9prvWg($-i|T1^;CZqt;2rtsPXRN*IX#g7IdC&Q6sIEmVIMURL)tXQ&_2EJ=)oQQ zKmX5v#m&zys&ZbSSzxh_c2w&en&AdFfjNO@-9aNZDfZ_EKTqZ|7)Ja~fHOYe+m=&zP>T?RzO}=E7nP&|NG#e90=+OhZ6W0aw&iJWe zP{LoT-hDM}k>vX;i<`U_m_>X5;=I!JZ%%|OGL##dCG*@FsJrg9^tfeK){I0cjiI!Z z0K$ltIk)A6gmN|KEek)jr34&YG73g54W&}JxeFq9f#jAMl8pZwr@?rrhTQ4oxu(oh zJ;S_S&%hv=sWa4oC6OmI3zUeAa5NApF%RUys@9g|z1&aTU*e_XW{!tSmx435%k z8JiKz-}cP4aO4%kR)2)NJzkCDC70Ci#I?r|ylO8NMP;Bvw>Q-M!|#7EjnFKs z|H+)pv9N^35}IRpEC#{c4+~91Tto^MOq+{7t=gj((=noBYix*d;B;;V&SdAHq#z%4 zYcgsDUxfuN2rU?9DMLK@7bO~M*F2;kLWIQ1FKFJnP_Qj)oJ@4}4 zJSzd(h9~)i;rt*xDG26oQ4k*Sx9A~%zsQTgi>xrUsL8WD2c8whBkP?Jm@&8)=8o=% zy|EVjemn0YW1e$lRt_7%I1rzwSqaYq%oaS#BtT~nqzMPl(qwUz5PV8~GNd#7-W!Tx z-PTQ^6**nW7iB?-UZnSeXR379dgznX$>&diTdCw8Yk zhAPJ;+T~^1eT}m*XsmW=hGvmbZn*0ODKl6(5}ajrBiGb#&az7U-+*QVRz<3eRX!(R 
zVo1slT@JbG0wpP%rM@6U&^KyXPHO;o6`DnV9hw<7_dP(_S*E&xnbIG<^MC1SW|>PZ;aCUC;%q333_wkEkb>rlXbYh^3ceG^si8M=?&#qo5EmYfcs6vs z?Xj3+je{4#*=h@iUqBeVtda1t1jEYcfhB7QjAoAU^8Y24xZR)s%VHVl22uXs%BPz^!CThD*peXF~Lo#liTmm)7Io zF0RAF<>9y{Q|=|%@mZn`*9fgE=S@-295QAw8nT0MhxwtF7N_nL!0*X6K}&w?WyeK? z?-W9_^fIRft9hSqGQ1nK`Ri&aZ{{hXxq*#HS?*il`Hp-Hm>xjM)N^_m$!FndH)MC1Z~S!oH}xp_AO(S=`#wN zWuAUnc%@4Hc0;q+DB~xbB10(@-#Gn*$J9K|{H>T#og?RJwsH*(cJ3>%Z6sG%lo~3` zw240`m=-^he4q@!5`JpTv??5_PZ>WmG*^{Wax2o~wAD9dyxypHjWEsk5j8WW`S84v zzPTX-nl&Wq1b9d?hLQ0UGeP~&->%k>mH?6r$tW#j#a(t9=SojGf!Ww(tbkZxx<1KFTJ{VLK%iv+C8N%A{xctnMv0 z%Y42|yr%%Ol3~M4ho`MF4$gcvJ`jB!G^=MJ*NCktaG>M=6{+bP8$>|#VUdNifH3(UA820sh zG1DHw%D+9@wrz`#r_Vy5EMnq_h9xM@3C%LaoD-Tu5|I>{idhq<;>~6))zFL?gL+|i zN(@SJvr&?ji_-i8TwIloyQ_lmACJWnnv?Ob$KvqwsyO^%X+FMPyc|#Sb8t610*~Z_ zzie+c6ysrD5FRQh4#MMt5W;d8o)$(>kxEaf$J~EZ7>?<@`CX!(;v^X1Mo!#MPPoefZ2k_S%m3K-2?qNJ&*t% zit`L}>cDQ8Gj0e5c95yHnxj+OHn985#DviU(5Gu_3~bj7xpODtr~TRZd0!@uB>Un` znY)=FRVck+HC3wjR~DJNNYK5)#%T?GP0gl6o0hjI6|aapa z5}2Jb&k}W>zFNaFNj{2?iH%k;)V0n3%iLc;*O9Dgy0B5QWoCvlGc&W8H)wA$GfS44 z?Xt^^RW2}FXy!6A6xV4uP18Nicc$m|{5^ALdTw{my!ZVgcI=&!avt`F%)n&=f8r!I;yU@GJ}F2+^Mj07H9SMcVL#2&F#0fuN{he`8x!#_clO~w zUfPUDY%t#AIrnm%xWnJ?(I)pqEm0o=#*hDqzT?6RLiY>W{7dRpW)vmc7(z3FH-q^h zR`^4}6z32}U$vY3h3lCmMHDKrZwSnbi+zSWjY z}({&|FV@G`246NPcM)Ps2Er>{F(GQ@=B=FqxHS)o@2ur%G7DReIz@!-WPVy+~bi(A+qDk?=$d`D#&pxyS&$Qa3-R zGlN^7cvC4fnn+jvQt29C7U@b@KSj)(&92jXzB}BlP zTPuW!3^Xh8J~Dj33d&X)BQxhWeSlVZ4ZtGR)9iUbza2E2JmURS7`_M72%0I~^0L16 z)AHq^HfGaPD~_Ta$Q<**l9v46xTo1ToRUDZIHQV>FO52`V$6~U+Hg(2DS7fv<)6o( zMe01yxz_vmpt*T9eVzt3t@l*nsQCNx`In@hOVfhI6IL_!V8Rzz7S(w2OZ3}7vjDdG zXz=1G4>&U(o`#>*W;HZx$?~8A&Z@vL&@A%Wi066mIw=U7gXE?%bDpbNG6>AxAsLi` z<_ol)i+oPie6|A3GHZrFvJaYBm~$)kc)PA1TMA1t*uMit_a1`GoOMXfS%bMtLeZg1 zAGB%{fVOQrqIoHH@4ehy`)!s62KGkwsOQ9g~2d_;k!!9*WkyPpj5#2M; z4U9)d@opp*?1C$Q7r}WKp?L=!CEH-H_!!Y^4iK7;5Sou8vPycJ_akiGJ_K$21d+QB zBku60a2-B@#3Lt=OlVFfG^d_8j+Emk5P#qcRDSgcn+|@BJ)c~~xzk_cT=hd#9H>F; 
z_LGR(egJdAqVy~t9RjdyMIh`hf`>f?k&ZM(xzZ6Gmr7_(fs^1I@5)4Wd=?gunuw2D zw81duL*mRSI9pbLYF1Va#g%9%E62^M4E(swg?~Pui+{RQfPa5H7k{!g7k|2T1Aez^ zy_(TNhD^LE7hn#-H)Sz+RS{1ZCKOjCP>Fb3l}x4LT~!)kIh9JGc&w7=5U5{O@Vt_E z{6z5m6~Xv$&~!wM>W7dSLfO%I<&;dfEGBK5{SP&wFZB9 ztOVIhC#VmU2KMWP)5i|uChc4|;u>&Wp!tlf#>vJf{Y@=tPiXe~o40%sKc|m*{7Zb= zH%GGWp*2GrRL74hH>HfCovV%=*%z70Sjr9~DL>>m{ZPs^kKd|~<{3``ohvo+>e_&_ zfJ9?xmin{78!oqv$|0870?iUP#;YkbTVYwN8rONB%Pa$>%*f8kRQcSVAX;J%#`O(( zxReo&>xs*8D`6S#X9VFvb~wJy4Oig&ga9gwe#r2Q2i$)}fRx_ndu17L&YGlpko&jy z!#-yis?%anofwI_hoe3@0(IF5sE7$gcRp9S@8FLn5R9|1)K-GU&MK^kEks15 z8!-{lh>DIwu#M1cBQ!@5n)&~jF}lZ7pgEvrOGM6{kE*0Jlsa9=SvUiq1Wm)Gv=C%X zAA?mr19&tNKbEbG@i%@fduAflq_HWTza3m|D@1~s|=@P##l^g(-J)yj|~Nd zz(4&H{`e1njY}+VFRK**38SjcMX_@^R;@Z+WemRzG1bFJe5$wBXCg_5M~yj)K56+)Nx~C96Cb198;Nh2v^JIM z+-3R=ONmt8F)lh>){o5NOBV%}nQm^S=iGdtC?5@)de`UwzVL-9T3o4Rz)d8r23Z2z z>hxacr>woMx&Wcm(}_j3j8v`K_d%8_peSqasOo8^F3Oe=Tum^x_A=|C86w{YY67a} z6_*~0A;u%y#xnQ)MKJ@+$5*N~!!)bc3j&Buq#FgiMs2m^^_)B$@6Z)W;JK-EFDK=R zcXP5jh90INM#@4_ZRJ7gUT2B7R0h5oK?r!B{1mC*J^8E+UJT#&yY)QDBO~2o%=