From 9d9441209dd8b09fed19957de4bc0db0f78b86fa Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:12:55 +0000 Subject: [PATCH 01/27] Add PLBART --- src/adapters/models/plbart/__init__.py | 39 ++ src/adapters/models/plbart/adapter_model.py | 162 ++++++ src/adapters/models/plbart/mixin_plbart.py | 109 ++++ src/adapters/models/plbart/modeling_plbart.py | 533 ++++++++++++++++++ 4 files changed, 843 insertions(+) create mode 100644 src/adapters/models/plbart/__init__.py create mode 100644 src/adapters/models/plbart/adapter_model.py create mode 100644 src/adapters/models/plbart/mixin_plbart.py create mode 100644 src/adapters/models/plbart/modeling_plbart.py diff --git a/src/adapters/models/plbart/__init__.py b/src/adapters/models/plbart/__init__.py new file mode 100644 index 0000000000..4ed67ab1d8 --- /dev/null +++ b/src/adapters/models/plbart/__init__.py @@ -0,0 +1,39 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The Adapter-Hub Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from transformers.utils import _LazyModule + + +_import_structure = { + "adapter_model": ["BartAdapterModel"], +} + + +if TYPE_CHECKING: + from .adapter_model import BartAdapterModel + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + ) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py new file mode 100644 index 0000000000..33b2e9d639 --- /dev/null +++ b/src/adapters/models/plbart/adapter_model.py @@ -0,0 +1,162 @@ +import torch + +from transformers.models.plbart.modeling_plbart import ( + PLBART_INPUTS_DOCSTRING, + PLBART_START_DOCSTRING, + PLBartConfig, + PLBartModel, + PLBartPretrainedModel, + shift_tokens_right, +) +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward + +from ...heads import ModelWithFlexibleHeadsAdaptersMixin +from ...model_mixin import EmbeddingAdaptersWrapperMixin +from ...wrappers import init + + +@add_start_docstrings( + "PLBART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING +) +class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPretrainedModel): + _tied_weights_keys = [ + "encoder.embed_tokens.weight", + "decoder.embed_tokens.weight", + ] + + head_types = [ + "classification", + "multilabel_classification", + "question_answering", + "seq2seq_lm", + ] + + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + init(self.model) + + self._init_head_modules() + + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + 
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + past_key_values=None, + head=None, + output_adapter_gating_scores=False, + output_adapter_fusion_attentions=False, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: + use_cache = False + + outputs, context = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + past_key_values=past_key_values, + output_adapter_gating_scores=output_adapter_gating_scores, + output_adapter_fusion_attentions=output_adapter_fusion_attentions, + adapter_input_parallelized=kwargs.pop("adapter_input_parallelized", False), + output_context=True, + ) + # required e.g. for prompt tuning in all models + kwargs["context"] = context + + head_outputs = self.forward_head( + outputs, + head_name=head, + attention_mask=attention_mask, + return_dict=return_dict, + get_cls_from_eos_tokens=True, + # `get_cls_from_eos_tokens` requires passing eos mask + eos_mask=input_ids.eq(self.config.eos_token_id) if input_ids is not None else None, + **kwargs, + ) + + return head_outputs + + # Copied from BartForConditionalGeneration + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + "adapter_input_parallelized": kwargs.pop("adapter_input_parallelized", False), + } + + # Copied from PLBartForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + # Copied from PLBartForConditionalGeneration + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past diff --git a/src/adapters/models/plbart/mixin_plbart.py b/src/adapters/models/plbart/mixin_plbart.py new file mode 100644 index 0000000000..bd02e04dea --- /dev/null +++ b/src/adapters/models/plbart/mixin_plbart.py @@ -0,0 +1,109 @@ +from typing import Iterable, Optional, Tuple + +import torch +import torch.nn as nn + +from ...composition import adjust_tensors_for_parallel +from ...methods.bottleneck import BottleneckLayer +from ...methods.lora import LoRALinear +from ...methods.prefix_tuning import PrefixTuningLayer +from ...model_mixin import ( + EmbeddingAdaptersMixin, + EmbeddingAdaptersWrapperMixin, + InvertibleAdaptersMixin, + InvertibleAdaptersWrapperMixin, + ModelBaseAdaptersMixin, +) + + +class PLBartAttentionAdaptersMixin: + """Adds adapters to the BartAttention module.""" + + def init_adapters(self, model_config, adapters_config): + # Wrap layers for LoRA + self.k_proj = LoRALinear.wrap(self.k_proj, "selfattn", model_config, adapters_config, attn_key="k") + self.v_proj = LoRALinear.wrap(self.v_proj, "selfattn", model_config, adapters_config, attn_key="v") + self.q_proj = LoRALinear.wrap(self.q_proj, "selfattn", model_config, adapters_config, attn_key="q") + + self.prefix_tuning = PrefixTuningLayer( + self.location_key + "_prefix" if self.location_key else None, model_config, adapters_config + ) + + +class PLBartEncoderLayerAdaptersMixin: + """Adds adapters to the PLBartEncoderLayer module of PLBART.""" + + def init_adapters(self, model_config, adapters_config): + self.adapters_config = adapters_config + # Wrap layers for LoRA + self.fc1 = LoRALinear.wrap(self.fc1, "intermediate", model_config, adapters_config) + self.fc2 = LoRALinear.wrap(self.fc2, "output", model_config, adapters_config) + + # Set attention layer location key for prefix tuning + self.self_attn.location_key = "encoder" + self.attention_adapters = BottleneckLayer("mh_adapter") + self.output_adapters = BottleneckLayer("output_adapter") + + +class PLBartDecoderLayerAdaptersMixin(PLBartEncoderLayerAdaptersMixin): + """Adds adapters to the PLBartDecoderLayer module of PLBART.""" + + def init_adapters(self, model_config, adapters_config): + super().init_adapters(model_config, adapters_config) + # Set attention layer location key for prefix tuning + self.self_attn.location_key = "self" + self.encoder_attn.location_key = "cross" + self.cross_attention_adapters = BottleneckLayer("cross_adapter") + + +class PLBartEncoderAdaptersMixin(InvertibleAdaptersMixin): + """Adds adapters to the 
PLBartEncoder module of PLBART.""" + + pass + + +class PLBartDecoderAdaptersMixin: + """Adds adapters to the PLBartDecoder module of PLBART.""" + + def forward( + self, input_ids: torch.LongTensor = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, **kwargs + ): + (input_ids,) = adjust_tensors_for_parallel(encoder_hidden_states, input_ids) + return super().forward(input_ids=input_ids, encoder_hidden_states=encoder_hidden_states, **kwargs) + + +class PLBartModelAdaptersMixin(EmbeddingAdaptersMixin, InvertibleAdaptersWrapperMixin, ModelBaseAdaptersMixin): + """Adds adapters to the PLBartModel class.""" + + invertible_adapters_base_name = "encoder" + support_prompt_tuning = False + + def init_adapters(self, model_config, adapters_config): + super().init_adapters(model_config, adapters_config) + self.encoder.layernorm_embedding.register_forward_hook(self.post_embedding_forward) + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + if hasattr(self, "encoder"): + for i, layer in enumerate(self.encoder.layers): + yield i, layer + for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): + yield i, layer + else: + for i, layer in enumerate(self.decoder.layers): + yield i, layer + + def post_embedding_forward(self, module, args, embedding_output): + embedding_output = self.invertible_adapters_forward(embedding_output) + # Prompt tuning not yet supported + return embedding_output + + +class PLBartDecoderWrapperAdaptersMixin(EmbeddingAdaptersWrapperMixin, ModelBaseAdaptersMixin): + """Adds adapters to the PLBartDecoderWrapper class.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + for i, layer in enumerate(self.decoder.layers): + yield i, layer + + def get_input_embeddings(self): + return self.decoder.get_input_embeddings() diff --git a/src/adapters/models/plbart/modeling_plbart.py b/src/adapters/models/plbart/modeling_plbart.py new file mode 100644 index 0000000000..ed272b3aa0 --- /dev/null +++ b/src/adapters/models/plbart/modeling_plbart.py @@ -0,0 +1,533 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch PLBART model.""" +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.models.plbart.modeling_plbart import PLBartAttention, PLBartDecoderLayer, PLBartEncoderLayer +from transformers.utils import logging + +from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_, match_attn_matrices_for_parallel +from .mixin_plbart import PLBartAttentionAdaptersMixin, PLBartDecoderLayerAdaptersMixin, PLBartEncoderLayerAdaptersMixin + + +logger = logging.get_logger(__name__) + + +class PLBartAttentionWithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + query_states, key_states, value_states = match_attn_matrices_for_parallel( + query_states, key_states, value_states + ) + (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + key_states, value_states, attention_mask = self.prefix_tuning( + key_states, value_states, hidden_states, attention_mask + ) + (query_states,) = adjust_tensors_for_parallel(key_states, query_states) + bsz = query_states.size(0) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class PLBartFlashAttention2WithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # PLBartFlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("PLBartFlashAttention2 attention does not support output_attentions") + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + # get query proj + query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0].transpose(1, 2) + value_states = past_key_value[1].transpose(1, 2) + elif is_cross_attention: + # cross_attentions + key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) + value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) + value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) + else: + # self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + + query_states, key_states, value_states = match_attn_matrices_for_parallel( + query_states, key_states, value_states + ) + (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) + + key_states, value_states, attention_mask = self.prefix_tuning( + key_states, value_states, hidden_states, attention_mask + ) + (query_states,) = adjust_tensors_for_parallel(key_states, query_states) + bsz = query_states.size(0) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + # Handle the case where the model is quantized + if hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + "The input hidden states seems to be silently casted in float32, this might be related to the fact" + " you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout + ) + + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class PLBartSdpaAttentionWithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + if output_attentions or layer_head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "PLBartModel is using PLBartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not" + " support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" + " implementation, but specifying the manual implementation will be required from Transformers version" + ' v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when' + " loading the model." 
+ ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + query_states, key_states, value_states = match_attn_matrices_for_parallel( + query_states, key_states, value_states + ) + (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + key_states, value_states, attention_mask = self.prefix_tuning( + key_states, value_states, hidden_states, attention_mask + ) + (query_states,) = adjust_tensors_for_parallel(key_states, query_states) + bsz = query_states.size(0) + + query_states = self._shape(query_states, tgt_len, bsz) + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +class PLBartEncoderLayerWithAdapters(PLBartEncoderLayerAdaptersMixin, PLBartEncoderLayer): + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + adjust_tensors_for_parallel_(hidden_states, attention_mask) + + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.attention_adapters(hidden_states, residual, self.self_attn_layer_norm) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.output_adapters(hidden_states, residual, self.final_layer_norm) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class PLBartDecoderLayerWithAdapters(PLBartDecoderLayerAdaptersMixin, PLBartDecoderLayer): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states 
(`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + adjust_tensors_for_parallel_(hidden_states, attention_mask, encoder_attention_mask) + + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.attention_adapters(hidden_states, residual, self.self_attn_layer_norm) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.cross_attention_adapters(hidden_states, residual, self.encoder_attn_layer_norm) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.output_adapters(hidden_states, residual, self.final_layer_norm) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs From 
d54d0b2ecfa89200857e42d1a44682a50fea9d11 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:17:31 +0000 Subject: [PATCH 02/27] Add PLBART test --- tests/test_plbart.py | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/test_plbart.py diff --git a/tests/test_plbart.py b/tests/test_plbart.py new file mode 100644 index 0000000000..c4fe5cfa02 --- /dev/null +++ b/tests/test_plbart.py @@ -0,0 +1,66 @@ +import unittest + +from tests.methods.test_config_union import ConfigUnionAdapterTest +from transformers import PLBartConfig +from transformers.testing_utils import require_torch + +from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .methods import ( + BottleneckAdapterTestMixin, + CompacterTestMixin, + IA3TestMixin, + LoRATestMixin, + PrefixTuningTestMixin, + UniPELTTestMixin, +) +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_backward_compability import CompabilityTestMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin + + +class PLBartAdapterTestBase(AdapterTestBase): + config_class = PLBartConfig + config = make_config( + BartConfig, + d_model=16, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=4, + decoder_attention_heads=4, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + ) + tokenizer_name = "uclanlp/plbart-base" + + +@require_torch +class PLBartAdapterTest( + BottleneckAdapterTestMixin, + CompacterTestMixin, + IA3TestMixin, + LoRATestMixin, + PrefixTuningTestMixin, + UniPELTTestMixin, + AdapterFusionModelTestMixin, + CompabilityTestMixin, + EmbeddingTestMixin, + PredictionHeadModelTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + ConfigUnionAdapterTest, + PLBartAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class PLBartClassConversionTest( + ModelClassConversionTestMixin, + PLBartAdapterTestBase, + unittest.TestCase, +): + pass From 17a585be86bbd2b36637f8a36900ee7e08a8c5aa Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:19:31 +0000 Subject: [PATCH 03/27] Add PLBART test in /models --- tests/models/test_plbart.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/models/test_plbart.py diff --git a/tests/models/test_plbart.py b/tests/models/test_plbart.py new file mode 100644 index 0000000000..7fbbfc38df --- /dev/null +++ b/tests/models/test_plbart.py @@ -0,0 +1,12 @@ +# flake8: noqa: F403,F405 +from adapters import PLBartAdapterModel +from hf_transformers.tests.models.plbart.test_modeling_plbart import * +from transformers.testing_utils import require_torch + +from .base import AdapterModelTesterMixin + + +@require_torch +class PLBartAdapterModelTest(AdapterModelTesterMixin, PLBartModelTest): + all_model_classes = (PLBartAdapterModel,) + fx_compatible = False From 3d792abc2ff8945671a3b11b2e90ec2ace958aa9 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:30:02 +0000 Subject: [PATCH 04/27] Add PLBART in AutoMapping --- src/adapters/models/auto/adapter_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/adapters/models/auto/adapter_model.py b/src/adapters/models/auto/adapter_model.py index 2d59c6da44..d3724f9153 100644 --- a/src/adapters/models/auto/adapter_model.py +++ 
b/src/adapters/models/auto/adapter_model.py @@ -11,6 +11,7 @@ [ ("albert", "AlbertAdapterModel"), ("bart", "BartAdapterModel"), + ("plbart", "PLBartAdapterModel"), ("beit", "BeitAdapterModel"), ("bert", "BertAdapterModel"), ("bert-generation", "BertGenerationAdapterModel"), From 4cd1558ec95b1faad806263037c628b238cc6d5b Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:43:00 +0000 Subject: [PATCH 05/27] Change setup to my directory --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eff6de831c..324c0d91c9 100644 --- a/setup.py +++ b/setup.py @@ -150,7 +150,7 @@ def deps_list(*pkgs): long_description_content_type="text/markdown", keywords="NLP deep learning transformer pytorch BERT adapters", license="Apache", - url="https://github.com/adapter-hub/adapters", + url="https://github.com/FahadEbrahim/adapters", package_dir={"": "src"}, packages=find_packages("src"), include_package_data=True, From 893e55f089859224bbe1b83f71bccbf4c14f28f2 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:50:31 +0000 Subject: [PATCH 06/27] Add docs --- docs/classes/models/plbart.rst | 25 +++++++++++++++++++++++++ docs/model_overview.md | 1 + 2 files changed, 26 insertions(+) create mode 100644 docs/classes/models/plbart.rst diff --git a/docs/classes/models/plbart.rst b/docs/classes/models/plbart.rst new file mode 100644 index 0000000000..5ed2e53af7 --- /dev/null +++ b/docs/classes/models/plbart.rst @@ -0,0 +1,25 @@ +BART +===== + +The Bart model was proposed in `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, +Translation, and Comprehension `__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan +Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019. + +According to the abstract, + +- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a + left-to-right decoder (like GPT). +- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, + where spans of text are replaced with a single mask token. +- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It + matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new + state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains + of up to 6 ROUGE. + + +BartAdapterModel +~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: adapters.PLBartAdapterModel + :members: + :inherited-members: PLBartPretrainedModel diff --git a/docs/model_overview.md b/docs/model_overview.md index 58ae523b43..3b7949de11 100644 --- a/docs/model_overview.md +++ b/docs/model_overview.md @@ -14,6 +14,7 @@ The table below further shows which model architectures support which adaptation | --------------------------------------- | -| - | - | - | - | - | - |- | | [ALBERT](classes/models/albert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [PLBART](classes/models/plbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | [BEIT](classes/models/beit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | [BERT-Generation](classes/models/bert-generation.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | From b408c1f0313b7130ef724dc83601516c5f3f48eb Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 13:56:50 +0000 Subject: [PATCH 07/27] PL Init update --- src/adapters/models/plbart/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/adapters/models/plbart/__init__.py b/src/adapters/models/plbart/__init__.py index 4ed67ab1d8..1160ba151b 100644 --- a/src/adapters/models/plbart/__init__.py +++ b/src/adapters/models/plbart/__init__.py @@ -22,12 +22,12 @@ _import_structure = { - "adapter_model": ["BartAdapterModel"], + "adapter_model": ["PLBartAdapterModel"], } if TYPE_CHECKING: - from .adapter_model import BartAdapterModel + from .adapter_model import PLBartAdapterModel else: import sys From 98603e86251d096319a99ff0895e3552f2be94d7 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:01:08 +0000 Subject: [PATCH 08/27] PLBART typo --- src/adapters/models/plbart/adapter_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index 33b2e9d639..5045af0c3c 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -5,7 +5,7 @@ PLBART_START_DOCSTRING, PLBartConfig, PLBartModel, - PLBartPretrainedModel, + PLBartPreTrainedModel, shift_tokens_right, ) from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward From 193c7f1c98acee52ad8204a0cb046d091df9864f Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:02:37 +0000 Subject: [PATCH 09/27] PLBART typo --- src/adapters/models/plbart/adapter_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index 5045af0c3c..a8551b27c0 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -16,7 +16,7 @@ @add_start_docstrings( - "PLBART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING + "PLBART Model with the option to add multiple flexible prediction heads on top.", PLBART_START_DOCSTRING ) class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPretrainedModel): _tied_weights_keys = [ From 06e70d89adcfdba631f98891edfe3471366c5ad9 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:04:20 +0000 Subject: [PATCH 10/27] PLBART typo --- src/adapters/models/plbart/adapter_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index a8551b27c0..980f67adcf 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -18,7 +18,7 @@ @add_start_docstrings( "PLBART Model with the option to add multiple flexible prediction heads on top.", PLBART_START_DOCSTRING ) -class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPretrainedModel): +class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPreTrainedModel): _tied_weights_keys = [ "encoder.embed_tokens.weight", "decoder.embed_tokens.weight", From 57fd370cfb28a9a551df77ca5cf94bbcae32215b Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:07:33 +0000 Subject: [PATCH 11/27] PLBART typo --- src/adapters/models/plbart/adapter_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index 980f67adcf..3cdc300e00 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -31,7 +31,7 @@ class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAd "seq2seq_lm", ] - def __init__(self, config: BartConfig, **kwargs): + def __init__(self, config: PLBartConfig, **kwargs): super().__init__(config, **kwargs) self.model = BartModel(config) init(self.model) From 94242bd8c01956b61dc0e50cb0c692d971024def Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:09:40 +0000 Subject: [PATCH 12/27] PLBART typo --- src/adapters/models/plbart/adapter_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index 3cdc300e00..4c14164b01 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -33,7 +33,7 @@ class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAd def __init__(self, config: PLBartConfig, **kwargs): super().__init__(config, **kwargs) - self.model = BartModel(config) + self.model = PLBartModel(config) init(self.model) self._init_head_modules() @@ -46,7 +46,7 @@ def get_encoder(self): def get_decoder(self): return self.model.get_decoder() - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -116,7 +116,7 @@ def forward( return head_outputs - # Copied from BartForConditionalGeneration + # Copied from PLBartForConditionalGeneration def prepare_inputs_for_generation( self, decoder_input_ids, From dd9357499524ccabafbc9163d2ef79de48a54a38 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:15:24 +0000 Subject: [PATCH 13/27] Update CONFIG_CLASS_KEYS_MAPPING to add plbart --- src/adapters/wrappers/configuration.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/adapters/wrappers/configuration.py b/src/adapters/wrappers/configuration.py index c49f3b8b7c..a706548776 100644 --- a/src/adapters/wrappers/configuration.py +++ b/src/adapters/wrappers/configuration.py @@ -17,6 +17,12 @@ "hidden_dropout_prob": "dropout", "attention_probs_dropout_prob": "attention_dropout", }, + "plbart": { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "hidden_dropout_prob": "dropout", + "attention_probs_dropout_prob": 
"attention_dropout", + }, "beit": {}, "bert": {}, "clip_vision_model": { From 8ff700626f82402e93609bffd2a38d2a1d798000 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 14:22:43 +0000 Subject: [PATCH 14/27] Update a plbart config --- src/adapters/models/plbart/adapter_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index 4c14164b01..3631ba2288 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -148,7 +148,7 @@ def prepare_inputs_for_generation( # Copied from PLBartForConditionalGeneration def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + return shift_tokens_right(labels, self.config.pad_token_id )#, self.config.decoder_start_token_id) # Copied from PLBartForConditionalGeneration @staticmethod From 94b2c1a189fe1e48d6460b2cc208fe628176db2f Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Thu, 28 Mar 2024 15:03:45 +0000 Subject: [PATCH 15/27] Update a plbart model init --- src/adapters/models/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index 46eba733b7..63496d221c 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -5,6 +5,12 @@ BartEncoderAdaptersMixin, BartModelAdaptersMixin, ) +from .plbart.mixin_plbart import ( + PLBartDecoderAdaptersMixin, + PLBartDecoderWrapperAdaptersMixin, + PLBartEncoderAdaptersMixin, + PLBartModelAdaptersMixin, +) from .beit.mixin_beit import BeitIntermediateAdaptersMixin, BeitModelAdaptersMixin, BeitOutputAdaptersMixin from .bert.mixin_bert import BertLayerAdaptersMixin, BertModelAdaptersMixin from .clip.mixin_clip import ( @@ -34,7 +40,11 @@ "BartEncoder": BartEncoderAdaptersMixin, "BartDecoder": BartDecoderAdaptersMixin, "BartModel": BartModelAdaptersMixin, + "PLBartEncoder": PLBartEncoderAdaptersMixin, + "PLBartDecoder": PLBartDecoderAdaptersMixin, + "PLBartModel": PLBartModelAdaptersMixin, "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, + "PLBartDecoderWrapper": PLBartDecoderWrapperAdaptersMixin, "BeitIntermediate": BeitIntermediateAdaptersMixin, "BeitOutput": BeitOutputAdaptersMixin, "BeitModel": BeitModelAdaptersMixin, From 213efe82c62e26c5d1e6272f7126e2a7287b1c2f Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:14:06 +0100 Subject: [PATCH 16/27] Update plbart.rst --- docs/classes/models/plbart.rst | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/classes/models/plbart.rst b/docs/classes/models/plbart.rst index 5ed2e53af7..69227f8f2e 100644 --- a/docs/classes/models/plbart.rst +++ b/docs/classes/models/plbart.rst @@ -1,23 +1,18 @@ -BART +PLBART ===== -The Bart model was proposed in `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, -Translation, and Comprehension `__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan -Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019. +The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 
+This is a BART-like model which can be used to perform code-summarization, code-generation, and code-translation tasks. The pre-trained model `plbart-base` has been trained using multilingual denoising task +on Java, Python and English. According to the abstract, -- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a - left-to-right decoder (like GPT). -- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, - where spans of text are replaced with a single mask token. -- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It - matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new - state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains - of up to 6 ROUGE. +- PLBART is a sequence-to-sequence model capable of performing a broad spectrum of program and language understanding and generation tasks +- PLBART is pre-trained on an extensive collection of Java and Python functions and associated NL text via denoising autoencoding. +- PLBART learns program syntax, style (e.g., identifier naming convention) and logical flow. -BartAdapterModel +PLBartAdapterModel ~~~~~~~~~~~~~~~~~~~~ .. autoclass:: adapters.PLBartAdapterModel From 46a6f5404cf6a0827ce78a9cd5f1dc64aa3380d5 Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:15:42 +0100 Subject: [PATCH 17/27] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4856348a7e..ba2af4a208 100644 --- a/setup.py +++ b/setup.py @@ -150,7 +150,7 @@ def deps_list(*pkgs): long_description_content_type="text/markdown", keywords="NLP deep learning transformer pytorch BERT adapters", license="Apache", - url="https://github.com/FahadEbrahim/adapters", + url="https://github.com/adapter-hub/adapters", package_dir={"": "src"}, packages=find_packages("src"), include_package_data=True, From 083998c54b0dde0252971b0be32e0341562cb8f0 Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:46:45 +0100 Subject: [PATCH 18/27] Fix Typo PLBart instead of Bart --- tests/test_plbart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_plbart.py b/tests/test_plbart.py index c4fe5cfa02..969b3b0fde 100644 --- a/tests/test_plbart.py +++ b/tests/test_plbart.py @@ -24,7 +24,7 @@ class PLBartAdapterTestBase(AdapterTestBase): config_class = PLBartConfig config = make_config( - BartConfig, + PLBartConfig, d_model=16, encoder_layers=2, decoder_layers=2, From 612e23457a1333ce867b336f9bb7ff8e90efc254 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Tue, 25 Jun 2024 14:09:28 +0100 Subject: [PATCH 19/27] Various intsertions of plbart --- docs/index.rst | 1 + .../pytorch/adapterfusion/run_fusion_glue.py | 2 +- .../dependency-parsing/preprocessing.py | 1 + .../pytorch/dependency-parsing/run_udp.py | 25 ++++++++++------ .../pytorch/dependency-parsing/utils_udp.py | 1 + examples/pytorch/language-modeling/run_clm.py | 6 ++-- examples/pytorch/language-modeling/run_mlm.py | 6 ++-- .../pytorch/text-classification/run_glue.py | 2 +- .../pytorch/text-generation/run_generation.py | 3 +- src/adapters/__init__.py | 1 + src/adapters/composition.py | 1 + src/adapters/head_utils.py | 29 +++++++++++++++++++ 
src/adapters/heads/dependency_parsing.py | 1 + src/adapters/methods/bottleneck.py | 8 +++-- src/adapters/methods/lora.py | 8 +++-- src/adapters/models/__init__.py | 10 +++---- src/adapters/models/auto/adapter_model.py | 2 +- src/adapters/models/bart/modeling_bart.py | 2 +- src/adapters/models/beit/modeling_beit.py | 2 +- src/adapters/models/clip/modeling_clip.py | 2 +- .../models/deberta/modeling_deberta.py | 2 +- .../models/deberta_v2/modeling_deberta_v2.py | 2 +- .../models/distilbert/modeling_distilbert.py | 4 +-- .../modeling_encoder_decoder.py | 2 +- src/adapters/models/gptj/modeling_gptj.py | 2 +- src/adapters/models/llama/modeling_llama.py | 2 +- src/adapters/models/mbart/modeling_mbart.py | 2 +- src/adapters/models/mt5/modeling_mt5.py | 2 +- src/adapters/models/plbart/adapter_model.py | 2 +- src/adapters/models/plbart/modeling_plbart.py | 8 +++-- src/adapters/models/t5/modeling_t5.py | 2 +- src/adapters/models/vit/modeling_vit.py | 2 +- src/adapters/utils.py | 11 +++---- src/adapters/wrappers/configuration.py | 6 ++++ tests/fixtures/samples/cifar10/cifar10.py | 1 + utils/back_comp/Utils.py | 15 ++++++++++ utils/convert_xmod_checkpoint.py | 1 + 37 files changed, 126 insertions(+), 53 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index b78a249c64..29ef772fdf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -82,6 +82,7 @@ Currently, we support the PyTorch versions of all models as listed on the `Model classes/models/llama classes/models/mbart classes/models/mt5 + classes/models/plbart classes/models/roberta classes/models/t5 classes/models/vit diff --git a/examples/pytorch/adapterfusion/run_fusion_glue.py b/examples/pytorch/adapterfusion/run_fusion_glue.py index d02aa811eb..75e0790225 100644 --- a/examples/pytorch/adapterfusion/run_fusion_glue.py +++ b/examples/pytorch/adapterfusion/run_fusion_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on +"""Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa).""" diff --git a/examples/pytorch/dependency-parsing/preprocessing.py b/examples/pytorch/dependency-parsing/preprocessing.py index 2188aab4fb..90f1012549 100644 --- a/examples/pytorch/dependency-parsing/preprocessing.py +++ b/examples/pytorch/dependency-parsing/preprocessing.py @@ -3,6 +3,7 @@ Credits: "How Good is Your Tokenizer? On the Monolingual Performance of Multilingual Language Models" (Rust et al., 2021) https://arxiv.org/abs/2012.15613 """ + from collections import defaultdict from typing import List diff --git a/examples/pytorch/dependency-parsing/run_udp.py b/examples/pytorch/dependency-parsing/run_udp.py index 8fefe1f49c..353df7cf68 100644 --- a/examples/pytorch/dependency-parsing/run_udp.py +++ b/examples/pytorch/dependency-parsing/run_udp.py @@ -3,6 +3,7 @@ Credits: "How Good is Your Tokenizer? On the Monolingual Performance of Multilingual Language Models" (Rust et al., 2021) https://arxiv.org/abs/2012.15613 """ + import logging import os import sys @@ -157,9 +158,11 @@ def main(): use_fast=model_args.use_fast, do_lower_case=model_args.do_lower_case, add_prefix_space=True, # Used e.g. 
for RoBERTa - mecab_kwargs={"mecab_option": f"-r {model_args.mecab_dir} -d {model_args.mecab_dic_dir}"} - if model_args.is_japanese - else None, + mecab_kwargs=( + {"mecab_option": f"-r {model_args.mecab_dir} -d {model_args.mecab_dic_dir}"} + if model_args.is_japanese + else None + ), ) # The task name (with prefix) @@ -254,9 +257,11 @@ def main(): if adapter_args.train_adapter: adapter_config = AdapterConfig.load(adapter_args.adapter_config, **adapter_config_kwargs) model.load_adapter( - os.path.join(training_args.output_dir, "best_model", task_name) - if training_args.do_train - else adapter_args.load_adapter, + ( + os.path.join(training_args.output_dir, "best_model", task_name) + if training_args.do_train + else adapter_args.load_adapter + ), config=adapter_config, load_as=task_name, **adapter_load_kwargs, @@ -264,9 +269,11 @@ def main(): if adapter_args.load_lang_adapter: lang_adapter_config = AdapterConfig.load(adapter_args.lang_adapter_config, **adapter_config_kwargs) lang_adapter_name = model.load_adapter( - os.path.join(training_args.output_dir, "best_model", lang_adapter_name) - if training_args.do_train - else adapter_args.load_lang_adapter, + ( + os.path.join(training_args.output_dir, "best_model", lang_adapter_name) + if training_args.do_train + else adapter_args.load_lang_adapter + ), config=lang_adapter_config, load_as=lang_adapter_name, **adapter_load_kwargs, diff --git a/examples/pytorch/dependency-parsing/utils_udp.py b/examples/pytorch/dependency-parsing/utils_udp.py index 3424638319..0eaa4f5d3d 100644 --- a/examples/pytorch/dependency-parsing/utils_udp.py +++ b/examples/pytorch/dependency-parsing/utils_udp.py @@ -3,6 +3,7 @@ Credits: "How Good is Your Tokenizer? On the Monolingual Performance of Multilingual Language Models" (Rust et al., 2021) https://arxiv.org/abs/2012.15613 """ + import collections import logging import os diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 3094738317..17c28b88f6 100644 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -541,9 +541,9 @@ def compute_metrics(eval_preds): # Data collator will default to DataCollatorWithPadding, so we change it. 
data_collator=default_data_collator, compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics - if training_args.do_eval and not is_torch_tpu_available() - else None, + preprocess_logits_for_metrics=( + preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None + ), ) # Training diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index bf6de170ab..8dc451350b 100644 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -557,9 +557,9 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics - if training_args.do_eval and not is_torch_tpu_available() - else None, + preprocess_logits_for_metrics=( + preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None + ), ) # Training diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 5786e0df55..4a7d869591 100644 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import logging diff --git a/examples/pytorch/text-generation/run_generation.py b/examples/pytorch/text-generation/run_generation.py index 05dbe2fc0e..e6f6e8bac9 100644 --- a/examples/pytorch/text-generation/run_generation.py +++ b/examples/pytorch/text-generation/run_generation.py @@ -14,8 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet) -""" +"""Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)""" import argparse diff --git a/src/adapters/__init__.py b/src/adapters/__init__.py index bfe253e91e..204b718bd7 100644 --- a/src/adapters/__init__.py +++ b/src/adapters/__init__.py @@ -105,6 +105,7 @@ "models.gptj": ["GPTJAdapterModel"], "models.llama": ["LlamaAdapterModel"], "models.mbart": ["MBartAdapterModel"], + "models.plbart": ["PLBartAdapterModel"], "models.mt5": ["MT5AdapterModel"], "models.roberta": ["RobertaAdapterModel"], "models.t5": ["T5AdapterModel"], diff --git a/src/adapters/composition.py b/src/adapters/composition.py index 6d37e44b1f..2b81dc7218 100644 --- a/src/adapters/composition.py +++ b/src/adapters/composition.py @@ -128,6 +128,7 @@ def __init__( "bart", "mbart", "mt5", + "plbart", "gpt2", "gptj", "t5", diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils.py index ec78430e02..3bbe1b6db4 100644 --- a/src/adapters/head_utils.py +++ b/src/adapters/head_utils.py @@ -369,6 +369,35 @@ }, "layers": ["lm_head"], }, + # PLBART + "PLBartForSequenceClassification": { + "config": { + "head_type": "classification", + "layers": 2, + "activation_function": "tanh", + }, + "layers": [ + None, + "classification_head.dense", + None, + None, + "classification_head.out_proj", + ], + }, + "PLBartForQuestionAnswering": { + "config": { + "head_type": "question_answering", + "layers": 1, + "activation_function": None, + }, + "layers": [None, "qa_outputs"], + }, + "PLBartForConditionalGeneration": { + "config": { + "head_type": "seq2seq_lm", + }, + "layers": ["lm_head"], + }, # MT5 "MT5ForConditionalGeneration": { "config": { diff --git a/src/adapters/heads/dependency_parsing.py b/src/adapters/heads/dependency_parsing.py index d568f356b0..5d33820f45 100644 --- a/src/adapters/heads/dependency_parsing.py +++ b/src/adapters/heads/dependency_parsing.py @@ -2,6 +2,7 @@ Code taken and modified from: https://github.com/Adapter-Hub/hgiyt. Credits: "How Good is Your Tokenizer? 
On the Monolingual Performance of Multilingual Language Models" (Rust et al., 2021) https://arxiv.org/abs/2012.15613 """ + from dataclasses import dataclass from typing import Optional, Tuple diff --git a/src/adapters/methods/bottleneck.py b/src/adapters/methods/bottleneck.py index 7ebae5221a..98bb1436d3 100644 --- a/src/adapters/methods/bottleneck.py +++ b/src/adapters/methods/bottleneck.py @@ -211,9 +211,11 @@ def pad_and_concat(self, states: List[BottleneckState]) -> BottleneckState: torch.cat([state.input_tensor for state in states], dim=0), torch.cat([state.adapter_residual for state in states], dim=0), states[0].layer_norm, - torch.cat([state.bottleneck_up for state in states], dim=0) - if states[0].bottleneck_up is not None - else None, + ( + torch.cat([state.bottleneck_up for state in states], dim=0) + if states[0].bottleneck_up is not None + else None + ), states[-1].last, ) diff --git a/src/adapters/methods/lora.py b/src/adapters/methods/lora.py index 264eb96ca9..1723f7e158 100644 --- a/src/adapters/methods/lora.py +++ b/src/adapters/methods/lora.py @@ -437,9 +437,11 @@ def repeat(self, state: LoRAState, channels: int) -> LoRAState: def mean(self, states: List[LoRAState], weights: torch.Tensor) -> LoRAState: return LoRAState( states[0].layer_input, - torch.mean(torch.stack([s.hidden_states for s in states], dim=0) * weights, dim=0) - if states[0].hidden_states is not None - else None, + ( + torch.mean(torch.stack([s.hidden_states for s in states], dim=0) * weights, dim=0) + if states[0].hidden_states is not None + else None + ), states[0].layer_output, states[-1].last, ) diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index b018080a0b..1f9ef7840a 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -40,11 +40,6 @@ "BartEncoder": BartEncoderAdaptersMixin, "BartDecoder": BartDecoderAdaptersMixin, "BartModel": BartModelAdaptersMixin, - "PLBartEncoder": PLBartEncoderAdaptersMixin, - "PLBartDecoder": PLBartDecoderAdaptersMixin, - "PLBartModel": PLBartModelAdaptersMixin, - "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, - "PLBartDecoderWrapper": PLBartDecoderWrapperAdaptersMixin, "BeitIntermediate": BeitIntermediateAdaptersMixin, "BeitOutput": BeitOutputAdaptersMixin, "BeitModel": BeitModelAdaptersMixin, @@ -70,6 +65,11 @@ "MT5ForConditionalGeneration": T5ForCondiditionalGenerationWithHeadsMixin, "MT5ForQuestionAnswering": T5ForQuestionAnsweringWithHeadsMixin, "MT5EncoderModel": T5ModelAdaptersMixin, + "PLBartEncoder": PLBartEncoderAdaptersMixin, + "PLBartDecoder": PLBartDecoderAdaptersMixin, + "PLBartModel": PLBartModelAdaptersMixin, + "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, + "PLBartDecoderWrapper": PLBartDecoderWrapperAdaptersMixin, "GPT2Model": GPT2ModelAdapterMixin, "GPTJMLP": GPTJMLPAdaptersMixin, "GPTJModel": GPTJModelAdapterMixin, diff --git a/src/adapters/models/auto/adapter_model.py b/src/adapters/models/auto/adapter_model.py index d3724f9153..b6db14ca0c 100644 --- a/src/adapters/models/auto/adapter_model.py +++ b/src/adapters/models/auto/adapter_model.py @@ -11,7 +11,6 @@ [ ("albert", "AlbertAdapterModel"), ("bart", "BartAdapterModel"), - ("plbart", "PLBartAdapterModel"), ("beit", "BeitAdapterModel"), ("bert", "BertAdapterModel"), ("bert-generation", "BertGenerationAdapterModel"), @@ -24,6 +23,7 @@ ("gptj", "GPTJAdapterModel"), ("llama", "LlamaAdapterModel"), ("mbart", "MBartAdapterModel"), + ("plbart", "PLBartAdapterModel"), ("mt5", "MT5AdapterModel"), ("roberta", "RobertaAdapterModel"), 
("t5", "T5AdapterModel"), diff --git a/src/adapters/models/bart/modeling_bart.py b/src/adapters/models/bart/modeling_bart.py index b347fddf07..080455b497 100644 --- a/src/adapters/models/bart/modeling_bart.py +++ b/src/adapters/models/bart/modeling_bart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BART model.""" +"""PyTorch BART model.""" from typing import Optional, Tuple import torch diff --git a/src/adapters/models/beit/modeling_beit.py b/src/adapters/models/beit/modeling_beit.py index 1ed5082beb..bc67120d13 100644 --- a/src/adapters/models/beit/modeling_beit.py +++ b/src/adapters/models/beit/modeling_beit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BEiT model.""" +"""PyTorch BEiT model.""" import math diff --git a/src/adapters/models/clip/modeling_clip.py b/src/adapters/models/clip/modeling_clip.py index b74a0308ef..fecbb105c8 100644 --- a/src/adapters/models/clip/modeling_clip.py +++ b/src/adapters/models/clip/modeling_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CLIP model.""" +"""PyTorch CLIP model.""" from typing import Optional, Tuple diff --git a/src/adapters/models/deberta/modeling_deberta.py b/src/adapters/models/deberta/modeling_deberta.py index 1feca72b4a..4380b5e038 100644 --- a/src/adapters/models/deberta/modeling_deberta.py +++ b/src/adapters/models/deberta/modeling_deberta.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DeBERTa model.""" +"""PyTorch DeBERTa model.""" import torch import torch.utils.checkpoint diff --git a/src/adapters/models/deberta_v2/modeling_deberta_v2.py b/src/adapters/models/deberta_v2/modeling_deberta_v2.py index 56d6fec448..bc41ae82af 100644 --- a/src/adapters/models/deberta_v2/modeling_deberta_v2.py +++ b/src/adapters/models/deberta_v2/modeling_deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DeBERTa-v2 model.""" +"""PyTorch DeBERTa-v2 model.""" import torch import torch.utils.checkpoint diff --git a/src/adapters/models/distilbert/modeling_distilbert.py b/src/adapters/models/distilbert/modeling_distilbert.py index cbd501942c..e59aa1ad50 100644 --- a/src/adapters/models/distilbert/modeling_distilbert.py +++ b/src/adapters/models/distilbert/modeling_distilbert.py @@ -14,8 +14,8 @@ # limitations under the License. 
""" - PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in - part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) +PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in +part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ diff --git a/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py b/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py index 43178898f6..1572087d98 100644 --- a/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support Encoder-Decoder architectures""" +"""Classes to support Encoder-Decoder architectures""" from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel diff --git a/src/adapters/models/gptj/modeling_gptj.py b/src/adapters/models/gptj/modeling_gptj.py index 700e919a17..3880df12c0 100644 --- a/src/adapters/models/gptj/modeling_gptj.py +++ b/src/adapters/models/gptj/modeling_gptj.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPT-J model.""" +"""PyTorch GPT-J model.""" from typing import Optional, Tuple, Union diff --git a/src/adapters/models/llama/modeling_llama.py b/src/adapters/models/llama/modeling_llama.py index 7c99f286e4..d9d8b2ebcc 100644 --- a/src/adapters/models/llama/modeling_llama.py +++ b/src/adapters/models/llama/modeling_llama.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch LLaMA model.""" +"""PyTorch LLaMA model.""" import math import warnings from typing import Optional, Tuple diff --git a/src/adapters/models/mbart/modeling_mbart.py b/src/adapters/models/mbart/modeling_mbart.py index 0f8f0d5335..45bdceae25 100644 --- a/src/adapters/models/mbart/modeling_mbart.py +++ b/src/adapters/models/mbart/modeling_mbart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MBART model.""" +"""PyTorch MBART model.""" from typing import Optional, Tuple import torch diff --git a/src/adapters/models/mt5/modeling_mt5.py b/src/adapters/models/mt5/modeling_mt5.py index 12ad630a74..b982d34d62 100644 --- a/src/adapters/models/mt5/modeling_mt5.py +++ b/src/adapters/models/mt5/modeling_mt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch MT5 model.""" +"""PyTorch MT5 model.""" import torch from torch import nn diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py index 3631ba2288..83f02183d0 100644 --- a/src/adapters/models/plbart/adapter_model.py +++ b/src/adapters/models/plbart/adapter_model.py @@ -148,7 +148,7 @@ def prepare_inputs_for_generation( # Copied from PLBartForConditionalGeneration def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id )#, self.config.decoder_start_token_id) + return shift_tokens_right(labels, self.config.pad_token_id) # , self.config.decoder_start_token_id) # Copied from PLBartForConditionalGeneration @staticmethod diff --git a/src/adapters/models/plbart/modeling_plbart.py b/src/adapters/models/plbart/modeling_plbart.py index ed272b3aa0..2b062fafbe 100644 --- a/src/adapters/models/plbart/modeling_plbart.py +++ b/src/adapters/models/plbart/modeling_plbart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PLBART model.""" +"""PyTorch PLBART model.""" from typing import Optional, Tuple import torch @@ -23,7 +23,11 @@ from transformers.utils import logging from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_, match_attn_matrices_for_parallel -from .mixin_plbart import PLBartAttentionAdaptersMixin, PLBartDecoderLayerAdaptersMixin, PLBartEncoderLayerAdaptersMixin +from .mixin_plbart import ( + PLBartAttentionAdaptersMixin, + PLBartDecoderLayerAdaptersMixin, + PLBartEncoderLayerAdaptersMixin, +) logger = logging.get_logger(__name__) diff --git a/src/adapters/models/t5/modeling_t5.py b/src/adapters/models/t5/modeling_t5.py index 03d9f27972..c98cfa477a 100644 --- a/src/adapters/models/t5/modeling_t5.py +++ b/src/adapters/models/t5/modeling_t5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch T5 model.""" +"""PyTorch T5 model.""" import torch from torch import nn diff --git a/src/adapters/models/vit/modeling_vit.py b/src/adapters/models/vit/modeling_vit.py index f8c02bd931..0a9d7a1b3e 100644 --- a/src/adapters/models/vit/modeling_vit.py +++ b/src/adapters/models/vit/modeling_vit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViT model.""" +"""PyTorch ViT model.""" import math diff --git a/src/adapters/utils.py b/src/adapters/utils.py index 1640c5567e..efe48208e0 100644 --- a/src/adapters/utils.py +++ b/src/adapters/utils.py @@ -736,8 +736,9 @@ def resolve_adapter_path( except Exception as ex: logger.info(ex) raise EnvironmentError( - "Unable to load adapter {} from any source. Please check the name of the adapter or the source." - .format(adapter_name_or_path) + "Unable to load adapter {} from any source. 
Please check the name of the adapter or the source.".format( + adapter_name_or_path + ) ) else: raise ValueError("Unable to identify {} as a valid module location.".format(adapter_name_or_path)) @@ -820,9 +821,9 @@ def get_adapter_info(adapter_id: str, source: str = "ah") -> Optional[AdapterInf return AdapterInfo( source="hf", adapter_id=model_info.modelId, - model_name=model_info.config.get("adapter_transformers", {}).get("model_name") - if model_info.config - else None, + model_name=( + model_info.config.get("adapter_transformers", {}).get("model_name") if model_info.config else None + ), username=model_info.modelId.split("/")[0], sha1_checksum=model_info.sha, ) diff --git a/src/adapters/wrappers/configuration.py b/src/adapters/wrappers/configuration.py index a706548776..dd9fbfafd4 100644 --- a/src/adapters/wrappers/configuration.py +++ b/src/adapters/wrappers/configuration.py @@ -52,6 +52,12 @@ "hidden_dropout_prob": "dropout", "attention_probs_dropout_prob": "attention_dropout", }, + "plbart": { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "hidden_dropout_prob": "dropout", + "attention_probs_dropout_prob": "attention_dropout", + }, "roberta": {}, "t5": { "hidden_size": "d_model", diff --git a/tests/fixtures/samples/cifar10/cifar10.py b/tests/fixtures/samples/cifar10/cifar10.py index cd00f02603..052a203dff 100644 --- a/tests/fixtures/samples/cifar10/cifar10.py +++ b/tests/fixtures/samples/cifar10/cifar10.py @@ -1,6 +1,7 @@ """ CIFAR-10 demo data, adapted from https://huggingface.co/datasets/cifar10. """ + import os import pickle diff --git a/utils/back_comp/Utils.py b/utils/back_comp/Utils.py index 8ed482130c..21c15545f7 100644 --- a/utils/back_comp/Utils.py +++ b/utils/back_comp/Utils.py @@ -29,6 +29,7 @@ GPT2Config, GPTJConfig, MBartConfig, + PLBartConfig, RobertaConfig, T5Config, ViTConfig, @@ -130,6 +131,7 @@ def get_model_names(): "gpt2", "gptj", "mbart", + "plbart", "roberta", "t5", "vit", @@ -283,6 +285,19 @@ def create_model(model_name: str, model_class: Any) -> Any: ) model = model_class.from_config(mbart_config) + elif model_name == "plbart": + plbart_config = PLBartConfig( + d_model=16, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=4, + decoder_attention_heads=4, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + vocab_size=50005, + ) + model = model_class.from_config(plbart_config) + elif model_name == "roberta": roberta_config = RobertaConfig( hidden_size=32, diff --git a/utils/convert_xmod_checkpoint.py b/utils/convert_xmod_checkpoint.py index 30ca0ede74..b3744fece6 100644 --- a/utils/convert_xmod_checkpoint.py +++ b/utils/convert_xmod_checkpoint.py @@ -1,6 +1,7 @@ """ This script can be used to convert an Xmod checkpoints (including adapters) from the HF format to the Adapters format. 
""" + import argparse import os import re From 30f0f6bd03e021fc9cc8f0f6d31e3c8255f0b8ac Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Tue, 25 Jun 2024 14:36:19 +0100 Subject: [PATCH 20/27] Ordering --- src/adapters/models/__init__.py | 2 +- src/adapters/wrappers/configuration.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index 1f9ef7840a..28228d63b5 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -39,6 +39,7 @@ "AlbertModel": AlbertModelAdaptersMixin, "BartEncoder": BartEncoderAdaptersMixin, "BartDecoder": BartDecoderAdaptersMixin, + "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "BartModel": BartModelAdaptersMixin, "BeitIntermediate": BeitIntermediateAdaptersMixin, "BeitOutput": BeitOutputAdaptersMixin, @@ -68,7 +69,6 @@ "PLBartEncoder": PLBartEncoderAdaptersMixin, "PLBartDecoder": PLBartDecoderAdaptersMixin, "PLBartModel": PLBartModelAdaptersMixin, - "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "PLBartDecoderWrapper": PLBartDecoderWrapperAdaptersMixin, "GPT2Model": GPT2ModelAdapterMixin, "GPTJMLP": GPTJMLPAdaptersMixin, diff --git a/src/adapters/wrappers/configuration.py b/src/adapters/wrappers/configuration.py index dd9fbfafd4..ed224cd600 100644 --- a/src/adapters/wrappers/configuration.py +++ b/src/adapters/wrappers/configuration.py @@ -17,12 +17,6 @@ "hidden_dropout_prob": "dropout", "attention_probs_dropout_prob": "attention_dropout", }, - "plbart": { - "num_attention_heads": "encoder_attention_heads", - "hidden_size": "d_model", - "hidden_dropout_prob": "dropout", - "attention_probs_dropout_prob": "attention_dropout", - }, "beit": {}, "bert": {}, "clip_vision_model": { From 00f4f1bcb6115fe61d5c7b86c2848fff934de47a Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Tue, 25 Jun 2024 16:11:47 +0100 Subject: [PATCH 21/27] make style --- src/adapters/models/__init__.py | 12 ++++++------ src/adapters/models/plbart/modeling_plbart.py | 10 +++++----- src/adapters/utils.py | 5 ++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index 28228d63b5..55bff73de8 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -5,12 +5,6 @@ BartEncoderAdaptersMixin, BartModelAdaptersMixin, ) -from .plbart.mixin_plbart import ( - PLBartDecoderAdaptersMixin, - PLBartDecoderWrapperAdaptersMixin, - PLBartEncoderAdaptersMixin, - PLBartModelAdaptersMixin, -) from .beit.mixin_beit import BeitIntermediateAdaptersMixin, BeitModelAdaptersMixin, BeitOutputAdaptersMixin from .bert.mixin_bert import BertLayerAdaptersMixin, BertModelAdaptersMixin from .clip.mixin_clip import ( @@ -24,6 +18,12 @@ from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin from .llama.mixin_llama import LlamaForQuestionAnsweringAdapterMixin, LlamaModelAdapterMixin +from .plbart.mixin_plbart import ( + PLBartDecoderAdaptersMixin, + PLBartDecoderWrapperAdaptersMixin, + PLBartEncoderAdaptersMixin, + PLBartModelAdaptersMixin, +) from .t5.mixin_t5 import ( T5BlockAdaptersMixin, T5ForCondiditionalGenerationWithHeadsMixin, diff --git a/src/adapters/models/plbart/modeling_plbart.py b/src/adapters/models/plbart/modeling_plbart.py index 2b062fafbe..2d812cae1d 100644 --- a/src/adapters/models/plbart/modeling_plbart.py +++ b/src/adapters/models/plbart/modeling_plbart.py @@ -293,11 +293,11 @@ def forward( if 
output_attentions or layer_head_mask is not None: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. logger.warning_once( - "PLBartModel is using PLBartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not" - " support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" - " implementation, but specifying the manual implementation will be required from Transformers version" - ' v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when' - " loading the model." + "PLBartModel is using PLBartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does" + " not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual" + " attention implementation, but specifying the manual implementation will be required from" + " Transformers version v5.0.0 onwards. This warning can be removed using the argument" + ' `attn_implementation="eager"` when loading the model.' ) return super().forward( hidden_states, diff --git a/src/adapters/utils.py b/src/adapters/utils.py index efe48208e0..52c26a76ee 100644 --- a/src/adapters/utils.py +++ b/src/adapters/utils.py @@ -736,9 +736,8 @@ def resolve_adapter_path( except Exception as ex: logger.info(ex) raise EnvironmentError( - "Unable to load adapter {} from any source. Please check the name of the adapter or the source.".format( - adapter_name_or_path - ) + "Unable to load adapter {} from any source. Please check the name of the adapter or the source." + .format(adapter_name_or_path) ) else: raise ValueError("Unable to identify {} as a valid module location.".format(adapter_name_or_path)) From 2da521f895a212ca92e71f0459407b23810cdc92 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Fri, 28 Jun 2024 15:05:13 +0100 Subject: [PATCH 22/27] Make Style --- src/adapters/__init__.py | 2 +- src/adapters/models/auto/adapter_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/adapters/__init__.py b/src/adapters/__init__.py index c3cb6d1869..49e3c72ddc 100644 --- a/src/adapters/__init__.py +++ b/src/adapters/__init__.py @@ -105,8 +105,8 @@ "models.gptj": ["GPTJAdapterModel"], "models.llama": ["LlamaAdapterModel"], "models.mbart": ["MBartAdapterModel"], - "models.plbart": ["PLBartAdapterModel"], "models.mt5": ["MT5AdapterModel"], + "models.plbart": ["PLBartAdapterModel"], "models.roberta": ["RobertaAdapterModel"], "models.t5": ["T5AdapterModel"], "models.vit": ["ViTAdapterModel"], diff --git a/src/adapters/models/auto/adapter_model.py b/src/adapters/models/auto/adapter_model.py index b6db14ca0c..31dfa00cff 100644 --- a/src/adapters/models/auto/adapter_model.py +++ b/src/adapters/models/auto/adapter_model.py @@ -23,8 +23,8 @@ ("gptj", "GPTJAdapterModel"), ("llama", "LlamaAdapterModel"), ("mbart", "MBartAdapterModel"), - ("plbart", "PLBartAdapterModel"), ("mt5", "MT5AdapterModel"), + ("plbart", "PLBartAdapterModel"), ("roberta", "RobertaAdapterModel"), ("t5", "T5AdapterModel"), ("vit", "ViTAdapterModel"), From 025a5a40ed5ad9f10467a11a31b0dd0f6bdbb9a8 Mon Sep 17 00:00:00 2001 From: Fahad Ebrahim Date: Fri, 28 Jun 2024 15:13:47 +0100 Subject: [PATCH 23/27] Fix Bug for Make Style --- src/adapters/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/adapters/__init__.py b/src/adapters/__init__.py index 49e3c72ddc..0f20a9e4d9 100644 --- a/src/adapters/__init__.py +++ b/src/adapters/__init__.py @@ -210,6 +210,7 @@ from 
.models.llama import LlamaAdapterModel from .models.mbart import MBartAdapterModel from .models.mt5 import MT5AdapterModel + from .models.plbart import PLBartAdapterModel from .models.roberta import RobertaAdapterModel from .models.t5 import T5AdapterModel from .models.vit import ViTAdapterModel From 87d39b694b9e9f895f719061506ad13e487162a5 Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Sun, 7 Jul 2024 12:00:49 +0100 Subject: [PATCH 24/27] Update model_overview.md --- docs/model_overview.md | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/model_overview.md b/docs/model_overview.md index 3b7949de11..264054ecda 100644 --- a/docs/model_overview.md +++ b/docs/model_overview.md @@ -10,30 +10,30 @@ The table below further shows which model architectures support which adaptation E.g., for BERT, this means adapters provides a ``BertAdapterModel`` class, but you can also use ``BertModel``, ``BertForSequenceClassification`` etc. together with adapters. ``` -| Model | (Bottleneck)
Adapters | Prefix<br> Tuning | LoRA | Compacter | Adapter<br> Fusion | Invertible<br> Adapters | Parallel<br> block | Prompt
Tuning | -| --------------------------------------- | -| - | - | - | - | - | - |- | -| [ALBERT](classes/models/albert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [PLBART](classes/models/plbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [BEIT](classes/models/beit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | -| [BERT-Generation](classes/models/bert-generation.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [CLIP](classes/models/clip.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | -| [DeBERTa](classes/models/deberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [DeBERTa-v2](classes/models/debertaV2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Electra](classes/models/electra.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | | | -| [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [GPT-J](classes/models/gptj.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [Llama](classes/models/llama.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [MT5](classes/models/mt5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [ViT](classes/models/vit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [X-MOD](classes/models/xmod.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Model | (Bottleneck)
Adapters | Prefix<br> Tuning | LoRA | Compacter | Adapter<br> Fusion | Invertible<br> Adapters | Parallel<br> block | Prompt
Tuning | ReFT | +| --------------------------------------- | -| - | - | - | - | - | - |- | - | +| [ALBERT](classes/models/albert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [BEIT](classes/models/beit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | ✅ | +| [BERT-Generation](classes/models/bert-generation.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [CLIP](classes/models/clip.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | +| [DeBERTa](classes/models/deberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [DeBERTa-v2](classes/models/debertaV2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Electra](classes/models/electra.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | | | (*) | +| [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [GPT-J](classes/models/gptj.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [Llama](classes/models/llama.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [PLBart](classes/models/plbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [MT5](classes/models/mt5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [ViT](classes/models/vit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [X-MOD](classes/models/xmod.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | (*) If the used encoder and decoder model class are supported. 
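The patches above wire PLBART into the adapter model classes, prediction heads, and documentation tables. The sketch below illustrates the intended usage once they are applied; it is a minimal example, not part of the patch series itself, and it assumes the public `uclanlp/plbart-base` checkpoint plus the library's standard `AutoAdapterModel` and `SeqBnConfig` APIs — the adapter name is a placeholder.

```python
# Minimal sketch: train only adapter weights on top of a frozen PLBART backbone
# for a code-to-text task (e.g. code summarization).
from transformers import AutoTokenizer
from adapters import AutoAdapterModel, SeqBnConfig

checkpoint = "uclanlp/plbart-base"  # assumed public PLBART checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoAdapterModel.from_pretrained(checkpoint)  # should resolve to PLBartAdapterModel

# Register a bottleneck adapter and a seq2seq LM head under the same name.
model.add_adapter("code_summarization", config=SeqBnConfig())
model.add_seq2seq_lm_head("code_summarization")

# Freeze the PLBART weights and activate the adapter (and its head) for training.
model.train_adapter("code_summarization")

# The model can now be passed to adapters.AdapterTrainer or a standard training loop.
```

The same pattern should carry over to the other methods marked for PLBart in the table above, e.g. swapping `SeqBnConfig` for `LoRAConfig` or `PrefixTuningConfig`.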
From f4c54d2080f661a0ca82fce7ebcebc9b4b7325d8 Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Sun, 7 Jul 2024 14:01:04 +0100 Subject: [PATCH 25/27] Update model_overview.md --- docs/model_overview.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/model_overview.md b/docs/model_overview.md index 58ae523b43..4042d8a0d6 100644 --- a/docs/model_overview.md +++ b/docs/model_overview.md @@ -28,6 +28,7 @@ The table below further shows which model architectures support which adaptation | [Llama](classes/models/llama.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | [MT5](classes/models/mt5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [PLBart](classes/models/plbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | [ViT](classes/models/vit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | From bce5d31d730a9e1ebce79ea3cfea6483e12ea2e1 Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Sun, 7 Jul 2024 14:11:02 +0100 Subject: [PATCH 26/27] Update model_overview.md to include ReFT --- docs/model_overview.md | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/model_overview.md b/docs/model_overview.md index 4042d8a0d6..4cc164bc8f 100644 --- a/docs/model_overview.md +++ b/docs/model_overview.md @@ -10,30 +10,30 @@ The table below further shows which model architectures support which adaptation E.g., for BERT, this means adapters provides a ``BertAdapterModel`` class, but you can also use ``BertModel``, ``BertForSequenceClassification`` etc. together with adapters. ``` -| Model | (Bottleneck)
Adapters | Prefix<br> Tuning | LoRA | Compacter | Adapter<br> Fusion | Invertible<br> Adapters | Parallel<br> block | Prompt
Tuning | -| --------------------------------------- | -| - | - | - | - | - | - |- | -| [ALBERT](classes/models/albert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [BEIT](classes/models/beit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | -| [BERT-Generation](classes/models/bert-generation.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [CLIP](classes/models/clip.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | -| [DeBERTa](classes/models/deberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [DeBERTa-v2](classes/models/debertaV2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Electra](classes/models/electra.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | | | -| [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [GPT-J](classes/models/gptj.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [Llama](classes/models/llama.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [MT5](classes/models/mt5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [PLBart](classes/models/plbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [ViT](classes/models/vit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [X-MOD](classes/models/xmod.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Model | (Bottleneck)
Adapters | Prefix<br> Tuning | LoRA | Compacter | Adapter<br> Fusion | Invertible<br> Adapters | Parallel<br> block | Prompt
Tuning | ReFT | +| --------------------------------------- | -| - | - | - | - | - | - |- | - | +| [ALBERT](classes/models/albert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [BEIT](classes/models/beit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | ✅ | +| [BERT-Generation](classes/models/bert-generation.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [CLIP](classes/models/clip.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | +| [DeBERTa](classes/models/deberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [DeBERTa-v2](classes/models/debertaV2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Electra](classes/models/electra.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | | | (*) | +| [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [GPT-J](classes/models/gptj.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [Llama](classes/models/llama.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [MT5](classes/models/mt5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [PLBart](classes/models/plbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ | +| [ViT](classes/models/vit.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [X-MOD](classes/models/xmod.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | (*) If the used encoder and decoder model class are supported. From f4b6391c52ea14997d570990a63e24f35b14dabf Mon Sep 17 00:00:00 2001 From: FahadEbrahim <62794766+FahadEbrahim@users.noreply.github.com> Date: Wed, 10 Jul 2024 20:47:31 +0200 Subject: [PATCH 27/27] Update head_utils.py --- src/adapters/head_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils.py index 3bbe1b6db4..079334c1e2 100644 --- a/src/adapters/head_utils.py +++ b/src/adapters/head_utils.py @@ -384,14 +384,6 @@ "classification_head.out_proj", ], }, - "PLBartForQuestionAnswering": { - "config": { - "head_type": "question_answering", - "layers": 1, - "activation_function": None, - }, - "layers": [None, "qa_outputs"], - }, "PLBartForConditionalGeneration": { "config": { "head_type": "seq2seq_lm",