Merge branch 'main' into sync/v4.36.0
calpt authored Jan 13, 2024
2 parents a96a49f + 5c5f10c commit 86309c1
Showing 12 changed files with 54 additions and 101 deletions.
5 changes: 5 additions & 0 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
blank_issues_enabled: true
contact_links:
- name: "🗪 Discussions Forum"
url: https://github.com/adapter-hub/adapters/discussions
about: Ask questions on working with adapters, request help or share your work
21 changes: 0 additions & 21 deletions .github/ISSUE_TEMPLATE/question-help.md

This file was deleted.

38 changes: 0 additions & 38 deletions .github/workflows/pr_dependencies.yml

This file was deleted.

2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright 2020-2023 The AdapterHub Team. All rights reserved.
Copyright 2020-2024 The AdapterHub Team. All rights reserved.

Apache License
Version 2.0, January 2004
1 change: 0 additions & 1 deletion README.md
@@ -46,7 +46,6 @@ pip install -U adapters

```
git clone https://github.com/adapter-hub/adapters.git
git checkout adapters
cd adapters
pip install .
```
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -20,7 +20,7 @@
# -- Project information -----------------------------------------------------

project = "AdapterHub"
copyright = "2020-2023, AdapterHub Team"
copyright = "2020-2024, AdapterHub Team"
author = "AdapterHub Team"

docs_versions = [
2 changes: 1 addition & 1 deletion setup.py
@@ -142,7 +142,7 @@ def deps_list(*pkgs):

setup(
name="adapters",
version="0.1.0",
version="0.1.1",
author="The AdapterHub team and community contributors",
author_email="[email protected]",
description="A Unified Library for Parameter-Efficient and Modular Transfer Learning",
2 changes: 1 addition & 1 deletion src/adapters/__init__.py
@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.1.0"
__version__ = "0.1.1"

from typing import TYPE_CHECKING

48 changes: 31 additions & 17 deletions src/adapters/heads/base.py
@@ -18,7 +18,13 @@
)
from transformers.utils import ModelOutput

from ..composition import AdapterCompositionBlock, BatchSplit, Parallel, parse_heads_from_composition
from ..composition import (
AdapterCompositionBlock,
BatchSplit,
Parallel,
adjust_tensors_for_parallel,
parse_heads_from_composition,
)
from ..context import AdapterSetup, ForwardContext
from ..loading import PredictionHeadLoader
from ..methods.modeling import Activation_Function_Class
@@ -105,6 +111,21 @@ def get_output_embeddings(self):
def get_label_names(self):
return ["labels"]

def _get_cls_output(self, outputs, **kwargs):
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
elif kwargs.get("get_cls_from_eos_tokens", False):
x = outputs[0] # last hidden state
eos_mask = kwargs.get("eos_mask")
(eos_mask,) = adjust_tensors_for_parallel(x, eos_mask)
if len(torch.unique(eos_mask.sum(1))) > 1:
raise ValueError("All examples must have the same number of <eos> tokens.")
cls_output = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
else:
cls_output = outputs[0][:, 0]

return cls_output


class ClassificationHead(PredictionHead):
def __init__(
@@ -134,10 +155,7 @@ def __init__(

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
cls_output = self._get_cls_output(outputs, **kwargs)
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -205,10 +223,7 @@ def __init__(

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
cls_output = self._get_cls_output(outputs, **kwargs)
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -271,10 +286,7 @@ def __init__(

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=None, **kwargs):
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
cls_output = self._get_cls_output(outputs, **kwargs)
logits = super().forward(cls_output)
logits = logits.view(-1, self.config["num_choices"])
loss = None
@@ -476,10 +488,7 @@ def __init__(

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
cls_output = self._get_cls_output(outputs, **kwargs)
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -800,6 +809,9 @@ def forward_head(
cls_output (torch.Tensor, optional): The classification output of the model.
attention_mask (torch.Tensor, optional): The attention mask of the model.
return_dict (bool): Whether or not to return a ``ModelOutput`` instead of a plain tuple.
get_cls_from_eos_tokens (bool):
If set to True, retrieve classifier token representations from the last <eos> token in the sequence.
Setting to True requires `eos_mask` to be passed as well.
**kwargs: Additional keyword arguments passed to the forward pass of the head.
"""
used_head_modules = self._get_used_heads(head_name)
@@ -846,10 +858,12 @@ def _get_head_input(outputs, cls_out, batch):
)
head_outputs = []
labels = kwargs.pop("labels", None)
eos_mask = kwargs.pop("eos_mask", None)
for i, head in enumerate(self.active_head):
head_module = self.heads[head]
batch_idx = range(sum(self.active_head.batch_sizes[:i]), sum(self.active_head.batch_sizes[: i + 1]))
kwargs["labels"] = labels[batch_idx] if labels is not None else None
kwargs["eos_mask"] = eos_mask[batch_idx] if eos_mask is not None else None
head_inputs, head_cls_input = _get_head_input(all_outputs, cls_output, batch_idx)
# head_attention = attention_mask[batch_idx] if attention_mask is not None else None
head_output = head_module(head_inputs, head_cls_input, attention_mask, return_dict, **kwargs)
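Note: the new `_get_cls_output` helper centralizes how prediction heads pick their classifier representation, including the eos-token pooling previously duplicated in the BART adapter model (see further below). The following is a minimal, self-contained sketch of that pooling when `get_cls_from_eos_tokens=True`; the shapes, token ids, and the `<eos>` id are made up for illustration only.

```
import torch

batch_size, seq_len, hidden = 2, 6, 4
x = torch.randn(batch_size, seq_len, hidden)      # last hidden state
input_ids = torch.tensor([[5, 8, 9, 2, 1, 1],
                          [7, 3, 2, 1, 1, 1]])    # assume 2 = <eos>, 1 = <pad>
eos_mask = input_ids.eq(2)                        # boolean mask of <eos> positions

# As in the diff: every example must contain the same number of <eos> tokens.
assert len(torch.unique(eos_mask.sum(1))) == 1

# Gather all <eos> positions, then keep the representation of the last one per example.
cls_output = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
print(cls_output.shape)  # torch.Size([2, 4])
```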
14 changes: 10 additions & 4 deletions src/adapters/methods/prefix_tuning.py
@@ -21,19 +21,20 @@ def __init__(
n_heads: int,
input_size: int,
config: PrefixTuningConfig,
n_embd_per_head: Optional[int] = None,
):
super().__init__()
self.n_layers = n_layers
self.n_heads = n_heads
self.input_size = input_size
self.n_embd_per_head = self.input_size // self.n_heads
self.n_embd_per_head = n_embd_per_head or self.input_size // self.n_heads
self.config = config

self.wte = nn.Embedding(self.config.prefix_length, self.input_size)
self.control_trans = nn.Sequential(
nn.Linear(self.input_size, self.config.bottleneck_size),
Activation_Function_Class(self.config.non_linearity.lower()),
nn.Linear(self.config.bottleneck_size, self.n_layers * 2 * self.input_size),
nn.Linear(self.config.bottleneck_size, self.n_layers * 2 * self.n_heads * self.n_embd_per_head),
)
self.dropout = nn.Dropout(self.config.dropout)

@@ -70,15 +71,18 @@ def __init__(
n_heads: int,
input_size: int,
config: PrefixTuningConfig,
n_embd_per_head: Optional[int] = None,
):
super().__init__()
self.n_layers = n_layers
self.n_heads = n_heads
self.input_size = input_size
self.n_embd_per_head = self.input_size // self.n_heads
self.n_embd_per_head = n_embd_per_head or self.input_size // self.n_heads
self.config = config

self.control_trans = nn.Parameter(torch.randn(self.config.prefix_length * self.n_layers * 2 * self.input_size))
self.control_trans = nn.Parameter(
torch.randn(self.config.prefix_length * self.n_layers * 2 * self.n_heads * self.n_embd_per_head)
)

self.dropout = nn.Dropout(self.config.dropout)

@@ -174,6 +178,7 @@ def confirm_prefix(self, prefix_name: str) -> bool:
"n_layers": location_config["count"],
"n_heads": location_config["n_heads"],
"input_size": location_config["input_size"],
"n_embd_per_head": location_config["n_embd_per_head"],
}
prefix_tuning = PrefixTuningGroup(module_configs, prefix_tuning_config)
prefix_tuning.train(self.training) # make sure training mode is consistent
@@ -319,6 +324,7 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
self.location_key,
n_heads=self.model_config.num_attention_heads,
input_size=self.model_config.hidden_size,
n_embd_per_head=getattr(self.model_config, "d_kv", None), # this is currently specific to T5-3B
)
self.prefixes[adapter_name] = prefix_id

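Note: the new optional `n_embd_per_head` argument lets prefix tuning size its key/value projections from the model config instead of assuming `hidden_size // num_heads`. A rough arithmetic sketch of the difference, using assumed T5-3B-like dimensions that are not taken from this diff:

```
# Assumed, illustrative dimensions for a model where the key/value size per head
# (d_kv) is not hidden_size // num_heads, as in T5-3B.
hidden_size, n_heads, d_kv, n_layers = 1024, 32, 128, 24

derived = hidden_size // n_heads         # 32 -> what the old code implicitly assumed
out_old = n_layers * 2 * hidden_size     # 49152 values per prefix token (old projection size)
out_new = n_layers * 2 * n_heads * d_kv  # 196608 -> matches the attention layers' key/value size

print(derived, out_old, out_new)
```

With the old derivation, the generated prefixes could not match the shape of key/value projections whenever `n_heads * d_kv != hidden_size`, which is why the commit threads `n_embd_per_head` (read from `d_kv` where available) through `PrefixTuning`, `FlatPrefixTuning`, and the prefix pool.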
5 changes: 1 addition & 4 deletions src/adapters/model_mixin.py
@@ -134,10 +134,7 @@ def _get_active_setup(self):
adapter_setup = self.adapters_config.active_setup
else:
adapter_setup = None
skip_adapters = adapter_setup is None or (
self.adapters_config.skip_layers is not None and self.layer_idx in self.adapters_config.skip_layers
)
if not skip_adapters and (len(adapter_setup.flatten()) > 0):
if adapter_setup is not None and (len(adapter_setup.flatten()) > 0):
return adapter_setup
else:
return None
15 changes: 3 additions & 12 deletions src/adapters/models/bart/adapter_model.py
@@ -10,7 +10,6 @@
)
from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward

from ...composition import adjust_tensors_for_parallel
from ...heads import (
ClassificationHead,
ModelWithFlexibleHeadsAdaptersMixin,
@@ -102,23 +101,15 @@ def forward(
)
# required e.g. for prompt tuning in all models
kwargs["context"] = context
# sequence classification based on last token in sequence
x = outputs[0] # last hidden state
if input_ids is not None and x.shape[1] == input_ids.shape[1]:
eos_mask = input_ids.eq(self.config.eos_token_id)
(eos_mask,) = adjust_tensors_for_parallel(x, eos_mask)
if len(torch.unique(eos_mask.sum(1))) > 1:
raise ValueError("All examples must have the same number of <eos> tokens.")
cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
else:
cls_representation = x

head_outputs = self.forward_head(
outputs,
head_name=head,
cls_output=cls_representation,
attention_mask=attention_mask,
return_dict=return_dict,
get_cls_from_eos_tokens=True,
# `get_cls_from_eos_tokens` requires passing eos mask
eos_mask=input_ids.eq(self.config.eos_token_id) if input_ids is not None else None,
**kwargs,
)

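Note: the eos-based pooling that used to live in this forward pass is now requested from the shared head code via `get_cls_from_eos_tokens=True` plus an `eos_mask`. A hedged usage sketch follows; the checkpoint and head name are arbitrary choices for illustration, not part of this commit, and the exact output fields may differ by head type.

```
from adapters import BartAdapterModel
from transformers import AutoTokenizer

# Arbitrary checkpoint and head name, chosen only for illustration.
model = BartAdapterModel.from_pretrained("facebook/bart-base")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model.add_classification_head("dummy_task", num_labels=2)

inputs = tokenizer("Adapters make fine-tuning cheap.", return_tensors="pt")
# Internally, the adapter model now passes get_cls_from_eos_tokens=True and the
# <eos> mask to forward_head, which pools the last <eos> token before classifying.
outputs = model(**inputs)
print(outputs.logits.shape)  # expected: torch.Size([1, 2])
```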
