Fix import error for Huggingface-hub version >=0.26.0 & Update Notebooks #750

Merged
168 changes: 114 additions & 54 deletions examples/pytorch/language-modeling/run_clm.py

Large diffs are not rendered by default.

193 changes: 136 additions & 57 deletions examples/pytorch/language-modeling/run_mlm.py

Large diffs are not rendered by default.

17 changes: 10 additions & 7 deletions examples/pytorch/multiple-choice/run_swag.py
@@ -45,11 +45,11 @@
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import PaddingStrategy, check_min_version
+from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.26.0")
check_min_version("4.44.0")

logger = logging.getLogger(__name__)
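
For context on the version bump above: check_min_version fails fast when the installed transformers is older than the pin. A minimal illustrative sketch of equivalent logic (not the library's actual implementation):

    # Illustrative sketch only -- a rough equivalent of transformers.utils.check_min_version.
    from packaging import version
    import transformers

    MIN_VERSION = "4.44.0"
    if version.parse(transformers.__version__) < version.parse(MIN_VERSION):
        raise ImportError(
            f"This example requires transformers>={MIN_VERSION}, "
            f"but {transformers.__version__} is installed."
        )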

@@ -94,9 +94,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
" should only be set to `True` for repositories you trust and in which you have read the code, as it"
" will execute code present on the Hub on your local machine."
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
)
},
)
@@ -239,6 +239,10 @@ def main():
else:
model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

+# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+# information sent is the one passed as arguments along with your Python/PyTorch versions.
+send_example_telemetry("run_swag", model_args, data_args)

# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -260,8 +264,7 @@
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training:"
f" {training_args.fp16}"
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

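
A side note on the send_example_telemetry calls added throughout these examples: they report only the example name plus the argument and Python/PyTorch version metadata, and — assuming the standard transformers environment switches — can be disabled up front:

    # Illustrative opt-out -- set before the example script runs.
    # DISABLE_TELEMETRY (and offline mode via HF_HUB_OFFLINE) are recognized by transformers.
    import os
    os.environ["DISABLE_TELEMETRY"] = "1"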
80 changes: 59 additions & 21 deletions examples/pytorch/question-answering/run_qa.py
@@ -21,6 +21,7 @@
import logging
import os
import sys
+import warnings
from dataclasses import dataclass, field
from typing import Optional

@@ -45,13 +46,13 @@
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.26.0")
check_min_version("4.44.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -81,12 +82,22 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
-use_auth_token: bool = field(
+token: str = field(
+default=None,
+metadata={
+"help": (
+"The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+"generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+)
+},
+)
+trust_remote_code: bool = field(
default=False,
metadata={
"help": (
-"Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-"with private models)."
+"Whether to trust the execution of code from datasets/models defined on the Hub."
+" This option should only be set to `True` for repositories you trust and in which you have read the"
+" code, as it will execute code present on the Hub on your local machine."
)
},
)
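
This hunk tracks the upstream rename of use_auth_token to token and adds the trust_remote_code opt-in. A hedged sketch of the resulting call pattern (model id and token value are placeholders, not from this PR):

    # Hypothetical usage of the renamed arguments:
    from transformers import AutoModelForQuestionAnswering

    model = AutoModelForQuestionAnswering.from_pretrained(
        "some-org/some-qa-model",   # placeholder model id
        token="hf_xxx",             # replaces the deprecated use_auth_token argument
        trust_remote_code=False,    # only set True for repositories you have read and trust
    )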
@@ -231,13 +242,21 @@ def main():
else:
model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

+# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+# information sent is the one passed as arguments along with your Python/PyTorch versions.
+send_example_telemetry("run_qa", model_args, data_args)

# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)

+if training_args.should_log:
+# The default of training_args.log_level is passive, so we set log level at info here to have that default.
+transformers.utils.logging.set_verbosity_info()

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
@@ -247,8 +266,8 @@

# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

@@ -285,7 +304,8 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
-use_auth_token=True if model_args.use_auth_token else None,
+token=model_args.token,
+trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
@@ -304,10 +324,10 @@
data_files=data_files,
field="data",
cache_dir=model_args.cache_dir,
-use_auth_token=True if model_args.use_auth_token else None,
+token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-# https://huggingface.co/docs/datasets/loading_datasets.html.
+# https://huggingface.co/docs/datasets/loading_datasets.

# Load pretrained model and tokenizer
#
@@ -318,25 +338,27 @@
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
-use_auth_token=True if model_args.use_auth_token else None,
+token=model_args.token,
+trust_remote_code=model_args.trust_remote_code,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=True,
revision=model_args.model_revision,
-use_auth_token=True if model_args.use_auth_token else None,
+token=model_args.token,
+trust_remote_code=model_args.trust_remote_code,
)
model = AutoModelForQuestionAnswering.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
-use_auth_token=True if model_args.use_auth_token else None,
+token=model_args.token,
+trust_remote_code=model_args.trust_remote_code,
)

# Convert the model into an adapter model
adapters.init(model)

# Tokenizer check: this script requires a fast tokenizer.
@@ -348,7 +370,7 @@
)

# Preprocessing the datasets.
-# Preprocessing is slighlty different for training and evaluation.
+# Preprocessing is slightly different for training and evaluation.
if training_args.do_train:
column_names = raw_datasets["train"].column_names
elif training_args.do_eval:
Expand All @@ -364,7 +386,7 @@ def main():

if data_args.max_seq_length > tokenizer.model_max_length:
logger.warning(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
@@ -404,7 +426,12 @@ def prepare_train_features(examples):
for i, offsets in enumerate(offset_mapping):
# We will label impossible answers with the index of the CLS token.
input_ids = tokenized_examples["input_ids"][i]
-cls_index = input_ids.index(tokenizer.cls_token_id)
+if tokenizer.cls_token_id in input_ids:
+cls_index = input_ids.index(tokenizer.cls_token_id)
+elif tokenizer.bos_token_id in input_ids:
+cls_index = input_ids.index(tokenizer.bos_token_id)
+else:
+cls_index = 0

# Grab the sequence corresponding to that example (to know what is the context and what is the question).
sequence_ids = tokenized_examples.sequence_ids(i)
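
The fallback added above matters for tokenizers that define no CLS token, as is typical for decoder-style models. An illustrative check (not part of the patch):

    # gpt2's tokenizer has no CLS token, so the old unconditional
    # input_ids.index(tokenizer.cls_token_id) would fail for such models.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    print(tok.cls_token_id)  # None -> the patched code falls back to bos_token_id, then 0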
@@ -589,21 +616,32 @@ def post_processing_function(examples, features, predictions, stage="eval"):
# Format the result to the format the metric expects.
if data_args.version_2_with_negative:
formatted_predictions = [
{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
{"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
]
else:
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()]

references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)
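
The str() casts align the ids with what the squad/squad_v2 metrics expect; the record shapes look roughly like this (field values are illustrative):

    # Illustrative shapes of the records handed to the squad metrics (ids as strings):
    predictions = [{"id": "abc123", "prediction_text": "Denver Broncos"}]
    references = [
        {"id": "abc123", "answers": {"text": ["Denver Broncos"], "answer_start": [177]}}
    ]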

metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")
if data_args.version_2_with_negative:
accepted_best_metrics = ("exact", "f1", "HasAns_exact", "HasAns_f1")
else:
accepted_best_metrics = ("exact_match", "f1")

if training_args.load_best_model_at_end and training_args.metric_for_best_model not in accepted_best_metrics:
warnings.warn(f"--metric_for_best_model should be set to one of {accepted_best_metrics}")

metric = evaluate.load(
"squad_v2" if data_args.version_2_with_negative else "squad", cache_dir=model_args.cache_dir
)

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
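
Since squad and squad_v2 report differently named keys, the new warning above guards the --metric_for_best_model flag. A hedged example of a consistent TrainingArguments combination (values illustrative, not from this PR):

    # Illustrative: "f1" is accepted by both the squad and squad_v2 metric variants.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="out",               # placeholder
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        eval_strategy="epoch",          # best-model selection needs periodic evaluation
        save_strategy="epoch",          # must match the evaluation cadence
    )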

# Setup adapters
setup_adapter_training(model, adapter_args, data_args.dataset_name or "squad")

# Initialize our Trainer
trainer_class = QuestionAnsweringAdapterTrainer if adapter_args.train_adapter else QuestionAnsweringTrainer
trainer = trainer_class(