Refactoring parameter class #133

Open: wants to merge 13 commits into main
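Throughout the diff below, dictionary-style lookups such as `parameters["num_samples"]` are replaced with attribute access such as `parameters.num_samples`. The `Parameters` class itself is not shown in this diff, so purely as an illustration (the field names are taken from the diff, but the defaults and the dataclass mechanism are assumptions, not the project's actual definition), an attribute-based parameter container might look like:

from dataclasses import dataclass

# Hypothetical sketch only: field names come from the diff, default values are invented.
@dataclass
class Parameters:
    num_samples: int = 10
    batch_size: int = 16
    eval_batch_size: int = 32
    num_epochs_afresh: int = 3
    num_epochs_update: int = 1
    learning_rate: float = 5e-5
    num_warmup_steps: int = 0
    eval_every: int = 1
    test_every: int = -1
    train_dataset_size: int = 1000
    cuda_device: str = "cuda:0"
    dev_mode: bool = False

parameters = Parameters(num_samples=5)
print(parameters.num_samples)  # attribute access instead of parameters["num_samples"]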
1 change: 1 addition & 0 deletions .gitignore
@@ -145,3 +145,4 @@ scripts/tapt/output_dir/*

# The dev folder
dev/*
/scripts/charlies_wip/
6 changes: 3 additions & 3 deletions al_llm/acquisition_function.py
@@ -33,7 +33,7 @@ def select(
The list of sentences from which to sample
num_samples : int, default=-1
The number of samples to select. The default value of -1 means
- that `parameters["num_samples"]` is used.
+ that `parameters.num_samples` is used.

Returns
-------
@@ -48,11 +48,11 @@ def _get_validated_num_samples(
) -> int:
"""Determine and validate the number of samples to take

- The value of -1 means that `parameters["num_samples"]` is used.
+ The value of -1 means that `parameters.num_samples` is used.
"""

if num_samples == -1:
- num_samples = self.parameters["num_samples"]
+ num_samples = self.parameters.num_samples

if num_samples > len(sample_pool):
raise ValueError(
73 changes: 41 additions & 32 deletions al_llm/classifier.py
@@ -397,7 +397,7 @@ def __init__(
self._model = None

# set device
- self.device = torch.device(self.parameters["cuda_device"])
+ self.device = torch.device(self.parameters.cuda_device)

def train_afresh(
self,
@@ -408,7 +408,7 @@ def train_afresh(
):
# If we're refreshing every iteration, and the most recent set of
# samples were all skipped, then don't do any training
- if new_tokenized_samples is not None and self.parameters["refresh_every"] == 1:
+ if new_tokenized_samples is not None and self.parameters.refresh_every == 1:
new_tokenized_samples = new_tokenized_samples.filter(
lambda x: x[SKIPS_COLUMN_NAME] == 0
)
@@ -432,11 +432,11 @@ def train_afresh(

# create a dataloader for the train dataset
train_dataloader = DataLoader(
- tokenized_train, shuffle=True, batch_size=self.parameters["batch_size"]
+ tokenized_train, shuffle=True, batch_size=self.parameters.batch_size
)

# Run the training loop
- self._train(train_dataloader, self.parameters["num_epochs_afresh"], iteration)
+ self._train(train_dataloader, self.parameters.num_epochs_afresh, iteration)

def train_update(
self,
@@ -463,11 +463,11 @@ def train_update(

# Make a sample loader from the latest batch of labelled samples
samples_dataloader = DataLoader(
- tokenized_samples, shuffle=True, batch_size=self.parameters["batch_size"]
+ tokenized_samples, shuffle=True, batch_size=self.parameters.batch_size
)

# Run the training loop
- self._train(samples_dataloader, self.parameters["num_epochs_update"], iteration)
+ self._train(samples_dataloader, self.parameters.num_epochs_update, iteration)

def _initialise(self):
pass
@@ -484,7 +484,7 @@ def _load_fresh_model(self):
models = []

# Load fresh versions of the model
- for i in range(self.parameters["num_classifier_models"]):
+ for i in range(self.parameters.num_classifier_models):
models.append(
AutoModelForSequenceClassification.from_pretrained(
self.MODEL_NAME, num_labels=len(self.dataset_container.CATEGORIES)
@@ -521,26 +521,26 @@ def _setup_model(self):

def _train(self, train_dataloader: DataLoader, num_epochs: int, iteration: int):
# create an optimizer for the model
- optimizer = AdamW(self._model.parameters(), lr=self.parameters["learning_rate"])
+ optimizer = AdamW(self._model.parameters(), lr=self.parameters.learning_rate)

# The eval dataloader
eval_dataloader = DataLoader(
self.dataset_container.tokenized_validation,
- batch_size=self.parameters["eval_batch_size"],
+ batch_size=self.parameters.eval_batch_size,
)

# The test dataloader
test_dataloader = DataLoader(
self.dataset_container.tokenized_test,
- batch_size=self.parameters["eval_batch_size"],
+ batch_size=self.parameters.eval_batch_size,
)

# create a learning rate scheduler
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
name="linear",
optimizer=optimizer,
- num_warmup_steps=self.parameters["num_warmup_steps"],
+ num_warmup_steps=self.parameters.num_warmup_steps,
num_training_steps=num_training_steps,
)

@@ -570,24 +570,33 @@ def _train(self, train_dataloader: DataLoader, num_epochs: int, iteration: int):
("eval", eval_dataloader),
("test", test_dataloader),
]:
- # If the eval loop should run this epoch, or if it is the last epoch
- run_eval = (
-     self.parameters[f"{split}_every"] > 0
-     and (epoch + 1) % self.parameters[f"{split}_every"] == 0
- )
- run_eval = run_eval or (
-     self.parameters[f"{split}_every"] >= 0 and epoch == num_epochs - 1
- )
+ if split == "eval":
+     eval_frequency = self.parameters.eval_every
+ elif split == "test":
+     eval_frequency = self.parameters.test_every
+ else:
+     raise ValueError
Member: This case is impossible. Can you remove it? It confused me at first

- if run_eval:
-     # Run the evaluation loop, obtaining the metrics
-     print(f"- Running {split} loop")
-     eval_metrics = self._eval_epoch(dataloader)
-     print(
-         f"{split.capitalize()} mean loss: {eval_metrics['loss']:.8}; "
-         f"{split.capitalize()} f1: {eval_metrics['f1']:.6}"
-     )
-     results_to_log[split] = eval_metrics
+ # if eval_frequency == -1, we should never evaluate the model
+ if eval_frequency < 0:
+     assert eval_frequency == -1
+     pass
Member: If you replace this with continue, you can remove the else: and so remove one level of indent
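Purely as a sketch of that suggestion (this is not the PR's code: the dataloaders and metric logging are omitted and the epoch count and frequencies are invented), the `continue` shape removes one nesting level like so:

# Self-contained sketch of the reviewer's `continue` suggestion.
# -1 means the split is never evaluated; 0 would mean only on the last epoch,
# matching the semantics in the diff above.
num_epochs = 4
frequencies = {"eval": 1, "test": -1}

for epoch in range(num_epochs):
    for split, eval_frequency in frequencies.items():
        if eval_frequency < 0:
            continue  # skip this split early instead of nesting under else:

        regular_eval_is_due = eval_frequency > 0 and (epoch + 1) % eval_frequency == 0
        is_last_epoch = epoch == num_epochs - 1
        if regular_eval_is_due or is_last_epoch:
            print(f"- Running {split} loop in epoch {epoch}")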

Member: Also I think that any validation should be done in the Parameters class, rather than here with assert statements
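Again only as an illustration of that idea (the real Parameters class is not shown in this PR, so the fields and the dataclass/__post_init__ mechanism here are assumptions), validation could live in the parameter container itself:

from dataclasses import dataclass

@dataclass
class Parameters:
    # Hypothetical fields; only the eval_every/test_every check is sketched here.
    eval_every: int = 1
    test_every: int = -1

    def __post_init__(self):
        # -1 (never), 0 (only last epoch) and positive frequencies are allowed
        for name in ("eval_every", "test_every"):
            value = getattr(self, name)
            if value < -1:
                raise ValueError(f"{name} must be >= -1, got {value}")

Parameters(eval_every=2)      # fine
# Parameters(test_every=-3)   # would raise ValueError at construction time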

+ else:
+     if eval_frequency == 0:
+         regular_eval_is_due = False
+     else:
+         regular_eval_is_due = (epoch + 1) % eval_frequency == 0
+     is_last_epoch = epoch == num_epochs - 1
+
+     if regular_eval_is_due or is_last_epoch:
+         # Run the evaluation loop, obtaining the metrics
+         print(f"- Running {split} loop")
+         eval_metrics = self._eval_epoch(dataloader)
+         print(
+             f"{split.capitalize()} mean loss:"
+             f" {eval_metrics['loss']:.8}; "
+             f"{split.capitalize()} f1: {eval_metrics['f1']:.6}"
+         )
SamAdamDay marked this conversation as resolved.

# Record the metrics with W&B
self.wandb_run.log(results_to_log)
@@ -798,7 +807,7 @@ def calculate_uncertainties_tokenized(
num_samples = tokenized_samples.shape[0]

# Store the batch size with a shorter variable name
- batch_size = self.parameters["eval_batch_size"]
+ batch_size = self.parameters.eval_batch_size

# Make a PyTorch dataloader for the samples
samples_dataloader = DataLoader(tokenized_samples, batch_size=batch_size)
@@ -894,11 +903,11 @@ def _load_fresh_model(self):
models, training_args = load_tapted_model(
self.wandb_run,
self.MODEL_NAME,
- self.parameters["dataset_name"],
+ self.parameters.dataset_name,
"classifier",
num_categories=len(self.dataset_container.CATEGORIES),
- tapted_model_version=self.parameters["tapted_model_version"],
- num_models=self.parameters["num_classifier_models"],
+ tapted_model_version=self.parameters.tapted_model_version,
+ num_models=self.parameters.num_classifier_models,
)
self._model = HuggingFaceClassifierEnsemble(models)
self.training_parameters = training_args
12 changes: 6 additions & 6 deletions al_llm/data_handler.py
@@ -85,7 +85,7 @@ def get_latest_tokenized_datapoints(
# return the last `num_samples` entries from `tokenized_train`
# (because adding items puts them at the end of the dataset)
samples_dict = self.dataset_container.tokenized_train[
- -self.parameters["num_samples"] :
+ -self.parameters.num_samples :
]
tokenized_samples = datasets.Dataset.from_dict(samples_dict)
tokenized_samples.set_format("torch")
@@ -129,7 +129,7 @@ def save(self, unlabelled_samples: UnlabelledSamples):
# get all datapoints from dataset_train after `train_dataset_size`,
# i.e. only data added by AL process
added_data = self.dataset_container.dataset_train[
- self.parameters["train_dataset_size"] :
+ self.parameters.train_dataset_size :
]

# add the samples in `unlabelled_samples` (if there are any)
@@ -186,8 +186,8 @@ def get_replay_samples(self, iteration: int) -> UnlabelledSamples:
The list of sentences selected
"""

- start = iteration * self.parameters["num_samples"]
- end = (iteration + 1) * self.parameters["num_samples"]
+ start = iteration * self.parameters.num_samples
+ end = (iteration + 1) * self.parameters.num_samples
return UnlabelledSamples(
self.replay_dataset_extension[TEXT_COLUMN_NAME][start:end]
)
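For illustration only (the value of num_samples below is invented), the iteration arithmetic above selects consecutive, non-overlapping windows of the replay extension:

# Toy example of the start/end arithmetic, assuming parameters.num_samples == 8
num_samples = 8
for iteration in range(3):
    start = iteration * num_samples
    end = (iteration + 1) * num_samples
    print(iteration, list(range(start, end)))  # iteration 1 -> items 8..15, etc.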
@@ -211,8 +211,8 @@ def get_replay_prompt_output(self, iteration: int) -> PromptOutput:
"""

# Get the labels and ambiguities
- start = iteration * self.parameters["num_samples"]
- end = (iteration + 1) * self.parameters["num_samples"]
+ start = iteration * self.parameters.num_samples
+ end = (iteration + 1) * self.parameters.num_samples
labels = self.replay_dataset_extension[LABEL_COLUMN_NAME][start:end]
ambiguities = self.replay_dataset_extension[AMBIGUITIES_COLUMN_NAME][start:end]

36 changes: 19 additions & 17 deletions al_llm/dataset_container.py
@@ -37,7 +37,7 @@ class DatasetContainer(ABC):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.
@@ -60,6 +60,7 @@ class DatasetContainer(ABC):
"""

CATEGORIES = OrderedDict()
TOKENIZED_LENGTH_UPPER_QUARTILE = 0

def __init__(self, parameters: Parameters):
self.parameters = parameters
@@ -168,7 +169,7 @@ def _train_remainder_split(
) -> Tuple[datasets.Dataset, datasets.Dataset]:
"""Split a dataset 'train' split into a train and remainder dataset

- We select `parameters["train_dataset_size"]` datapoints and set them
+ We select `parameters.train_dataset_size` datapoints and set them
as a train dataset, the rest going to a remainder dataset.

Parameters
@@ -179,13 +180,13 @@
Returns
-------
train_dataset : datasets.Dataset
- A train dataset of size at most `parameters["train_dataset_size"]`,
+ A train dataset of size at most `parameters.train_dataset_size`,
selected from `train_split`.
remainder_dataset : datasets.Dataset
The remainder of the train split.
"""

if len(train_split) < self.parameters["train_dataset_size"]:
if len(train_split) < self.parameters.train_dataset_size:
raise ValueError(
f"Train split must be larger than train dataset size (currently"
f" {len(train_split)} < {self.parameters['train_dataset_size']})"
Expand All @@ -196,9 +197,9 @@ def _train_remainder_split(
train_split = train_split.shuffle(seed=seed)

# Select the train and remainder datasets
- train_range = range(self.parameters["train_dataset_size"])
+ train_range = range(self.parameters.train_dataset_size)
train_dataset = train_split.select(train_range)
- remainder_range = range(self.parameters["train_dataset_size"], len(train_split))
+ remainder_range = range(self.parameters.train_dataset_size, len(train_split))
remainder_dataset = train_split.select(remainder_range)

return train_dataset, remainder_dataset
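As a rough standalone sketch of the split described in the docstring above (the toy dataset and the train size of 3 are invented, and this mirrors rather than calls the method body):

# Standalone illustration of the train/remainder split
import datasets

train_split = datasets.Dataset.from_dict({"text": [f"sentence {i}" for i in range(10)]})
train_split = train_split.shuffle(seed=0)

train_dataset_size = 3  # stands in for parameters.train_dataset_size
train_dataset = train_split.select(range(train_dataset_size))
remainder_dataset = train_split.select(range(train_dataset_size, len(train_split)))

print(len(train_dataset), len(remainder_dataset))  # 3 7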
@@ -264,7 +265,7 @@ def _preprocess_dataset(self):
"""Do any preprocessing on the dataset, just after it is loaded"""

# If we're in dev mode, limit the size of the datasets significantly
- if self.parameters["dev_mode"]:
+ if self.parameters.dev_mode:
train_slice_size = min(20, len(self.dataset_train))
self.dataset_train = self.dataset_train.select(range(train_slice_size))
validation_slice_size = min(20, len(self.dataset_validation))
@@ -351,7 +352,7 @@ class DummyDatasetContainer(DatasetContainer):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.
@@ -378,24 +379,25 @@ class DummyDatasetContainer(DatasetContainer):
REMAINDER_SIZE = 10
VALIDATION_SIZE = 20
TEST_SIZE = 50
TOKENIZED_LENGTH_UPPER_QUARTILE = 0

def __init__(self, parameters: Parameters):
super().__init__(parameters)

# Generate some training sentences
- sentence_generator = FakeSentenceGenerator(parameters["seed"])
+ sentence_generator = FakeSentenceGenerator(parameters.seed)
train_sentences = sentence_generator.generate(
- parameters["train_dataset_size"] + self.REMAINDER_SIZE
+ parameters.train_dataset_size + self.REMAINDER_SIZE
)
validation_sentences = sentence_generator.generate(self.VALIDATION_SIZE)
test_sentences = sentence_generator.generate(self.TEST_SIZE)

# Generate the class labels
label_generator = FakeLabelGenerator(
- list(self.categories.keys()), parameters["seed"]
+ list(self.categories.keys()), parameters.seed
)
train_labels = label_generator.generate(
- parameters["train_dataset_size"] + self.REMAINDER_SIZE
+ parameters.train_dataset_size + self.REMAINDER_SIZE
)
validation_labels = label_generator.generate(self.VALIDATION_SIZE)
test_labels = label_generator.generate(self.TEST_SIZE)
@@ -462,7 +464,7 @@ class HuggingFaceDatasetContainer(DatasetContainer, ABC):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.
@@ -530,7 +532,7 @@ class RottenTomatoesDatasetContainer(HuggingFaceDatasetContainer):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.
@@ -598,7 +600,7 @@ class WikiToxicDatasetContainer(HuggingFaceDatasetContainer):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.
@@ -718,7 +720,7 @@ class PubMed20kRCTDatasetContainer(HuggingFaceDatasetContainer):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.
@@ -801,7 +803,7 @@ class Trec6DatasetContainer(HuggingFaceDatasetContainer):
dataset_train : datasets.Dataset
The raw dataset consisting of labelled sentences used for training, as
a Hugging Face Dataset. This is separated from the 'train' split of
- the dataset by selecting `parameters["train_dataset_size"]` datapoints.
+ the dataset by selecting `parameters.train_dataset_size` datapoints.
dataset_remainder : datasets.Dataset
The remainder of the 'train' split after `dataset_train` has been
selected. Used by the pool-based simulator.