Merge pull request #1328 from hanhainebula/master
fix bugs for embedder finetune
hanhainebula authored Jan 13, 2025
2 parents 808b6c8 + ddf9ada commit cef1595
Showing 16 changed files with 58 additions and 23 deletions.
2 changes: 2 additions & 0 deletions FlagEmbedding/abc/inference/AbsEmbedder.py
@@ -416,6 +416,8 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
             Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
         """
         if isinstance(results_list[0], torch.Tensor):
+            # move all tensors to the same device
+            results_list = [res.to(self.target_devices[0]) for res in results_list]
             return torch.cat(results_list, dim=0)
         elif isinstance(results_list[0], np.ndarray):
             return np.concatenate(results_list, axis=0)
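The AbsEmbedder change above addresses results gathered from multiple processes that may sit on different GPUs: torch.cat refuses to concatenate tensors that live on different devices. A minimal sketch of the failure mode and of the fix, using made-up tensors and device names rather than anything from the repository:

```python
import torch

# Hypothetical per-process embedding outputs. On a multi-GPU machine each
# worker would return its tensor on its own device (e.g. cuda:0 and cuda:1).
results_list = [torch.randn(4, 8), torch.randn(4, 8)]
if torch.cuda.device_count() >= 2:
    results_list = [results_list[0].to("cuda:0"), results_list[1].to("cuda:1")]
    # torch.cat(results_list, dim=0) would now raise:
    # RuntimeError: Expected all tensors to be on the same device ...

# The commit's fix: move every result to one target device before concatenating.
target_device = "cuda:0" if torch.cuda.is_available() else "cpu"
results_list = [res.to(target_device) for res in results_list]
merged = torch.cat(results_list, dim=0)
print(merged.shape)  # torch.Size([8, 8])
```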
16 changes: 11 additions & 5 deletions FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -129,13 +132,15 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -152,6 +157,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -173,5 +179,5 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:

     model.save_pretrained(os.path.join(output_dir, 'merged_model'))

-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
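The recurring change in this file (and in the ICL variant below) is to forward `trust_remote_code` to every `from_pretrained` call, so checkpoints that ship custom configuration, modeling, or tokenizer code can be loaded for finetuning. A minimal sketch of the pattern with a placeholder model id, not a checkpoint named anywhere in this commit:

```python
from transformers import AutoConfig, AutoModel, AutoTokenizer

model_name_or_path = "some-org/custom-embedder"  # placeholder id for illustration
trust_remote_code = True  # in the repo this comes from model_args.trust_remote_code

# transformers will not execute a repository's custom code unless the flag is
# passed explicitly, so it has to reach the config, model, and tokenizer loaders.
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
model = AutoModel.from_pretrained(model_name_or_path, config=config, trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
```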
3 changes: 2 additions & 1 deletion FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
@@ -41,7 +41,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )

         if tokenizer.pad_token is None:
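The hunk ends right where the runner checks for a missing pad token, which decoder-only tokenizers loaded with `add_eos_token=True` often lack. The fallback actually used by the runner is not shown in this diff; a common pattern, included here only as an assumption, is to reuse the EOS token:

```python
# Assumed handling of a tokenizer with no pad token; the repository may pick a
# different fallback than eos_token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
```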
10 changes: 7 additions & 3 deletions FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_dir: str,
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_dir: str,
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -152,6 +155,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -173,5 +177,5 @@ def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d

     model.save_pretrained(os.path.join(output_dir, 'merged_model'))

-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
3 changes: 2 additions & 1 deletion FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
@@ -45,7 +45,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )

         if tokenizer.pad_token is None:
8 changes: 8 additions & 0 deletions FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
@@ -142,6 +142,14 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
                 if "position_embeddings" in k:
                     logging.info(f"Freeze the parameters for {k}")
                     v.requires_grad = False
+
+        if self.training_args.fix_encoder:
+            for k, v in model.named_parameters():
+                if "colbert_linear" in k or 'sparse_linear' in k:
+                    logging.info(f"train the parameters for {k}")
+                else:
+                    v.requires_grad = False
+
         return tokenizer, model

     def load_trainer(self) -> EncoderOnlyEmbedderM3Trainer:
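The new `fix_encoder` branch freezes everything except the ColBERT and sparse projection heads, so only those small layers receive gradients. A self-contained sketch of the same selective freezing, using a toy module in place of the actual M3 embedder:

```python
import logging
import torch.nn as nn

# Toy stand-in for the M3 embedder: a backbone plus the two heads the commit
# keeps trainable.
model = nn.ModuleDict({
    "encoder": nn.Linear(768, 768),
    "colbert_linear": nn.Linear(768, 128),
    "sparse_linear": nn.Linear(768, 1),
})

fix_encoder = True  # in the repo this comes from training_args.fix_encoder
if fix_encoder:
    for name, param in model.named_parameters():
        if "colbert_linear" in name or "sparse_linear" in name:
            logging.info(f"train the parameters for {name}")
        else:
            param.requires_grad = False  # the backbone stays frozen

print([n for n, p in model.named_parameters() if p.requires_grad])
# ['colbert_linear.weight', 'colbert_linear.bias', 'sparse_linear.weight', 'sparse_linear.bias']
```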
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/embedder/decoder_only/base.py
@@ -257,7 +257,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
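This file and the remaining inference and reranker files all fix the same typo: the exception class is spelled `torch.OutOfMemoryError`, while `torch.OutofMemoryError` is not a real attribute, so evaluating that `except` clause would itself raise an `AttributeError` instead of shrinking the batch. A simplified sketch of the batch-size back-off these methods implement, with a hypothetical `try_encode` helper standing in for the real trial encoding (`torch.OutOfMemoryError` is available in recent PyTorch releases):

```python
import torch

def try_encode(texts, batch_size):
    """Hypothetical trial encoding used only to probe whether batch_size fits."""
    ...

def find_usable_batch_size(texts, batch_size=512):
    # Shrink the batch size by a factor of 3/4 until a trial pass succeeds,
    # mirroring the loop used throughout the inference code.
    flag = False
    while not flag:
        try:
            try_encode(texts[:batch_size], batch_size)
            flag = True
        except RuntimeError:
            batch_size = batch_size * 3 // 4
        except torch.OutOfMemoryError:  # fixed spelling; 'OutofMemoryError' does not exist on torch
            batch_size = batch_size * 3 // 4
    return batch_size
```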
4 changes: 2 additions & 2 deletions FlagEmbedding/inference/embedder/decoder_only/icl.py
@@ -409,7 +409,7 @@ def encode_queries_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
@@ -519,7 +519,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/embedder/encoder_only/base.py
@@ -238,7 +238,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/embedder/encoder_only/m3.py
@@ -406,7 +406,7 @@ def _process_colbert_vecs(colbert_vecs: np.ndarray, attention_mask: list):
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/reranker/decoder_only/base.py
@@ -412,7 +412,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         dataset, dataloader = None, None
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/reranker/decoder_only/layerwise.py
@@ -282,7 +282,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         dataset, dataloader = None, None
@@ -368,7 +368,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         all_scores = []
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/reranker/encoder_only/base.py
@@ -169,7 +169,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         all_scores = []
19 changes: 16 additions & 3 deletions examples/finetune/embedder/README.md
@@ -57,20 +57,33 @@ cd FlagEmbedding/scripts

 ```shell
 python hn_mine.py \
---model_name_or_path BAAI/bge-base-en-v1.5 \
 --input_file toy_finetune_data.jsonl \
 --output_file toy_finetune_data_minedHN.jsonl \
 --range_for_sampling 2-200 \
 --negative_number 15 \
---use_gpu_for_searching
+--use_gpu_for_searching \
+--embedder_name_or_path BAAI/bge-base-en-v1.5
 ```

 - **`input_file`**: JSON data for finetuning. This script retrieves the top-k documents for each query and randomly samples negatives from those top-k documents (excluding the positive documents).
 - **`output_file`**: path to save the JSON data with mined hard negatives for finetuning
 - **`negative_number`**: the number of sampled negatives
 - **`range_for_sampling`**: the range to sample negatives from. For example, `2-100` means sampling `negative_number` negatives from the top-2 to top-100 documents. **You can set a larger range to reduce the difficulty of the negatives (e.g., set it to `60-300` to sample negatives from the top-60 to top-300 passages)**
-- **`candidate_pool`**: the pool to retrieve from. The default value is None, in which case this script retrieves from the combination of all `neg` in `input_file`. The format of this file is the same as [pretrain data](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/pretrain#2-data-format). If a candidate_pool is given, this script retrieves negatives from that file.
+- **`candidate_pool`**: the pool to retrieve from. The default value is None, in which case this script retrieves from the combination of all `neg` in `input_file`. If provided, it should be a JSONL file in which each line is a dict with a key `text` (see the sketch after this list). If a candidate_pool is given, this script retrieves negatives from that file.
 - **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
+- **`search_batch_size`**: batch size for searching. Default is 64.
+- **`embedder_name_or_path`**: the name or path of the embedder.
+- **`embedder_model_class`**: class of the model used for embedding (current options are 'encoder-only-base', 'encoder-only-m3', 'decoder-only-base', and 'decoder-only-icl'). Default is None. For a custom model, you should set this argument.
+- **`normalize_embeddings`**: set to `True` to normalize embeddings.
+- **`pooling_method`**: the pooling method for the embedder.
+- **`use_fp16`**: use FP16 precision for inference.
+- **`devices`**: list of devices used for inference.
+- **`query_instruction_for_retrieval`**, **`query_instruction_format_for_retrieval`**: instruction and instruction format for queries during retrieval.
+- **`examples_for_task`**, **`examples_instruction_format`**: example tasks and their instruction format. These are only used when `embedder_model_class` is set to `decoder-only-icl`.
+- **`trust_remote_code`**: set to `True` to trust remote code execution.
+- **`cache_dir`**: cache directory for models.
+- **`embedder_batch_size`**: batch size for the embedder.
+- **`embedder_query_max_length`**, **`embedder_passage_max_length`**: maximum lengths for embedding queries and passages.
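For illustration, a hedged sketch of the two file layouts referenced above: each line of the finetuning `input_file` is a JSON object with `query`, `pos`, and `neg` fields, and each line of a `candidate_pool` file is a JSON object with a `text` field. The concrete strings and file names below are invented for the example:

```python
import json

# Invented records, purely to show the expected shapes of the two JSONL files.
finetune_example = {
    "query": "what is the capital of france?",
    "pos": ["Paris is the capital and largest city of France."],
    "neg": ["Berlin is the capital of Germany."],
}
candidate_example = {"text": "Lyon is a large city in southeastern France."}

with open("toy_finetune_data.jsonl", "w") as f:
    f.write(json.dumps(finetune_example) + "\n")

with open("toy_candidate_pool.jsonl", "w") as f:
    f.write(json.dumps(candidate_example) + "\n")
```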

### Teacher Scores

2 changes: 1 addition & 1 deletion scripts/README.md
@@ -29,7 +29,7 @@ python hn_mine.py \
 - **`output_file`**: path to save the JSON data with mined hard negatives for finetuning
 - **`negative_number`**: the number of sampled negatives
 - **`range_for_sampling`**: the range to sample negatives from. For example, `2-100` means sampling `negative_number` negatives from the top-2 to top-100 documents. **You can set a larger range to reduce the difficulty of the negatives (e.g., set it to `60-300` to sample negatives from the top-60 to top-300 passages)**
-- **`candidate_pool`**: the pool to retrieve from. The default value is None, in which case this script retrieves from the combination of all `neg` in `input_file`. The format of this file is the same as [pretrain data](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/pretrain#2-data-format). If a candidate_pool is given, this script retrieves negatives from that file.
+- **`candidate_pool`**: the pool to retrieve from. The default value is None, in which case this script retrieves from the combination of all `neg` in `input_file`. If provided, it should be a JSONL file in which each line is a dict with a key `text`. If a candidate_pool is given, this script retrieves negatives from that file.
 - **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
 - **`search_batch_size`**: batch size for searching. Default is 64.
 - **`embedder_name_or_path`**: the name or path of the embedder.
