diff --git a/FlagEmbedding/abc/inference/AbsEmbedder.py b/FlagEmbedding/abc/inference/AbsEmbedder.py
index 1eff6c5e..797505d8 100644
--- a/FlagEmbedding/abc/inference/AbsEmbedder.py
+++ b/FlagEmbedding/abc/inference/AbsEmbedder.py
@@ -416,6 +416,8 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
             Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
         """
         if isinstance(results_list[0], torch.Tensor):
+            # move all tensors to the same device
+            results_list = [res.to(self.target_devices[0]) for res in results_list]
             return torch.cat(results_list, dim=0)
         elif isinstance(results_list[0], np.ndarray):
             return np.concatenate(results_list, axis=0)
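Note on the hunk above: in multi-process encoding each worker can return its chunk on a different GPU, and `torch.cat` refuses to concatenate tensors that live on different devices, so everything is first moved to `target_devices[0]`. A minimal standalone sketch of the failure mode and the fix, assuming two visible CUDA devices (the shapes are illustrative, not from the library):

```python
import torch

# Per-worker results, each on its own device (assumes >= 2 CUDA devices).
results_list = [
    torch.randn(4, 8, device="cuda:0"),
    torch.randn(4, 8, device="cuda:1"),
]

# torch.cat(results_list, dim=0) would raise a RuntimeError here because the
# tensors are on different devices. Moving them to one device first, as the
# patch does with self.target_devices[0], makes the concatenation valid.
target_device = results_list[0].device
merged = torch.cat([res.to(target_device) for res in results_list], dim=0)
print(merged.shape)  # torch.Size([8, 8])
```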
diff --git a/FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py b/FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py
index 6b1ba5ec..5156904b 100644
--- a/FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py
+++ b/FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -129,13 +132,15 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -152,6 +157,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -173,5 +179,5 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
 
     model.save_pretrained(os.path.join(output_dir, 'merged_model'))
 
-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
diff --git a/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py b/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
index aaaf0abc..2c00a2f0 100644
--- a/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
+++ b/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
@@ -41,7 +41,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )
 
         if tokenizer.pad_token is None:
diff --git a/FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py b/FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py
index f5178104..abff1553 100644
--- a/FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py
+++ b/FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_dir: str,
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_dir: str,
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -152,6 +155,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -173,5 +177,5 @@ def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d
 
     model.save_pretrained(os.path.join(output_dir, 'merged_model'))
 
-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
diff --git a/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py b/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
index 4cfe1dc9..bb406a96 100644
--- a/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
+++ b/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
@@ -45,7 +45,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )
 
         if tokenizer.pad_token is None:
diff --git a/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py b/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
index b3ecfba2..4f17ad20 100644
--- a/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
+++ b/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
@@ -142,6 +142,14 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
                 if "position_embeddings" in k:
                     logging.info(f"Freeze the parameters for {k}")
                     v.requires_grad = False
+
+        if self.training_args.fix_encoder:
+            for k, v in model.named_parameters():
+                if "colbert_linear" in k or 'sparse_linear' in k:
+                    logging.info(f"train the parameters for {k}")
+                else:
+                    v.requires_grad = False
+
         return tokenizer, model
 
     def load_trainer(self) -> EncoderOnlyEmbedderM3Trainer:
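The new `fix_encoder` branch above freezes every parameter except the ColBERT and sparse projection heads, so only `colbert_linear` and `sparse_linear` receive gradient updates. The same selective-freeze pattern in isolation, using a stand-in two-head model rather than the real M3 architecture:

```python
import logging

import torch.nn as nn

# Stand-in model: an "encoder" plus two heads whose names mirror the
# colbert_linear / sparse_linear substrings the patch matches on.
model = nn.ModuleDict({
    "encoder": nn.Linear(768, 768),
    "colbert_linear": nn.Linear(768, 128),
    "sparse_linear": nn.Linear(768, 1),
})

fix_encoder = True  # stands in for self.training_args.fix_encoder
if fix_encoder:
    for name, param in model.named_parameters():
        if "colbert_linear" in name or "sparse_linear" in name:
            logging.info(f"train the parameters for {name}")
        else:
            param.requires_grad = False

# Only the two heads remain trainable.
print([n for n, p in model.named_parameters() if p.requires_grad])
```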
diff --git a/FlagEmbedding/inference/embedder/decoder_only/base.py b/FlagEmbedding/inference/embedder/decoder_only/base.py
index 2912ca05..da6838a6 100644
--- a/FlagEmbedding/inference/embedder/decoder_only/base.py
+++ b/FlagEmbedding/inference/embedder/decoder_only/base.py
@@ -257,7 +257,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         # encode
diff --git a/FlagEmbedding/inference/embedder/decoder_only/icl.py b/FlagEmbedding/inference/embedder/decoder_only/icl.py
index 1206b8a4..ac09e403 100644
--- a/FlagEmbedding/inference/embedder/decoder_only/icl.py
+++ b/FlagEmbedding/inference/embedder/decoder_only/icl.py
@@ -409,7 +409,7 @@ def encode_queries_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         # encode
@@ -519,7 +519,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         # encode
diff --git a/FlagEmbedding/inference/embedder/encoder_only/base.py b/FlagEmbedding/inference/embedder/encoder_only/base.py
index 6b27ec14..c97d0ea4 100644
--- a/FlagEmbedding/inference/embedder/encoder_only/base.py
+++ b/FlagEmbedding/inference/embedder/encoder_only/base.py
@@ -238,7 +238,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         # encode
diff --git a/FlagEmbedding/inference/embedder/encoder_only/m3.py b/FlagEmbedding/inference/embedder/encoder_only/m3.py
index 42c207d7..cc4fad7a 100644
--- a/FlagEmbedding/inference/embedder/encoder_only/m3.py
+++ b/FlagEmbedding/inference/embedder/encoder_only/m3.py
@@ -406,7 +406,7 @@ def _process_colbert_vecs(colbert_vecs: np.ndarray, attention_mask: list):
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         # encode
diff --git a/FlagEmbedding/inference/reranker/decoder_only/base.py b/FlagEmbedding/inference/reranker/decoder_only/base.py
index b50a697e..2b3e7827 100644
--- a/FlagEmbedding/inference/reranker/decoder_only/base.py
+++ b/FlagEmbedding/inference/reranker/decoder_only/base.py
@@ -412,7 +412,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         dataset, dataloader = None, None
diff --git a/FlagEmbedding/inference/reranker/decoder_only/layerwise.py b/FlagEmbedding/inference/reranker/decoder_only/layerwise.py
index 9a3e2301..a855e677 100644
--- a/FlagEmbedding/inference/reranker/decoder_only/layerwise.py
+++ b/FlagEmbedding/inference/reranker/decoder_only/layerwise.py
@@ -282,7 +282,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         dataset, dataloader = None, None
diff --git a/FlagEmbedding/inference/reranker/decoder_only/lightweight.py b/FlagEmbedding/inference/reranker/decoder_only/lightweight.py
index 722b1aa6..a70837da 100644
--- a/FlagEmbedding/inference/reranker/decoder_only/lightweight.py
+++ b/FlagEmbedding/inference/reranker/decoder_only/lightweight.py
@@ -368,7 +368,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         all_scores = []
diff --git a/FlagEmbedding/inference/reranker/encoder_only/base.py b/FlagEmbedding/inference/reranker/encoder_only/base.py
index 9af0b2d8..d075f39d 100644
--- a/FlagEmbedding/inference/reranker/encoder_only/base.py
+++ b/FlagEmbedding/inference/reranker/encoder_only/base.py
@@ -169,7 +169,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4
 
         all_scores = []
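All of the hunks above fix the same typo in the adaptive batch-size loop: `torch.OutofMemoryError` does not exist, so when any non-`RuntimeError` exception reached that `except` clause, the attribute lookup itself raised an `AttributeError` instead of shrinking the batch. (`torch.OutOfMemoryError` is the correct name in recent PyTorch releases, where CUDA OOM no longer derives from `RuntimeError`; on older versions OOM surfaces as a `RuntimeError` and is caught by the first clause.) The retry pattern in isolation, with a hypothetical `encode_batch` callable standing in for the library's trial encoding:

```python
import torch

def find_usable_batch_size(sentences, batch_size, encode_batch):
    """Shrink batch_size by a factor of 3/4 until a trial batch fits.

    encode_batch is a hypothetical callable; the flag/except structure
    mirrors the loops patched above. Requires a PyTorch version that
    defines torch.OutOfMemoryError.
    """
    flag = False
    while not flag:
        try:
            encode_batch(sentences[:batch_size])
            flag = True
        except RuntimeError:
            batch_size = batch_size * 3 // 4
        except torch.OutOfMemoryError:  # 'OutofMemoryError' would raise AttributeError
            batch_size = batch_size * 3 // 4
    return batch_size
```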
diff --git a/examples/finetune/embedder/README.md b/examples/finetune/embedder/README.md
index 6e578249..c9e141aa 100644
--- a/examples/finetune/embedder/README.md
+++ b/examples/finetune/embedder/README.md
@@ -57,20 +57,33 @@ cd FlagEmbedding/scripts
 ```shell
 python hn_mine.py \
---model_name_or_path BAAI/bge-base-en-v1.5 \
 --input_file toy_finetune_data.jsonl \
 --output_file toy_finetune_data_minedHN.jsonl \
 --range_for_sampling 2-200 \
 --negative_number 15 \
---use_gpu_for_searching
+--use_gpu_for_searching \
+--embedder_name_or_path BAAI/bge-base-en-v1.5
 ```
 
 - **`input_file`**: json data for finetuning. This script will retrieve the top-k documents for each query, and randomly sample negatives from the top-k documents (not including the positive documents).
 - **`output_file`**: path to save the JSON data with mined hard negatives for finetuning
 - **`negative_number`**: the number of sampled negatives
 - **`range_for_sampling`**: where to sample negatives. For example, `2-200` means sampling `negative_number` negatives from the top2-top200 documents. **You can set a larger value to reduce the difficulty of the negatives (e.g., set it to `60-300` to sample negatives from the top60-top300 passages)**
-- **`candidate_pool`**: The pool to retrieval. The default value is None, and this script will retrieve from the combination of all `neg` in `input_file`. The format of this file is the same as [pretrain data](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/pretrain#2-data-format). If input a candidate_pool, this script will retrieve negatives from this file.
+- **`candidate_pool`**: The pool to retrieve from. The default value is None, and this script will retrieve from the combination of all `neg` in `input_file`. If provided, it should be a jsonl file where each line is a dict with a key `text`. If a candidate_pool is given, this script will retrieve negatives from this file.
 - **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
+- **`search_batch_size`**: batch size for searching. Default is 64.
+- **`embedder_name_or_path`**: The name or path to the embedder.
+- **`embedder_model_class`**: Class of the model used for embedding (current options include 'encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl'). Default is None. For a custom model, you should set this argument.
+- **`normalize_embeddings`**: Set to `True` to normalize embeddings.
+- **`pooling_method`**: The pooling method for the embedder.
+- **`use_fp16`**: Use FP16 precision for inference.
+- **`devices`**: List of devices used for inference.
+- **`query_instruction_for_retrieval`**, **`query_instruction_format_for_retrieval`**: Instruction and format for queries during retrieval.
+- **`examples_for_task`**, **`examples_instruction_format`**: Example tasks and their instruction format. This is only used when `embedder_model_class` is set to `decoder-only-icl`.
+- **`trust_remote_code`**: Set to `True` to trust remote code execution.
+- **`cache_dir`**: Cache directory for models.
+- **`embedder_batch_size`**: Batch size for embedding.
+- **`embedder_query_max_length`**, **`embedder_passage_max_length`**: Maximum length for embedding queries and passages.
 
 ### Teacher Scores
diff --git a/scripts/README.md b/scripts/README.md
index 2150a32b..e8ee7465 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -29,7 +29,7 @@ python hn_mine.py \
 - **`output_file`**: path to save the JSON data with mined hard negatives for finetuning
 - **`negative_number`**: the number of sampled negatives
 - **`range_for_sampling`**: where to sample negatives. For example, `2-200` means sampling `negative_number` negatives from the top2-top200 documents. **You can set a larger value to reduce the difficulty of the negatives (e.g., set it to `60-300` to sample negatives from the top60-top300 passages)**
-- **`candidate_pool`**: The pool to retrieval. The default value is None, and this script will retrieve from the combination of all `neg` in `input_file`. The format of this file is the same as [pretrain data](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/pretrain#2-data-format). If input a candidate_pool, this script will retrieve negatives from this file.
+- **`candidate_pool`**: The pool to retrieve from. The default value is None, and this script will retrieve from the combination of all `neg` in `input_file`. If provided, it should be a jsonl file where each line is a dict with a key `text`. If a candidate_pool is given, this script will retrieve negatives from this file.
 - **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
 - **`search_batch_size`**: batch size for searching. Default is 64.
 - **`embedder_name_or_path`**: The name or path to the embedder.
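For reference, the `candidate_pool` format both READMEs now describe is plain jsonl with one `text` field per line; a hypothetical three-document pool could be written like this:

```python
import json

# Hypothetical candidate_pool.jsonl: one dict with a "text" key per line,
# matching the format the updated READMEs describe for --candidate_pool.
docs = [
    {"text": "The capital of France is Paris."},
    {"text": "FAISS enables efficient similarity search over dense vectors."},
    {"text": "BGE models produce dense embeddings for retrieval."},
]
with open("candidate_pool.jsonl", "w") as f:
    for doc in docs:
        f.write(json.dumps(doc, ensure_ascii=False) + "\n")
```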