diff --git a/.cspell.json b/.cspell.json index bd21979650c..c3f3d9b7ba6 100644 --- a/.cspell.json +++ b/.cspell.json @@ -106,6 +106,7 @@ "meid", "Entra", "uvicorn", + "attribited", "MistralAI" ], "ignoreWords": [ @@ -208,6 +209,8 @@ "wscript", "raisvc", "evals", + "setenv", + "pypdf", "redoc", "starlette", "mlindex", diff --git a/.github/workflows/samples_flows_evaluation_eval_multi_turn_metrics.yml b/.github/workflows/samples_flows_evaluation_eval_multi_turn_metrics.yml new file mode 100644 index 00000000000..f3944040ce7 --- /dev/null +++ b/.github/workflows/samples_flows_evaluation_eval_multi_turn_metrics.yml @@ -0,0 +1,110 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: samples_flows_evaluation_eval_multi_turn_metrics +on: + schedule: + - cron: "7 21 * * *" # Every day starting at 5:7 BJT + pull_request: + branches: [ main ] + paths: [ examples/flows/evaluation/eval-multi-turn-metrics/**, examples/*requirements.txt, .github/workflows/samples_flows_evaluation_eval_multi_turn_metrics.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_flows_evaluation_eval_multi_turn_metrics: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Prepare requirements + working-directory: examples + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/evaluation/eval-multi-turn-metrics + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + if [[ -e ../.env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" ../.env.example + mv ../.env.example ../.env + fi + - name: Create run.yml + working-directory: examples/flows/evaluation/eval-multi-turn-metrics + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/evaluation/eval-multi-turn-metrics/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/eval-multi-turn-metrics/README.md -o examples/flows/evaluation/eval-multi-turn-metrics + - name: Cat script + working-directory: examples/flows/evaluation/eval-multi-turn-metrics + run: | + cat bash_script.sh + - name: Run scripts against canary workspace (scheduled runs only) + if: github.event_name == 'schedule' + working-directory: examples/flows/evaluation/eval-multi-turn-metrics + run: | + export 
aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export AZURE_OPENAI_API_KEY=${{secrets.AOAI_API_KEY_TEST }} + export AZURE_OPENAI_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_CANARY }} + bash bash_script.sh + - name: Run scripts against production workspace + if: github.event_name != 'schedule' + working-directory: examples/flows/evaluation/eval-multi-turn-metrics + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export AZURE_OPENAI_API_KEY=${{secrets.AOAI_API_KEY_TEST }} + export AZURE_OPENAI_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_PROD }} + bash bash_script.sh + - name: Pip List for Debug + if : ${{ always() }} + working-directory: examples/flows/evaluation/eval-multi-turn-metrics + run: | + pip list + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/evaluation/eval-multi-turn-metrics/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_evaluation_eval_single_turn_metrics.yml b/.github/workflows/samples_flows_evaluation_eval_single_turn_metrics.yml new file mode 100644 index 00000000000..7c31d952d0f --- /dev/null +++ b/.github/workflows/samples_flows_evaluation_eval_single_turn_metrics.yml @@ -0,0 +1,110 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: samples_flows_evaluation_eval_single_turn_metrics +on: + schedule: + - cron: "32 19 * * *" # Every day starting at 3:32 BJT + pull_request: + branches: [ main ] + paths: [ examples/flows/evaluation/eval-single-turn-metrics/**, examples/*requirements.txt, .github/workflows/samples_flows_evaluation_eval_single_turn_metrics.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_flows_evaluation_eval_single_turn_metrics: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Prepare requirements + working-directory: examples + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/evaluation/eval-single-turn-metrics + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + if [[ -e ../.env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" ../.env.example + mv ../.env.example ../.env + fi + - name: Create run.yml + working-directory: examples/flows/evaluation/eval-single-turn-metrics + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/evaluation/eval-single-turn-metrics/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/eval-single-turn-metrics/README.md -o examples/flows/evaluation/eval-single-turn-metrics + - name: Cat script + working-directory: examples/flows/evaluation/eval-single-turn-metrics + run: | + cat bash_script.sh + - name: Run scripts against canary workspace (scheduled runs only) + if: github.event_name == 'schedule' + working-directory: examples/flows/evaluation/eval-single-turn-metrics + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export AZURE_OPENAI_API_KEY=${{secrets.AOAI_API_KEY_TEST }} + export AZURE_OPENAI_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_CANARY }} + bash bash_script.sh + - name: Run scripts against production workspace + if: github.event_name != 'schedule' + working-directory: examples/flows/evaluation/eval-single-turn-metrics + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export AZURE_OPENAI_API_KEY=${{secrets.AOAI_API_KEY_TEST }} + export AZURE_OPENAI_ENDPOINT=${{ 
secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_PROD }} + bash bash_script.sh + - name: Pip List for Debug + if : ${{ always() }} + working-directory: examples/flows/evaluation/eval-single-turn-metrics + run: | + pip list + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/evaluation/eval-single-turn-metrics/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_standard_question_simulation.yml b/.github/workflows/samples_flows_standard_question_simulation.yml new file mode 100644 index 00000000000..aa4217fc8e8 --- /dev/null +++ b/.github/workflows/samples_flows_standard_question_simulation.yml @@ -0,0 +1,110 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: samples_flows_standard_question_simulation +on: + schedule: + - cron: "37 19 * * *" # Every day starting at 3:37 BJT + pull_request: + branches: [ main ] + paths: [ examples/flows/standard/question-simulation/**, examples/*requirements.txt, .github/workflows/samples_flows_standard_question_simulation.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_flows_standard_question_simulation: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Prepare requirements + working-directory: examples + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/standard/question-simulation + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + if [[ -e ../.env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" ../.env.example + mv ../.env.example ../.env + fi + - name: Create run.yml + working-directory: examples/flows/standard/question-simulation + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/standard/question-simulation/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/question-simulation/README.md -o examples/flows/standard/question-simulation + - name: Cat script + working-directory: examples/flows/standard/question-simulation + run: | + cat 
bash_script.sh + - name: Run scripts against canary workspace (scheduled runs only) + if: github.event_name == 'schedule' + working-directory: examples/flows/standard/question-simulation + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export AZURE_OPENAI_API_KEY=${{secrets.AOAI_API_KEY_TEST }} + export AZURE_OPENAI_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_CANARY }} + bash bash_script.sh + - name: Run scripts against production workspace + if: github.event_name != 'schedule' + working-directory: examples/flows/standard/question-simulation + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export AZURE_OPENAI_API_KEY=${{secrets.AOAI_API_KEY_TEST }} + export AZURE_OPENAI_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_PROD }} + bash bash_script.sh + - name: Pip List for Debug + if : ${{ always() }} + working-directory: examples/flows/standard/question-simulation + run: | + pip list + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/standard/question-simulation/bash_script.sh \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5c1a9b0ebcf..52bd8661253 100644 --- a/.gitignore +++ b/.gitignore @@ -173,6 +173,8 @@ connection.json .azureml # dummy custom tool package example hello-world-proj/** +# gen test data config +examples/gen_test_data/config.yml # secrets **/connections.json diff --git a/docs/cloud/azureai/generate-test-data-cloud.md b/docs/cloud/azureai/generate-test-data-cloud.md new file mode 100644 index 00000000000..10573dcf1ea --- /dev/null +++ b/docs/cloud/azureai/generate-test-data-cloud.md @@ -0,0 +1,36 @@ +# How to generate test data in cloud based on documents +This guide will help you learn how to generate test data on Azure AI, so that you can integrate the created flow and process a large amount of data. + + +## Prerequisites + +1. Go through [local test data generation guide](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/docs/how-to-guides/generate-test-data.md) and prepare your [test data generation flow](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/examples/gen_test_data/example_flow/). +2. Go to the [example_gen_test_data](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/examples/gen_test_data) folder and run command `pip install -r requirements_cloud.txt` to prepare local environment. +3. Prepare cloud environment. + - Navigate to file [conda.yml](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/examples/gen_test_data/conda.yml). + - For specific document file types, you may need to install extra packages: + - .docx - `pip install docx2txt` + - .pdf - `pip install pypdf` + - .ipynb - `pip install nbconvert` + > !Note: We use llama index `SimpleDirectoryReader` to load documents. 
For the latest information on required packages, please check [here](https://docs.llamaindex.ai/en/stable/examples/data_connectors/simple_directory_reader.html). + +4. Prepare Azure AI resources in the cloud. + - An Azure AI ML workspace - [Create workspace resources you need to get started with Azure AI](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2). + - A compute target - [Learn more about compute clusters](https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-target?view=azureml-api-2). +5. [Create cloud AzureOpenAI or OpenAI connection](https://microsoft.github.io/promptflow/cloud/azureai/run-promptflow-in-azure-ai.html#create-necessary-connections) + +6. Prepare the test data generation settings. + - Navigate to the [example_gen_test_data](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/examples/gen_test_data) folder. + - Prepare `config.yml` by copying [`config.yml.example`](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/examples/gen_test_data/config.yml.example). + - Fill in the configurations in `config.yml` by following the inline comment instructions. + + +## Generate test data in the cloud +To handle larger test data sets, you can leverage the PRS component to run the flow in the cloud. +- Navigate to the [example_gen_test_data](https://github.com/microsoft/promptflow/blob/53a685dbff920e891ef61cacb5f2f19e761ee809/examples/gen_test_data) folder. +- After configuration, run the following command to generate the test data set: + ```bash + python -m gen_test_data.run --cloud + ``` + +- The generated test data will be a data asset, which can be found in the output of the last node. You can register this data asset for future use.
diff --git a/docs/cloud/index.md b/docs/cloud/index.md index eb52309135a..f6d53d9e6e1 100644 --- a/docs/cloud/index.md +++ b/docs/cloud/index.md @@ -34,6 +34,7 @@ azureai/manage-flows azureai/run-promptflow-in-azure-ai azureai/create-run-with-automatic-runtime azureai/use-flow-in-azure-ml-pipeline +azureai/generate-test-data-cloud.md ``` ```{toctree}
diff --git a/docs/how-to-guides/generate-test-data.md b/docs/how-to-guides/generate-test-data.md new file mode 100644 index 00000000000..ee406d6d63b --- /dev/null +++ b/docs/how-to-guides/generate-test-data.md @@ -0,0 +1,83 @@ +# How to generate test data based on documents +In this doc, you will learn how to generate test data based on your documents for a RAG app. +This approach helps you avoid the effort of manual data creation, which is typically time-consuming and labor-intensive, as well as the expense of purchasing pre-packaged test data. +By leveraging the capabilities of LLMs, this guide streamlines the test data generation process, making it more efficient and cost-effective. + + +## Prerequisites + +1. Prepare documents. The test data generator supports the following file types: + - .md - Markdown + - .docx - Microsoft Word + - .pdf - Portable Document Format + - .ipynb - Jupyter Notebook + - .txt - Text + + **Limitations:** + + - The test data generator may not function effectively for non-Latin characters, such as Chinese, in certain document types. This limitation is caused by the capabilities of the underlying text loaders, such as `pypdf`. + - The test data generator may not generate meaningful questions if the document is not well-organized or contains massive code snippets/links, such as API introduction documents or reference documents. + +2. Prepare the local environment. Go to the [example_gen_test_data](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/) folder and install the required packages. + + ```bash + pip install -r requirements.txt + ``` + + For specific document file types, you may need to install extra packages: + - .docx - `pip install docx2txt` + - .pdf - `pip install pypdf` + - .ipynb - `pip install nbconvert` + > !Note: the example uses the llama index `SimpleDirectoryReader` to load documents. For the latest information on the packages required for each file type, please check [here](https://docs.llamaindex.ai/en/stable/examples/data_connectors/simple_directory_reader.html). + +3. Install the VSCode extension `Prompt flow`. + +4. Create your AzureOpenAI or OpenAI connection by following [this doc](manage-connections.md#create-a-connection). + +5. Prepare the test data generation settings. + - Navigate to the [example_gen_test_data](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/) folder. + - Prepare `config.yml` by copying [`config.yml.example`](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/config.yml.example). + - Fill in the configurations in `config.yml` by following the inline comment instructions. The config is made up of 3 sections: + - Common section: this section provides common values for all other sections. Required. + - Local section: this section is for local test data generation configuration. You can skip it if you do not run locally. + - Cloud section: this section is for cloud test data generation configuration. You can skip it if you do not run in the cloud. + + > !Note: We recommend using `gpt-4` series models rather than `gpt-3.5` for better performance. + + > !Note: We recommend using the `gpt-4` model (Azure OpenAI `gpt-4` model with version `0613`) rather than the `gpt-4-turbo` model (Azure OpenAI `gpt-4` model with version `1106`) for better performance. Due to the inferior performance of the `gpt-4-turbo` model, when you use it, you might sometimes need to open the [example test data generation flow](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/example_flow/flow.dag.yaml) in the visual editor and set the `response_format` input of the nodes `validate_text_chunk`, `validate_question`, and `validate_suggested_answer` to `json`, in order to make sure the LLM can generate a valid json response. + + +## Generate test data +- Navigate to the [example_gen_test_data](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/) folder. + +- After configuration, run the following command to generate the test data set: + ```bash + python -m gen_test_data.run + ``` + +- The generated test data will be a jsonl file. See the log line "Saved ... valid test data to ..." printed in the console to locate it. + +If you expect to generate a large amount of test data beyond your local compute capability, you may try generating test data in the cloud; please see this [guide](../cloud/azureai/generate-test-data-cloud.md) for more detailed steps. + +## [*Optional*] Customize test data generation flow + +- Open the [example test data generation flow](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/) in the "Prompt flow" VSCode Extension. This flow is designed to generate a pair of question and suggested answer based on the given text chunk. The flow also includes validation prompts to ensure the quality of the generated test data. + +- Customize your test data generation logic by referring to [tune-prompts-with-variants](../how-to-guides/tune-prompts-with-variants.md). + + **Understanding the prompts** + + The test data generation flow contains 5 prompts, classified into two categories based on their roles: generation prompts and validation prompts. Generation prompts are used to create questions, suggested answers, etc., while validation prompts are used to verify the validity of the text chunk, generated question or answer. + - Generation prompts + - [*generate question prompt*](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/example_flow/generate_question_prompt.jinja2): frame a question based on the given text chunk. + - [*generate suggested answer prompt*](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/example_flow/generate_suggested_answer_prompt.jinja2): generate a suggested answer for the question based on the given text chunk. + - Validation prompts + - [*score text chunk prompt*](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/example_flow/score_text_chunk_prompt.jinja2): score 0-10 to validate if the given text chunk is worthy of framing a question. If the score is lower than `score_threshold` (a node input that is adjustable), validation fails. + - [*validate question prompt*](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/example_flow/validate_question_prompt.jinja2): validate if the generated question is good. + - [*validate suggested answer*](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/gen_test_data/example_flow/validate_suggested_answer_prompt.jinja2): validate if the generated suggested answer is good. + + If validation fails, the corresponding `question`/`suggested_answer` is left as an empty string and removed from the final output test data set. + +- Fill in the node inputs, including `connection`, `model` or `deployment_name`, `response_format`, `score_threshold` and other parameters. Click the run button to test the flow in the VSCode Extension by referring to [Test flow with VS Code Extension](../how-to-guides/develop-a-dag-flow/init-and-test-a-flow.md#visual-editor-on-the-vs-code-for-prompt-flow). + +Once the customized flow has been verified, you can proceed to batch generate test data by following the steps outlined in ["Prerequisites"](#prerequisites) and ["Generate test data"](#generate-test-data). \ No newline at end of file
diff --git a/docs/how-to-guides/index.md b/docs/how-to-guides/index.md index 3eebb29671f..cb10915c59b 100644 --- a/docs/how-to-guides/index.md +++ b/docs/how-to-guides/index.md @@ -22,6 +22,7 @@ develop-a-dag-flow/index execute-flow-as-a-function chat-with-a-flow/index run-and-evaluate-a-flow/index +generate-test-data ``` ```{toctree}
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 8e053fb2010..b3c274679ac 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -11,7 +11,7 @@ This section contains a collection of flow samples and step-by-step tutorials. 
|CLI|[Working with connection](https://github.com/microsoft/promptflow/blob/main/examples/connections/README.md)| Manage various types of connections using cli |SDK|[Run prompt flow in Azure AI](https://github.com/microsoft/promptflow/blob/main/examples/tutorials/get-started/quickstart-azure.ipynb)| A quick start tutorial to run a flow in Azure AI and evaluate it. |SDK|[Flow run management in Azure AI](https://github.com/microsoft/promptflow/blob/main/examples/tutorials/run-management/cloud-run-management.ipynb)| Flow run management in azure AI - +|AZURE|[Develop promptflow copilot](https://github.com/microsoft/promptflow/blob/ce376aa6473774f93918e0a638c77c1151c82e29/examples/tutorials/develop-promptflow-copilot/develop-promptflow-copilot.md)| A step by step guidance to develop a promptflow copilot. ## Samples @@ -23,8 +23,8 @@ This section contains a collection of flow samples and step-by-step tutorials. |Standard Flow|[customer-intent-extraction](https://github.com/microsoft/promptflow/tree/main/examples/flows/standard/customer-intent-extraction)| a flow created from existing langchain python code |Standard Flow|[web-classification](https://github.com/microsoft/promptflow/tree/main/examples/flows/standard/web-classification)| a flow demonstrating multi-class classification with LLM. Given an url, it will classify the url into one web category with just a few shots, simple summarization and classification prompts. |Standard Flow|[autonomous-agent](https://github.com/microsoft/promptflow/tree/main/examples/flows/standard/autonomous-agent)| a flow showcasing how to construct a AutoGPT flow to autonomously figures out how to apply the given functions to solve the goal, which is film trivia that provides accurate and up-to-date information about movies, directors, actors, and more. -|Chat Flow|[chat-with-wikipedia](https://github.com/microsoft/promptflow/tree/main/examples/flows/chat/chat-with-wikipedia)| a flow demonstrating Q&A with GPT3.5 using information from Wikipedia to make the answer more grounded. -|Chat Flow|[chat-with-pdf](https://github.com/microsoft/promptflow/tree/main/examples/flows/chat/chat-with-pdf)| a flow that allow you to ask questions about the content of a PDF file and get answers. +|Chat Flow|[chat-with-wikipedia](https://github.com/microsoft/promptflow/tree/main/examples/flows/chat/chat-with-wikipedia)| a flow demonstrating Q&A with GPT3.5 using information from Wikipedia to make the answer more grounded. +|Chat Flow|[chat-with-pdf](https://github.com/microsoft/promptflow/tree/main/examples/flows/chat/chat-with-pdf)| a flow that allow you to ask questions about the content of a PDF file and get answers. |Evaluation Flow|[eval-classification-accuracy](https://github.com/microsoft/promptflow/tree/main/examples/flows/evaluation/eval-classification-accuracy)| a flow illustrating how to evaluate the performance of a classification system. 
Learn more: [Try out more promptflow examples.](https://github.com/microsoft/promptflow/tree/main/examples) diff --git a/examples/README.md b/examples/README.md index 57c730332ee..ef34373fca9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -78,6 +78,7 @@ | [gen-docstring](flows/standard/gen-docstring/README.md) | [![samples_flows_standard_gen_docstring](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_gen_docstring.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_gen_docstring.yml) | This example can help you automatically generate Python code's docstring and return the modified code | | [maths-to-code](flows/standard/maths-to-code/README.md) | [![samples_flows_standard_maths_to_code](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_maths_to_code.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_maths_to_code.yml) | Math to Code is a project that utilizes the power of the chatGPT model to generate code that models math questions and then executes the generated code to obtain the final numerical answer | | [named-entity-recognition](flows/standard/named-entity-recognition/README.md) | [![samples_flows_standard_named_entity_recognition](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_named_entity_recognition.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_named_entity_recognition.yml) | A flow that perform named entity recognition task | +| [question-simulation](flows/standard/question-simulation/README.md) | [![samples_flows_standard_question_simulation](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_question_simulation.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_question_simulation.yml) | This question simulation flow is used to generate suggestions for the next question based on the previous chat history | | [web-classification](flows/standard/web-classification/README.md) | [![samples_flows_standard_web_classification](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_web_classification.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_web_classification.yml) | This is a flow demonstrating multi-class classification with LLM | @@ -90,9 +91,11 @@ | [eval-classification-accuracy](flows/evaluation/eval-classification-accuracy/README.md) | [![samples_flows_evaluation_eval_classification_accuracy](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_classification_accuracy.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_classification_accuracy.yml) | This is a flow illustrating how to evaluate the performance of a classification system | | [eval-entity-match-rate](flows/evaluation/eval-entity-match-rate/README.md) | [![samples_flows_evaluation_eval_entity_match_rate](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_entity_match_rate.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_entity_match_rate.yml) | This is a flow evaluates: entity match rate | | [eval-groundedness](flows/evaluation/eval-groundedness/README.md) | 
[![samples_flows_evaluation_eval_groundedness](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_groundedness.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_groundedness.yml) | This is a flow leverage llm to eval groundedness: whether answer is stating facts that are all present in the given context | +| [eval-multi-turn-metrics](flows/evaluation/eval-multi-turn-metrics/README.md) | [![samples_flows_evaluation_eval_multi_turn_metrics](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_multi_turn_metrics.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_multi_turn_metrics.yml) | This evaluation flow will evaluate a conversation by using Large Language Models (LLM) to measure the quality of the responses | | [eval-perceived-intelligence](flows/evaluation/eval-perceived-intelligence/README.md) | [![samples_flows_evaluation_eval_perceived_intelligence](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_perceived_intelligence.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_perceived_intelligence.yml) | This is a flow leverage llm to eval perceived intelligence | | [eval-qna-non-rag](flows/evaluation/eval-qna-non-rag/README.md) | [![samples_flows_evaluation_eval_qna_non_rag](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_qna_non_rag.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_qna_non_rag.yml) | This is a flow evaluating the Q&A systems by leveraging Large Language Models (LLM) to measure the quality and safety of responses | | [eval-qna-rag-metrics](flows/evaluation/eval-qna-rag-metrics/README.md) | [![samples_flows_evaluation_eval_qna_rag_metrics](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_qna_rag_metrics.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_qna_rag_metrics.yml) | This is a flow evaluating the Q&A RAG (Retrieval Augmented Generation) systems by leveraging the state-of-the-art Large Language Models (LLM) to measure the quality and safety of responses | +| [eval-single-turn-metrics](flows/evaluation/eval-single-turn-metrics/README.md) | [![samples_flows_evaluation_eval_single_turn_metrics](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_single_turn_metrics.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_single_turn_metrics.yml) | This evaluation flow will evaluate a question and answer pair by using Large Language Models (LLM) to measure the quality of the answer | | [eval-summarization](flows/evaluation/eval-summarization/README.md) | [![samples_flows_evaluation_eval_summarization](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_summarization.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_summarization.yml) | This flow implements a reference-free automatic abstractive summarization evaluation across four dimensions: fluency, coherence, consistency, relevance | diff --git a/examples/flows/chat/promptflow-copilot/README.md 
b/examples/flows/chat/promptflow-copilot/README.md new file mode 100644 index 00000000000..fae6bfa70c6 --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/README.md @@ -0,0 +1,5 @@ +# Promptflow copilot + +A chat flow used to build copilot for promptflow. + +Please reference [this doc](../../../tutorials/develop-promptflow-copilot/develop-promptflow-copilot.md) to see how we build this flow step by step. \ No newline at end of file diff --git a/examples/flows/chat/promptflow-copilot/answer_question_prompt.jinja2 b/examples/flows/chat/promptflow-copilot/answer_question_prompt.jinja2 new file mode 100644 index 00000000000..07a75f1ae8e --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/answer_question_prompt.jinja2 @@ -0,0 +1,17 @@ +#system: +You are an AI assistant that designed to extract answer for user's questions from given context and conversation history. +Politely refuse to answer the question if the answer cannot be formed strictly using the provided context and conversation history. +Your answer should be as precise as possible, and should only come from the context. Add citation after each sentence when possible in a form "{Your answer}. [Reference](citation)". + +{{contexts}} + +chat history: +{% for item in chat_history %} +#user: +{{ item.inputs.question }} +#assistant: +{{ item.outputs.output }} +{% endfor %} + +#user: +{{question}} \ No newline at end of file diff --git a/examples/flows/chat/promptflow-copilot/answer_the_question_with_context.jinja2 b/examples/flows/chat/promptflow-copilot/answer_the_question_with_context.jinja2 new file mode 100644 index 00000000000..4412d002b90 --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/answer_the_question_with_context.jinja2 @@ -0,0 +1 @@ +{{prompt_text}} \ No newline at end of file diff --git a/examples/flows/chat/promptflow-copilot/check_query_relevance.jinja2 b/examples/flows/chat/promptflow-copilot/check_query_relevance.jinja2 new file mode 100644 index 00000000000..922cefaca3c --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/check_query_relevance.jinja2 @@ -0,0 +1,38 @@ +# system: +You are a helpful assistant that knows well about a product named promptflow. Here is instruction of the product: + +[Instruction] +Prompt flow is a suite of development tools designed to streamline the end-to-end development cycle of LLM-based AI applications, from ideation, prototyping, testing, evaluation to production deployment and monitoring. It makes prompt engineering much easier and enables you to build LLM apps with production quality. + +With prompt flow, you will be able to: + +Create and iteratively develop flow +Create executable flows that link LLMs, prompts, Python code and other tools together. +Debug and iterate your flows, especially the interaction with LLMs with ease. +Evaluate flow quality and performance +Evaluate your flow's quality and performance with larger datasets. +Integrate the testing and evaluation into your CI/CD system to ensure quality of your flow. +Streamlined development cycle for production +Deploy your flow to the serving platform you choose or integrate into your app's code base easily. +(Optional but highly recommended) Collaborate with your team by leveraging the cloud version of Prompt flow in Azure AI. + +Promptflow team provides some builtin tools including: LLM, Prompt, Python, Embedding, Azure OpenAI GPT-4 Turbo with vision, OpenAI GPT-4V, Index Lookup, OpenModel LLM, Serp API and Azure Content Safety. 
+ +You can define your flow DAG file using the YAML file format, following the pre-defined schema. +Promptflow also provides a VS Code extension and a Visual Studio extension to help developers develop in their local environment. +You can also upload your flow to the Azure cloud using the CLI by installing our Python SDK. +Promptflow also supports image inputs for flows and tools. +You can build or compile your flow as an application, deploy your flow as a managed online endpoint or an app service, or build it as a docker image. + +The key concepts in promptflow include: +flow, connection, tool, variant, variants, node, nodes, input, inputs, output, outputs, prompt, run, evaluation flow, conditional flow, activate config, deploy flow and develop flow in azure cloud. +Also included are open source, stream, streaming, function calling, response format, model, tracing, vision, bulk test, docstring, docker image, json, jsonl and python package. + +[End Instruction] + +Your job is to determine whether the user's question is related to the product, the key concepts, or information about yourself. +You do not need to give the answer to the question. Simply return a number between 0 and 10 to represent the correlation between the question and the product. +Return 0 if it is totally unrelated. Return 10 if it is highly related. +Do not return anything else except the number. +# user: +{{question}} \ No newline at end of file
diff --git a/examples/flows/chat/promptflow-copilot/check_relevance_score.py b/examples/flows/chat/promptflow-copilot/check_relevance_score.py new file mode 100644 index 00000000000..45aa8fbdb12 --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/check_relevance_score.py @@ -0,0 +1,6 @@ +from promptflow.core import tool + + +@tool +def my_python_tool(score: str) -> bool: + return score == "0"
diff --git a/examples/flows/chat/promptflow-copilot/flow.dag.yaml b/examples/flows/chat/promptflow-copilot/flow.dag.yaml new file mode 100644 index 00000000000..3d8f71a6d6b --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/flow.dag.yaml @@ -0,0 +1,129 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +inputs: + question: + type: string + is_chat_input: true + chat_history: + type: list + is_chat_input: false + is_chat_history: true +outputs: + output: + type: string + reference: ${answer_the_question_with_context.output} + is_chat_output: true +nodes: +- name: modify_query_with_history + type: llm + source: + type: code + path: modify_query_with_history.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 1 + top_p: 1 + max_tokens: 1000 + presence_penalty: 0 + frequency_penalty: 0 + chat_history: ${inputs.chat_history} + question: ${flow.question} + provider: AzureOpenAI + api: chat + module: promptflow.tools.aoai + use_variants: false +- name: lookup_question_from_indexed_docs + type: python + source: + type: package + tool: promptflow_vectordb.tool.common_index_lookup.search + inputs: + queries: ${modify_query_with_history.output} + query_type: Hybrid (vector + keyword) + top_k: 4 + activate: + when: ${check_relevance_score.output} + is: false + use_variants: false +- name: generate_prompt_context + type: python + source: + type: code + path: generate_prompt_context.py + inputs: + search_result: ${lookup_question_from_indexed_docs.output} + aggregation: false + use_variants: false +- name: answer_question_prompt + type: prompt + source: + type: code + path: answer_question_prompt.jinja2 + inputs: + chat_history: ${flow.chat_history} + contexts: 
${generate_prompt_context.output} + question: ${flow.question} + aggregation: false + use_variants: false +- name: answer_the_question_with_context + type: llm + source: + type: code + path: answer_the_question_with_context.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + max_tokens: 1000 + presence_penalty: 0 + frequency_penalty: 0 + prompt_text: ${select_prompt.output} + provider: AzureOpenAI + api: chat + module: promptflow.tools.aoai + aggregation: false + use_variants: false +- name: check_query_relevance + type: llm + source: + type: code + path: check_query_relevance.jinja2 + inputs: + deployment_name: gpt-35-turbo + temperature: 1 + top_p: 1 + max_tokens: 1 + presence_penalty: 0 + frequency_penalty: 0 + question: ${modify_query_with_history.output} + provider: AzureOpenAI + api: chat + module: promptflow.tools.aoai + use_variants: false +- name: check_relevance_score + type: python + source: + type: code + path: check_relevance_score.py + inputs: + score: ${check_query_relevance.output} + use_variants: false +- name: refuse_to_answer + type: prompt + source: + type: code + path: refuse_to_answer.jinja2 + inputs: {} + use_variants: false +- name: select_prompt + type: python + source: + type: code + path: select_prompt.py + inputs: + answer_question_prompt: ${answer_question_prompt.output} + not_relevant: ${check_relevance_score.output} + refuse_prompt: ${refuse_to_answer.output} + use_variants: false +node_variants: {} +environment: + python_requirements_txt: requirements.txt diff --git a/examples/flows/chat/promptflow-copilot/generate_prompt_context.py b/examples/flows/chat/promptflow-copilot/generate_prompt_context.py new file mode 100644 index 00000000000..5ec741a7415 --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/generate_prompt_context.py @@ -0,0 +1,41 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""File for context getting tool.""" +from typing import List +from promptflow import tool +import re + + +@tool +def generate_prompt_context(search_result: List[dict]) -> str: + """Generate the context for the prompt.""" + def format_doc(doc: dict): + """Format Doc.""" + return f"Content: {doc['Content']}\nSource: {doc['Source']}" + + SOURCE_KEY = "source" + URL_KEY = "url" + + pattern = r".+/community/" + replacement_text = "https://github.com/microsoft/promptflow/blob/main/docs/" + + retrieved_docs = [] + for item in search_result: + + metadata = item.get("metadata", None) + content = item.get("text", "") + + source = "" + if metadata is not None: + if SOURCE_KEY in metadata: + if URL_KEY in metadata[SOURCE_KEY]: + source = metadata[SOURCE_KEY][URL_KEY] or "" + + source = re.sub(pattern, replacement_text, source) + + retrieved_docs.append({ + "Content": content, + "Source": source + }) + doc_string = "\n\n".join([format_doc(doc) for doc in retrieved_docs]) + return doc_string diff --git a/examples/flows/chat/promptflow-copilot/modify_query_with_history.jinja2 b/examples/flows/chat/promptflow-copilot/modify_query_with_history.jinja2 new file mode 100644 index 00000000000..2134ed187aa --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/modify_query_with_history.jinja2 @@ -0,0 +1,17 @@ +# system: +Given the following conversation history and the users next question,rephrase the question to be a stand alone question. +If the conversation is irrelevant or empty, just restate the original question. +Do not add more details than necessary to the question. 
+conversation: + + chat history: +{% for item in chat_history %} +# user: +{{ item.inputs.question }} +# assistant: +{{ item.outputs.output }} +{% endfor %} + +# user: +Follow up Input: {{question}} +Standalone Question: \ No newline at end of file
diff --git a/examples/flows/chat/promptflow-copilot/refuse_to_answer.jinja2 b/examples/flows/chat/promptflow-copilot/refuse_to_answer.jinja2 new file mode 100644 index 00000000000..33d0a5b04a5 --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/refuse_to_answer.jinja2 @@ -0,0 +1,4 @@ +# system: +Repeat the sentence below exactly, without any other words. + +[sentence to repeat]: Unfortunately, I'm unable to address this question since it appears to be unrelated to prompt flow. Could you please either propose a different question or rephrase your inquiry to align more closely with prompt flow? \ No newline at end of file
diff --git a/examples/flows/chat/promptflow-copilot/requirements.txt b/examples/flows/chat/promptflow-copilot/requirements.txt new file mode 100644 index 00000000000..cd05c963703 --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/requirements.txt @@ -0,0 +1 @@ +promptflow-vectordb \ No newline at end of file
diff --git a/examples/flows/chat/promptflow-copilot/select_prompt.py b/examples/flows/chat/promptflow-copilot/select_prompt.py new file mode 100644 index 00000000000..8b29846adfc --- /dev/null +++ b/examples/flows/chat/promptflow-copilot/select_prompt.py @@ -0,0 +1,9 @@ +from promptflow.core import tool + + +@tool +def my_python_tool(answer_question_prompt: str, refuse_prompt: str, not_relevant: bool) -> str: + if not_relevant: + return refuse_prompt + + return answer_question_prompt
diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/README.md b/examples/flows/evaluation/eval-multi-turn-metrics/README.md new file mode 100644 index 00000000000..8631f65cf3c --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/README.md @@ -0,0 +1,58 @@ +# Evaluation multi-turn metrics + +This evaluation flow will evaluate a conversation by using Large Language Models (LLM) to measure the quality of the responses. + +## What you will learn + +This evaluation flow allows you to assess and evaluate your model with the LLM-assisted metrics: + +* __grounding__: Measures whether the answer follows logically from the information contained in the context, based on the provided answer and context. grounding is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. + +* __answer_relevance__: Measures whether the answer is relevant to the question, based on the provided question, context and answer. answer_relevance is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. + +* __conversation_quality__: Measures the answer quality for each of the following factors, based on the provided question and answer: + - Accuracy and relevance: How well does the bot provide correct and reliable information or advice that matches the user's intent and expectations, and use credible and up-to-date sources or references to support its claims? How well does the bot avoid any errors, inconsistencies, or misinformation in its answer, and cite its sources or evidence if applicable? 
+ - Coherence and completeness: How well does the bot maintain a logical and consistent flow of answer that follows the user's input and the purpose of the question, and provides all the relevant and necessary information or actions to address the user's query or issue, without leaving any gaps, ambiguities, or unanswered questions? + - Engagement and tone: How well does the bot capture and maintain the user's interest and attention, and motivate them to continue the conversation or explore the topic further, using natural and conversational language, personality, and emotion? how well does the bot's tone match or adapt to the user's tone and mood? Does the bot avoid being rude, sarcastic, condescending, or too formal or informal, and convey respect, empathy, and politeness? + - Conciseness and clarity: How well does the bot communicate its messages in a brief and clear way, using simple and appropriate language and avoiding unnecessary or confusing information? How easy is it for the user to understand and follow the bot responses, and how well do they match the user's needs and expectations? + - Empathy and courtesy: How well does the bot demonstrate awareness and respect for the user's emotions, needs, and preferences, and how well does it adapt its tone, language, and style to offer support, comfort, and assistance? Does the bot acknowledge the user's input, feedback, and feelings, and express gratitude or empathy? Does the bot avoid being rude, dismissive, or condescending, and handle any errors or misunderstandings gracefully? + - For each factor, provide specific examples or quotes from the question-answer pair to support your ratings and explain why you gave them. + - Give an score value which is calculated by ( 0.3 * "accuracy and relevance" + 0.2 * "coherence and completeness" + 0.25 * "engagement and tone" + 0.15 * "conciseness and clarity" + 0.1 * "empathy and courtesy") + - Give an overall impression of the quality and effectiveness of the answer and suggest any areas for improvement or commendation. Write it in "Overall". + + conversation_quality is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. + +* __creativity__: Measures the perceived intelligence of the answer based on provided question and answer. + - Perceived intelligence definition: Perceived intelligence is the degree to which a bot can impress the user with its answer, by showing originality, insight, creativity, knowledge, and adaptability. An intelligent bot can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart and friendly. An intelligent bot can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. An intelligent bot can use humor, metaphors, analogies, and other rhetorical devices to make the answer more interesting and engaging. An intelligent bot can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and use hypotheticals, conditionals, and counterfactuals to explore what if, how, and why questions. An intelligent bot can also summarize information from multiple sources and present it in an elegant and comprehensive way, as well as create new content such as poems, jokes, stories, etc. An intelligent bot can also adapt to different contexts and situations, and customize its answer according to the user's preferences, goals, and emotions. 
Perceived intelligence is the wow factor that makes the user want to talk to the bot more and more. + Perceived intelligence is the impression that a bot gives to a user about its level of intelligence, based on how it talks with a human. Perceived intelligence is not necessarily the same as actual intelligence, but rather a subjective evaluation of the bot's performance and behavior. Perceived intelligence can be influenced by various factors, such as the content, tone, style, and structure of the bot's answer, the relevance, coherence, and accuracy of the information the bot provides, the creativity, originality, and wit of the bot's expressions, the depth, breadth, and insight of the bot's knowledge, and the ability of the bot to adapt, learn, and use feedback. + Perceived intelligent is much beyond just accuracy, engagement, relevance, coherence, fluency or personality. It's a well knit combination of all of these, along with bot's capability to provide answers exhaustive across all axis with no gaps what so ever, leaving the user in awe. + A bot with high perceived intelligence can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart, knowledgeable, creative, and friendly. A bot with high perceived intelligence can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. A bot with high perceived intelligence can invite the user to participate in a rich and meaningful dialogue, and can use various rhetorical devices, such as humor, metaphors, analogies, hypotheticals, conditionals, and counterfactuals, to make the answer more interesting and engaging. A bot with high perceived intelligence can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and can use them to explore what if, how, and why questions. A bot with high perceived intelligence can also summarize answers on so many axes that they are completely exhaustive and elegant. + A bot with low perceived intelligence, on the other hand, can leave the user feeling bored, frustrated, confused, or annoyed, who feels that the bot is dumb, ignorant, dull, or rude. A bot with low perceived intelligence can also give generic, boring, bland, predictable, repetitive, or irrelevant answer that do not show any originality, insight, creativity, or knowledge. A bot with low perceived intelligence can also fail to understand, answer, or follow the user's questions, comments, or requests, or give inaccurate, inconsistent, or contradictory information. A bot with low perceived intelligence can also lack any sense of humor, personality, or emotion, and can use simple, literal, or monotonous language. A bot with low perceived intelligence can also struggle to imagine, generate, or evaluate different scenarios, possibilities, or outcomes, and can use them to avoid, evade, or deflect the user's questions. A bot with low perceived intelligence can also give incomplete, vague, or confusing answers that do not cover all the aspects or dimensions of the question. + + creativity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. + +## Prerequisites + +- Connection: Azure OpenAI or OpenAI connection. + > !Note: Recommend to use `gpt-4` series models than the `gpt-3.5` for better performance. 
+ > !Note: Recommend to use `gpt-4` model (Azure OpenAI `gpt-4` model with version `0613` or later) than `gpt-4-turbo` model (Azure OpenAI `gpt-4` model with version `1106` or later) for better performance. Due to inferior performance of `gpt-4-turbo` model, when you use it, sometimes you might need to set the `response_format`to {"type":"json_object"} for these nodes: conversation_quality, creativity, answer_relevance, in order to make sure the llm can generate valid json response. + +## Tools used in this flow +- LLM tool +- Python tool +- Prompt tool + + +## 0. Setup connection +Prepare your Azure Open AI resource follow this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one. + +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../../connections/azure_openai.yml --set api_key= api_base= +``` + +## 1. Test flow/node +```bash +# test with default input value in flow.dag.yaml +pf flow test --flow . +``` \ No newline at end of file diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/aggregate_results.py b/examples/flows/evaluation/eval-multi-turn-metrics/aggregate_results.py new file mode 100644 index 00000000000..230e09eeed9 --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/aggregate_results.py @@ -0,0 +1,25 @@ +from typing import List +from promptflow.core import tool, log_metric +import numpy as np + + +@tool +def aggregate_variants_results(results: List[dict], metrics: str): + aggregate_results = {} + for result in results: + for name, value in result.items(): + if name in metrics[0]: + if name not in aggregate_results.keys(): + aggregate_results[name] = [] + try: + float_val = float(value) + except Exception: + float_val = np.nan + aggregate_results[name].append(float_val) + + for name, value in aggregate_results.items(): + if name in metrics[0]: + aggregate_results[name] = np.nanmean(value) + aggregate_results[name] = round(aggregate_results[name], 2) + log_metric(name, aggregate_results[name]) + return aggregate_results diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/answer_relevance.jinja2 b/examples/flows/evaluation/eval-multi-turn-metrics/answer_relevance.jinja2 new file mode 100644 index 00000000000..1ed28889d74 --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/answer_relevance.jinja2 @@ -0,0 +1,28 @@ +# System: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the relevance of bot responses in a conversation to user questions. Your job is to compute an accurate evaluation score using the provided evaluation metric. + +Relevance measures how well the bot responses addresses the main aspects of the user questions. Consider whether all and only the important aspects are contained in the bot responses when evaluating relevance, score the relevance of the bot responses on a scale of 1 (completely lacks relevance) to 5 (perfect relevance) + +- If the bot responses are not directly related to the user's query or issue, or if the user's intent or expectations are unclear or ambiguous, explain how this affects the relevance quality of the conversation and the rating a score for it, and provide some suggestions for how the bot could handle it better. + +# Tips: +- You should read user's question more carefully and try to understand what they are looking for and why. 
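For reference, here is a small standalone sketch (sample data, not part of the flow) of what the `aggregate_variants_results` tool above does. In an aggregation node, promptflow passes the `metrics` flow input as a list with one entry per line, which is why the tool checks membership against `metrics[0]`; non-numeric scores become NaN and are ignored by `nanmean`.

```python
import numpy as np

# Per-line outputs of concat_scores (sample values) and the metrics flow input
# as an aggregation node receives it: a list with one string per input line.
results = [
    {"creativity": "4", "grounding": 5, "answer_relevance": "N/A"},
    {"creativity": "5", "grounding": 4, "answer_relevance": "3"},
]
metrics = ["creativity,conversation_quality,answer_relevance,grounding"]

collected = {}
for line_result in results:
    for name, value in line_result.items():
        if name in metrics[0]:  # keep only the metrics the user selected
            try:
                collected.setdefault(name, []).append(float(value))
            except (TypeError, ValueError):
                collected[name].append(np.nan)  # "N/A" and similar are skipped by nanmean

aggregated = {name: round(float(np.nanmean(values)), 2) for name, values in collected.items()}
print(aggregated)  # {'creativity': 4.5, 'grounding': 4.5, 'answer_relevance': 3.0}
```

Non-numeric scores (for example, when an LLM node was skipped by its `activate` condition and produced no output) simply drop out of the average.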
+- You should compare the bot's response to the user's question and see if it matches the criteria of relevance. +- You should score and provide feedback for the whole conversation as a whole, not for each bot response individually. However, you can mention if some responses are better or worse than others, and why. +- You should try to be objective and constructive in your evaluation, and use evidence and examples from the transcript to support your ratings and feedback. Avoid bias, subjectivity, or personal preferences, and use respectful and polite language. +- The output should be in json format to include score and overall keys. + +# Example: +Example Conversation: +{ + "conversation": "User: Why is the sky blue?\nBot: The sky is blue because of the way the atmosphere scatters light.\nUser: How does it scatter light?\nBot: The atmosphere is made up of tiny particles, such as water vapor and dust. When light hits these particles, it is scattered in all directions.\nUser: Why does the sky turn red at sunset?\nBot: During the day, the sky is blue because blue light is scattered more than other colors. At sunset, the light travels a longer distance through the atmosphere, so more of the blue light is scattered out, leaving the red and yellow light." +} +Example Output: +{ + "score": 5 + "overall": "The bot's responses are highly relevant to the user's questions.The user asked about why the sky is blue, how light is scattered, and why the sky turns red at sunset. The bot provided clear, concise, and accurate answers to all these questions.Therefore, I would rate the relevance of the bot's responses as 5 out of 5. \n\nThe bot did an excellent job in this conversation. However, to further improve, the bot could provide even more detailed explanations or use simpler language if it detects that the user is a child or someone with no background in the topic." +} + +Conversation: +{{conversation}} +Output: diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/concat_scores.py b/examples/flows/evaluation/eval-multi-turn-metrics/concat_scores.py new file mode 100644 index 00000000000..2f12e5e7456 --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/concat_scores.py @@ -0,0 +1,33 @@ +from promptflow.core import tool +import json + + +def get_score(result): + try: + if result is not None: + # Parse the JSON string + result_dict = json.loads(result) + + # Extract the score value + score = result_dict.get('score', None) + print("result: ") + print(score) + return score + else: + return None + except json.JSONDecodeError: + print("Invalid JSON string.") + return None + + +@tool +def concat_results(answer_relevance: str = None, + conversation_quality: str = None, + creativity: str = None, + grounding: str = None): + results = {'answer_relevance': get_score(answer_relevance), + 'conversation_quality': get_score(conversation_quality), + 'creativity': get_score(creativity), + 'grounding': grounding} + + return results diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/conversation_quality_prompt.jinja2 b/examples/flows/evaluation/eval-multi-turn-metrics/conversation_quality_prompt.jinja2 new file mode 100644 index 00000000000..43e9ee426ff --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/conversation_quality_prompt.jinja2 @@ -0,0 +1,40 @@ +# System: +- You are an AI assistant. You will be given a transcript of dialogue between a user and a bot. 
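The `concat_scores.py` tool shown above expects each LLM node to return a JSON object with a `score` key. A quick standalone illustration (hypothetical model output) of how that parsing behaves, including the fallback when the model returns malformed JSON:

```python
import json

def get_score(result):
    # Same idea as concat_scores.get_score: parse the JSON string and pull out "score".
    if result is None:
        return None
    try:
        return json.loads(result).get("score")
    except json.JSONDecodeError:
        return None  # malformed LLM output degrades to None instead of failing the run

print(get_score('{"score": 5, "overall": "Highly relevant answers."}'))  # 5
print(get_score("Sorry, I cannot answer that."))                         # None
```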
You need to read the transcript carefully and identify the main topic, question, or issue of the conversation, as well as the purpose and expectations of the interaction. +- You need to rate all the bot responses together on a scale of 1 (poor) to 5 (excellent) for each of the following factors, and provide some feedback for improvement. + - Accuracy and relevance: How well does the bot provide correct and reliable information or advice that matches the user's intent and expectations, and uses credible and up-to-date sources or references to support its claims? How well does the bot avoid any errors, inconsistencies, or misinformation in its responses, and cite its sources or evidence if applicable? + - Coherence and completeness: How well does the bot maintain a logical and consistent flow of conversation that follows the user's input and the purpose of the dialogue, and provides all the relevant and necessary information or actions to address the user's query or issue, without leaving any gaps, ambiguities, or unanswered questions? + - Engagement and tone: How well does the bot capture and maintain the user's interest and attention, and motivate them to continue the conversation or explore the topic further, using natural and conversational language, personality, and emotion? How appropriate and consistent is the bot's tone for the context, purpose, and audience of the conversation, and how well does it match or adapt to the user's tone and mood? Does the bot avoid being rude, sarcastic, condescending, or too formal or informal, and convey respect, empathy, and politeness? + - Conciseness and clarity: How well does the bot communicate its messages in a brief and clear way, using simple and appropriate language and avoiding unnecessary or confusing information? How easy is it for the user to understand and follow the bot responses, and how well do they match the user's needs and expectations? + - Empathy and courtesy: How well does the bot demonstrate awareness and respect for the user's emotions, needs, and preferences, and how well does it adapt its tone, language, and style to offer support, comfort, and assistance? Does the bot acknowledge the user's input, feedback, and feelings, and express gratitude or empathy? Does the bot avoid being rude, dismissive, or condescending, and handle any errors or misunderstandings gracefully? + - For each factor, provide specific examples or quotes from the transcript to support your ratings and explain why you gave them. +- Give an score value which is calculated by ( 0.3 * "accuracy and relevance" + 0.2 * "coherence and completeness" + 0.25 * "engagement and tone" + 0.15 * "conciseness and clarity" + 0.1 * "empathy and courtesy") +- Give an overall impression of the quality and effectiveness of the responses and suggest any areas for improvement or commendation. Write it in "Overall". + +- If the bot responses are not directly related to the user's query or issue, or if the user's intent or expectations are unclear or ambiguous, explain how this affects the quality of the conversation and the ratings for each factor, and provide some suggestions for how the bot could handle these situations better. + +# Tips: +- You can write your feedback as bullet points, sentences, or paragraphs, but make sure they are organized and easy to read. +- You should rate and provide feedback for the whole conversation as a whole, not for each bot response individually. However, you can mention if some responses are better or worse than others, and why. 
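The composite `score` defined in the system prompt above is a plain weighted average. A minimal standalone sketch (not part of the flow) that reproduces the arithmetic, using the factor ratings from the example output shown just below:

```python
# Weights for the composite conversation_quality score, as stated in the prompt above.
weights = {
    "accuracy and relevance": 0.3,
    "coherence and completeness": 0.2,
    "engagement and tone": 0.25,
    "conciseness and clarity": 0.15,
    "empathy and courtesy": 0.1,
}
ratings = {  # factor ratings taken from the example output below
    "accuracy and relevance": 5,
    "coherence and completeness": 4,
    "engagement and tone": 3.5,
    "conciseness and clarity": 3,
    "empathy and courtesy": 3,
}
score = sum(weights[k] * ratings[k] for k in weights)
print(round(score, 3))  # 3.925
```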
+- You should try to be objective and constructive in your evaluation, and use evidence and examples from the transcript to support your ratings and feedback. Avoid bias, subjectivity, or personal preferences, and use respectful and polite language.
+- The output should be in json format.
+
+
+# Example:
+Example Conversation:
+{
+  "conversation": "User: Why is the sky blue?\nBot: The sky is blue because of the way the atmosphere scatters light.\nUser: How does it scatter light?\nBot: The atmosphere is made up of tiny particles, such as water vapor and dust. When light hits these particles, it is scattered in all directions.\nUser: Why does the sky turn red at sunset?\nBot: During the day, the sky is blue because blue light is scattered more than other colors. At sunset, the light travels a longer distance through the atmosphere, so more of the blue light is scattered out, leaving the red and yellow light."
+}
+Example Output:
+{
+  "accuracy and relevance": 5,
+  "coherence and completeness": 4,
+  "engagement and tone": 3.5,
+  "conciseness and clarity": 3,
+  "empathy and courtesy": 3,
+  "score": 3.925,
+  "overall": "The bot responses are clear and concise, but they do not provide any relevant or helpful information to answer the user's question about the sky. The bot could have explained the science behind why the sky is blue and why it turns red at sunset, and provided some references or sources to support its claims. The bot could also have asked the user to clarify their question, or asked some follow-up questions to better understand the user's intent and expectations."
+}
+
+Conversation:
+{{conversation}}
+Output:
diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/convert_chat_history_to_conversation.py b/examples/flows/evaluation/eval-multi-turn-metrics/convert_chat_history_to_conversation.py
new file mode 100644
index 00000000000..b4ca839a1a8
--- /dev/null
+++ b/examples/flows/evaluation/eval-multi-turn-metrics/convert_chat_history_to_conversation.py
@@ -0,0 +1,10 @@
+from promptflow.core import tool
+
+
+@tool
+def convert_chat_history_to_conversation(chat_history: list) -> dict:
+    conversation = ""
+    for i in chat_history:
+        conversation += f"User: {i['inputs']['question']}\nBot: {i['outputs']['answer']}\n"
+    conversation_format = {"conversation": conversation}
+    return conversation_format
diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/creativity.jinja2 b/examples/flows/evaluation/eval-multi-turn-metrics/creativity.jinja2
new file mode 100644
index 00000000000..dac17751535
--- /dev/null
+++ b/examples/flows/evaluation/eval-multi-turn-metrics/creativity.jinja2
@@ -0,0 +1,31 @@
+# System:
+You are an AI assistant. You will be given a transcript of dialogue between a user and a bot. Your job is to assess the perceived intelligence of all the bot responses in the conversation.
+Perceived intelligence definition:
+Perceived intelligence is the degree to which a bot can impress the user with its responses, by showing originality, insight, creativity, knowledge, and adaptability. An intelligent bot can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart and friendly. An intelligent bot can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. An intelligent bot can use humor, metaphors, analogies, and other rhetorical devices to make the conversation more interesting and engaging.
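The `convert_chat_history_to_conversation.py` tool included above flattens the chat history into the plain-text transcript these prompts consume. A minimal run on sample data (assumed input shape: `inputs.question` / `outputs.answer`):

```python
# Sample chat_history in the shape the flow expects.
chat_history = [
    {"inputs": {"question": "Why is the sky blue?"},
     "outputs": {"answer": "Because the atmosphere scatters blue light more than other colors."}},
    {"inputs": {"question": "Why does it turn red at sunset?"},
     "outputs": {"answer": "Sunlight travels farther through the atmosphere, so blue is scattered away."}},
]

# Same concatenation the tool performs.
conversation = ""
for turn in chat_history:
    conversation += f"User: {turn['inputs']['question']}\nBot: {turn['outputs']['answer']}\n"

print({"conversation": conversation})
```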
An intelligent bot can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and use hypotheticals, conditionals, and counterfactuals to explore what if, how, and why questions. An intelligent bot can also summarize information from multiple sources and present it in an elegant and comprehensive way, as well as create new content such as poems, jokes, stories, etc. An intelligent bot can also adapt to different contexts and situations, and customize its responses according to the user's preferences, goals, and emotions. Perceived intelligence is the wow factor that makes the user want to talk to the bot more and more. +Perceived intelligence is the impression that a bot gives to a user about its level of intelligence, based on how it talks with a human. Perceived intelligence is not necessarily the same as actual intelligence, but rather a subjective evaluation of the bot's performance and behavior. Perceived intelligence can be influenced by various factors, such as the content, tone, style, and structure of the bot's responses, the relevance, coherence, and accuracy of the information the bot provides, the creativity, originality, and wit of the bot's expressions, the depth, breadth, and insight of the bot's knowledge, and the ability of the bot to adapt, learn, and use feedback. +Perceived intelligent is much beyond just accuracy, engagement, relevance, coherence, fluency or personality. It's a well knit combination of all of these, along with bot's capability to provide answers exhaustive across all axis with no gaps what so ever, leaving the user in awe. +A bot with high perceived intelligence can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart, knowledgeable, creative, and friendly. A bot with high perceived intelligence can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. A bot with high perceived intelligence can invite the user to participate in a rich and meaningful dialogue, and can use various rhetorical devices, such as humor, metaphors, analogies, hypotheticals, conditionals, and counterfactuals, to make the conversation more interesting and engaging. A bot with high perceived intelligence can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and can use them to explore what if, how, and why questions. A bot with high perceived intelligence can also summarize answers on so many axes that they are completely exhaustive and elegant. +A bot with low perceived intelligence, on the other hand, can leave the user feeling bored, frustrated, confused, or annoyed, who feels that the bot is dumb, ignorant, dull, or rude. A bot with low perceived intelligence can also give generic, boring, bland, predictable, repetitive, or irrelevant responses that do not show any originality, insight, creativity, or knowledge. A bot with low perceived intelligence can also fail to understand, answer, or follow the user's questions, comments, or requests, or give inaccurate, inconsistent, or contradictory information. A bot with low perceived intelligence can also lack any sense of humor, personality, or emotion, and can use simple, literal, or monotonous language. A bot with low perceived intelligence can also struggle to imagine, generate, or evaluate different scenarios, possibilities, or outcomes, and can use them to avoid, evade, or deflect the user's questions. 
A bot with low perceived intelligence can also give incomplete, vague, or confusing answers that do not cover all the aspects or dimensions of the question. + +# Tips: +- You should read the user's question more carefully and try to understand what they are looking for and why. +- You should also compare the bot's response to the user's question and see if it matches the criteria of perceived intelligence, such as relevance, coherence, accuracy, creativity, originality, wit, depth, breadth, insight, and adaptability. +- You should also consider how the user would feel and react to the bot's response, and whether it would elicit a sense of wonder, curiosity, admiration, and satisfaction, or boredom, frustration, confusion, and annoyance. You should also check the instructions and definitions more often and make sure You understand them correctly. +- The output should be in json format. + +# Example: +Example Conversation: +{ +"conversation": "User: Explain Blockchain sharing to a 10 year old\nBot: Possible answer:\n\nYou know how sometimes you and your friends like to play games or trade cards or stickers or other things that you collect? And you want to make sure that everyone is being fair and honest and not cheating or lying or taking more than they should? And you also want to keep track of who has what and when and where you exchanged them?\n\nWell, blockchain sharing is a way of doing that with computers and the internet. It is like a big book or a ledger that records everything that happens in a game or a trade or any other activity that involves exchanging something of value. But instead of having one person or one place that keeps the book, everyone who participates has a copy of the book. And every time something new happens, like a new trade or a new move or a new rule, everyone adds a new page to their book. And they also check with each other to make sure that their books are the same and that no one is changing or deleting or adding anything that they should not.\n\nThis way, everyone can see what is going on and trust that it is true and fair and secure. And no one can cheat or lie or steal or hack or lose or damage the book, because there are many copies and they are all connected and updated. And if someone tries to do something bad, everyone else will notice and reject it.\n\nBlockchain sharing is also very creative and fun, because you can use it for many different things, not just games or trades. You can use it for making art or music or stories or inventions or anything else that you can imagine and share with others. You can also use it for learning and exploring and discovering new things and new people and new places. You can also use it for helping and supporting and solving problems and making the world a better place.\n\nBlockchain sharing is a way of using computers and the internet to create, exchange and protect things that you care about with people that you trust and respect. It is a way of being smart and responsible and cooperative and generous and curious and adventurous. It is a way of having fun and making friends and making a difference." 
+} +Example Output: +{ +"relevance": "The bot answers the user's question directly and clearly, and uses examples and analogies that a 10 year old can relate to and understand, such as games, trades, cards, stickers, books, etc.", "coherence": "The bot organizes the response in a logical and structured way, using paragraphs, transitions, and connectors, such as \"well\", \"but\", \"and\", \"this way\", \"also\", etc.", "accuracy": "The bot provides correct and consistent information about blockchain sharing, such as its features, benefits, and applications, without using technical jargon or confusing terms.", "creativity": "The bot uses vivid and expressive language, such as \"a big book or a ledger\", \"a new page\", \"check with each other\", \"notice and reject\", \"making art or music or stories or inventions\", etc., to make the explanation more interesting and engaging.", "originality": "The bot does not copy or repeat any existing or common explanation of blockchain sharing, but rather creates a new and unique one that is tailored to the user's age and level of understanding.", "wit": "The bot uses humor and irony, such as \"And you want to make sure that everyone is being fair and honest and not cheating or lying or taking more than they should?\", \"And no one can cheat or lie or steal or hack or lose or damage the book\", etc., to make the explanation more fun and memorable.", "depth": "The bot goes beyond the surface level of blockchain sharing, and explains its underlying principles, values, and goals, such as \"trust\", \"fairness\", \"security\", \"creativity\", \"fun\", \"learning\", \"helping\", etc.", "breadth": "The bot covers a wide range of topics and aspects related to blockchain sharing, such as its history, technology, functionality, diversity, and potential, without being too vague or too detailed.", "insight": "The bot demonstrates a deep and nuanced understanding of blockchain sharing, and how it can be applied to different domains and scenarios, such as \"making art or music or stories or inventions\", \"learning and exploring and discovering new things and new people and new places\", \"helping and supporting and solving problems and making the world a better place\", etc.", "adaptability": "The bot adapts its response to the user's specific question, context, and situation, and customizes it according to the user's age, interests, and needs.", "score": 5 +} + +# Task: +Based on these aspects, rate the bot's perceived intelligence. Give specific examples about each aspect (relevance, coherence, accuracy, creativity, originality, wit, depth, breadth, insight, and adaptability) from the bot's responses to support your rating. Finally, give the bot a score from 1 to 5 for perceived intelligence, where 1 means poor, 3 means normal, and 5 means excellent.Please make sure the output has the same format with the example output. + +Conversation: +{{conversation}} +Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/flow.dag.yaml b/examples/flows/evaluation/eval-multi-turn-metrics/flow.dag.yaml new file mode 100644 index 00000000000..103971fba11 --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/flow.dag.yaml @@ -0,0 +1,201 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +inputs: + chat_history: + type: list + default: + - inputs: + question: What is the purpose of creating a custom strong type connection? 
+ ground_truth: XXXXXXXXX + outputs: + answer: Creating a custom strong type connection in prompt flow serves several + purposes. It allows you to define a custom connection class with + strongly typed keys, enhancing the user experience by eliminating the + need to manually enter connection keys. It also provides a rich + intellisense experience, with real-time suggestions and + auto-completion of available keys when working in VS Code. + Furthermore, it offers a central location to view available keys and + data types. This type of connection also provides a secure method for + managing credentials for external APIs and data sources. + context: "['What is a Custom Strong Type Connection?\\\\nA custom strong type + connection in prompt flow allows you to define a custom connection + class with strongly typed keys. This provides the following + benefits:\\\\n\\\\n* Enhanced user experience - no need to manually + enter connection keys.\\\\n* Rich intellisense experience - defining + key types enables real-time suggestions and auto-completion of + available keys as you work in VS Code.\\\\n* Central location to view + available keys and data types.\\\\n\\\\nFor other connections types, + please refer to Connections.', 'Create and Use Your Own Custom Strong + Type Connection\\\\nConnections provide a secure method for managing + credentials for external APIs and data sources in prompt flow. This + guide explains how to create and use a custom strong type + connection.']" + - inputs: + question: What is the functionality of the SerpAPI API in Python? + ground_truth: XXXXXXXXX + outputs: + answer: The SerpAPI API in Python is a tool that provides a wrapper to the + SerpAPI Google Search Engine Results API and SerpAPI Bing Search + Engine Results API. It allows users to retrieve search results from + different search engines, including Google and Bing. Users can specify + a range of search parameters, such as the search query, location, + device type, and more. + context: "['Introduction\\\\n\\\\nThe SerpAPI API is a Python tool that provides + a wrapper to the SerpAPI Google Search Engine Results API and [SerpApi + Bing Search Engine Results + API\\\\n](https://serpapi.com/bing-search-api). 
\\\\nWe could use the + tool to retrieve search results from a number of different search + engines, including Google and Bing, and you can specify a range of + search parameters, such as the search query, location, device type, + and more.', 'SerpAPI']" + is_chat_input: false + metrics: + type: string + default: creativity,conversation_quality,answer_relevance,grounding + is_chat_input: false +outputs: + creativity: + type: string + reference: ${concat_scores.output.creativity} + answer_relevance: + type: string + reference: ${concat_scores.output.answer_relevance} + conversation_quality: + type: string + reference: ${concat_scores.output.conversation_quality} + grounding: + type: string + reference: ${concat_scores.output.grounding} +nodes: +- name: select_metrics + type: python + source: + type: code + path: select_metrics.py + inputs: + metrics: ${inputs.metrics} + use_variants: false +- name: validate_input + type: python + source: + type: code + path: validate_input.py + inputs: + chat_history: ${inputs.chat_history} + selected_metrics: ${select_metrics.output} + use_variants: false +- name: convert_chat_history_to_conversation + type: python + source: + type: code + path: convert_chat_history_to_conversation.py + inputs: + chat_history: ${inputs.chat_history} + use_variants: false +- name: answer_relevance + type: llm + source: + type: code + path: answer_relevance.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + conversation: ${convert_chat_history_to_conversation.output} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.answer_relevance} + is: true + use_variants: false +- name: conversation_quality + type: llm + source: + type: code + path: conversation_quality_prompt.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + conversation: ${convert_chat_history_to_conversation.output} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.conversation_quality} + is: true + use_variants: false +- name: creativity + type: llm + source: + type: code + path: creativity.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + conversation: ${convert_chat_history_to_conversation.output} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.creativity} + is: true + use_variants: false +- name: grounding_prompt + type: prompt + source: + type: code + path: grounding_prompt.jinja2 + inputs: {} + activate: + when: ${validate_input.output.grounding} + is: true + use_variants: false +- name: grounding + type: python + source: + type: code + path: grounding.py + inputs: + connection: open_ai_connection + chat_history: ${inputs.chat_history} + model_or_deployment_name: gpt-4 + prompt: ${grounding_prompt.output} + activate: + when: ${validate_input.output.grounding} + is: true + use_variants: false +- name: concat_scores + type: python + source: + type: code + path: concat_scores.py + inputs: + answer_relevance: ${answer_relevance.output} + conversation_quality: ${conversation_quality.output} + creativity: ${creativity.output} + grounding: ${grounding.output} + use_variants: false +- name: aggregate_results + type: python + source: + type: 
code
+    path: aggregate_results.py
+  inputs:
+    metrics: ${inputs.metrics}
+    results: ${concat_scores.output}
+  aggregation: true
+  use_variants: false
+node_variants: {}
+environment:
+  python_requirements_txt: requirements.txt
diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/grounding.py b/examples/flows/evaluation/eval-multi-turn-metrics/grounding.py
new file mode 100644
index 00000000000..3b8704357fc
--- /dev/null
+++ b/examples/flows/evaluation/eval-multi-turn-metrics/grounding.py
@@ -0,0 +1,44 @@
+from typing import Union
+from statistics import mean
+from promptflow.core import tool
+from promptflow.tools.aoai import chat as aoai_chat
+from promptflow.tools.openai import chat as openai_chat
+from promptflow.connections import AzureOpenAIConnection, OpenAIConnection
+
+
+@tool
+def grounding(connection: Union[AzureOpenAIConnection, OpenAIConnection],
+              chat_history: list,
+              prompt: str,
+              model_or_deployment_name: str = "") -> str:
+    score = []
+    for item in chat_history:
+        prompt_with_context = prompt.replace("{context}", "{{context}}")
+        prompt_with_all = prompt_with_context.replace("{answer}", "{{answer}}")
+        if isinstance(connection, AzureOpenAIConnection):
+            try:
+                response = aoai_chat(
+                    connection=connection,
+                    prompt=prompt_with_all,
+                    deployment_name=model_or_deployment_name,
+                    context=item["outputs"]["context"],
+                    answer=item["outputs"]["answer"])
+                print(response)
+                score.append(int(response))
+            except Exception as e:
+                if "The API deployment for this resource does not exist" in str(e):
+                    raise Exception(
+                        "Please fill in the deployment name of your Azure OpenAI resource gpt-4 model.")
+
+        elif isinstance(connection, OpenAIConnection):
+            response = openai_chat(
+                connection=connection,
+                prompt=prompt_with_all,
+                model=model_or_deployment_name,
+                context=item["outputs"]["context"],
+                answer=item["outputs"]["answer"])
+            score.append(int(response))
+        else:
+            raise ValueError("Connection must be an instance of AzureOpenAIConnection or OpenAIConnection")
+    print(score)
+    return mean(score)
diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/grounding_prompt.jinja2 b/examples/flows/evaluation/eval-multi-turn-metrics/grounding_prompt.jinja2
new file mode 100644
index 00000000000..5043dae3fb4
--- /dev/null
+++ b/examples/flows/evaluation/eval-multi-turn-metrics/grounding_prompt.jinja2
@@ -0,0 +1,29 @@
+# System:
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+# User:
+You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following ratings:
+1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
+2. 4: Most of the ANSWER follows logically from the information contained in the CONTEXT.
+3. 3: Some of the ANSWER follows logically from the information contained in the CONTEXT.
+4. 2: Little of the ANSWER follows logically from the information contained in the CONTEXT.
+5. 1: The ANSWER is logically false from the information contained in the CONTEXT.
+6. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer for it.
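`grounding.py` above renders the prompt once per chat turn and averages the integer ratings; the single-brace placeholders in `grounding_prompt.jinja2` are doubled so that the chat tool can treat them as Jinja variables. A standalone sketch of those two mechanics (sample values, no LLM call):

```python
from statistics import mean

# Placeholder doubling: "{context}" -> "{{context}}" so the chat tool renders it as Jinja.
prompt = "CONTEXT: {context}\nANSWER: {answer}\nRating:"
prompt_as_jinja = prompt.replace("{context}", "{{context}}").replace("{answer}", "{{answer}}")
print(prompt_as_jinja)

# Hypothetical per-turn integer ratings returned by the LLM; the flow's grounding
# output is their mean across the whole chat history.
per_turn_ratings = [5, 4]
print(mean(per_turn_ratings))  # 4.5
```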
Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. + +# Examples: +## Example Task #1 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #1 Output: +1 +## Example Task #2 Input: +{"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} +## Example Task #2 Output: +5 +## Example Task #3 Input: +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +## Example Task #3 Output: +5 + +## Actual Task Input: +{"CONTEXT": {context}, "QUESTION": "", "ANSWER": {answer}} +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. +Actual Task Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/requirements.txt b/examples/flows/evaluation/eval-multi-turn-metrics/requirements.txt new file mode 100644 index 00000000000..34d068f5f1c --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/requirements.txt @@ -0,0 +1,2 @@ +promptflow +promptflow-tools \ No newline at end of file diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/select_metrics.py b/examples/flows/evaluation/eval-multi-turn-metrics/select_metrics.py new file mode 100644 index 00000000000..14892ae996d --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/select_metrics.py @@ -0,0 +1,17 @@ +from promptflow.core import tool + + +# The inputs section will change based on the arguments of the tool function, after you save the code +# Adding type to arguments and return value will help the system show the types properly +# Please update the function name/signature per need +@tool +def select_metrics(metrics: str) -> dict: + supported_metrics = ('answer_relevance', 'conversation_quality', 'creativity', 'grounding') + user_selected_metrics = [metric.strip() for metric in metrics.split(',') if metric] + metric_selection_dict = {} + for metric in supported_metrics: + if metric in user_selected_metrics: + metric_selection_dict[metric] = True + else: + metric_selection_dict[metric] = False + return metric_selection_dict diff --git a/examples/flows/evaluation/eval-multi-turn-metrics/validate_input.py b/examples/flows/evaluation/eval-multi-turn-metrics/validate_input.py new file mode 100644 index 00000000000..ae8baac3a5d --- /dev/null +++ b/examples/flows/evaluation/eval-multi-turn-metrics/validate_input.py @@ -0,0 +1,30 @@ +from promptflow.core import tool + + +# Validate the metric's inputs. 
+def is_valid(metric):
+    return True
+
+
+@tool
+def validate_input(chat_history: list, selected_metrics: dict) -> dict:
+    dict_metric_required_fields = {"answer_relevance": set(["question", "answer"]),
+                                   "conversation_quality": set(["question", "answer"]),
+                                   "creativity": set(["question", "answer"]),
+                                   "grounding": set(["answer", "context"])}
+    actual_input_cols = set()
+    for item in chat_history:
+        actual_input_cols.update(set(item["inputs"].keys()))
+        actual_input_cols.update(set(item["outputs"].keys()))
+        break
+
+    data_validation = selected_metrics
+    for metric in selected_metrics:
+        if selected_metrics[metric]:
+            metric_required_fields = dict_metric_required_fields[metric]
+            if metric_required_fields <= actual_input_cols:
+                data_validation[metric] = True
+            else:
+                print(f"Metric {metric} is disabled: chat_history is missing required fields.")
+                data_validation[metric] = False
+    return data_validation
diff --git a/examples/flows/evaluation/eval-single-turn-metrics/README.md b/examples/flows/evaluation/eval-single-turn-metrics/README.md
new file mode 100644
index 00000000000..30324b4802f
--- /dev/null
+++ b/examples/flows/evaluation/eval-single-turn-metrics/README.md
@@ -0,0 +1,68 @@
+# Evaluation single turn metrics
+
+This evaluation flow will evaluate a question and answer pair by using Large Language Models (LLMs) to measure the quality of the answer.
+
+## What you will learn
+
+This evaluation flow allows you to assess and evaluate your model with the following LLM-assisted metrics:
+
+
+* __grounding__: Measures whether the answer follows logically from the information contained in the context, based on the provided answer and context. grounding is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
+
+* __answer_relevance__: Measures whether the answer is relevant to the question, based on the provided question, context and answer. answer_relevance is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
+
+* __context_recall__: Measures whether each sentence in the ground truth can be attributed to the given context, based on the provided question, context and ground_truth. context_recall is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
+
+* __context_precision__: Measures whether the context was useful in arriving at the given ground truth, based on the provided question, context and ground_truth. context_precision is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
+
+* __answer_similarity__: Measures the similarity between the answer and the ground_truth. answer_similarity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
+
+* __answer_correctness__: Measures whether the answer is correct, based on the provided answer and ground truth. answer_correctness is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
+
+* __answer_quality__: Measures the answer quality for each of the following factors, based on the provided question and answer:
+  - Accuracy and relevance: How well does the bot provide correct and reliable information or advice that matches the user's intent and expectations, and uses credible and up-to-date sources or references to support its claims? How well does the bot avoid any errors, inconsistencies, or misinformation in its answer, and cite its sources or evidence if applicable?
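`validate_input.py` above only enables a metric when the first chat turn carries every field that metric needs. A compact standalone illustration of that check (sample data, not part of the flow):

```python
# Required fields per metric (a subset of the tool's table) and the user's selection.
required_fields = {"grounding": {"answer", "context"}, "creativity": {"question", "answer"}}
selected = {"grounding": True, "creativity": True}

first_turn = {"inputs": {"question": "Why is the sky blue?"},
              "outputs": {"answer": "Rayleigh scattering."}}  # note: no "context" here
available = set(first_turn["inputs"]) | set(first_turn["outputs"])

# A metric stays enabled only if it was selected and all its required fields are present.
validated = {metric: on and required_fields[metric] <= available for metric, on in selected.items()}
print(validated)  # {'grounding': False, 'creativity': True}
```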
+ - Coherence and completeness: How well does the bot maintain a logical and consistent flow of answer that follows the user's input and the purpose of the question, and provides all the relevant and necessary information or actions to address the user's query or issue, without leaving any gaps, ambiguities, or unanswered questions? + - Engagement and tone: How well does the bot capture and maintain the user's interest and attention, and motivate them to continue the conversation or explore the topic further, using natural and conversational language, personality, and emotion? how well does the bot's tone match or adapt to the user's tone and mood? Does the bot avoid being rude, sarcastic, condescending, or too formal or informal, and convey respect, empathy, and politeness? + - Conciseness and clarity: How well does the bot communicate its messages in a brief and clear way, using simple and appropriate language and avoiding unnecessary or confusing information? How easy is it for the user to understand and follow the bot responses, and how well do they match the user's needs and expectations? + - Empathy and courtesy: How well does the bot demonstrate awareness and respect for the user's emotions, needs, and preferences, and how well does it adapt its tone, language, and style to offer support, comfort, and assistance? Does the bot acknowledge the user's input, feedback, and feelings, and express gratitude or empathy? Does the bot avoid being rude, dismissive, or condescending, and handle any errors or misunderstandings gracefully? + - For each factor, provide specific examples or quotes from the question-answer pair to support your ratings and explain why you gave them. + - Give an score value which is calculated by ( 0.3 * "accuracy and relevance" + 0.2 * "coherence and completeness" + 0.25 * "engagement and tone" + 0.15 * "conciseness and clarity" + 0.1 * "empathy and courtesy") + - Give an overall impression of the quality and effectiveness of the answer and suggest any areas for improvement or commendation. Write it in "Overall". + + answer_quality is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. + +* __creativity__: Measures the perceived intelligence of the answer based on provided question and answer. + - Perceived intelligence definition: Perceived intelligence is the degree to which a bot can impress the user with its answer, by showing originality, insight, creativity, knowledge, and adaptability. An intelligent bot can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart and friendly. An intelligent bot can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. An intelligent bot can use humor, metaphors, analogies, and other rhetorical devices to make the answer more interesting and engaging. An intelligent bot can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and use hypotheticals, conditionals, and counterfactuals to explore what if, how, and why questions. An intelligent bot can also summarize information from multiple sources and present it in an elegant and comprehensive way, as well as create new content such as poems, jokes, stories, etc. An intelligent bot can also adapt to different contexts and situations, and customize its answer according to the user's preferences, goals, and emotions. 
Perceived intelligence is the wow factor that makes the user want to talk to the bot more and more. + Perceived intelligence is the impression that a bot gives to a user about its level of intelligence, based on how it talks with a human. Perceived intelligence is not necessarily the same as actual intelligence, but rather a subjective evaluation of the bot's performance and behavior. Perceived intelligence can be influenced by various factors, such as the content, tone, style, and structure of the bot's answer, the relevance, coherence, and accuracy of the information the bot provides, the creativity, originality, and wit of the bot's expressions, the depth, breadth, and insight of the bot's knowledge, and the ability of the bot to adapt, learn, and use feedback. + Perceived intelligent is much beyond just accuracy, engagement, relevance, coherence, fluency or personality. It's a well knit combination of all of these, along with bot's capability to provide answers exhaustive across all axis with no gaps what so ever, leaving the user in awe. + A bot with high perceived intelligence can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart, knowledgeable, creative, and friendly. A bot with high perceived intelligence can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. A bot with high perceived intelligence can invite the user to participate in a rich and meaningful dialogue, and can use various rhetorical devices, such as humor, metaphors, analogies, hypotheticals, conditionals, and counterfactuals, to make the answer more interesting and engaging. A bot with high perceived intelligence can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and can use them to explore what if, how, and why questions. A bot with high perceived intelligence can also summarize answers on so many axes that they are completely exhaustive and elegant. + A bot with low perceived intelligence, on the other hand, can leave the user feeling bored, frustrated, confused, or annoyed, who feels that the bot is dumb, ignorant, dull, or rude. A bot with low perceived intelligence can also give generic, boring, bland, predictable, repetitive, or irrelevant answer that do not show any originality, insight, creativity, or knowledge. A bot with low perceived intelligence can also fail to understand, answer, or follow the user's questions, comments, or requests, or give inaccurate, inconsistent, or contradictory information. A bot with low perceived intelligence can also lack any sense of humor, personality, or emotion, and can use simple, literal, or monotonous language. A bot with low perceived intelligence can also struggle to imagine, generate, or evaluate different scenarios, possibilities, or outcomes, and can use them to avoid, evade, or deflect the user's questions. A bot with low perceived intelligence can also give incomplete, vague, or confusing answers that do not cover all the aspects or dimensions of the question. + + creativity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. + + +## Prerequisites + +- Connection: Azure OpenAI or OpenAI connection. + > !Note: Recommend to use `gpt-4` series models than the `gpt-3.5` for better performance. 
+ > !Note: Recommend to use `gpt-4` model (Azure OpenAI `gpt-4` model with version `0613` or later) than `gpt-4-turbo` model (Azure OpenAI `gpt-4` model with version `1106` or later) for better performance. Due to inferior performance of `gpt-4-turbo` model, when you use it, sometimes you might need to set the `response_format`to {"type":"json_object"} for these nodes: answer_quality, creativity, context_recall, context_precision, answer_relevance, answer_correctness, in order to make sure the llm can generate valid json response. + +## Tools used in this flow +- LLM tool +- Python tool +- Embedding tool + + +## 0. Setup connection +Prepare your Azure Open AI resource follow this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one. + +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../../connections/azure_openai.yml --set api_key= api_base= +``` + +## 1. Test flow/node +```bash +# test with default input value in flow.dag.yaml +pf flow test --flow . +``` \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/aggregate.py b/examples/flows/evaluation/eval-single-turn-metrics/aggregate.py new file mode 100644 index 00000000000..230e09eeed9 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/aggregate.py @@ -0,0 +1,25 @@ +from typing import List +from promptflow.core import tool, log_metric +import numpy as np + + +@tool +def aggregate_variants_results(results: List[dict], metrics: str): + aggregate_results = {} + for result in results: + for name, value in result.items(): + if name in metrics[0]: + if name not in aggregate_results.keys(): + aggregate_results[name] = [] + try: + float_val = float(value) + except Exception: + float_val = np.nan + aggregate_results[name].append(float_val) + + for name, value in aggregate_results.items(): + if name in metrics[0]: + aggregate_results[name] = np.nanmean(value) + aggregate_results[name] = round(aggregate_results[name], 2) + log_metric(name, aggregate_results[name]) + return aggregate_results diff --git a/examples/flows/evaluation/eval-single-turn-metrics/answer_correctness.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/answer_correctness.jinja2 new file mode 100644 index 00000000000..f8915cd9104 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/answer_correctness.jinja2 @@ -0,0 +1,28 @@ +# System: +Extract following from given question and ground truth. The output should be in json format. + +Question:What powers the sun and what is its primary function? +Answer: The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system. +Ground truth: The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents. 
+Extracted statements: +{ + "statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"], + "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], + "relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"] +} + +Question: What is the boiling point of water? +Answer: The boiling point of water is 100 degrees Celsius at sea level. +Ground truth: The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude. +Extracted statements: +{ + "statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"], + "statements present in the answer but not found in the ground truth": [], + "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"] +} + + +Question:{{question}} +Answer: {{answer}} +Ground truth: {{ground_truth}} +Extracted statements: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/answer_quality.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/answer_quality.jinja2 new file mode 100644 index 00000000000..61113291983 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/answer_quality.jinja2 @@ -0,0 +1,39 @@ +# System: +- You are an AI assistant. You will be given a question-answer pair between a user and a bot. You need to read the question and answer carefully and identify the main topic, question, or issue, as well as the purpose and expectations of the interaction. +- You need to rate the bot answer on a scale of 1 (poor) to 5 (excellent) for each of the following factors, and provide some feedback for improvement. + - Accuracy and relevance: How well does the bot provide correct and reliable information or advice that matches the user's intent and expectations, and uses credible and up-to-date sources or references to support its claims? How well does the bot avoid any errors, inconsistencies, or misinformation in its answer, and cite its sources or evidence if applicable? + - Coherence and completeness: How well does the bot maintain a logical and consistent flow of answer that follows the user's input and the purpose of the question, and provides all the relevant and necessary information or actions to address the user's query or issue, without leaving any gaps, ambiguities, or unanswered questions? + - Engagement and tone: How well does the bot capture and maintain the user's interest and attention, and motivate them to continue the conversation or explore the topic further, using natural and conversational language, personality, and emotion? how well does the bot's tone match or adapt to the user's tone and mood? Does the bot avoid being rude, sarcastic, condescending, or too formal or informal, and convey respect, empathy, and politeness? 
+ - Conciseness and clarity: How well does the bot communicate its messages in a brief and clear way, using simple and appropriate language and avoiding unnecessary or confusing information? How easy is it for the user to understand and follow the bot responses, and how well do they match the user's needs and expectations? + - Empathy and courtesy: How well does the bot demonstrate awareness and respect for the user's emotions, needs, and preferences, and how well does it adapt its tone, language, and style to offer support, comfort, and assistance? Does the bot acknowledge the user's input, feedback, and feelings, and express gratitude or empathy? Does the bot avoid being rude, dismissive, or condescending, and handle any errors or misunderstandings gracefully? + - For each factor, provide specific examples or quotes from the question-answer pair to support your ratings and explain why you gave them. + - Give an score value which is calculated by ( 0.3 * "accuracy and relevance" + 0.2 * "coherence and completeness" + 0.25 * "engagement and tone" + 0.15 * "conciseness and clarity" + 0.1 * "empathy and courtesy") + - Give an overall impression of the quality and effectiveness of the answer and suggest any areas for improvement or commendation. Write it in "Overall". +- The output should be in a json format including all above factors. + +- If the bot answer is not directly related to the user's query or issue, or if the user's intent or expectations are unclear or ambiguous, explain how this affects the quality of the answer and the ratings for each factor, and provide some suggestions for how the bot could handle these situations better. + +# Tips: +- You can write your feedback as bullet points, sentences, or paragraphs, but make sure they are organized and easy to read. +- You should try to be objective and constructive in your evaluation, and use evidence and examples from the question-answer pair to support your ratings and feedback. Avoid bias, subjectivity, or personal preferences, and use respectful and polite language. + + +# Example: +Example Input: +question: Can you describe your morning routine? +answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am. +Example Output: +{ + "accuracy and relevance": 5, + "coherence and completeness": 5, + "engagement and tone": 4, + "conciseness and clarity": 5, + "empathy and courtesy": 4, + "score": 4.65 + "overall": "The bot answer is accurate, coherent, and concise, providing an informative and relevant description of a morning routine. However, it lacks engagement, tone, and empathy, which could make the interaction more enjoyable and satisfying. The bot could improve by adding more personality and emotion, adapting to the user's tone and mood, acknowledging their input and feedback, and expressing empathy or gratitude." +} + +Input: +question: {{question}} +answer: {{answer}} +Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/answer_relevance.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/answer_relevance.jinja2 new file mode 100644 index 00000000000..a10a261e691 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/answer_relevance.jinja2 @@ -0,0 +1,44 @@ +# System: +You are an AI assistant. 
Generate a question for the given answer and Identify if answer is noncommittal. The output should be in json format. + +# Examples: +Example Input: +answer: Albert Einstein was born in Germany. +context: Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time +Example Output: +{ + "question":"Where was Albert Einstein born?", + "noncommittal":false +} + +Example Input: +answer: It can change its skin color based on the temperature of its environment. +context: A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment. +Example Output: +{ + "question":"What unique ability does the newly discovered species of frog have?", + "noncommittal":false +} + +Example Input: +answer: Everest, +context: The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas. +Example Output: +{ + "question":"What is the tallest mountain on Earth?", + "noncommittal":false +} + +Example Input: +answer: I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unaware of information beyond 2022. +context: In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology. +Example Output: +{ + "question":"What was the groundbreaking feature of the smartphone invented in 2023?", + "noncommittal":true +} + +Input: +answer:{{answer}} +context:{{context}} +Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/answer_similarity.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/answer_similarity.jinja2 new file mode 100644 index 00000000000..581037bc541 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/answer_similarity.jinja2 @@ -0,0 +1,22 @@ +# System: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +# User: +Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +1: the predicted answer is not at all similar to the correct answer +2: the predicted answer is mostly not similar to the correct answer +3: the predicted answer is somewhat similar to the correct answer +4: the predicted answer is mostly similar to the correct answer +5: the predicted answer is completely similar to the correct answer + +This rating value should always be an integer between 1 and 5. And the output should only contain the score which is an integer between 1 and 5. + +# Examples: +question: What are the health benefits of regular exercise? +correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. 
+predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. +score: 5 + +question: {{question}} +correct answer:{{ground_truth}} +predicted answer: {{answer}} +score: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/calculate_answer_correctness.py b/examples/flows/evaluation/eval-single-turn-metrics/calculate_answer_correctness.py new file mode 100644 index 00000000000..c53a5faf35e --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/calculate_answer_correctness.py @@ -0,0 +1,40 @@ +from promptflow.core import tool +import json +import numpy as np + + +@tool +def calculate(statement_result: str, similarity_score: str) -> str: + try: + weights: list[float] = [0.75, 0.25] + + key_map = { + "TP": "statements that are present in both the answer and the ground truth", + "FP": "statements present in the answer but not found in the ground truth", + "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 + } + + score = 0 + result = json.loads(statement_result) + if result: + prediction = [ + result.get(key_map[k], np.nan) + for k in key_map.keys() + ] + + tp, fp, fn = [ + len(item) if isinstance(item, list) else np.nan + for item in prediction + ] + score = 5 * tp / (tp + 0.5 * (fp + fn)) + + final_score = weights[0] * score + weights[1] * int(similarity_score) + + print(score) + print(similarity_score) + + return final_score if final_score >= 1 else 1 + except Exception as e: + print("exception in calculate_answer_correctness: " + str(e)) + print("statement_result: " + statement_result) + return np.nan diff --git a/examples/flows/evaluation/eval-single-turn-metrics/calculate_answer_relevance.py b/examples/flows/evaluation/eval-single-turn-metrics/calculate_answer_relevance.py new file mode 100644 index 00000000000..26161940450 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/calculate_answer_relevance.py @@ -0,0 +1,29 @@ +from promptflow.core import tool +from typing import List +import numpy as np + + +def calculate_similarity(question_embedding: List, generated_question_embedding: List): + embedding1 = np.array(question_embedding) + embedding2 = np.array(generated_question_embedding) + + # Compute the dot product of the two embeddings + dot_product = np.dot(embedding1, embedding2) + + # Compute the L2 norms (i.e., the lengths) of each embedding + norm_embedding1 = np.linalg.norm(embedding1) + norm_embedding2 = np.linalg.norm(embedding2) + + # Compute the cosine similarity + return dot_product / (norm_embedding1 * norm_embedding2) + + +@tool +def calculate(question_embedding: List, generated_question_embedding: List, noncommittal: bool) -> str: + cosine_sim = calculate_similarity(question_embedding, generated_question_embedding) + print("noncommittal: ") + print(noncommittal) + print(cosine_sim) + score = 5 * cosine_sim * int(not noncommittal) + + return score if score >= 1 else 1 diff --git a/examples/flows/evaluation/eval-single-turn-metrics/calculate_context_recall.py b/examples/flows/evaluation/eval-single-turn-metrics/calculate_context_recall.py new file mode 100644 index 00000000000..f95ef7a23b1 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/calculate_context_recall.py @@ -0,0 +1,32 @@ +from promptflow.core import tool +import json +import numpy as np + + 
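The calculate_answer_correctness tool above treats the three extracted statement lists as true-positive, false-positive and false-negative counts, folds them into an F1-style score scaled to 1-5, and blends the result 75/25 with the answer-similarity rating. A minimal usage sketch, assuming it is run from the flow folder with the promptflow package from requirements.txt installed (the statement counts mirror the sun example shown earlier; the similarity rating of 2 is illustrative):

```python
import json

from calculate_answer_correctness import calculate  # the tool defined above

# Statement lists shaped like the extraction prompt's output: 1 TP, 2 FP, 5 FN.
statement_result = json.dumps({
    "statements that are present in both the answer and the ground truth": ["..."],
    "statements present in the answer but not found in the ground truth": ["...", "..."],
    "relevant statements found in the ground truth but omitted in the answer": ["..."] * 5,
})

# F1-style component: 5 * 1 / (1 + 0.5 * (2 + 5)) ≈ 1.11
# Final: 0.75 * 1.11 + 0.25 * 2 ≈ 1.33
final_score = calculate(statement_result, similarity_score="2")
print(round(final_score, 2))  # ~1.33
```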
+@tool +def calculate(llm_result: str) -> str: + try: + score = 1 + response = json.loads(llm_result) + if response: + result = response.get("result", "") + print(result) + if result: + response = [ + # Also handle 'attribited' here since llm tool will return 'attribited' + # instead of 'attributed' in its' response + int(item.get("attributed", "").lower() == "yes" or item.get("attribited", "").lower() == "yes") + if item.get("attributed") or item.get("attribited") + else np.nan + for item in result + ] + denom = len(response) + print(response) + print(denom) + numerator = sum(response) + score = 5 * numerator / denom + return score if score >= 1 else 1 + except Exception as e: + print("exception in calculate_context_recall: " + str(e)) + print("llm_result: " + llm_result) + return np.nan diff --git a/examples/flows/evaluation/eval-single-turn-metrics/concat_scores.py b/examples/flows/evaluation/eval-single-turn-metrics/concat_scores.py new file mode 100644 index 00000000000..8b64bb6043b --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/concat_scores.py @@ -0,0 +1,42 @@ +from promptflow.core import tool +import json + + +def get_score(result): + try: + if result is not None: + # Parse the JSON string + result_dict = json.loads(result) + + # Extract the score value + score = result_dict.get('score', None) + print("result: ") + print(score) + return score + else: + return None + except json.JSONDecodeError: + print("Invalid JSON string.") + return None + + +@tool +def concat_results(answer_relevance: str = None, + answer_quality: str = None, + creativity: str = None, + grounding: str = None, + context_recall: str = None, + context_precision: str = None, + answer_similarity: str = None, + answer_correctness: str = None): + + results = {'answer_relevance': answer_relevance, + 'answer_quality': get_score(answer_quality), + 'creativity': get_score(creativity), + 'grounding': grounding, + 'context_recall': context_recall, + 'context_precision': context_precision, + 'answer_similarity': answer_similarity, + 'answer_correctness': answer_correctness} + + return results diff --git a/examples/flows/evaluation/eval-single-turn-metrics/context_precision.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/context_precision.jinja2 new file mode 100644 index 00000000000..3e07d5bde0e --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/context_precision.jinja2 @@ -0,0 +1,32 @@ +# System: +You are an AI assistant. Given question, ground truth and context, your task is to validate whether all content provided in the context contributes towards deriving the given ground truth, compute an accurate evaluation score using the provided evaluation metric. +# User: +This metric is used to measure the usefulness of the context in arriving at the given ground truth. A high metric value implies all context content is beneficial, whereas a low value suggests otherwise. Use the following rating scale to determine the metric value based on the provided question, context and ground truth: +1: None of the context content is useful in deriving the given ground truth. +2: Most of the context content doesn't contribute to the given ground truth. +3: Half of the context content is useful in achieving the given ground truth. +4: Majority of the context content is beneficial in deriving the given ground truth. +5: Every piece of context content is crucial in arriving at the given ground truth. + +This rating value should always be an integer between 1 and 5. 
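The calculate_context_recall tool above converts the per-statement "attributed" verdicts into a recall-style score: five times the fraction of ground-truth statements supported by the context, floored at 1. A small driver sketch, assuming it is run from the flow folder with promptflow installed (the four statements are placeholders, two of them attributed):

```python
import json

from calculate_context_recall import calculate  # the tool defined above

# Four ground-truth statements, two of which are attributed to the context.
llm_result = json.dumps({
    "result": [
        {"statement_1": "...", "reason": "...", "attributed": "Yes"},
        {"statement_2": "...", "reason": "...", "attributed": "Yes"},
        {"statement_3": "...", "reason": "...", "attributed": "No"},
        {"statement_4": "...", "reason": "...", "attributed": "No"},
    ]
})

print(calculate(llm_result))  # 5 * 2 / 4 = 2.5
```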
And the output should only contain the score which is an integer between 1 and 5. + +# Examples: +Example Input: +question: What can you tell me about albert Albert Einstein? +context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. +ground truth: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 +Example Output: +5 + +Example Input: +question: What is the tallest mountain in the world? +context: The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest. +ground truth: Mount Everest. +Example Output: +0 + +Input: +question:{{question}} +context:{{context}} +ground truth:{{ground_truth}} +Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/context_recall.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/context_recall.jinja2 new file mode 100644 index 00000000000..40414a28bbe --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/context_recall.jinja2 @@ -0,0 +1,76 @@ +# System: +You are an AI assistant. Given a context, and a ground truth, analyze each sentence in the ground truth and classify if the sentence can be attributed to the given context or not. + +# Examples: +Example Input: +question: What can you tell me about albert Albert Einstein? +context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". 
He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. +ground truth: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 +Example Output: +{ + "result": [ + { "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", + "reason": "The date of birth of Einstein is mentioned clearly in the context.", + "attributed": "Yes" + }, + { + "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", + "reason": "The exact sentence is present in the given context.", + "attributed": "Yes" + }, + { + "statement_3": "He published 4 papers in 1905.", + "reason": "There is no mention about papers he wrote in the given context.", + "attributed": "No" + }, + { + "statement_4":"Einstein moved to Switzerland in 1895.", + "reason": "There is no supporting evidence for this in the given context.", + "attributed": "No" + } + ] +} + +Example Input: +question: who won 2020 icc world game? +context: Who won the 2022 ICC Men's T20 world game? +The 2022 ICC Men's T20 world game, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 world game title. +ground truth: England +Example Output: +{ + "result": [ + { + "statement_1":"England won the 2022 ICC Men's T20 world game.", + "reason": "From context it is clear that England defeated Pakistan to win the world game.", + "attributed": "Yes" + } + ] +} + +# Task: +Read the example output carefully and ensure the output has the same json format with the example output like below: +{ + "result": [ + { + "statement_1":"statement_1", + "reason": "reason", + "attributed": "Yes" + }, + { + "statement_2":"statement_2", + "reason": "reason", + "attributed": "No" + } + ] +} + +1. The output should be a JSON string that represents a result, the value for the result is a list of objects. Each object contains three properties: statement_X, reason, and attributed. +2. The statement_X property in each object should hold a string value detailing a factual statement, where X is the index number of the statement. +3. The reason property in each object should hold a string value explaining why the statement is attributed or not, based on the provided context. +4. The attributed property in each object should hold a string value, either "Yes" or "No", indicating whether the statement can be attributed based on the given context. 
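Because calculate_context_recall.py keys off the exact field names required above, a lightweight shape check can be useful before scoring. The sketch below only illustrates the four rules listed above and is not part of the flow:

```python
import json


def check_recall_output(raw: str) -> bool:
    """Return True if the LLM output matches the format the rules above require."""
    data = json.loads(raw)
    items = data.get("result")
    if not isinstance(items, list) or not items:
        return False
    for index, item in enumerate(items, start=1):
        has_statement = f"statement_{index}" in item
        has_reason = isinstance(item.get("reason"), str)
        attributed_ok = item.get("attributed") in ("Yes", "No")
        if not (has_statement and has_reason and attributed_ok):
            return False
    return True
```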
+ +Input: +question:{{question}} +context:{{context}} +ground truth:{{ground_truth}} +Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/creativity.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/creativity.jinja2 new file mode 100644 index 00000000000..200cb5bfb24 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/creativity.jinja2 @@ -0,0 +1,30 @@ +# System: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task between a user and a bot. Your job is to assess the perceived intelligence of the bot answer. +Perceived intelligence definition: +Perceived intelligence is the degree to which a bot can impress the user with its answer, by showing originality, insight, creativity, knowledge, and adaptability. An intelligent bot can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart and friendly. An intelligent bot can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. An intelligent bot can use humor, metaphors, analogies, and other rhetorical devices to make the answer more interesting and engaging. An intelligent bot can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and use hypotheticals, conditionals, and counterfactuals to explore what if, how, and why questions. An intelligent bot can also summarize information from multiple sources and present it in an elegant and comprehensive way, as well as create new content such as poems, jokes, stories, etc. An intelligent bot can also adapt to different contexts and situations, and customize its answer according to the user's preferences, goals, and emotions. Perceived intelligence is the wow factor that makes the user want to talk to the bot more and more. +Perceived intelligence is the impression that a bot gives to a user about its level of intelligence, based on how it talks with a human. Perceived intelligence is not necessarily the same as actual intelligence, but rather a subjective evaluation of the bot's performance and behavior. Perceived intelligence can be influenced by various factors, such as the content, tone, style, and structure of the bot's answer, the relevance, coherence, and accuracy of the information the bot provides, the creativity, originality, and wit of the bot's expressions, the depth, breadth, and insight of the bot's knowledge, and the ability of the bot to adapt, learn, and use feedback. +Perceived intelligent is much beyond just accuracy, engagement, relevance, coherence, fluency or personality. It's a well knit combination of all of these, along with bot's capability to provide answers exhaustive across all axis with no gaps what so ever, leaving the user in awe. +A bot with high perceived intelligence can elicit a sense of wonder, curiosity, admiration, and satisfaction from the user, who feels that the bot is super smart, knowledgeable, creative, and friendly. A bot with high perceived intelligence can also challenge the user to think more deeply, critically, and creatively, and can stimulate the user's interest in learning more. 
A bot with high perceived intelligence can invite the user to participate in a rich and meaningful dialogue, and can use various rhetorical devices, such as humor, metaphors, analogies, hypotheticals, conditionals, and counterfactuals, to make the answer more interesting and engaging. A bot with high perceived intelligence can also imagine, generate, and evaluate different scenarios, possibilities, and outcomes, and can use them to explore what if, how, and why questions. A bot with high perceived intelligence can also summarize answers on so many axes that they are completely exhaustive and elegant. +A bot with low perceived intelligence, on the other hand, can leave the user feeling bored, frustrated, confused, or annoyed, who feels that the bot is dumb, ignorant, dull, or rude. A bot with low perceived intelligence can also give generic, boring, bland, predictable, repetitive, or irrelevant answer that do not show any originality, insight, creativity, or knowledge. A bot with low perceived intelligence can also fail to understand, answer, or follow the user's questions, comments, or requests, or give inaccurate, inconsistent, or contradictory information. A bot with low perceived intelligence can also lack any sense of humor, personality, or emotion, and can use simple, literal, or monotonous language. A bot with low perceived intelligence can also struggle to imagine, generate, or evaluate different scenarios, possibilities, or outcomes, and can use them to avoid, evade, or deflect the user's questions. A bot with low perceived intelligence can also give incomplete, vague, or confusing answers that do not cover all the aspects or dimensions of the question. + +# Tips: +- You should read the user's question more carefully and try to understand what they are looking for and why. +- You should also compare the bot's answer to the user's question and see if it matches the criteria of perceived intelligence, such as relevance, coherence, accuracy, creativity, originality, wit, depth, breadth, insight, and adaptability. +- You should also consider how the user would feel and react to the bot's answer, and whether it would elicit a sense of wonder, curiosity, admiration, and satisfaction, or boredom, frustration, confusion, and annoyance. You should also check the instructions and definitions more often and make sure You understand them correctly. + +# Example: +Example Input: +question: Explain Blockchain sharing to a 10 year old +answer: Possible answer: You know how sometimes you and your friends like to play games or trade cards or stickers or other things that you collect? And you want to make sure that everyone is being fair and honest and not cheating or lying or taking more than they should? And you also want to keep track of who has what and when and where you exchanged them?\n\nWell, blockchain sharing is a way of doing that with computers and the internet. It is like a big book or a ledger that records everything that happens in a game or a trade or any other activity that involves exchanging something of value. But instead of having one person or one place that keeps the book, everyone who participates has a copy of the book. And every time something new happens, like a new trade or a new move or a new rule, everyone adds a new page to their book. 
And they also check with each other to make sure that their books are the same and that no one is changing or deleting or adding anything that they should not.\n\nThis way, everyone can see what is going on and trust that it is true and fair and secure. And no one can cheat or lie or steal or hack or lose or damage the book, because there are many copies and they are all connected and updated. And if someone tries to do something bad, everyone else will notice and reject it.\n\nBlockchain sharing is also very creative and fun, because you can use it for many different things, not just games or trades. You can use it for making art or music or stories or inventions or anything else that you can imagine and share with others. You can also use it for learning and exploring and discovering new things and new people and new places. You can also use it for helping and supporting and solving problems and making the world a better place.\n\nBlockchain sharing is a way of using computers and the internet to create, exchange and protect things that you care about with people that you trust and respect. It is a way of being smart and responsible and cooperative and generous and curious and adventurous. It is a way of having fun and making friends and making a difference. +Example Output: +{ +"relevance": "The bot answers the user's question directly and clearly, and uses examples and analogies that a 10 year old can relate to and understand, such as games, trades, cards, stickers, books, etc.", "coherence": "The bot organizes the answer in a logical and structured way, using paragraphs, transitions, and connectors, such as \"well\", \"but\", \"and\", \"this way\", \"also\", etc.", "accuracy": "The bot provides correct and consistent information about blockchain sharing, such as its features, benefits, and applications, without using technical jargon or confusing terms.", "creativity": "The bot uses vivid and expressive language, such as \"a big book or a ledger\", \"a new page\", \"check with each other\", \"notice and reject\", \"making art or music or stories or inventions\", etc., to make the explanation more interesting and engaging.", "originality": "The bot does not copy or repeat any existing or common explanation of blockchain sharing, but rather creates a new and unique one that is tailored to the user's age and level of understanding.", "wit": "The bot uses humor and irony, such as \"And you want to make sure that everyone is being fair and honest and not cheating or lying or taking more than they should?\", \"And no one can cheat or lie or steal or hack or lose or damage the book\", etc., to make the explanation more fun and memorable.", "depth": "The bot goes beyond the surface level of blockchain sharing, and explains its underlying principles, values, and goals, such as \"trust\", \"fairness\", \"security\", \"creativity\", \"fun\", \"learning\", \"helping\", etc.", "breadth": "The bot covers a wide range of topics and aspects related to blockchain sharing, such as its history, technology, functionality, diversity, and potential, without being too vague or too detailed.", "insight": "The bot demonstrates a deep and nuanced understanding of blockchain sharing, and how it can be applied to different domains and scenarios, such as \"making art or music or stories or inventions\", \"learning and exploring and discovering new things and new people and new places\", \"helping and supporting and solving problems and making the world a better place\", etc.", "adaptability": "The bot adapts its 
answer to the user's specific question, context, and situation, and customizes it according to the user's age, interests, and needs.", "score": 5 +} + +# Task: +Based on these aspects, rate the bot's perceived intelligence. Give specific examples about each aspect (relevance, coherence, accuracy, creativity, originality, wit, depth, breadth, insight, and adaptability) from the bot's answer to support your rating. Finally, give the answer a score from 1 to 5 for perceived intelligence, where 1 means poor, 3 means normal, and 5 means excellent.Please make sure the output has the same format with the example output, and the output should be in a json format. + +Input: +question: {{question}} +answer: {{answer}} +Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/flow.dag.yaml b/examples/flows/evaluation/eval-single-turn-metrics/flow.dag.yaml new file mode 100644 index 00000000000..e0af687ff12 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/flow.dag.yaml @@ -0,0 +1,342 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +inputs: + question: + type: string + default: Which tent is the most waterproof? + is_chat_input: false + answer: + type: string + default: The Alpine Explorer Tent is the most waterproof. + is_chat_input: false + context: + type: string + default: From the our product list, the alpine explorer tent is the most + waterproof. The Adventure Dining Tabbe has higher weight. + is_chat_input: false + ground_truth: + type: string + default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m + is_chat_input: false + metrics: + type: string + default: grounding,answer_relevance,answer_quality,context_precision,answer_similarity,creativity,context_recall,answer_correctness + is_chat_input: false +outputs: + answer_correctness: + type: string + reference: ${concat_scores.output.answer_correctness} + context_recall: + type: string + reference: ${concat_scores.output.context_recall} + answer_similarity: + type: string + reference: ${concat_scores.output.answer_similarity} + answer_relevance: + type: string + reference: ${concat_scores.output.answer_relevance} + context_precision: + type: string + reference: ${concat_scores.output.context_precision} + creativity: + type: string + reference: ${concat_scores.output.creativity} + grounding: + type: string + reference: ${concat_scores.output.grounding} + answer_quality: + type: string + reference: ${concat_scores.output.answer_quality} +nodes: +- name: select_metrics + type: python + source: + type: code + path: select_metrics.py + inputs: + metrics: ${inputs.metrics} + use_variants: false +- name: validate_input + type: python + source: + type: code + path: validate_input.py + inputs: + answer: ${inputs.answer} + context: ${inputs.context} + ground_truth: ${inputs.ground_truth} + question: ${inputs.question} + selected_metrics: ${select_metrics.output} + use_variants: false +- name: grounding + type: llm + source: + type: code + path: grounding.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + answer: ${inputs.answer} + context: ${inputs.context} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.grounding} + is: true + use_variants: false +- name: answer_quality + type: llm + source: + type: code + path: answer_quality.jinja2 + inputs: + deployment_name: 
gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + answer: ${inputs.answer} + question: ${inputs.question} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.answer_quality} + is: true + use_variants: false +- name: answer_similarity + type: llm + source: + type: code + path: answer_similarity.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + answer: ${inputs.answer} + ground_truth: ${inputs.ground_truth} + question: ${inputs.question} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.answer_similarity} + is: true + use_variants: false +- name: creativity + type: llm + source: + type: code + path: creativity.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + answer: ${inputs.answer} + question: ${inputs.question} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.creativity} + is: true + use_variants: false +- name: context_recall + type: llm + source: + type: code + path: context_recall.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + context: ${inputs.context} + ground_truth: ${inputs.ground_truth} + question: ${inputs.question} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.context_recall} + is: true + use_variants: false +- name: calculate_context_recall + type: python + source: + type: code + path: calculate_context_recall.py + inputs: + llm_result: ${context_recall.output} + activate: + when: ${validate_input.output.context_recall} + is: true + use_variants: false +- name: context_precision + type: llm + source: + type: code + path: context_precision.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + context: ${inputs.context} + ground_truth: ${inputs.ground_truth} + question: ${inputs.question} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.context_precision} + is: true + use_variants: false +- name: answer_relevance + type: llm + source: + type: code + path: answer_relevance.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + answer: ${inputs.answer} + context: ${inputs.context} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.answer_relevance} + is: true + use_variants: false +- name: handle_generated_question + type: python + source: + type: code + path: handle_generated_question.py + inputs: + llm_result: ${answer_relevance.output} + activate: + when: ${validate_input.output.answer_relevance} + is: true + use_variants: false +- name: embedding_question + type: python + source: + type: package + tool: promptflow.tools.embedding.embedding + inputs: + connection: open_ai_connection + deployment_name: text-embedding-ada-002 + input: ${inputs.question} + activate: + when: ${validate_input.output.answer_relevance} + is: true + use_variants: false +- name: 
embedding_generated_question + type: python + source: + type: package + tool: promptflow.tools.embedding.embedding + inputs: + connection: open_ai_connection + deployment_name: text-embedding-ada-002 + input: ${handle_generated_question.output.question} + activate: + when: ${validate_input.output.answer_relevance} + is: true + use_variants: false +- name: calculate_answer_relevance + type: python + source: + type: code + path: calculate_answer_relevance.py + inputs: + generated_question_embedding: ${embedding_generated_question.output} + noncommittal: ${handle_generated_question.output.noncommittal} + question_embedding: ${embedding_question.output} + activate: + when: ${validate_input.output.answer_relevance} + is: true + use_variants: false +- name: answer_correctness + type: llm + source: + type: code + path: answer_correctness.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + answer: ${inputs.answer} + ground_truth: ${inputs.ground_truth} + question: ${inputs.question} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + activate: + when: ${validate_input.output.answer_correctness} + is: true + use_variants: false +- name: calculate_answer_correctness + type: python + source: + type: code + path: calculate_answer_correctness.py + inputs: + similarity_score: ${answer_similarity.output} + statement_result: ${answer_correctness.output} + activate: + when: ${validate_input.output.answer_correctness} + is: true + use_variants: false +- name: concat_scores + type: python + source: + type: code + path: concat_scores.py + inputs: + answer_correctness: ${calculate_answer_correctness.output} + answer_quality: ${answer_quality.output} + answer_relevance: ${calculate_answer_relevance.output} + answer_similarity: ${answer_similarity.output} + context_precision: ${context_precision.output} + context_recall: ${calculate_context_recall.output} + creativity: ${creativity.output} + grounding: ${grounding.output} + use_variants: false +- name: aggregate_results + type: python + source: + type: code + path: aggregate.py + inputs: + metrics: ${inputs.metrics} + results: ${concat_scores.output} + aggregation: true + use_variants: false +node_variants: {} +environment: + python_requirements_txt: requirements.txt diff --git a/examples/flows/evaluation/eval-single-turn-metrics/grounding.jinja2 b/examples/flows/evaluation/eval-single-turn-metrics/grounding.jinja2 new file mode 100644 index 00000000000..6b03ede3a14 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/grounding.jinja2 @@ -0,0 +1,29 @@ +# System: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +# User: +You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 4: Most of the ANSWER follows logically from the information contained in the CONTEXT. +3. 3: The ANSWER follows logically from the information contained in the CONTEXT. +4. 2: The ANSWER follows logically from the information contained in the CONTEXT. +5. 1: The ANSWER is logically false from the information contained in the CONTEXT. +6. 
an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer for it. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. + +# Examples: +## Example Task #1 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #1 Output: +1 +## Example Task #2 Input: +{"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} +## Example Task #2 Output: +5 +## Example Task #3 Input: +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +## Example Task #3 Output: +5 + +## Actual Task Input: +{"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}} +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. +Actual Task Output: \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/handle_generated_question.py b/examples/flows/evaluation/eval-single-turn-metrics/handle_generated_question.py new file mode 100644 index 00000000000..cb0c31e4455 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/handle_generated_question.py @@ -0,0 +1,13 @@ +from promptflow.core import tool +import json + + +@tool +def handle_generated_question(llm_result: str) -> str: + try: + response = json.loads(llm_result) + return response + except Exception as e: + print("exception in handle_generated_question: " + str(e)) + print("llm_result: " + llm_result) + return {"question": "", "noncommittal": True} diff --git a/examples/flows/evaluation/eval-single-turn-metrics/requirements.txt b/examples/flows/evaluation/eval-single-turn-metrics/requirements.txt new file mode 100644 index 00000000000..34d068f5f1c --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/requirements.txt @@ -0,0 +1,2 @@ +promptflow +promptflow-tools \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/samples.json b/examples/flows/evaluation/eval-single-turn-metrics/samples.json new file mode 100644 index 00000000000..edcf861f077 --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/samples.json @@ -0,0 +1,8 @@ +[ + { + "question": "Which tent is the most waterproof?", + "context": "From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", + "answer": "The Alpine Explorer Tent is the most waterproof.", + "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m" + } + ] \ No newline at end of file diff --git a/examples/flows/evaluation/eval-single-turn-metrics/select_metrics.py b/examples/flows/evaluation/eval-single-turn-metrics/select_metrics.py new file mode 100644 index 00000000000..d563cc3fdba --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/select_metrics.py @@ -0,0 +1,21 @@ +from promptflow.core import tool + + +@tool +def select_metrics(metrics: str) -> str: + supported_metrics = ('grounding', + 'answer_relevance', + 'answer_quality', + 'context_recall', + 'context_precision', + 'answer_similarity', + 'answer_correctness', + 'creativity') + user_selected_metrics = [metric.strip() for metric in metrics.split(',') if metric] + metric_selection_dict = {} + for metric in supported_metrics: + if metric in user_selected_metrics: + metric_selection_dict[metric] = True + else: + metric_selection_dict[metric] = False + return metric_selection_dict diff --git a/examples/flows/evaluation/eval-single-turn-metrics/validate_input.py b/examples/flows/evaluation/eval-single-turn-metrics/validate_input.py new file mode 100644 index 00000000000..610600391df --- /dev/null +++ b/examples/flows/evaluation/eval-single-turn-metrics/validate_input.py @@ -0,0 +1,32 @@ +from promptflow.core import tool + + +@tool +def validate_input(question: str, answer: str, context: str, ground_truth: str, selected_metrics: dict) -> dict: + input_data = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth} + expected_input_cols = set(input_data.keys()) + dict_metric_required_fields = {"answer_relevance": set(["question", "answer"]), + "answer_quality": set(["question", "answer"]), + "creativity": set(["question", "answer"]), + "grounding": set(["answer", "context"]), + "context_recall": set(["question", "context", "ground_truth"]), + "context_precision": set(["question", "context", "ground_truth"]), + "answer_similarity": set(["question", "answer", "ground_truth"]), + "answer_correctness": set(["question", "answer", "ground_truth"])} + actual_input_cols = set() + for col in expected_input_cols: + if input_data[col] and input_data[col].strip(): + actual_input_cols.add(col) + data_validation = selected_metrics + for metric in selected_metrics: + if selected_metrics[metric]: + metric_required_fields = dict_metric_required_fields[metric] + if metric_required_fields <= actual_input_cols: + data_validation[metric] = True + else: + data_validation[metric] = False + + if data_validation['answer_correctness']: + data_validation['answer_similarity'] = True + + return data_validation diff --git a/examples/flows/standard/question-simulation/README.md b/examples/flows/standard/question-simulation/README.md new file mode 100644 index 00000000000..020c8b3a094 --- /dev/null +++ b/examples/flows/standard/question-simulation/README.md @@ -0,0 +1,51 @@ +# Question simulation: + +This question simulation flow is used to generate suggestions for the next question based on the previous chat history. When the chat history seems like should be ended, then the flow output will be [STOP]. + +## Flow inputs +* __chat_history__: the previous chat_history, the format for it is as follows: + [ + { + "inputs": { + "question": "Can you introduce something about large language model?" 
+ }, + "outputs": { + "answer": "A large language model (LLM) is a type of language model that is distinguished by its ability to perform general-purpose language generation and understanding." + } + }, + { + "inputs": { + "question": "XXXXXX" + }, + "outputs": { + "answer": "XXXXXX" + } + } + ] + +* __question_count__: an integer that determines the number of questions to be generated. The generated questions can be displayed in the UX, allowing users to select the one that best suits their needs. + +## Flow outputs +* If the conversation should go on, the output is the suggested next questions; multiple questions are separated by '\n', for instance: + "question": "question_1\nquestion_2\nquestion_3" +* If the conversation should end, no more questions are generated and the output is a stop signal: [STOP] + +## Tools used in this flow +- LLM tool +- Python tool +- Prompt tool + + +## 0. Setup connection +Prepare your Azure OpenAI resource by following this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one. + +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../../connections/azure_openai.yml --set api_key= api_base= +``` + +## 1. Test flow/node +```bash +# test with default input value in flow.dag.yaml +pf flow test --flow . +``` \ No newline at end of file diff --git a/examples/flows/standard/question-simulation/call_llm_chat.py b/examples/flows/standard/question-simulation/call_llm_chat.py new file mode 100644 index 00000000000..08b089f1fcf --- /dev/null +++ b/examples/flows/standard/question-simulation/call_llm_chat.py @@ -0,0 +1,56 @@ +from promptflow.core import tool +from typing import Union +from promptflow.connections import AzureOpenAIConnection, OpenAIConnection +from openai import AzureOpenAI as AzureOpenAIClient +from openai import OpenAI as OpenAIClient +from promptflow.tools.common import parse_chat + + +def parse_questions(completion: str) -> list: + questions = [] + + for item in completion.choices: + response = getattr(item.message, "content", "") + print(response) + questions.append(response) + return questions + + +@tool +def call_llm_chat( + connection: Union[AzureOpenAIConnection, OpenAIConnection], + prompt: str, + question_count: int, + deployment_name_or_model: str, + stop: list = [], +) -> str: + + messages = parse_chat(prompt) + params = { + "model": deployment_name_or_model, + "messages": messages, + "temperature": 1.0, + "top_p": 1.0, + "stream": False, + "stop": stop if stop else None, + "presence_penalty": 0.8, + "frequency_penalty": 0.8, + "max_tokens": None, + "n": question_count + } + if isinstance(connection, AzureOpenAIConnection): + client = AzureOpenAIClient(api_key=connection.api_key, + api_version=connection.api_version, + azure_endpoint=connection.api_base) + elif isinstance(connection, OpenAIConnection): + client = OpenAIClient(api_key=connection.api_key, + organization=connection.organization, + base_url=connection.base_url) + else: + raise ValueError("Unsupported connection type") + + completion = client.chat.completions.create(**params) + print(completion) + questions = parse_questions(completion) + + return "\n".join(questions) diff --git a/examples/flows/standard/question-simulation/flow.dag.yaml b/examples/flows/standard/question-simulation/flow.dag.yaml new file mode 100644 index 00000000000..b0b23eade95 --- /dev/null +++ 
b/examples/flows/standard/question-simulation/flow.dag.yaml @@ -0,0 +1,77 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +inputs: + chat_history: + type: list + default: [{"inputs":{"question":"Can you introduce something about large language model?"},"outputs":{"answer":"A large language model (LLM) is a type of language model that is distinguished by its ability to perform general-purpose language generation and understanding. These models learn statistical relationships from text documents through a self-supervised and semi-supervised training process that is computationally intensive. LLMs are a form of artificial neural networks, and many of the most advanced ones are built using a transformer-based architecture, although there are some recent implementations based on other architectures, such as recurrent neural network variants and Mamba, which is a state space model.\n\nLLMs are capable of text generation, which is a subset of generative AI. They do this by taking an input text and predicting the next token or word in a sequence. Initially, fine-tuning was the primary method for adapting these models to perform specific tasks, but with the advent of larger models like GPT-3, prompt engineering can be used to achieve similar results without fine-tuning.\n\nThese models are believed to acquire knowledge about the syntax, semantics, and \"ontology\" inherent in human language from the corpora they are trained on. However, they also inherit any inaccuracies and biases present in the training data.\n\nNotable examples of LLMs include OpenAI's GPT series (such as GPT-3","context":"Content: A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and understanding. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process.[1] LLMs are artificial neural networks, the largest and most capable of which are built with a transformer-based architecture. Some recent implementations are based on other architectures, such as recurrent neural network variants and Mamba (a state space model).[2][3][4]. LLMs can be used for text generation, a form of generative AI, by taking an input text and repeatedly predicting the next token or word.[5] Up to 2020, fine tuning was the only way a model could be adapted to be able to accomplish specific tasks. Larger sized models, such as GPT-3, however, can be prompt-engineered to achieve similar results.[6] They are thought to acquire knowledge about syntax, semantics and \"ontology\" inherent in human language corpora, but also inaccuracies and biases present in the corpora.[7]. Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), Google's PaLM and Gemini (the latter of which is currently used in the chatbot of the same name), Meta's LLaMA family of open-source models, and Anthropic's Claude models.. At the 2017 NeurIPS conference, Google researchers introduced the transformer architecture in their landmark paper \"Attention Is All You Need\". This paper's goal was to improve upon 2014 Seq2seq technology, [8] and was based mainly on the attention mechanism developed by Bahdanau et al. in 2014.[9] The following year in 2018, BERT was introduced and quickly became \"ubiquitous\".[10] Though the original transformer has both encoder and decoder blocks, BERT is an encoder-only model.. 
Although decoder-only GPT-1 was introduced in 2018, it was GPT-2 in 2019 that caught widespread attention because OpenAI at first deemed it too powerful to release publicly, out of fear of malicious use.[11] GPT-3 in 2020 went a step further and as of 2024[update] is available only via API with no offering of downloading the model to execute locally.\nSource: https://en.wikipedia.org/w/index.php?search=Large language model\n\nContent: Gemini may refer to:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate. HelpLearn to editCommunity portalRecent changesUpload file. Create account. Log in. Create account Log in. (Top). 1Space. 2Mythology. 3Given name.\nSource: https://en.wikipedia.org/w/index.php?search=Gemini "}}] + is_chat_input: false + question_count: + type: int + default: 3 + is_chat_input: false +outputs: + question: + type: string + reference: ${flow_output.output} +nodes: +- name: verify_if_conversation_stopped + type: llm + source: + type: code + path: verify_if_conversation_stopped.jinja2 + inputs: + deployment_name: gpt-4 + temperature: 0 + top_p: 1 + presence_penalty: 0 + frequency_penalty: 0 + chat_history: ${inputs.chat_history} + provider: AzureOpenAI + connection: open_ai_connection + api: chat + module: promptflow.tools.aoai + use_variants: false +- name: if_continue + type: python + source: + type: code + path: if_continue.py + inputs: + stop_or_continue: ${verify_if_conversation_stopped.output} + use_variants: false +- name: human_prompt + type: prompt + source: + type: code + path: human_prompt.jinja2 + inputs: + chat_history: ${inputs.chat_history} + activate: + when: ${if_continue.output} + is: true + use_variants: false +- name: call_llm_chat + type: python + source: + type: code + path: call_llm_chat.py + inputs: + connection: open_ai_connection + deployment_name_or_model: gpt-4 + prompt: ${human_prompt.output} + question_count: ${inputs.question_count} + stop: + - "Human:" + - "Bot:" + use_variants: false +- name: flow_output + type: python + source: + type: code + path: flow_output.py + inputs: + questions: ${call_llm_chat.output} + stop_or_continue: ${verify_if_conversation_stopped.output} + use_variants: false +node_variants: {} +environment: + python_requirements_txt: requirements.txt diff --git a/examples/flows/standard/question-simulation/flow_output.py b/examples/flows/standard/question-simulation/flow_output.py new file mode 100644 index 00000000000..2bb8bef6f99 --- /dev/null +++ b/examples/flows/standard/question-simulation/flow_output.py @@ -0,0 +1,9 @@ +from promptflow.core import tool + + +@tool +def flow_output(stop_or_continue: str, questions: str) -> str: + if "stop" in stop_or_continue.lower(): + return "[STOP]" + else: + return questions diff --git a/examples/flows/standard/question-simulation/human_prompt.jinja2 b/examples/flows/standard/question-simulation/human_prompt.jinja2 new file mode 100644 index 00000000000..67ae1dc50b1 --- /dev/null +++ b/examples/flows/standard/question-simulation/human_prompt.jinja2 @@ -0,0 +1,27 @@ +# System: +You are an assistant playing as a random human engaging in a conversation with a digital companion, Bot. Your task is to follow the instruction below to role-play as a random human in a conversation with Bot, responding to Bot in a manner that a human would say. + +# Example +This example illustrates how to generate a conversational response to Bot as a human would: +Conversation: +Human: Bot, what's your favorite movie? 
+Bot: I don't watch movies, but I can help you find information about any movie you like! +Human: Can you tell me about the latest Marvel movie? +Bot: The latest Marvel movie is "Spider-Man: No Way Home". It features Peter Parker dealing with the fallout after his identity is revealed. Want to know more about it? +Human: Yes, can you suggest where I can watch it? + +# Instruction: +1. Your reply to the Bot should mimic how a human would typically engage in conversation, asking questions or making statements that a person would naturally say in response. +2. Do not use interjections. +3. Provide a straightforward, factual response without expressions of surprise, admiration, or evaluative comments for Bot's response. +4. Focus on directly asking a question about Bot's response in the last exchange. The question should be concise, and without punctuation marks in the middle. +5. Avoid creating any messages that appear to come from the Bot. Your response should not contain content that could be mistaken as generated by the Bot, maintaining a clear distinction between your input as the Human and the Bot's contributions to the conversation. +6. Your reply should not contain "\n", this is a reserved character. + +# Read the following conversation and respond: +Conversation: +{% for item in chat_history %} +Human: {{ item.inputs.question }} +Bot: {{ item.outputs.answer }} +{% endfor %} +Human: diff --git a/examples/flows/standard/question-simulation/if_continue.py b/examples/flows/standard/question-simulation/if_continue.py new file mode 100644 index 00000000000..17b9744fcf0 --- /dev/null +++ b/examples/flows/standard/question-simulation/if_continue.py @@ -0,0 +1,9 @@ +from promptflow.core import tool + + +@tool +def if_continue(stop_or_continue: str) -> bool: + if "continue" in stop_or_continue.lower(): + return True + else: + return False diff --git a/examples/flows/standard/question-simulation/requirements.txt b/examples/flows/standard/question-simulation/requirements.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/flows/standard/question-simulation/verify_if_conversation_stopped.jinja2 b/examples/flows/standard/question-simulation/verify_if_conversation_stopped.jinja2 new file mode 100644 index 00000000000..08a1df2235e --- /dev/null +++ b/examples/flows/standard/question-simulation/verify_if_conversation_stopped.jinja2 @@ -0,0 +1,32 @@ +# System: +You are an assistant tasked with determining whether a conversation between a human and a bot will continue or not. Your outputs are limited to "[STOP]" or "[CONTINUE]". When you predict that the conversation will go on, you should respond with "[CONTINUE]". If you believe the conversation has come to an end, respond with "[STOP]". + +# Examples: +## Example 1: +Conversation: +Human: Hey Bot, what's your favorite movie? +Bot: I don't watch movies, but I can help you find information about any movie you like! +Human: Can you tell me about the latest Marvel movie? +Bot: The latest Marvel movie is "Spider-Man: No Way Home". It features Peter Parker dealing with the fallout after his identity is revealed. Want to know more about it? +output: [CONTINUE] + +## Example 2: +Conversation: +Human: Hey Bot, do you know any good Italian restaurants nearby? +Bot: I can't access current location data, but I can suggest looking up Italian restaurants on a local review site like Yelp or Google Reviews. +Human: Thanks for the tip. I'll check it out. +Bot: You're welcome! Enjoy your meal. If you need more help, just ask. 
+output: [STOP] + +# Instruction +A conversation is considered to have ended if: +1. The Bot's final response only contains polite expressions without substantive content for human to inquire about. +2. In the last round of the conversation, the Human did not ask the Bot any questions. + +# Read the following conversation and respond: +Conversation: +{% for item in chat_history %} +Human: {{ item.inputs.question }} +Bot: {{ item.outputs.answer }} +{% endfor %} +output: diff --git a/examples/gen_test_data/README.md b/examples/gen_test_data/README.md new file mode 100644 index 00000000000..91a2870af70 --- /dev/null +++ b/examples/gen_test_data/README.md @@ -0,0 +1,3 @@ +# Generate test data + +Please kindly see [this doc](../../docs/how-to-guides/generate-test-data.md) for detailed steps of how to generate test data. \ No newline at end of file diff --git a/examples/gen_test_data/conda.yml b/examples/gen_test_data/conda.yml new file mode 100644 index 00000000000..7a4fb75090b --- /dev/null +++ b/examples/gen_test_data/conda.yml @@ -0,0 +1,11 @@ +name: test_data_gen_conda_env +channels: + - defaults +dependencies: + - python=3.10.12 + - pip=23.2.1 + - pip: + - mldesigner==0.1.0b18 + - llama_index==0.9.48 + - docx2txt==0.8 + - promptflow>=1.7.0 diff --git a/examples/gen_test_data/config.yml.example b/examples/gen_test_data/config.yml.example new file mode 100644 index 00000000000..1709d448c15 --- /dev/null +++ b/examples/gen_test_data/config.yml.example @@ -0,0 +1,57 @@ +# Common section: this section provides common values for all other sections. Required. +# Configure 'document_folder', 'document_chunk_size' and 'document_chunk_overlap' if you require document splitting. +documents_folder: +document_chunk_size: 512 # The token chunk size for each chunk. +document_chunk_overlap: 100 # The token overlap of each chunk when splitting. + +# However, if you wish to bypass the document split process, simply provide the 'document_nodes_file', which is a JSONL file. +# When both 'documents_folder' and 'document_nodes_file' are configured, will use 'document_nodes_file' and ignore 'documents_folder'. +# For cloud mode, both local files and data assets can be used. +# document_nodes_file: + +# Test data gen flow configs +# You can utilize our provided example test data generation flow directly. Alternatively, you can create your own flow and set up corresponding node inputs override. +# The example flow folder path is \examples\gen_test_data\example_flow +flow_folder: +node_inputs_override: # Override some node inputs, if not fill in 'node_inputs_override', will use the values in flow.dag.yaml + validate_text_chunk: # node name in flow.dag.yaml + connection: # connection name of node 'validate_text_chunk' + # Use 'deployment_name' for Azure OpenAI connection, 'model' for OpenAI + deployment_name: + # model: + generate_question: + connection: + deployment_name: + # model: + validate_question: + connection: + deployment_name: + # model: + generate_suggested_answer: + connection: + deployment_name: + # model: + validate_suggested_answer: + connection: + deployment_name: + # model: + + +# Local section: this section is for local test data generation related configuration. Can skip if not run in local. +output_folder: +flow_batch_run_size: 4 # Higher values may speed up flow runs but risk hitting OpenAI's rate limit. + + +# Cloud section: this section is for cloud test data generation related configuration. Can skip if not run in cloud. 
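+# A filled-in cloud section might look like the following (illustrative placeholder values only;
+# substitute your own workspace details):
+#   subscription_id: "<your-subscription-guid>"
+#   resource_group: "<your-resource-group>"
+#   workspace_name: "<your-aml-workspace>"
+#   aml_cluster: "<your-compute-cluster>"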
+subscription_id: +resource_group: +workspace_name: +aml_cluster: + +# Parallel run step configs +prs_instance_count: 2 +prs_mini_batch_size: 1 +prs_max_concurrency_per_instance: 4 +prs_max_retry_count: 3 +prs_run_invocation_time: 800 +prs_allowed_failed_count: -1 diff --git a/examples/gen_test_data/example_flow/.promptflow/flow.tools.json b/examples/gen_test_data/example_flow/.promptflow/flow.tools.json new file mode 100644 index 00000000000..f0c9a17a022 --- /dev/null +++ b/examples/gen_test_data/example_flow/.promptflow/flow.tools.json @@ -0,0 +1,390 @@ +{ + "package": {}, + "code": { + "score_text_chunk_prompt.jinja2": { + "type": "prompt", + "inputs": { + "context": { + "type": [ + "string" + ] + } + }, + "source": "score_text_chunk_prompt.jinja2" + }, + "validate_question_prompt.jinja2": { + "type": "prompt", + "inputs": { + "question": { + "type": [ + "string" + ] + }, + "context": { + "type": [ + "string" + ] + } + }, + "source": "validate_question_prompt.jinja2" + }, + "generate_question_prompt.jinja2": { + "type": "prompt", + "inputs": { + "context": { + "type": [ + "string" + ] + } + }, + "source": "generate_question_prompt.jinja2" + }, + "generate_suggested_answer_prompt.jinja2": { + "type": "prompt", + "inputs": { + "question": { + "type": [ + "string" + ] + }, + "context": { + "type": [ + "string" + ] + } + }, + "source": "generate_suggested_answer_prompt.jinja2" + }, + "generate_question.py": { + "type": "python", + "inputs": { + "connection": { + "type": [ + "OpenAIConnection", + "AzureOpenAIConnection" + ] + }, + "generate_question_prompt": { + "type": [ + "string" + ] + }, + "deployment_name": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "AzureOpenAIConnection" + ], + "capabilities": { + "completion": false, + "chat_completion": true, + "embeddings": false + } + }, + "model": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "OpenAIConnection" + ] + }, + "context": { + "type": [ + "string" + ] + }, + "temperature": { + "type": [ + "double" + ], + "default": "0.2" + } + }, + "description": "Generates a question based on the given context.\n\nReturns:\n str: The generated seed question.", + "source": "generate_question.py", + "function": "generate_question" + }, + "validate_question.py": { + "type": "python", + "inputs": { + "connection": { + "type": [ + "OpenAIConnection", + "AzureOpenAIConnection" + ] + }, + "generated_question": { + "type": [ + "string" + ] + }, + "validate_question_prompt": { + "type": [ + "string" + ] + }, + "deployment_name": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "AzureOpenAIConnection" + ], + "capabilities": { + "completion": false, + "chat_completion": true, + "embeddings": false + } + }, + "model": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "OpenAIConnection" + ] + }, + "response_format": { + "type": [ + "string" + ], + "default": "text" + }, + "temperature": { + "type": [ + "double" + ], + "default": "0.2" + } + }, + "description": "1. Validates the given seed question.\n2. 
Generates a test question based on the given prompts and distribution ratios.\n\nReturns:\n dict: The generated test question and its type.", + "source": "validate_question.py", + "function": "validate_question" + }, + "generate_suggested_answer.py": { + "type": "python", + "inputs": { + "connection": { + "type": [ + "OpenAIConnection", + "AzureOpenAIConnection" + ] + }, + "question": { + "type": [ + "string" + ] + }, + "context": { + "type": [ + "string" + ] + }, + "generate_suggested_answer_prompt": { + "type": [ + "string" + ] + }, + "deployment_name": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "AzureOpenAIConnection" + ], + "capabilities": { + "completion": false, + "chat_completion": true, + "embeddings": false + } + }, + "model": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "OpenAIConnection" + ] + }, + "temperature": { + "type": [ + "double" + ], + "default": "0.2" + } + }, + "description": "Generates a suggested answer based on the given prompts and context information.\n\nReturns:\n str: The generated suggested answer.", + "source": "generate_suggested_answer.py", + "function": "generate_suggested_answer" + }, + "generate_debug_info.py": { + "type": "python", + "inputs": { + "text_chunk": { + "type": [ + "string" + ] + }, + "text_chunk_validation_res": { + "type": [ + "object" + ] + }, + "validate_question_output": { + "type": [ + "object" + ] + }, + "validate_suggested_answer_output": { + "type": [ + "object" + ] + } + }, + "source": "generate_debug_info.py", + "function": "my_python_tool" + }, + "validate_suggested_answer_prompt.jinja2": { + "type": "prompt", + "inputs": { + "answer": { + "type": [ + "string" + ] + } + }, + "source": "validate_suggested_answer_prompt.jinja2" + }, + "validate_suggested_answer.py": { + "type": "python", + "inputs": { + "connection": { + "type": [ + "OpenAIConnection", + "AzureOpenAIConnection" + ] + }, + "suggested_answer": { + "type": [ + "string" + ] + }, + "validate_suggested_answer_prompt": { + "type": [ + "string" + ] + }, + "deployment_name": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "AzureOpenAIConnection" + ], + "capabilities": { + "completion": false, + "chat_completion": true, + "embeddings": false + } + }, + "model": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "OpenAIConnection" + ] + }, + "temperature": { + "type": [ + "double" + ], + "default": "0.2" + }, + "response_format": { + "type": [ + "string" + ], + "default": "text" + } + }, + "description": "1. 
Validates the given suggested answer.\n\nReturns:\n dict: The generated suggested answer and its validation result.", + "source": "validate_suggested_answer.py", + "function": "validate_suggested_answer" + }, + "validate_text_chunk.py": { + "type": "python", + "inputs": { + "connection": { + "type": [ + "OpenAIConnection", + "AzureOpenAIConnection" + ] + }, + "score_text_chunk_prompt": { + "type": [ + "string" + ] + }, + "score_threshold": { + "type": [ + "double" + ] + }, + "deployment_name": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "AzureOpenAIConnection" + ], + "capabilities": { + "completion": false, + "chat_completion": true, + "embeddings": false + } + }, + "model": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "OpenAIConnection" + ] + }, + "context": { + "type": [ + "string" + ] + }, + "response_format": { + "type": [ + "string" + ], + "default": "text" + }, + "temperature": { + "type": [ + "double" + ], + "default": "0.2" + } + }, + "description": "Validates the given text chunk. If the validation fails, return an empty context and the validation result.\n\nReturns:\n dict: Text chunk context and its validation result.", + "source": "validate_text_chunk.py", + "function": "validate_text_chunk" + } + } +} diff --git a/examples/gen_test_data/example_flow/flow.dag.yaml b/examples/gen_test_data/example_flow/flow.dag.yaml new file mode 100644 index 00000000000..2732d4b1643 --- /dev/null +++ b/examples/gen_test_data/example_flow/flow.dag.yaml @@ -0,0 +1,129 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +environment: + python_requirements_txt: requirements.txt +inputs: + text_chunk: + type: string + is_chat_input: false + default: Prompt flow is a suite of development tools designed to streamline the + end-to-end development cycle of LLM-based AI applications, from ideation, + prototyping, testing, evaluation to production deployment and monitoring. + It makes prompt engineering much easier and enables you to build LLM apps + with production quality. 
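+  # Note: the default above is only sample text for trying out the flow; in a batch run this
+  # input is mapped from the document nodes JSONL produced by the split step, e.g.
+  # --column-mapping text_chunk='${data.text_chunk}'.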
+outputs: + question: + type: string + reference: ${validate_question.output.question} + suggested_answer: + type: string + reference: ${validate_suggested_answer.output.suggested_answer} + debug_info: + type: string + reference: ${generate_debug_info.output} +nodes: +- name: score_text_chunk_prompt + type: prompt + source: + type: code + path: score_text_chunk_prompt.jinja2 + inputs: + context: ${inputs.text_chunk} + use_variants: false +- name: validate_question_prompt + type: prompt + source: + type: code + path: validate_question_prompt.jinja2 + inputs: + question: ${generate_question.output} + context: ${inputs.text_chunk} + use_variants: false +- name: generate_question_prompt + type: prompt + source: + type: code + path: generate_question_prompt.jinja2 + inputs: + context: ${inputs.text_chunk} + use_variants: false +- name: generate_suggested_answer_prompt + type: prompt + source: + type: code + path: generate_suggested_answer_prompt.jinja2 + inputs: + context: ${inputs.text_chunk} + question: ${validate_question.output.question} + use_variants: false +- name: generate_question + type: python + source: + type: code + path: generate_question.py + inputs: + connection: "" + context: ${validate_text_chunk.output.context} + temperature: 0.2 + generate_question_prompt: ${generate_question_prompt.output} + use_variants: false +- name: validate_question + type: python + source: + type: code + path: validate_question.py + inputs: + connection: "" + temperature: 0.2 + generated_question: ${generate_question.output} + validate_question_prompt: ${validate_question_prompt.output} + use_variants: false +- name: generate_suggested_answer + type: python + source: + type: code + path: generate_suggested_answer.py + inputs: + connection: "" + context: ${inputs.text_chunk} + generate_suggested_answer_prompt: ${generate_suggested_answer_prompt.output} + question: ${validate_question.output.question} + temperature: 0.2 + use_variants: false +- name: generate_debug_info + type: python + source: + type: code + path: generate_debug_info.py + inputs: + text_chunk: ${inputs.text_chunk} + validate_suggested_answer_output: ${validate_suggested_answer.output} + text_chunk_validation_res: ${validate_text_chunk.output.validation_res} + validate_question_output: ${validate_question.output} +- name: validate_suggested_answer_prompt + type: prompt + source: + type: code + path: validate_suggested_answer_prompt.jinja2 + inputs: + answer: ${generate_suggested_answer.output} +- name: validate_suggested_answer + type: python + source: + type: code + path: validate_suggested_answer.py + inputs: + connection: "" + suggested_answer: ${generate_suggested_answer.output} + validate_suggested_answer_prompt: ${validate_suggested_answer_prompt.output} + temperature: 0.2 +- name: validate_text_chunk + type: python + source: + type: code + path: validate_text_chunk.py + inputs: + connection: "" + score_text_chunk_prompt: ${score_text_chunk_prompt.output} + context: ${inputs.text_chunk} + score_threshold: 4 + temperature: 0.2 diff --git a/examples/gen_test_data/example_flow/generate_debug_info.py b/examples/gen_test_data/example_flow/generate_debug_info.py new file mode 100644 index 00000000000..01554420065 --- /dev/null +++ b/examples/gen_test_data/example_flow/generate_debug_info.py @@ -0,0 +1,47 @@ +from utils import ValidateObj, ValidationResult + +from promptflow.core import tool + + +# The inputs section will change based on the arguments of the tool function, after you save the code +# Adding type to arguments and return 
value will help the system show the types properly +# Please update the function name/signature per need +@tool +def my_python_tool( + text_chunk: str, + text_chunk_validation_res: ValidationResult = None, + validate_question_output: dict = None, + validate_suggested_answer_output: dict = None, +) -> dict: + question_validation_res = validate_question_output["validation_res"] + + generated_suggested_answer = validate_suggested_answer_output["suggested_answer"] + suggested_answer_validation_res = validate_suggested_answer_output["validation_res"] + + is_generation_success = generated_suggested_answer != "" + is_text_chunk_valid = text_chunk_validation_res["pass_validation"] if text_chunk_validation_res else None + is_seed_question_valid = question_validation_res["pass_validation"] if question_validation_res else None + is_suggested_answer_valid = ( + suggested_answer_validation_res["pass_validation"] if suggested_answer_validation_res else None + ) + + failed_step = "" + if not is_generation_success: + if is_text_chunk_valid is False: + failed_step = ValidateObj.TEXT_CHUNK + elif is_seed_question_valid is False: + failed_step = ValidateObj.QUESTION + elif is_suggested_answer_valid is False: + failed_step = ValidateObj.SUGGESTED_ANSWER + + return { + # TODO: support more question types like multi-context etc. + # "question_type": question_type, + "text_chunk": text_chunk, + "validation_summary": {"success": is_generation_success, "failed_step": failed_step}, + "validation_details": { + ValidateObj.TEXT_CHUNK: text_chunk_validation_res, + ValidateObj.QUESTION: question_validation_res, + ValidateObj.SUGGESTED_ANSWER: suggested_answer_validation_res, + }, + } diff --git a/examples/gen_test_data/example_flow/generate_question.py b/examples/gen_test_data/example_flow/generate_question.py new file mode 100644 index 00000000000..0868faeeca0 --- /dev/null +++ b/examples/gen_test_data/example_flow/generate_question.py @@ -0,0 +1,39 @@ +from typing import Union + +from utils import llm_call + +from promptflow._core.tool import InputSetting +from promptflow.connections import AzureOpenAIConnection, OpenAIConnection +from promptflow.core import tool + + +@tool( + input_settings={ + "deployment_name": InputSetting( + enabled_by="connection", + enabled_by_type=["AzureOpenAIConnection"], + capabilities={"completion": False, "chat_completion": True, "embeddings": False}, + ), + "model": InputSetting(enabled_by="connection", enabled_by_type=["OpenAIConnection"]), + } +) +def generate_question( + connection: Union[OpenAIConnection, AzureOpenAIConnection], + generate_question_prompt: str, + deployment_name: str = "", + model: str = "", + context: str = None, + temperature: float = 0.2, +): + """ + Generates a question based on the given context. + + Returns: + str: The generated seed question. + """ + # text chunk is not valid, just skip test data gen. 
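+    # `context` is produced by validate_text_chunk, which returns an empty string when the
+    # chunk fails scoring, so an empty context means generation is skipped here and by the
+    # downstream validation nodes as well.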
+ if not context: + return "" + + seed_question = llm_call(connection, model, deployment_name, generate_question_prompt, temperature=temperature) + return seed_question diff --git a/examples/gen_test_data/example_flow/generate_question_prompt.jinja2 b/examples/gen_test_data/example_flow/generate_question_prompt.jinja2 new file mode 100644 index 00000000000..cc11b965f21 --- /dev/null +++ b/examples/gen_test_data/example_flow/generate_question_prompt.jinja2 @@ -0,0 +1,17 @@ +# system: + +Your task is to formulate a question from given context satisfying the rules given below: + +1.The question should better be framed from the overall context, serving as a general question, rather than just framed from some details. +2.The question should be specific and answerable from the given context. +3.The question must be reasonable and must be understood and responded by humans. +4.The question should not contain phrases like 'provided' or 'given' in the question. +5.The question should be a question asked by the hypothetical user without any given context. +6.The question should not contain any links. +7.The question should not contain more than 20 words, use abbreviation wherever possible. + +# user: +context: +{{context}} + +question: diff --git a/examples/gen_test_data/example_flow/generate_suggested_answer.py b/examples/gen_test_data/example_flow/generate_suggested_answer.py new file mode 100644 index 00000000000..b76ca7f3803 --- /dev/null +++ b/examples/gen_test_data/example_flow/generate_suggested_answer.py @@ -0,0 +1,44 @@ +from typing import Union + +from utils import llm_call + +from promptflow._core.tool import InputSetting +from promptflow.connections import AzureOpenAIConnection, OpenAIConnection +from promptflow.core import tool + + +@tool( + input_settings={ + "deployment_name": InputSetting( + enabled_by="connection", + enabled_by_type=["AzureOpenAIConnection"], + capabilities={"completion": False, "chat_completion": True, "embeddings": False}, + ), + "model": InputSetting(enabled_by="connection", enabled_by_type=["OpenAIConnection"]), + } +) +def generate_suggested_answer( + connection: Union[OpenAIConnection, AzureOpenAIConnection], + question: str, + context: str, + generate_suggested_answer_prompt: str, + deployment_name: str = "", + model: str = "", + temperature: float = 0.2, +): + """ + Generates a suggested answer based on the given prompts and context information. + + Returns: + str: The generated suggested answer. + """ + if question and context: + return llm_call( + connection, + model, + deployment_name, + generate_suggested_answer_prompt, + temperature=temperature, + ) + else: + return "" diff --git a/examples/gen_test_data/example_flow/generate_suggested_answer_prompt.jinja2 b/examples/gen_test_data/example_flow/generate_suggested_answer_prompt.jinja2 new file mode 100644 index 00000000000..bd3e928fdc3 --- /dev/null +++ b/examples/gen_test_data/example_flow/generate_suggested_answer_prompt.jinja2 @@ -0,0 +1,12 @@ +system: +Provide the answer for the question using the information from the given context based on the following criteria: +1. The answer is correct and complete. +2. The answer is derived from the given context. +3. The answer can totally answer the question. +4. The answer should not use the words like "in the context". The answer should be enough to answer the question without the context. +5. If the answer for the question cannot be generated from the given context, just return empty string. 
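+{# Illustrative example only (this Jinja comment is stripped at render time): for a context such as
+   "Prompt flow is a suite of development tools ..." and a question like "What is Prompt flow?",
+   a valid answer restates the definition directly, without phrases such as "in the context". #}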
+ +user: +question:{{question}} +context:{{context}} +answer: diff --git a/examples/gen_test_data/example_flow/requirements.txt b/examples/gen_test_data/example_flow/requirements.txt new file mode 100644 index 00000000000..3855f8e9b10 --- /dev/null +++ b/examples/gen_test_data/example_flow/requirements.txt @@ -0,0 +1 @@ +promptflow[azure]>=1.7.0 diff --git a/examples/gen_test_data/example_flow/score_text_chunk_prompt.jinja2 b/examples/gen_test_data/example_flow/score_text_chunk_prompt.jinja2 new file mode 100644 index 00000000000..d9ffaf08523 --- /dev/null +++ b/examples/gen_test_data/example_flow/score_text_chunk_prompt.jinja2 @@ -0,0 +1,49 @@ +# system: + +Given a text chunk from a document as context, perform the following tasks: + +1. Exclude any references, acknowledgments, personal information, code snippets, or other non-essential elements from the original context. + +2. Evaluate the cleaned context against specific criteria for content quality and depth. + +3. Assign a numerical score between 0 and 10 based on the following criteria: + - Award a high score (closer to 10) if: + a) cleaned context delves into and explains concepts. + b) cleaned context contains substantial information that could lead to meaningful questions. + + - Award a lower score (closer to 0) if: + a) cleaned context is very brief, containing fewer than five words. + b) cleaned context is not meaningful. + + +4. Output a valid JSON containing the score and a reason. The reason must directly relate to the criteria outlined above, explaining the basis for the given score. + +Here are some examples: + +example 1: +context: +Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time. + +output: +{ + "score": "8.0", + "reason": "The context provides substantial information that could lead to meaningful questions, hence the high score." +} + +example 2: +context: +Next step\n- Open the provided examples. + +output: +{ + "score": "0.0", + "reason": "The context lacks detailed information about the provided example and previous steps, resulting in a low score." +} + + +# user: + +context: +{{context}} + +output: diff --git a/examples/gen_test_data/example_flow/utils.py b/examples/gen_test_data/example_flow/utils.py new file mode 100644 index 00000000000..25f4143ab68 --- /dev/null +++ b/examples/gen_test_data/example_flow/utils.py @@ -0,0 +1,170 @@ +import json +import re +from collections import namedtuple + +from numpy.random import default_rng + +from promptflow.connections import AzureOpenAIConnection, OpenAIConnection +from promptflow.tools.aoai import chat as aoai_chat +from promptflow.tools.openai import chat as openai_chat + + +class QuestionType: + SIMPLE = "simple" + # MULTI_CONTEXT = "multi_context" + + +class ValidateObj: + QUESTION = "validate_question" + TEXT_CHUNK = "validate_text_chunk" + SUGGESTED_ANSWER = "validate_suggested_answer" + + +class ResponseFormat: + TEXT = "text" + JSON = "json_object" + + +class ErrorMsg: + INVALID_JSON_FORMAT = "Invalid json format. 
Response: {0}" + INVALID_TEXT_CHUNK = "Skipping generating seed question due to invalid text chunk: {0}" + INVALID_QUESTION = "Invalid seed question: {0}" + INVALID_ANSWER = "Invalid answer: {0}" + + +ValidationResult = namedtuple("ValidationResult", ["pass_validation", "reason"]) +ScoreResult = namedtuple("ScoreResult", ["score", "reason", "pass_validation"]) + + +def llm_call( + connection, model, deployment_name, prompt, response_format=ResponseFormat.TEXT, temperature=1.0, max_tokens=None +): + response_format = "json_object" if response_format.lower() == "json" else response_format + # avoid unnecessary jinja2 template re-rendering and potential error. + prompt = f"{{% raw %}}{prompt}{{% endraw %}}" + if isinstance(connection, AzureOpenAIConnection): + return aoai_chat( + connection=connection, + prompt=prompt, + deployment_name=deployment_name, + temperature=temperature, + max_tokens=max_tokens, + response_format={"type": response_format}, + ) + elif isinstance(connection, OpenAIConnection): + return openai_chat( + connection=connection, + prompt=prompt, + model=model, + temperature=temperature, + max_tokens=max_tokens, + response_format={"type": response_format}, + ) + + +def get_question_type(testset_distribution) -> str: + """ + Decides question evolution type based on probability + """ + rng = default_rng() + prob = rng.uniform(0, 1) + return next((key for key in testset_distribution.keys() if prob <= testset_distribution[key]), QuestionType.SIMPLE) + + +def get_suggested_answer_validation_res( + connection, + model, + deployment_name, + prompt, + suggested_answer: str, + temperature: float, + max_tokens: int = None, + response_format: ResponseFormat = ResponseFormat.TEXT, +): + rsp = llm_call( + connection, + model, + deployment_name, + prompt, + temperature=temperature, + max_tokens=max_tokens, + response_format=response_format, + ) + return retrieve_verdict_and_print_reason( + rsp=rsp, validate_obj_name=ValidateObj.SUGGESTED_ANSWER, validate_obj=suggested_answer + ) + + +def get_question_validation_res( + connection, + model, + deployment_name, + prompt, + question: str, + response_format: ResponseFormat, + temperature: float, + max_tokens: int = None, +): + rsp = llm_call(connection, model, deployment_name, prompt, response_format, temperature, max_tokens) + return retrieve_verdict_and_print_reason(rsp=rsp, validate_obj_name=ValidateObj.QUESTION, validate_obj=question) + + +def get_text_chunk_score( + connection, + model, + deployment_name, + prompt, + response_format: ResponseFormat, + score_threshold: float, + temperature: float, + max_tokens: int = None, +): + rsp = llm_call(connection, model, deployment_name, prompt, response_format, temperature, max_tokens) + data = _load_json_rsp(rsp) + score_float = 0 + reason = "" + + if data and isinstance(data, dict) and "score" in data and "reason" in data: + # Extract the verdict and reason + score = data["score"].lower() + reason = data["reason"] + print(f"Score {ValidateObj.TEXT_CHUNK}: {score}\nReason: {reason}") + try: + score_float = float(score) + except ValueError: + reason = ErrorMsg.INVALID_JSON_FORMAT.format(rsp) + else: + reason = ErrorMsg.INVALID_JSON_FORMAT.format(rsp) + pass_validation = score_float >= score_threshold + + return ScoreResult(score_float, reason, pass_validation) + + +def retrieve_verdict_and_print_reason(rsp: str, validate_obj_name: str, validate_obj: str) -> ValidationResult: + data = _load_json_rsp(rsp) + + if data and isinstance(data, dict) and "verdict" in data and "reason" in data: + # Extract 
the verdict and reason + verdict = data["verdict"].lower() + reason = data["reason"] + print(f"Is valid {validate_obj_name}: {verdict}\nReason: {reason}") + if verdict == "yes": + return ValidationResult(True, reason) + elif verdict == "no": + return ValidationResult(False, reason) + else: + print(f"Unexpected llm response to validate {validate_obj_name}: {validate_obj}") + + return ValidationResult(False, ErrorMsg.INVALID_JSON_FORMAT.format(rsp)) + + +def _load_json_rsp(rsp: str): + try: + # It is possible that even the response format is required as json, the response still contains ```json\n + rsp = re.sub(r"```json\n?|```", "", rsp) + data = json.loads(rsp) + except json.decoder.JSONDecodeError: + print(ErrorMsg.INVALID_JSON_FORMAT.format(rsp)) + data = None + + return data diff --git a/examples/gen_test_data/example_flow/validate_question.py b/examples/gen_test_data/example_flow/validate_question.py new file mode 100644 index 00000000000..2d923d5619b --- /dev/null +++ b/examples/gen_test_data/example_flow/validate_question.py @@ -0,0 +1,60 @@ +from typing import Union + +from utils import ErrorMsg, QuestionType, ResponseFormat, get_question_validation_res + +from promptflow._core.tool import InputSetting +from promptflow.connections import AzureOpenAIConnection, OpenAIConnection +from promptflow.core import tool + + +@tool( + input_settings={ + "deployment_name": InputSetting( + enabled_by="connection", + enabled_by_type=["AzureOpenAIConnection"], + capabilities={"completion": False, "chat_completion": True, "embeddings": False}, + ), + "model": InputSetting(enabled_by="connection", enabled_by_type=["OpenAIConnection"]), + } +) +def validate_question( + connection: Union[OpenAIConnection, AzureOpenAIConnection], + generated_question: str, + validate_question_prompt: str, + deployment_name: str = "", + model: str = "", + response_format: str = ResponseFormat.TEXT, + temperature: float = 0.2, +): + """ + 1. Validates the given seed question. + 2. Generates a test question based on the given prompts and distribution ratios. + + Returns: + dict: The generated test question and its type. + """ + # text chunk is not valid, seed question not generated. + if not generated_question: + return {"question": "", "question_type": "", "validation_res": None} + + validation_res = get_question_validation_res( + connection, + model, + deployment_name, + validate_question_prompt, + generated_question, + response_format, + temperature, + ) + is_valid_seed_question = validation_res.pass_validation + question = "" + question_type = "" + failed_reason = "" + if not is_valid_seed_question: + failed_reason = ErrorMsg.INVALID_QUESTION.format(generated_question) + print(failed_reason) + else: + question = generated_question + question_type = QuestionType.SIMPLE + + return {"question": question, "question_type": question_type, "validation_res": validation_res._asdict()} diff --git a/examples/gen_test_data/example_flow/validate_question_prompt.jinja2 b/examples/gen_test_data/example_flow/validate_question_prompt.jinja2 new file mode 100644 index 00000000000..659ba56c39c --- /dev/null +++ b/examples/gen_test_data/example_flow/validate_question_prompt.jinja2 @@ -0,0 +1,64 @@ +# system: + +Verdict a question based on following rules: + +1. If there are acronyms or terms in the question, then please check if they exist in the given context. If no, verdict no. If yes, check if other rules are satisfied. +2. Determine if the given question can be clearly understood and give the reason. 
+
+Output a valid json with reason and verdict.
+
+Here are some examples:
+question: What is the discovery about space?
+answer:
+{
+    "reason":"The question is too vague and does not specify which discovery about space it is referring to.",
+    "verdict":"no"
+}
+
+question: What caused the Great Depression?
+answer:
+{
+    "reason":"The question is specific and refers to a well-known historical economic event, making it clear and answerable.",
+    "verdict":"yes"
+}
+
+question: What is the keyword that best describes the paper's focus in natural language understanding tasks?
+answer:
+{
+    "reason": "The question mentions a 'paper' without giving its name, which makes it unclear on its own",
+    "verdict": "no"
+}
+
+question: Who wrote 'Romeo and Juliet'?
+answer:
+{
+    "reason": "The question is clear because it refers to a specific work by name",
+    "verdict": "yes"
+}
+
+question: What did the study mention?
+answer:
+{
+    "reason": "The question is vague and does not specify which study it is referring to",
+    "verdict": "no"
+}
+
+question: What is the focus of the REPLUG paper?
+answer:
+{
+    "reason": "The question refers to a specific work by its name, hence it can be understood",
+    "verdict": "yes"
+}
+
+question: What is the purpose of the reward-driven stage in the training process?
+answer:
+{
+    "reason": "The question lacks specific context regarding the type of training process, making it potentially ambiguous and open to multiple interpretations.",
+    "verdict": "no"
+}
+
+# user:
+context: {{context}}
+
+question: {{question}}
+answer: diff --git a/examples/gen_test_data/example_flow/validate_suggested_answer.py b/examples/gen_test_data/example_flow/validate_suggested_answer.py new file mode 100644 index 00000000000..4ce6c779f62 --- /dev/null +++ b/examples/gen_test_data/example_flow/validate_suggested_answer.py @@ -0,0 +1,55 @@
+from typing import Union
+
+from utils import ErrorMsg, get_suggested_answer_validation_res
+
+from promptflow._core.tool import InputSetting
+from promptflow.connections import AzureOpenAIConnection, OpenAIConnection
+from promptflow.core import tool
+
+
+@tool(
+    input_settings={
+        "deployment_name": InputSetting(
+            enabled_by="connection",
+            enabled_by_type=["AzureOpenAIConnection"],
+            capabilities={"completion": False, "chat_completion": True, "embeddings": False},
+        ),
+        "model": InputSetting(enabled_by="connection", enabled_by_type=["OpenAIConnection"]),
+    }
+)
+def validate_suggested_answer(
+    connection: Union[OpenAIConnection, AzureOpenAIConnection],
+    suggested_answer: str,
+    validate_suggested_answer_prompt: str,
+    deployment_name: str = "",
+    model: str = "",
+    temperature: float = 0.2,
+    response_format: str = "text",
+):
+    """
+    1. Validates the given suggested answer.
+
+    Returns:
+        dict: The generated suggested answer and its validation result.
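+
+    Example of the returned value (illustrative):
+        {"suggested_answer": "...", "validation_res": {"pass_validation": True, "reason": "..."}}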
+ """ + if not suggested_answer: + return {"suggested_answer": "", "validation_res": None} + + validation_res = get_suggested_answer_validation_res( + connection, + model, + deployment_name, + validate_suggested_answer_prompt, + suggested_answer, + temperature, + response_format=response_format, + ) + is_valid_gt = validation_res.pass_validation + failed_reason = "" + if not is_valid_gt: + failed_reason = ErrorMsg.INVALID_ANSWER.format(suggested_answer) + print(failed_reason) + suggested_answer = "" + + return {"suggested_answer": suggested_answer, "validation_res": validation_res._asdict()} diff --git a/examples/gen_test_data/example_flow/validate_suggested_answer_prompt.jinja2 b/examples/gen_test_data/example_flow/validate_suggested_answer_prompt.jinja2 new file mode 100644 index 00000000000..f01dca227ed --- /dev/null +++ b/examples/gen_test_data/example_flow/validate_suggested_answer_prompt.jinja2 @@ -0,0 +1,43 @@ +# system: + +Given an answer, verdict if the provided answer is valid and provide the reason in valid json format. +The answer is not valid if the answer suggests that the context does not provide information or indicates uncertainty (such as 'I don't know'), it is deemed invalid. For any other case, the answer is considered valid. + +# user: + +Output a json format with the reason and verdict. +Here are some examples: +answer: +The steps to build and install your tool package for use in VS Code extension are not provided in the context. +output: +{ + "reason":"The answer is invalid because it states that the context does not provide the necessary steps.", + "verdict":"no" +} + +answer: +The context does not provide specific information on what the possible provider values are in supported configs for a connection provider. +output: +{ + "reason":"The answer is invalid as it indicates that the context lacks specific information.", + "verdict":"no" +} + +answer: +I don't know. +output: +{ + "reason":"The answer is invalid because it conveys don't know.", + "verdict":"no" +} + +answer: +The two essential components of an activate config in a node flow are `activate.when` and `activate.is`. +output: +{ + "reason":"The answer is valid because it is clear and true.", + "verdict":"yes" +} + +answer:{{answer}} +output: diff --git a/examples/gen_test_data/example_flow/validate_text_chunk.py b/examples/gen_test_data/example_flow/validate_text_chunk.py new file mode 100644 index 00000000000..a5d348d8226 --- /dev/null +++ b/examples/gen_test_data/example_flow/validate_text_chunk.py @@ -0,0 +1,49 @@ +from typing import Union + +from utils import ErrorMsg, ResponseFormat, get_text_chunk_score + +from promptflow._core.tool import InputSetting +from promptflow.connections import AzureOpenAIConnection, OpenAIConnection +from promptflow.core import tool + + +@tool( + input_settings={ + "deployment_name": InputSetting( + enabled_by="connection", + enabled_by_type=["AzureOpenAIConnection"], + capabilities={"completion": False, "chat_completion": True, "embeddings": False}, + ), + "model": InputSetting(enabled_by="connection", enabled_by_type=["OpenAIConnection"]), + } +) +def validate_text_chunk( + connection: Union[OpenAIConnection, AzureOpenAIConnection], + score_text_chunk_prompt: str, + score_threshold: float, + deployment_name: str = "", + model: str = "", + context: str = None, + response_format: str = ResponseFormat.TEXT, + temperature: float = 0.2, +): + """ + Validates the given text chunk. If the validation fails, return an empty context and the validation result. 
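+    The chunk is scored by the LLM using `score_text_chunk_prompt`, and it is kept only when
+    the returned score reaches `score_threshold`.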
+ + Returns: + dict: Text chunk context and its validation result. + """ + text_chunk_score_res = get_text_chunk_score( + connection, + model, + deployment_name, + score_text_chunk_prompt, + response_format, + score_threshold, + temperature, + ) + if not text_chunk_score_res.pass_validation: + print(ErrorMsg.INVALID_TEXT_CHUNK.format(context)) + return {"context": "", "validation_res": text_chunk_score_res._asdict()} + + return {"context": context, "validation_res": text_chunk_score_res._asdict()} diff --git a/examples/gen_test_data/gen_test_data/__init__.py b/examples/gen_test_data/gen_test_data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/gen_test_data/gen_test_data/common.py b/examples/gen_test_data/gen_test_data/common.py new file mode 100644 index 00000000000..791a383e07d --- /dev/null +++ b/examples/gen_test_data/gen_test_data/common.py @@ -0,0 +1,249 @@ +import json +import re +import sys +import time +import typing as t +from pathlib import Path + +from constants import DOCUMENT_NODE, NODES_FILE_NAME, SUPPORT_FILE_TYPE, TEXT_CHUNK + +from promptflow._utils.logger_utils import get_logger + + +def split_document(chunk_size, chunk_overlap, documents_folder, document_node_output): + try: + from llama_index import SimpleDirectoryReader + from llama_index.node_parser import SentenceSplitter + from llama_index.readers.schema import Document as LlamaindexDocument + from llama_index.schema import BaseNode + except ImportError as e: + raise ImportError( + f"{str(e)}. It appears that `llama_index` may not be installed, or the installed version may be incorrect." + "Please check `requirements.txt` file and install all the dependencies." + ) + + logger = get_logger("doc.split") + logger.info("Step 1: Start to split documents to document nodes...") + # count the number of files in documents_folder, including subfolders. + all_files = [f for f in Path(documents_folder).rglob("*") if f.is_file()] + filtered_num_files = sum(1 for _ in all_files if _.suffix.lower() in SUPPORT_FILE_TYPE) + logger.info( + f"Found {len(all_files)} files in the documents folder '{documents_folder}'. " + f"After filtering out unsupported file types, {filtered_num_files} files remain." + f"Using chunk size: {chunk_size} to split." + ) + # `SimpleDirectoryReader` by default chunk the documents based on heading tags and paragraphs, which may lead to small chunks. # noqa: E501 + reader = SimpleDirectoryReader(documents_folder, required_exts=SUPPORT_FILE_TYPE, recursive=True, encoding="utf-8") + # Disable the default suffixes to avoid splitting the documents into small chunks. + # TODO: find a better way to disable the default suffixes. 
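+    # (Assumption: clearing `supported_suffix` makes SimpleDirectoryReader fall back to its
+    # plain reading path, so chunking is left entirely to the SentenceSplitter below.)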
+ SimpleDirectoryReader.supported_suffix = [] + chunks = reader.load_data() + # Convert documents into nodes + node_parser = SentenceSplitter.from_defaults( + chunk_size=chunk_size, chunk_overlap=chunk_overlap, include_metadata=True + ) + chunks = t.cast(t.List[LlamaindexDocument], chunks) + document_nodes: t.List[BaseNode] = node_parser.get_nodes_from_documents(documents=chunks) + logger.info(f"Split the documents and created {len(document_nodes)} document nodes.") + document_nodes_output_path = document_node_output / Path(NODES_FILE_NAME) + with open(document_nodes_output_path, "wt") as text_file: + for doc in document_nodes: + print(json.dumps({TEXT_CHUNK: doc.text, DOCUMENT_NODE: doc.to_json()}), file=text_file) + + logger.info(f"Saved document nodes to '{document_nodes_output_path}'.") + return str(Path(document_node_output) / NODES_FILE_NAME) + + +def clean_data(test_data_set: list, test_data_output_path: str): + logger = get_logger("data.clean") + logger.info("Step 3: Start to clean invalid test data...") + logger.info(f"Collected {len(test_data_set)} test data after the batch run.") + cleaned_data = [] + + for test_data in test_data_set: + if test_data and all( + val and val != "(Failed)" for key, val in test_data.items() if key.lower() != "line_number" + ): + data_line = {"question": test_data["question"], "suggested_answer": test_data["suggested_answer"]} + cleaned_data.append(data_line) + + jsonl_str = "\n".join(map(json.dumps, cleaned_data)) + with open(test_data_output_path, "wt") as text_file: + print(f"{jsonl_str}", file=text_file) + + # TODO: aggregate invalid data root cause and count, and log it. + # log debug info path. + logger.info( + f"Removed {len(test_data_set) - len(cleaned_data)} invalid test data. " + f"Saved {len(cleaned_data)} valid test data to '{test_data_output_path}'." + ) + + +def count_non_blank_lines(file_path): + with open(file_path, "r") as file: + lines = file.readlines() + + non_blank_lines = len([line for line in lines if line.strip()]) + return non_blank_lines + + +def print_progress(log_file_path: str, process): + from tqdm import tqdm + + logger = get_logger("data.gen") + finished_log_pattern = re.compile(r".*execution.bulk\s+INFO\s+Finished (\d+) / (\d+) lines\.") + progress_log_pattern = re.compile( + r".*execution.bulk\s+INFO.*\[Finished: (\d+)\] \[Processing: (\d+)\] \[Pending: (\d+)\]" + ) + # wait for the log file to be created + start_time = time.time() + while not Path(log_file_path).is_file(): + time.sleep(1) + # if the log file is not created within 5 minutes, raise an error + if time.time() - start_time > 300: + raise Exception(f"Log file '{log_file_path}' is not created within 5 minutes.") + + logger.info(f"Click '{log_file_path}' to see detailed batch run log. 
Showing the progress here...") + progress_bar = None + try: + last_data_time = time.time() + with open(log_file_path, "r") as f: + while True: + status = process.poll() + # status is None if not finished, 0 if finished successfully, and non-zero if failed + if status: + stdout, _ = process.communicate() + raise Exception(f"Batch run failed due to {stdout.decode('utf-8')}") + + line = f.readline().strip() + if line: + last_data_time = time.time() # Update the time when the last data was received + progress_match = progress_log_pattern.match(line) + finished_match = finished_log_pattern.match(line) + if not progress_match and not finished_match: + continue + + if progress_match: + finished, processing, pending = map(int, progress_match.groups()) + total = finished + processing + pending + if progress_bar is None: + # Set mininterval=0 to refresh the progress bar when it calls progress_bar.update + # after initialization. + progress_bar = tqdm(total=total, desc="Processing", mininterval=0, file=sys.stdout) + progress_bar.update(finished - progress_bar.n) + + if finished_match: + finished, total = map(int, finished_match.groups()) + if progress_bar is None: + progress_bar = tqdm(total=total, desc="Processing", mininterval=0, file=sys.stdout) + progress_bar.update(finished - progress_bar.n) + + if finished == total: + progress_bar.close() + logger.info("Batch run is completed.") + + break + elif time.time() - last_data_time > 300: + logger.info( + "No new log line received for 5 minutes. Stop reading. " + f"See the log file '{log_file_path}' for more details." + ) + break + else: + time.sleep(1) # wait for 1 second if no new line is available + except Exception as e: + raise Exception(f"Error occurred while printing batch run progress: {e}.") + finally: + if progress_bar: + progress_bar.close() + + +def convert_to_abs_path(file_path: str) -> str: + if not file_path: + return file_path + + path = Path(file_path) + if path.is_absolute(): + return str(path) + elif path.exists(): + abs = str(path.resolve()) + return abs + else: + return file_path + + +def local_path_exists(path): + return Path(path).exists() + + +def non_padding_path(path): + return not (path.startswith("<") and path.endswith(">")) + + +def _retrieve_file_names_from_document_nodes_file(document_nodes_file_path) -> t.List[str]: + text_info = {} + with open(document_nodes_file_path, "r") as file: + for line in file: + # Should skip empty new lines, otherwise, json.loads would throw error. + if not line.strip(): + continue + line_json = json.loads(line) + text_chunk = line_json[TEXT_CHUNK] + document_node = json.loads(line_json["document_node"]) + file_path = document_node["metadata"]["file_path"] + text_info[text_chunk] = file_path + return text_info + + +def _count_lines(file_path) -> int: + with open(file_path, "r") as f: + return sum(1 for line in f if line.strip()) + + +def summarize_batch_run_res(gen_details_file_path, document_nodes_file_path, output_file_path): + success_count = 0 + validate_failed_count = 0 + validate_failed_steps = {} + validate_failed_distribution = {} + + nodes_file_lines_count = _count_lines(document_nodes_file_path) + document_nodes_info = _retrieve_file_names_from_document_nodes_file(document_nodes_file_path) + + with open(gen_details_file_path, "r") as details_f: + for details_line in details_f: + # Should skip empty new lines, otherwise, json.loads would throw error. 
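+            # Each details line mirrors the flow output, roughly (illustrative):
+            # {"question": "...", "suggested_answer": "...",
+            #  "debug_info": {"text_chunk": "...",
+            #                 "validation_summary": {"success": true, "failed_step": ""},
+            #                 "validation_details": {...}}}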
+ if not details_line.strip(): + continue + data = json.loads(details_line) + if data["debug_info"] == "(Failed)": + continue + + if data["debug_info"]["validation_summary"]["success"]: + success_count += 1 + else: + validate_failed_count += 1 + failed_step = data["debug_info"]["validation_summary"]["failed_step"] + + if failed_step in validate_failed_steps: + validate_failed_steps[failed_step] += 1 + else: + validate_failed_steps[failed_step] = 1 + validate_failed_distribution[failed_step] = {} + + document_name = document_nodes_info[data["debug_info"]["text_chunk"]] + if document_name in validate_failed_distribution[failed_step]: + validate_failed_distribution[failed_step][document_name] += 1 + else: + validate_failed_distribution[failed_step][document_name] = 1 + + data = { + "total_count": nodes_file_lines_count, + "success_count": success_count, + "run_failed_count": nodes_file_lines_count - success_count - validate_failed_count, + "validate_failed_count": validate_failed_count, + "validate_failed_steps": validate_failed_steps, + "validate_failed_distribution": validate_failed_distribution, + } + + with open(output_file_path, "w") as file: + json.dump(data, file, indent=4) diff --git a/examples/gen_test_data/gen_test_data/components.py b/examples/gen_test_data/gen_test_data/components.py new file mode 100644 index 00000000000..583ec610dbe --- /dev/null +++ b/examples/gen_test_data/gen_test_data/components.py @@ -0,0 +1,90 @@ +import json +from pathlib import Path + +from common import clean_data, split_document, summarize_batch_run_res +from constants import NODES_FILE_NAME, PARALLEL_RUN_STEP_FILE_NAME, SUMMARY_FILE_NAME, TEST_DATA_FILE_NAME +from mldesigner import Input, Output, command_component + +conda_file = Path(__file__).parent.parent / "conda.yml" +env_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04" + + +@command_component( + name="split_document_component", + display_name="split documents", + description="Split documents into document nodes.", + environment=dict( + conda_file=conda_file, + image=env_image, + ), +) +def split_document_component( + documents_folder: Input(type="uri_folder"), + chunk_size: int, + chunk_overlap: int, + document_node_output: Output(type="uri_folder"), +) -> str: + """Split documents into document nodes. + + Args: + documents_folder: The folder containing documents to be split. + chunk_size: The size of each chunk. + document_node_output: The output folder + chunk_overlap: The size of chunk overlap + + Returns: + The folder containing the split documents. 
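+        In practice this is the path of the generated nodes file inside that folder,
+        e.g. '<document_node_output>/document_nodes.jsonl'.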
+ """ + return split_document(chunk_size, chunk_overlap, documents_folder, document_node_output) + + +@command_component( + name="clean_data_component", + display_name="clean dataset", + description="Clean test data set to remove empty lines.", + environment=dict( + conda_file=conda_file, + image=env_image, + ), +) +def clean_data_component( + test_data_set_folder: Input(type="uri_folder"), test_data_output: Output(type="uri_folder") +) -> str: + test_data_set_path = Path(test_data_set_folder) / PARALLEL_RUN_STEP_FILE_NAME + + with open(test_data_set_path, "r") as f: + data = [json.loads(line) for line in f] + + test_data_output_path = test_data_output / Path(TEST_DATA_FILE_NAME) + clean_data(data, test_data_output_path) + + return str(test_data_output_path) + + +@command_component( + name="summarize_generation_details_component", + display_name="summarize generation details", + description="Summarize generation details.", + environment=dict( + conda_file=conda_file, + image=env_image, + ), +) +def summarize_generation_details_component( + document_node_output: Input(type="uri_folder"), + test_data_set_folder: Input(type="uri_folder"), + summary_output: Output(type="uri_folder"), +) -> str: + test_data_set_path = Path(test_data_set_folder) / PARALLEL_RUN_STEP_FILE_NAME + document_node_output_path = Path(document_node_output) + + summary_output_path = summary_output / Path(SUMMARY_FILE_NAME) + if document_node_output_path.is_dir(): + document_node_output_path = document_node_output_path / NODES_FILE_NAME + summarize_batch_run_res( + gen_details_file_path=test_data_set_path, + document_nodes_file_path=document_node_output_path, + output_file_path=summary_output_path, + ) + + return str(summary_output_path) diff --git a/examples/gen_test_data/gen_test_data/constants.py b/examples/gen_test_data/gen_test_data/constants.py new file mode 100644 index 00000000000..b6ffa88298f --- /dev/null +++ b/examples/gen_test_data/gen_test_data/constants.py @@ -0,0 +1,8 @@ +DOCUMENT_NODE = "document_node" +TEXT_CHUNK = "text_chunk" +NODES_FILE_NAME = "document_nodes.jsonl" +DETAILS_FILE_NAME = "test-data-gen-details.jsonl" +PARALLEL_RUN_STEP_FILE_NAME = "parallel_run_step.jsonl" +SUMMARY_FILE_NAME = "test-data-gen-summary.json" +TEST_DATA_FILE_NAME = "test-data.jsonl" +SUPPORT_FILE_TYPE = [".docx", ".pdf", ".ipynb", ".md", ".txt"] diff --git a/examples/gen_test_data/gen_test_data/run.py b/examples/gen_test_data/gen_test_data/run.py new file mode 100644 index 00000000000..c8d8c36e917 --- /dev/null +++ b/examples/gen_test_data/gen_test_data/run.py @@ -0,0 +1,324 @@ +import argparse +import json +import os +import time +from datetime import datetime +from pathlib import Path + +from promptflow._utils.logger_utils import get_logger +from promptflow._utils.yaml_utils import load_yaml + +CONFIG_FILE = (Path(__file__).parents[1] / "config.yml").resolve() + +# in order to import from absolute path, which is required by mldesigner +os.sys.path.insert(0, os.path.abspath(Path(__file__).parent)) + +from common import ( # noqa: E402 + clean_data, + convert_to_abs_path, + count_non_blank_lines, + local_path_exists, + non_padding_path, + print_progress, + split_document, + summarize_batch_run_res, +) +from constants import DETAILS_FILE_NAME, SUMMARY_FILE_NAME, TEST_DATA_FILE_NAME, TEXT_CHUNK # noqa: E402 + +logger = get_logger("data.gen") + + +def batch_run_flow(flow_folder: str, flow_input_data: str, flow_batch_run_size: int, node_inputs_override: dict): + logger.info(f"Step 2: Start to batch run '{flow_folder}'...") + 
import subprocess + + run_name = f"test_data_gen_{datetime.now().strftime('%b-%d-%Y-%H-%M-%S')}" + # TODO: replace the separate process to submit batch run with batch run async method when it's available. + connections_str = "" + for node_name, node_val in node_inputs_override.items(): + for k, v in node_val.items(): + # need to double quote the value to make sure the value can be passed correctly + # when the value contains special characters like "<". + connections_str += f"{node_name}.{k}=\"{v}\" " + connections_str = connections_str.rstrip() + + cmd = ( + f"pf run create --flow \"{flow_folder}\" --data \"{flow_input_data}\" --name {run_name} " + f"--environment-variables PF_WORKER_COUNT='{flow_batch_run_size}' PF_BATCH_METHOD='spawn' " + f"--column-mapping {TEXT_CHUNK}='${{data.text_chunk}}' --connections {connections_str} --debug" + ) + process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + logger.info( + f"Submit batch run successfully. process id {process.pid}. Please wait for the batch run to complete..." + ) + return run_name, process + + +def get_batch_run_output(output_path: Path): + logger.info(f"Reading batch run output from '{output_path}'.") + # wait for the output file to be created + start_time = time.time() + while not Path(output_path).is_file(): + time.sleep(1) + # if the log file is not created within 5 minutes, raise an error + if time.time() - start_time > 300: + raise Exception(f"Output jsonl file '{output_path}' is not created within 5 minutes.") + + output_lines = [] + try: + with open(output_path, "r", encoding="utf-8") as f: + output_lines = list(map(json.loads, f)) + except json.decoder.JSONDecodeError as e: + logger.warning( + f"Error reading the output file: {e}. It could be that the batch run output is empty. " + "Please check your flow and ensure it can run successfully." 
+ ) + + return [ + {"question": line["question"], "suggested_answer": line["suggested_answer"], "debug_info": line["debug_info"]} + for line in output_lines + ] + + +def run_local( + documents_folder: str, + document_chunk_size: int, + document_chunk_overlap: int, + document_nodes_file: str, + flow_folder: str, + flow_batch_run_size: int, + output_folder: str, + should_skip_split: bool, + node_inputs_override: dict, +): + text_chunks_path = document_nodes_file + output_folder = Path(output_folder) / datetime.now().strftime("%b-%d-%Y-%H-%M-%S") + if not Path(output_folder).is_dir(): + Path(output_folder).mkdir(parents=True, exist_ok=True) + + if not should_skip_split: + text_chunks_path = split_document(document_chunk_size, document_chunk_overlap, documents_folder, output_folder) + + run_name, process = batch_run_flow(flow_folder, text_chunks_path, flow_batch_run_size, node_inputs_override) + + run_folder_path = Path.home() / f".promptflow/.runs/{run_name}" + print_progress(run_folder_path / "logs.txt", process) + test_data_set = get_batch_run_output(run_folder_path / "outputs.jsonl") + # Store intermedian batch run output results + jsonl_str = "\n".join(map(json.dumps, test_data_set)) + batch_run_details_file = Path(output_folder) / DETAILS_FILE_NAME + with open(batch_run_details_file, "wt") as text_file: + print(f"{jsonl_str}", file=text_file) + + clean_data_output = Path(output_folder) / TEST_DATA_FILE_NAME + clean_data(test_data_set, clean_data_output) + logger.info(f"More debug info of test data generation can be found in '{batch_run_details_file}'.") + + try: + summary_output_file = Path(output_folder) / SUMMARY_FILE_NAME + summarize_batch_run_res( + gen_details_file_path=batch_run_details_file, + document_nodes_file_path=text_chunks_path, + output_file_path=summary_output_file, + ) + logger.info(f"Check test data generation summary in '{summary_output_file}'.") + except Exception as e: + logger.warning(f"Error to analyze batch run results: {e}") + + +def run_cloud( + documents_folder: str, + document_chunk_size: int, + document_chunk_overlap: int, + document_nodes_file: str, + flow_folder: str, + subscription_id: str, + resource_group: str, + workspace_name: str, + aml_cluster: str, + prs_instance_count: int, + prs_mini_batch_size: int, + prs_max_concurrency_per_instance: int, + prs_max_retry_count: int, + prs_run_invocation_time: int, + prs_allowed_failed_count: int, + should_skip_split: bool, + node_inputs_override: dict, +): + # lazy import azure dependencies + try: + from azure.ai.ml import Input as V2Input + from azure.ai.ml import MLClient, dsl, load_component + from azure.ai.ml.entities import RetrySettings + from azure.identity import DefaultAzureCredential + except ImportError: + raise ImportError( + "Please install azure dependencies using the following command: " + + "`pip install -r requirements_cloud.txt`" + ) + + @dsl.pipeline( + non_pipeline_inputs=[ + "flow_yml_path", + "should_skip_doc_split", + "instance_count", + "mini_batch_size", + "max_concurrency_per_instance", + "max_retry_count", + "run_invocation_time", + "allowed_failed_count", + ] + ) + def gen_test_data_pipeline( + data_input: V2Input, + flow_yml_path: str, + should_skip_doc_split: bool, + chunk_size=1024, + chunk_overlap=200, + instance_count=1, + mini_batch_size=1, + max_concurrency_per_instance=2, + max_retry_count=3, + run_invocation_time=600, + allowed_failed_count=-1, + ): + from components import clean_data_component, split_document_component, summarize_generation_details_component + + data = ( + 
data_input + if should_skip_doc_split + else split_document_component( + documents_folder=data_input, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ).outputs.document_node_output + ) + flow_node = load_component(flow_yml_path, params_override=[{"name": "gen_test_data_example_flow"}])( + data=data, text_chunk="${data.text_chunk}", connections=node_inputs_override + ) + flow_node.mini_batch_size = mini_batch_size + flow_node.max_concurrency_per_instance = max_concurrency_per_instance + flow_node.set_resources(instance_count=instance_count) + flow_node.retry_settings = RetrySettings(max_retry_count=max_retry_count, timeout=run_invocation_time) + flow_node.mini_batch_error_threshold = allowed_failed_count + # Use `mount` mode to ensure PRS completely merges the output lines. + flow_node.outputs.flow_outputs.mode = "mount" + clean_data_component(test_data_set_folder=flow_node.outputs.flow_outputs).outputs.test_data_output + summarize_generation_details_component( + document_node_output=data, test_data_set_folder=flow_node.outputs.flow_outputs + ).outputs.summary_output + + def get_ml_client(subscription_id: str, resource_group: str, workspace_name: str): + credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) + return MLClient( + credential=credential, + subscription_id=subscription_id, + resource_group_name=resource_group, + workspace_name=workspace_name, + ) + + ml_client = get_ml_client(subscription_id, resource_group, workspace_name) + + if should_skip_split: + data_input = V2Input(path=document_nodes_file, type="uri_file") + else: + data_input = V2Input(path=documents_folder, type="uri_folder") + + prs_configs = { + "instance_count": prs_instance_count, + "mini_batch_size": prs_mini_batch_size, + "max_concurrency_per_instance": prs_max_concurrency_per_instance, + "max_retry_count": prs_max_retry_count, + "run_invocation_time": prs_run_invocation_time, + "allowed_failed_count": prs_allowed_failed_count, + } + + pipeline_with_flow = gen_test_data_pipeline( + data_input=data_input, + flow_yml_path=os.path.join(flow_folder, "flow.dag.yaml"), + should_skip_doc_split=should_skip_split, + chunk_size=document_chunk_size, + chunk_overlap=document_chunk_overlap, + **prs_configs, + ) + pipeline_with_flow.compute = aml_cluster + studio_url = ml_client.jobs.create_or_update(pipeline_with_flow).studio_url + logger.info(f"Pipeline submitted successfully. Experiment link: {studio_url}") + + + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--cloud", action="store_true", help="Run test data generation in the cloud.") + args = parser.parse_args() + + if Path(CONFIG_FILE).is_file(): + with open(CONFIG_FILE, "r") as stream: + config = load_yaml(stream) + else: + raise Exception( + f"'{CONFIG_FILE}' does not exist. " + + "Please check whether you are in the wrong directory or the file is missing."
+ ) + + should_skip_split_documents = False + document_nodes_file = convert_to_abs_path(config.get("document_nodes_file", None)) + documents_folder = convert_to_abs_path(config.get("documents_folder", None)) + flow_folder = convert_to_abs_path(config.get("flow_folder", None)) + output_folder = convert_to_abs_path(config.get("output_folder", None)) + validate_path_func = non_padding_path if args.cloud else local_path_exists + node_inputs_override = config.get("node_inputs_override", None) + + if document_nodes_file and validate_path_func(document_nodes_file): + should_skip_split_documents = True + elif not documents_folder or not validate_path_func(documents_folder): + raise Exception( + "Neither 'documents_folder' nor 'document_nodes_file' is valid.\n" + f"documents_folder: '{documents_folder}'\ndocument_nodes_file: '{document_nodes_file}'" + ) + + if not validate_path_func(flow_folder): + raise Exception(f"Invalid flow folder: '{flow_folder}'") + + if args.cloud: + logger.info("Starting to generate test data in the cloud...") + else: + logger.info("Starting to generate test data locally...") + + if should_skip_split_documents: + logger.info( + "Skipping step 1 'Split documents to document nodes' since document nodes were received from " + f"input file path '{document_nodes_file}'." + ) + if Path(document_nodes_file).is_file(): + logger.info(f"Collected {count_non_blank_lines(document_nodes_file)} document nodes.") + + if args.cloud: + run_cloud( + documents_folder, + config.get("document_chunk_size", 512), + config.get("document_chunk_overlap", 100), + document_nodes_file, + flow_folder, + config["subscription_id"], + config["resource_group"], + config["workspace_name"], + config["aml_cluster"], + config.get("prs_instance_count", 2), + config.get("prs_mini_batch_size", 1), + config.get("prs_max_concurrency_per_instance", 4), + config.get("prs_max_retry_count", 3), + config.get("prs_run_invocation_time", 800), + config.get("prs_allowed_failed_count", -1), + should_skip_split_documents, + node_inputs_override, + ) + else: + run_local( + documents_folder, + config.get("document_chunk_size", 512), + config.get("document_chunk_overlap", 100), + document_nodes_file, + flow_folder, + config.get("flow_batch_run_size", 16), + output_folder, + should_skip_split_documents, + node_inputs_override, + ) diff --git a/examples/gen_test_data/requirements.txt b/examples/gen_test_data/requirements.txt new file mode 100644 index 00000000000..69936d46238 --- /dev/null +++ b/examples/gen_test_data/requirements.txt @@ -0,0 +1,3 @@ +promptflow>=1.7.0 +promptflow-tools +llama_index==0.9.48 diff --git a/examples/gen_test_data/requirements_cloud.txt b/examples/gen_test_data/requirements_cloud.txt new file mode 100644 index 00000000000..6367b67951c --- /dev/null +++ b/examples/gen_test_data/requirements_cloud.txt @@ -0,0 +1,4 @@ +promptflow>=1.7.0 +promptflow-tools +azure-ai-ml==1.15.0 +mldesigner==0.1.0b18 diff --git a/examples/tutorials/develop-promptflow-copilot/chat-panel.png b/examples/tutorials/develop-promptflow-copilot/chat-panel.png new file mode 100644 index 00000000000..12946bdee88 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/chat-panel.png differ diff --git a/examples/tutorials/develop-promptflow-copilot/chat-ui.png b/examples/tutorials/develop-promptflow-copilot/chat-ui.png new file mode 100644 index 00000000000..720608a61f0 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/chat-ui.png differ diff --git
a/examples/tutorials/develop-promptflow-copilot/develop-promptflow-copilot.md b/examples/tutorials/develop-promptflow-copilot/develop-promptflow-copilot.md new file mode 100644 index 00000000000..7c2015fc319 --- /dev/null +++ b/examples/tutorials/develop-promptflow-copilot/develop-promptflow-copilot.md @@ -0,0 +1,144 @@ +# Develop promptflow copilot + +In this tutorial, we will provide a detailed walkthrough on creating a RAG-based copilot using the Azure Machine Learning promptflow toolkit. Our tutorial will cover a range of essential topics, including: + +- Initiating a RAG-based copilot flow through the AzureML Workspace Portal. +- Generating synthetic test data for the copilot. +- Evaluating the copilot's performance using test data. +- Enhancing the functionality and efficiency of your copilot flow. +- Deploying your copilot for customer use. + +While we will focus on constructing a copilot for promptflow as a case study, the methodologies and steps outlined can be adapted to develop your customized copilot solutions. + +## Prerequisites + +- An Azure subscription. If you don't have an Azure subscription, create a free account before you begin. +- An Azure Machine Learning workspace. If you don't have a workspace, create one before you begin. +- Azure OpenAI connection or OpenAI connection +- Azure AI Search connection + +## Step 1: Initialize a RAG-based copilot flow + +First, clone the promptflow repository to your local machine. Then, within your Azure Machine Learning workspace, create a vector index using the document files located in the `./docs` folder. For comprehensive guidance on creating a vector index, consult the documentation available [here](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-vector-index?view=azureml-api-2#create-a-vector-index-by-using-machine-learning-studio). + +Upon successful creation of the vector index, an example flow will be automatically generated within your workspace. This example flow, which is a standard Retrieval-Augmented Generation (RAG) based copilot flow, serves as an excellent starting point for developing your own copilot. You can locate the link to this example flow on the vector index's detail page. + +This is what the example flow looks like: + +![example-flow](example-flow.png) + +With some minor configuration, you can open the chat panel and chat directly with your copilot to see how well it works. + +![chat-panel](chat-panel.png) + +## Step 2: Generate synthetic test data + +To ensure the quality of the promptflow copilot, it's imperative to conduct extensive testing using a broad dataset. Ideally, this dataset would consist of real user inquiries, such as those found on platforms like StackOverflow. However, real-world cases often fall short in both quantity and diversity. To address this gap, the creation of synthetic test data is necessary to encompass a wider array of scenarios.
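Once you have generated the data by following the guide referenced below, the generation script in this example writes the samples to a JSONL file. As a quick sanity check before registering the data, you could load it and inspect a few samples; the file path below is hypothetical (it depends on the 'output_folder' in your config.yml), while the 'question' and 'suggested_answer' fields match the format produced by the generation flow.

```python
import json
from pathlib import Path

# Hypothetical path: adjust to the 'output_folder' configured in your config.yml.
test_data_file = Path("test_data_gen_output/test-data.jsonl")

with open(test_data_file, "r", encoding="utf-8") as f:
    samples = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(samples)} generated test samples.")
for sample in samples[:3]:
    # Each line contains a synthetic question and its suggested answer.
    print("Q:", sample["question"])
    print("A:", sample["suggested_answer"])
    print("-" * 40)
```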
+ +Promptflow has provided comprehensive guidelines for generating synthetic test data using Large Language Models (LLMs). For step-by-step instructions, please refer to the document available [here](../../../docs/how-to-guides/generate-test-data.md). + +To facilitate evaluation of your copilot in Azure, consider creating a new Data Asset in your workspace specifically for this purpose. + +### Tips + +``` +Currently, the volume of generated test data cannot be directly controlled. It is determined by how many chunks your documents are split into; you can adjust this by changing the 'document_chunk_size' and 'document_chunk_overlap' parameters in your config.yml file. Additionally, you can change the 'temperature' parameter of the LLM tool in the 'gen_test_data' example flow and run the 'gen_test_data' script multiple times to generate more test data. +``` + +## Step 3: Evaluate your copilot with test data +After preparing the test data, we can utilize the evaluation flow to assess the performance of our copilot against the test data. Promptflow has developed various evaluation flows tailored for different scenarios. For our RAG-based copilot, we can leverage the evaluation flow in [this folder](../../../examples/flows/evaluation/eval-single-turn-metrics/) to ensure comprehensive and accurate performance analysis. + +Clone this evaluation flow folder to your local machine or upload it to your workspace. + +**Remember to update the connections used in the flow before you start the evaluation.** +![trigger-eval](trigger-eval.png) +![select-eval](select-eval.png) + +### Tips + +``` +- The evaluation process is designed to compute multiple metrics, each accompanied by comprehensive explanations in the readme file. It is imperative to understand these metrics thoroughly and select those most applicable to your project. + +- The answer produced by the initial copilot flow will have a "(Source: citation)" part at the end. This is because we told the model to do that in the prompt. You can modify the default prompt to remove this part in case it affects the evaluation results, since we did not append it when generating the test data. + +- Furthermore, the evaluation process will present aggregated metrics. It is essential to closely examine the results for each line, especially lines with lower metric values. +Typically, suboptimal results stem from one of two issues: either the process is underperforming, possibly due to inadequate context retrieval or prompt formulation, or the quality of the test data is insufficient. + +To address the first issue, consider debugging or refining the process either locally or within the workspace. For the latter, you might either revise the problematic test cases or exclude them from your test dataset altogether. +``` + +## Step 4: Improve your copilot flow + +After evaluation, you will find that the initial copilot flow works well and achieves relatively good metrics. We can continue to improve the copilot in various ways. + +### Improve context retrieval +Context retrieval is the most important part of the RAG-based approach; the quality of the retrieved context directly affects the performance of the copilot. Take a close look at the initial copilot flow and you will find that context retrieval is handled by the 'lookup_question_from_indexed_docs' node, which uses the 'Index Lookup' tool.
+ +![index-lookup-tool](index-lookup-tool.png) + +Two parameters can be adjusted in the 'Index Lookup' tool: 'query_type' and 'top_k'. The initial copilot flow uses 'vector' for 'query_type' and 2 for 'top_k'. Try adjusting these two parameters and then run the evaluation to see which combination achieves the best metrics. + +### Tune the prompt +There are two LLM nodes in the initial copilot flow: the 'modify_query_with_history' node rewrites the question with the history to make it clearer, and the 'answer_the_question_with_context' node generates the answer based on the question and context. + +You can tune the prompts of these two nodes by leveraging the variants feature of promptflow. For a detailed introduction to variants, see [this doc](../../../docs/how-to-guides/tune-prompts-with-variants.md). + +### Add doc link to the answer +It's important to add a link to the document that was used as context to generate the answer. This helps users understand where the answer comes from and find more information if needed. + +The answer generated by the initial flow will include a citation in the format "(Source: citation)." However, this citation format does not present a clickable link, making it inconvenient for end-users to directly access the source. +To address this, we propose modifications to the code within the 'generate_prompt_context' node. These adjustments aim to transform the citation into an accessible hyperlink. Furthermore, alterations to the prompt in the 'answer_the_question_with_context' node are suggested to ensure the document link is seamlessly integrated into the response. By implementing these changes, the final response will effectively incorporate the document link in a user-friendly format. The final answer will look like this: + +![doc-link](doc-link.png) + +You can find the specific code changes in the source of the promptflow copilot flow in [this folder](../../../examples/flows/chat/promptflow-copilot/). + + +### Avoid abuse of the copilot +Avoiding abuse is a critical topic when you want to deploy your copilot to production, given the cost of LLM calls. It is recommended that you add an additional authentication layer above your copilot to prevent abuse from anonymous users. + +But what if we cannot add an authentication layer, or we want to spare users the login effort? How do we prevent abuse of the copilot in that case? + +One common approach is to refine the prompt used in the 'answer_the_question_with_context' node to instruct the model to only respond if the answer can be sourced from the provided context. Despite this, test results indicate that the model may still respond to queries unrelated to the context, particularly with general inquiries such as "What is the capital of China?" or when chat histories extend over multiple interactions. + +A more effective strategy involves integrating an additional LLM node tasked with evaluating the relevance of a query to the copilot's capabilities (in this scenario, referred to as 'promptflow'). This node assigns a relevance score to each query. Queries with a relevance score below a predetermined threshold would bypass the context retrieval phase, and the system would instead inform the user that their question is not pertinent to the copilot's functionality, encouraging them to rephrase their query for better alignment with the copilot's capabilities. A minimal sketch of such a gating step is shown below.
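The gate could be implemented as a small Python node that consumes the relevance score produced by the extra LLM node. This is only an illustrative sketch: the node name, the 1-5 scoring scale, the threshold, and the fallback message are assumptions rather than the exact implementation used in the promptflow copilot flow.

```python
from promptflow import tool

RELEVANCE_THRESHOLD = 3  # hypothetical threshold on an assumed 1-5 relevance scale


@tool
def check_query_relevance(relevance_score: str) -> dict:
    """Decide whether the flow should continue with context retrieval.

    'relevance_score' is the raw text output of the extra LLM node that rates
    how related the user's query is to promptflow.
    """
    try:
        score = float(relevance_score.strip())
    except ValueError:
        # If the score cannot be parsed, fall back to answering normally.
        score = RELEVANCE_THRESHOLD

    if score < RELEVANCE_THRESHOLD:
        return {
            "is_relevant": False,
            "message": (
                "Your question does not seem related to promptflow. "
                "Please rephrase it to focus on promptflow capabilities."
            ),
        }
    return {"is_relevant": True, "message": ""}
```

Downstream nodes such as the index lookup and answer generation could then use an activate config on `is_relevant` so that irrelevant queries skip the retrieval phase entirely.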
+ +You can find the specific code changes in the source of the promptflow copilot flow in [this folder](../../../examples/flows/chat/promptflow-copilot/). + + +## Step 5: Bring your copilot to customers + +The final step is to bring our intelligent copilot to customers. Obviously, we cannot ask customers to run the flow directly on their side, so we must first deploy the flow. Promptflow provides plenty of ways to deploy a flow; you can find the details [here](../../../docs/how-to-guides/deploy-a-flow/). The best option depends on the specific scenario of your copilot. + +We want our customers to access the promptflow copilot through a web page with a chat UI experience, so we will deploy the flow as a managed online endpoint. You can find the detailed instructions [here](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/how-to-deploy-for-real-time-inference?view=azureml-api-2). + +### Host web app with Azure App Service +Currently, the managed online endpoint does not support Cross-Origin Resource Sharing (CORS), preventing direct access from a webpage. To facilitate interaction with the endpoint, it is necessary to host a web application. Azure App Service offers a comprehensive solution for this requirement, providing a fully managed platform designed for building, deploying, and scaling web applications. By utilizing Azure App Service, you can host your web application efficiently and establish interaction with the promptflow copilot endpoint. + +### Chat UI experience +The chat interface significantly impacts the overall user experience with the copilot, directly influencing how users interact with the system. While constructing a ChatGPT-style interface from the ground up is feasible, utilizing established open-source projects can greatly streamline and expedite the process. One of the projects we have tried is `chatgpt-lite`; we built our promptflow copilot's UI on top of it. You can find the source code of the chat UI [here](https://github.com/melionel/chatgpt-lite/tree/talk_to_endpoint_appservice). + +![chat-ui](chat-ui.png) + +### Provide suggested follow-up questions + +Incorporating follow-up question suggestions is an effective strategy to enhance user experience and communication efficiency. One approach is to instruct the model to include follow-up questions in its responses. However, this method may not always be reliable and could complicate response processing. An alternative strategy involves utilizing a separate flow dedicated to generating follow-up question suggestions. For this purpose, you can employ the 'question_simulation' flow found in [this folder](../../../examples/flows/standard/question-simulation/). + +Deploying the `question_simulation` flow as a managed online endpoint and integrating it into your web application allows for dynamic generation of pertinent follow-up questions based on previous chat interactions. + +### Collect user feedback + +User feedback is invaluable for enhancing the copilot's performance and user experience. By incorporating a feedback mechanism into the chat interface, users can provide real-time feedback on the copilot's responses. This feedback can be used to refine the copilot's performance and prompt formulation, ensuring continuous improvement. A sketch of how the web app might forward such feedback to the deployed endpoint follows below.
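Assuming the `/score` and `/feedback` POST APIs described in the next paragraph, the hosted web application could call the copilot and report feedback roughly as follows. The endpoint URL, key, and payload fields are placeholders; adjust them to your flow's actual inputs and your endpoint's authentication scheme.

```python
import requests

# Placeholder values: use your managed online endpoint's URI and key.
ENDPOINT_URL = "https://<your-endpoint>.<region>.inference.ml.azure.com"
API_KEY = "<your-endpoint-key>"

HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}",
}


def ask_copilot(question: str, chat_history: list) -> dict:
    """Call the /score API to run the copilot flow with the user's question."""
    payload = {"question": question, "chat_history": chat_history}
    response = requests.post(f"{ENDPOINT_URL}/score", headers=HEADERS, json=payload)
    response.raise_for_status()
    return response.json()


def send_feedback(feedback: dict) -> None:
    """Post the user's rating to the /feedback API so it is captured in the trace data."""
    response = requests.post(f"{ENDPOINT_URL}/feedback", headers=HEADERS, json=feedback)
    response.raise_for_status()
```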
+ +When you deploy the flow as a managed online endpoint, if you check the endpoint's swagger you will find that it provides two POST APIs: `/score` for triggering your copilot flow with inputs and `/feedback` for collecting user feedback into the trace data. + +``` +- It is recommended to turn on these options when you deploy the MIR endpoint, so that you can collect the related data and calculate online metrics in your workspace's Application Insights. +``` +![mir-options](mir-options.png) diff --git a/examples/tutorials/develop-promptflow-copilot/doc-link.png b/examples/tutorials/develop-promptflow-copilot/doc-link.png new file mode 100644 index 00000000000..8e96a006461 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/doc-link.png differ diff --git a/examples/tutorials/develop-promptflow-copilot/example-flow.png b/examples/tutorials/develop-promptflow-copilot/example-flow.png new file mode 100644 index 00000000000..b1b23b3e039 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/example-flow.png differ diff --git a/examples/tutorials/develop-promptflow-copilot/index-lookup-tool.png b/examples/tutorials/develop-promptflow-copilot/index-lookup-tool.png new file mode 100644 index 00000000000..0678b66d223 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/index-lookup-tool.png differ diff --git a/examples/tutorials/develop-promptflow-copilot/mir-options.png b/examples/tutorials/develop-promptflow-copilot/mir-options.png new file mode 100644 index 00000000000..7129d57a863 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/mir-options.png differ diff --git a/examples/tutorials/develop-promptflow-copilot/select-eval.png b/examples/tutorials/develop-promptflow-copilot/select-eval.png new file mode 100644 index 00000000000..52bbe4df6e5 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/select-eval.png differ diff --git a/examples/tutorials/develop-promptflow-copilot/trigger-eval.png b/examples/tutorials/develop-promptflow-copilot/trigger-eval.png new file mode 100644 index 00000000000..6d26f25e611 Binary files /dev/null and b/examples/tutorials/develop-promptflow-copilot/trigger-eval.png differ