Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace dag with flex in evals #3521

Merged
merged 31 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
95ba7f0
replace dag flows with flex flows in oob evaluators
MilesHolland Jul 3, 2024
7af13f0
remove dag yamls
MilesHolland Jul 3, 2024
c6fe14b
partial fixing
MilesHolland Jul 3, 2024
717e022
fix tests
MilesHolland Jul 5, 2024
85fd3f4
flake
MilesHolland Jul 5, 2024
9c387df
fix f1 loadability
MilesHolland Jul 8, 2024
9535308
fix imports
MilesHolland Jul 8, 2024
264db51
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 8, 2024
b923229
flake
MilesHolland Jul 8, 2024
0fc07a1
comments - remote or rename flow subdir
MilesHolland Jul 9, 2024
09f9c17
flake
MilesHolland Jul 9, 2024
189596d
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 9, 2024
30f9a0b
lower coverage requirement and remove not needed line
MilesHolland Jul 10, 2024
0e74e0f
update comment
MilesHolland Jul 10, 2024
7010d91
remove req file
MilesHolland Jul 10, 2024
69b6ef3
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 10, 2024
f44d1bc
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 11, 2024
75849ba
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 12, 2024
c9e355e
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 15, 2024
0b5fed1
fix test
MilesHolland Jul 15, 2024
5182d1d
add init file
MilesHolland Jul 15, 2024
0bb0da3
fix config file
MilesHolland Jul 15, 2024
2402ec6
fix jwt import and mark test
MilesHolland Jul 15, 2024
0a1b7c6
modify pyproject to include RAI-required packages
MilesHolland Jul 15, 2024
72c5b7a
version greater or equals
MilesHolland Jul 15, 2024
516283c
remove identity from no install test
MilesHolland Jul 15, 2024
eddedcf
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 16, 2024
4b756ee
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 17, 2024
f80ca6f
fix recordings from main
MilesHolland Jul 17, 2024
a5b42bb
more recordings
MilesHolland Jul 17, 2024
06beae7
Merge branch 'main' into replace-dag-with-flex-in-evals
MilesHolland Jul 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test-local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ jobs:
- name: install test dependency group
run: poetry install --only test
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install recording
run: poetry run pip install -e ../promptflow-recording
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow packages in editable mode
run: |
poetry run pip install -e ../promptflow
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/promptflow-evals-unit-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
- name: run unit tests
id: run_unit_tests
run: |
poetry run pytest -m unittest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml --cov-fail-under=63
poetry run pytest -m unittest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml --cov-fail-under=58
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: upload coverage report
uses: actions/upload-artifact@v4
Expand Down
1 change: 0 additions & 1 deletion scripts/code_qa/assert_local_install.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ class TestPackagesNotInstalles():
@pytest.mark.parametrize('package', [
'promptflow.azure',
'azure.ai.ml',
'azure.identity',
'azure.storage.blob'
])
def test_promptflow_azure(self, package):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from concurrent.futures import ThreadPoolExecutor, as_completed

from ._hate_unfairness import HateUnfairnessEvaluator
from ._self_harm import SelfHarmEvaluator
from ._sexual import SexualEvaluator
from ._violence import ViolenceEvaluator
try:
from ._hate_unfairness import HateUnfairnessEvaluator
from ._self_harm import SelfHarmEvaluator
from ._sexual import SexualEvaluator
from ._violence import ViolenceEvaluator
except ImportError:
from _hate_unfairness import HateUnfairnessEvaluator
from _self_harm import SelfHarmEvaluator
from _sexual import SexualEvaluator
from _violence import ViolenceEvaluator
MilesHolland marked this conversation as resolved.
Show resolved Hide resolved


class ContentSafetyEvaluator:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import numpy as np

from ._hate_unfairness import HateUnfairnessEvaluator
from ._self_harm import SelfHarmEvaluator
from ._sexual import SexualEvaluator
from ._violence import ViolenceEvaluator
try:
from ._hate_unfairness import HateUnfairnessEvaluator
from ._self_harm import SelfHarmEvaluator
from ._sexual import SexualEvaluator
from ._violence import ViolenceEvaluator
except ImportError:
from _hate_unfairness import HateUnfairnessEvaluator
from _self_harm import SelfHarmEvaluator
from _sexual import SexualEvaluator
from _violence import ViolenceEvaluator


logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from abc import ABC

try:
from .common.constants import EvaluationMetrics
from .common.evaluate_with_rai_service import evaluate_with_rai_service
from .common.validate_inputs import validate_inputs
except ImportError:
from common.constants import EvaluationMetrics
from common.evaluate_with_rai_service import evaluate_with_rai_service
from common.validate_inputs import validate_inputs


class ContentSafetySubEvaluatorBase(ABC):
"""
Initialize a evaluator for a specified Evaluation Metric. Base class that is not
meant to be instantiated by users.


:param metric: The metric to be evaluated.
:type metric: ~promptflow.evals.evaluators._content_safety.flow.constants.EvaluationMetrics
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
"""

def __init__(self, metric: EvaluationMetrics, project_scope: dict, credential=None):
self._metric = metric
self._project_scope = project_scope
self._credential = credential

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.

:param question: The question to be evaluated.
:type question: str
:param answer: The answer to be evaluated.
:type answer: str
:return: The evaluation score.
:rtype: dict
"""
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
_ = validate_inputs(question=question, answer=answer)

# question: str, answer: str, metric_name: str, project_scope: dict, credential: TokenCredential
# Run score computation based on supplied metric.
result = evaluate_with_rai_service(
metric_name=self._metric,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from pathlib import Path

from promptflow.client import load_flow

from .flow.constants import EvaluationMetrics


class HateUnfairnessEvaluator:
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
try:
from .common.constants import EvaluationMetrics
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
except ImportError:
from common.constants import EvaluationMetrics
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase


class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase):
"""
MilesHolland marked this conversation as resolved.
Show resolved Hide resolved
Initialize a hate-unfairness evaluator for hate unfairness score.

Expand Down Expand Up @@ -39,13 +43,13 @@ class HateUnfairnessEvaluator:
"""

def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

# Load the flow as function
current_dir = Path(__file__).resolve().parent
flow_dir = current_dir / "flow"
self._flow = load_flow(source=flow_dir)
# Hate_fairness is the actual backend metric name. Which, uh, doesn't sound great.
# so invert the name.
super().__init__(
metric=EvaluationMetrics.HATE_FAIRNESS,
project_scope=project_scope,
credential=credential,
)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Expand All @@ -58,14 +62,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
:return: The hate unfairness score.
:rtype: dict
"""

# Run the evaluation flow
output = self._flow(
metric_name=EvaluationMetrics.HATE_FAIRNESS,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)

return output["result"]
return super().__call__(question=question, answer=answer, **kwargs)
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from pathlib import Path

from promptflow.client import load_flow

from .flow.constants import EvaluationMetrics


class SelfHarmEvaluator:
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
try:
from .common.constants import EvaluationMetrics
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
except ImportError:
from common.constants import EvaluationMetrics
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase


class SelfHarmEvaluator(ContentSafetySubEvaluatorBase):
"""
Initialize a self harm evaluator for self harm score.

Expand Down Expand Up @@ -39,13 +43,11 @@ class SelfHarmEvaluator:
"""

def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

# Load the flow as function
current_dir = Path(__file__).resolve().parent
flow_dir = current_dir / "flow"
self._flow = load_flow(source=flow_dir)
super().__init__(
metric=EvaluationMetrics.SELF_HARM,
project_scope=project_scope,
credential=credential,
)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Expand All @@ -59,13 +61,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
:rtype: dict
"""

# Run the evaluation flow
output = self._flow(
metric_name=EvaluationMetrics.SELF_HARM,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)

return output["result"]
return super().__call__(question=question, answer=answer, **kwargs)
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from pathlib import Path

from promptflow.client import load_flow

from .flow.constants import EvaluationMetrics


class SexualEvaluator:
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
try:
from .common.constants import EvaluationMetrics
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
except ImportError:
from common.constants import EvaluationMetrics
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase


class SexualEvaluator(ContentSafetySubEvaluatorBase):
"""
Initialize a sexual evaluator for sexual score.

Expand Down Expand Up @@ -39,13 +43,11 @@ class SexualEvaluator:
"""

def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

# Load the flow as function
current_dir = Path(__file__).resolve().parent
flow_dir = current_dir / "flow"
self._flow = load_flow(source=flow_dir)
super().__init__(
metric=EvaluationMetrics.SEXUAL,
project_scope=project_scope,
credential=credential,
)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Expand All @@ -58,14 +60,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
:return: The sexual score.
:rtype: dict
"""

# Run the evaluation flow
output = self._flow(
metric_name=EvaluationMetrics.SEXUAL,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)

return output["result"]
return super().__call__(question=question, answer=answer, **kwargs)
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from pathlib import Path

from promptflow.client import load_flow

from .flow.constants import EvaluationMetrics


class ViolenceEvaluator:
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
try:
from .common.constants import EvaluationMetrics
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
except ImportError:
from common.constants import EvaluationMetrics
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase


class ViolenceEvaluator(ContentSafetySubEvaluatorBase):
"""
Initialize a violence evaluator for violence score.

Expand Down Expand Up @@ -39,13 +43,11 @@ class ViolenceEvaluator:
"""

def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

# Load the flow as function
current_dir = Path(__file__).resolve().parent
flow_dir = current_dir / "flow"
self._flow = load_flow(source=flow_dir)
super().__init__(
metric=EvaluationMetrics.VIOLENCE,
project_scope=project_scope,
credential=credential,
)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Expand All @@ -58,14 +60,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
:return: The violence score.
:rtype: dict
"""

# Run the evaluation flow
output = self._flow(
metric_name=EvaluationMetrics.VIOLENCE,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)

return output["result"]
return super().__call__(question=question, answer=answer, **kwargs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from . import constants, evaluate_with_rai_service, validate_inputs, utils

__all__ = [
"constants",
"evaluate_with_rai_service",
"validate_inputs",
"utils",
]
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from enum import Enum


Expand Down
Loading
Loading