diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 79b316dad..922e89ddb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,6 +15,8 @@ jobs:
run: yarn
- working-directory: website
run: yarn check-thumbnails
+ - working-directory: website
+ run: yarn check-authors
- working-directory: website
run: |
yarn fmt
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index abf8480a5..2037dc94c 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -18,7 +18,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: 3.9
- uses: actions/setup-node@v3
with:
node-version: 18
diff --git a/.github/workflows/format-release-note.yml b/.github/workflows/format-release-note.yml
index e4a3ace16..0c6fb78ac 100644
--- a/.github/workflows/format-release-note.yml
+++ b/.github/workflows/format-release-note.yml
@@ -53,7 +53,7 @@ jobs:
repo: repo.repo,
pull_number: prNumber,
});
- const filenames = files.map(file => file.filename);
+ const filenames = files.filter(file => file.status === "added").map(file => file.filename);
const { formatMarkdown } = require(".github/workflows/format-release-note.js");
formatMarkdown({ filenames })
diff --git a/website/blog/2023-12-01-ai-gateway-rename.md b/website/blog/2023-12-01-ai-gateway-rename.md
deleted file mode 100644
index b5fb20d53..000000000
--- a/website/blog/2023-12-01-ai-gateway-rename.md
+++ /dev/null
@@ -1,37 +0,0 @@
----
-title: MLflow AI Gateway renamed to MLflow Deployments for LLMs
-tags: [ai]
-slug: ai-gateway-rename
-authors: [mlflow-maintainers]
-thumbnail: /img/blog/ai-gateway.png
----
-
-If you are currently using the MLflow AI Gateway, please read this in full to get critically important information about this feature!
-
-# 🔔 Important Update Regarding the MLflow AI Gateway
-
-Please note that the feature previously known as the `MLflow AI Gateway`, which was in an experimental phase, has undergone significant updates and improvements.
-
-
-
-## Introducing "MLflow Deployments for LLMs"
-
-This feature, while still in experimental status, has been renamed and migrated to utilize the `deployments` API.
-
-## 🔑 Key Changes
-
-**New Name, Enhanced Features**: The transition from "MLflow AI Gateway" to "MLflow Deployments for LLMs" reflects not only a change in name but also substantial enhancements in usability and **standardization** for API endpoints for Large Language Models.
-
-**API Changes**: With this move, there are changes to the API endpoints and configurations. Users are encouraged to review the updated API documentation to familiarize themselves with the new structure.
-
-**Migration Path**: For existing projects using "MLflow AI Gateway", a migration guide is available to assist with the transition to "MLflow Deployments for LLMs". This guide provides step-by-step instructions and best practices to ensure a smooth migration.
-
-⚠️ **Action Required**: Users who have been utilizing the experimental "MLflow AI Gateway" should plan to migrate to "MLflow Deployments for LLMs". While we aim to make this transition as seamless as possible, manual changes to your code and deployment configurations will be necessary. This new namespace for deploying the previously-known-as AI Gateway will be released in version 2.9.0. The old AI Gateway references will remain active but will enter a deprecated state. _We will be removing the entire AI Gateway namespace in a future release_.
-
-## 📚 Resources and Support
-
-**Updated Documentation**: Detailed documentation for "MLflow Deployments for LLMs" is available [here](pathname:///docs/latest/llms/deployments/index.html). It includes comprehensive information about the modifications to API interfaces, updates to the input and output structures for queries and responses, API utilization, and the updated configuration options.
-
-**Community and Support**: If you have any questions or need assistance, please reach out to the maintainers [on GitHub](https://github.com/mlflow/mlflow/issues).
-
-We are excited about these advancements and strongly believe that leveraging the deployments API will offer a more robust, efficient, and scalable solution for managing your Large Language Model deployments. Thank you for your continued support and collaboration!
diff --git a/website/blog/2024-08-06-langgraph-model-from-code/_img/mlflow_ui_experiment_traces.png b/website/blog/2024-08-06-langgraph-model-from-code/_img/mlflow_ui_experiment_traces.png
new file mode 100644
index 000000000..39be1012d
Binary files /dev/null and b/website/blog/2024-08-06-langgraph-model-from-code/_img/mlflow_ui_experiment_traces.png differ
diff --git a/website/blog/2024-08-06-langgraph-model-from-code/_img/mlflow_ui_trace.png b/website/blog/2024-08-06-langgraph-model-from-code/_img/mlflow_ui_trace.png
new file mode 100644
index 000000000..8f8c4c3ef
Binary files /dev/null and b/website/blog/2024-08-06-langgraph-model-from-code/_img/mlflow_ui_trace.png differ
diff --git a/website/blog/2024-08-06-langgraph-model-from-code/index.md b/website/blog/2024-08-06-langgraph-model-from-code/index.md
new file mode 100644
index 000000000..6207df833
--- /dev/null
+++ b/website/blog/2024-08-06-langgraph-model-from-code/index.md
@@ -0,0 +1,265 @@
+---
+title: LangGraph with Model From Code
+tags: [genai, mlops]
+slug: langgraph-model-from-code
+authors: [michael-berk, mlflow-maintainers]
+thumbnail: /img/blog/release-candidates.png
+---
+
+In this blog, we'll guide you through creating a LangGraph chatbot using MLflow. By combining MLflow with LangGraph's ability to create and manage cyclical graphs, you can create powerful stateful, multi-actor applications in a scalable fashion.
+
+Throughout this post we will demonstrate how to leverage MLflow's capabilities to create a serializable and servable MLflow model which can easily be tracked, versioned, and deployed on a variety of servers. We'll be using the [langchain flavor](https://mlflow.org/docs/latest/llms/langchain/index.html) combined with MLflow's [model from code](https://mlflow.org/docs/latest/models.html#models-from-code) feature.
+
+### What is LangGraph?
+
+[LangGraph](https://langchain-ai.github.io/langgraph/) is a library for building stateful, multi-actor applications with LLMs, used to create agent and multi-agent workflows. Compared to other LLM frameworks, it offers these core benefits:
+
+- **Cycles and Branching**: Implement loops and conditionals in your apps.
+- **Persistence**: Automatically save state after each step in the graph. Pause and resume the graph execution at any point to support error recovery, human-in-the-loop workflows, time travel and more.
+- **Human-in-the-Loop**: Interrupt graph execution to approve or edit next action planned by the agent.
+- **Streaming Support**: Stream outputs as they are produced by each node (including token streaming).
+- **Integration with LangChain**: LangGraph integrates seamlessly with LangChain.
+
+LangGraph allows you to define flows that involve cycles, essential for most agentic architectures, differentiating it from DAG-based solutions. As a very low-level framework, it provides fine-grained control over both the flow and state of your application, crucial for creating reliable agents. Additionally, LangGraph includes built-in persistence, enabling advanced human-in-the-loop and memory features.
+
+LangGraph is inspired by Pregel and Apache Beam. The public interface draws inspiration from NetworkX. LangGraph is built by LangChain Inc, the creators of LangChain, but can be used without LangChain.
+
+For a full walkthrough, check out the [LangGraph Quickstart](https://langchain-ai.github.io/langgraph/tutorials/introduction/) and for more on the fundamentals of design with LangGraph, check out the [conceptual guides](https://langchain-ai.github.io/langgraph/concepts/#human-in-the-loop).
+
+## 1 - Setup
+
+First, we must install the required dependencies. We will use OpenAI for our LLM in this example, but using LangChain with LangGraph makes it easy to substitute any alternative supported LLM or LLM provider.
+
+```python
+%%capture
+%pip install langchain_openai==0.2.0 langchain==0.3.0 langgraph==0.2.27
+%pip install -U mlflow
+```
+
+Next, let's get our relevant secrets. `getpass`, as demonstrated in the [LangGraph quickstart](https://langchain-ai.github.io/langgraph/tutorials/introduction/#setup), is a great way to insert your keys into an interactive Jupyter environment.
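+
+For example, a minimal sketch of that pattern (one option among several): prompt for the key only if it isn't already set, then run the check below.
+
+```python
+import os
+from getpass import getpass
+
+# Prompt interactively for the key if it is not already present in the environment.
+if "OPENAI_API_KEY" not in os.environ:
+    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
+```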
+
+```python
+import os
+
+# Set required environment variables for authenticating to OpenAI
+# Check additional MLflow tutorials for examples of authentication if needed
+# https://mlflow.org/docs/latest/llms/openai/guide/index.html#direct-openai-service-usage
+assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."
+```
+
+## 2 - Custom Utilities
+
+While this is a demo, it's good practice to separate reusable utilities into a separate file/directory. Below we create three general utilities that would theoretically be valuable when building additional MLflow + LangGraph implementations.
+
+Note that we use the magic `%%writefile` command to create a new file in a Jupyter notebook context. If you're running this outside of an interactive notebook, simply create the file below, omitting the `%%writefile {FILE_NAME}.py` line.
+
+```python
+%%writefile langgraph_utils.py
+# omit this line if directly creating this file; this command is purely for running within Jupyter
+
+import os
+from typing import Union
+from langgraph.pregel.io import AddableValuesDict
+
+def _langgraph_message_to_mlflow_message(
+ langgraph_message: AddableValuesDict,
+) -> dict:
+ langgraph_type_to_mlflow_role = {
+ "human": "user",
+ "ai": "assistant",
+ "system": "system",
+ }
+
+ if type_clean := langgraph_type_to_mlflow_role.get(langgraph_message.type):
+ return {"role": type_clean, "content": langgraph_message.content}
+ else:
+ raise ValueError(f"Incorrect role specified: {langgraph_message.type}")
+
+
+def get_most_recent_message(response: AddableValuesDict) -> dict:
+ most_recent_message = response.get("messages")[-1]
+ return _langgraph_message_to_mlflow_message(most_recent_message)["content"]
+
+
+def increment_message_history(
+ response: AddableValuesDict, new_message: Union[dict, AddableValuesDict]
+) -> list[dict]:
+ if isinstance(new_message, AddableValuesDict):
+ new_message = _langgraph_message_to_mlflow_message(new_message)
+
+ message_history = [
+ _langgraph_message_to_mlflow_message(message)
+ for message in response.get("messages")
+ ]
+
+ return message_history + [new_message]
+```
+
+By the end of this step, you should see a new file in your current directory with the name `langgraph_utils.py`.
+
+Note that it's best practice to add unit tests and properly organize your project into logically structured directories.
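+
+As a quick illustration, here is a minimal sketch of what such a unit test could look like, assuming `pytest` and using plain dicts in place of LangGraph's `AddableValuesDict` (which is a dict subclass); the file name `test_langgraph_utils.py` is just a suggestion.
+
+```python
+# test_langgraph_utils.py
+from langchain_core.messages import AIMessage, HumanMessage
+
+from langgraph_utils import get_most_recent_message, increment_message_history
+
+
+def test_get_most_recent_message():
+    # A plain dict stands in for the graph's AddableValuesDict response.
+    response = {"messages": [HumanMessage(content="Hi"), AIMessage(content="Hello!")]}
+    assert get_most_recent_message(response) == "Hello!"
+
+
+def test_increment_message_history():
+    response = {"messages": [HumanMessage(content="Hi")]}
+    new_message = {"role": "user", "content": "What's MLflow?"}
+    assert increment_message_history(response, new_message) == [
+        {"role": "user", "content": "Hi"},
+        {"role": "user", "content": "What's MLflow?"},
+    ]
+```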
+
+## 3 - Log the LangGraph Model
+
+Great! Now that we have some reusable utilities located in `./langgraph_utils.py`, we are ready to log the model with MLflow's official LangGraph flavor.
+
+### 3.1 - Create our Model-From-Code File
+
+First, some quick background. MLflow serializes model artifacts to the MLflow tracking server. Many popular ML packages don't have robust serialization and deserialization support, so MLflow augments this functionality via the [models from code](https://mlflow.org/docs/latest/models.html#models-from-code) feature. With models from code, we're able to leverage Python itself as the serialization format, instead of popular alternatives such as JSON or pickle. This opens up far more flexibility and stability.
+
+To create a Python file with models from code, we must perform the following steps:
+
+1. Create a new Python file. Let's call it `graph.py`.
+2. Define our LangGraph graph.
+3. Leverage [mlflow.models.set_model](https://mlflow.org/docs/latest/python_api/mlflow.models.html#mlflow.models.set_model) to indicate to MLflow which object in the Python script is our model of interest.
+
+That's it!
+
+```python
+%%writefile graph.py
+# omit this line if directly creating this file; this command is purely for running within Jupyter
+
+from langchain_openai import ChatOpenAI
+from langgraph.graph import StateGraph, START, END
+from langgraph.graph.message import add_messages
+from langgraph.graph.state import CompiledStateGraph
+
+import mlflow
+
+import os
+from typing import TypedDict, Annotated
+
+def load_graph() -> CompiledStateGraph:
+ """Create example chatbot from LangGraph Quickstart."""
+
+ assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."
+
+ class State(TypedDict):
+ messages: Annotated[list, add_messages]
+
+ graph_builder = StateGraph(State)
+ llm = ChatOpenAI()
+
+ def chatbot(state: State):
+ return {"messages": [llm.invoke(state["messages"])]}
+
+ graph_builder.add_node("chatbot", chatbot)
+ graph_builder.add_edge(START, "chatbot")
+ graph_builder.add_edge("chatbot", END)
+ graph = graph_builder.compile()
+ return graph
+
+# Set our model to be leveraged via model from code
+mlflow.models.set_model(load_graph())
+```
+
+### 3.2 - Log with "Model from Code"
+
+After creating this implementation, we can leverage the standard MLflow APIs to log the model.
+
+```python
+import mlflow
+
+with mlflow.start_run():
+ model_info = mlflow.langchain.log_model(
+ lc_model="graph.py", # Path to our model Python file
+ artifact_path="langgraph",
+ )
+
+ model_uri = model_info.model_uri
+```
+
+## 4 - Use the Logged Model
+
+Now that we have successfully logged a model, we can load it and leverage it for inference.
+
+In the code below, we demonstrate that our chain has chatbot functionality!
+
+```python
+import mlflow
+
+# Custom utilities for handling chat history
+from langgraph_utils import (
+ increment_message_history,
+ get_most_recent_message,
+)
+
+# Enable tracing
+mlflow.set_experiment("Tracing example") # In Databricks, use an absolute path. Visit Databricks docs for more.
+mlflow.langchain.autolog()
+
+# Load the model
+loaded_model = mlflow.langchain.load_model(model_uri)
+
+# Show inference and message history functionality
+print("-------- Message 1 -----------")
+message = "What's my name?"
+payload = {"messages": [{"role": "user", "content": message}]}
+response = loaded_model.invoke(payload)
+
+print(f"User: {message}")
+print(f"Agent: {get_most_recent_message(response)}")
+
+print("\n-------- Message 2 -----------")
+message = "My name is Morpheus."
+new_messages = increment_message_history(response, {"role": "user", "content": message})
+payload = {"messages": new_messages}
+response = loaded_model.invoke(payload)
+
+print(f"User: {message}")
+print(f"Agent: {get_most_recent_message(response)}")
+
+print("\n-------- Message 3 -----------")
+message = "What is my name?"
+new_messages = increment_message_history(response, {"role": "user", "content": message})
+payload = {"messages": new_messages}
+response = loaded_model.invoke(payload)
+
+print(f"User: {message}")
+print(f"Agent: {get_most_recent_message(response)}")
+```
+
+Output:
+
+```text
+-------- Message 1 -----------
+User: What's my name?
+Agent: I'm sorry, I cannot guess your name as I do not have access to that information. If you would like to share your name with me, feel free to do so.
+
+-------- Message 2 -----------
+User: My name is Morpheus.
+Agent: Nice to meet you, Morpheus! How can I assist you today?
+
+-------- Message 3 -----------
+User: What is my name?
+Agent: Your name is Morpheus.
+```
+
+### 4.1 - MLflow Tracing
+
+Before concluding, let's demonstrate [MLflow tracing](https://mlflow.org/docs/latest/llms/tracing/index.html).
+
+MLflow Tracing is a feature that enhances LLM observability in your Generative AI (GenAI) applications by capturing detailed information about the execution of your application’s services. Tracing provides a way to record the inputs, outputs, and metadata associated with each intermediate step of a request, enabling you to easily pinpoint the source of bugs and unexpected behaviors.
+
+Start the MLflow server as outlined in the [tracking server docs](https://mlflow.org/docs/latest/tracking/server.html) and point your client at it. After opening the MLflow UI, we can see our experiment and corresponding traces.
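+
+A minimal local setup, assuming default host and port values, might look like this sketch:
+
+```python
+# In a separate shell: mlflow server --host 127.0.0.1 --port 5000
+import mlflow
+
+# Point the client at the locally running tracking server before re-running
+# the inference code above (host and port are assumptions; adjust as needed).
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.set_experiment("Tracing example")
+```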
+
+![MLflow UI Experiment Traces](_img/mlflow_ui_experiment_traces.png)
+
+As you can see, we've logged our traces and can easily view them by clicking our experiment of interest and then the "Tracing" tab.
+
+![MLflow UI Trace](_img/mlflow_ui_trace.png)
+
+After clicking on one of the traces, we can see the run execution for a single query. Notice that we log inputs, outputs, and rich metadata such as token usage and invocation parameters. As we scale our application both from a usage and complexity perspective, this thread-safe and highly performant tracking system will ensure robust monitoring of the app.
+
+## 5 - Summary
+
+There are many logical extensions of this tutorial; however, the MLflow components can remain largely unchanged. Some examples include persisting chat history to a database, implementing a more complex LangGraph graph, productionizing this solution, and much more!
+
+To summarize, here's what was covered in this tutorial:
+
+- Creating a simple LangGraph chain.
+- Leveraging MLflow [model from code](https://mlflow.org/docs/latest/models.html#models-from-code) functionality to log our graph.
+- Loading the model via the standard MLflow APIs.
+- Leveraging [MLflow tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) to view graph execution.
+
+Happy coding!
diff --git a/website/blog/2024-08-06-langgraph-pyfunc/index.md b/website/blog/2024-08-06-langgraph-pyfunc/index.md
deleted file mode 100644
index 9367ad415..000000000
--- a/website/blog/2024-08-06-langgraph-pyfunc/index.md
+++ /dev/null
@@ -1,333 +0,0 @@
----
-title: LangGraph with Custom PyFunc
-tags: [genai, mlops]
-slug: mlflow
-authors: [michael-berk, mlflow-maintainers]
-thumbnail: /img/blog/release-candidates.png
----
-
-In this blog, we'll guide you through creating a LangGraph chatbot within an MLflow custom PyFunc. By combining MLflow with LangGraph's ability to create and manage cyclical graphs, you can create powerful stateful, multi-actor applications in a scalable fashion.
-
-Throughout this post we will demonstrate how to leverage MLflow's ChatModel to create a serializable and servable MLflow model which can easily be tracked, versioned, and deployed on a variety of servers.
-
-### What is a Custom PyFunc?
-
-While MLflow strives to cover many popular machine learning libraries, there has been a proliferation of open source packages. If users want MLflow's myriad benefits paired with a package that doesn't have native support, users can create a [custom PyFunc model](https://mlflow.org/docs/latest/traditional-ml/creating-custom-pyfunc/index.html or https://mlflow.org/blog/custom-pyfunc).
-Custom PyFunc models allow you to integrate any Python code, providing flexibility in defining GenAI apps and AI models. These models can be easily logged, managed, and deployed using the typical MLflow APIs, enhancing flexibility and portability in machine learning workflows.
-
-Within the category of custom PyFunc models, MLflow supports a specialized model called [ChatModel](https://mlflow.org/docs/latest/llms/transformers/tutorials/conversational/pyfunc-chat-model.html). It extends the base PyFunc functionality to specifically support messages. For this demo, we will use ChatModel to create a LangGraph chatbot.
-
-### What is LangGraph?
-
-[LangGraph](https://langchain-ai.github.io/langgraph/) is a library for building stateful, multi-actor applications with LLMs, used to create agent and multi-agent workflows. Compared to other LLM frameworks, it offers these core benefits:
-
-- **Cycles and Branching**: Implement loops and conditionals in your apps.
-- **Persistence**: Automatically save state after each step in the graph. Pause and resume the graph execution at any point to support error recovery, human-in-the-loop workflows, time travel and more.
-- **Human-in-the-Loop**: Interrupt graph execution to approve or edit next action planned by the agent.
-- **Streaming Support**: Stream outputs as they are produced by each node (including token streaming).
-- **Integration with LangChain**: LangGraph integrates seamlessly with LangChain and LangSmith (but does not require them).
-
-LangGraph allows you to define flows that involve cycles, essential for most agentic architectures, differentiating it from DAG-based solutions. As a very low-level framework, it provides fine-grained control over both the flow and state of your application, crucial for creating reliable agents. Additionally, LangGraph includes built-in persistence, enabling advanced human-in-the-loop and memory features.
-
-LangGraph is inspired by Pregel and Apache Beam. The public interface draws inspiration from NetworkX. LangGraph is built by LangChain Inc, the creators of LangChain, but can be used without LangChain.
-
-For a full walkthrough, check out the [LangGraph Quickstart](https://langchain-ai.github.io/langgraph/tutorials/introduction/) and for more on the fundamentals of design with LangGraph, check out the [conceptual guides](https://langchain-ai.github.io/langgraph/concepts/#human-in-the-loop).
-
-## 1 - Setup
-
-First, we must install the required dependencies. We will use OpenAI for our LLM in this example, but using LangChain with LangGraph makes it easy to substitute any alternative supported LLM or LLM provider.
-
-```python
-%%capture
-%pip install langgraph==0.2.3 langsmith==0.1.98 mlflow>=2.15.1
-%pip install -U typing_extensions
-%pip install langchain_openai==0.1.21
-```
-
-Next, let's get our relevant secrets. `getpass`, as demonstrated in the [LangGraph quickstart](https://langchain-ai.github.io/langgraph/tutorials/introduction/#setup) is a great way to insert your keys into an interactive jupyter environment.
-
-```python
-import os
-
-# Set required environment variables for authenticating to OpenAI and LangSmith
-# Check additional MLflow tutorials for examples of authentication if needed
-# https://mlflow.org/docs/latest/llms/openai/guide/index.html#direct-openai-service-usage
-assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."
-assert "LANGSMITH_API_KEY" in os.environ, "Please set the LANGSMITH_API_KEY environment variable."
-```
-
-## 2 - Custom Utilities
-
-While this is a demo, it's good practice to separate reusable utilities into a separate file/directory. Below we create three general utilities that theoretically would valuable when building additional MLflow + LangGraph implementations.
-
-Note that we use the magic `%%writefile` command to create a new file in a jupyter notebook context. If you're running this outside of an interactive notebook, simply create the file below, omitting the `%%writefile {FILE_NAME}.py` line.
-
-```python
-%%writefile langgraph_utils.py
-# omit this line if directly creating this file; this command is purely for running within Jupyter
-
-import os
-from typing import Union, List, Dict
-
-from langchain_core.messages import (
- AIMessage,
- HumanMessage,
- SystemMessage,
- messages_from_dict,
-)
-from mlflow.types.llm import ChatMessage
-
-
-def validate_langgraph_environment_variables():
- """Ensure that required secrets and project environment variables are present."""
-
- # Validate enviornment variable secrets are present
- required_secrets = ["OPENAI_API_KEY", "LANGSMITH_API_KEY"]
-
- if missing_keys := [key for key in required_secrets if not os.environ.get(key)]:
- raise ValueError(f"The following keys are missing: {missing_keys}")
-
- # Add project environent variables if not present
- os.environ["LANCHAIN_TRACING_V2"] = os.environ.get("LANGCHAIN_TRACING_V2", "true")
- os.environ["LANGCHAIN_PROJECT"] = os.environ.get(
- "LANGCHAIN_TRACING_V2", "LangGraph MLflow Tutorial"
- )
-
-
-def _format_mlflow_chat_message_for_langraph_message(
- chat_message: ChatMessage,
-) -> Dict:
- mlflow_role_to_langgraph_type = {
- "user": "human",
- "assistant": "ai",
- "system": "system",
- }
-
- if role_clean := mlflow_role_to_langgraph_type.get(chat_message.role):
- return {"type": role_clean, "data": {"content": chat_message.content}}
- else:
- raise ValueError(f"Incorrect role specified: {chat_message.role}")
-
-
-def mlflow_chat_message_to_langgraph_message(
- chat_message: List[ChatMessage],
-) -> List[Union[AIMessage, HumanMessage, SystemMessage]]:
- """Convert MLflow messages (list of mlflow.types.llm.ChatMessage) to LangGraph messages.
-
- This utility is required because LangGraph messages have a different structure and type
- than MLflow ChatMessage. If we pass the payload coming into our `predict()` method directly
- into the LangGraph graph, we'll get an error.
- """
- # NOTE: This is a simplified example for demonstration purposes
- if isinstance(chat_message, list):
- list_of_parsed_dicts = [
- _format_mlflow_chat_message_for_langraph_message(d) for d in chat_message
- ]
- return messages_from_dict(list_of_parsed_dicts)
- else:
- raise ValueError(f"Invalid _dict type: {type(chat_message)}")
-
-```
-
-By the end of this step, you should see a new file in your current directory with the name `langgraph_utils.py`.
-
-Note that it's best practice to add unit tests and properly organize your project into logically structured directories.
-
-## 3 - Custom PyFunc ChatModel
-
-Great! Now that we have some reusable utilities located in `./langgraph_utils.py`, we are ready to declare a custom PyFunc and log the model. However, before writing more code, let's provide some quick background on the **Model from Code** feature.
-
-### 3.1 - Create our Model-From-Code File
-
-Historically, MLflow's process of saving a custom `pyfunc` model uses a mechanism that has some frustrating drawbacks: `cloudpickle`. Prior to the release of support for saving a model as a Python script in MLflow 2.12.2 (known as the [models from code](https://mlflow.org/docs/latest/models.html#models-from-code) feature), logging a defined `pyfunc` involved pickling an instance of that model. Along with the pickled model artifact, MLflow will store the signature, which can be passed or inferred from the `model_input` parameter. It will also log inferred model dependencies to help you serve the model in a new environment.
-
-Pickle is an easy-to-use serialization mechanism, but it has a variety of limitations:
-
-- **Limited Support for Some Data Types**: `cloudpickle` may struggle with serializing certain complex or low-level data types, such as file handles, sockets, or objects containing these types, which can lead to errors or incorrect deserialization.
-- **Version Compatibility Issues**: Serialized objects with `cloudpickle` may not be deserializable across different versions of `cloudpickle` or Python, making long-term storage or sharing between different environments risky.
-- **Recursion Depth for Nested Dependencies**: `cloudpickle` can serialize objects with nested dependencies (e.g., functions within functions, or objects that reference other objects). However, deeply nested dependencies can hit the recursion depth limit imposed by Python's interpreter.
-- **Mutable Object States that Cannot be Serialized**: `cloudpickle` struggles to serialize certain mutable objects whose states change during runtime, especially if these objects contain non-serializable elements like open file handles, thread locks, or custom C extensions. Even if `cloudpickle` can serialize the object structure, it may fail to capture the exact state or may not be able to deserialize the state accurately, leading to potential data loss or incorrect behavior upon deserialization.
-
-To get around this issue, we must perform the following steps:
-
-1. Create an additional Python file in our directory.
-2. In that file, create a function that creates a [CompiledStateGraph](https://langchain-ai.github.io/langgraph/tutorials/introduction/#part-1-build-a-basic-chatbot), which is DAG-based stateful chatbot.
-3. Also in that file, create a [MLflow custom PyFunc](https://mlflow.org/docs/latest/traditional-ml/creating-custom-pyfunc/index.html). Note that in our case, we're using a [custom ChatModel](https://mlflow.org/docs/latest/llms/transformers/tutorials/conversational/pyfunc-chat-model.html#Customizing-the-model).
-4. Also in that file, set the custom ChatModel to be accessible by [MLflow model from code](https://mlflow.org/docs/latest/models.html#models-from-code) via the [mlflow.models.set_model()](https://mlflow.org/docs/latest/python_api/mlflow.models.html#mlflow.models.set_model) command.
-5. In a different file, log the **path** to the file created in steps 1-3 instead of the model object.
-
-By passing a Python file, we simply can load the model from that Python code, thereby bypassing all the headaches associated with serialization and `cloudpickle`.
-
-```python
-%%writefile graph_chain.py
-# omit this line if directly creating this file; this command is purely for running within Jupyter
-
-from langchain_openai import ChatOpenAI
-from langgraph.graph import StateGraph, START, END
-from langgraph.graph.message import add_messages
-from langgraph.graph.state import CompiledStateGraph
-
-# Our custom utilities
-from langgraph_utils import (
- mlflow_chat_message_to_langgraph_message,
- validate_langgraph_environment_variables,
-)
-
-import mlflow
-from mlflow.types.llm import ChatMessage, ChatParams, ChatResponse
-
-import random
-from typing import Annotated, List
-from typing_extensions import TypedDict
-
-
-def load_graph() -> CompiledStateGraph:
- """Create example chatbot from LangGraph Quickstart."""
-
- class State(TypedDict):
- messages: Annotated[list, add_messages]
-
- graph_builder = StateGraph(State)
- llm = ChatOpenAI()
-
- def chatbot(state: State):
- return {"messages": [llm.invoke(state["messages"])]}
-
- graph_builder.add_node("chatbot", chatbot)
- graph_builder.add_edge(START, "chatbot")
- graph_builder.add_edge("chatbot", END)
- return graph_builder.compile()
-
-
-class LangGraphChatModel(mlflow.pyfunc.ChatModel):
- def load_context(self, context):
- self.graph = load_graph()
-
- def predict(
- self, context, messages: List[ChatMessage], params: ChatParams
- ) -> ChatResponse:
-
- # Format mlflow ChatMessage as LangGraph messages
- messages = mlflow_chat_message_to_langgraph_message(messages)
-
- # Query the model
- response = self.graph.invoke({"messages": messages})
-
- # Extract the response text
- text = response["messages"][-1].content
-
- # NB: chat session ID should be handled on the client side. Here we
- # create a placeholder for demonstration purposes. Furthermore, if you
- # need persistance between model sessions, it's a good idea to
- # write your session history to a database.
- id = f"some_meaningful_id_{random.randint(0, 100)}"
-
- # Format the response to be compatible with MLflow
- response = {
- "id": id,
- "model": "MyChatModel",
- "choices": [
- {
- "index": 0,
- "message": {"role": "assistant", "content": text},
- "finish_reason": "stop",
- }
- ],
- "usage": {},
- }
-
- return ChatResponse(**response)
-
-
-# Set our model to be accessible by MLflow model from code
-mlflow.models.set_model(LangGraphChatModel())
-```
-
-### 3.2 - Log our Model-From-Code
-
-After creating this ChatModel implementation in we leverage the standard MLflow APIs to log the model. However, as noted above, instead of passing a model object, we pass the path `str` to the file containing our `mlflow.models.set_model()` command.
-
-```python
-import mlflow
-
-# Save the model
-with mlflow.start_run() as run:
- # Log the model to the mlflow tracking server
- mlflow.pyfunc.log_model(
- python_model="graph_chain.py", # Path to our custom model
- artifact_path="langgraph_model",
- )
-
- # Store the run id for later loading
- run_id = run.info.run_id
-```
-
-## 4 - Use the Logged Model
-
-Now that we have successfully logged a model, we can load it and leverage it for inference.
-
-In the code below, we demonstrate that our chain has chatbot functionality!
-
-```python
-import mlflow
-
-# Load the model
-# NOTE: you need the run_id from the above step or another model URI format
-loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/langgraph_model")
-
-# Show inference and message history
-print("-------- Message 1 -----------")
-message = "What's my name?"
-payload = {"messages": [{"role": "user", "content": message}]}
-response = loaded_model.predict(payload)
-
-print(f"User: {message}")
-print(f"Agent: {response['choices'][-1]['message']['content']}")
-
-# print("\n-------- Message 2 -----------")
-message = "My name is Morpheus."
-message_history = [choice['message'] for choice in response['choices']]
-payload = {"messages": message_history + [{"role": "user", "content": message}]}
-response = loaded_model.predict(payload)
-
-print(f"User: {message}")
-print(f"Agent: {response['choices'][-1]['message']['content']}")
-
-# # print("\n-------- Message 3 -----------")
-message = "What's my name?"
-message_history = [choice['message'] for choice in response['choices']]
-payload = {"messages": message_history + [{"role": "user", "content": message}]}
-response = loaded_model.predict(payload)
-
-print(f"User: {message}")
-print(f"Agent: {response['choices'][-1]['message']['content']}")
-```
-
-Ouput:
-
-```text
--------- Message 1 -----------
-User: What's my name?
-Agent: I'm sorry, I don't know your name. Can you please tell me?
-
--------- Message 2 -----------
-User: My name is Morpheus.
-Agent: Nice to meet you, Morpheus! How can I assist you today?
-
--------- Message 3 -----------
-User: What's my name?
-Agent: Your name is Morpheus!
-```
-
-## 5 - Summary
-
-There are many logical extensions of the this tutorial, however the MLflow components can remain largely unchanged. Some examples include persisting chat history to a database, implementing a more complex langgraph object, productionizing this solution, and much more!
-
-To summarize, here's what was covered in this tutorial:
-
-- Creating a simple LangGraph chain.
-- Declaring a custom MLflow PyFunc ChatModel that wraps the above LangGraph chain with pre/post-processing logic.
-- Leveraging MLflow [model from code](https://mlflow.org/docs/latest/models.html#models-from-code) functionality to log our Custom PyFunc.
-- Loading the Custom PyFunc via the standard MLflow APIs.
-
-Happy coding!
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/boring_0.png b/website/blog/2024-08-29-autogen-pyfunc/_img/boring_0.png
new file mode 100644
index 000000000..f9a7fdcc8
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/boring_0.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/boring_1.png b/website/blog/2024-08-29-autogen-pyfunc/_img/boring_1.png
new file mode 100644
index 000000000..36d4d2a7c
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/boring_1.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/boring_2.png b/website/blog/2024-08-29-autogen-pyfunc/_img/boring_2.png
new file mode 100644
index 000000000..cc275a105
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/boring_2.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/cool_0.png b/website/blog/2024-08-29-autogen-pyfunc/_img/cool_0.png
new file mode 100644
index 000000000..ea8ed78d0
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/cool_0.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/cool_1.png b/website/blog/2024-08-29-autogen-pyfunc/_img/cool_1.png
new file mode 100644
index 000000000..660dd4524
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/cool_1.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/cool_2.png b/website/blog/2024-08-29-autogen-pyfunc/_img/cool_2.png
new file mode 100644
index 000000000..ec40feb52
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/cool_2.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/logged_images.png b/website/blog/2024-08-29-autogen-pyfunc/_img/logged_images.png
new file mode 100644
index 000000000..5d2d14051
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/logged_images.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_chat_completion_1.png b/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_chat_completion_1.png
new file mode 100644
index 000000000..e2001bcc8
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_chat_completion_1.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_detail.png b/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_detail.png
new file mode 100644
index 000000000..461fb8e62
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_detail.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_main_page.png b/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_main_page.png
new file mode 100644
index 000000000..1dc701a91
Binary files /dev/null and b/website/blog/2024-08-29-autogen-pyfunc/_img/tracing_main_page.png differ
diff --git a/website/blog/2024-08-29-autogen-pyfunc/index.md b/website/blog/2024-08-29-autogen-pyfunc/index.md
new file mode 100644
index 000000000..4a2ae192f
--- /dev/null
+++ b/website/blog/2024-08-29-autogen-pyfunc/index.md
@@ -0,0 +1,705 @@
+---
+title: AutoGen with Custom PyFunc
+description: A guide for building an autonomous image generation agent
+tags: [genai, mlops]
+slug: autogen-image-agent
+authors: [michael-berk, mlflow-maintainers]
+thumbnail: /img/blog/autogen-blog.png
+---
+
+In this blog, we'll guide you through creating an [AutoGen](https://microsoft.github.io/autogen/) agent framework within an MLflow custom PyFunc. By combining MLflow with AutoGen's ability to create multi-agent frameworks, we are able to create scalable and stable GenAI applications.
+
+## Agent Frameworks
+
+Agent frameworks enable autonomous agents to handle complex, multi-turn tasks by integrating discrete logic at each step. These frameworks are crucial for LLM-driven workflows, where agents manage dynamic interactions across multiple stages. Each agent operates based on specific logic, enabling precise task automation, decision-making, and coordination. This is ideal for applications like workflow orchestration, customer support, and multi-agent systems, where LLMs must interpret evolving context and respond accordingly.
+
+
+
+## Agent Frameworks with AutoGen
+
+AutoGen is an open-source programming framework designed for building agent-based AI systems. It offers a multi-agent conversation framework, allowing users to build [complex LLM workflows](https://microsoft.github.io/autogen/docs/Examples/) using high-level abstractions. AutoGen simplifies the creation of diverse applications across various domains by providing pre-built systems. Additionally, it enhances LLM inference and optimization through specialized APIs, improving performance and reducing operational costs. The framework is tailored to streamline the development and deployment of agentic AI solutions.
+
+## Setup
+
+First, let's install the required dependencies. Note that pyautogen requires `python>=3.9`.
+
+### Environment Setup
+
+```shell
+%pip install pyautogen mlflow -U -q
+```
+
+We must also get API credentials to use an LLM. For this tutorial, we'll be using OpenAI. Note that a great way to securely pass tokens to your interactive Python environment is via the [getpass](https://docs.python.org/3/library/getpass.html) package.
+
+```python
+import os
+from getpass import getpass
+
+os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY:")
+
+assert os.getenv("OPENAI_API_KEY"), "Please set an OPENAI_API_KEY environment variable."
+```
+
+Great! We've set up our authentication configuration and are ready to start building an agent framework.
+
+## Create Our Agent Framework with AutoGen and MLflow
+
+In this tutorial we will be creating an image generation agent framework. Much of the code is copied and modified from the [autogen tutorial](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_dalle_and_gpt4v.ipynb), but the core agent functionality remains the same.
+
+### Agent Code
+
+You don't have to worry about the specifics of the implementation. At a high level, we are creating an agent framework that...
+
+1. Takes a prompt.
+2. Leverages [OpenAI's DALLE](https://openai.com/index/dall-e-3/) to create an image based on that prompt.
+3. Iteratively "catifies" the image, i.e., adds fluffy cats to it.
+
+Step 3 is where AutoGen shines. We're able to leverage AutoGen's [MultimodalConversableAgent](https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/multimodal_conversable_agent#multimodalconversableagent) to create a critic agent that observes the images and, based on a system prompt provided by the user to "add fluffy cats", gives feedback on how the prompt should be improved.
+
+```python
+import os
+import re
+from typing import Dict, List, Optional, Union
+
+import matplotlib.pyplot as plt
+import PIL
+from diskcache import Cache
+from openai import OpenAI
+from PIL import Image
+
+from autogen import Agent, AssistantAgent, ConversableAgent, UserProxyAgent
+from autogen.agentchat.contrib.img_utils import _to_pil, get_image_data, get_pil_image
+from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent
+
+# Define our prompt of interest
+CRITIC_PROMPT = """Add fluffy cats. Like a lot of cats. If there's less than 100 cats I'll be mad."""
+
+# Define our LLM configurations
+
+def dalle_call(client: OpenAI, model: str, prompt: str, size: str, quality: str, n: int) -> str:
+ """
+ Generate an image using OpenAI's DALL-E model and cache the result.
+
+ This function takes a prompt and other parameters to generate an image using OpenAI's DALL-E model.
+ It checks if the result is already cached; if so, it returns the cached image data. Otherwise,
+ it calls the DALL-E API to generate the image, stores the result in the cache, and then returns it.
+
+ Args:
+ client (OpenAI): The OpenAI client instance for making API calls.
+ model (str): The specific DALL-E model to use for image generation.
+ prompt (str): The text prompt based on which the image is generated.
+ size (str): The size specification of the image.
+ quality (str): The quality setting for the image generation.
+ n (int): The number of images to generate.
+
+ Returns:
+ str: The image data as a string, either retrieved from the cache or newly generated.
+
+ Note:
+ - The cache is stored in a directory named '.cache/'.
+ - The function uses a tuple of (model, prompt, size, quality, n) as the key for caching.
+ - The image data is obtained by making a secondary request to the URL provided by the DALL-E API response.
+ """
+ # Function implementation...
+ cache = Cache(".cache/") # Create a cache directory
+ key = (model, prompt, size, quality, n)
+ if key in cache:
+ return cache[key]
+
+ # If not in cache, compute and store the result
+ response = client.images.generate(
+ model=model,
+ prompt=prompt,
+ size=size,
+ quality=quality,
+ n=n,
+ )
+ image_url = response.data[0].url
+ img_data = get_image_data(image_url)
+ cache[key] = img_data
+
+ return img_data
+
+def extract_img(agent: Agent) -> PIL.Image:
+ """
+ Extracts an image from the last message of an agent and converts it to a PIL image.
+
+ This function searches the last message sent by the given agent for an image tag,
+ extracts the image data, and then converts this data into a PIL (Python Imaging Library) image object.
+
+ Parameters:
+ agent (Agent): An instance of an agent from which the last message will be retrieved.
+
+ Returns:
+ PIL.Image: A PIL image object created from the extracted image data.
+
+ Note:
+    - The function assumes that the last message contains an <img> tag with image data.
+    - The image data is extracted using a regular expression that searches for <img> tags.
+    - It's important that the agent's last message contains properly formatted image data for successful extraction.
+    - The `get_pil_image` function is used to convert the extracted image data into a PIL image.
+    - If no <img> tag is found, or if the image data is not correctly formatted, the function may raise an error.
+ """
+ last_message = agent.last_message()["content"]
+
+ if isinstance(last_message, str):
+        img_data = re.findall("<img (.*)>", last_message)[0]
+ elif isinstance(last_message, list):
+ # The GPT-4V format, where the content is an array of data
+ assert isinstance(last_message[0], dict)
+ img_data = last_message[0]["image_url"]["url"]
+
+ pil_img = get_pil_image(img_data)
+ return pil_img
+
+class DALLEAgent(ConversableAgent):
+ def __init__(self, name, llm_config: dict, **kwargs):
+ super().__init__(name, llm_config=llm_config, **kwargs)
+
+ api_key = os.getenv("OPENAI_API_KEY")
+ self._dalle_client = OpenAI(api_key=api_key)
+ self.register_reply([Agent, None], DALLEAgent.generate_dalle_reply)
+
+ def send(
+ self,
+ message: Union[Dict, str],
+ recipient: Agent,
+ request_reply: Optional[bool] = None,
+ silent: Optional[bool] = False,
+ ):
+ # override and always "silent" the send out message;
+ # otherwise, the print log would be super long!
+ super().send(message, recipient, request_reply, silent=True)
+
+ def generate_dalle_reply(self, messages: Optional[List[Dict]], sender: "Agent", config):
+ """Generate a reply using OpenAI DALLE call."""
+ client = self._dalle_client if config is None else config
+ if client is None:
+ return False, None
+ if messages is None:
+ messages = self._oai_messages[sender]
+
+ prompt = messages[-1]["content"]
+ img_data = dalle_call(
+ client=client,
+ model="dall-e-3",
+ prompt=prompt,
+ size="1024x1024",
+ quality="standard",
+ n=1,
+ )
+
+ img_data = _to_pil(img_data) # Convert to PIL image
+
+ # Return the OpenAI message format
+ return True, {"content": [{"type": "image_url", "image_url": {"url": img_data}}]}
+
+class CatifyWithDalle(AssistantAgent):
+ def __init__(self, n_iters=2, **kwargs):
+ """
+ Initializes a CatifyWithDalle instance.
+
+ This agent facilitates the creation of visualizations through a collaborative effort among
+ its child agents: dalle and critics.
+
+ Parameters:
+ - n_iters (int, optional): The number of "improvement" iterations to run. Defaults to 2.
+ - **kwargs: keyword arguments for the parent AssistantAgent.
+ """
+ super().__init__(**kwargs)
+ self.register_reply([Agent, None], reply_func=CatifyWithDalle._reply_user, position=0)
+ self._n_iters = n_iters
+
+ def _reply_user(self, messages=None, sender=None, config=None):
+ if all((messages is None, sender is None)):
+ error_msg = f"Either {messages=} or {sender=} must be provided."
+ raise AssertionError(error_msg)
+
+ if messages is None:
+ messages = self._oai_messages[sender]
+
+ img_prompt = messages[-1]["content"]
+
+ ## Define the agents
+ self.critics = MultimodalConversableAgent(
+ name="Critics",
+ system_message=f"""You need to improve the prompt of the figures you saw.
+{CRITIC_PROMPT}
+Reply with the following format:
+
+CRITICS: the image needs to improve...
+PROMPT: here is the updated prompt!
+
+""",
+ llm_config={"max_tokens": 1000, "model": "gpt-4o"},
+ human_input_mode="NEVER",
+ max_consecutive_auto_reply=3,
+ )
+
+ self.dalle = DALLEAgent(
+ name="Dalle", llm_config={"model": "dalle"}, max_consecutive_auto_reply=0
+ )
+
+ # Data flow begins
+ self.send(message=img_prompt, recipient=self.dalle, request_reply=True)
+ img = extract_img(self.dalle)
+ plt.imshow(img)
+ plt.axis("off") # Turn off axis numbers
+ plt.show()
+ print("Image PLOTTED")
+
+ for i in range(self._n_iters):
+ # Downsample the image s.t. GPT-4V can take
+ img = extract_img(self.dalle)
+ smaller_image = img.resize((128, 128), Image.Resampling.LANCZOS)
+ smaller_image.save("result.png")
+
+ self.msg_to_critics = f"""Here is the prompt: {img_prompt}.
+            Here is the figure <img result.png>.
+ Now, critique and create a prompt so that DALLE can give me a better image.
+ Show me both "CRITICS" and "PROMPT"!
+ """
+ self.send(message=self.msg_to_critics, recipient=self.critics, request_reply=True)
+ feedback = self._oai_messages[self.critics][-1]["content"]
+ img_prompt = re.findall("PROMPT: (.*)", feedback)[0]
+
+ self.send(message=img_prompt, recipient=self.dalle, request_reply=True)
+ img = extract_img(self.dalle)
+ plt.imshow(img)
+ plt.axis("off") # Turn off axis numbers
+ plt.show()
+ print(f"Image {i} PLOTTED")
+
+        return True, "result.png"
+
+```
+
+Great! We have an agent framework. To quickly show how it works, let's instantiate our agent and give it a prompt.
+
+```python
+creator = CatifyWithDalle(
+ name="creator",
+ max_consecutive_auto_reply=0,
+ system_message="Help me coordinate generating image",
+ llm_config={"model": "gpt-4"},
+)
+
+user_proxy = UserProxyAgent(
+ name="User",
+ human_input_mode="NEVER",
+ max_consecutive_auto_reply=0,
+ code_execution_config={
+ "work_dir": "output", # Location where code will be written
+ "use_docker": False # Use local jupyter execution environment instead of docker
+ }
+)
+
+_ = user_proxy.initiate_chat(
+ creator, message="Show me something boring"
+)
+```
+
+The initial result from the first iteration of the user prompt:
+
+```text
+User (to creator):
+
+Show me something boring
+
+creator (to Dalle):
+
+Show me something boring
+
+```
+
+![An uninspired image](_img/boring_0.png)
+
+This is definitely a boring room. Notice how the critics respond and enhance the submission prompt in the following iterations.
+
+```text
+Image PLOTTED
+creator (to Critics):
+
+Here is the prompt: Show me something boring.
+Here is the figure ``.
+Now, critique and create a prompt so that DALLE can give me a better image.
+Show me both "CRITICS" and "PROMPT"!
+
+Critics (to creator):
+
+CRITICS: The image is simple and mundane, with a plain room and basic furniture, which accomplishes the task of showing something boring. However, it can be improved by adding an element of whimsy or interest, juxtaposing the boring scene with something unexpected. Let's add a lot of cats to make it more engaging.
+
+PROMPT: Show me a boring living room with plain furniture, but add 100 cats in various places around the room.
+
+creator (to Dalle):
+
+Show me a boring living room with plain furniture, but add 100 cats in various places around the room.
+```
+
+![A mild improvement](_img/boring_1.png)
+
+On the final iteration, we can see a more refined instruction set to add additional details.
+
+```text
+
+Image 0 PLOTTED
+creator (to Critics):
+
+Here is the prompt: Show me a boring living room with plain furniture, but add 100 cats in various places around the room..
+Here is the figure ``.
+Now, critique and create a prompt so that DALLE can give me a better image.
+Show me both "CRITICS" and "PROMPT"!
+
+Critics (to creator):
+
+CRITICS: The image has successfully incorporated cats into a boring living room, bringing in an element of surprise and quirkiness. However, it is in black and white, which can make the image feel duller and less lively. Additionally, while there are many cats, they could be positioned in more playful and unexpected ways to create more interest.
+
+PROMPT: Show me a colorful, boring living room with plain furniture, but add 100 cats in various imaginative and playful positions around the room.
+
+creator (to Dalle):
+
+Show me a colorful, boring living room with plain furniture, but add 100 cats in various imaginative and playful positions around the room.
+
+```
+
+![Final cat room](_img/boring_2.png)
+
+Without any direct intervention, we now have an image that is remarkably different in style from the original user instruction. The agent has successfully
+introduced elements of whimsy into the original instruction set.
+
+### MLflow Model From Code
+
+Now that we've proven the concept, it's time to leverage MLflow to manage our ML modeling lifecycle. For instance, it's highly likely that we'd want to take this model to production, so strong dependency management, model versioning, and support for tracking between development cycles would all be useful.
+
+In this blog we will leverage the [Model from Code](https://mlflow.org/docs/latest/models.html#models-from-code) feature to achieve the above functionality. MLflow Model from Code allows you to define and log models directly from a stand-alone Python script. This feature is particularly useful when you want to log models that can be effectively stored as a code representation (models that do not need optimized weights through training) or applications that rely on external services (e.g., LangChain chains). Another benefit is that this approach entirely bypasses the use of the `pickle` or `cloudpickle` modules within Python, which can carry security risks.
+
+To leverage Model from Code, we must perform the following steps:
+
+1. Declare a [custom PyFunc](https://mlflow.org/docs/latest/traditional-ml/creating-custom-pyfunc/index.html).
+2. Leverage [mlflow.models.set_model](https://mlflow.org/docs/latest/python_api/mlflow.models.html?highlight=set_model#mlflow.models.set_model) to indicate which Python object is our model.
+
+To achieve these steps, we simply copy the above and below code into a Python file. For simplicity, you can just create a single Python file with both code snippets, but MLflow also supports specifying local dependencies when logging our model via the `code_paths` parameter in [mlflow.pyfunc.log_model](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html?highlight=pyfunc%20log_model#mlflow.pyfunc.log_model); a sketch of that approach is shown below.
+
+**This step was omitted for brevity and must be done manually.**
+
+```python
+import mlflow
+
+class CatifyPyfunc(mlflow.pyfunc.PythonModel):
+ def predict(self, context, model_input, params):
+ import mlflow
+ mlflow.autogen.autolog()
+
+ creator = CatifyWithDalle(
+ name="creator",
+ max_consecutive_auto_reply=0,
+ system_message="Help me coordinate generating image",
+ llm_config={"model":"gpt-4"},
+ )
+
+ user_proxy = UserProxyAgent(name="User", human_input_mode="NEVER", max_consecutive_auto_reply=0, code_execution_config={
+ "work_dir": "output", # Location where code will be written
+ "use_docker": False # Use local jupyter execution environment instead of docker
+ })
+
+ return user_proxy.initiate_chat(
+ creator, message=model_input
+ )
+mlflow.models.set_model(CatifyPyfunc())
+```
+
+At the end of this step, you should have a Python file that has both code snippets. The name is up to the user, but for this blog we will use "catify_model.py".
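+
+If you instead kept the agent classes in their own local module rather than a single file, the logging call shown later could ship that module alongside the script via `code_paths`. A minimal sketch under that assumption (the module name `agent_utils.py` is hypothetical):
+
+```python
+import mlflow
+
+with mlflow.start_run():
+    mlflow.pyfunc.log_model(
+        artifact_path="autogen_pyfunc",
+        python_model="catify_model.py",  # Model-from-code script containing set_model()
+        code_paths=["agent_utils.py"],  # Hypothetical local module with the agent classes
+    )
+```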
+
+## Use Our Agent Framework
+
+We are now positioned to leverage MLflow to interact with our powerful "catify" agent.
+
+### Log and Load
+
+First, let's demonstrate the standard user journey of logging a model to MLflow's tracking server. We will then load it back and perform inference.
+
+```python
+import mlflow
+mlflow.autogen.autolog() # Enable logging of traces
+
+with mlflow.start_run() as run:
+ mlflow.pyfunc.log_model(
+ artifact_path="autogen_pyfunc",
+        python_model="catify_model.py",  # Our model-from-code Python file
+    )
+
+ run_id = run.info.run_id
+```
+
+With our model logged, let's reload it and perform inference, this time with a cooler prompt.
+
+```python
+loaded = mlflow.pyfunc.load_model(f"runs:/{run_id}/autogen_pyfunc")
+out = loaded.predict("The matrix with a cat")
+```
+
+The initial stage's results:
+
+```text
+User (to creator):
+
+The matrix with a cat
+
+creator (to Dalle):
+
+The matrix with a cat
+```
+
+![Initial Matrix Cat](_img/cool_0.png)
+
+In the next stage, the generation prompt is greatly enhanced by the critic agent.
+
+```text
+Image PLOTTED
+creator (to Critics):
+
+Here is the prompt: The matrix with a cat.
+Here is the figure ``.
+Now, critique and create a prompt so that DALLE can give me a better image.
+Show me both "CRITICS" and "PROMPT"!
+
+Critics (to creator):
+
+CRITICS: The image effectively captures the Matrix-themed aesthetic with a cat, combining a cyberpunk atmosphere with digital elements. However, to improve the image:
+
+- Increase the number of cats to align with the requirement of having lots of cats (aim for around 100).
+- Enhance the digital and neon elements to make the Matrix theme more pronounced.
+- Add more movement or dynamic elements to the scene for a more immersive feel.
+- Ensure diversity in cat appearances, sizes, and positions to make the scene more complex and interesting.
+
+PROMPT: "Create a Matrix-themed scene set in a cyberpunk alleyway, with digital and neon elements filling the atmosphere. The scene should feature around 100 cats of various sizes, colors, and positions—some sitting, some walking, and some interacting with the digital elements. Make the digital grid and floating code more prominent, and add dynamic elements such as digital rain or floating holograms to create a more immersive and lively environment."
+
+creator (to Dalle):
+
+"Create a Matrix-themed scene set in a cyberpunk alleyway, with digital and neon elements filling the atmosphere. The scene should feature around 100 cats of various sizes, colors, and positions—some sitting, some walking, and some interacting with the digital elements. Make the digital grid and floating code more prominent, and add dynamic elements such as digital rain or floating holograms to create a more immersive and lively environment."
+```
+
+![First Matrix Iteration](_img/cool_1.png)
+
+This is definitely an improvement, showcasing the power of multi-turn agents.
+
+The final stage enhances the instruction set even further.
+
+```text
+Image 0 PLOTTED
+creator (to Critics):
+
+Here is the prompt: "Create a Matrix-themed scene set in a cyberpunk alleyway, with digital and neon elements filling the atmosphere. The scene should feature around 100 cats of various sizes, colors, and positions—some sitting, some walking, and some interacting with the digital elements. Make the digital grid and floating code more prominent, and add dynamic elements such as digital rain or floating holograms to create a more immersive and lively environment.".
+Here is the figure ``.
+Now, critique and create a prompt so that DALLE can give me a better image.
+Show me both "CRITICS" and "PROMPT"!
+
+Critics (to creator):
+
+CRITICS: The image significantly improves the Matrix-themed atmosphere with a cyberpunk alley and an abundance of cats. However, there are a few areas for improvement:
+
+- Increase the variety of the digital elements (e.g., different shapes of holograms, varied colors and intensities of neon signs).
+- Make the cats more dynamic by showing more interactions such as jumping, playing, or chasing digital elements.
+- Enhance the depth and perspective of the scene to create a more three-dimensional and immersive look.
+- Add more detail to the surrounding environment, like futuristic posters or graffiti to intensify the cyberpunk feel.
+
+PROMPT: "Craft a highly detailed, Matrix-themed scene set in a cyberpunk alleyway. The atmosphere should be rich with diverse digital and neon elements, including various shapes of holograms and a range of vivid colors. Populate the scene with around 100 dynamic cats of different sizes, colors, and actions—some sitting, some walking, some jumping, playing, or chasing digital elements. Enhance the depth and perspective of the scene to create a more immersive three-dimensional experience. Include detailed futuristic environment elements like posters, graffiti, and neon signs to intensify the cyberpunk feel."
+
+creator (to Dalle):
+
+"Craft a highly detailed, Matrix-themed scene set in a cyberpunk alleyway. The atmosphere should be rich with diverse digital and neon elements, including various shapes of holograms and a range of vivid colors. Populate the scene with around 100 dynamic cats of different sizes, colors, and actions—some sitting, some walking, some jumping, playing, or chasing digital elements. Enhance the depth and perspective of the scene to create a more immersive three-dimensional experience. Include detailed futuristic environment elements like posters, graffiti, and neon signs to intensify the cyberpunk feel."
+```
+
+![2nd cool image](_img/cool_2.png)
+
+A little dystopian, but we'll take it!
+
+We have successfully demonstrated that we can log and load our model, then perform inference from the loaded model.
+
+### Show MLflow Traces
+
+[MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) provides a thread-safe API to track the execution of complex applications. The MLflow AutoGen flavor has tracing built in as an autologging feature. So, simply by running `mlflow.autogen.autolog()` prior to doing inference, we will get traces logged automatically.
+
+Traces can be accessed via the fluent APIs, MLflow client, and manually via the MLflow UI. For more, please visit the documentation linked above.
+
+```python
+# Example with fluent APIs
+last_active_trace = mlflow.get_last_active_trace()
+print(last_active_trace)
+
+# Output: Trace(request_id=71ffcf92785b4dfc965760a43193095c)
+```
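+
+Beyond the fluent API shown above, you can also query traces programmatically with the MLflow client. The snippet below is a minimal sketch, assuming a recent MLflow version with tracing support and that the default experiment (ID `"0"`) holds your traces:
+
+```python
+from mlflow import MlflowClient
+
+client = MlflowClient()
+
+# Search for recent traces in an experiment (the experiment ID "0" is an assumption)
+traces = client.search_traces(experiment_ids=["0"], max_results=5)
+for trace in traces:
+    print(trace.info.request_id, trace.info.status)
+```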
+
+In the meantime, we will display the MLflow UI here. If you are running in an interactive context, such as Jupyter, run the following commands.
+
+```python
+import subprocess
+from IPython.display import IFrame
+
+# Start MLflow server in the background
+mlflow_ui_server = subprocess.Popen(["mlflow", "ui", "--host", "127.0.0.1", "--port", "5000"])
+IFrame(src="http://127.0.0.1:5000", width="100%", height="600")
+
+# Run the below command to stop the server
+# mlflow_ui_server.terminate()
+```
+
+If you're not running interactively, you can simply run the following shell command and navigate to the associated host and port in your web browser.
+
+```bash
+mlflow ui
+```
+
+If we navigate to the tracing tab, as shown in the image below, we can see our logged trace.
+
+![The MLflow Tracing UI](./_img/tracing_main_page.png)
+
+By clicking on that trace ID, we can see a detailed execution plan. At the bottom, we can see our prompt `"The matrix with a cat"` which kicked off the chat session. From there, many agents interacted to create images and provide feedback to "catify" them. Also, note that the trace ID is the same as the one returned by `mlflow.get_last_active_trace()` above.
+
+![The MLflow Tracing UI](./_img/tracing_detail.png)
+
+Finally, let's dig a bit deeper into the traced LLM call. As you can see, we have lots of valuable information about the execution, such as the model and usage statistics. Tracing helps you monitor not just performance, but cost, usage patterns, and much more! You can also leverage custom metadata to get even more granular insights.
+
+![The MLflow Tracing UI](./_img/tracing_chat_completion_1.png)
+
+### Logging Artifacts with MLflow
+
+Tracing's primary purpose is to provide robust lightweight summaries of complex agent executions. For larger or custom payloads, MLflow exposes a variety of artifact-logging APIs that can store images, text, tables, and more in the MLflow tracking server. Let's quickly demonstrate this functionality by logging the prompts and their associated images.
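+
+As a quick, generic illustration of a couple of these APIs (the artifact file names and values below are arbitrary), a run might log free-form text and a small table like this:
+
+```python
+import mlflow
+import pandas as pd
+
+with mlflow.start_run():
+    # Log free-form text as an artifact
+    mlflow.log_text("A prompt we want to keep for later review", "notes/prompt.txt")
+
+    # Log a small table, stored as JSON in the artifact store
+    mlflow.log_table(
+        data=pd.DataFrame({"prompt": ["The matrix with a cat"], "iteration": [0]}),
+        artifact_file="prompt_history.json",
+    )
+```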
+
+Within our `CatifyWithDalle` class, we will make four modifications:
+
+1. Create an instance variable in the class `__init__` to save metadata about our objects.
+2. Create a private utility to update our metadata and log images with [mlflow.log_image](https://mlflow.org/docs/latest/python_api/mlflow.html?highlight=log_image#mlflow.log_image).
+3. Call the above utility after new images have been generated.
+4. Finally, log our metadata object as JSON with [mlflow.log_dict](https://mlflow.org/docs/latest/python_api/mlflow.html?highlight=log_image#mlflow.log_dict).
+
+```python
+import uuid # Added to generate unique artifact file names and indices for mapping prompts to generated images
+
+class CatifyWithDalle(AssistantAgent):
+ def __init__(self, n_iters=2, **kwargs):
+ """
+ Initializes a CatifyWithDalle instance.
+
+ This agent facilitates the creation of visualizations through a collaborative effort among
+ its child agents: dalle and critics.
+
+ Parameters:
+ - n_iters (int, optional): The number of "improvement" iterations to run. Defaults to 2.
+ - **kwargs: keyword arguments for the parent AssistantAgent.
+ """
+ super().__init__(**kwargs)
+ self.register_reply([Agent, None], reply_func=CatifyWithDalle._reply_user, position=0)
+ self._n_iters = n_iters
+ self.dict_to_log = {} # Add a buffer for storing mapping information
+
+ # Adding this method to log the generated images and the prompt-to-image mapping file
+ def _log_image_and_append_to_dict(self, img: Image, img_prompt: str, image_index: int) -> None:
+ """ Method for logging generated images to MLflow and building a prompt mapping file """
+ # Generate a unique ID
+ _id = str(uuid.uuid1())
+
+ # Append to class variable to log once at the end of all inference
+ self.dict_to_log[_id] = {"prompt": img_prompt, "index": image_index}
+
+ # Log image to MLflow tracking server
+ mlflow.log_image(img, f"{_id}.png")
+
+ def _reply_user(self, messages=None, sender=None, config=None):
+ if all((messages is None, sender is None)):
+ error_msg = f"Either {messages=} or {sender=} must be provided."
+ raise AssertionError(error_msg)
+
+ if messages is None:
+ messages = self._oai_messages[sender]
+
+ img_prompt = messages[-1]["content"]
+
+ ## Define the agents
+ self.critics = MultimodalConversableAgent(
+ name="Critics",
+ system_message=f"""You need to improve the prompt of the figures you saw.
+{CRITIC_PROMPT}
+Reply with the following format:
+
+CRITICS: the image needs to improve...
+PROMPT: here is the updated prompt!
+
+""",
+ llm_config={"max_tokens": 1000, "model": "gpt-4o"},
+ human_input_mode="NEVER",
+ max_consecutive_auto_reply=3,
+ )
+
+ self.dalle = DALLEAgent(
+ name="Dalle", llm_config={"model": "dalle"}, max_consecutive_auto_reply=0
+ )
+
+ # Data flow begins
+ self.send(message=img_prompt, recipient=self.dalle, request_reply=True)
+ img = extract_img(self.dalle)
+ plt.imshow(img)
+ plt.axis("off") # Turn off axis numbers
+ plt.show()
+ print("Image PLOTTED")
+
+ self._log_image_and_append_to_dict(img, img_prompt, -1) # Add image logging and buffer updates
+
+ for i in range(self._n_iters):
+ # Downsample the image s.t. GPT-4V can take
+ img = extract_img(self.dalle)
+ smaller_image = img.resize((128, 128), Image.Resampling.LANCZOS)
+ smaller_image.save("result.png")
+
+ self.msg_to_critics = f"""Here is the prompt: {img_prompt}.
+ Here is the figure .
+ Now, critic and create a prompt so that DALLE can give me a better image.
+ Show me both "CRITICS" and "PROMPT"!
+ """
+ self.send(message=self.msg_to_critics, recipient=self.critics, request_reply=True)
+ feedback = self._oai_messages[self.critics][-1]["content"]
+ img_prompt = re.findall("PROMPT: (.*)", feedback)[0]
+
+ self.send(message=img_prompt, recipient=self.dalle, request_reply=True)
+ img = extract_img(self.dalle)
+ plt.imshow(img)
+ plt.axis("off") # Turn off axis numbers
+ plt.show()
+ print(f"Image {i} PLOTTED")
+ self._log_image_and_append_to_dict(img, img_prompt, i) # Log the image in the iteration
+
+ mlflow.log_dict(self.dict_to_log, "image_lookup.json") # Log the prompt-to-image mapping buffer
+ return True, "result.jpg"
+```
+
+Now, if we rerun the above model logging code, every time we load the newest version of our model and run inference, the images generated by our agent will be logged, along with a JSON object mapping image names to their prompts and iteration indexes (for lookup purposes).
+
+Let's demonstrate this and wrap inference in a single MLflow run for easy aggregation. Also note that we will be leveraging AutoGen's [caching](https://microsoft.github.io/autogen/docs/reference/cache/) functionality: since we've already run inference with this prompt, we won't actually make LLM calls again; we'll just read from the cache and log with our new MLflow code.
+
+```python
+# Be sure to re-log the model by rerunning the above code
+with mlflow.start_run(run_name="log_image_during_inference"):
+ loaded = mlflow.pyfunc.load_model(f"runs:/{run_id}/autogen_pyfunc")
+ loaded.predict("The matrix with a cat")
+```
+
+![Logged Images and JSON Artifacts](./_img/logged_images.png)
+
+As you can see, we have logged three images of interest and a lookup dict. The keys of the dict correspond to the image names and the values contain additional information about how each image was generated. With these artifacts we can perform detailed analyses of prompt quality and make iterative improvements to our "catify" agent!
+
+### Additional Benefits of MLflow
+
+There is a lot more happening behind the scenes that is out of the scope of this tutorial, but here's a quick list of additional MLflow features that are useful when building agentic frameworks.
+
+- **Dependency management**: when you log a model, MLflow will automatically try to infer your pip requirements. These requirements are written in several formats that make remote serving of your model much simpler. If you have local dependencies, as noted above, you can specify additional paths for MLflow to serialize via the `code_paths` argument when logging your model (see the sketch after this list).
+- **Model aliasing**: when iteratively building your agentic framework, you want an easy way to compare models. MLflow model [aliases and tags](https://mlflow.org/docs/latest/model-registry.html#deploy-and-organize-models-with-aliases-and-tags) facilitate lookups to the MLflow model registry and allow you to easily load and deploy a specific model version.
+- **Nested Runs**: with agentic frameworks, especially when training underlying LLM components, you will often have complex nested structures. MLflow supports [nested runs](https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/part1-child-runs.html) to facilitate aggregating your run information. This can be especially useful with LLM training and fine tuning.
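+
+For illustration, here is a minimal sketch of how `code_paths` might look when logging the model from this blog; `dalle_agent_utils.py` is a hypothetical local module that your model file imports:
+
+```python
+import mlflow
+
+with mlflow.start_run():
+    mlflow.pyfunc.log_model(
+        artifact_path="autogen_pyfunc",
+        python_model="catify_model.py",  # Model-from-code entry point
+        code_paths=["dalle_agent_utils.py"],  # Hypothetical local dependency
+    )
+```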
+
+## Summary
+
+In this blog we outlined how to create a complex agent with AutoGen. We also showed how to leverage the MLflow [Models from Code](https://mlflow.org/docs/latest/models.html#models-from-code) feature to log and load our model. Finally, we used the MLflow AutoGen flavor's autologging capabilities to automatically capture MLflow traces, giving us fine-grained, thread-safe agent execution information.
+
+Happy coding!
diff --git a/website/blog/2024-09-13-models-from-code-logging/index.md b/website/blog/2024-09-13-models-from-code-logging/index.md
new file mode 100644
index 000000000..0e408419d
--- /dev/null
+++ b/website/blog/2024-09-13-models-from-code-logging/index.md
@@ -0,0 +1,171 @@
+---
+title: Models from Code Logging in MLflow - What, Why, and How
+tags: [genai, pyfunc, mlops]
+slug: models_from_code
+authors: [awadelrahman-ahmed]
+thumbnail: /img/blog/thumbnail-models-from-code.gif
+---
+
+We all (well, most of us) remember November 2022 when the public release of ChatGPT by OpenAI marked a significant turning point in the world of AI. While generative artificial intelligence (GenAI) had been evolving for some time, ChatGPT, built on OpenAI's GPT-3.5 architecture, quickly captured the public’s imagination. This led to an explosion of interest in GenAI, both within the tech industry and among the general public.
+
+On the tools side, MLflow continues to solidify its position as the ML community's favorite tool for machine learning operations (MLOps). However, the rise of GenAI has introduced new needs in how we use MLflow. One of these new challenges is how we log models in MLflow. If you’ve used MLflow before (and I bet you have), you’re probably familiar with the `mlflow.log_model()` function and how it efficiently [pickles](https://github.com/cloudpipe/cloudpickle) model artifacts.
+
+Particularly with GenAI, there’s a new requirement: logging models "from code" instead of serializing them into a pickle file! And guess what? This need isn’t limited to GenAI models! So, in this post I will explore this concept and how MLflow has adapted to meet this new requirement.
+
+You will notice that this feature is implemented at a very abstract level, allowing you to log any model "as code", whether it’s GenAI or not! I like to think of it as a generic approach, with GenAI models being just one of its use cases. So, in this post, I’ll explore this new feature, ["Models from Code logging"](https://mlflow.org/docs/latest/models.html#models-from-code).
+
+By the end of this post, you should be able to answer the three main questions: 'What,' 'Why,' and 'How' to use Models from Code logging.
+
+## What Is Models from Code Logging?
+
+In fact, when MLflow announced this feature, it got me thinking in a more abstract way about the concept of a "model"! You might find it interesting as well, if you zoom out and consider a model as a mathematical representation or function that describes the relationship between input and output variables. At this level of abstraction, a model can be many things!
+
+One might even recognize that a model, as an object or artifact, represents just one form of what a model can be, even if it’s the most popular in the ML community. If you think about it, a model can also be as simple as a piece of code for a mapping function or a code that sends API requests to external services such as OpenAI's APIs.
+
+I'll explain the detailed workflow of how to log models from code later in the post, but for now, let's consider it at a high level with two main steps: first, writing your model code, and second, logging your model from code. This will look like the following figure:
+
+#### _High Level Models from Code Logging Workflow_:
+
+![High Level Models-from-Code Logging Workflow](models-from-code1.png)
+
+🔴 It's important to note that when we refer to "model code," we're talking about code that can be treated as a model itself. This means it's **not** your training code that generates a trained model object, but rather the step-by-step code that is executed as a model itself.
+
+## How Does Models from Code Differ from Object-Based Logging?
+
+In the previous section, we discussed the concept of Models from Code logging. However, concepts often become clearer when contrasted with their alternatives; a technique known as _contrast learning_. In our case, the alternative is Object-Based logging, which is the commonly used approach for logging models in MLflow.
+
+Object-Based logging treats a trained model as an _object_ that can be stored and reused. After training, the model is saved as an object and can be easily loaded for deployment. For example, this process can be initiated by calling `mlflow.log_model()`, where MLflow handles the serialization, often using [Pickle](https://github.com/cloudpipe/cloudpickle) or similar methods.
+
+Object-Based logging can be broken down into three high-level steps as in the following figure: first, creating the model object (whether by training it or acquiring it), second, serializing it (usually with Pickle or a similar tool), and third, logging it as an object.
+
+#### _High Level Object-Based Logging Workflow_:
+
+![High Level Object-Based Logging Workflow](models-from-code2.png)
+
+💡The main distinction between the popular Object-Based logging and Models from Code logging is that in the former, we log the model object itself, whether it's a model you've trained or a pre-trained model you've acquired. In the latter, however, we log the code that _represents_ your model.
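+
+To make the contrast concrete, here is a minimal sketch of the two calls side by side (the tiny scikit-learn model is only a placeholder, and `model_code.py` refers to the script we will build later in this post):
+
+```python
+import mlflow
+from sklearn.linear_model import LinearRegression
+
+# Object-Based logging: serialize a trained model object
+sk_model = LinearRegression().fit([[1.0], [2.0]], [1.0, 2.0])
+with mlflow.start_run():
+    mlflow.sklearn.log_model(sk_model, artifact_path="object_based_model")
+
+# Models from Code logging: point MLflow at the code that represents the model
+with mlflow.start_run():
+    mlflow.pyfunc.log_model(
+        python_model="model_code.py",  # a script that calls mlflow.models.set_model()
+        artifact_path="model_from_code",
+    )
+```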
+
+## When Do You Need Models from Code Logging?
+
+By now, I hope you have a clear understanding of _what_ Models from Code logging is! You might still be wondering, though, about the specific use cases where this feature can be applied. This section will cover exactly that—the why!
+
+While we mentioned GenAI as a motivational use case in the introduction, we also highlighted that MLflow has approached Models from Code logging in a more generic way and we will see that in the next section. This means you can leverage the generalizability of the Models from Code feature for a wide range of scenarios. I’ve identified three key usage patterns that I believe are particularly relevant:
+
+### 1️⃣ When Your Model Relies on External Services:
+
+This is one of the obvious and common use cases, especially with the rise of modern AI applications. It’s becoming increasingly clear that we are shifting from building AI at the "model" granularity to the "system" granularity.
+
+In other words, AI is no longer just about individual models; it’s about how those models interact within a broader ecosystem. As we become more dependent on external AI services and APIs, the need for Models from Code logging becomes more pronounced.
+
+For instance, frameworks like [LangChain](https://github.com/langchain-ai/langchain/) allow developers to build applications that chain together various AI models and services to perform complex tasks, such as language understanding and information retrieval. In such scenarios, the "model" is not just a set of trained parameters that can be _pickled_ but a "system" of interconnected services, often orchestrated by code that makes API calls to external platforms.
+
+Models from Code logging in these situations ensures that the entire workflow, including the logic and dependencies, is preserved. It offers the ability to maintain the same model-like experience by capturing the code, making it possible to faithfully recreate the model’s behavior even when the actual computational work is performed outside your domain.
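+
+As a minimal sketch of what such a "system as a model" can look like (the prompt, model name, and batching logic are illustrative, and an `OPENAI_API_KEY` is assumed to be set in the environment):
+
+```python
+import mlflow
+from openai import OpenAI
+
+class TranslatorModel(mlflow.pyfunc.PythonModel):
+    """A 'model' whose logic is an API call to an external LLM service."""
+
+    def predict(self, context, model_input, params=None):
+        client = OpenAI()  # reads OPENAI_API_KEY from the environment
+        outputs = []
+        for text in model_input:
+            completion = client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "Translate the user's text into French."},
+                    {"role": "user", "content": text},
+                ],
+            )
+            outputs.append(completion.choices[0].message.content)
+        return outputs
+
+mlflow.models.set_model(TranslatorModel())
+```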
+
+### 2️⃣ When You’re Combining Multiple Models to Calculate a Complex Metric:
+
+Apart from GenAI, you can still benefit from the Models from Code feature in various other domains. There are many situations where multiple specialized models are combined to produce a comprehensive output. Note that we are not just referring to traditional ensemble modeling (predicting the same variable); often, you need to combine multiple models to predict different components of a complex inferential task.
+
+One concrete example could be [Customer Lifetime Value (CLV)](https://en.wikipedia.org/wiki/Customer_lifetime_value) in customer analytics. In the context of CLV, you might have separate models for:
+
+- Customer Retention: Forecasting how long a customer will continue to engage with the business.
+- Purchase Frequency: Predicting how often a customer will make a purchase.
+- Average Order Value: Estimating the typical value of each transaction.
+
+Each of these models might already be logged and tracked properly using MLflow. Now, you need to "combine" these models into a single "system" that calculates CLV. We refer to it as a "system" because it contains multiple components.
+
+The beauty of MLflow's Models from Code logging is that it allows you to treat this "CLV system" as a "CLV model". It enables you to leverage MLflow's capabilities, maintaining the MLflow-like model structure with all the advantages of tracking, versioning, and deploying your CLV model as a cohesive unit, even though it's built on top of other models. While such a complex model system can be built using a custom MLflow PythonModel, the Models from Code feature dramatically simplifies the serialization process, reducing the friction of building your solution.
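+
+A sketch of what such a "CLV system as a model" could look like is shown below; the registered model URIs are placeholders for the three component models you have already logged, and the final multiplication is a simplifying assumption:
+
+```python
+import mlflow
+
+class CLVModel(mlflow.pyfunc.PythonModel):
+    """Combines three previously logged models into a single CLV 'model'."""
+
+    def load_context(self, context):
+        # Placeholder URIs for models already tracked in the MLflow registry
+        self.retention = mlflow.pyfunc.load_model("models:/customer_retention/1")
+        self.frequency = mlflow.pyfunc.load_model("models:/purchase_frequency/1")
+        self.order_value = mlflow.pyfunc.load_model("models:/average_order_value/1")
+
+    def predict(self, context, model_input, params=None):
+        lifetime = self.retention.predict(model_input)
+        frequency = self.frequency.predict(model_input)
+        value = self.order_value.predict(model_input)
+        return lifetime * frequency * value  # CLV as the product of the components
+
+mlflow.models.set_model(CLVModel())
+```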
+
+### 3️⃣ When You Don’t Have Serialization at All:
+
+Despite the rise of deep learning, industries still rely on rule-based algorithms that don’t produce serialized models. In these cases, Models from Code logging can be beneficial for integrating these processes into the MLflow ecosystem.
+
+One example is in industrial quality control, where the [Canny edge detection algorithm](https://en.wikipedia.org/wiki/Canny_edge_detector) is often used to identify defects. This rule-based algorithm doesn’t involve serialization but is defined by specific steps.
+
+Another example, which is gaining attention nowadays, is [Causal AI](https://en.wikipedia.org/wiki/Causal_AI). Constraint-based causal discovery algorithms, such as the [PC (Peter-Clark)](https://causal-learn.readthedocs.io/en/latest/search_methods_index/Constraint-based%20causal%20discovery%20methods/PC.html) algorithm, discover causal relationships in data but are implemented as code rather than as model objects.
+
+In either case, with the Models from Code feature, you can log the entire process as a "model" in MLflow, preserving the logic and parameters while benefiting from MLflow’s tracking and versioning features.
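+
+For instance, a Canny-based defect detector could be expressed as model code along these lines (a minimal sketch, assuming OpenCV is installed, that the input is a list of image file paths, and that the thresholds are illustrative):
+
+```python
+import cv2
+import mlflow
+import numpy as np
+
+class CannyEdgeModel(mlflow.pyfunc.PythonModel):
+    """A rule-based 'model' with no trained parameters to serialize."""
+
+    def predict(self, context, model_input, params=None):
+        results = []
+        for path in model_input:
+            image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
+            edges = cv2.Canny(image, threshold1=100, threshold2=200)
+            # Report the fraction of edge pixels as a simple defect indicator
+            results.append(float(np.mean(edges > 0)))
+        return results
+
+mlflow.models.set_model(CannyEdgeModel())
+```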
+
+## How To Implement Models from Code Logging?
+
+I hope that by this point, you have a clear understanding of the "What" and "Why" of Models from Code, and now you might be eager to get hands-on and focus on the _How_!
+
+In this section, I'll provide a generic workflow for implementing MLflow's Models from Code logging, followed by a basic yet broadly applicable example. I hope the workflow provides a broad understanding that allows you to address a wide range of scenarios. I will also include links at the end to resources that cover more specific use cases (e.g., AI models).
+
+### Models from Code Workflow:
+
+A key "ingredient" of the implementation is MLflow's component [`pyfunc`](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html). If you're not familiar with it, think of `pyfunc` as a universal interface in MLflow that lets you turn any model, from any framework, into an MLflow model by defining a _custom_ Python function. You can also refer to [this earlier post](https://mlflow.org/blog/custom-pyfunc) if you wish to gain a deeper understanding.
+
+For our Models from Code logging, we’ll particularly use the [`PythonModel`](https://mlflow.org/docs/latest/_modules/mlflow/pyfunc/model.html#PythonModel) class within `pyfunc`. This class in the MLflow Python client library allows us to create and manage Python functions as MLflow models. It enables us to define a custom function that processes input data and returns predictions or results. This model can then be deployed, tracked, and shared using MLflow's features.
+
+It seems to be exactly what we're looking for—we have some code that serves as our model, and we want to log it! That's why you'll soon see `mlflow.pyfunc.PythonModel` in our code example!
+
+Now, each time we need to implement Models from Code, we create _two_ separate Python files:
+
+1. The first file contains our model code (let's call it `model_code.py`). This file defines a class that inherits from the `mlflow.pyfunc.PythonModel` class.
+ The class we're defining holds our model logic. It could be our calls to OpenAI APIs, our CLV (Customer Lifetime Value) model, or our causal discovery code. We'll see a very simple 101 example soon.
+
+ 📌 But wait! IMPORTANT:
+
+ - Our `model_code.py` script needs to call (i.e., include) [`mlflow.models.set_model()`](https://mlflow.org/docs/latest/python_api/mlflow.models.html#mlflow.models.set_model) to set the model, which is crucial for loading the model back using `load_model()` for inference. You will notice this in the example.
+
+2. The second file logs our class (that we defined in `model_code.py`). Think of it as the driver code; it can be either a notebook or a Python script (let's call it `driver.py`).
+ In this file, we'll include the code that is responsible for logging our model code (essentially, providing the path to `model_code.py`).
+
+Then we can deploy our model. Later, when the serving environment is loaded, `model_code.py` is executed, and when a serving request comes in, `PyFuncClass.predict()` is called.
+
+This figure gives a generic template of these two files.
+
+![Models from Code files](models-from-code3.png)
+
+### A 101 Example of Models from Code Logging:
+
+Let’s consider a straightforward example: a simple function that calculates the area of a circle from its radius. With Models from Code, we can log this calculation as a model! I like to think of it as framing the calculation as a prediction problem, allowing us to write our model code with a `predict` method.
+
+#### 1. Our `model_code.py` file:
+
+```python
+import mlflow
+import math
+
+class CircleAreaModel(mlflow.pyfunc.PythonModel):
+ def predict(self, context, model_input, params=None):
+ return [math.pi * (r ** 2) for r in model_input]
+
+# It's important to call set_model() so it can be loaded for inference
+# Also, note that it is set to an instance of the class, not the class itself.
+mlflow.models.set_model(model=CircleAreaModel())
+```
+
+#### 2. Our `driver.py` file:
+
+This can be defined within a notebook as well. Here are its essential contents:
+
+```python
+import mlflow
+
+code_path = "model_code.py" # make sure that you put the correct path
+
+with mlflow.start_run():
+ logged_model_info = mlflow.pyfunc.log_model(
+ python_model=code_path,
+ artifact_path="test_code_logging"
+ )
+
+# We can print some info about the logged model
+print(f"MLflow Run: {logged_model_info.run_id}")
+print(f"Model URI: {logged_model_info.model_uri}")
+```
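+
+Once logged, the model can be loaded back and queried like any other MLflow model. A quick check, using the model URI returned above (and assuming the list input is passed through unchanged, since no signature was logged):
+
+```python
+loaded_model = mlflow.pyfunc.load_model(logged_model_info.model_uri)
+print(loaded_model.predict([2.0, 3.0, 4.0]))  # areas of circles with radii 2, 3, and 4
+```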
+
+#### How this looks in MLflow:
+
+Executing `driver.py` will start an MLflow run and log our model from code. The logged files can be seen in the MLflow UI, as shown below:
+
+![Models from Code files](models-from-code4.png)
+
+## Conclusion and Further Learning
+
+I hope that by this point, I have fulfilled the promises I made earlier! You should now have a clearer understanding of _What_ Models from Code is and how it differs from the popular Object-Based approach which logs models as serialized objects. You should also have a solid foundation of _Why_ and when to use it, as well as an understanding of _How_ to implement it through our general example.
+
+As we mentioned in the introduction and throughout the post, there are various use cases where Models from Code can be beneficial. Our 101 example is just the beginning—there is much more to explore. Below is a list of code examples that you may find helpful:
+
+1. Logging models from code using **Pyfunc** log model API ( [model code](https://github.com/mlflow/mlflow/blob/a3454610285e3729266e5e94041d06bd2bc55ff6/examples/pyfunc/model_as_code.py) | [driver code](https://github.com/mlflow/mlflow/blob/a3454610285e3729266e5e94041d06bd2bc55ff6/examples/pyfunc/model_as_code_driver.py) )
+2. Logging model from code using **Langchain** log model API ( [model code](https://github.com/mlflow/mlflow/blob/a3454610285e3729266e5e94041d06bd2bc55ff6/examples/langchain/chain_as_code.py) | [driver code](https://github.com/mlflow/mlflow/blob/a3454610285e3729266e5e94041d06bd2bc55ff6/examples/langchain/chain_as_code_driver.py) )
diff --git a/website/blog/2024-09-13-models-from-code-logging/models-from-code1.png b/website/blog/2024-09-13-models-from-code-logging/models-from-code1.png
new file mode 100644
index 000000000..896f68a87
Binary files /dev/null and b/website/blog/2024-09-13-models-from-code-logging/models-from-code1.png differ
diff --git a/website/blog/2024-09-13-models-from-code-logging/models-from-code2.png b/website/blog/2024-09-13-models-from-code-logging/models-from-code2.png
new file mode 100644
index 000000000..b64e2403f
Binary files /dev/null and b/website/blog/2024-09-13-models-from-code-logging/models-from-code2.png differ
diff --git a/website/blog/2024-09-13-models-from-code-logging/models-from-code3.png b/website/blog/2024-09-13-models-from-code-logging/models-from-code3.png
new file mode 100644
index 000000000..33691a7cf
Binary files /dev/null and b/website/blog/2024-09-13-models-from-code-logging/models-from-code3.png differ
diff --git a/website/blog/2024-09-13-models-from-code-logging/models-from-code4.png b/website/blog/2024-09-13-models-from-code-logging/models-from-code4.png
new file mode 100644
index 000000000..736dba3c8
Binary files /dev/null and b/website/blog/2024-09-13-models-from-code-logging/models-from-code4.png differ
diff --git a/website/blog/2024-10-03-llm-as-judge/faithfulness.png b/website/blog/2024-10-03-llm-as-judge/faithfulness.png
new file mode 100644
index 000000000..d1b20fe29
Binary files /dev/null and b/website/blog/2024-10-03-llm-as-judge/faithfulness.png differ
diff --git a/website/blog/2024-10-03-llm-as-judge/gauge.png b/website/blog/2024-10-03-llm-as-judge/gauge.png
new file mode 100644
index 000000000..7ec8c47cd
Binary files /dev/null and b/website/blog/2024-10-03-llm-as-judge/gauge.png differ
diff --git a/website/blog/2024-10-03-llm-as-judge/index.md b/website/blog/2024-10-03-llm-as-judge/index.md
new file mode 100644
index 000000000..2470dde28
--- /dev/null
+++ b/website/blog/2024-10-03-llm-as-judge/index.md
@@ -0,0 +1,490 @@
+---
+title: LLM as judge
+description: Perform LLM Evaluations with custom metrics
+slug: llm-as-judge
+authors: [pedro-azevedo, rahul-pandey]
+tags: [genai, mlflow-evaluate]
+thumbnail: /img/blog/llm-as-judge.png
+---
+
+In this blog post, we'll embark on a journey to revolutionize how we evaluate language models. We'll explore the power of MLflow Evaluate and harness the capabilities of Large Language Models (LLMs) as judges. By the end, you'll learn how to create custom metrics, implement LLM-based evaluation, and apply these techniques to real-world scenarios. Get ready to transform your model assessment process and gain deeper insights into your AI's performance!
+
+## The Challenge of Evaluating Language Models
+
+Evaluating large language models (LLMs) and natural language processing (NLP) systems presents several challenges, primarily due to their complexity and the diversity of tasks they can perform.
+
+One major difficulty is creating metrics that comprehensively measure performance across varied applications, from generating coherent text to understanding nuanced human emotions. Traditional benchmarks often fail to capture these subtleties, leading to incomplete assessments.
+
+An LLM acting as a judge can address these issues by leveraging its extensive training data to provide a more nuanced evaluation, offering insights into model behavior and areas needing improvement. For instance, an LLM can analyze whether a model generates text that is not only grammatically correct but also contextually appropriate and engaging, something more static metrics might miss.
+
+However, to move forward effectively, we need more than just better evaluation methods. Standardized experimentation setups are essential to ensure that comparisons between models are both fair and replicable. A uniform framework for testing and evaluation would enable researchers to build on each other's work, leading to more consistent progress and the development of more robust models.
+
+## Introducing MLflow LLM Evaluate
+
+[MLflow LLM Evaluate](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) is a powerful function within the MLflow ecosystem that allows for comprehensive model assessment by providing a standardized experiment setup. It supports both built-in metrics and custom (LLM) metrics, making it an ideal tool for evaluating complex language tasks. With [MLflow LLM Evaluate](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate), you can:
+
+- Evaluate models against multiple metrics simultaneously
+- Use pre-defined metrics for specific model types (e.g., question-answering, text-summarization and pure text)
+- Create custom metrics, including those that use LLMs as judges using [mlflow.metrics.genai.make_genai_metric()](https://mlflow.org/docs/latest/python_api/mlflow.metrics.html#mlflow.metrics.genai.make_genai_metric)
+ and
+ [mlflow.metrics.genai.make_genai_metric_from_prompt()](https://mlflow.org/docs/latest/python_api/mlflow.metrics.html#mlflow.metrics.genai.make_genai_metric_from_prompt)
+
+![MLflow Evaluate](mlflow_evaluate.drawio.svg)
+
+## Conquering new markets with an LLM as a judge
+
+Imagine you're part of a global travel agency, "Worldwide WanderAgency," that's expanding its reach to Spanish-speaking countries.
+
+Your team has developed an AI-powered translation system to help create culturally appropriate marketing materials and customer communications. However, as you begin to use this system, you realize that traditional evaluation metrics, such as BLEU (Bilingual Evaluation Understudy), fall short in capturing the nuances of language translation, especially when it comes to preserving cultural context and idiomatic expressions.
+
+For instance, consider the phrase "kick the bucket." A direct translation might focus on the literal words, but the idiom actually means "to die." A traditional metric like BLEU may incorrectly evaluate the translation as adequate if the translated words match a reference translation, even if the cultural meaning is lost. In such cases, the metric might score the translation highly despite it being completely inappropriate in context. This could lead to embarrassing or culturally insensitive marketing content, which is something your team wants to avoid.
+
+You need a way to evaluate whether the translation not only is accurate but also preserves the intended meaning, tone, and cultural context. This is where [MLflow Evaluate](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate) and LLMs (Large Language Models) as judges come into play. These tools can assess translations more holistically by considering context, idiomatic expressions, and cultural relevance, providing a more reliable evaluation of the AI’s output.
+
+## Custom Metrics: Tailoring Evaluation to Your Needs
+
+In the following section, we’ll implement three metrics:
+
+- The `"cultural_sensitivity"` metric ensures translations maintain cultural context and appropriateness.
+- The `"faithfulness"` metric checks that chatbot responses align accurately with company policies and retrieved content.
+- The `"toxicity"` metric evaluates responses for harmful or inappropriate content, ensuring respectful customer interactions.
+
+These metrics will help Worldwide WanderAgency ensure their AI-driven translations and interactions meet their specific needs.
+
+## Evaluating Worldwide WanderAgency's AI Systems
+
+Now that we understand WanderAgency's challenges, let's dive into a code walkthrough to address them. We'll implement custom metrics to measure AI performance and build a gauge visualization chart for sharing results with stakeholders.
+
+We'll start by evaluating a language translation model, focusing on the "cultural_sensitivity" metric to ensure it preserves cultural nuances. This will help WanderAgency maintain high standards in global communication.
+
+### Cultural Sensitivity Metric
+
+The travel agency wants to ensure their translations are not only accurate but also culturally appropriate.
+To achieve this they are considering creating a custom metric that allows Worldwide WanderAgency to quantify how well their translations maintain cultural context and idiomatic expressions.
+
+For instance, a phrase that is polite in one culture might be inappropriate in another.
+In English, addressing someone as "Dear" in a professional email might be seen as polite. However, in Spanish, using "Querido" in a professional context can be too personal and inappropriate.
+
+How can we evaluate such an abstract concept in a systematic way? Traditional metrics fall short here, so we need a better approach. In this case, an LLM as a judge is a great fit!
+For this use case, let's create a "cultural_sensitivity" metric.
+
+Here's a brief overview of the process:
+Start by installing all the necessary libraries for this demo to work.
+
+```bash
+pip install "mlflow>=2.14.1" openai transformers torch torchvision evaluate datasets tiktoken fastapi rouge_score textstat tenacity plotly ipykernel "nbformat>=5.10.4"
+```
+
+We will be using GPT-3.5 and GPT-4 in this example, so let's start by making sure our [OpenAI key is set up](https://mlflow.org/docs/latest/llms/openai/notebooks/openai-quickstart.html#API-Key-Security-Overview).
+
+Import the necessary libraries.
+
+```python
+import mlflow
+import os
+
+# Run a quick validation that we have an entry for the OPENAI_API_KEY within environment variables
+
+assert "OPENAI_API_KEY" in os.environ, "OPENAI_API_KEY environment variable must be set"
+
+import openai
+import pandas as pd
+```
+
+When using the [`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate) function, your large language model (LLM) can take one of the following forms:
+
+1. A `mlflow.pyfunc.PyFuncModel()` — typically an MLflow model.
+2. A Python function that accepts strings as inputs and returns a single string as output.
+3. An `MLflow Deployments` endpoint URI.
+4. `model=None` if the data you are providing has already been scored by a model, and you do not need to specify one.
+
+For this example, we will use an MLflow model.
+
+We’ll begin by logging a translation model in MLflow. For this tutorial, we'll use GPT-3.5 with a defined system prompt.
+
+In a production environment, you would typically experiment with different prompts and models to determine the most suitable configuration for your use case. For more details, refer to MLflow’s [Prompt Engineering UI](https://mlflow.org/docs/latest/llms/prompt-engineering/index.html).
+
+```python
+
+system_prompt = "Translate the following sentences into Spanish"
+# Let's set up an experiment to make it easier to track our results
+mlflow.set_experiment("/Path/to/your/experiment")
+
+basic_translation_model = mlflow.openai.log_model(
+ model="gpt-3.5-turbo",
+ task=openai.chat.completions,
+ artifact_path="model",
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": "{user_input}"},
+ ],
+)
+```
+
+Let's test the model to make sure it works.
+
+```python
+model = mlflow.pyfunc.load_model(basic_translation_model.model_uri)
+
+model.predict("Hello, how are you?")
+
+# Output = ['¡Hola, ¿cómo estás?']
+```
+
+To use [`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate), we first need to prepare sample data that will serve as input to our LLM. In this scenario, the input would consist of the content the company is aiming to translate.
+
+For demonstration purposes, we will define a set of common English expressions that we want the model to translate.
+
+```python
+# Prepare evaluation data
+eval_data = pd.DataFrame(
+ {
+ "llm_inputs": [
+ "I'm over the moon about the news!",
+ "Spill the beans.",
+ "Bite the bullet.",
+ "Better late than never.",
+
+ ]
+ }
+)
+```
+
+To meet the objectives of the travel agency, we will define custom metrics that evaluate the quality of translations. In particular, we need to assess how faithfully the translations capture not only the literal meaning but also cultural nuances.
+
+By default, [`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate) uses `openai:/gpt-4` as the evaluation model. However, you also have the option to use a [local model for evaluation](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model), such as a model wrapped in a PyFunc (e.g., Ollama).
+
+For this example, we will use GPT-4 as the evaluation model.
+
+To begin, provide a few examples that illustrate good and poor translation scores.
+
+```python
+# Define the custom metric
+cultural_sensitivity = mlflow.metrics.genai.make_genai_metric(
+ name="cultural_sensitivity",
+ definition="Assesses how well the translation preserves cultural nuances and idioms.",
+ grading_prompt="Score from 1-5, where 1 is culturally insensitive and 5 is highly culturally aware.",
+ examples=[
+ mlflow.metrics.genai.EvaluationExample(
+ input="Break a leg!",
+ output="¡Rómpete una pierna!",
+ score=2,
+ justification="This is a literal translation that doesn't capture the idiomatic meaning."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="Break a leg!",
+ output="¡Mucha mierda!",
+ score=5,
+ justification="This translation uses the equivalent Spanish theater idiom, showing high cultural awareness."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="It's raining cats and dogs.",
+ output="Está lloviendo gatos y perros.",
+ score=1,
+ justification="This literal translation does not convey the idiomatic meaning of heavy rain."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="It's raining cats and dogs.",
+ output="Está lloviendo a cántaros.",
+ score=5,
+ justification="This translation uses a Spanish idiom that accurately conveys the meaning of heavy rain."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="Kick the bucket.",
+ output="Patear el balde.",
+ score=1,
+ justification="This literal translation fails to convey the idiomatic meaning of dying."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="Kick the bucket.",
+ output="Estirar la pata.",
+ score=5,
+ justification="This translation uses the equivalent Spanish idiom for dying, showing high cultural awareness."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="Once in a blue moon.",
+ output="Una vez en una luna azul.",
+ score=2,
+ justification="This literal translation does not capture the rarity implied by the idiom."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="Once in a blue moon.",
+ output="De vez en cuando.",
+ score=4,
+ justification="This translation captures the infrequency but lacks the idiomatic color of the original."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="The ball is in your court.",
+ output="La pelota está en tu cancha.",
+ score=3,
+ justification="This translation is understandable but somewhat lacks the idiomatic nuance of making a decision."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="The ball is in your court.",
+ output="Te toca a ti.",
+ score=5,
+ justification="This translation accurately conveys the idiomatic meaning of it being someone else's turn to act."
+ )
+ ],
+ model="openai:/gpt-4",
+ parameters={"temperature": 0.0},
+)
+```
+
+### The Toxicity Metric
+
+In addition to this custom metric, let's use MLflow's built-in metrics for the evaluators. In this case, MLflow will use the roberta-hate-speech model to detect [toxicity](https://huggingface.co/spaces/evaluate-measurement/toxicity). This metric evaluates responses for any harmful or inappropriate content, reinforcing the company's commitment to a positive customer experience.
+
+```python
+# Log and evaluate the model
+with mlflow.start_run() as run:
+ results = mlflow.evaluate(
+ basic_translation_model.model_uri,
+ data=eval_data,
+ model_type="text",
+ evaluators="default",
+ extra_metrics=[cultural_sensitivity],
+ evaluator_config={
+ "col_mapping": {
+ "inputs": "llm_inputs",
+ }}
+ )
+
+mlflow.end_run()
+```
+
+You can retrieve the final results as follows:
+
+```python
+results.tables["eval_results_table"]
+```
+
+| | llm_inputs | outputs | token_count | toxicity/v1/score | flesch_kincaid_grade_level/v1/score | ari_grade_level/v1/score | cultural_sensitivity/v1/score | cultural_sensitivity/v1/justification |
+| --- | --------------------------------- | ---------------------------- | ----------- | ----------------- | ----------------------------------- | ------------------------ | ----------------------------- | ------------------------------------------------- |
+| 0 | I'm over the moon about the news! | ¡Estoy feliz por la noticia! | 9 | 0.000258 | 5.2 | 3.7 | 4 | The translation captures the general sentiment... |
+| 1 | Spill the beans. | Revela el secreto. | 7 | 0.001017 | 9.2 | 5.2 | 5 | The translation accurately captures the idioma... |
+| 2 | Bite the bullet. | Morder la bala. | 7 | 0.001586 | 0.9 | 3.6 | 2 | The translation "Morder la bala" is a litera... |
+| 3 | Better late than never. | Más vale tarde que nunca. | 7 | 0.004947 | 0.5 | 0.9 | 5 | The translation accurately captures the idioma... |
+
+Let's analyze the final metrics...
+
+```python
+cultural_sensitivity_score = results.metrics['cultural_sensitivity/v1/mean']
+print(f"Cultural Sensitivity Score: {cultural_sensitivity_score}")
+
+toxicity_score = results.metrics['toxicity/v1/mean']
+# Calculate non-toxicity score
+non_toxicity_score = "{:.2f}".format((1 - toxicity_score) * 100)
+print(f"Non-Toxicity Score: {non_toxicity_score}%")
+
+```
+
+Output:
+
+```bash
+Cultural Sensitivity Score: 3.75
+Non-Toxicity Score: 99.80%
+```
+
+It is often the case that we want to monitor and track these metrics on a dashboard so that both data scientists and stakeholders understand the performance and reliability of these solutions.
+
+For this example let's create a gauge to display the final metric.
+
+```python
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+
+def create_gauge_chart(value1, title1, value2, title2):
+ # Create a subplot figure with two columns
+ fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'indicator'}, {'type': 'indicator'}]])
+
+ # Add the first gauge chart
+ fig.add_trace(go.Indicator(
+ mode = "gauge+number",
+ value = value1,
+ title = {'text': title1},
+ gauge = {'axis': {'range': [None, 5]}}
+ ), row=1, col=1)
+
+ # Add the second gauge chart
+ fig.add_trace(go.Indicator(
+ mode = "gauge+number",
+ value = value2,
+ title = {'text': title2},
+ gauge = {'axis': {'range': [None, 100]}}
+ ), row=1, col=2)
+
+ # Update layout
+ fig.update_layout(height=400, width=800)
+
+ # Show figure
+ fig.show()
+```
+
+```python
+create_gauge_chart(cultural_sensitivity_score, "Cultural Sensitivity Score", float(non_toxicity_score), "Non-Toxicity Score")
+```
+
+![Gauge Chart](gauge.png)
+
+### The Faithfulness Metric
+
+As Worldwide WanderAgency's AI grows, they add a customer service chatbot that handles questions in multiple languages. This chatbot uses a RAG (Retrieval-Augmented Generation) system, which means it retrieves information from a database or documents and then generates an answer based on that information.
+
+It's important that the answers provided by the chatbot stay true to the information it retrieves. To make sure of this, we create a "faithfulness" metric. This metric checks how well the chatbot's responses match the materials it’s supposed to be based on, ensuring the information given to customers is accurate.
+
+For example, if the retrieved document says "Returns are accepted within 30 days," and the chatbot replies with "Our return policy is flexible and varies by region," it is not aligning well with the retrieved material. This inaccurate response (poor faithfulness) could mislead customers and create confusion.
+
+### Using MLflow to Evaluate RAG - Faithfulness
+
+Let's evaluate how well our chatbot sticks to the retrieved information. Instead of using an MLflow model this time, we’ll pass a custom function to `mlflow.evaluate()`, define a faithfulness metric, and see how aligned the chatbot's answers are with the data it retrieves.
+
+```python
+# Prepare evaluation data
+eval_data = pd.DataFrame(
+ {
+ "llm_inputs": [
+ """Question: What is the company's policy on employee training?
+context: "Our company offers various training programs to support employee development. Employees are required to complete at least one training course per year related to their role. Additional training opportunities are available based on performance reviews." """,
+ """Question: What is the company's policy on sick leave?
+context: "Employees are entitled to 10 days of paid sick leave per year. Sick leave can be used for personal illness or to care for an immediate family member. A doctor's note is required for sick leave exceeding three consecutive days." """,
+ """Question: How does the company handle performance reviews?
+context: "Performance reviews are conducted annually. Employees are evaluated based on their job performance, goal achievement, and overall contribution to the team. Feedback is provided, and development plans are created to support employee growth." """,
+ ]
+ }
+)
+
+```
+
+Now let's define some examples for this faithfulness metric.
+
+```python
+examples = [
+ mlflow.metrics.genai.EvaluationExample(
+ input="""Question: What is the company's policy on remote work?
+context: "Our company supports a flexible working environment. Employees can work remotely up to three days a week, provided they maintain productivity and attend all mandatory meetings." """,
+ output="Employees can work remotely up to three days a week if they maintain productivity and attend mandatory meetings.",
+ score=5,
+ justification="The answer is accurate and directly related to the question and context provided."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="""Question: What is the company's policy on remote work?
+context: "Our company supports a flexible working environment. Employees can work remotely up to three days a week, provided they maintain productivity and attend all mandatory meetings." """,
+ output="Employees are allowed to work remotely as long as they want.",
+ score=2,
+ justification="The answer is somewhat related but incorrect because it does not mention the three-day limit."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="""Question: What is the company's policy on remote work?
+context: "Our company supports a flexible working environment. Employees can work remotely up to three days a week, provided they maintain productivity and attend all mandatory meetings." """,
+ output="Our company supports flexible work arrangements.",
+ score=3,
+ justification="The answer is related to the context but does not specifically answer the question about the remote work policy."
+ ),
+ mlflow.metrics.genai.EvaluationExample(
+ input="""Question: What is the company's annual leave policy?
+context: "Employees are entitled to 20 days of paid annual leave per year. Leave must be approved by the employee's direct supervisor and should be planned in advance to ensure minimal disruption to work." """,
+ output="Employees are entitled to 20 days of paid annual leave per year, which must be approved by their supervisor.",
+ score=5,
+ justification="The answer is accurate and directly related to the question and context provided."
+ )]
+
+# Define the custom metric
+faithfulness = mlflow.metrics.genai.make_genai_metric(
+ name="faithfulness",
+ definition="Assesses how well the answer relates to the question and provided context.",
+ grading_prompt="Score from 1-5, where 1 is not related at all and 5 is highly relevant and accurate.",
+ examples=examples)
+
+```
+
+Define our LLM function (in this case, it can be any function that follows the input/output formats supported by [`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate)).
+
+```python
+# Using custom function
+def my_llm(inputs):
+ answers = []
+ system_prompt = "Please answer the following question in formal language based on the context provided."
+ for index, row in inputs.iterrows():
+ print('INPUTS:', row)
+ completion = openai.chat.completions.create(
+ model="gpt-3.5-turbo",
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": f"{row}"},
+ ],
+ )
+ answers.append(completion.choices[0].message.content)
+
+ return answers
+```
+
+The resulting code is similar to what we did before...
+
+```python
+with mlflow.start_run() as run:
+ results = mlflow.evaluate(
+ my_llm,
+ eval_data,
+ model_type="text",
+ evaluators="default",
+ extra_metrics=[faithfulness],
+ evaluator_config={
+ "col_mapping": {
+ "inputs": "llm_inputs",
+ }}
+ )
+mlflow.end_run()
+```
+
+### GenAI Metrics
+
+Alternatively, we can leverage MLflow's built-in metrics for generative AI, using the same examples.
+
+MLflow provides several [built-in metrics](https://mlflow.org/docs/latest/python_api/mlflow.metrics.html?highlight=genai%20answer#generative-ai-metrics) that use an LLM as a judge. Despite differences in implementation, these metrics are used in the same way. Simply include them in the `extra_metrics` argument of the [`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate) function.
+
+In this case, we will use MLflow’s built-in [faithfulness metric](https://mlflow.org/docs/latest/python_api/mlflow.metrics.html?highlight=genai%20answer#mlflow.metrics.genai.faithfulness).
+
+```python
+from mlflow.metrics.genai import EvaluationExample, faithfulness
+faithfulness_metric = faithfulness(model="openai:/gpt-4")
+print(faithfulness_metric)
+```
+
+[`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate) simplifies the process of providing grading context, such as the documents retrieved by our system, directly into the evaluation. This feature integrates seamlessly with [LangChain's retrievers](https://python.langchain.com/docs/concepts/#retrievers), allowing you to supply the context for evaluation as a dedicated column. For more details, refer to [this example](https://mlflow.org/docs/latest/llms/llm-evaluate/notebooks/rag-evaluation-llama2.html).
+
+In this case, since our retrieved documents are already included within the final prompt and we are not leveraging LangChain for this tutorial, we will simply map the `llm_input` column as our grading context.
+
+```python
+with mlflow.start_run() as run:
+ results = mlflow.evaluate(
+ my_llm,
+ eval_data,
+ model_type="text",
+ evaluators="default",
+ extra_metrics=[faithfulness_metric],
+ evaluator_config={
+ "col_mapping": {
+ "inputs": "llm_inputs",
+ "context": "llm_inputs",
+ }}
+ )
+mlflow.end_run()
+```
+
+After the evaluation we get the following results:
+![Gauge faithfulness Chart](faithfulness.png)
+
+## Conclusion
+
+By combining the Cultural Sensitivity score with our other calculated metrics, our travel agency can further refine its model to ensure the delivery of high-quality content across all languages. Moving forward, we can revisit and adjust the prompts used to boost our Cultural Sensitivity score. Alternatively, we could fine-tune a smaller model to maintain the same high level of cultural sensitivity while reducing costs. These steps will help us provide even better service to the agency's diverse customer base.
+
+[`mlflow.evaluate()`](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate), combined with LLMs as judges, opens up new possibilities for nuanced and context-aware model evaluation. By creating custom metrics tailored to specific aspects of model performance, data scientists can gain deeper insights into their models' strengths and weaknesses.
+
+The flexibility offered by `make_genai_metric()` allows you to create evaluation criteria that are perfectly suited to your specific use case. Whether you need structured guidance for your LLM judge or want full control over the prompting process, MLflow provides the tools you need.
+
+As you explore MLflow evaluate and LLM-based metrics, remember that the key lies in designing thoughtful evaluation criteria and providing clear instructions to your LLM judge. With these tools at your disposal, you're well-equipped to take your model evaluation to the next level, ensuring that your language models not only perform well on traditional metrics but also meet the nuanced requirements of real-world applications.
+
+The built-in metrics, such as toxicity, offer standardized assessments that are crucial for ensuring the safety and accessibility of model outputs.
+
+As a final challenge, re-run all of the tests performed, but this time with "gpt-4o-mini", and see how the performance is affected.
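+
+One way to get started is to point the custom function at the new model and reuse the same `mlflow.evaluate()` calls from above. The sketch below only swaps the model name; everything else is unchanged (access to `gpt-4o-mini` on your OpenAI account is assumed):
+
+```python
+import openai
+
+
+def my_llm_4o_mini(inputs):
+    answers = []
+    system_prompt = "Please answer the following question in formal language based on the context provided."
+    for _, row in inputs.iterrows():
+        completion = openai.chat.completions.create(
+            model="gpt-4o-mini",  # the only change from the earlier custom function
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"{row}"},
+            ],
+        )
+        answers.append(completion.choices[0].message.content)
+    return answers
+```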
diff --git a/website/blog/2024-10-03-llm-as-judge/mlflow_evaluate.drawio.svg b/website/blog/2024-10-03-llm-as-judge/mlflow_evaluate.drawio.svg
new file mode 100644
index 000000000..472565026
--- /dev/null
+++ b/website/blog/2024-10-03-llm-as-judge/mlflow_evaluate.drawio.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/website/blog/2024-10-25-llama-index-workflow/index.md b/website/blog/2024-10-25-llama-index-workflow/index.md
new file mode 100644
index 000000000..f13be47f1
--- /dev/null
+++ b/website/blog/2024-10-25-llama-index-workflow/index.md
@@ -0,0 +1,440 @@
+---
+title: Building Advanced RAG with MLflow and LlamaIndex Workflow
+description: A guide for using LlamaIndex Workflow with MLflow for building an advanced QA application.
+slug: mlflow-llama-index-workflow
+authors: [yuki-watanabe]
+tags: [genai, mlops, mlflow-evaluate]
+thumbnail: /img/blog/llama-index-thumbnail.png
+---
+
+![Thumbnail](llama_index_workflow_title.png)
+
+Augmenting LLMs with various data sources is a powerful strategy for building LLM applications. However, as the system grows more complex, it becomes challenging to prototype and iteratively improve it.
+
+LlamaIndex Workflow is a great framework to build such compound systems. Combined with MLflow, the Workflow API brings efficiency and robustness to the development cycle, enabling easy debugging, experiment tracking, and evaluation for continuous improvement.
+
+In this blog, we will go through the journey of building a sophisticated chatbot with LlamaIndex's Workflow API and MLflow.
+
+## What is LlamaIndex Workflow?
+
+[LlamaIndex Workflow](https://docs.llamaindex.ai/en/stable/module_guides/workflow/) is an event-driven orchestration framework for designing dynamic AI applications. The core of LlamaIndex Workflow consists of:
+
+- `Steps` are units of execution, representing distinct actions in the workflow.
+
+- `Events` trigger these steps, acting as signals that control the workflow’s flow.
+
+- `Workflow` connects these two as a Python class. Each step is implemented as a method of the workflow class, defined with input and output events.
+
+This simple yet powerful abstraction allows you to break down complex tasks into manageable steps, enabling greater flexibility and scalability. As a framework embodying event-driven design, using the `Workflow` APIs makes it intuitive to design parallel and asynchronous execution flows, significantly enhancing the efficiency of long-running tasks and helping to provide production-ready scalability.
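+
+As a minimal illustration, a two-step workflow can be sketched as follows (the event and class names here are made up for this example and are not part of the tutorial's code):
+
+```python
+from llama_index.core.workflow import Event, StartEvent, StopEvent, Workflow, step
+
+
+class GreetEvent(Event):
+    name: str
+
+
+class HelloWorkflow(Workflow):
+    @step
+    async def prepare(self, ev: StartEvent) -> GreetEvent:
+        # StartEvent carries the keyword arguments passed to `workflow.run()`
+        return GreetEvent(name=ev.name)
+
+    @step
+    async def greet(self, ev: GreetEvent) -> StopEvent:
+        # Returning a StopEvent ends the workflow and surfaces the result
+        return StopEvent(result=f"Hello, {ev.name}!")
+
+
+# result = await HelloWorkflow(timeout=10).run(name="MLflow")
+```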
+
+## Why Use MLflow with LlamaIndex Workflow?
+
+Workflow provides great flexibility to design nearly arbitrary execution flows. However, with this great power comes a great responsibility. Without managing your changes properly, it can become a chaotic mess of indeterminate states and confusing configurations. After a few dozen changes, you may be asking yourself, "how did my workflow even work?".
+
+**MLflow** brings a powerful MLOps harness to LlamaIndex Workflows throughout the end-to-end development cycle.
+
+- **Experiment Tracking**: MLflow allows you to record various components like steps, prompts, LLMs, and tools, making it easy to improve the system iteratively.
+
+- **Reproducibility**: MLflow packages environment information such as global configurations (`Settings`), library versions, and metadata to ensure consistent deployment across different stages of the ML lifecycle.
+
+- **Tracing**: Debugging issues in a complex event-driven workflow is cumbersome. MLflow Tracing is a production-ready observability solution that natively integrates with LlamaIndex, giving you visibility into each internal stage of your Workflow.
+
+- **Evaluation**: Measuring is a crucial task for improving your model. MLflow Evaluation is a great tool for evaluating the quality, speed, and cost of your LLM application. It is tightly integrated with MLflow's experiment tracking capabilities, streamlining the process of making iterative improvements.
+
+## Let's Build!🛠️
+
+### Strategy: Hybrid Approach Using Multiple Retrieval Methods
+
+Retrieval-Augmented Generation (RAG) is a powerful framework, but the retrieval step can often become a bottleneck, because embedding-based retrieval may not always capture the most relevant context. While many techniques exist to improve retrieval quality, no single solution works universally. Therefore, an effective strategy is to combine multiple retrieval approaches.
+
+The concept we will explore here is to run several retrieval methods in parallel: (1) standard vector search, (2) keyword-based search (BM25), and (3) web search. The retrieved contexts are then merged, with irrelevant data filtered out to enhance the overall quality.
+
+![Hybrid RAG Concept](llama_index_workflow_hybrid_rag_concept.png)
+
+How do we bring this concept to life? Let’s dive in and build this hybrid RAG using LlamaIndex Workflow and MLflow.
+
+## 1. Set Up Repository
+
+The sample code, including the environment setup script, is available in the [GitHub repository](https://github.com/mlflow/mlflow/tree/master/examples/llama_index/workflow). It contains a complete workflow definition, a hands-on notebook, and a sample dataset for running experiments. To clone it to your working environment, use the following command:
+
+```shell
+git clone https://github.com/mlflow/mlflow.git
+```
+
+After cloning the repository, set up the virtual environment by running:
+
+```shell
+cd mlflow/examples/llama_index/workflow
+chmod +x install.sh
+./install.sh
+```
+
+Once the installation is complete, start Jupyter Notebook within the Poetry environment using:
+
+```shell
+poetry run jupyter notebook
+```
+
+Next, open the `Tutorial.ipynb` notebook located in the root directory. Throughout this blog, we will walk through this notebook to guide you through the development process.
+
+## 2. Start an MLflow Experiment
+
+An **MLflow Experiment** is where you track all aspects of model development, including model definitions, configurations, parameters, dependency versions, and more. Let’s start by creating a new MLflow experiment called "LlamaIndex Workflow RAG":
+
+```python
+import mlflow
+
+mlflow.set_experiment("LlamaIndex Workflow RAG")
+```
+
+At this point, the experiment doesn't have any recorded data yet. To view the experiment in the MLflow UI, open a new terminal and run the `mlflow ui` command, then navigate to the provided URL in your browser:
+
+```shell
+poetry run mlflow ui
+```
+
+![Empty MLflow Experiment](llama_index_workflow_empty_experiment.png)
+
+## 3. Choose your LLM and Embeddings
+
+Now, set your preferred LLM and embedding models on LlamaIndex's Settings object. These models will be used throughout the LlamaIndex components.
+
+For this demonstration, we’ll use OpenAI models, but you can easily switch to different LLM providers or local models by following the instructions in the notebook.
+
+```python
+import getpass
+import os
+
+os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key")
+
+from llama_index.core import Settings
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI
+
+# LlamaIndex by default uses OpenAI APIs for LLMs and embedding models. You can use the default
+# models (`gpt-3.5-turbo` and `text-embedding-ada-002` as of Oct 2024), but we recommend using the
+# latest efficient models instead to get better results at a lower cost.
+Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")
+Settings.llm = OpenAI(model="gpt-4o-mini")
+```
+
+💡 _MLflow will automatically log the `Settings` configuration into your MLflow Experiment when logging models, ensuring reproducibility and reducing the risk of discrepancies between environments._
+
+## 4. Set Up Web Search API
+
+Later in this blog, we will add a web search capability to the QA bot. We will use Tavily AI, a search API
+optimized for LLM applications and natively integrated with LlamaIndex. Visit [their website](https://tavily.com/) to
+get an API key for free-tier use, or use a different search engine integrated with LlamaIndex, e.g. [GoogleSearchToolSpec](https://docs.llamaindex.ai/en/stable/api_reference/tools/google/#llama_index.tools.google.GoogleSearchToolSpec).
+
+Once you get the API key, set it to the environment variable:
+
+```python
+os.environ["TAVILY_AI_API_KEY"] = getpass.getpass("Enter Tavily AI API Key")
+```
+
+## 5. Set Up Document Indices for Retrieval
+
+The next step is to build a document index for retrieval from MLflow documentation. The `urls.txt` file in the `data` directory contains a list of MLflow documentation pages. These pages can be loaded as document objects using the web page reader utility.
+
+```python
+from llama_index.readers.web import SimpleWebPageReader
+
+with open("data/urls.txt", "r") as file:
+ urls = [line.strip() for line in file if line.strip()]
+
+documents = SimpleWebPageReader(html_to_text=True).load_data(urls)
+```
+
+Next, ingest these documents into a vector database. In this tutorial, we’ll use the [Qdrant](https://qdrant.tech/) vector store, which is free if self-hosted. If Docker is installed on your machine, you can start the Qdrant database by running the official Docker container:
+
+```shell
+$ docker pull qdrant/qdrant
+$ docker run -p 6333:6333 -p 6334:6334 \
+ -v $(pwd)/.qdrant_storage:/qdrant/storage:z \
+ qdrant/qdrant
+```
+
+Once the container is running, you can create an index object that connects to the Qdrant database:
+
+```python
+import qdrant_client
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+
+client = qdrant_client.QdrantClient(host="localhost", port=6333)
+vector_store = QdrantVectorStore(client=client, collection_name="mlflow_doc")
+
+from llama_index.core import StorageContext, VectorStoreIndex
+
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+index = VectorStoreIndex.from_documents(
+ documents=documents,
+ storage_context=storage_context
+)
+```
+
+Of course, you can use your preferred vector store here. LlamaIndex supports a variety of vector databases, such as [FAISS](https://docs.llamaindex.ai/en/stable/examples/vector_stores/FaissIndexDemo/), [Chroma](https://docs.llamaindex.ai/en/stable/examples/vector_stores/ChromaIndexDemo/), and [Databricks Vector Search](https://docs.llamaindex.ai/en/stable/examples/vector_stores/DatabricksVectorSearchDemo/). If you choose an alternative, follow the relevant LlamaIndex documentation and update the `workflow/workflow.py` file accordingly.
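+
+For example, swapping in Chroma could look roughly like the sketch below (it assumes the `llama-index-vector-stores-chroma` and `chromadb` packages are installed and is not part of the tutorial repository):
+
+```python
+import chromadb
+from llama_index.core import StorageContext, VectorStoreIndex
+from llama_index.vector_stores.chroma import ChromaVectorStore
+
+# Create (or open) a local persistent Chroma collection
+chroma_client = chromadb.PersistentClient(path="./.chroma_storage")
+collection = chroma_client.get_or_create_collection("mlflow_doc")
+
+vector_store = ChromaVectorStore(chroma_collection=collection)
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+# `documents` is the list loaded with SimpleWebPageReader above
+index = VectorStoreIndex.from_documents(
+    documents=documents,
+    storage_context=storage_context,
+)
+```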
+
+In addition to evaluating the vector search retrieval, we will assess the keyword-based retriever (BM25) later. Let's set up local document storage to enable BM25 retrieval in the workflow.
+
+```python
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.retrievers.bm25 import BM25Retriever
+
+splitter = SentenceSplitter(chunk_size=512)
+nodes = splitter.get_nodes_from_documents(documents)
+bm25_retriever = BM25Retriever.from_defaults(nodes=nodes)
+bm25_retriever.persist(".bm25_retriever")
+```
+
+## 6. Define a Workflow
+
+Now that the environment and data sources are ready, we can build the workflow and experiment with it. The complete workflow code is defined in the `workflow` directory. Let's explore some key components of the implementation.
+
+### Events
+
+The `workflow/events.py` file defines all the events used within the workflow. These are simple Pydantic models that carry information between workflow steps. For example, the `VectorSearchRetrieveEvent` triggers the vector search step by passing the user's query.
+
+```python
+class VectorSearchRetrieveEvent(Event):
+ """Event for triggering VectorStore index retrieval step."""
+ query: str
+```
+
+### Prompts
+
+Throughout the workflow execution, we call LLMs multiple times. The prompt templates for these LLM calls are defined in the `workflow/prompts.py` file.
+
+### Workflow Class
+
+The main workflow class is defined in `workflow/workflow.py`. Let's break down how it works.
+
+The constructor accepts a retrievers argument, which specifies the retrieval methods to be used in the workflow. For instance, if `["vector_search", "bm25"]` is passed, the workflow performs vector search and keyword-based search, skipping web search.
+
+💡 Deciding which retrievers to utilize dynamically allows us to experiment with different retrieval strategies without needing to replicate nearly identical model code.
+
+```python
+class HybridRAGWorkflow(Workflow):
+
+ VALID_RETRIEVERS = {"vector_search", "bm25", "web_search"}
+
+ def __init__(self, retrievers=None, **kwargs):
+ super().__init__(**kwargs)
+ self.llm = Settings.llm
+ self.retrievers = retrievers or []
+
+ if invalid_retrievers := set(self.retrievers) - self.VALID_RETRIEVERS:
+ raise ValueError(f"Invalid retrievers specified: {invalid_retrievers}")
+
+ self._use_vs_retriever = "vector_search" in self.retrievers
+ self._use_bm25_retriever = "bm25" in self.retrievers
+ self._use_web_search = "web_search" in self.retrievers
+
+ if self._use_vs_retriever:
+ qd_client = qdrant_client.QdrantClient(host=_QDRANT_HOST, port=_QDRANT_PORT)
+ vector_store = QdrantVectorStore(client=qd_client, collection_name=_QDRANT_COLLECTION_NAME)
+ index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+ self.vs_retriever = index.as_retriever()
+
+ if self._use_bm25_retriever:
+ self.bm25_retriever = BM25Retriever.from_persist_dir(_BM25_PERSIST_DIR)
+
+ if self._use_web_search:
+ self.tavily_tool = TavilyToolSpec(api_key=os.environ.get("TAVILY_AI_API_KEY"))
+```
+
+The workflow begins by executing a step that takes the `StartEvent` as input, which is the `route_retrieval` step in this case. This step inspects the retrievers parameter and triggers the necessary retrieval steps. By using the `send_event()` method of the context object, multiple events can be dispatched in parallel from this single step.
+
+```python
+ # If no retriever is specified, proceed directly to the final query step with an empty context
+ if len(self.retrievers) == 0:
+ return QueryEvent(context="")
+
+ # Trigger the retrieval steps based on the configuration
+ if self._use_vs_retriever:
+ ctx.send_event(VectorSearchRetrieveEvent(query=query))
+ if self._use_bm25_retriever:
+ ctx.send_event(BM25RetrieveEvent(query=query))
+ if self._use_web_search:
+ ctx.send_event(TransformQueryEvent(query=query))
+```
+
+The retrieval steps are straightforward. However, the web search step is more advanced as it includes an additional step to transform the user's question into a search-friendly query using an LLM.
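+
+For reference, a retrieval step can be as small as the sketch below (the method name is illustrative and the exact signatures and event fields live in `workflow/workflow.py`; the `nodes` field on `RetrievalResultEvent` is an assumption here):
+
+```python
+    @step
+    async def retrieve_vector_search(
+        self, ctx: Context, ev: VectorSearchRetrieveEvent
+    ) -> RetrievalResultEvent:
+        """Retrieve relevant nodes from the vector index for the given query."""
+        nodes = self.vs_retriever.retrieve(ev.query)
+        return RetrievalResultEvent(nodes=nodes)
+```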
+
+The results from all the retrieval steps are aggregated in the `gather_retrieval_results` step. Here, the `ctx.collect_events()` method is used to poll for the results of the asynchronously executed steps.
+
+```python
+ results = ctx.collect_events(ev, [RetrievalResultEvent] * len(self.retrievers))
+```
+
+Passing all results from multiple retrievers often leads to a large context with unrelated or duplicate content. To address this, we need to filter and select the most relevant results. While a score-based approach is common, web search results do not return similarity scores. Therefore, we use an LLM to sort and filter out irrelevant results. The rerank step achieves this by leveraging the built-in reranker integration with [RankGPT](https://github.com/sunnweiwei/RankGPT).
+
+```python
+ reranker = RankGPTRerank(llm=self.llm, top_n=5)
+ reranked_nodes = reranker.postprocess_nodes(ev.nodes, query_str=query)
+ reranked_context = "\n".join(node.text for node in reranked_nodes)
+```
+
+Finally, the reranked context is passed to the LLM along with the user query to generate the final answer. The result is returned as a `StopEvent` with the `result` key.
+
+```python
+ @step
+ async def query_result(self, ctx: Context, ev: QueryEvent) -> StopEvent:
+ """Get result with relevant text."""
+ query = await ctx.get("query")
+
+ prompt = FINAL_QUERY_TEMPLATE.format(context=ev.context, query=query)
+ response = self.llm.complete(prompt).text
+ return StopEvent(result=response)
+```
+
+Now, let's instantiate the workflow and run it.
+
+```python
+# Workflow with VS + BM25 retrieval
+from workflow.workflow import HybridRAGWorkflow
+
+workflow = HybridRAGWorkflow(retrievers=["vector_search", "bm25"], timeout=60)
+response = await workflow.run(query="Why use MLflow with LlamaIndex?")
+print(response)
+```
+
+## 7. Log the Workflow in an MLflow Experiment
+
+Now we want to run the workflow with different retrieval strategies and evaluate the performance of each. However, before running the evaluation, we'll log the model in MLflow to track both the model and its performance within an **MLflow Experiment**.
+
+For the LlamaIndex Workflow, we use the new [Model-from-code](https://mlflow.org/docs/latest/models.html#models-from-code) method, which logs models as standalone Python scripts. This approach avoids the risks and instability associated with serialization methods like pickle, relying instead on code as the single source of truth for the model definition. When combined with MLflow's environment-freezing capability, it provides a reliable way to persist models. For more details, refer to the [MLflow documentation](https://mlflow.org/docs/latest/models.html#models-from-code).
+
+💡 In the `workflow` directory, there's a `model.py` script that imports the `HybridRAGWorkflow` and instantiates it with dynamic configurations passed via the `model_config` parameter during logging. This design allows you to track models with different configurations without duplicating the model definition.
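+
+Conceptually, such a script can look like the sketch below (refer to the repository for the actual file; the use of `ModelConfig` here is an assumption about how the configuration is read):
+
+```python
+# workflow/model.py (sketch)
+from mlflow.models import ModelConfig, set_model
+
+from workflow.workflow import HybridRAGWorkflow
+
+# The `model_config` dict passed to mlflow.llama_index.log_model() is exposed here via ModelConfig
+retrievers = ModelConfig().get("retrievers")
+
+# Tell MLflow which object this script defines as "the model"
+set_model(HybridRAGWorkflow(retrievers=retrievers, timeout=60))
+```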
+
+We'll start an MLflow Run and log the model script `model.py` with different configurations using the [mlflow.llama_index.log_model()](https://mlflow.org/docs/latest/python_api/mlflow.llama_index.html#mlflow.llama_index.log_model) API.
+
+```python
+# Different configurations we will evaluate. We don't run evaluation for all permutations
+# for demonstration purposes, but you can add as many patterns as you want.
+run_name_to_retrievers = {
+ # 1. No retrievers (prior knowledge in LLM).
+ "none": [],
+ # 2. Vector search retrieval only.
+ "vs": ["vector_search"],
+ # 3. Vector search and keyword search (BM25)
+ "vs + bm25": ["vector_search", "bm25"],
+ # 4. All retrieval methods including web search.
+ "vs + bm25 + web": ["vector_search", "bm25", "web_search"],
+}
+
+# Create an MLflow Run and log model with each configuration.
+models = []
+for run_name, retrievers in run_name_to_retrievers.items():
+ with mlflow.start_run(run_name=run_name):
+ model_info = mlflow.llama_index.log_model(
+ # Specify the model Python script.
+ llama_index_model="workflow/model.py",
+ # Specify retrievers to use.
+ model_config={"retrievers": retrievers},
+ # Define dependency files to save along with the model
+ code_paths=["workflow"],
+ # Subdirectory to save artifacts (not important)
+ artifact_path="model",
+ )
+ models.append(model_info)
+```
+
+Now open the MLflow UI again. This time it should show 4 MLflow Runs recorded with different `retrievers` parameter values. By clicking each Run name and navigating to the "Artifacts" tab, you can see that MLflow records the model and various metadata, such as dependency versions and settings.
+
+![MLflow Runs](llama_index_workflow_runs.png)
+
+## 8. Enable MLflow Tracing
+
+Before running the evaluation, there’s one final step: enabling **MLflow Tracing**. We'll dive into this feature and why we enable it here later; for now, you can turn it on with a single line of code, and MLflow will automatically trace every LlamaIndex execution.
+
+```python
+mlflow.llama_index.autolog()
+```
+
+## 9. Evaluate the Workflow with Different Retriever Strategies
+
+The example repository includes a sample evaluation dataset, `mlflow_qa_dataset.csv`, containing 30 question-answer pairs related to MLflow.
+
+```python
+import pandas as pd
+
+eval_df = pd.read_csv("data/mlflow_qa_dataset.csv")
+display(eval_df.head(3))
+```
+
+To evaluate the workflow, use the [mlflow.evaluate()](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate) API, which requires (1) your dataset, (2) the logged model, and (3) the metrics you want to compute.
+
+```python
+from mlflow.metrics import latency
+from mlflow.metrics.genai import answer_correctness
+
+
+for model_info in models:
+ with mlflow.start_run(run_id=model_info.run_id):
+ result = mlflow.evaluate(
+ # Pass the URI of the logged model above
+ model=model_info.model_uri,
+ data=eval_df,
+ # Specify the column for ground truth answers.
+ targets="ground_truth",
+ # Define the metrics to compute.
+ extra_metrics=[
+ latency(),
+ answer_correctness("openai:/gpt-4o-mini"),
+ ],
+ # The answer_correctness metric requires "inputs" column to be
+ # present in the dataset. We have "query" instead so need to
+ # specify the mapping in `evaluator_config` parameter.
+ evaluator_config={"col_mapping": {"inputs": "query"}},
+ )
+```
+
+In this example, we evaluate the model with two metrics:
+
+1. **Latency**: Measures the time taken to execute a workflow for a single query.
+2. **Answer Correctness**: Evaluates the accuracy of answers against the ground truth, scored by the OpenAI GPT-4o-mini model on a 1–5 scale.
+
+These metrics are just for demonstration purposes—you can add additional metrics like toxicity or faithfulness, or even create your own. See the MLflow documentation for the full set of [built-in metrics](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#llm-evaluation-metrics)
+and how to define [custom metrics](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#creating-custom-llm-evaluation-metrics).
+
+The evaluation process will take a few minutes. Once completed, you can view the results in the MLflow UI. Open the Experiment page and click on the chart icon 📈 above the Run list.
+
+![Evaluation Result](llama_index_workflow_result_chart.png)
+
+💡 _The evaluation results may vary depending on the model setup and inherent randomness._
+
+The first row shows bar charts for the answer correctness metrics, while the second row displays latency results. The best-performing combination is "Vector Search + BM25". Interestingly, adding web search not only increases latency significantly but also decreases answer correctness.
+
+Why does this happen? It appears some answers from the web-search-enabled model are off-topic. For example, in response to a question about starting the Model Registry, the web-search model provides an unrelated answer about model deployment, while the "vs + bm25" model offers a correct response.
+
+![Answer Comparison](llama_index_workflow_answer_comparison.png)
+
+Where did this incorrect answer come from? This seems to be a retriever issue, as we only changed the retrieval strategy. However, it's difficult to see what each retriever returned from the final result. To gain deeper insights into what's happening behind the scenes, MLflow Tracing is the perfect solution.
+
+## 10. Inspecting Quality Issues with MLflow Trace
+
+[MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) is a new feature that brings observability to LLM applications. It integrates seamlessly with LlamaIndex, recording all inputs, outputs, and metadata about intermediate steps during workflow execution. Since we called `mlflow.llama_index.autolog()` at the start, every LlamaIndex operation has been traced and recorded in the MLflow Experiment.
+
+To inspect the trace for a specific question from the evaluation, navigate to the "Traces" tab on the experiment page. Look for the row with the particular question in the request column and the run name "vs + bm25 + web." Clicking the request ID link opens the Trace UI, where you can view detailed information about each step in the execution, including inputs, outputs, metadata, and latency.
+
+![Trace](llama_index_workflow_trace.png)
+
+In this case, we identified the issue by examining the reranker step. The web search retriever returned irrelevant context related to model serving, and the reranker incorrectly ranked it as the most relevant. With this insight, we can determine potential improvements, such as refining the reranker to better understand MLflow topics, improving web search precision, or even removing the web search retriever altogether.
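+
+If you prefer to inspect traces programmatically rather than through the UI, `mlflow.search_traces()` returns them as a pandas DataFrame (a brief sketch; column selection and filtering are left to you):
+
+```python
+import mlflow
+
+experiment = mlflow.get_experiment_by_name("LlamaIndex Workflow RAG")
+traces_df = mlflow.search_traces(experiment_ids=[experiment.experiment_id], max_results=10)
+print(traces_df.head())
+```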
+
+## Conclusion
+
+In this blog, we explored how the combination of LlamaIndex and MLflow can elevate the development of Retrieval-Augmented Generation (RAG) workflows, bringing together powerful model management and observability capabilities. By integrating multiple retrieval strategies (such as vector search, BM25, and web search) we demonstrated how flexible retrieval can enhance the performance of LLM-driven applications.
+
+- **Experiment Tracking** allowed us to organize and log different workflow configurations, ensuring reproducibility and enabling us to track model performance across multiple runs.
+- **MLflow Evaluate** enabled us to easily log and evaluate the workflow with different retriever strategies, using key metrics like latency and answer correctness to compare performance.
+- **MLflow UI** gave us a clear visualization of how various retrieval strategies impacted both accuracy and latency, helping us identify the most effective configurations.
+- **MLflow Tracing**, integrated with LlamaIndex, provided detailed observability into each step of the workflow for diagnosing quality issues, such as incorrect reranking of search results.
+
+With these tools, you have a complete framework for building, logging, and optimizing RAG workflows. As LLM technology continues to evolve, the ability to track, evaluate, and fine-tune every aspect of model performance will be essential. We highly encourage you to experiment further and see how these tools can be tailored to your own applications.
+
+To continue learning, explore the following resources:
+
+- Learn more about the [MLflow LlamaIndex integration](https://mlflow.org/docs/latest/llms/llama-index/index.html).
+- Discover additional MLflow LLM features at [LLMs in MLflow](https://mlflow.org/docs/latest/llms/index.html).
+- Deploy your workflow to a serving endpoint with [MLflow Deployment](https://mlflow.org/docs/latest/deployment/index.html).
+- Check out more [Workflow examples](https://docs.llamaindex.ai/en/stable/module_guides/workflow/#examples) from LlamaIndex.
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_answer_comparison.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_answer_comparison.png
new file mode 100644
index 000000000..3e8ebb67b
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_answer_comparison.png differ
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_empty_experiment.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_empty_experiment.png
new file mode 100644
index 000000000..4d346fba8
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_empty_experiment.png differ
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_hybrid_rag_concept.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_hybrid_rag_concept.png
new file mode 100644
index 000000000..2b18700f0
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_hybrid_rag_concept.png differ
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_result_chart.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_result_chart.png
new file mode 100644
index 000000000..8ac295d63
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_result_chart.png differ
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_runs.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_runs.png
new file mode 100644
index 000000000..f0ad1b1de
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_runs.png differ
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_title.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_title.png
new file mode 100644
index 000000000..781ef5e1b
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_title.png differ
diff --git a/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_trace.png b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_trace.png
new file mode 100644
index 000000000..ce1d8ab62
Binary files /dev/null and b/website/blog/2024-10-25-llama-index-workflow/llama_index_workflow_trace.png differ
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/action_group_decision.png b/website/blog/2024-11-07-bedrock-chat-model-part-1/action_group_decision.png
new file mode 100644
index 000000000..78d440d37
Binary files /dev/null and b/website/blog/2024-11-07-bedrock-chat-model-part-1/action_group_decision.png differ
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/bedrock_chatmodel.png b/website/blog/2024-11-07-bedrock-chat-model-part-1/bedrock_chatmodel.png
new file mode 100644
index 000000000..3563fc9f4
Binary files /dev/null and b/website/blog/2024-11-07-bedrock-chat-model-part-1/bedrock_chatmodel.png differ
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/bedrock_input_prompt.png b/website/blog/2024-11-07-bedrock-chat-model-part-1/bedrock_input_prompt.png
new file mode 100644
index 000000000..0f64d90a5
Binary files /dev/null and b/website/blog/2024-11-07-bedrock-chat-model-part-1/bedrock_input_prompt.png differ
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/index.md b/website/blog/2024-11-07-bedrock-chat-model-part-1/index.md
new file mode 100644
index 000000000..867fce22a
--- /dev/null
+++ b/website/blog/2024-11-07-bedrock-chat-model-part-1/index.md
@@ -0,0 +1,1338 @@
+---
+title: Using Bedrock Agent as an MLflow ChatModel with Tracing
+description: A guide for using Bedrock Agent Runtime with ChatModel and custom trace handling.
+slug: bedrock-chat-model-part-1
+authors: [jas-bali]
+tags: [genai, pyfunc, bedrock, tracing]
+thumbnail: /img/blog/bedrock-chatmodel.png
+---
+
+![Thumbnail](bedrock_chatmodel.png)
+
+**In this blog post, we delve into the integration of AWS Bedrock Agent as a ChatModel within MLflow, focusing on
+how to leverage Bedrock's [Action Groups](https://docs.aws.amazon.com/bedrock/latest/userguide/agents-action-create.html)
+and [Knowledge Bases](https://docs.aws.amazon.com/bedrock/latest/userguide/agents-kb-add.html) to build a
+conversational AI application. The blog will guide you through setting up the Bedrock Agent, configuring
+Action Groups to enable custom actions with Lambda, and utilizing knowledge bases for context-aware interactions.
+A special emphasis is placed on implementing tracing within MLflow.By the end of this article, you'll have a good
+understanding of how to combine AWS Bedrock's advanced features with MLflow's capabilities such as agent request
+tracing, model tracking and consistent signatures for input examples.**
+
+## What is AWS Bedrock?
+
+Amazon Bedrock is a managed service by AWS that simplifies the development of generative AI applications. It provides access to a variety of foundation models (FMs) from leading AI providers through a single API, enabling developers to build and scale AI solutions securely and efficiently.
+
+Key Components Relevant to This Integration:
+
+**Bedrock Agent**: At a high level, a Bedrock agent is an abstraction within Bedrock that consists of a foundation model,
+action groups, and knowledge bases.
+
+**Action Groups**: These are customizable sets of actions that define what tasks the Bedrock Agent can perform.
+Action Groups consist of an OpenAPI Schema and the corresponding Lambda functions that will be used to execute tool calls.
+The OpenAPI Schema is used to define APIs available for the agent to invoke and complete tasks.
+
+**Knowledge Bases**: Amazon Bedrock supports the creation of Knowledge Bases to implement
+Retrieval Augmented Generation workflows. It consists of data sources (on S3 or webpages)
+and a vector store that contains the embedded references to this data.
+
+Bedrock's agent execution process and the corresponding tracing for agent instrumentation are grouped as follows:
+
+**Pre-processing**
+This step validates, contextualizes and categorizes user input.
+
+**Orchestration**
+This step handles the interpretation of user inputs, decides when and which tasks to perform,
+and iteratively refines responses.
+
+**Post-processing (Optional)**
+This step formats the final response before returning to the user.
+
+**Traces**
+Each step above has an execution trace, which consists of rationale, actions, queries and observations at each step
+of the agent's response. This includes both the inputs and outputs of action groups and knowledge base queries.
+
+We will look at these traces in detail below.
+
+## What is a ChatModel in MLflow?
+
+The [ChatModel class](https://mlflow.org/docs/latest/llms/chat-model-guide/index.html) is specifically
+designed to make it easier to implement models that are compatible with
+popular large language model (LLM) chat APIs. It enables you to seamlessly bring in your own models or agents and
+leverage MLflow's functionality, even if those models aren't natively supported as a flavor in MLflow. Additionally,
+it provides default signatures, which are static for ChatModel, unlike PythonModel.
+
+In the following sections, we will use ChatModel to wrap the Bedrock Agent.
+
+For more detailed information about ChatModel, you can read the MLflow documentation
+[here](https://mlflow.org/docs/latest/llms/chat-model-guide/index.html) and
+[here](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.ChatModel).
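+
+To ground this, here is a minimal, hypothetical ChatModel that simply echoes the last user message back; it only illustrates the interface shape, while the Bedrock-backed implementation follows later in this post:
+
+```python
+from typing import List, Optional
+
+from mlflow.pyfunc import ChatModel
+from mlflow.types.llm import ChatChoice, ChatMessage, ChatParams, ChatResponse
+
+
+class EchoChatModel(ChatModel):
+    def predict(
+        self, context, messages: List[ChatMessage], params: Optional[ChatParams]
+    ) -> ChatResponse:
+        # Wrap the last user message in the OpenAI-style chat response schema
+        return ChatResponse(
+            choices=[
+                ChatChoice(
+                    index=0,
+                    message=ChatMessage(role="assistant", content=messages[-1].content),
+                )
+            ],
+            usage={},
+            model="echo-model",
+        )
+```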
+
+## Setting up AWS Bedrock Agent with an Action group
+
+In this section, we will deploy all components of a Bedrock agent so that we can invoke it as a `ChatModel` in MLflow.
+
+### Prerequisites
+
+You will need to set up the following items (either via the AWS console or SDKs):
+
+- Setting up role for the agent and Lambda function. [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L148)
+- Create/deploy the agent. [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L191)
+ - **Important**: Save the agent ID here as we will need this below.
+- Creating a Lambda function. [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L218)
+- Configuring IAM permissions for agent-Lambda interaction. [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L283) and [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L297)
+- Creating an action group to link the agent and Lambda. [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L312)
+ - **Important**: Save the agent alias ID here as we will need this below.
+- Deploy Bedrock agent with an alias. [Example](https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/bedrock-agent/scenario_get_started_with_agents.py#L342)
+
+In our case, we are going to deploy the following example action group, which calculates the next optimal departure
+date for a Hohmann transfer from Earth to Mars, based on the spacecraft's mass and specific impulse.
+
+### OpenAPI schema for Action Groups
+
+As described above, here is the OpenAPI Schema for our example action group:
+
+```yaml
+openapi: 3.0.0
+info:
+ title: Time API
+ version: 1.0.0
+ description: API to get the next optimal departure date for a Hohmann transfer from Earth to Mars.
+paths:
+ /get-next-mars-launch-window:
+ get:
+ summary: Gets the next optimal launch window to Mars.
+ description: Gets the next optimal launch window to Mars.
+ operationId: getNextMarsLaunchWindow
+ parameters:
+ - name: total_mass
+ in: query
+ description: Total mass of the spacecraft including fuel (kg)
+ required: true
+ schema:
+ type: string
+ - name: dry_mass
+ in: query
+ description: Mass of the spacecraft without fuel (kg).
+ required: true
+ schema:
+ type: string
+ - name: specific_impulse
+ in: query
+ description: Specific impulse of the propulsion system (s).
+ required: true
+ schema:
+ type: string
+ responses:
+ "200":
+ description: The next optimal departure date for a Hohmann transfer from Earth to Mars, based on the spacecraft's mass and specific impulse.
+ content:
+ "application/json":
+ schema:
+ type: object
+ properties:
+ next_launch_window:
+ type: string
+ description: Next Mars Launch Window
+```
+
+### Action groups - Lambda function
+
+Here is the code for the action group's example Lambda function:
+
+```python
+import json
+import math
+from datetime import datetime, timedelta
+
+
+def lambda_handler(event, context):
+ def _calculate_optimal_departure_window(
+ total_mass, dry_mass, specific_impulse
+ ):
+ """
+ Calculate the next optimal departure date for a Hohmann transfer from Earth to Mars,
+ based on the spacecraft's mass and specific impulse.
+
+ Parameters:
+ - total_mass (float): Total mass of the spacecraft including fuel (kg).
+ - dry_mass (float): Mass of the spacecraft without fuel (kg).
+ - specific_impulse (float): Specific impulse of the propulsion system (s).
+
+ Returns:
+ - dict: {
+ 'next_launch_date': datetime,
+ 'synodic_period_days': float,
+ 'transfer_time_days': float,
+ 'delta_v_available_m_s': float,
+ 'delta_v_required_m_s': float,
+ 'is_feasible': bool
+ }
+ """
+ current_date = None
+ # Constants
+ G0 = 9.80665 # m/s^2, standard gravity
+ MU_SUN = (
+ 1.32712440018e20 # m^3/s^2, standard gravitational parameter for the Sun
+ )
+ AU = 1.496e11 # meters, astronomical unit
+ EARTH_ORBITAL_PERIOD = 365.25 # days
+ MARS_ORBITAL_PERIOD = 686.98 # days
+ SYNODIC_PERIOD = 1 / abs((1 / EARTH_ORBITAL_PERIOD) - (1 / MARS_ORBITAL_PERIOD))
+ TRANSFER_TIME = 259 # days, approximate duration of Hohmann transfer
+ BASE_LAUNCH_DATE = datetime(2020, 7, 1) # A reference past launch window date
+
+ # Orbital Radii (assuming circular orbits for simplicity)
+ r1 = AU # Earth's orbital radius in meters
+ r2 = 1.524 * AU # Mars' orbital radius in meters
+
+ # Calculate Required Delta-V for Hohmann Transfer
+ # Using vis-viva equation for Hohmann transfer
+ def calculate_hohmann_delta_v(mu, r_start, r_end):
+ # Velocity of departure orbit (Earth)
+ v_start = math.sqrt(mu / r_start)
+ # Velocity of transfer orbit at departure
+ a_transfer = (r_start + r_end) / 2
+ v_transfer_start = math.sqrt(mu * (2 / r_start - 1 / a_transfer))
+ delta_v1 = v_transfer_start - v_start
+
+ # Velocity of arrival orbit (Mars)
+ v_end = math.sqrt(mu / r_end)
+ # Velocity of transfer orbit at arrival
+ v_transfer_end = math.sqrt(mu * (2 / r_end - 1 / a_transfer))
+ delta_v2 = v_end - v_transfer_end
+
+ return delta_v1, delta_v2
+
+ delta_v1, delta_v2 = calculate_hohmann_delta_v(MU_SUN, r1, r2)
+ delta_v_required = abs(delta_v1) + abs(delta_v2) # Total delta-v in m/s
+
+ # Delta-V using Tsiolkovsky Rocket Equation
+ if dry_mass <= 0 or total_mass <= dry_mass:
+ raise ValueError("Total mass must be greater than dry mass.")
+
+ delta_v_available = (
+ specific_impulse * G0 * math.log(total_mass / dry_mass)
+ ) # m/s
+
+ is_feasible = delta_v_available >= delta_v_required
+
+ if current_date is None:
+ current_date = datetime.now()
+
+ days_since_base = (current_date - BASE_LAUNCH_DATE).days
+ if days_since_base < 0:
+ # Current date is before the base launch date
+ next_launch_date = BASE_LAUNCH_DATE
+ else:
+ synodic_periods_passed = days_since_base / SYNODIC_PERIOD
+ synodic_periods_passed_int = math.floor(synodic_periods_passed)
+ next_launch_date = BASE_LAUNCH_DATE + timedelta(
+ days=(synodic_periods_passed_int + 1) * SYNODIC_PERIOD
+ )
+
+ next_launch_date = next_launch_date.replace(
+ hour=0, minute=0, second=0, microsecond=0
+ )
+
+ return {
+ "next_launch_date": next_launch_date,
+ "synodic_period_days": SYNODIC_PERIOD,
+ "transfer_time_days": TRANSFER_TIME,
+ "delta_v_available_m_s": delta_v_available,
+ "delta_v_required_m_s": delta_v_required,
+ "is_feasible": is_feasible,
+ }
+
+ query_params = {
+ param["name"]: param["value"] for param in event.get("parameters", [])
+ }
+
+ total_mass = float(query_params.get("total_mass"))
+ dry_mass = float(query_params.get("dry_mass"))
+ specific_impulse = float(query_params.get("specific_impulse"))
+
+ response = {
+ "next_launch_window": _calculate_optimal_departure_window(
+ total_mass, dry_mass, specific_impulse
+ )
+ }
+
+ # `next_launch_date` is a datetime; pass default=str so it is JSON-serializable
+ response_body = {"application/json": {"body": json.dumps(response, default=str)}}
+
+ action_response = {
+ "actionGroup": event["actionGroup"],
+ "apiPath": event["apiPath"],
+ "httpMethod": event["httpMethod"],
+ "httpStatusCode": 200,
+ "responseBody": response_body,
+ }
+
+ session_attributes = event["sessionAttributes"]
+ prompt_session_attributes = event["promptSessionAttributes"]
+
+ return {
+ "messageVersion": "1.0",
+ "response": action_response,
+ "sessionAttributes": session_attributes,
+ "promptSessionAttributes": prompt_session_attributes,
+ }
+```
+
+Next, we are going to wrap the Bedrock agent as a ChatModel so that we can register and load it for inference.
+
+## Writing ChatModel for Bedrock agent
+
+Here are the top-level packages used for running the following example locally in **Python 3.12.7**:
+
+```text
+boto3==1.35.31
+mlflow==2.16.2
+```
+
+### Implementing Bedrock Agent as an MLflow ChatModel with Tracing
+
+```python
+import copy
+import os
+import uuid
+from typing import List, Optional
+
+import boto3
+import mlflow
+from botocore.config import Config
+from mlflow.entities import SpanType
+from mlflow.pyfunc import ChatModel
+from mlflow.types.llm import ChatResponse, ChatMessage, ChatParams, ChatChoice
+
+
+class BedrockModel(ChatModel):
+ def __init__(self):
+ """
+ Initializes the BedrockModel instance with placeholder values.
+
+ Note:
+ The `load_context` method cannot create new instance variables; it can only modify existing ones.
+ Therefore, all instance variables should be defined in the `__init__` method with placeholder values.
+ """
+ self.brt = None
+ self._main_bedrock_agent = None
+ self._bedrock_agent_id = None
+ self._bedrock_agent_alias_id = None
+ self._inference_configuration = None
+ self._agent_instruction = None
+ self._model = None
+ self._aws_region = None
+
+ def __getstate__(self):
+ """
+ Prepares the instance state for pickling.
+
+ This method is needed because the `boto3` client (`self.brt`) cannot be pickled.
+ By excluding `self.brt` from the state, we ensure that the model can be serialized and deserialized properly.
+ """
+ # Create a dictionary of the instance's state, excluding the boto3 client
+ state = self.__dict__.copy()
+ del state["brt"]
+ return state
+
+ def __setstate__(self, state):
+ """
+ Restores the instance state during unpickling.
+
+ This method is needed to reinitialize the `boto3` client (`self.brt`) after the instance is unpickled,
+ because the client was excluded during pickling.
+ """
+ self.__dict__.update(state)
+ self.brt = None
+
+ def load_context(self, context):
+ """
+ Initializes the Bedrock client with AWS credentials.
+
+ Args:
+ context: The MLflow context containing model configuration.
+
+ Note:
+ Dependent secret variables must be in the execution environment prior to loading the model;
+ else they will not be available during model initialization.
+ """
+ self._main_bedrock_agent = context.model_config.get("agents", {}).get(
+ "main", {}
+ )
+ self._bedrock_agent_id = self._main_bedrock_agent.get("bedrock_agent_id")
+ self._bedrock_agent_alias_id = self._main_bedrock_agent.get(
+ "bedrock_agent_alias_id"
+ )
+ self._inference_configuration = self._main_bedrock_agent.get(
+ "inference_configuration"
+ )
+ self._agent_instruction = self._main_bedrock_agent.get("instruction")
+ self._model = self._main_bedrock_agent.get("model")
+ self._aws_region = self._main_bedrock_agent.get("aws_region")
+
+ # Initialize the Bedrock client
+ self.brt = boto3.client(
+ service_name="bedrock-agent-runtime",
+ config=Config(region_name=self._aws_region),
+ aws_access_key_id=os.environ["AWS_ACCESS_KEY"],
+ aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
+ aws_session_token=os.environ["AWS_SESSION_TOKEN"],
+ region_name=self._aws_region,
+ )
+
+ @staticmethod
+ def _extract_trace_groups(events):
+ """
+ Extracts trace groups from a list of events based on their trace IDs.
+
+ Args:
+ events (list): A list of event dictionaries.
+
+ Returns:
+ dict: A dictionary where keys are trace IDs and values are lists of trace items.
+ """
+ from collections import defaultdict
+
+ trace_groups = defaultdict(list)
+
+ def find_trace_ids(obj, original_trace, depth=0, parent_key=None):
+ if depth > 5:
+ return # Stop recursion after 5 levels if no traceId has been found
+ if isinstance(obj, dict):
+ trace_id = obj.get("traceId")
+ if trace_id:
+ # Include the parent key as the 'type'
+ item = {
+ "type": parent_key,
+ "data": obj,
+ "event_order": original_trace.get("trace", {}).get(
+ "event_order"
+ ),
+ }
+ trace_groups[trace_id].append(item)
+ else:
+ for key, value in obj.items():
+ find_trace_ids(
+ value, original_trace, depth=depth + 1, parent_key=key
+ )
+ elif isinstance(obj, list):
+ for item in obj:
+ find_trace_ids(item, item, depth=depth + 1, parent_key=parent_key)
+
+ find_trace_ids(events, {})
+ return dict(trace_groups)
+
+ @staticmethod
+ def _get_final_response_with_trace(trace_id_groups: dict[str, list[dict]]):
+ """
+ Processes trace groups to extract the final response and create relevant MLflow spans.
+
+ Args:
+ trace_id_groups (dict): A dictionary of trace groups keyed by trace IDs.
+
+ Returns:
+ str: The final response text extracted from the trace groups.
+ """
+ trace_id_groups_copy = copy.deepcopy(trace_id_groups)
+ model_invocation_input_key = "modelInvocationInput"
+
+ def _create_trace_by_type(
+ trace_name, _trace_id, context_input, optional_rationale_subtrace
+ ):
+ @mlflow.trace(
+ name=trace_name,
+ attributes={"trace_attributes": trace_id_groups[_trace_id]},
+ )
+ def _trace_agent_pre_context(inner_input_trace):
+ return optional_rationale_subtrace.get("data", {}).get("text")
+
+ trace_id_groups_copy[_trace_id].remove(context_input)
+ _trace_agent_pre_context(context_input.get("data", {}).get("text"))
+
+ def _extract_action_group_trace(
+ _trace_id, trace_group, action_group_invocation_input: dict
+ ):
+ @mlflow.trace(
+ name="action-group-invocation",
+ attributes={"trace_attributes": trace_id_groups[_trace_id]},
+ )
+ def _action_group_trace(inner_trace_group):
+ for _trace in trace_group:
+ action_group_invocation_output = _trace.get("data", {}).get(
+ "actionGroupInvocationOutput"
+ )
+ if action_group_invocation_output is not None:
+ action_group_response = str(
+ {
+ "action_group_name": action_group_invocation_input.get(
+ "actionGroupName"
+ ),
+ "api_path": action_group_invocation_input.get(
+ "apiPath"
+ ),
+ "execution_type": action_group_invocation_input.get(
+ "executionType"
+ ),
+ "execution_output": action_group_invocation_output.get(
+ "text"
+ ),
+ }
+ )
+ trace_group.remove(_trace)
+ return action_group_response
+
+ _action_group_trace(str(action_group_invocation_input))
+
+ def _extract_knowledge_base_trace(
+ _trace_id, trace_group, knowledge_base_lookup_input
+ ):
+ @mlflow.trace(
+ name="knowledge-base-lookup",
+ attributes={"trace_attributes": trace_id_groups[_trace_id]},
+ )
+ def _knowledge_base_trace(inner_trace_group):
+ for _trace in trace_group:
+ knowledge_base_lookup_output = _trace.get("data", {}).get(
+ "knowledgeBaseLookupOutput"
+ )
+ if knowledge_base_lookup_output is not None:
+ knowledge_base_response = str(
+ {
+ "knowledge_base_id": knowledge_base_lookup_input.get(
+ "knowledgeBaseId"
+ ),
+ "text": knowledge_base_lookup_input.get("text"),
+ "retrieved_references": knowledge_base_lookup_output.get(
+ "retrievedReferences"
+ ),
+ }
+ )
+ trace_group.remove(_trace)
+ return knowledge_base_response
+
+ _knowledge_base_trace(str(trace_group))
+
+ def _trace_group_type(
+ _trace_id, trace_group, _trace, optional_rationale_subtrace
+ ):
+ trace_name = "observation"
+ pre_processing_trace_id_suffix = "-pre"
+ if pre_processing_trace_id_suffix in _trace_id:
+ trace_name = "agent-initial-context"
+ else:
+ for _inner_trace in trace_group:
+ action_group_invocation_input = _inner_trace.get("data", {}).get(
+ "actionGroupInvocationInput"
+ )
+ if action_group_invocation_input is not None:
+ action_group_name = action_group_invocation_input.get(
+ "actionGroupName"
+ )
+ trace_name = f"ACTION-GROUP-{action_group_name}"
+ _create_trace_by_type(
+ trace_name, _trace_id, _trace, optional_rationale_subtrace
+ )
+ _extract_action_group_trace(
+ _trace_id, trace_group, action_group_invocation_input
+ )
+ trace_group.remove(_trace)
+ knowledge_base_lookup_input = _inner_trace.get("data", {}).get(
+ "knowledgeBaseLookupInput"
+ )
+ if knowledge_base_lookup_input is not None:
+ knowledge_base_id = knowledge_base_lookup_input.get(
+ "knowledgeBaseId"
+ )
+ trace_name = f"KNOWLEDGE_BASE_{knowledge_base_id}"
+ _create_trace_by_type(
+ trace_name, _trace_id, _trace, optional_rationale_subtrace
+ )
+ _extract_knowledge_base_trace(
+ _trace_id, trace_group, knowledge_base_lookup_input
+ )
+ trace_group.remove(_trace)
+ return trace_name
+
+ for _trace_id, _trace_group in trace_id_groups_copy.items():
+ trace_group = sorted(_trace_group, key=lambda tg: tg["event_order"])
+ model_invocation_input_subtrace = None
+ optional_rationale_subtrace = None
+ for _trace in _trace_group:
+ if model_invocation_input_key == _trace.get("type", ""):
+ model_invocation_input_subtrace = _trace
+ elif "rationale" == _trace.get("type", ""):
+ optional_rationale_subtrace = _trace
+ _trace_group_type(
+ _trace_id,
+ trace_group,
+ model_invocation_input_subtrace,
+ optional_rationale_subtrace,
+ )
+
+ final_response = (
+ list(trace_id_groups_copy.values())[-1][-1]
+ .get("data", {})
+ .get("finalResponse", {})
+ .get("text")
+ )
+ return final_response
+
+ @mlflow.trace(name="Bedrock Input Prompt")
+ def _get_agent_prompt(self, raw_input_question):
+ """
+ Constructs the agent prompt by combining the input question and the agent instruction.
+
+ Args:
+ raw_input_question (str): The user's input question.
+
+ Returns:
+ str: The formatted agent prompt.
+ """
+ return f"""
+ Answer the following question and pay strong attention to the prompt:
+
+ {raw_input_question}
+
+
+ {self._agent_instruction}
+
+ """
+
+ @mlflow.trace(name="bedrock-agent", span_type=SpanType.CHAT_MODEL)
+ def predict(
+ self, context, messages: List[ChatMessage], params: Optional[ChatParams]
+ ) -> ChatResponse:
+ """
+ Makes a prediction using the Bedrock agent and processes the response.
+
+ Args:
+ context: The MLflow context.
+ messages (List[ChatMessage]): A list of chat messages.
+ params (Optional[ChatParams]): Optional parameters for the chat.
+
+ Returns:
+ ChatResponse: The response from the Bedrock agent.
+ """
+ formatted_input = messages[-1].content
+ session_id = uuid.uuid4().hex
+
+ response = self.brt.invoke_agent(
+ agentId=self._bedrock_agent_id,
+ agentAliasId=self._bedrock_agent_alias_id,
+ inputText=self._get_agent_prompt(formatted_input),
+ enableTrace=True,
+ sessionId=session_id,
+ endSession=False,
+ )
+
+ # Since this provider's output doesn't match the OpenAI specification,
+ # we need to go through the returned trace data and map it appropriately
+ # to create the MLflow span object.
+ events = []
+ for index, event in enumerate(response.get("completion", [])):
+ if "trace" in event:
+ event["trace"]["event_order"] = index
+ events.append(event)
+ trace_id_groups = self._extract_trace_groups(events)
+ final_response = self._get_final_response_with_trace(trace_id_groups)
+ with mlflow.start_span(
+ name="retrieved-response", span_type=SpanType.AGENT
+ ) as span:
+ span.set_inputs(messages)
+ span.set_attributes({})
+
+ output = ChatResponse(
+ choices=[
+ ChatChoice(
+ index=0,
+ message=ChatMessage(role="user", content=final_response),
+ )
+ ],
+ usage={},
+ model=self._model,
+ )
+
+ span.set_outputs(output)
+
+ return output
+```
+
+Here are some important remarks about this `BedrockModel` implementation:
+
+- The AWS access key ID, secret access key, and session token are externalized here. These need to be present in the environment before we can run inference.
+  You will need to generate them for your IAM user and set them as environment variables.
+
+```bash
+aws sts get-session-token --duration-seconds 3600
+```
+
+And then set the following:
+
+```python
+import os
+
+os.environ['AWS_ACCESS_KEY'] = ""
+os.environ['AWS_SECRET_ACCESS_KEY'] = ""
+os.environ['AWS_SESSION_TOKEN'] = ""
+
+```
+
+As noted in the code above, these do not get logged with the model and are only set inside `load_context`.
+This method is called when the ChatModel is constructed. Further details are available [here](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.PythonModel.load_context).
+
+- The Bedrock agent ID and agent alias ID are passed via the `model_config` that we will use below.
+
+- The `boto3` client has been excluded from pickling. This is done via `__getstate__` and `__setstate__`, where we exclude it and reset it, respectively.
+
+### Log and load the BedrockModel
+
+```python
+import mlflow
+from mlflow.models import infer_signature
+
+input_example = [
+ {
+ "messages": [
+ {
+ "role": "user",
+ "content": "When is the next launch window for Mars?",
+ }
+ ]
+ }
+]
+
+output_example = {
+ "choices": [
+ {
+ "index": 0,
+ "finish_reason": "stop",
+ "message": {"role": "assistant", "content": "test content"},
+ }
+ ]
+}
+signature = infer_signature(input_example, output_example)
+
+with mlflow.start_run():
+
+ model_config = {
+ "agents": {
+ "main": {
+ "model": "anthropic.claude-v2",
+ "aws_region": "us-east-1",
+ "bedrock_agent_id": "O9KQSEVEFF",
+ "bedrock_agent_alias_id": "3WHEEJKNUT",
+ "instruction": (
+ "You have functions available at your disposal to use when anwering any questions about orbital mechanics."
+ "if you can't find a function to answer a question about orbital mechanics, simply reply "
+ "'I do not know'"
+ ),
+ "inference_configuration": {
+ "temperature": 0.5,
+ "maximumLength": 2000,
+ },
+ },
+ },
+ }
+
+ # Input example for the model
+ input_example = {
+ "messages": [
+ {
+ "role": "user",
+ "content": "When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.",
+ }
+ ]
+ }
+
+ # Log and load the model using MLflow
+ logged_chain_info = mlflow.pyfunc.log_model(
+ python_model=BedrockModel(),
+ model_config=model_config,
+ artifact_path="chain", # This string is used as the path inside the MLflow model where artifacts are stored
+ input_example=input_example, # Must be a valid input to your chain
+ )
+
+loaded = mlflow.pyfunc.load_model(logged_chain_info.model_uri)
+
+# Predict using the loaded model
+response = loaded.predict(
+ {
+ "messages": [
+ {
+ "role": "user",
+ "content": "When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.",
+ }
+ ]
+ }
+)
+print(response)
+```
+
+### Mapping Bedrock Agent Trace Data to MLflow Span Objects
+
+In this step, we iterate over the data returned within the Bedrock agent's response trace and map it
+to MLflow span objects.
+The Bedrock agent's response is a flat list in which trace events are connected by their `traceId`.
+Here is the raw trace sent in the Bedrock agent's response:
+
+```text
+[
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 0,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'preProcessingTrace': {
+ 'modelInvocationInput': {
+ 'inferenceConfiguration': {
+ ...
+ },
+ 'text': '\n\nHuman: You are a classifying agent that filters user inputs into categories. Your job is to sort these inputs before they... XML tags before providing only the category letter to sort the input into within XML tags.\n\nAssistant:',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-pre-0',
+ 'type': 'PRE_PROCESSING'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 1,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'preProcessingTrace': {
+ 'modelInvocationOutput': {
+ 'parsedResponse': {
+ ...
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-pre-0'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 2,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'modelInvocationInput': {
+ 'inferenceConfiguration': {
+ ...
+ },
+ 'text': '\n\nHuman:\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a ...\n\nAssistant: I understand I cannot use functions that have not been provided to me to answer this question.\n\n',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0',
+ 'type': 'ORCHESTRATION'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 3,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'modelInvocationOutput': {
+ 'metadata': {
+ ...
+ },
+ 'rawResponse': {
+ ...
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 4,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'rationale': {
+ 'text': 'To answer this question about the next Mars launch window, I will:\n\n1. Call the GET::optimal_departure_window_mars::getNext...lse values.\n\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 5,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'invocationInput': {
+ 'actionGroupInvocationInput': {
+ ...
+ },
+ 'invocationType': 'ACTION_GROUP',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 6,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'observation': {
+ 'actionGroupInvocationOutput': {
+ ...
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0',
+ 'type': 'ACTION_GROUP'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 7,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'modelInvocationInput': {
+ 'inferenceConfiguration': {
+ ...
+ },
+ 'text': '\n\nHuman:\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a ...lta_v_available_m_s": 39457.985759929674, "delta_v_required_m_s": 5595.997417810693, "is_feasible": true}}\n',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1',
+ 'type': 'ORCHESTRATION'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 8,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'modelInvocationOutput': {
+ 'metadata': {
+ ...
+ },
+ 'rawResponse': {
+ ...
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'trace': {
+ 'agentAliasId': '3WHEEJKNUT',
+ 'agentId': 'O9KQSEVEFF',
+ 'agentVersion': '1',
+ 'event_order': 9,
+ 'sessionId': '9566a6d78551434fb0409578ffed63c1',
+ 'trace': {
+ 'orchestrationTrace': {
+ 'observation': {
+ 'finalResponse': {
+ ...
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1',
+ 'type': 'FINISH'
+ }
+ }
+ }
+ }
+ },
+ {
+ 'chunk': {
+      'bytes': b'Based on the provided spacecraft dry mass of 10000 kg, total mass of 50000 kg, and specific impulse of 2500 s, the next optimal launch window for a Hohmann transfer from Earth to Mars is on November 26, 2026 UTC. The transfer will take 259 days.'
+ }
+ }
+]
+```
+
+
+To fit this structure into MLflow spans, we first need to go through the raw response trace and group events by their `traceId`, as sketched below.
+
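+The following is a minimal, illustrative sketch of this grouping step, based on the raw trace structure shown above. The helper name `group_events_by_trace_id` is hypothetical; the actual `_extract_trace_groups` method in `BedrockModel` may differ in detail.
+
+```python
+from collections import defaultdict
+
+
+def group_events_by_trace_id(events):
+    """Group flat Bedrock agent trace events by traceId (illustrative sketch)."""
+    groups = defaultdict(list)
+    for event in events:
+        # Each event wraps a single trace category, e.g. preProcessingTrace or
+        # orchestrationTrace, which in turn holds a single typed entry such as
+        # modelInvocationInput, rationale, invocationInput, or observation.
+        inner = event.get("trace", {}).get("trace", {})
+        for payload in inner.values():
+            for event_type, data in payload.items():
+                trace_id = data.get("traceId")
+                if trace_id is not None:
+                    groups[trace_id].append(
+                        {
+                            "type": event_type,
+                            "data": data,
+                            "event_order": event["trace"].get("event_order"),
+                        }
+                    )
+    return groups
+```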
+
+After grouping the trace events by _`traceId`_, the structure looks like this:
+```text
+{
+ 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0': [
+ {
+ 'data': {
+ 'inferenceConfiguration': {
+ 'maximumLength': 2048,
+ 'stopSequences': [
+ '',
+ '',
+ ''
+ ],
+ 'temperature': 0.0,
+ 'topK': 250,
+ 'topP': 1.0
+ },
+ 'text': '\n\nHuman:\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a ...\n\nAssistant: I understand I cannot use functions that have not been provided to me to answer this question.\n\n',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0',
+ 'type': 'ORCHESTRATION'
+ },
+ 'event_order': 2,
+ 'type': 'modelInvocationInput'
+ },
+ {
+ 'data': {
+ 'metadata': {
+ 'usage': {
+ 'inputTokens': 5160,
+ 'outputTokens': 135
+ }
+ },
+ 'rawResponse': {
+ 'content': 'To answer this question about the next Mars launch window, I will:\n\n1. Call the GET::optimal_departure_window_mars::getNext...l>\nGET::optimal_departure_window_mars::getNextMarsLaunchWindow(specific_impulse="2500", dry_mass="10000", total_mass="50000")'
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0'
+ },
+ 'event_order': 3,
+ 'type': 'modelInvocationOutput'
+ },
+ {
+ 'data': {
+ 'text': 'To answer this question about the next Mars launch window, I will:\n\n1. Call the GET::optimal_departure_window_mars::getNext...lse values.\n\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0'
+ },
+ 'event_order': 4,
+ 'type': 'rationale'
+ },
+ {
+ 'data': {
+ 'actionGroupInvocationInput': {
+ 'actionGroupName': 'optimal_departure_window_mars',
+ 'apiPath': '/get-next-mars-launch-window',
+ 'executionType': 'LAMBDA',
+ 'parameters': [
+ {
+ ...
+ },
+ {
+ ...
+ },
+ {
+ ...
+ }
+ ],
+ 'verb': 'get'
+ },
+ 'invocationType': 'ACTION_GROUP',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0'
+ },
+ 'event_order': 5,
+ 'type': 'invocationInput'
+ },
+ {
+ 'data': {
+ 'actionGroupInvocationOutput': {
+ 'text': '{"next_launch_window": {"next_launch_date": "2026-11-26 00:00:00", "synodic_period_days": 779.9068939794238, "transfer_time_days": 259, "delta_v_available_m_s": 39457.985759929674, "delta_v_required_m_s": 5595.997417810693, "is_feasible": true}}'
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-0',
+ 'type': 'ACTION_GROUP'
+ },
+ 'event_order': 6,
+ 'type': 'observation'
+ }
+ ],
+ 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1': [
+ {
+ 'data': {
+ 'inferenceConfiguration': {
+ 'maximumLength': 2048,
+ 'stopSequences': [
+ '',
+ '',
+ ''
+ ],
+ 'temperature': 0.0,
+ 'topK': 250,
+ 'topP': 1.0
+ },
+ 'text': '\n\nHuman:\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a ...lta_v_available_m_s": 39457.985759929674, "delta_v_required_m_s": 5595.997417810693, "is_feasible": true}}\n',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1',
+ 'type': 'ORCHESTRATION'
+ },
+ 'event_order': 7,
+ 'type': 'modelInvocationInput'
+ },
+ {
+ 'data': {
+ 'metadata': {
+ 'usage': {
+ 'inputTokens': 5405,
+ 'outputTokens': 64
+ }
+ },
+ 'rawResponse': {
+ 'content': '\nBased on the provided spacecraft dry mass of 10000 kg, total mass of 50000 kg, and specific impulse of 2500 s, the ... optimal launch window for a Hohmann transfer from Earth to Mars is on November 26, 2026 UTC. The transfer will take 259 days.'
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1'
+ },
+ 'event_order': 8,
+ 'type': 'modelInvocationOutput'
+ },
+ {
+ 'data': {
+ 'finalResponse': {
+ 'text': 'Based on the provided spacecraft dry mass of 10000 kg, total mass of 50000 kg, and specific impulse of 2500 s, the next optimal launch window for a Hohmann transfer from Earth to Mars is on November 26, 2026 UTC. The transfer will take 259 days.'
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-1',
+ 'type': 'FINISH'
+ },
+ 'event_order': 9,
+ 'type': 'observation'
+ }
+ ],
+ 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-pre-0': [
+ {
+ 'data': {
+ 'inferenceConfiguration': {
+ 'maximumLength': 2048,
+ 'stopSequences': [
+ '\n\nHuman:'
+ ],
+ 'temperature': 0.0,
+ 'topK': 250,
+ 'topP': 1.0
+ },
+ 'text': '\n\nHuman: You are a classifying agent that filters user inputs into categories. Your job is to sort these inputs before they... XML tags before providing only the category letter to sort the input into within XML tags.\n\nAssistant:',
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-pre-0',
+ 'type': 'PRE_PROCESSING'
+ },
+ 'event_order': 0,
+ 'type': 'modelInvocationInput'
+ },
+ {
+ 'data': {
+ 'parsedResponse': {
+ 'isValid': True,
+ 'rationale': 'Based on the provided instructions, this input appears to be a question about orbital mechanics that can be answered using th...equired arguments for that function - specific impulse, dry mass, and total mass. Therefore, this input should be sorted into:'
+ },
+ 'traceId': 'ca9880a2-dae7-46ac-a480-f38ca7e2d99f-pre-0'
+ },
+ 'event_order': 1,
+ 'type': 'modelInvocationOutput'
+ }
+ ]
+}
+```
+
+
+Each group of events with the same _`traceId`_ contains at least two events: one of type _`modelInvocationInput`_ and
+one of type _`modelInvocationOutput`_. Groups that involve action group traces also include events of type
+_`actionGroupInvocationInput`_ and _`actionGroupInvocationOutput`_. Similarly, groups that use knowledge bases carry
+additional events of type _`knowledgeBaseLookupInput`_ and _`knowledgeBaseLookupOutput`_.
+The _`BedrockModel`_ shown above parses these event groups into trace nodes.
+This approach lets the trace display the reasoning behind selecting action groups or knowledge bases to answer a query,
+along with the corresponding Lambda function invocations defined in our example OpenAPI spec above.
+The resulting structure clearly shows the flow of information and the decision-making process that the Bedrock agent follows.
+
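+For illustration, here is one way such event groups could be turned into child spans. This is a sketch, not the exact implementation in _`BedrockModel`_: the function name, span names, and the chosen inputs, outputs, and `trace_attributes` key are assumptions for this example. When it runs while a parent span is active (as inside `predict`), each group becomes a nested node in the MLflow trace.
+
+```python
+import mlflow
+from mlflow.entities import SpanType
+
+
+def log_trace_groups_as_spans(trace_id_groups):
+    """Create one child span per traceId group (illustrative sketch only)."""
+    for trace_id, events in trace_id_groups.items():
+        # Use the rationale text, if present, as a human-readable span input.
+        rationale = next(
+            (e["data"]["text"] for e in events if e["type"] == "rationale"), None
+        )
+        with mlflow.start_span(
+            name=f"agent-step-{trace_id}", span_type=SpanType.CHAIN
+        ) as span:
+            span.set_inputs({"rationale": rationale})
+            # Attach the full event group so it is visible in the MLflow trace UI.
+            span.set_attributes({"trace_attributes": events})
+            span.set_outputs([e["data"] for e in events if e["type"] == "observation"])
+```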
+
+Here is the final MLflow trace:
+```text
+{
+ "spans": [
+ {
+ "name": "Bedrock Agent Runtime",
+ "context": {
+ "span_id": "0xb802165d133a33aa",
+ "trace_id": "0x9b8bd0b2e018d77f936e48a09e54fd44"
+ },
+ "parent_id": null,
+ "start_time": 1731388531754725000,
+ "end_time": 1731388550226771000,
+ "status_code": "OK",
+ "status_message": "",
+ "attributes": {
+ "mlflow.traceRequestId": "\"1e036cc3a7f946ec995f7763b8dde51c\"",
+ "mlflow.spanType": "\"CHAT_MODEL\"",
+ "mlflow.spanFunctionName": "\"predict\"",
+ "mlflow.spanInputs": "{\"context\": \"\", \"messages\": [{\"role\": \"user\", \"content\": \"When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\", \"name\": null}], \"params\": {\"temperature\": 1.0, \"max_tokens\": null, \"stop\": null, \"n\": 1, \"stream\": false, \"top_p\": null, \"top_k\": null, \"frequency_penalty\": null, \"presence_penalty\": null}}",
+ "mlflow.spanOutputs": "{\"choices\": [{\"index\": 0, \"message\": {\"role\": \"user\", \"content\": \"Based on the provided spacecraft dry mass of 10000 kg, total mass of 50000 kg, and specific impulse of 2500 s, the next optimal launch window for a Hohmann transfer from Earth to Mars is on November 26, 2026 UTC. The transfer will take 259 days.\", \"name\": null}, \"finish_reason\": \"stop\", \"logprobs\": null}], \"usage\": {\"prompt_tokens\": null, \"completion_tokens\": null, \"total_tokens\": null}, \"id\": null, \"model\": \"anthropic.claude-v2\", \"object\": \"chat.completion\", \"created\": 1731388550}"
+ },
+ "events": []
+ },
+ {
+ "name": "Bedrock Input Prompt",
+ "context": {
+ "span_id": "0x2e7cd730be70865b",
+ "trace_id": "0x9b8bd0b2e018d77f936e48a09e54fd44"
+ },
+ "parent_id": "0xb802165d133a33aa",
+ "start_time": 1731388531755172000,
+ "end_time": 1731388531755252000,
+ "status_code": "OK",
+ "status_message": "",
+ "attributes": {
+ "mlflow.traceRequestId": "\"1e036cc3a7f946ec995f7763b8dde51c\"",
+ "mlflow.spanType": "\"UNKNOWN\"",
+ "mlflow.spanFunctionName": "\"_get_agent_prompt\"",
+ "mlflow.spanInputs": "{\"raw_input_question\": \"When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\"}",
+ "mlflow.spanOutputs": "\"\\n Answer the following question and pay strong attention to the prompt:\\n \\n When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\\n \\n \\n You have functions available at your disposal to use when anwering any questions about orbital mechanics.if you can't find a function to answer a question about orbital mechanics, simply reply 'I do not know'\\n \\n \""
+ },
+ "events": []
+ },
+ {
+ "name": "ACTION GROUP DECISION -optimal_departure_window_mars",
+ "context": {
+ "span_id": "0x131e4e08cd5e95d9",
+ "trace_id": "0x9b8bd0b2e018d77f936e48a09e54fd44"
+ },
+ "parent_id": "0xb802165d133a33aa",
+ "start_time": 1731388550223219000,
+ "end_time": 1731388550224592000,
+ "status_code": "OK",
+ "status_message": "",
+ "attributes": {
+ "mlflow.traceRequestId": "\"1e036cc3a7f946ec995f7763b8dde51c\"",
+ "mlflow.spanType": "\"UNKNOWN\"",
+ "trace_attributes": "[{\"type\": \"modelInvocationInput\", \"data\": {\"inferenceConfiguration\": {\"maximumLength\": 2048, \"stopSequences\": [\"\", \"\", \"\"], \"temperature\": 0.0, \"topK\": 250, \"topP\": 1.0}, \"text\": \"\\n\\nHuman:\\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a . Your goal is to answer the user's question to the best of your ability, using the function(s) to gather more information if necessary to better answer the question. If you choose to call a function, the result of the function call will be added to the conversation history in tags (if the call succeeded) or tags (if the function failed). \\nYou were created with these instructions to consider as well:\\n\\n You are a friendly chat bot. You have access to a function called that returns\\n information about the Mars launch window. When responding with Mars launch window,\\n please make sure to add the timezone UTC.\\n \\n\\nHere are some examples of correct action by other, different agents with access to functions that may or may not be similar to ones you are provided.\\n\\n\\n Here is an example of how you would correctly answer a question using a and the corresponding . Notice that you are free to think before deciding to make a in the .\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n\\n Can you show me my policy engine violation from 1st january 2023 to 1st february 2023? My alias is jsmith.\\n \\n To answer this question, I will need to:\\n 1. I do not have knowledge to policy engine violations, so I should see if I can use any of the available functions to help. I have been equipped with get::policyengineactions::getpolicyviolations that gets the policy engine violations for a given alias, start date and end date. 
I will use this function to gather more information.\\n \\n get::policyengineactions::getpolicyviolations(alias=\\\"jsmith\\\", startDate=\\\"1st January 2023\\\", endDate=\\\"1st February 2023\\\")\\n {response: [{creationDate: \\\"2023-06-01T09:30:00Z\\\", riskLevel: \\\"High\\\", policyId: \\\"POL-001\\\", policyUrl: \\\"https://example.com/policies/POL-001\\\", referenceUrl: \\\"https://example.com/violations/POL-001\\\"}, {creationDate: \\\"2023-06-02T14:45:00Z\\\", riskLevel: \\\"Medium\\\", policyId: \\\"POL-002\\\", policyUrl: \\\"https://example.com/policies/POL-002\\\", referenceUrl: \\\"https://example.com/violations/POL-002\\\"}]}\\n The policy engine violations between 1st january 2023 to 1st february 2023 for alias jsmith are - Policy ID: POL-001, Policy ID: POL-002\\n \\n\\n Here is another example that utilizes multiple function calls.\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n Can you check the policy engine violations under my manager between 2nd May to 5th May? My alias is john.\\n \\n To answer this question, I will need to:\\n 1. Get the manager alias of the user using get::activedirectoryactions::getmanager function.\\n 2. Use the returned manager alias to get the policy engine violations using the get::policyengineactions::getpolicyviolations function.\\n\\n I have double checked and made sure that I have been provided the get::activedirectoryactions::getmanager and the get::policyengineactions::getpolicyviolations functions.\\n \\n get::activedirectoryactions::getmanager(alias=\\\"john\\\")\\n {response: {managerAlias: \\\"mark\\\", managerLevel: \\\"6\\\", teamName: \\\"Builder\\\", managerName: \\\"Mark Hunter\\\"}}}}\\n \\n 1. I have the managerAlias from the function results as mark and I have the start and end date from the user input. I can use the function result to call get::policyengineactions::getpolicyviolations function.\\n 2. 
I will then return the get::policyengineactions::getpolicyviolations function result to the user.\\n\\n I have double checked and made sure that I have been provided the get::policyengineactions::getpolicyviolations functions.\\n \\n get::policyengineactions::getpolicyviolations(alias=\\\"mark\\\", startDate=\\\"2nd May 2023\\\", endDate=\\\"5th May 2023\\\")\\n {response: [{creationDate: \\\"2023-05-02T09:30:00Z\\\", riskLevel: \\\"High\\\", policyId: \\\"POL-001\\\", policyUrl: \\\"https://example.com/policies/POL-001\\\", referenceUrl: \\\"https://example.com/violations/POL-001\\\"}, {creationDate: \\\"2023-05-04T14:45:00Z\\\", riskLevel: \\\"Low\\\", policyId: \\\"POL-002\\\", policyUrl: \\\"https://example.com/policies/POL-002\\\", referenceUrl: \\\"https://example.com/violations/POL-002\\\"}]}\\n \\n The policy engine violations between 2nd May 2023 to 5th May 2023 for your manager's alias mark are - Policy ID: POL-001, Policy ID: POL-002\\n \\n \\n\\n Functions can also be search engine API's that issue a query to a knowledge base. Here is an example that utilizes regular function calls in combination with function calls to a search engine API. Please make sure to extract the source for the information within the final answer when using information returned from the search engine.\\n \\n \\n \\n get::benefitsaction::getbenefitplanname\\n Get's the benefit plan name for a user. The API takes in a userName and a benefit type and returns the benefit name to the user (i.e. Aetna, Premera, Fidelity, etc.).\\n userName (string): None\\n benefitType (string): None\\n object: Successful response\\n object: Invalid request\\n \\n \\n post::benefitsaction::increase401klimit\\n Increases the 401k limit for a generic user. The API takes in only the current 401k limit and returns the new limit.\\n currentLimit (string): None\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::x_amz_knowledgebase_dentalinsurance::search\\n This is a search tool that provides information about Delta Dental benefits. It has information about covered dental benefits and other relevant information\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n get::x_amz_knowledgebase_401kplan::search\\n This is a search tool that provides information about Amazon 401k plan benefits. It can determine what a person's yearly 401k contribution limit is, based on their age.\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n get::x_amz_knowledgebase_healthinsurance::search\\n This is a search tool that provides information about Aetna and Premera health benefits. It has information about the savings plan and shared deductible plan, as well as others.\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n \\n\\n What is my deductible? My username is Bob and my benefitType is Dental. Also, what is the 401k yearly contribution limit?\\n I understand I cannot use functions that have not been provided to me to answer this question.\\n To answer this question, I will:\\n 1. Call the get::benefitsaction::getbenefitplanname function to get the benefit plan name for the user Bob with benefit type Dental.\\n 2. Call the get::x_amz_knowledgebase_dentalinsurance::search function to search for information about deductibles for the plan name returned from step 1.\\n 3. 
Call the get::x_amz_knowledgebase_401k::search function to search for information about 401k yearly contribution limits.\\n 4. Return the deductible information from the search results to the user.\\n I have checked that I have access to the get::benefitsaction::getbenefitplanname, x_amz_knowledgebase_dentalinsurance::search, and x_amz_knowledgebase_401k::search functions.\\n \\n get::benefitsaction::getbenefitplanname(userName=\\\"Bob\\\", benefitType=\\\"Dental\\\")\\n {{'response': {{'planName': 'Delta Dental'}}}}\\n \\n I have received the plan name Delta Dental for the user Bob with Dental benefits. I will now call the x_amz_knowledgebase_dentalinsurance::search function to find deductible information for Delta Dental.\\n \\n get::x_amz_knowledgebase_dentalinsurance::search(searchQuery=\\\"What is the deductible for Delta Dental?\\\")\\n {{'response': {{'responseCode': '200', 'responseBody': \\\"\\\"\\\\n\\\\nThe user's individual deductible is $50 per benefit period\\\\n\\\\n\\\\n\\\\nIf they are enrolled with dependents, the maximum family deductible is $150 per benefit period.\\\\n\\\\n\\\\n\\\"}}}}\\n I have found the deductible information for Dental benefits. I will now call the x_amz_knowledgebase_401k::search function to find yearly 401k contribution limits.\\n \\n get::x_amz_knowledgebase_401k::search(searchQuery=\\\"What is the yearly 401k contribution limit?\\\")\\n {{'response': {{'responseCode': '200', 'responseBody': \\\"\\\\n\\\\nThe yearly 401k contribution limit is $20,500.\\\\n\\\\n\\\\n\\\"}}}}\\n \\n \\n The deductible for your Delta Dental plan is $50 per benefit period.\\n \\n \\n \\n If you have dependents enrolled, the maximum family deductible is $150 per benefit period.\\n \\n \\n \\n The yearly 401k contribution limit is $20,500.\\n \\n \\n \\n \\n\\n \\n\\n Here's a final example where the question asked could not be answered with information gathered from calling the provided functions. In this example, notice how you respond by telling the user you cannot answer, without using a function that was not provided to you.\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. 
Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n Who are the reportees of David?\\n \\n After reviewing the functions I was equipped with, I realize I am not able to accurately answer this question since I can't access reportees of David. Therefore, I should explain to the user I cannot answer this question.\\n \\n \\n Sorry, I am unable to assist you with this request.\\n \\n \\n\\n\\nThe above examples have been provided to you to illustrate general guidelines and format for use of function calling for information retrieval, and how to use your scratchpad to plan your approach. IMPORTANT: the functions provided within the examples should not be assumed to have been provided to you to use UNLESS they are also explicitly given to you within tags below. All of the values and information within the examples (the questions, function results, and answers) are strictly part of the examples and have not been provided to you.\\n\\nNow that you have read and understood the examples, I will define the functions that you have available to you to use. Here is a comprehensive list.\\n\\n\\n\\nGET::optimal_departure_window_mars::getNextMarsLaunchWindow\\nGets the next optimal launch window to Mars.\\nspecific_impulse (string): Specific impulse of the propulsion system (s).\\ndry_mass (string): Mass of the spacecraft without fuel (kg).\\ntotal_mass (string): Total mass of the spacecraft including fuel (kg)\\nobject: The next optimal departure date for a Hohmann transfer from Earth to Mars, based on the spacecraft's mass and specific impulse.\\n\\n\\n\\n\\n\\nNote that the function arguments have been listed in the order that they should be passed into the function.\\n\\n\\n\\nDo not modify or extend the provided functions under any circumstances. For example, GET::optimal_departure_window_mars::getNextMarsLaunchWindow with additional parameters would be considered modifying the function which is not allowed. Please use the functions only as defined.\\n\\nDO NOT use any functions that I have not equipped you with.\\n\\n Do not make assumptions about inputs; instead, make sure you know the exact function and input to use before you call a function.\\n\\nTo call a function, output the name of the function in between and tags. You will receive a in response to your call that contains information that you can use to better answer the question. Or, if the function call produced an error, you will receive an in response.\\n\\n\\n\\nThe format for all other MUST be: $FUNCTION_NAME($FUNCTION_PARAMETER_NAME=$FUNCTION_PARAMETER_VALUE)\\n\\nRemember, your goal is to answer the user's question to the best of your ability, using only the function(s) provided within the tags to gather more information if necessary to better answer the question.\\n\\nDo not modify or extend the provided functions under any circumstances. For example, calling GET::optimal_departure_window_mars::getNextMarsLaunchWindow with additional parameters would be modifying the function which is not allowed. Please use the functions only as defined.\\n\\nBefore calling any functions, create a plan for performing actions to answer this question within the . Double check your plan to make sure you don't call any functions that you haven't been provided with. 
Always return your final answer within tags.\\n\\n\\n\\nThe user input is Answer the following question and pay strong attention to the prompt:\\n \\n When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\\n \\n \\n You have functions available at your disposal to use when anwering any questions about orbital mechanics.if you can't find a function to answer a question about orbital mechanics, simply reply 'I do not know'\\n \\n\\n\\nAssistant: I understand I cannot use functions that have not been provided to me to answer this question.\\n\\n\", \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\", \"type\": \"ORCHESTRATION\"}, \"event_order\": 2}, {\"type\": \"modelInvocationOutput\", \"data\": {\"metadata\": {\"usage\": {\"inputTokens\": 5160, \"outputTokens\": 135}}, \"rawResponse\": {\"content\": \"To answer this question about the next Mars launch window, I will:\\n\\n1. Call the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function to get the next optimal launch window, passing in the provided spacecraft mass and specific impulse values.\\n\\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.\\n\\n\\n\\n\\nGET::optimal_departure_window_mars::getNextMarsLaunchWindow(specific_impulse=\\\"2500\\\", dry_mass=\\\"10000\\\", total_mass=\\\"50000\\\")\"}, \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\"}, \"event_order\": 3}, {\"type\": \"rationale\", \"data\": {\"text\": \"To answer this question about the next Mars launch window, I will:\\n\\n1. Call the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function to get the next optimal launch window, passing in the provided spacecraft mass and specific impulse values.\\n\\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.\", \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\"}, \"event_order\": 4}, {\"type\": \"invocationInput\", \"data\": {\"actionGroupInvocationInput\": {\"actionGroupName\": \"optimal_departure_window_mars\", \"apiPath\": \"/get-next-mars-launch-window\", \"executionType\": \"LAMBDA\", \"parameters\": [{\"name\": \"total_mass\", \"type\": \"string\", \"value\": \"50000\"}, {\"name\": \"dry_mass\", \"type\": \"string\", \"value\": \"10000\"}, {\"name\": \"specific_impulse\", \"type\": \"string\", \"value\": \"2500\"}], \"verb\": \"get\"}, \"invocationType\": \"ACTION_GROUP\", \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\"}, \"event_order\": 5}, {\"type\": \"observation\", \"data\": {\"actionGroupInvocationOutput\": {\"text\": \"{\\\"next_launch_window\\\": {\\\"next_launch_date\\\": \\\"2026-11-26 00:00:00\\\", \\\"synodic_period_days\\\": 779.9068939794238, \\\"transfer_time_days\\\": 259, \\\"delta_v_available_m_s\\\": 39457.985759929674, \\\"delta_v_required_m_s\\\": 5595.997417810693, \\\"is_feasible\\\": true}}\"}, \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\", \"type\": \"ACTION_GROUP\"}, \"event_order\": 6}]",
+ "mlflow.spanFunctionName": "\"_trace_agent_pre_context\"",
+ "mlflow.spanInputs": "{\"inner_input_trace\": \"\\n\\nHuman:\\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a . Your goal is to answer the user's question to the best of your ability, using the function(s) to gather more information if necessary to better answer the question. If you choose to call a function, the result of the function call will be added to the conversation history in tags (if the call succeeded) or tags (if the function failed). \\nYou were created with these instructions to consider as well:\\n\\n You are a friendly chat bot. You have access to a function called that returns\\n information about the Mars launch window. When responding with Mars launch window,\\n please make sure to add the timezone UTC.\\n \\n\\nHere are some examples of correct action by other, different agents with access to functions that may or may not be similar to ones you are provided.\\n\\n\\n Here is an example of how you would correctly answer a question using a and the corresponding . Notice that you are free to think before deciding to make a in the .\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n\\n Can you show me my policy engine violation from 1st january 2023 to 1st february 2023? My alias is jsmith.\\n \\n To answer this question, I will need to:\\n 1. I do not have knowledge to policy engine violations, so I should see if I can use any of the available functions to help. I have been equipped with get::policyengineactions::getpolicyviolations that gets the policy engine violations for a given alias, start date and end date. 
I will use this function to gather more information.\\n \\n get::policyengineactions::getpolicyviolations(alias=\\\"jsmith\\\", startDate=\\\"1st January 2023\\\", endDate=\\\"1st February 2023\\\")\\n {response: [{creationDate: \\\"2023-06-01T09:30:00Z\\\", riskLevel: \\\"High\\\", policyId: \\\"POL-001\\\", policyUrl: \\\"https://example.com/policies/POL-001\\\", referenceUrl: \\\"https://example.com/violations/POL-001\\\"}, {creationDate: \\\"2023-06-02T14:45:00Z\\\", riskLevel: \\\"Medium\\\", policyId: \\\"POL-002\\\", policyUrl: \\\"https://example.com/policies/POL-002\\\", referenceUrl: \\\"https://example.com/violations/POL-002\\\"}]}\\n The policy engine violations between 1st january 2023 to 1st february 2023 for alias jsmith are - Policy ID: POL-001, Policy ID: POL-002\\n \\n\\n Here is another example that utilizes multiple function calls.\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n Can you check the policy engine violations under my manager between 2nd May to 5th May? My alias is john.\\n \\n To answer this question, I will need to:\\n 1. Get the manager alias of the user using get::activedirectoryactions::getmanager function.\\n 2. Use the returned manager alias to get the policy engine violations using the get::policyengineactions::getpolicyviolations function.\\n\\n I have double checked and made sure that I have been provided the get::activedirectoryactions::getmanager and the get::policyengineactions::getpolicyviolations functions.\\n \\n get::activedirectoryactions::getmanager(alias=\\\"john\\\")\\n {response: {managerAlias: \\\"mark\\\", managerLevel: \\\"6\\\", teamName: \\\"Builder\\\", managerName: \\\"Mark Hunter\\\"}}}}\\n \\n 1. I have the managerAlias from the function results as mark and I have the start and end date from the user input. I can use the function result to call get::policyengineactions::getpolicyviolations function.\\n 2. 
I will then return the get::policyengineactions::getpolicyviolations function result to the user.\\n\\n I have double checked and made sure that I have been provided the get::policyengineactions::getpolicyviolations functions.\\n \\n get::policyengineactions::getpolicyviolations(alias=\\\"mark\\\", startDate=\\\"2nd May 2023\\\", endDate=\\\"5th May 2023\\\")\\n {response: [{creationDate: \\\"2023-05-02T09:30:00Z\\\", riskLevel: \\\"High\\\", policyId: \\\"POL-001\\\", policyUrl: \\\"https://example.com/policies/POL-001\\\", referenceUrl: \\\"https://example.com/violations/POL-001\\\"}, {creationDate: \\\"2023-05-04T14:45:00Z\\\", riskLevel: \\\"Low\\\", policyId: \\\"POL-002\\\", policyUrl: \\\"https://example.com/policies/POL-002\\\", referenceUrl: \\\"https://example.com/violations/POL-002\\\"}]}\\n \\n The policy engine violations between 2nd May 2023 to 5th May 2023 for your manager's alias mark are - Policy ID: POL-001, Policy ID: POL-002\\n \\n \\n\\n Functions can also be search engine API's that issue a query to a knowledge base. Here is an example that utilizes regular function calls in combination with function calls to a search engine API. Please make sure to extract the source for the information within the final answer when using information returned from the search engine.\\n \\n \\n \\n get::benefitsaction::getbenefitplanname\\n Get's the benefit plan name for a user. The API takes in a userName and a benefit type and returns the benefit name to the user (i.e. Aetna, Premera, Fidelity, etc.).\\n userName (string): None\\n benefitType (string): None\\n object: Successful response\\n object: Invalid request\\n \\n \\n post::benefitsaction::increase401klimit\\n Increases the 401k limit for a generic user. The API takes in only the current 401k limit and returns the new limit.\\n currentLimit (string): None\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::x_amz_knowledgebase_dentalinsurance::search\\n This is a search tool that provides information about Delta Dental benefits. It has information about covered dental benefits and other relevant information\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n get::x_amz_knowledgebase_401kplan::search\\n This is a search tool that provides information about Amazon 401k plan benefits. It can determine what a person's yearly 401k contribution limit is, based on their age.\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n get::x_amz_knowledgebase_healthinsurance::search\\n This is a search tool that provides information about Aetna and Premera health benefits. It has information about the savings plan and shared deductible plan, as well as others.\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n \\n\\n What is my deductible? My username is Bob and my benefitType is Dental. Also, what is the 401k yearly contribution limit?\\n I understand I cannot use functions that have not been provided to me to answer this question.\\n To answer this question, I will:\\n 1. Call the get::benefitsaction::getbenefitplanname function to get the benefit plan name for the user Bob with benefit type Dental.\\n 2. Call the get::x_amz_knowledgebase_dentalinsurance::search function to search for information about deductibles for the plan name returned from step 1.\\n 3. 
Call the get::x_amz_knowledgebase_401k::search function to search for information about 401k yearly contribution limits.\\n 4. Return the deductible information from the search results to the user.\\n I have checked that I have access to the get::benefitsaction::getbenefitplanname, x_amz_knowledgebase_dentalinsurance::search, and x_amz_knowledgebase_401k::search functions.\\n \\n get::benefitsaction::getbenefitplanname(userName=\\\"Bob\\\", benefitType=\\\"Dental\\\")\\n {{'response': {{'planName': 'Delta Dental'}}}}\\n \\n I have received the plan name Delta Dental for the user Bob with Dental benefits. I will now call the x_amz_knowledgebase_dentalinsurance::search function to find deductible information for Delta Dental.\\n \\n get::x_amz_knowledgebase_dentalinsurance::search(searchQuery=\\\"What is the deductible for Delta Dental?\\\")\\n {{'response': {{'responseCode': '200', 'responseBody': \\\"\\\"\\\\n\\\\nThe user's individual deductible is $50 per benefit period\\\\n\\\\n\\\\n\\\\nIf they are enrolled with dependents, the maximum family deductible is $150 per benefit period.\\\\n\\\\n\\\\n\\\"}}}}\\n I have found the deductible information for Dental benefits. I will now call the x_amz_knowledgebase_401k::search function to find yearly 401k contribution limits.\\n \\n get::x_amz_knowledgebase_401k::search(searchQuery=\\\"What is the yearly 401k contribution limit?\\\")\\n {{'response': {{'responseCode': '200', 'responseBody': \\\"\\\\n\\\\nThe yearly 401k contribution limit is $20,500.\\\\n\\\\n\\\\n\\\"}}}}\\n \\n \\n The deductible for your Delta Dental plan is $50 per benefit period.\\n \\n \\n \\n If you have dependents enrolled, the maximum family deductible is $150 per benefit period.\\n \\n \\n \\n The yearly 401k contribution limit is $20,500.\\n \\n \\n \\n \\n\\n \\n\\n Here's a final example where the question asked could not be answered with information gathered from calling the provided functions. In this example, notice how you respond by telling the user you cannot answer, without using a function that was not provided to you.\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. 
Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n Who are the reportees of David?\\n \\n After reviewing the functions I was equipped with, I realize I am not able to accurately answer this question since I can't access reportees of David. Therefore, I should explain to the user I cannot answer this question.\\n \\n \\n Sorry, I am unable to assist you with this request.\\n \\n \\n\\n\\nThe above examples have been provided to you to illustrate general guidelines and format for use of function calling for information retrieval, and how to use your scratchpad to plan your approach. IMPORTANT: the functions provided within the examples should not be assumed to have been provided to you to use UNLESS they are also explicitly given to you within tags below. All of the values and information within the examples (the questions, function results, and answers) are strictly part of the examples and have not been provided to you.\\n\\nNow that you have read and understood the examples, I will define the functions that you have available to you to use. Here is a comprehensive list.\\n\\n\\n\\nGET::optimal_departure_window_mars::getNextMarsLaunchWindow\\nGets the next optimal launch window to Mars.\\nspecific_impulse (string): Specific impulse of the propulsion system (s).\\ndry_mass (string): Mass of the spacecraft without fuel (kg).\\ntotal_mass (string): Total mass of the spacecraft including fuel (kg)\\nobject: The next optimal departure date for a Hohmann transfer from Earth to Mars, based on the spacecraft's mass and specific impulse.\\n\\n\\n\\n\\n\\nNote that the function arguments have been listed in the order that they should be passed into the function.\\n\\n\\n\\nDo not modify or extend the provided functions under any circumstances. For example, GET::optimal_departure_window_mars::getNextMarsLaunchWindow with additional parameters would be considered modifying the function which is not allowed. Please use the functions only as defined.\\n\\nDO NOT use any functions that I have not equipped you with.\\n\\n Do not make assumptions about inputs; instead, make sure you know the exact function and input to use before you call a function.\\n\\nTo call a function, output the name of the function in between and tags. You will receive a in response to your call that contains information that you can use to better answer the question. Or, if the function call produced an error, you will receive an in response.\\n\\n\\n\\nThe format for all other MUST be: $FUNCTION_NAME($FUNCTION_PARAMETER_NAME=$FUNCTION_PARAMETER_VALUE)\\n\\nRemember, your goal is to answer the user's question to the best of your ability, using only the function(s) provided within the tags to gather more information if necessary to better answer the question.\\n\\nDo not modify or extend the provided functions under any circumstances. For example, calling GET::optimal_departure_window_mars::getNextMarsLaunchWindow with additional parameters would be modifying the function which is not allowed. Please use the functions only as defined.\\n\\nBefore calling any functions, create a plan for performing actions to answer this question within the . Double check your plan to make sure you don't call any functions that you haven't been provided with. 
Always return your final answer within tags.\\n\\n\\n\\nThe user input is Answer the following question and pay strong attention to the prompt:\\n \\n When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\\n \\n \\n You have functions available at your disposal to use when anwering any questions about orbital mechanics.if you can't find a function to answer a question about orbital mechanics, simply reply 'I do not know'\\n \\n\\n\\nAssistant: I understand I cannot use functions that have not been provided to me to answer this question.\\n\\n\"}",
+ "mlflow.spanOutputs": "\"To answer this question about the next Mars launch window, I will:\\n\\n1. Call the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function to get the next optimal launch window, passing in the provided spacecraft mass and specific impulse values.\\n\\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.\""
+ },
+ "events": []
+ },
+ {
+ "name": "Invoking Action Group",
+ "context": {
+ "span_id": "0x692bd6457647dc76",
+ "trace_id": "0x9b8bd0b2e018d77f936e48a09e54fd44"
+ },
+ "parent_id": "0xb802165d133a33aa",
+ "start_time": 1731388550224851000,
+ "end_time": 1731388550225218000,
+ "status_code": "OK",
+ "status_message": "",
+ "attributes": {
+ "mlflow.traceRequestId": "\"1e036cc3a7f946ec995f7763b8dde51c\"",
+ "mlflow.spanType": "\"UNKNOWN\"",
+ "trace_attributes": "[{\"type\": \"modelInvocationInput\", \"data\": {\"inferenceConfiguration\": {\"maximumLength\": 2048, \"stopSequences\": [\"\", \"\", \"\"], \"temperature\": 0.0, \"topK\": 250, \"topP\": 1.0}, \"text\": \"\\n\\nHuman:\\nYou are a research assistant AI that has been equipped with one or more functions to help you answer a . Your goal is to answer the user's question to the best of your ability, using the function(s) to gather more information if necessary to better answer the question. If you choose to call a function, the result of the function call will be added to the conversation history in tags (if the call succeeded) or tags (if the function failed). \\nYou were created with these instructions to consider as well:\\n\\n You are a friendly chat bot. You have access to a function called that returns\\n information about the Mars launch window. When responding with Mars launch window,\\n please make sure to add the timezone UTC.\\n \\n\\nHere are some examples of correct action by other, different agents with access to functions that may or may not be similar to ones you are provided.\\n\\n\\n Here is an example of how you would correctly answer a question using a and the corresponding . Notice that you are free to think before deciding to make a in the .\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n\\n Can you show me my policy engine violation from 1st january 2023 to 1st february 2023? My alias is jsmith.\\n \\n To answer this question, I will need to:\\n 1. I do not have knowledge to policy engine violations, so I should see if I can use any of the available functions to help. I have been equipped with get::policyengineactions::getpolicyviolations that gets the policy engine violations for a given alias, start date and end date. 
I will use this function to gather more information.\\n \\n get::policyengineactions::getpolicyviolations(alias=\\\"jsmith\\\", startDate=\\\"1st January 2023\\\", endDate=\\\"1st February 2023\\\")\\n {response: [{creationDate: \\\"2023-06-01T09:30:00Z\\\", riskLevel: \\\"High\\\", policyId: \\\"POL-001\\\", policyUrl: \\\"https://example.com/policies/POL-001\\\", referenceUrl: \\\"https://example.com/violations/POL-001\\\"}, {creationDate: \\\"2023-06-02T14:45:00Z\\\", riskLevel: \\\"Medium\\\", policyId: \\\"POL-002\\\", policyUrl: \\\"https://example.com/policies/POL-002\\\", referenceUrl: \\\"https://example.com/violations/POL-002\\\"}]}\\n The policy engine violations between 1st january 2023 to 1st february 2023 for alias jsmith are - Policy ID: POL-001, Policy ID: POL-002\\n \\n\\n Here is another example that utilizes multiple function calls.\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n Can you check the policy engine violations under my manager between 2nd May to 5th May? My alias is john.\\n \\n To answer this question, I will need to:\\n 1. Get the manager alias of the user using get::activedirectoryactions::getmanager function.\\n 2. Use the returned manager alias to get the policy engine violations using the get::policyengineactions::getpolicyviolations function.\\n\\n I have double checked and made sure that I have been provided the get::activedirectoryactions::getmanager and the get::policyengineactions::getpolicyviolations functions.\\n \\n get::activedirectoryactions::getmanager(alias=\\\"john\\\")\\n {response: {managerAlias: \\\"mark\\\", managerLevel: \\\"6\\\", teamName: \\\"Builder\\\", managerName: \\\"Mark Hunter\\\"}}}}\\n \\n 1. I have the managerAlias from the function results as mark and I have the start and end date from the user input. I can use the function result to call get::policyengineactions::getpolicyviolations function.\\n 2. 
I will then return the get::policyengineactions::getpolicyviolations function result to the user.\\n\\n I have double checked and made sure that I have been provided the get::policyengineactions::getpolicyviolations functions.\\n \\n get::policyengineactions::getpolicyviolations(alias=\\\"mark\\\", startDate=\\\"2nd May 2023\\\", endDate=\\\"5th May 2023\\\")\\n {response: [{creationDate: \\\"2023-05-02T09:30:00Z\\\", riskLevel: \\\"High\\\", policyId: \\\"POL-001\\\", policyUrl: \\\"https://example.com/policies/POL-001\\\", referenceUrl: \\\"https://example.com/violations/POL-001\\\"}, {creationDate: \\\"2023-05-04T14:45:00Z\\\", riskLevel: \\\"Low\\\", policyId: \\\"POL-002\\\", policyUrl: \\\"https://example.com/policies/POL-002\\\", referenceUrl: \\\"https://example.com/violations/POL-002\\\"}]}\\n \\n The policy engine violations between 2nd May 2023 to 5th May 2023 for your manager's alias mark are - Policy ID: POL-001, Policy ID: POL-002\\n \\n \\n\\n Functions can also be search engine API's that issue a query to a knowledge base. Here is an example that utilizes regular function calls in combination with function calls to a search engine API. Please make sure to extract the source for the information within the final answer when using information returned from the search engine.\\n \\n \\n \\n get::benefitsaction::getbenefitplanname\\n Get's the benefit plan name for a user. The API takes in a userName and a benefit type and returns the benefit name to the user (i.e. Aetna, Premera, Fidelity, etc.).\\n userName (string): None\\n benefitType (string): None\\n object: Successful response\\n object: Invalid request\\n \\n \\n post::benefitsaction::increase401klimit\\n Increases the 401k limit for a generic user. The API takes in only the current 401k limit and returns the new limit.\\n currentLimit (string): None\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::x_amz_knowledgebase_dentalinsurance::search\\n This is a search tool that provides information about Delta Dental benefits. It has information about covered dental benefits and other relevant information\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n get::x_amz_knowledgebase_401kplan::search\\n This is a search tool that provides information about Amazon 401k plan benefits. It can determine what a person's yearly 401k contribution limit is, based on their age.\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n get::x_amz_knowledgebase_healthinsurance::search\\n This is a search tool that provides information about Aetna and Premera health benefits. It has information about the savings plan and shared deductible plan, as well as others.\\n query(string): A full sentence query that is fed to the search tool\\n Returns string related to the user query asked.\\n \\n \\n \\n\\n What is my deductible? My username is Bob and my benefitType is Dental. Also, what is the 401k yearly contribution limit?\\n I understand I cannot use functions that have not been provided to me to answer this question.\\n To answer this question, I will:\\n 1. Call the get::benefitsaction::getbenefitplanname function to get the benefit plan name for the user Bob with benefit type Dental.\\n 2. Call the get::x_amz_knowledgebase_dentalinsurance::search function to search for information about deductibles for the plan name returned from step 1.\\n 3. 
Call the get::x_amz_knowledgebase_401k::search function to search for information about 401k yearly contribution limits.\\n 4. Return the deductible information from the search results to the user.\\n I have checked that I have access to the get::benefitsaction::getbenefitplanname, x_amz_knowledgebase_dentalinsurance::search, and x_amz_knowledgebase_401k::search functions.\\n \\n get::benefitsaction::getbenefitplanname(userName=\\\"Bob\\\", benefitType=\\\"Dental\\\")\\n {{'response': {{'planName': 'Delta Dental'}}}}\\n \\n I have received the plan name Delta Dental for the user Bob with Dental benefits. I will now call the x_amz_knowledgebase_dentalinsurance::search function to find deductible information for Delta Dental.\\n \\n get::x_amz_knowledgebase_dentalinsurance::search(searchQuery=\\\"What is the deductible for Delta Dental?\\\")\\n {{'response': {{'responseCode': '200', 'responseBody': \\\"\\\"\\\\n\\\\nThe user's individual deductible is $50 per benefit period\\\\n\\\\n\\\\n\\\\nIf they are enrolled with dependents, the maximum family deductible is $150 per benefit period.\\\\n\\\\n\\\\n\\\"}}}}\\n I have found the deductible information for Dental benefits. I will now call the x_amz_knowledgebase_401k::search function to find yearly 401k contribution limits.\\n \\n get::x_amz_knowledgebase_401k::search(searchQuery=\\\"What is the yearly 401k contribution limit?\\\")\\n {{'response': {{'responseCode': '200', 'responseBody': \\\"\\\\n\\\\nThe yearly 401k contribution limit is $20,500.\\\\n\\\\n\\\\n\\\"}}}}\\n \\n \\n The deductible for your Delta Dental plan is $50 per benefit period.\\n \\n \\n \\n If you have dependents enrolled, the maximum family deductible is $150 per benefit period.\\n \\n \\n \\n The yearly 401k contribution limit is $20,500.\\n \\n \\n \\n \\n\\n \\n\\n Here's a final example where the question asked could not be answered with information gathered from calling the provided functions. In this example, notice how you respond by telling the user you cannot answer, without using a function that was not provided to you.\\n \\n \\n \\n get::policyengineactions::getpolicyviolations\\n Returns a list of policy engine violations for the specified alias within the specified date range.\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n startDate (string): The start date of the range to filter violations. The format for startDate is MM/DD/YYYY.\\n endDate (string): The end date of the range to filter violations\\n array: Successful response\\n object: Invalid request\\n \\n \\n post::policyengineactions::acknowledgeviolations\\n Acknowledge policy engine violation. Generally used to acknowledge violation, once user notices a violation under their alias or their managers alias.\\n policyId (string): The ID of the policy violation\\n expectedDateOfResolution (string): The date by when the violation will be addressed/resolved\\n object: Successful response\\n object: Invalid request\\n \\n \\n get::activedirectoryactions::getmanager\\n This API is used to identify the manager hierarchy above a given person. 
Every person could have a manager and the manager could have another manager to which they report to\\n alias (string): The alias of the employee under whose name current violations needs to be listed\\n object: Successful response\\n object: Invalid request\\n \\n \\n \\n Who are the reportees of David?\\n \\n After reviewing the functions I was equipped with, I realize I am not able to accurately answer this question since I can't access reportees of David. Therefore, I should explain to the user I cannot answer this question.\\n \\n \\n Sorry, I am unable to assist you with this request.\\n \\n \\n\\n\\nThe above examples have been provided to you to illustrate general guidelines and format for use of function calling for information retrieval, and how to use your scratchpad to plan your approach. IMPORTANT: the functions provided within the examples should not be assumed to have been provided to you to use UNLESS they are also explicitly given to you within tags below. All of the values and information within the examples (the questions, function results, and answers) are strictly part of the examples and have not been provided to you.\\n\\nNow that you have read and understood the examples, I will define the functions that you have available to you to use. Here is a comprehensive list.\\n\\n\\n\\nGET::optimal_departure_window_mars::getNextMarsLaunchWindow\\nGets the next optimal launch window to Mars.\\nspecific_impulse (string): Specific impulse of the propulsion system (s).\\ndry_mass (string): Mass of the spacecraft without fuel (kg).\\ntotal_mass (string): Total mass of the spacecraft including fuel (kg)\\nobject: The next optimal departure date for a Hohmann transfer from Earth to Mars, based on the spacecraft's mass and specific impulse.\\n\\n\\n\\n\\n\\nNote that the function arguments have been listed in the order that they should be passed into the function.\\n\\n\\n\\nDo not modify or extend the provided functions under any circumstances. For example, GET::optimal_departure_window_mars::getNextMarsLaunchWindow with additional parameters would be considered modifying the function which is not allowed. Please use the functions only as defined.\\n\\nDO NOT use any functions that I have not equipped you with.\\n\\n Do not make assumptions about inputs; instead, make sure you know the exact function and input to use before you call a function.\\n\\nTo call a function, output the name of the function in between and tags. You will receive a in response to your call that contains information that you can use to better answer the question. Or, if the function call produced an error, you will receive an in response.\\n\\n\\n\\nThe format for all other MUST be: $FUNCTION_NAME($FUNCTION_PARAMETER_NAME=$FUNCTION_PARAMETER_VALUE)\\n\\nRemember, your goal is to answer the user's question to the best of your ability, using only the function(s) provided within the tags to gather more information if necessary to better answer the question.\\n\\nDo not modify or extend the provided functions under any circumstances. For example, calling GET::optimal_departure_window_mars::getNextMarsLaunchWindow with additional parameters would be modifying the function which is not allowed. Please use the functions only as defined.\\n\\nBefore calling any functions, create a plan for performing actions to answer this question within the . Double check your plan to make sure you don't call any functions that you haven't been provided with. 
Always return your final answer within tags.\\n\\n\\n\\nThe user input is Answer the following question and pay strong attention to the prompt:\\n \\n When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\\n \\n \\n You have functions available at your disposal to use when anwering any questions about orbital mechanics.if you can't find a function to answer a question about orbital mechanics, simply reply 'I do not know'\\n \\n\\n\\nAssistant: I understand I cannot use functions that have not been provided to me to answer this question.\\n\\n\", \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\", \"type\": \"ORCHESTRATION\"}, \"event_order\": 2}, {\"type\": \"modelInvocationOutput\", \"data\": {\"metadata\": {\"usage\": {\"inputTokens\": 5160, \"outputTokens\": 135}}, \"rawResponse\": {\"content\": \"To answer this question about the next Mars launch window, I will:\\n\\n1. Call the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function to get the next optimal launch window, passing in the provided spacecraft mass and specific impulse values.\\n\\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.\\n\\n\\n\\n\\nGET::optimal_departure_window_mars::getNextMarsLaunchWindow(specific_impulse=\\\"2500\\\", dry_mass=\\\"10000\\\", total_mass=\\\"50000\\\")\"}, \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\"}, \"event_order\": 3}, {\"type\": \"rationale\", \"data\": {\"text\": \"To answer this question about the next Mars launch window, I will:\\n\\n1. Call the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function to get the next optimal launch window, passing in the provided spacecraft mass and specific impulse values.\\n\\nI have verified that I have access to the GET::optimal_departure_window_mars::getNextMarsLaunchWindow function.\", \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\"}, \"event_order\": 4}, {\"type\": \"invocationInput\", \"data\": {\"actionGroupInvocationInput\": {\"actionGroupName\": \"optimal_departure_window_mars\", \"apiPath\": \"/get-next-mars-launch-window\", \"executionType\": \"LAMBDA\", \"parameters\": [{\"name\": \"total_mass\", \"type\": \"string\", \"value\": \"50000\"}, {\"name\": \"dry_mass\", \"type\": \"string\", \"value\": \"10000\"}, {\"name\": \"specific_impulse\", \"type\": \"string\", \"value\": \"2500\"}], \"verb\": \"get\"}, \"invocationType\": \"ACTION_GROUP\", \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\"}, \"event_order\": 5}, {\"type\": \"observation\", \"data\": {\"actionGroupInvocationOutput\": {\"text\": \"{\\\"next_launch_window\\\": {\\\"next_launch_date\\\": \\\"2026-11-26 00:00:00\\\", \\\"synodic_period_days\\\": 779.9068939794238, \\\"transfer_time_days\\\": 259, \\\"delta_v_available_m_s\\\": 39457.985759929674, \\\"delta_v_required_m_s\\\": 5595.997417810693, \\\"is_feasible\\\": true}}\"}, \"traceId\": \"e0b2b2c2-fb7c-4e17-8a1f-a3781100face-0\", \"type\": \"ACTION_GROUP\"}, \"event_order\": 6}]",
+ "mlflow.spanFunctionName": "\"_action_group_trace\"",
+ "mlflow.spanInputs": "{\"inner_trace_group\": \"{'actionGroupName': 'optimal_departure_window_mars', 'apiPath': '/get-next-mars-launch-window', 'executionType': 'LAMBDA', 'parameters': [{'name': 'total_mass', 'type': 'string', 'value': '50000'}, {'name': 'dry_mass', 'type': 'string', 'value': '10000'}, {'name': 'specific_impulse', 'type': 'string', 'value': '2500'}], 'verb': 'get'}\"}",
+ "mlflow.spanOutputs": "\"{'action_group_name': 'optimal_departure_window_mars', 'api_path': '/get-next-mars-launch-window', 'execution_type': 'LAMBDA', 'execution_output': '{\\\"next_launch_window\\\": {\\\"next_launch_date\\\": \\\"2026-11-26 00:00:00\\\", \\\"synodic_period_days\\\": 779.9068939794238, \\\"transfer_time_days\\\": 259, \\\"delta_v_available_m_s\\\": 39457.985759929674, \\\"delta_v_required_m_s\\\": 5595.997417810693, \\\"is_feasible\\\": true}}'}\""
+ },
+ "events": []
+ },
+ {
+ "name": "Retrieved Response",
+ "context": {
+ "span_id": "0xfe0b5f9149c39d7d",
+ "trace_id": "0x9b8bd0b2e018d77f936e48a09e54fd44"
+ },
+ "parent_id": "0xb802165d133a33aa",
+ "start_time": 1731388550225320000,
+ "end_time": 1731388550226466000,
+ "status_code": "OK",
+ "status_message": "",
+ "attributes": {
+ "mlflow.traceRequestId": "\"1e036cc3a7f946ec995f7763b8dde51c\"",
+ "mlflow.spanType": "\"AGENT\"",
+ "mlflow.spanInputs": "[{\"role\": \"user\", \"content\": \"When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\", \"name\": null}]",
+ "mlflow.spanOutputs": "{\"choices\": [{\"index\": 0, \"message\": {\"role\": \"user\", \"content\": \"Based on the provided spacecraft dry mass of 10000 kg, total mass of 50000 kg, and specific impulse of 2500 s, the next optimal launch window for a Hohmann transfer from Earth to Mars is on November 26, 2026 UTC. The transfer will take 259 days.\", \"name\": null}, \"finish_reason\": \"stop\", \"logprobs\": null}], \"usage\": {\"prompt_tokens\": null, \"completion_tokens\": null, \"total_tokens\": null}, \"id\": null, \"model\": \"anthropic.claude-v2\", \"object\": \"chat.completion\", \"created\": 1731388550}"
+ },
+ "events": []
+ }
+ ],
+ "request": "{\"context\": \"\", \"messages\": [{\"role\": \"user\", \"content\": \"When is the next launch window for Mars? My spacecraft's total mass is 50000, dry mass is 10000 and specific impulse is 2500. Mass in Kg.\", \"name\": null}], \"params\": {\"temperature\": 1.0, \"max_tokens\": null, \"stop\": null, \"n\": 1, \"stream\": false, \"top_p\": null, \"top_k\": null, \"frequency_penalty\": null, \"presence_penalty\": null}}",
+ "response": "{\"choices\": [{\"index\": 0, \"message\": {\"role\": \"user\", \"content\": \"Based on the provided spacecraft dry mass of 10000 kg, total mass of 50000 kg, and specific impulse of 2500 s, the next optimal launch window for a Hohmann transfer from Earth to Mars is on November 26, 2026 UTC. The transfer will take 259 days.\", \"name\": null}, \"finish_reason\": \"stop\", \"logprobs\": null}], \"usage\": {\"prompt_tokens\": null, \"completion_tokens\": null, \"total_tokens\": null}, \"id\": null, \"model\": \"anthropic.claude-v2\", \"object\": \"chat.completion\", \"created\": 1731388550}"
+}
+```
+
+
+### Visualizing Trace Breakdown in the MLflow UI
+
+1. Initial Prompt Submitted to the Bedrock Agent.
+ ![Thumbnail](bedrock_input_prompt.png)
+
+2. In this trace, we can observe how the Bedrock Agent evaluates and selects the most suitable Action Group for the task at hand.
+ ![Thumbnail](action_group_decision.png)
+
+3. Once an Action Group is selected, its invocation is traced, displaying the input and output interactions with the underlying Lambda function as outlined by the OpenAPI Spec above.
+ ![Thumbnail](invoking_action_group.png)
+
+4. Furthermore, Bedrock's supplementary trace is included under the Attributes section,
+   along with additional metadata, as shown below.
+ ![Thumbnail](traces_attributes.png)
+
+5. Subsequently, the final response from the agent is traced, as depicted below.
+ ![Thumbnail](retrieved_response.png)
+
+**Note**: We cannot break down the span's duration into individual trace durations
+because the Bedrock Agent's trace response does not include timestamps for each trace step.
+
+## Conclusion
+
+In this blog, we explored how to integrate the AWS Bedrock Agent as an MLflow ChatModel, focusing on Action Groups,
+Knowledge Bases, and Tracing. We demonstrated how to easily build a custom ChatModel using MLflow's flexible and
+powerful APIs. This approach enables you to leverage MLflow's tracing and logging capabilities, even for models or
+flavors that are not natively supported by MLflow.
+
+Key Takeaways from This Blog:
+
+- Deploying a Bedrock Agent with Action Groups as AWS Lambda Functions:
+ - We covered how to set up a Bedrock Agent and implement custom actions using AWS Lambda functions within Action Groups.
+- Mapping the AWS Bedrock Agent's Custom Tracing to MLflow span/trace objects:
+ - We demonstrated how to convert the agent's custom tracing data into MLflow span objects for better observability.
+- Logging and Loading the Bedrock Agent as an MLflow ChatModel:
+ - We showed how to log the Bedrock Agent into MLflow as a _`ChatModel`_ and how to load it for future use.
+- Externalizing AWS Client and Bedrock Configurations:
+ - We explained how to externalize AWS client and Bedrock configurations to safeguard secrets and make it easy to adjust model settings without the need to re-log the model.
+
+## Further Reading and References
+
+- [How Amazon Bedrock Agents work](https://docs.aws.amazon.com/bedrock/latest/userguide/agents-how.html)
+- [Amazon Bedrock Tracing](https://docs.aws.amazon.com/bedrock/latest/userguide/trace-events.html)
+- [Creating a Custom GenAI chat agent](https://mlflow.org/docs/latest/llms/chat-model-guide/index.html)
+- [AWS Code Examples Repository](https://github.com/awsdocs/aws-doc-sdk-examples)
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/invoking_action_group.png b/website/blog/2024-11-07-bedrock-chat-model-part-1/invoking_action_group.png
new file mode 100644
index 000000000..8888d3b94
Binary files /dev/null and b/website/blog/2024-11-07-bedrock-chat-model-part-1/invoking_action_group.png differ
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/retrieved_response.png b/website/blog/2024-11-07-bedrock-chat-model-part-1/retrieved_response.png
new file mode 100644
index 000000000..decb79b3f
Binary files /dev/null and b/website/blog/2024-11-07-bedrock-chat-model-part-1/retrieved_response.png differ
diff --git a/website/blog/2024-11-07-bedrock-chat-model-part-1/traces_attributes.png b/website/blog/2024-11-07-bedrock-chat-model-part-1/traces_attributes.png
new file mode 100644
index 000000000..f3756b5a6
Binary files /dev/null and b/website/blog/2024-11-07-bedrock-chat-model-part-1/traces_attributes.png differ
diff --git a/website/blog/2024-12-20-mlflow-tracing-in-jupyter/index.md b/website/blog/2024-12-20-mlflow-tracing-in-jupyter/index.md
new file mode 100644
index 000000000..9d0ad1df5
--- /dev/null
+++ b/website/blog/2024-12-20-mlflow-tracing-in-jupyter/index.md
@@ -0,0 +1,82 @@
+---
+title: MLflow Tracing in Jupyter Notebooks
+description: Introducing MLflow Tracing's Jupyter integration
+slug: mlflow-tracing-in-jupyter
+authors: [daniel-lok]
+tags: [genai, mlops, tracing]
+thumbnail: /img/blog/mlflow-tracing-in-jupyter.png
+---
+
+![Thumbnail](mlflow-tracing-in-jupyter-title.png)
+
+🚀 We're excited to announce a major upgrade to the [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html)
+experience!
+
+If you're not familiar with MLflow Tracing, it's an observability tool that allows you to record the inputs and
+outputs of intermediate function executions. It's particularly useful for debugging GenAI applications, and MLflow has over
+a [dozen integrations with popular GenAI frameworks](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing)
+to automatically generate traces without requiring you to change your existing code.
+
+As of **MLflow 2.20**, you can now view the MLflow Trace UI directly within Jupyter notebooks, allowing
+you to debug your applications without having to tab out of your development environment. Context
+switching can often be disruptive to one's workflow, and this feature makes it easier to stay focused while
+still being able to visualize the trace data that you generate.
+
+![Trace UI in Jupyter](jupyter-trace-ui.png)
+
+## Getting Started
+
+To get started, you'll need to be using an [MLflow Tracking Server](https://mlflow.org/docs/latest/tracking/server.html).
+Under the hood, the MLflow client needs to make network requests in order to fetch the UI assets and trace data.
+
+If you don't use a remote server, you can always start one locally by running the `mlflow server`
+[CLI command](https://mlflow.org/docs/latest/tracking/server.html#start-the-tracking-server). By default,
+the server will start up at `http://localhost:5000`.
+
+In your notebook, simply ensure that the MLflow Tracking URI is set to your tracking server, and you're good to go!
+
+```python
+import mlflow
+
+# replace this with your own URI, if it differs
+tracking_uri = "http://localhost:5000"
+mlflow.set_tracking_uri(tracking_uri)
+
+# create a new experiment to avoid cluttering the default experiment
+experiment = mlflow.set_experiment("mlflow-trace-ui-demo")
+
+# the trace UI should now show up whenever traces are generated,
+# for example:
+@mlflow.trace
+def add(a, b):
+ return a + b
+
+# running the traced function triggers the UI display
+add(1, 2)
+```
+
+The trace UI will show up whenever any of the following events happen:
+
+1. A trace is generated in the cell (via automatic tracing, or when running manually traced functions)
+2. When a trace object is explicitly displayed (e.g. via IPython's `display()` function)
+3. When the `mlflow.search_traces()` API is called (see the sketch below)
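+
+As an illustration of scenarios 2 and 3, here's a minimal sketch that assumes the snippet above has already been run in the same
+notebook session (so a trace exists and the `experiment` variable is defined); the `mlflow.get_last_active_trace()` helper is used
+here purely for illustration:
+
+```python
+from IPython.display import display
+
+import mlflow
+
+# (2) explicitly displaying a trace object renders the inline trace UI
+trace = mlflow.get_last_active_trace()  # most recent trace from this session
+display(trace)
+
+# (3) calling the search API also renders the UI for the returned traces
+traces_df = mlflow.search_traces(experiment_ids=[experiment.experiment_id])
+```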
+
+For a hands-on experience with this feature, please try running our
+[**demo notebook**](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/tracing/notebooks/jupyter-trace-demo.ipynb)!
+The notebook contains detailed examples of all three scenarios above, as well as a short LangChain RAG demo to
+get a more realistic impression of how this feature will feel during your development loop.
+
+## Disabling and Re-enabling the Display
+
+This feature is enabled by default, but it can be turned off any time by calling `mlflow.tracing.disable_notebook_display()`.
+To remove the displays that have already rendered, you'll need to re-run the cells (or simply clear the cell output).
+
+If you want to re-enable the display, you can call `mlflow.tracing.enable_notebook_display()`.
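+
+For reference, toggling the display in a notebook looks like this (a minimal sketch using the two APIs mentioned above):
+
+```python
+import mlflow
+
+# turn the inline trace UI off for subsequent cells
+mlflow.tracing.disable_notebook_display()
+
+# ... run traced code here without rendering the display ...
+
+# turn the inline display back on when you want it again
+mlflow.tracing.enable_notebook_display()
+```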
+
+## Bug Reports and Feedback
+
+To report bugs or provide feedback, please file an issue in the
+[MLflow GitHub repo](https://github.com/mlflow/mlflow/issues). We're looking forward to hearing from you!
diff --git a/website/blog/2024-12-20-mlflow-tracing-in-jupyter/jupyter-trace-ui.png b/website/blog/2024-12-20-mlflow-tracing-in-jupyter/jupyter-trace-ui.png
new file mode 100644
index 000000000..e08e901bb
Binary files /dev/null and b/website/blog/2024-12-20-mlflow-tracing-in-jupyter/jupyter-trace-ui.png differ
diff --git a/website/blog/2024-12-20-mlflow-tracing-in-jupyter/mlflow-tracing-in-jupyter-title.png b/website/blog/2024-12-20-mlflow-tracing-in-jupyter/mlflow-tracing-in-jupyter-title.png
new file mode 100644
index 000000000..ebb2f8954
Binary files /dev/null and b/website/blog/2024-12-20-mlflow-tracing-in-jupyter/mlflow-tracing-in-jupyter-title.png differ
diff --git a/website/blog/authors.yml b/website/blog/authors.yml
index d14f83909..966c88c10 100644
--- a/website/blog/authors.yml
+++ b/website/blog/authors.yml
@@ -91,3 +91,33 @@ michael-berk:
title: Sr. Resident Solutions Architect at Databricks
url: https://www.linkedin.com/in/-michael-berk/
image_url: /img/authors/michael_berk.png
+
+pedro-azevedo:
+ name: Pedro Azevedo
+ title: Machine Learning Analyst at Adidas
+ url: https://www.linkedin.com/in/pedro-azevedo-/
+ image_url: /img/authors/pedro.png
+
+awadelrahman-ahmed:
+ name: Awadelrahman M. A. Ahmed
+ title: MLflow Ambassador | Cloud Data & Analytics Architect at REMA 1000
+ url: https://www.linkedin.com/in/awadelrahman/
+ image_url: /img/authors/awadelrahman_ahmed.png
+
+yuki-watanabe:
+ name: Yuki Watanabe
+ title: Software Engineer at Databricks
+ url: https://www.linkedin.com/in/yuki-watanabe-a04528164/
+ image_url: /img/authors/yuki_watanabe.png
+
+jas-bali:
+ name: Jas Bali
+ title: Lead Specialist Solutions Architect at Databricks
+ url: https://www.linkedin.com/in/jas-bali-195ba410a/
+ image_url: /img/authors/jas_bali.png
+
+daniel-lok:
+ name: Daniel Lok
+ title: Software Engineer at Databricks
+ url: https://www.linkedin.com/in/daniel-yk-lok/
+ image_url: /img/authors/daniel_lok.png
diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts
index 4d97d2219..1c7f83318 100644
--- a/website/docusaurus.config.ts
+++ b/website/docusaurus.config.ts
@@ -95,10 +95,6 @@ const config: Config = {
from: "/blog/2024/01/26/mlflow-year-in-review",
to: "/blog/mlflow-year-in-review",
},
- {
- from: "/blog/2023/11/30/ai-gateway-rename",
- to: "/blog/ai-gateway-rename",
- },
{
from: "/blog/2023/11/30/mlflow-autolog",
to: "/blog/mlflow-autolog",
@@ -111,6 +107,10 @@ const config: Config = {
from: "/blog/Deep Learning",
to: "/blog/deep-learning-part-1",
},
+ {
+ from: "/blog/mlflow",
+ to: "/blog/langgraph-model-from-code",
+ },
],
},
],
@@ -218,6 +218,14 @@ const config: Config = {
theme: prismThemes.vsDark,
darkTheme: prismThemes.vsDark,
},
+ announcementBar: {
+ id: "survey_bar",
+ content:
+ 'Help us improve MLflow by taking our survey!',
+ backgroundColor: "#0194e2",
+ textColor: "#ffffff",
+ isCloseable: false,
+ },
} satisfies Preset.ThemeConfig,
};
diff --git a/website/package.json b/website/package.json
index 1898fc2be..1fa770927 100644
--- a/website/package.json
+++ b/website/package.json
@@ -14,6 +14,7 @@
"fmt-notes": "ts-node scripts/release-note-format.js",
"compile": "ts-node scripts/compile.ts && prettier --write src/posts.ts",
"check-thumbnails": "ts-node scripts/check-thumbnails.ts",
+ "check-authors": "ts-node scripts/check-authors.ts",
"write-translations": "docusaurus write-translations",
"write-heading-ids": "docusaurus write-heading-ids",
"typecheck": "tsc",
diff --git a/website/releases/2024-09-13-2.16.1-release.md b/website/releases/2024-09-13-2.16.1-release.md
new file mode 100644
index 000000000..7f49f6b1e
--- /dev/null
+++ b/website/releases/2024-09-13-2.16.1-release.md
@@ -0,0 +1,29 @@
+---
+title: MLflow 2.16.1
+slug: 2.16.1
+authors: [mlflow-maintainers]
+---
+
+## 2.16.1 (2024-09-13)
+
+MLflow 2.16.1 is a patch release that includes some minor feature improvements and addresses several bug fixes.
+
+Features:
+
+- [Tracing] Add Support for an Open Telemetry compatible exporter to configure external sinks for MLflow traces ([#13118](https://github.com/mlflow/mlflow/pull/13118), [@B-Step62](https://github.com/B-Step62))
+- [Model Registry, AWS] Add support for utilizing AWS KMS-based encryption for the MLflow Model Registry ([#12495](https://github.com/mlflow/mlflow/pull/12495), [@artjen](https://github.com/artjen))
+- [Model Registry] Add support for using the OSS Unity Catalog server as a Model Registry ([#13034](https://github.com/mlflow/mlflow/pull/13034), [#13065](https://github.com/mlflow/mlflow/pull/13065), [#13066](https://github.com/mlflow/mlflow/pull/13066), [@rohitarun-db](https://github.com/rohitarun-db))
+- [Models] Introduce path-based transformers logging to reduce memory requirements for saving large transformers models ([#13070](https://github.com/mlflow/mlflow/pull/13070), [@B-Step62](https://github.com/B-Step62))
+
+Bug fixes:
+
+- [Tracking] Fix a data payload size issue with `Model.get_tags_dict` by eliminating the return of the internally-used `config` field ([#13086](https://github.com/mlflow/mlflow/pull/13086), [@harshilprajapati96](https://github.com/harshilprajapati96))
+- [Models] Fix an issue with LangChain Agents where sub-dependencies were not being properly extracted ([#13105](https://github.com/mlflow/mlflow/pull/13105), [@aravind-segu](https://github.com/aravind-segu))
+- [Tracking] Fix an issue where the wrong checkpoint for the current best model in auto checkpointing was being selected ([#12981](https://github.com/mlflow/mlflow/pull/12981), [@hareeen](https://github.com/hareeen))
+- [Tracking] Fix an issue where local timezones for trace initialization were not being taken into account in AutoGen tracing ([#13047](https://github.com/mlflow/mlflow/pull/13047), [@B-Step62](https://github.com/B-Step62))
+
+Documentation updates:
+
+- [Docs] Added RunLLM chat widget to MLflow's documentation site ([#13123](https://github.com/mlflow/mlflow/pull/13123), [@likawind](https://github.com/likawind))
+
+For a comprehensive list of changes, see the [release change log](https://github.com/mlflow/mlflow/releases/tag/v2.16.1), and check out the latest documentation on [mlflow.org](http://mlflow.org/).
diff --git a/website/releases/2024-09-17-2.16.2-release.md b/website/releases/2024-09-17-2.16.2-release.md
new file mode 100644
index 000000000..8cca7295b
--- /dev/null
+++ b/website/releases/2024-09-17-2.16.2-release.md
@@ -0,0 +1,13 @@
+---
+title: MLflow 2.16.2
+slug: 2.16.2
+authors: [mlflow-maintainers]
+---
+
+## 2.16.2 (2024-09-17)
+
+MLflow 2.16.2 is a patch release that includes some minor feature improvements and addresses several bug fixes.
+
+- [Models] Revert "Update Dependency Extraction for Agents ([#13105](https://github.com/mlflow/mlflow/pull/13105))" ([#13155](https://github.com/mlflow/mlflow/pull/13155), [@aravind-segu](https://github.com/aravind-segu))
+
+For a comprehensive list of changes, see the [release change log](https://github.com/mlflow/mlflow/releases/tag/v2.16.2), and check out the latest documentation on [mlflow.org](http://mlflow.org/).
diff --git a/website/releases/2024-10-11-2.17.0-release.md b/website/releases/2024-10-11-2.17.0-release.md
new file mode 100644
index 000000000..881886695
--- /dev/null
+++ b/website/releases/2024-10-11-2.17.0-release.md
@@ -0,0 +1,67 @@
+---
+title: MLflow 2.17.0
+slug: 2.17.0
+authors: [mlflow-maintainers]
+---
+
+## 2.17.0 (2024-10-11)
+
+We are excited to announce the release of MLflow 2.17.0! This release includes several enhancements that extend the
+functionality of MLflow's ChatModel interface, further increasing its versatility for handling custom GenAI application use cases.
+Additionally, we've improved the interface within the tracing UI to provide a structured output for retrieved documents,
+enhancing the ability to read the contents of those documents within the UI.
+We're also starting the work on improving both the utility and the versatility of MLflow's evaluate functionality for GenAI,
+initially with support for callable GenAI evaluation metrics.
+
+### Major Features and Notifications
+
+- **ChatModel enhancements** - As the GenAI-focused 'cousin' of `PythonModel`, `ChatModel` is getting some sizable functionality
+  extensions: native support for tool calling (a requirement for creating a custom agent), simpler conversion to the
+  internal dataclass constructs used to interface with `ChatModel` via new `from_dict` methods on all data structures,
+  a `metadata` field that allows for full input payload customization, handling of the new `refusal` response type, and the
+  inclusion of the interface type in the response structure for greater integration compatibility.
+ ([#13191](https://github.com/mlflow/mlflow/pull/13191), [#13180](https://github.com/mlflow/mlflow/pull/13180), [#13143](https://github.com/mlflow/mlflow/pull/13143), [@daniellok-db](https://github.com/daniellok-db), [#13102](https://github.com/mlflow/mlflow/pull/13102), [#13071](https://github.com/mlflow/mlflow/pull/13071), [@BenWilson2](https://github.com/BenWilson2))
+
+- **Callable GenAI Evaluation Metrics** - As the initial step in a much broader expansion of the functionalities of `mlflow.evaluate` for
+ GenAI use cases, we've converted the GenAI evaluation metrics to be callable. This allows you to use them directly in packages that support
+ callable GenAI evaluation metrics, as well as making it simpler to debug individual responses when prototyping solutions. ([#13144](https://github.com/mlflow/mlflow/pull/13144), [@serena-ruan](https://github.com/serena-ruan))
+
+- **Audio file support in the MLflow UI** - You can now directly 'view' audio files that have been logged and listen to them from within the MLflow UI's
+ artifact viewer pane.
+
+- **MLflow AI Gateway is no longer deprecated** - We've decided to revert our deprecation for the AI Gateway feature. We had renamed it to the
+ MLflow Deployments Server, but have reconsidered and reverted the naming and namespace back to the original configuration.
+
+Features:
+
+- [Tracing] Add Standardization to retriever span outputs within MLflow tracing ([#13242](https://github.com/mlflow/mlflow/pull/13242), [@daniellok-db](https://github.com/daniellok-db))
+- [Models] Add support for LlamaIndex `Workflows` objects to be serialized when calling `log_model()` ([#13277](https://github.com/mlflow/mlflow/pull/13277), [#13305](https://github.com/mlflow/mlflow/pull/13305), [#13336](https://github.com/mlflow/mlflow/pull/13336), [@B-Step62](https://github.com/B-Step62))
+- [Models] Add tool calling support for ChatModel ([#13191](https://github.com/mlflow/mlflow/pull/13191), [@daniellok-db](https://github.com/daniellok-db))
+- [Models] Add `from_dict()` function to ChatModel dataclasses ([#13180](https://github.com/mlflow/mlflow/pull/13180), [@daniellok-db](https://github.com/daniellok-db))
+- [Models] Add metadata field for ChatModel ([#13143](https://github.com/mlflow/mlflow/pull/13143), [@daniellok-db](https://github.com/daniellok-db))
+- [Models] Update ChatCompletionResponse to populate object type ([#13102](https://github.com/mlflow/mlflow/pull/13102), [@BenWilson2](https://github.com/BenWilson2))
+- [Models] Add support for LLM response refusal ([#13071](https://github.com/mlflow/mlflow/pull/13071), [@BenWilson2](https://github.com/BenWilson2))
+- [Models] Add support for resources to be passed in via `langchain.log_model()` ([#13315](https://github.com/mlflow/mlflow/pull/13315), [@sunishsheth2009](https://github.com/sunishsheth2009))
+- [Tracking] Add support for setting multiple retrievers' schema via `set_retriever_schema` ([#13246](https://github.com/mlflow/mlflow/pull/13246), [@sunishsheth2009](https://github.com/sunishsheth2009))
+- [Eval] Make Evaluation metrics callable ([#13144](https://github.com/mlflow/mlflow/pull/13144), [@serena-ruan](https://github.com/serena-ruan))
+- [UI] Add audio support to artifact viewer UI ([#13017](https://github.com/mlflow/mlflow/pull/13017), [@sydneyw-spotify](https://github.com/sydneyw-spotify))
+- [Databricks] Add support for route_optimized parameter in databricks deployment client ([#13222](https://github.com/mlflow/mlflow/pull/13222), [@prabhatkgupta](https://github.com/prabhatkgupta))
+
+Bug fixes:
+
+- [Tracking] Fix tracing for LangGraph ([#13215](https://github.com/mlflow/mlflow/pull/13215), [@B-Step62](https://github.com/B-Step62))
+- [Tracking] Fix an issue with `presigned_url_artifact` requests being in the wrong format ([#13366](https://github.com/mlflow/mlflow/pull/13366), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Models] Update Databricks dependency extraction functionality to work with the `langchain-databricks` partner package. ([#13266](https://github.com/mlflow/mlflow/pull/13266), [@B-Step62](https://github.com/B-Step62))
+- [Model Registry] Fix retry and credential refresh issues with artifact downloads from the model registry ([#12935](https://github.com/mlflow/mlflow/pull/12935), [@rohitarun-db](https://github.com/rohitarun-db))
+- [Tracking] Fix LangChain autologging so that langchain-community is not required for partner packages ([#13172](https://github.com/mlflow/mlflow/pull/13172), [@B-Step62](https://github.com/B-Step62))
+- [Artifacts] Fix issues with file removal for the local artifact repository ([#13005](https://github.com/mlflow/mlflow/pull/13005), [@rzalawad](https://github.com/rzalawad))
+
+Documentation updates:
+
+- [Docs] Add guide for building custom GenAI apps with ChatModel ([#13207](https://github.com/mlflow/mlflow/pull/13207), [@BenWilson2](https://github.com/BenWilson2))
+- [Docs] Add updates to the MLflow AI Gateway documentation ([#13217](https://github.com/mlflow/mlflow/pull/13217), [@daniellok-db](https://github.com/daniellok-db))
+- [Docs] Remove MLflow AI Gateway deprecation status ([#13153](https://github.com/mlflow/mlflow/pull/13153), [@BenWilson2](https://github.com/BenWilson2))
+- [Docs] Add contribution guide for MLflow tracing integrations ([#13333](https://github.com/mlflow/mlflow/pull/13333), [@B-Step62](https://github.com/B-Step62))
+- [Docs] Add documentation regarding the `run_id` parameter within the `search_trace` API ([#13251](https://github.com/mlflow/mlflow/pull/13251), [@B-Step62](https://github.com/B-Step62))
+
+Please try it out and report any issues on [the issue tracker](https://github.com/mlflow/mlflow/issues).
diff --git a/website/releases/2024-10-25-2.17.1-release.md b/website/releases/2024-10-25-2.17.1-release.md
new file mode 100644
index 000000000..d21719710
--- /dev/null
+++ b/website/releases/2024-10-25-2.17.1-release.md
@@ -0,0 +1,33 @@
+---
+title: MLflow 2.17.1
+slug: 2.17.1
+authors: [mlflow-maintainers]
+---
+
+## 2.17.1 (2024-10-25)
+
+MLflow 2.17.1 includes several major features and improvements.
+
+Features:
+
+- [Tracking] Support custom chat endpoint without endpoint type set as llm judge ([#13538](https://github.com/mlflow/mlflow/pull/13538), [@B-Step62](https://github.com/B-Step62))
+- [Tracking] Support tracing for OpenAI Swarm ([#13497](https://github.com/mlflow/mlflow/pull/13497), [@B-Step62](https://github.com/B-Step62))
+- [Tracking] Support UC Connections as model dependency and resources ([#13481](https://github.com/mlflow/mlflow/pull/13481), [#13491](https://github.com/mlflow/mlflow/pull/13491) [@sunishsheth2009](https://github.com/sunishsheth2009))
+- [Tracking] Support Genie Spaces as model resources ([#13441](https://github.com/mlflow/mlflow/pull/13441), [@aravind-segu](https://github.com/aravind-segu))
+- [Models] Support new Transformers task for llm/v1/embedding ([#13468](https://github.com/mlflow/mlflow/pull/13468), [@B-Step62](https://github.com/B-Step62))
+
+Bug fixes:
+
+- [Tracking] Fix tool span inputs/outputs format in LangChain autolog ([#13527](https://github.com/mlflow/mlflow/pull/13527), [@B-Step62](https://github.com/B-Step62))
+- [Models] Fix code_path handling for LlamaIndex flavor ([#13486](https://github.com/mlflow/mlflow/pull/13486), [@B-Step62](https://github.com/B-Step62))
+- [Models] Fix signature inference for subclass and optional dataclasses ([#13440](https://github.com/mlflow/mlflow/pull/13440), [@bbqiu](https://github.com/bbqiu))
+- [Tracking] Fix error thrown in set_retriever_schema's behavior when it's called twice ([#13422](https://github.com/mlflow/mlflow/pull/13422), [@sunishsheth2009](https://github.com/sunishsheth2009))
+- [Tracking] Fix dependency extraction from RunnableCallables ([#13423](https://github.com/mlflow/mlflow/pull/13423), [@aravind-segu](https://github.com/aravind-segu))
+
+Documentation updates:
+
+- [Docs] Fixed typo in docs: endpoing -> endpoint ([#13478](https://github.com/mlflow/mlflow/pull/13478), [@JAMNESIA](https://github.com/JAMNESIA))
+- [Docs] Improve CLI docs - attention about setting MLFLOW_TRACKING_URI ([#13465](https://github.com/mlflow/mlflow/pull/13465), [@BartoszLitwiniuk](https://github.com/BartoszLitwiniuk))
+- [Docs] Add documentation for infer_signature usage with GenAI flavors ([#13407](https://github.com/mlflow/mlflow/pull/13407), [@serena-ruan](https://github.com/serena-ruan))
+
+For a comprehensive list of changes, see the [release change log](https://github.com/mlflow/mlflow/releases/tag/v2.17.1), and check out the latest documentation on [mlflow.org](http://mlflow.org/).
diff --git a/website/releases/2024-10-31-2.17.2-release.md b/website/releases/2024-10-31-2.17.2-release.md
new file mode 100644
index 000000000..92b585f1b
--- /dev/null
+++ b/website/releases/2024-10-31-2.17.2-release.md
@@ -0,0 +1,23 @@
+---
+title: MLflow 2.17.2
+slug: 2.17.2
+authors: [mlflow-maintainers]
+---
+
+MLflow 2.17.2 includes several major features and improvements.
+
+Features:
+
+- [Model Registry] DatabricksSDKModelsArtifactRepository support ([#13203](https://github.com/mlflow/mlflow/pull/13203), [@shichengzhou-db](https://github.com/shichengzhou-db))
+- [Tracking] Support extracting new UCFunctionToolkit as model resources ([#13567](https://github.com/mlflow/mlflow/pull/13567), [@serena-ruan](https://github.com/serena-ruan))
+
+Bug fixes:
+
+- [Models] Fix RunnableBinding saving ([#13566](https://github.com/mlflow/mlflow/pull/13566), [@B-Step62](https://github.com/B-Step62))
+- [Models] Pin numpy when pandas < 2.1.2 in pip requirements ([#13580](https://github.com/mlflow/mlflow/pull/13580), [@serena-ruan](https://github.com/serena-ruan))
+
+Documentation updates:
+
+- [Docs] ChatModel tool calling tutorial ([#13542](https://github.com/mlflow/mlflow/pull/13542), [@daniellok-db](https://github.com/daniellok-db))
+
+For a comprehensive list of changes, see the [release change log](https://github.com/mlflow/mlflow/releases/tag/v2.17.2), and check out the latest documentation on [mlflow.org](http://mlflow.org/).
diff --git a/website/releases/2024-11-12-2.18.0-release.md b/website/releases/2024-11-12-2.18.0-release.md
new file mode 100644
index 000000000..95fab8a37
--- /dev/null
+++ b/website/releases/2024-11-12-2.18.0-release.md
@@ -0,0 +1,85 @@
+---
+title: MLflow 2.18.0
+slug: 2.18.0
+authors: [mlflow-maintainers]
+---
+
+We are excited to announce the release of MLflow 2.18.0! This release includes a number of significant features, enhancements, and bug fixes.
+
+### Python Version Update
+
+Python 3.8 is now at an end-of-life point. With official support being dropped for this legacy version, **MLflow now requires Python 3.9**
+as a minimum supported version.
+
+> Note: If you are currently using MLflow's `ChatModel` interface for authoring custom GenAI applications, please ensure that you
+> have read the future breaking changes section below.
+
+### Major New Features
+
+- **🦺 Fluent API Thread/Process Safety** - MLflow's fluent APIs for tracking and the model registry have been overhauled to add support for both thread and multi-process safety. You are now no longer forced to use the Client APIs for managing experiments, runs, and logging from within multiprocessing and threaded applications. ([#13456](https://github.com/mlflow/mlflow/pull/13456), [#13419](https://github.com/mlflow/mlflow/pull/13419), [@WeichenXu123](https://github.com/WeichenXu123))
+
+- **🧩 DSPy flavor** - MLflow now supports logging, loading, and tracing of `DSPy` models, broadening the support for advanced GenAI authoring within MLflow. Check out the [MLflow DSPy Flavor](https://mlflow.org/docs/latest/llms/dspy/index.html) documentation to get started! ([#13131](https://github.com/mlflow/mlflow/pull/13131), [#13279](https://github.com/mlflow/mlflow/pull/13279), [#13369](https://github.com/mlflow/mlflow/pull/13369), [#13345](https://github.com/mlflow/mlflow/pull/13345), [@chenmoneygithub](https://github.com/chenmoneygithub), [#13543](https://github.com/mlflow/mlflow/pull/13543), [#13800](https://github.com/mlflow/mlflow/pull/13800), [#13807](https://github.com/mlflow/mlflow/pull/13807), [@B-Step62](https://github.com/B-Step62), [#13289](https://github.com/mlflow/mlflow/pull/13289), [@michael-berk](https://github.com/michael-berk))
+
+- **🖥️ Enhanced Trace UI** - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html)'s UI has undergone
+ a significant overhaul to bring usability and quality of life updates to the experience of auditing and investigating the contents of GenAI traces, from enhanced span content rendering using markdown to a standardized span component structure. ([#13685](https://github.com/mlflow/mlflow/pull/13685), [#13357](https://github.com/mlflow/mlflow/pull/13357), [#13242](https://github.com/mlflow/mlflow/pull/13242), [@daniellok-db](https://github.com/daniellok-db))
+
+- **🚄 New Tracing Integrations** - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) now supports **DSPy**, **LiteLLM**, and **Google Gemini**, enabling a one-line, fully automated tracing experience. These integrations unlock enhanced observability across a broader range of industry tools. Stay tuned for upcoming integrations and updates! ([#13801](https://github.com/mlflow/mlflow/pull/13801), [@TomeHirata](https://github.com/TomeHirata), [#13585](https://github.com/mlflow/mlflow/pull/13585), [@B-Step62](https://github.com/B-Step62))
+
+- **📊 Expanded LLM-as-a-Judge Support** - MLflow now enhances its evaluation capabilities with support for additional providers, including `Anthropic`, `Bedrock`, `Mistral`, and `TogetherAI`, alongside existing providers like `OpenAI`. Users can now also configure proxy endpoints or self-hosted LLMs that follow the provider API specs by using the new `proxy_url` and `extra_headers` options. Visit the [LLM-as-a-Judge](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#llm-as-a-judge-metrics) documentation for more details! ([#13715](https://github.com/mlflow/mlflow/pull/13715), [#13717](https://github.com/mlflow/mlflow/pull/13717), [@B-Step62](https://github.com/B-Step62))
+
+- **⏰ Environment Variable Detection** - As a helpful reminder for when you are deploying models, MLflow now detects and reminds users of environment variables set during model logging, ensuring they are configured for deployment. In addition to this, the `mlflow.models.predict` utility has also been updated to include these variables in serving simulations, improving pre-deployment validation. ([#13584](https://github.com/mlflow/mlflow/pull/13584), [@serena-ruan](https://github.com/serena-ruan))
+
+### Breaking Changes to ChatModel Interface
+
+- **ChatModel Interface Updates** - As part of a broader unification effort within MLflow and services that rely on or deeply integrate
+ with MLflow's GenAI features, we are working on a phased approach to making a consistent and standard interface for custom GenAI
+ application development and usage. In the first phase (planned for release in the next few releases of MLflow), we are marking
+ several interfaces as deprecated, as they will be changing. These changes will be:
+
+ - **Renaming of Interfaces**:
+ - `ChatRequest` → `ChatCompletionRequest` to provide disambiguation for future planned request interfaces.
+ - `ChatResponse` → `ChatCompletionResponse` for the same reason as the input interface.
+ - `metadata` fields within `ChatRequest` and `ChatResponse` → `custom_inputs` and `custom_outputs`, respectively.
+ - **Streaming Updates**:
+ - `predict_stream` will be updated to enable true streaming for custom GenAI applications. Currently, it returns a generator with synchronous outputs from predict. In a future release, it will return a generator of `ChatCompletionChunks`, enabling asynchronous streaming. While the API call structure will remain the same, the returned data payload will change significantly, aligning with LangChain’s implementation.
+ - **Legacy Dataclass Deprecation**:
+ - Dataclasses in `mlflow.models.rag_signatures` will be deprecated, merging into unified `ChatCompletionRequest`, `ChatCompletionResponse`, and `ChatCompletionChunks`.
+
+Other Features:
+
+- [Evaluate] Add Huggingface BLEU metrics to MLflow Evaluate ([#12799](https://github.com/mlflow/mlflow/pull/12799), [@nebrass](https://github.com/nebrass))
+- [Models / Databricks] Add support for `spark_udf` when running on Databricks Serverless runtime, Databricks Connect, and prebuilt Python environments ([#13276](https://github.com/mlflow/mlflow/pull/13276), [#13496](https://github.com/mlflow/mlflow/pull/13496), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Scoring] Add a `model_config` parameter for `pyfunc.spark_udf` for customization of batch inference payload submission ([#13517](https://github.com/mlflow/mlflow/pull/13517), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Tracing] Standardize retriever span outputs to a list of MLflow `Document`s ([#13242](https://github.com/mlflow/mlflow/pull/13242), [@daniellok-db](https://github.com/daniellok-db))
+- [UI] Add support for visualizing and comparing nested parameters within the MLflow UI ([#13012](https://github.com/mlflow/mlflow/pull/13012), [@jescalada](https://github.com/jescalada))
+- [UI] Add support for comparing logged artifacts within the Compare Run page in the MLflow UI ([#13145](https://github.com/mlflow/mlflow/pull/13145), [@jescalada](https://github.com/jescalada))
+- [Databricks] Add support for `resources` definitions for `LangChain` model logging ([#13315](https://github.com/mlflow/mlflow/pull/13315), [@sunishsheth2009](https://github.com/sunishsheth2009))
+- [Databricks] Add support for defining multiple retrievers within `dependencies` for Agent definitions ([#13246](https://github.com/mlflow/mlflow/pull/13246), [@sunishsheth2009](https://github.com/sunishsheth2009))
+
+Bug fixes:
+
+- [Database] Cascade deletes to datasets when deleting experiments to fix a bug in MLflow's `gc` command when deleting experiments with logged datasets ([#13741](https://github.com/mlflow/mlflow/pull/13741), [@daniellok-db](https://github.com/daniellok-db))
+- [Models] Fix a bug with `LangChain`'s `pyfunc` predict input conversion ([#13652](https://github.com/mlflow/mlflow/pull/13652), [@serena-ruan](https://github.com/serena-ruan))
+- [Models] Fix signature inference for subclasses and `Optional` dataclasses that define a model's signature ([#13440](https://github.com/mlflow/mlflow/pull/13440), [@bbqiu](https://github.com/bbqiu))
+- [Tracking] Fix an issue with async logging batch splitting validation rules ([#13722](https://github.com/mlflow/mlflow/pull/13722), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Tracking] Fix an issue with `LangChain`'s autologging thread-safety behavior ([#13672](https://github.com/mlflow/mlflow/pull/13672), [@B-Step62](https://github.com/B-Step62))
+- [Tracking] Disable support for running Spark autologging in a threadpool due to limitations in Spark ([#13599](https://github.com/mlflow/mlflow/pull/13599), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Tracking] Mark `role` and `index` as required for chat schema ([#13279](https://github.com/mlflow/mlflow/pull/13279), [@chenmoneygithub](https://github.com/chenmoneygithub))
+- [Tracing] Handle raw response in OpenAI autolog ([#13802](https://github.com/mlflow/mlflow/pull/13802), [@harupy](https://github.com/harupy))
+- [Tracing] Fix a bug with tracing source run behavior when running inference with multithreading on `LangChain` models ([#13610](https://github.com/mlflow/mlflow/pull/13610), [@WeichenXu123](https://github.com/WeichenXu123))
+
+Documentation updates:
+
+- [Docs] Add docstring warnings for upcoming changes to ChatModel ([#13730](https://github.com/mlflow/mlflow/pull/13730), [@stevenchen-db](https://github.com/stevenchen-db))
+- [Docs] Add a contributor's guide for implementing tracing integrations ([#13333](https://github.com/mlflow/mlflow/pull/13333), [@B-Step62](https://github.com/B-Step62))
+- [Docs] Add guidance in the use of `model_config` when logging models as code ([#13631](https://github.com/mlflow/mlflow/pull/13631), [@sunishsheth2009](https://github.com/sunishsheth2009))
+- [Docs] Add documentation for the use of custom library artifacts with the `code_paths` model logging feature ([#13702](https://github.com/mlflow/mlflow/pull/13702), [@TomeHirata](https://github.com/TomeHirata))
+- [Docs] Improve `SparkML` `log_model` documentation with guidance on how to return probabilities from classification models ([#13684](https://github.com/mlflow/mlflow/pull/13684), [@WeichenXu123](https://github.com/WeichenXu123))
+
+For a comprehensive list of changes, see the [release change log](https://github.com/mlflow/mlflow/releases/tag/v2.18.0), and check out the latest documentation on [mlflow.org](http://mlflow.org/).
diff --git a/website/releases/2024-12-11-2.19.0-release.md b/website/releases/2024-12-11-2.19.0-release.md
new file mode 100644
index 000000000..be8ef24e7
--- /dev/null
+++ b/website/releases/2024-12-11-2.19.0-release.md
@@ -0,0 +1,45 @@
+---
+title: MLflow 2.19.0
+slug: 2.19.0
+authors: [mlflow-maintainers]
+---
+
+## 2.19.0 (2024-12-11)
+
+We are excited to announce the release of MLflow 2.19.0! This release includes a number of significant features, enhancements, and bug fixes.
+
+### Major New Features
+
+- **ChatModel enhancements** - [ChatModel](https://mlflow.org/docs/latest/llms/chat-model-guide/index.html) now adopts `ChatCompletionRequest` and `ChatCompletionResponse` as its new schema. The `predict_stream` interface uses `ChatCompletionChunk` to deliver true streaming responses. Additionally, the `custom_inputs` and `custom_outputs` fields in ChatModel now utilize `AnyType`, enabling support for a wider variety of data types. **Note:** In a future version of MLflow, `ChatParams` (and by extension, `ChatCompletionRequest`) will have the default values for `n`, `temperature`, and `stream` removed. ([#13782](https://github.com/mlflow/mlflow/pull/13782), [#13857](https://github.com/mlflow/mlflow/pull/13857), [@stevenchen-db](https://github.com/stevenchen-db))
+
+- **Tracing improvements** - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) now supports both automatic and manual tracing for the DSPy, LlamaIndex, and LangChain flavors. Tracing is also automatically enabled during MLflow evaluation for all supported flavors. ([#13790](https://github.com/mlflow/mlflow/pull/13790), [#13793](https://github.com/mlflow/mlflow/pull/13793), [#13795](https://github.com/mlflow/mlflow/pull/13795), [#13897](https://github.com/mlflow/mlflow/pull/13897), [@B-Step62](https://github.com/B-Step62))
+
+- **New Tracing Integrations** - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) now supports **CrewAI** and **Anthropic**, enabling a one-line, fully automated tracing experience. ([#13903](https://github.com/mlflow/mlflow/pull/13903), [@TomeHirata](https://github.com/TomeHirata), [#13851](https://github.com/mlflow/mlflow/pull/13851), [@gabrielfu](https://github.com/gabrielfu))
+
+- **Any Type in model signature** - MLflow now supports [AnyType](https://mlflow.org/docs/latest/model/signatures.html#supported-data-types) in model signature. It can be used to host any data types that were not supported before. ([#13766](https://github.com/mlflow/mlflow/pull/13766), [@serena-ruan](https://github.com/serena-ruan))
+
+Other Features:
+
+- [Tracking] Add `update_current_trace` API for adding tags to an active trace. ([#13828](https://github.com/mlflow/mlflow/pull/13828), [@B-Step62](https://github.com/B-Step62))
+- [Deployments] Update databricks deployments to support AI gateway & additional update endpoints ([#13513](https://github.com/mlflow/mlflow/pull/13513), [@djliden](https://github.com/djliden))
+- [Models] Support uv in mlflow.models.predict ([#13824](https://github.com/mlflow/mlflow/pull/13824), [@serena-ruan](https://github.com/serena-ruan))
+- [Models] Add type hints support including pydantic models ([#13924](https://github.com/mlflow/mlflow/pull/13924), [@serena-ruan](https://github.com/serena-ruan))
+- [Tracking] Add the `trace.search_spans()` method for searching spans within traces ([#13984](https://github.com/mlflow/mlflow/pull/13984), [@B-Step62](https://github.com/B-Step62))
+
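+A minimal sketch of the new trace helpers (this assumes `mlflow.update_current_trace` accepts a `tags` dictionary and that `trace.search_spans` can filter by span name; the tag keys are illustrative):
+
+```python
+import mlflow
+
+@mlflow.trace
+def answer(question: str) -> str:
+    # Attach tags to the trace that is currently active inside this call.
+    mlflow.update_current_trace(tags={"customer": "acme", "channel": "web"})
+    return "42"
+
+answer("What is the answer?")
+
+# Retrieve the trace we just produced and look up spans inside it by name.
+trace = mlflow.get_last_active_trace()
+spans = trace.search_spans(name="answer")
+```
+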
+Bug fixes:
+
+- [Tracking] Allow passing in spark connect dataframes in mlflow evaluate API ([#13889](https://github.com/mlflow/mlflow/pull/13889), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Tracking] Fix `mlflow.end_run` inside an MLflow run context manager ([#13888](https://github.com/mlflow/mlflow/pull/13888), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Scoring] Fix spark_udf conditional check on remote spark-connect client or Databricks Serverless ([#13827](https://github.com/mlflow/mlflow/pull/13827), [@WeichenXu123](https://github.com/WeichenXu123))
+- [Models] Allow changing max_workers for built-in LLM-as-a-Judge metrics ([#13858](https://github.com/mlflow/mlflow/pull/13858), [@B-Step62](https://github.com/B-Step62))
+- [Models] Support saving all langchain runnables using code-based logging ([#13821](https://github.com/mlflow/mlflow/pull/13821), [@serena-ruan](https://github.com/serena-ruan))
+- [Model Registry] Return an empty array when `DatabricksSDKModelsArtifactRepository.list_artifacts` is called on a file ([#14027](https://github.com/mlflow/mlflow/pull/14027), [@shichengzhou-db](https://github.com/shichengzhou-db))
+- [Tracking] Stringify param values in client.log_batch() ([#14015](https://github.com/mlflow/mlflow/pull/14015), [@B-Step62](https://github.com/B-Step62))
+- [Tracking] Remove the deprecated `squared` parameter ([#14028](https://github.com/mlflow/mlflow/pull/14028), [@B-Step62](https://github.com/B-Step62))
+- [Tracking] Fix request/response field in the search_traces output ([#13985](https://github.com/mlflow/mlflow/pull/13985), [@B-Step62](https://github.com/B-Step62))
+
+Documentation updates:
+
+- [Docs] Add Ollama and Instructor examples in tracing doc ([#13937](https://github.com/mlflow/mlflow/pull/13937), [@B-Step62](https://github.com/B-Step62))
+
+For a comprehensive list of changes, see the [release change log](https://github.com/mlflow/mlflow/releases/tag/v2.19.0), and check out the latest documentation on [mlflow.org](http://mlflow.org/).
diff --git a/website/releases/2025-01-14-2.20.0rc0-release.md b/website/releases/2025-01-14-2.20.0rc0-release.md
new file mode 100644
index 000000000..5848e2b14
--- /dev/null
+++ b/website/releases/2025-01-14-2.20.0rc0-release.md
@@ -0,0 +1,33 @@
+---
+title: MLflow 2.20.0rc0
+slug: 2.20.0rc0
+authors: [mlflow-maintainers]
+---
+
+MLflow 2.20.0rc0 is a release candidate for 2.20.0. To install, run the following command:
+
+```sh
+pip install mlflow==2.20.0rc0
+```
+
+### Major New Features
+
+- **💡Type Hint-Based Model Signature**: Define your model's [signature](https://www.mlflow.org/docs/latest/model/signatures.html) in the most **Pythonic** way. MLflow now supports defining a model signature based on the type hints in your `PythonModel`'s `predict` function, and validating input data payloads against it (see the sketch after this list). ([#14182](https://github.com/mlflow/mlflow/pull/14182), [#14168](https://github.com/mlflow/mlflow/pull/14168), [#14130](https://github.com/mlflow/mlflow/pull/14130), [#14100](https://github.com/mlflow/mlflow/pull/14100), [#14099](https://github.com/mlflow/mlflow/pull/14099), [@serena-ruan](https://github.com/serena-ruan))
+
+- **🧠 Bedrock / Groq Tracing Support**: [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html) now offers a one-line auto-tracing experience for **Amazon Bedrock** and **Groq** LLMs. Track LLM invocations within your model by simply adding a `mlflow.bedrock.autolog()` or `mlflow.groq.autolog()` call to your code. ([#14018](https://github.com/mlflow/mlflow/pull/14018), [@B-Step62](https://github.com/B-Step62), [#14006](https://github.com/mlflow/mlflow/pull/14006), [@anumita0203](https://github.com/anumita0203))
+
+- **🗒️ Inline Trace Rendering in Jupyter Notebook**: MLflow now supports rendering a trace UI **within** the notebook where you are running models. This eliminates the need to frequently switch between the notebook and browser, creating a seamless local model debugging experience. ([#13955](https://github.com/mlflow/mlflow/pull/13955), [@daniellok-db](https://github.com/daniellok-db))
+- **⚡️Faster Model Validation with `uv` Package Manager**: MLflow has adopted [uv](https://github.com/astral-sh/uv), a new Rust-based, super-fast Python package manager. This release adds support for the new package manager in the [mlflow.models.predict](https://www.mlflow.org/docs/latest/model/dependencies.html#validating-environment-for-prediction) API, enabling faster model environment validation. Stay tuned for more updates! ([#13824](https://github.com/mlflow/mlflow/pull/13824), [@serena-ruan](https://github.com/serena-ruan))
+- **🖥️ New Chat Panel in Trace UI**: The MLflow Trace UI now shows a unified `chat` panel for LLM invocations. The update allows you to view chat messages and function calls in a rich and consistent UI across LLM providers, as well as inspect the raw input and output payloads. ([#14211](https://github.com/mlflow/mlflow/pull/14211), [@TomuHirata](https://github.com/TomuHirata))
+
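+As a rough sketch of the type-hint-based workflow (the pydantic model, field names, and input payload here are illustrative, and `env_manager="uv"` assumes the uv support described above; see the linked signature and dependency-validation docs for the exact supported hints):
+
+```python
+import pydantic
+import mlflow
+from mlflow.pyfunc import PythonModel
+
+class Message(pydantic.BaseModel):
+    role: str
+    content: str
+
+class EchoModel(PythonModel):
+    # The model signature is inferred from these type hints, and incoming
+    # payloads are validated against them at inference time.
+    def predict(self, model_input: list[Message]) -> list[str]:
+        return [m.content for m in model_input]
+
+with mlflow.start_run():
+    info = mlflow.pyfunc.log_model(python_model=EchoModel(), artifact_path="model")
+
+# Validate the model environment and prediction path using the uv package manager.
+mlflow.models.predict(
+    model_uri=info.model_uri,
+    input_data=[{"role": "user", "content": "hello"}],
+    env_manager="uv",
+)
+```
+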
+### Other Features:
+
+- Introduced the `ChatAgent` base class for defining custom Python agents (#13797, @bbqiu)
+- Supported Tool Calling in DSPy Tracing (#14196, @B-Step62)
+- Added support for invokers rights in Databricks Resources (#14212, @aravind-segu)
+- Applied timeout override to within-request local scoring server for Spark UDF inference (#14202, @BenWilson2)
+- Supported dictionary type for inference params (see the sketch after this list) (#14091, @serena-ruan)
+- Made the `context` parameter optional when calling a `PythonModel` instance (#14059, @serena-ruan)
+- Set default task for `ChatModel` (#14068, @stevenchen-db)
+
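+A small sketch of passing a dictionary-valued inference parameter (the model URI and the `generation_config` key are made up for illustration; use whatever parameters your model's signature declares):
+
+```python
+import mlflow
+
+model = mlflow.pyfunc.load_model("models:/my-model/1")  # example model URI
+predictions = model.predict(
+    ["Summarize MLflow in one sentence."],
+    params={"generation_config": {"temperature": 0.1, "max_output_tokens": 128}},
+)
+```
+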
+Please try it out and report any issues on [the issue tracker](https://github.com/mlflow/mlflow/issues)!
diff --git a/website/scripts/check-authors.ts b/website/scripts/check-authors.ts
new file mode 100644
index 000000000..344d30d33
--- /dev/null
+++ b/website/scripts/check-authors.ts
@@ -0,0 +1,55 @@
+import fs from "fs";
+import path from "path";
+import yaml from "js-yaml";
+
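+// Recursively verifies that every segment of `filepath` exists with exactly the
+// given casing; a plain fs.existsSync check would pass on case-insensitive file systems.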
+function fileExistsWithCaseSync(filepath: string): boolean {
+ const dir = path.dirname(filepath);
+ if (dir === "/" || dir === ".") {
+ return true;
+ }
+ const filenames = fs.readdirSync(dir);
+ if (filenames.indexOf(path.basename(filepath)) === -1) {
+ return false;
+ }
+ return fileExistsWithCaseSync(dir);
+}
+
+type Author = {
+ name: string;
+ title: string;
+ url: string;
+ image_url?: string;
+};
+
+function readAuthors(): Record<string, Author> {
+ const authorsFile = fs.readFileSync(
+ path.join(process.cwd(), "blog", "authors.yml"),
+ "utf-8",
+ );
+ return yaml.load(authorsFile) as Record<string, Author>;
+}
+
+function main(): void {
+ const authors = readAuthors();
+ const authorsWithInvalidImageUrl: string[] = [];
+ Object.entries(authors).forEach(([author, authorData]) => {
+ if (
+ authorData.image_url &&
+ !/^https?:\/\//.test(authorData.image_url) &&
+ !fileExistsWithCaseSync(
+ path.join(process.cwd(), "static", authorData.image_url),
+ )
+ ) {
+ authorsWithInvalidImageUrl.push(author);
+ }
+ });
+
+ if (authorsWithInvalidImageUrl.length > 0) {
+ console.log("Found authors with invalid image URLs:");
+ console.log(authorsWithInvalidImageUrl);
+ console.log("Please make sure the image exists in the static folder.");
+ process.exit(1);
+ }
+}
+
+main();
diff --git a/website/src/components/BlogItem/styles.module.css b/website/src/components/BlogItem/styles.module.css
index 8df564bb7..0b7a9d15d 100644
--- a/website/src/components/BlogItem/styles.module.css
+++ b/website/src/components/BlogItem/styles.module.css
@@ -12,6 +12,7 @@
.img {
border-radius: 4px;
+ max-height: 250px;
}
.tags {
diff --git a/website/src/components/ExpandableGrid/index.tsx b/website/src/components/ExpandableGrid/index.tsx
new file mode 100644
index 000000000..da20b4134
--- /dev/null
+++ b/website/src/components/ExpandableGrid/index.tsx
@@ -0,0 +1,34 @@
+import React, { useState } from "react";
+import styles from "./styles.module.css";
+
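+// Shows only the first `defaultVisibleCount` items until the grid is expanded,
+// rendering each entry with the caller-supplied `renderItem` function.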
+const ExpandableGrid = ({ items, defaultVisibleCount, renderItem }) => {
+ const [isExpanded, setIsExpanded] = useState(false);
+
+ const visibleItems = isExpanded ? items : items.slice(0, defaultVisibleCount);
+
+ return (
+ <>
+