From e1919bf12c269106859eabeec25b42952d5f8e61 Mon Sep 17 00:00:00 2001 From: Mahesh Date: Wed, 20 Nov 2024 20:34:53 +0530 Subject: [PATCH 1/9] feat: add mistral as a new guardrail provider --- plugins/index.ts | 21 +++++ plugins/mistral/index.ts | 112 +++++++++++++++++++++++++++ plugins/mistral/manifest.json | 135 +++++++++++++++++++++++++++++++++ plugins/types.ts | 6 +- src/middlewares/hooks/index.ts | 3 +- 5 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 plugins/mistral/index.ts create mode 100644 plugins/mistral/manifest.json diff --git a/plugins/index.ts b/plugins/index.ts index 03e7c6785..3bccb8fbf 100644 --- a/plugins/index.ts +++ b/plugins/index.ts @@ -32,6 +32,20 @@ import { handler as patronusnoRacialBias } from './patronus/noRacialBias'; import { handler as patronusretrievalAnswerRelevance } from './patronus/retrievalAnswerRelevance'; import { handler as patronustoxicity } from './patronus/toxicity'; import { handler as patronuscustom } from './patronus/custom'; +import { mistralGuardrailHandler } from './mistral'; +import { PluginHandler } from './types'; + +const mistralGuardCategories = [ + 'sexual', + 'hate_and_discrimination', + 'violence_and_threats', + 'dangerous_and_criminal_content', + 'selfharm', + 'health', + 'financial', + 'law', + 'pii', +]; export const plugins = { default: { @@ -80,4 +94,11 @@ export const plugins = { toxicity: patronustoxicity, custom: patronuscustom, }, + mistral: mistralGuardCategories.reduce( + (config, category) => { + config[category] = mistralGuardrailHandler; + return config; + }, + {} as Record + ), }; diff --git a/plugins/mistral/index.ts b/plugins/mistral/index.ts new file mode 100644 index 000000000..19bcf336c --- /dev/null +++ b/plugins/mistral/index.ts @@ -0,0 +1,112 @@ +import { + HookEventType, + PluginContext, + PluginHandler, + PluginParameters, +} from '../types'; +import { getText, post } from '../utils'; + +interface MistralResponse { + id: string; + model: string; + results: [ + { 
+ categories: { + sexual: boolean; + hate_and_discrimination: boolean; + violence_and_threats: boolean; + dangerous_and_criminal_content: boolean; + selfharm: boolean; + health: boolean; + financial: boolean; + law: boolean; + pii: boolean; + }; + category_score: { + sexual: number; + hate_and_discrimination: number; + violence_and_threats: number; + dangerous_and_criminal_content: number; + selfharm: number; + health: number; + financial: number; + law: number; + pii: number; + }; + }, + ]; +} + +type GuardrailFunction = keyof MistralResponse['results'][0]['categories']; + +export const mistralGuardrailHandler: PluginHandler = async ( + context: PluginContext, + parameters: PluginParameters, + eventType: HookEventType, + _options, + fn: string +) => { + let error = null; + let verdict = true; + let data = null; + + const creds = parameters.credentials as Record; + if (!creds.apiKey) { + return { + error: 'Mistral API key not provided.', + verdict: false, + data: null, + }; + } + + let model = 'mistral-moderation-latest'; + + if (parameters.model) { + // Model can be passed dynamically + model = parameters.model; + } + + const guardrailFunction = fn as GuardrailFunction; + + const text = getText(context, eventType); + const messages = context.request?.json?.messages ?? []; + + if (!text || !Array.isArray(messages) || !messages.length) { + return { + error: 'Mistral: Invalid Request body', + verdict: false, + data: null, + }; + } + + // Use conversation guardrail if it's a chatcomplete and before hook + const shouldUseConversation = + eventType === 'beforeRequestHook' && context.requestType === 'chatComplete'; + const url = shouldUseConversation + ? 
'https://api.mistral.ai/v1/chat/moderations' + : 'https://api.mistral.ai/v1/moderations'; + + try { + const request = await post( + url, + { + model: model, + ...(!shouldUseConversation && { input: [text] }), + ...(shouldUseConversation && { input: [messages] }), + }, + { + headers: { + Authorization: `Bearer ${creds.apiKey}`, + 'Content-Type': 'application/json', + }, + } + ); + + verdict = request.results?.[0]?.categories[guardrailFunction]; + } catch (error) { + error = error; + verdict = false; + } + + return { error, verdict, data }; +}; diff --git a/plugins/mistral/manifest.json b/plugins/mistral/manifest.json new file mode 100644 index 000000000..4a0fb148c --- /dev/null +++ b/plugins/mistral/manifest.json @@ -0,0 +1,135 @@ +{ + "id": "mistral", + "description": "Mistral Content Moderation classifier leverages the most relevant policy categories for effective guardrails and introduces a pragmatic approach to LLM safety by addressing model-generated harms such as unqualified advice and PII", + "credentials": { + "type": "object", + "properties": { + "apiKey": { + "type": "string", + "label": "API Key", + "description": "Find your API key in the Mistral la-plateforme", + "encrypted": true + } + }, + "required": ["apiKey"] + }, + "functions": [ + { + "name": "Detect PII", + "id": "pii", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that requests, shares, or attempts to elicit personal identifying information such as full names, addresses, phone numbers, social security numbers, or financial account details." + } + ], + "parameters": {} + }, + { + "name": "Detect Sexual Content", + "id": "sexual", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Material that explicitly depicts, describes, or promotes sexual activities, nudity, or sexual services. 
This includes pornographic content, graphic descriptions of sexual acts, and solicitation for sexual purposes. Educational or medical content about sexual health presented in a non-explicit, informational context is generally exempted." + } + ], + "parameters": {} + }, + { + "name": "Detect Hate & Discrimination", + "id": "hate_and_discrimination", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that expresses prejudice, hostility, or advocates discrimination against individuals or groups based on protected characteristics such as race, ethnicity, religion, gender, sexual orientation, or disability. This includes slurs, dehumanizing language, calls for exclusion or harm targeted at specific groups, and persistent harassment or bullying of individuals based on these characteristics." + } + ], + "parameters": {} + }, + { + "name": "Detect Violent & Thereat", + "id": "violence_and_threats", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that describes, glorifies, incites, or threatens physical violence against individuals or groups. This includes graphic depictions of injury or death, explicit threats of harm, and instructions for carrying out violent acts. This category covers both targeted threats and general promotion or glorification of violence." + } + ], + "parameters": {} + }, + { + "name": "Detect Dangerous & Criminal Content", + "id": "dangerous_and_criminal_content", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that promotes or provides instructions for illegal activities or extremely hazardous behaviors that pose a significant risk of physical harm, death, or legal consequences. 
This includes guidance on creating weapons or explosives, encouragement of extreme risk-taking behaviors, and promotion of non-violent crimes such as fraud, theft, or drug trafficking." + } + ], + "parameters": {} + }, + { + "name": "Detect Selfharm", + "id": "selfharm", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that promotes, instructs, plans, or encourages deliberate self-injury, suicide, eating disorders, or other self-destructive behaviors. This includes detailed methods, glorification, statements of intent, dangerous challenges, and related slang terms" + } + ], + "parameters": {} + }, + { + "name": "Detect Health", + "id": "health", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that contains or tries to elicit detailed or tailored medical advice." + } + ], + "parameters": {} + }, + { + "name": "Detect Finance", + "id": "financial", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that contains or tries to elicit detailed or tailored financial advice." + } + ], + "parameters": {} + }, + { + "name": "Detect Law", + "id": "law", + "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "type": "guardrail", + "description": [ + { + "type": "subHeading", + "text": "Content that contains or tries to elicit detailed or tailored legal advice." 
+ } + ], + "parameters": {} + } + ] +} diff --git a/plugins/types.ts b/plugins/types.ts index c4f37a7f5..0c1f02f1b 100644 --- a/plugins/types.ts +++ b/plugins/types.ts @@ -1,5 +1,7 @@ export interface PluginContext { [key: string]: any; + requestType: 'complete' | 'chatComplete'; + provider: string; } export interface PluginParameters { @@ -22,5 +24,7 @@ export type PluginHandler = ( eventType: HookEventType, options: { env: Record; - } + }, + // Handler function, useful in cases for a provider with multiple guardrails ex: mistral + fn: string ) => Promise; diff --git a/src/middlewares/hooks/index.ts b/src/middlewares/hooks/index.ts index bfdca842a..6c6246e97 100644 --- a/src/middlewares/hooks/index.ts +++ b/src/middlewares/hooks/index.ts @@ -261,7 +261,8 @@ export class HooksManager { context, check.parameters, eventType, - options + options, + fn ); return { ...result, From a9f6b112b9ed4c85bcd7851ab60a0b9e789b8cb7 Mon Sep 17 00:00:00 2001 From: Mahesh Date: Thu, 28 Nov 2024 00:35:01 +0530 Subject: [PATCH 2/9] fix: add test cases for mistral --- plugins/mistral/index.ts | 9 ++- plugins/mistral/mistra.test.ts | 109 +++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 plugins/mistral/mistra.test.ts diff --git a/plugins/mistral/index.ts b/plugins/mistral/index.ts index 19bcf336c..1a7643a9b 100644 --- a/plugins/mistral/index.ts +++ b/plugins/mistral/index.ts @@ -69,9 +69,14 @@ export const mistralGuardrailHandler: PluginHandler = async ( const guardrailFunction = fn as GuardrailFunction; const text = getText(context, eventType); - const messages = context.request?.json?.messages ?? 
[]; + const messages = context.request?.json?.messages; - if (!text || !Array.isArray(messages) || !messages.length) { + // should contain text or should contain messages array + if ( + (!text && !Array.isArray(messages)) || + (Array.isArray(messages) && messages.length === 0) + ) { + console.log(!text, messages); return { error: 'Mistral: Invalid Request body', verdict: false, diff --git a/plugins/mistral/mistra.test.ts b/plugins/mistral/mistra.test.ts new file mode 100644 index 000000000..69143449d --- /dev/null +++ b/plugins/mistral/mistra.test.ts @@ -0,0 +1,109 @@ +import { PluginContext } from '../types'; +import testCreds from './.creds.json'; +import { mistralGuardrailHandler } from './index'; + +function getParameters() { + return { + credentials: testCreds, + }; +} + +describe('validateProject handler', () => { + it('should fail if the apiKey is invalid', async () => { + const eventType = 'beforeRequestHook'; + const context = { + request: { text: 'this is a test string for moderations' }, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + parameters.credentials.apiKey = 'invalid-api-key'; + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(false); + expect(result.error).toBeDefined(); + expect(result.data).toBeNull(); + }); + + it('should return pii true for pii function', async () => { + const eventType = 'beforeRequestHook'; + const context = { + request: { + text: 'Say Hi. 
My name is Jhon Doe and my email is user@example.com', + }, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(true); + expect(result.error).toBeNull(); + expect(result.data).toBeNull(); + }); + + it('should be false when pii is not present', async () => { + const eventType = 'beforeRequestHook'; + const context = { + request: { text: 'this text is safe text' }, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(false); + expect(result.error).toBeNull(); + expect(result.data).toBeNull(); + }); + + it('should work pii for chatComplete messages', async () => { + const eventType = 'beforeRequestHook'; + const context = { + requestType: 'chatComplete', + request: { + json: { + messages: [ + { + role: 'user', + content: + 'Say Hi. 
My name is Jhon Doe and my email is user@example.com', + }, + ], + }, + }, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(true); + expect(result.error).toBeNull(); + expect(result.data).toBeNull(); + }); +}); From 3881aaa571ce2ff6d9333fb2e253e9dd29268554 Mon Sep 17 00:00:00 2001 From: Mahesh Date: Thu, 28 Nov 2024 00:51:06 +0530 Subject: [PATCH 3/9] chore: add extra test cases --- plugins/mistral/index.ts | 6 ++- plugins/mistral/mistra.test.ts | 76 +++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/plugins/mistral/index.ts b/plugins/mistral/index.ts index 1a7643a9b..b73d4f130 100644 --- a/plugins/mistral/index.ts +++ b/plugins/mistral/index.ts @@ -69,14 +69,16 @@ export const mistralGuardrailHandler: PluginHandler = async ( const guardrailFunction = fn as GuardrailFunction; const text = getText(context, eventType); - const messages = context.request?.json?.messages; + const messages = + eventType === 'beforeRequestHook' + ? 
context.request?.json?.messages + : context.response?.json?.messages; // should contain text or should contain messages array if ( (!text && !Array.isArray(messages)) || (Array.isArray(messages) && messages.length === 0) ) { - console.log(!text, messages); return { error: 'Mistral: Invalid Request body', verdict: false, diff --git a/plugins/mistral/mistra.test.ts b/plugins/mistral/mistra.test.ts index 69143449d..2cd5509cb 100644 --- a/plugins/mistral/mistra.test.ts +++ b/plugins/mistral/mistra.test.ts @@ -8,7 +8,7 @@ function getParameters() { }; } -describe('validateProject handler', () => { +describe('mistral guardrail handler', () => { it('should fail if the apiKey is invalid', async () => { const eventType = 'beforeRequestHook'; const context = { @@ -106,4 +106,78 @@ describe('validateProject handler', () => { expect(result.error).toBeNull(); expect(result.data).toBeNull(); }); + + it('should give error on invalid request body', async () => { + const eventType = 'beforeRequestHook'; + const context = { + request: {}, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(false); + expect(result.error).toBe('Mistral: Invalid Request body'); + expect(result.data).toBeNull(); + }); + + it('should work for afterRequestHook', async () => { + const eventType = 'afterRequestHook'; + const context = { + response: { text: 'this text is safe text' }, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(false); + expect(result.error).toBeNull(); + expect(result.data).toBeNull(); + }); + + it('should work for afterRequestHook with 
chatCompletion messages', async () => { + const eventType = 'afterRequestHook'; + const context = { + requestType: 'chatComplete', + response: { + json: { + messages: [ + { + role: 'user', + content: + 'Say Hi. My name is Jhon Doe and my email is user@example.com', + }, + ], + }, + }, + }; + const parameters = JSON.parse(JSON.stringify(getParameters())); + + const result = await mistralGuardrailHandler( + context as unknown as PluginContext, + parameters, + eventType, + { env: {} }, + 'pii' + ); + + expect(result).toBeDefined(); + expect(result.verdict).toBe(false); + expect(result.error).toBeNull(); + expect(result.data).toBeNull(); + }); }); From 8a1e01dd03e6972f71009333fb259698207bf261 Mon Sep 17 00:00:00 2001 From: Mahesh Date: Tue, 17 Dec 2024 19:08:09 +0530 Subject: [PATCH 4/9] fix: review commits --- plugins/mistral/index.ts | 2 +- plugins/mistral/{mistra.test.ts => mistral.test.ts} | 0 plugins/types.ts | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename plugins/mistral/{mistra.test.ts => mistral.test.ts} (100%) diff --git a/plugins/mistral/index.ts b/plugins/mistral/index.ts index b73d4f130..aca5c979f 100644 --- a/plugins/mistral/index.ts +++ b/plugins/mistral/index.ts @@ -44,7 +44,7 @@ export const mistralGuardrailHandler: PluginHandler = async ( parameters: PluginParameters, eventType: HookEventType, _options, - fn: string + fn?: string ) => { let error = null; let verdict = true; diff --git a/plugins/mistral/mistra.test.ts b/plugins/mistral/mistral.test.ts similarity index 100% rename from plugins/mistral/mistra.test.ts rename to plugins/mistral/mistral.test.ts diff --git a/plugins/types.ts b/plugins/types.ts index 0c1f02f1b..75e1a1020 100644 --- a/plugins/types.ts +++ b/plugins/types.ts @@ -26,5 +26,5 @@ export type PluginHandler = ( env: Record; }, // Handler function, useful in cases for a provider with multiple guardrails ex: mistral - fn: string + fn?: string ) => Promise; From 08cd951aecf43a583c5218f34d00df79b30a4d7e Mon Sep 17 00:00:00 
2001 From: Mahesh Date: Fri, 20 Dec 2024 16:19:33 +0530 Subject: [PATCH 5/9] fix: mistral guardrails to moderations function --- plugins/mistral/index.ts | 30 +++++-- plugins/mistral/manifest.json | 151 ++++++++++------------------------ 2 files changed, 65 insertions(+), 116 deletions(-) diff --git a/plugins/mistral/index.ts b/plugins/mistral/index.ts index aca5c979f..51088e808 100644 --- a/plugins/mistral/index.ts +++ b/plugins/mistral/index.ts @@ -43,8 +43,7 @@ export const mistralGuardrailHandler: PluginHandler = async ( context: PluginContext, parameters: PluginParameters, eventType: HookEventType, - _options, - fn?: string + _options ) => { let error = null; let verdict = true; @@ -66,7 +65,7 @@ export const mistralGuardrailHandler: PluginHandler = async ( model = parameters.model; } - const guardrailFunction = fn as GuardrailFunction; + const checks = parameters.categories as GuardrailFunction[]; const text = getText(context, eventType); const messages = @@ -109,10 +108,27 @@ export const mistralGuardrailHandler: PluginHandler = async ( } ); - verdict = request.results?.[0]?.categories[guardrailFunction]; - } catch (error) { - error = error; - verdict = false; + const categories: Record = + request.results[0]?.categories ?? 
{}; + const categoriesFlagged = Object.keys(categories).filter((category) => { + if ( + checks.includes(category as GuardrailFunction) && + !!categories[category as GuardrailFunction] + ) { + return true; + } + return false; + }); + + if (categoriesFlagged.length > 0) { + verdict = false; + data = { flagged_categories: categoriesFlagged }; + } + // Success + verdict = true; + } catch (err) { + error = err; + verdict = true; } return { error, verdict, data }; diff --git a/plugins/mistral/manifest.json b/plugins/mistral/manifest.json index 4a0fb148c..2f8ae0624 100644 --- a/plugins/mistral/manifest.json +++ b/plugins/mistral/manifest.json @@ -15,121 +15,54 @@ }, "functions": [ { - "name": "Detect PII", - "id": "pii", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], + "name": "Moderate Content", + "id": "moderateContent", "type": "guardrail", + "supportedHooks": ["beforeRequestHook"], "description": [ { "type": "subHeading", - "text": "Content that requests, shares, or attempts to elicit personal identifying information such as full names, addresses, phone numbers, social security numbers, or financial account details." + "text": "Checks if the content passes the mentioned content moderation checks." } ], - "parameters": {} - }, - { - "name": "Detect Sexual Content", - "id": "sexual", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Material that explicitly depicts, describes, or promotes sexual activities, nudity, or sexual services. This includes pornographic content, graphic descriptions of sexual acts, and solicitation for sexual purposes. Educational or medical content about sexual health presented in a non-explicit, informational context is generally exempted." 
- } - ], - "parameters": {} - }, - { - "name": "Detect Hate & Discrimination", - "id": "hate_and_discrimination", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that expresses prejudice, hostility, or advocates discrimination against individuals or groups based on protected characteristics such as race, ethnicity, religion, gender, sexual orientation, or disability. This includes slurs, dehumanizing language, calls for exclusion or harm targeted at specific groups, and persistent harassment or bullying of individuals based on these characteristics." - } - ], - "parameters": {} - }, - { - "name": "Detect Violent & Thereat", - "id": "violence_and_threats", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that describes, glorifies, incites, or threatens physical violence against individuals or groups. This includes graphic depictions of injury or death, explicit threats of harm, and instructions for carrying out violent acts. This category covers both targeted threats and general promotion or glorification of violence." - } - ], - "parameters": {} - }, - { - "name": "Detect Dangerous & Criminal Content", - "id": "dangerous_and_criminal_content", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that promotes or provides instructions for illegal activities or extremely hazardous behaviors that pose a significant risk of physical harm, death, or legal consequences. This includes guidance on creating weapons or explosives, encouragement of extreme risk-taking behaviors, and promotion of non-violent crimes such as fraud, theft, or drug trafficking." 
- } - ], - "parameters": {} - }, - { - "name": "Detect Selfharm", - "id": "selfharm", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that promotes, instructs, plans, or encourages deliberate self-injury, suicide, eating disorders, or other self-destructive behaviors. This includes detailed methods, glorification, statements of intent, dangerous challenges, and related slang terms" - } - ], - "parameters": {} - }, - { - "name": "Detect Health", - "id": "health", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that contains or tries to elicit detailed or tailored medical advice." - } - ], - "parameters": {} - }, - { - "name": "Detect Finance", - "id": "financial", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that contains or tries to elicit detailed or tailored financial advice." - } - ], - "parameters": {} - }, - { - "name": "Detect Law", - "id": "law", - "supportedHooks": ["beforeRequestHook", "afterRequestHook"], - "type": "guardrail", - "description": [ - { - "type": "subHeading", - "text": "Content that contains or tries to elicit detailed or tailored legal advice." - } - ], - "parameters": {} + "parameters": { + "type": "object", + "properties": { + "categories": { + "type": "array", + "label": "Moderation Checks", + "description": [ + { + "type": "subHeading", + "text": "Select the categories that should NOT be allowed in the content. 
(Checked via Mistral moderation API)" + } + ], + "items": { + "type": "string", + "enum": [ + "sexual", + "hate_and_discrimination", + "violence_and_threats", + "dangerous_and_criminal_content", + "selfharm", + "health", + "financial", + "law", + "pii" + ], + "default": [ + "hate/threatening", + "harassment/threatening", + "self-harm/intent", + "self-harm/instructions", + "sexual/minors", + "violence/graphic" + ] + } + } + }, + "required": ["categories"] + } } ] } From 578fce748bf7d681d67cadb6071c4c24276786f1 Mon Sep 17 00:00:00 2001 From: Mahesh Vagicherla Date: Thu, 26 Dec 2024 12:24:38 +0530 Subject: [PATCH 6/9] fix: update types, guardrail functions --- plugins/index.ts | 23 +++-------------------- plugins/types.ts | 4 +--- src/middlewares/hooks/index.ts | 3 +-- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/plugins/index.ts b/plugins/index.ts index 9804e5080..5b59d8ac4 100644 --- a/plugins/index.ts +++ b/plugins/index.ts @@ -33,19 +33,6 @@ import { handler as patronusretrievalAnswerRelevance } from './patronus/retrieva import { handler as patronustoxicity } from './patronus/toxicity'; import { handler as patronuscustom } from './patronus/custom'; import { mistralGuardrailHandler } from './mistral'; -import { PluginHandler } from './types'; - -const mistralGuardCategories = [ - 'sexual', - 'hate_and_discrimination', - 'violence_and_threats', - 'dangerous_and_criminal_content', - 'selfharm', - 'health', - 'financial', - 'law', - 'pii', -]; import { handler as pangeatextGuard } from './pangea/textGuard'; export const plugins = { @@ -95,13 +82,9 @@ export const plugins = { toxicity: patronustoxicity, custom: patronuscustom, }, - mistral: mistralGuardCategories.reduce( - (config, category) => { - config[category] = mistralGuardrailHandler; - return config; - }, - {} as Record - ), + mistral: { + moderateContent: mistralGuardrailHandler, + }, pangea: { textGuard: pangeatextGuard, }, diff --git a/plugins/types.ts b/plugins/types.ts index 
b0e1cf27a..252b0a0e6 100644 --- a/plugins/types.ts +++ b/plugins/types.ts @@ -24,7 +24,5 @@ export type PluginHandler = ( eventType: HookEventType, options?: { env: Record; - }, - // Handler function, useful in cases for a provider with multiple guardrails ex: mistral - fn?: string + } ) => Promise; diff --git a/src/middlewares/hooks/index.ts b/src/middlewares/hooks/index.ts index 6c6246e97..bfdca842a 100644 --- a/src/middlewares/hooks/index.ts +++ b/src/middlewares/hooks/index.ts @@ -261,8 +261,7 @@ export class HooksManager { context, check.parameters, eventType, - options, - fn + options ); return { ...result, From 3b39fd448cd1497774691cd4199b9f89f1016525 Mon Sep 17 00:00:00 2001 From: Mahesh Vagicherla Date: Thu, 26 Dec 2024 13:10:05 +0530 Subject: [PATCH 7/9] chore: update test according to new functions --- plugins/mistral/index.ts | 2 - plugins/mistral/mistral.test.ts | 127 ++++++-------------------------- 2 files changed, 23 insertions(+), 106 deletions(-) diff --git a/plugins/mistral/index.ts b/plugins/mistral/index.ts index 51088e808..2c93f6992 100644 --- a/plugins/mistral/index.ts +++ b/plugins/mistral/index.ts @@ -124,8 +124,6 @@ export const mistralGuardrailHandler: PluginHandler = async ( verdict = false; data = { flagged_categories: categoriesFlagged }; } - // Success - verdict = true; } catch (err) { error = err; verdict = true; diff --git a/plugins/mistral/mistral.test.ts b/plugins/mistral/mistral.test.ts index 2cd5509cb..87ad7e417 100644 --- a/plugins/mistral/mistral.test.ts +++ b/plugins/mistral/mistral.test.ts @@ -21,163 +21,82 @@ describe('mistral guardrail handler', () => { context as unknown as PluginContext, parameters, eventType, - { env: {} }, - 'pii' + { env: {} } ); expect(result).toBeDefined(); - expect(result.verdict).toBe(false); + expect(result.verdict).toBe(true); expect(result.error).toBeDefined(); expect(result.data).toBeNull(); }); - it('should return pii true for pii function', async () => { + it('should success and return the 
flagged categories', async () => { const eventType = 'beforeRequestHook'; const context = { request: { - text: 'Say Hi. My name is Jhon Doe and my email is user@example.com', + text: 'my name is John Doe and my email is john.doe@example.com', }, }; const parameters = JSON.parse(JSON.stringify(getParameters())); + parameters.categories = ['pii']; const result = await mistralGuardrailHandler( context as unknown as PluginContext, parameters, eventType, - { env: {} }, - 'pii' - ); - - expect(result).toBeDefined(); - expect(result.verdict).toBe(true); - expect(result.error).toBeNull(); - expect(result.data).toBeNull(); - }); - - it('should be false when pii is not present', async () => { - const eventType = 'beforeRequestHook'; - const context = { - request: { text: 'this text is safe text' }, - }; - const parameters = JSON.parse(JSON.stringify(getParameters())); - - const result = await mistralGuardrailHandler( - context as unknown as PluginContext, - parameters, - eventType, - { env: {} }, - 'pii' + { env: {} } ); expect(result).toBeDefined(); expect(result.verdict).toBe(false); - expect(result.error).toBeNull(); - expect(result.data).toBeNull(); + expect(result.error).toBeDefined(); + expect(result.data).toMatchObject({ flagged_categories: ['pii'] }); }); - it('should work pii for chatComplete messages', async () => { + it('should include the multiple flagged categories in the response', async () => { const eventType = 'beforeRequestHook'; const context = { - requestType: 'chatComplete', request: { - json: { - messages: [ - { - role: 'user', - content: - 'Say Hi. My name is Jhon Doe and my email is user@example.com', - }, - ], - }, + text: 'my name is John Doe and my email is john.doe@example.com. 
I am a financial advisor and I suggest you to invest in the stock market in company A.', }, }; const parameters = JSON.parse(JSON.stringify(getParameters())); + parameters.categories = ['pii', 'financial']; const result = await mistralGuardrailHandler( context as unknown as PluginContext, parameters, eventType, - { env: {} }, - 'pii' - ); - - expect(result).toBeDefined(); - expect(result.verdict).toBe(true); - expect(result.error).toBeNull(); - expect(result.data).toBeNull(); - }); - - it('should give error on invalid request body', async () => { - const eventType = 'beforeRequestHook'; - const context = { - request: {}, - }; - const parameters = JSON.parse(JSON.stringify(getParameters())); - - const result = await mistralGuardrailHandler( - context as unknown as PluginContext, - parameters, - eventType, - { env: {} }, - 'pii' + { env: {} } ); expect(result).toBeDefined(); expect(result.verdict).toBe(false); - expect(result.error).toBe('Mistral: Invalid Request body'); - expect(result.data).toBeNull(); + expect(result.error).toBeDefined(); + expect(result.data).toMatchObject({ + flagged_categories: ['financial', 'pii'], + }); }); - it('should work for afterRequestHook', async () => { - const eventType = 'afterRequestHook'; + it('should fail if the request body is invalid', async () => { + const eventType = 'beforeRequestHook'; const context = { - response: { text: 'this text is safe text' }, + request: { text: 'this is safe string without any flagged categories' }, }; - const parameters = JSON.parse(JSON.stringify(getParameters())); - - const result = await mistralGuardrailHandler( - context as unknown as PluginContext, - parameters, - eventType, - { env: {} }, - 'pii' - ); - - expect(result).toBeDefined(); - expect(result.verdict).toBe(false); - expect(result.error).toBeNull(); - expect(result.data).toBeNull(); - }); - it('should work for afterRequestHook with chatCompletion messages', async () => { - const eventType = 'afterRequestHook'; - const context = { - 
requestType: 'chatComplete', - response: { - json: { - messages: [ - { - role: 'user', - content: - 'Say Hi. My name is Jhon Doe and my email is user@example.com', - }, - ], - }, - }, - }; const parameters = JSON.parse(JSON.stringify(getParameters())); + parameters.categories = ['pii', 'financial']; const result = await mistralGuardrailHandler( context as unknown as PluginContext, parameters, eventType, - { env: {} }, - 'pii' + { env: {} } ); expect(result).toBeDefined(); - expect(result.verdict).toBe(false); - expect(result.error).toBeNull(); + expect(result.verdict).toBe(true); + expect(result.error).toBeDefined(); expect(result.data).toBeNull(); }); }); From e5074491e27a5d77dfea06b3c5cf5e408a55435a Mon Sep 17 00:00:00 2001 From: Mahesh Vagicherla Date: Thu, 26 Dec 2024 13:26:05 +0530 Subject: [PATCH 8/9] chore: update manifest --- plugins/mistral/manifest.json | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/plugins/mistral/manifest.json b/plugins/mistral/manifest.json index 2f8ae0624..d5059d280 100644 --- a/plugins/mistral/manifest.json +++ b/plugins/mistral/manifest.json @@ -51,12 +51,10 @@ "pii" ], "default": [ - "hate/threatening", - "harassment/threatening", - "self-harm/intent", - "self-harm/instructions", - "sexual/minors", - "violence/graphic" + "selfharm", + "pii", + "sexual", + "hate_and_discrimination" ] } } From 08b8aa4ebaead07caa196a2525adf576258ca1c4 Mon Sep 17 00:00:00 2001 From: Mahesh Vagicherla Date: Thu, 26 Dec 2024 15:53:14 +0530 Subject: [PATCH 9/9] chore: update supported hooks --- plugins/mistral/manifest.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/mistral/manifest.json b/plugins/mistral/manifest.json index d5059d280..965e6e958 100644 --- a/plugins/mistral/manifest.json +++ b/plugins/mistral/manifest.json @@ -18,7 +18,7 @@ "name": "Moderate Content", "id": "moderateContent", "type": "guardrail", - "supportedHooks": ["beforeRequestHook"], + "supportedHooks": 
["beforeRequestHook", "afterRequestHook"], "description": [ { "type": "subHeading",