diff --git a/apps/hash-ai-worker-ts/README.md b/apps/hash-ai-worker-ts/README.md
index 80e18c2df77..c7ca127c15b 100644
--- a/apps/hash-ai-worker-ts/README.md
+++ b/apps/hash-ai-worker-ts/README.md
@@ -17,6 +17,9 @@ The service uses the following environment variables:
- `HASH_VAULT_HOST`: The host address (including protocol) that the Vault server is running on, e.g. `http://127.0.0.1`
- `HASH_VAULT_PORT`: The port that the Vault server is running on, e.g. `8200`
- `HASH_VAULT_ROOT_TOKEN`: The token to authenticate with the Vault server.
+- `GOOGLE_CLOUD_HASH_PROJECT_ID`: The Project ID of the Google Cloud Platform project used for document analysis (Vertex AI and Cloud Storage). Note that this is the Project ID, _not_ the Project Number.
+- `GOOGLE_CLOUD_STORAGE_BUCKET`: The name of the Google Cloud Storage bucket to use for document analysis.
+- `GOOGLE_APPLICATION_CREDENTIALS`: The path to a configuration file for GCP authentication. Automatically set locally by the `gcloud` CLI, and set manually during the build process.
### Run the worker
diff --git a/apps/hash-ai-worker-ts/docker/Dockerfile b/apps/hash-ai-worker-ts/docker/Dockerfile
index c4f3a89ff29..62c12f35592 100644
--- a/apps/hash-ai-worker-ts/docker/Dockerfile
+++ b/apps/hash-ai-worker-ts/docker/Dockerfile
@@ -63,6 +63,17 @@ WORKDIR /usr/local/src/apps/hash-ai-worker-ts
ENTRYPOINT [ "yarn", "--cache-folder", "/tmp/yarn-cache", "--global-folder", "/tmp/yarn-global" ]
CMD ["start"]
+ARG GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON
+ENV GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON=${GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON}
+# A shell `export` inside RUN only lasts for that build step, so the credentials path is set via ENV; it resolves to an empty value when no config JSON is supplied.
+ENV GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON:+/tmp/google_workload_identity_federation_config.json}
+RUN if [ -n "$GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON" ]; then \
+    printf '%s\n' "$GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON" > /tmp/google_workload_identity_federation_config.json && \
+    echo "GOOGLE_APPLICATION_CREDENTIALS set from JSON"; \
+    else \
+    echo "GOOGLE_APPLICATION_CREDENTIALS not set, no GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON in environment"; \
+    fi
+
RUN apt-get update && \
apt-get install -y --no-install-recommends curl && \
rm -rf /var/lib/apt/lists/* && \
diff --git a/apps/hash-ai-worker-ts/package.json b/apps/hash-ai-worker-ts/package.json
index af0f609e18d..5424c4841b5 100644
--- a/apps/hash-ai-worker-ts/package.json
+++ b/apps/hash-ai-worker-ts/package.json
@@ -46,6 +46,8 @@
"@apps/hash-graph": "0.0.0-private",
"@blockprotocol/graph": "0.4.0-canary.0",
"@blockprotocol/type-system": "0.1.2-canary.0",
+ "@google-cloud/storage": "7.14.0",
+ "@google-cloud/vertexai": "1.9.0",
"@local/advanced-types": "0.0.0-private",
"@local/hash-backend-utils": "0.0.0-private",
"@local/hash-graph-client": "0.0.0-private",
@@ -82,6 +84,7 @@
"openai": "4.68.4",
"openai-chat-tokens": "0.2.8",
"papaparse": "5.4.1",
+ "pdf2json": "3.1.4",
"puppeteer": "22.15.0",
"puppeteer-extra": "3.3.6",
"puppeteer-extra-plugin-stealth": "2.11.2",
diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities.ts
index 9b0e6533bac..398b9d7f7c5 100644
--- a/apps/hash-ai-worker-ts/src/activities/flow-activities.ts
+++ b/apps/hash-ai-worker-ts/src/activities/flow-activities.ts
@@ -8,6 +8,7 @@ import { getFileFromUrlAction } from "./flow-activities/get-file-from-url-action
import { getWebPageByUrlAction } from "./flow-activities/get-web-page-by-url-action.js";
import { getWebPageSummaryAction } from "./flow-activities/get-web-page-summary-action.js";
import { inferEntitiesFromContentAction } from "./flow-activities/infer-entities-from-content-action.js";
+import { inferMetadataFromDocumentAction } from "./flow-activities/infer-metadata-from-document-action.js";
import { persistEntitiesAction } from "./flow-activities/persist-entities-action.js";
import { persistEntityAction } from "./flow-activities/persist-entity-action.js";
import { persistFlowActivity } from "./flow-activities/persist-flow-activity.js";
@@ -28,6 +29,7 @@ export const createFlowActionActivities = ({
getWebPageByUrlAction,
processAutomaticBrowsingSettingsAction,
inferEntitiesFromContentAction,
+ inferMetadataFromDocumentAction,
persistEntityAction,
persistEntitiesAction,
getFileFromUrlAction,
diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/generate-flow-run-name-activity.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/generate-flow-run-name-activity.ts
index 635e803f7bd..243376e6c3b 100644
--- a/apps/hash-ai-worker-ts/src/activities/flow-activities/generate-flow-run-name-activity.ts
+++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/generate-flow-run-name-activity.ts
@@ -30,11 +30,15 @@ const systemPrompt = `
You are a workflow naming agent. A workflow is an automated process that produces a result of interest.
Multiple workflows of the same kind are run with different inputs, and the user requires a unique name for each run, to distinguish it from other runs of the same kind.
-The user provides you with a description of the goal of the workflow, or a description of the template and a list of its inputs, and you generate a short name for the run. Provide only the name – don't include any other text.
+The user provides you with a description of the goal of the workflow, or a description of the template and a list of its inputs, and you generate a short name for the run. Provide only the name – don't include any other text. If there are no inputs provided, you can generate a name based on the template description alone.
The name should be descriptive enough to distinguish it from other runs from the same template, and must always be a single human-readable sentence, with proper grammar and spacing between words.
+
+
Don't include any quotation marks or special characters around the name.
Don't include the word 'workflow' in the name – the user already knows it's a workflow.
+Don't include UUIDs or other identifiers that aren't natural language words. Omit them, or use a generic human-readable replacement (e.g. 'entity').
+
`;
const getModelSuggestedFlowRunName = async (
@@ -79,7 +83,11 @@ const getModelSuggestedFlowRunName = async (
return text;
};
-const outputKindsToIgnore: PayloadKind[] = ["GoogleSheet", "GoogleAccountId"];
+// Payload kinds excluded (via the `.includes()` filter in generateFlowRunName) from the values described to the naming model.
+// NOTE(review): presumably because these are opaque identifiers rather than descriptive text — confirm.
+const outputKindsToIgnore: PayloadKind[] = [
+ "GoogleSheet",
+ "GoogleAccountId",
+ "EntityId",
+];
export const generateFlowRunName = async (
params: PersistFlowActivityParams,
@@ -144,13 +152,20 @@ export const generateFlowRunName = async (
!outputKindsToIgnore.includes(output.payload.kind),
);
+ let workflowDescriptionString = `The workflow template is named ${
+ flowDefinition.name
+ } with a description of ${flowDefinition.description}.`;
+
+ if (inputsOfInterest?.length) {
+ workflowDescriptionString += ` The inputs to the workflow run to be named: ${inputsOfInterest
+ .map((input) => JSON.stringify(input))
+ .join("\n")}.`;
+ } else {
+ workflowDescriptionString += ` The workflow run to be named has no inputs.`;
+ }
+
return getModelSuggestedFlowRunName(
- `The workflow template is named ${
- flowDefinition.name
- } with a description of ${flowDefinition.description}.
- The inputs to the workflow run to be named: ${inputsOfInterest
- ?.map((input) => JSON.stringify(input))
- .join("\n")}`,
+ workflowDescriptionString,
usageTrackingParams,
);
};
diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/get-file-from-url-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/get-file-from-url-action.ts
index 36f5838c6fe..04719a25f30 100644
--- a/apps/hash-ai-worker-ts/src/activities/flow-activities/get-file-from-url-action.ts
+++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/get-file-from-url-action.ts
@@ -6,7 +6,7 @@ import { StatusCode } from "@local/status";
import { Context } from "@temporalio/activity";
import { logProgress } from "../shared/log-progress.js";
-import { getFileEntityFromUrl } from "./shared/get-file-entity-from-url.js";
+import { createFileEntityFromUrl } from "./shared/create-file-entity-from-url.js";
import type { FlowActionActivity } from "./types.js";
export const getFileFromUrlAction: FlowActionActivity = async ({ inputs }) => {
@@ -19,17 +19,17 @@ export const getFileFromUrlAction: FlowActionActivity = async ({ inputs }) => {
actionType: "getFileFromUrl",
});
- const getFileEntityFromUrlStatus = await getFileEntityFromUrl({
+ const createFileEntityFromUrlStatus = await createFileEntityFromUrl({
entityUuid: null,
url: originalUrl,
description,
displayName,
});
- if (getFileEntityFromUrlStatus.status !== "ok") {
+ if (createFileEntityFromUrlStatus.status !== "ok") {
return {
code: StatusCode.Internal,
- message: getFileEntityFromUrlStatus.message,
+ message: createFileEntityFromUrlStatus.message,
contents: [],
};
}
@@ -37,7 +37,7 @@ export const getFileFromUrlAction: FlowActionActivity = async ({ inputs }) => {
// @todo look for an existing file with the same originalUrl in the graph, and update it if found?
const operation = "create" as const;
- const fileEntity = getFileEntityFromUrlStatus.entity.toJSON();
+ const fileEntity = createFileEntityFromUrlStatus.entity.toJSON();
logProgress([
{
diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts
new file mode 100644
index 00000000000..28b445bf670
--- /dev/null
+++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts
@@ -0,0 +1,346 @@
+import { createWriteStream } from "node:fs";
+import { mkdir, unlink } from "node:fs/promises";
+import { dirname, join } from "node:path";
+import { Readable } from "node:stream";
+import { finished } from "node:stream/promises";
+import type { ReadableStream } from "node:stream/web";
+import { fileURLToPath } from "node:url";
+
+import { getAwsS3Config } from "@local/hash-backend-utils/aws-config";
+import { AwsS3StorageProvider } from "@local/hash-backend-utils/file-storage/aws-s3-storage-provider";
+import type {
+ OriginProvenance,
+ PropertyProvenance,
+ SourceProvenance,
+} from "@local/hash-graph-client";
+import type {
+ EnforcedEntityEditionProvenance,
+ Entity,
+} from "@local/hash-graph-sdk/entity";
+import {
+ getSimplifiedActionInputs,
+ type OutputNameForAction,
+} from "@local/hash-isomorphic-utils/flows/action-definitions";
+import type { PersistedEntity } from "@local/hash-isomorphic-utils/flows/types";
+import { generateUuid } from "@local/hash-isomorphic-utils/generate-uuid";
+import {
+ blockProtocolPropertyTypes,
+ systemEntityTypes,
+ systemPropertyTypes,
+} from "@local/hash-isomorphic-utils/ontology-type-ids";
+import type { File } from "@local/hash-isomorphic-utils/system-types/shared";
+import { extractEntityUuidFromEntityId } from "@local/hash-subgraph";
+import { StatusCode } from "@local/status";
+import { Context } from "@temporalio/activity";
+import type { Output } from "pdf2json";
+import PDFParser from "pdf2json";
+
+import { getAiAssistantAccountIdActivity } from "../get-ai-assistant-account-id-activity.js";
+import { createInferredEntityNotification } from "../shared/create-inferred-entity-notification.js";
+import { getEntityByFilter } from "../shared/get-entity-by-filter.js";
+import { getFlowContext } from "../shared/get-flow-context.js";
+import { graphApiClient } from "../shared/graph-api-client.js";
+import { logProgress } from "../shared/log-progress.js";
+import { generateDocumentPropertyPatches } from "./infer-metadata-from-document-action/generate-property-patches.js";
+import { generateDocumentProposedEntitiesAndCreateClaims } from "./infer-metadata-from-document-action/generate-proposed-entities-and-claims.js";
+import { getLlmAnalysisOfDoc } from "./infer-metadata-from-document-action/get-llm-analysis-of-doc.js";
+import type { FlowActionActivity } from "./types.js";
+
+// This is an ES module (it uses import.meta.url), so the current file and directory paths are derived from the module URL.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+// Directory used for temporary document downloads. Despite the leading slash, `join` resolves this
+// beneath the module's directory (<module dir>/var/tmp_files), not at the filesystem root.
+const baseFilePath = join(__dirname, "/var/tmp_files");
+
+export const inferMetadataFromDocumentAction: FlowActionActivity = async ({
+ inputs,
+}) => {
+ const {
+ flowEntityId,
+ stepId,
+ userAuthentication: { actorId: userActorId },
+ webId,
+ } = await getFlowContext();
+
+ const { documentEntityId } = getSimplifiedActionInputs({
+ inputs,
+ actionType: "inferMetadataFromDocument",
+ });
+
+ const aiAssistantAccountId = await getAiAssistantAccountIdActivity({
+ authentication: { actorId: userActorId },
+ graphApiClient,
+ grantCreatePermissionForWeb: webId,
+ });
+
+ if (!aiAssistantAccountId) {
+ return {
+ code: StatusCode.FailedPrecondition,
+ contents: [],
+ message: `Could not get AI assistant account for web ${webId}`,
+ };
+ }
+
+ const documentEntity = await getEntityByFilter({
+ actorId: aiAssistantAccountId,
+ includeDrafts: false,
+ filter: {
+ all: [
+ {
+ equal: [{ path: ["ownedById"] }, { parameter: webId }],
+ },
+ {
+ equal: [
+ { path: ["uuid"] },
+ { parameter: extractEntityUuidFromEntityId(documentEntityId) },
+ ],
+ },
+ ],
+ },
+ graphApiClient,
+ });
+
+ if (!documentEntity) {
+ return {
+ code: StatusCode.NotFound,
+ contents: [],
+ message: `Could not find or access document entity with entityId ${documentEntityId}`,
+ };
+ }
+
+ const fileUrl =
+ documentEntity.properties[
+ blockProtocolPropertyTypes.fileUrl.propertyTypeBaseUrl
+ ];
+
+ if (!fileUrl) {
+ return {
+ code: StatusCode.InvalidArgument,
+ contents: [],
+ message: `Document entity with entityId ${documentEntityId} does not have a fileUrl property`,
+ };
+ }
+
+ if (typeof fileUrl !== "string") {
+ return {
+ code: StatusCode.InvalidArgument,
+ contents: [],
+ message: `Document entity with entityId ${documentEntityId} has a fileUrl property of type '${typeof fileUrl}', expected 'string'`,
+ };
+ }
+
+ const storageKey =
+ documentEntity.properties[
+ systemPropertyTypes.fileStorageKey.propertyTypeBaseUrl
+ ];
+
+ if (!storageKey) {
+ return {
+ code: StatusCode.InvalidArgument,
+ contents: [],
+ message: `Document entity with entityId ${documentEntityId} does not have a fileStorageKey property`,
+ };
+ }
+
+ if (typeof storageKey !== "string") {
+ return {
+ code: StatusCode.InvalidArgument,
+ contents: [],
+ message: `Document entity with entityId ${documentEntityId} has a fileStorageKey property of type '${typeof storageKey}', expected 'string'`,
+ };
+ }
+
+ await mkdir(baseFilePath, { recursive: true });
+
+ const filePath = `${baseFilePath}/${generateUuid()}.pdf`;
+
+ const s3Config = getAwsS3Config();
+
+ const downloadProvider = new AwsS3StorageProvider(s3Config);
+
+ const urlForDownload = await downloadProvider.presignDownload({
+ entity: documentEntity as Entity,
+ expiresInSeconds: 60 * 60,
+ key: storageKey,
+ });
+
+ const fetchFileResponse = await fetch(urlForDownload);
+
+ if (!fetchFileResponse.ok || !fetchFileResponse.body) {
+ return {
+ code: StatusCode.NotFound,
+ contents: [],
+ message: `Document entity with entityId ${documentEntityId} has a fileUrl ${fileUrl} that could not be fetched: ${fetchFileResponse.statusText}`,
+ };
+ }
+
+ try {
+ const fileStream = createWriteStream(filePath);
+ await finished(
+ Readable.fromWeb(
+ fetchFileResponse.body as ReadableStream,
+ ).pipe(fileStream),
+ );
+ } catch (error) {
+ await unlink(filePath);
+ return {
+ code: StatusCode.Internal,
+ contents: [],
+ message: `Failed to write file to file system: ${(error as Error).message}`,
+ };
+ }
+
+ const pdfParser = new PDFParser();
+
+ const documentJson = await new Promise