Skip to content

Commit

Permalink
H-3578: Infer metadata from uploaded PDF document (#5691)
Browse files Browse the repository at this point in the history
  • Loading branch information
CiaranMn authored Nov 22, 2024
1 parent ead2c1e commit 47c108f
Show file tree
Hide file tree
Showing 55 changed files with 3,548 additions and 326 deletions.
3 changes: 3 additions & 0 deletions apps/hash-ai-worker-ts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ The service uses the following environment variables:
- `HASH_VAULT_HOST`: The host address (including protocol) that the Vault server is running on, e.g. `http://127.0.0.1`
- `HASH_VAULT_PORT`: The port that the Vault server is running on, e.g. `8200`
- `HASH_VAULT_ROOT_TOKEN`: The token to authenticate with the Vault server.
- `GOOGLE_CLOUD_HASH_PROJECT_ID`: The projectId for a Google Cloud Platform project, used in document analysis (Vertex AI and Cloud Storage). Note that this is the Project ID, _not_ the Project Number.
- `GOOGLE_CLOUD_STORAGE_BUCKET`: The name of the Google Cloud Storage bucket to use for document analysis.
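- `GOOGLE_APPLICATION_CREDENTIALS`: The path to a configuration file for GCP authentication. For local development this usually does not need to be set: running `gcloud auth application-default login` stores Application Default Credentials in a well-known location that the Google client libraries discover automatically. Otherwise, it is set manually during the build process.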

### Run the worker

Expand Down
11 changes: 11 additions & 0 deletions apps/hash-ai-worker-ts/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,17 @@ WORKDIR /usr/local/src/apps/hash-ai-worker-ts
ENTRYPOINT [ "yarn", "--cache-folder", "/tmp/yarn-cache", "--global-folder", "/tmp/yarn-global" ]
CMD ["start"]

# Optional build argument carrying the JSON contents of a Google Cloud
# workload identity federation config. It is mirrored into an ENV so that the
# RUN step below can read it from the shell environment.
ARG GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON
ENV GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON=${GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON}

# When the config JSON was provided, write it to a file that can be referenced
# via GOOGLE_APPLICATION_CREDENTIALS.
#
# - The value is written with printf and a quoted expansion so the JSON reaches
#   the file verbatim: an unquoted `echo $VAR` is subject to word splitting and
#   glob expansion, and /bin/sh's echo may interpret backslash escapes.
# - NOTE(review): an `export` inside RUN only lasts for that single build step
#   and never reaches the running container, so it has been dropped here.
#   GOOGLE_APPLICATION_CREDENTIALS must be pointed at this file by whatever
#   starts the container (or by an ENV instruction) — confirm how the
#   deployment sets it.
RUN if [ -n "$GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON" ]; then \
    printf '%s\n' "$GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON" > /tmp/google_workload_identity_federation_config.json && \
    echo "Wrote workload identity federation config to /tmp/google_workload_identity_federation_config.json"; \
    else \
    echo "GOOGLE_APPLICATION_CREDENTIALS not set, no GOOGLE_CLOUD_WORKLOAD_IDENTITY_FEDERATION_CONFIG_JSON in environment"; \
    fi

RUN apt-get update && \
apt-get install -y --no-install-recommends curl && \
rm -rf /var/lib/apt/lists/* && \
Expand Down
3 changes: 3 additions & 0 deletions apps/hash-ai-worker-ts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
"@apps/hash-graph": "0.0.0-private",
"@blockprotocol/graph": "0.4.0-canary.0",
"@blockprotocol/type-system": "0.1.2-canary.0",
"@google-cloud/storage": "7.14.0",
"@google-cloud/vertexai": "1.9.0",
"@local/advanced-types": "0.0.0-private",
"@local/hash-backend-utils": "0.0.0-private",
"@local/hash-graph-client": "0.0.0-private",
Expand Down Expand Up @@ -82,6 +84,7 @@
"openai": "4.68.4",
"openai-chat-tokens": "0.2.8",
"papaparse": "5.4.1",
"pdf2json": "3.1.4",
"puppeteer": "22.15.0",
"puppeteer-extra": "3.3.6",
"puppeteer-extra-plugin-stealth": "2.11.2",
Expand Down
2 changes: 2 additions & 0 deletions apps/hash-ai-worker-ts/src/activities/flow-activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { getFileFromUrlAction } from "./flow-activities/get-file-from-url-action
import { getWebPageByUrlAction } from "./flow-activities/get-web-page-by-url-action.js";
import { getWebPageSummaryAction } from "./flow-activities/get-web-page-summary-action.js";
import { inferEntitiesFromContentAction } from "./flow-activities/infer-entities-from-content-action.js";
import { inferMetadataFromDocumentAction } from "./flow-activities/infer-metadata-from-document-action.js";
import { persistEntitiesAction } from "./flow-activities/persist-entities-action.js";
import { persistEntityAction } from "./flow-activities/persist-entity-action.js";
import { persistFlowActivity } from "./flow-activities/persist-flow-activity.js";
Expand All @@ -28,6 +29,7 @@ export const createFlowActionActivities = ({
getWebPageByUrlAction,
processAutomaticBrowsingSettingsAction,
inferEntitiesFromContentAction,
inferMetadataFromDocumentAction,
persistEntityAction,
persistEntitiesAction,
getFileFromUrlAction,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,15 @@ const systemPrompt = `
You are a workflow naming agent. A workflow is an automated process that produces a result of interest.
Multiple workflows of the same kind are run with different inputs, and the user requires a unique name for each run, to distinguish it from other runs of the same kind.
The user provides you with a description of the goal of the workflow, or a description of the template and a list of its inputs, and you generate a short name for the run. Provide only the name – don't include any other text.
The user provides you with a description of the goal of the workflow, or a description of the template and a list of its inputs, and you generate a short name for the run. Provide only the name – don't include any other text. If there are no inputs provided, you can generate a name based on the template description alone.
The name should be descriptive enough to distinguish it from other runs from the same template, and must always be a single human-readable sentence, with proper grammar and spacing between words.
<Rules>
Don't include any quotation marks or special characters around the name.
Don't include the word 'workflow' in the name – the user already knows it's a workflow.
Don't include UUIDs or other identifiers that aren't natural language words. Omit them, or use a generic human-readable replacement (e.g. 'entity').
</Rules>
`;

const getModelSuggestedFlowRunName = async (
Expand Down Expand Up @@ -79,7 +83,11 @@ const getModelSuggestedFlowRunName = async (
return text;
};

const outputKindsToIgnore: PayloadKind[] = ["GoogleSheet", "GoogleAccountId"];
/**
 * Payload kinds that are filtered out (via
 * `!outputKindsToIgnore.includes(output.payload.kind)`) before the remaining
 * values are described to the naming model.
 *
 * NOTE(review): presumably these are excluded because they are identifiers
 * rather than natural-language values useful for a run name — confirm.
 */
const outputKindsToIgnore: PayloadKind[] = [
"GoogleSheet",
"GoogleAccountId",
"EntityId",
];

export const generateFlowRunName = async (
params: PersistFlowActivityParams,
Expand Down Expand Up @@ -144,13 +152,20 @@ export const generateFlowRunName = async (
!outputKindsToIgnore.includes(output.payload.kind),
);

let workflowDescriptionString = `The workflow template is named ${
flowDefinition.name
} with a description of ${flowDefinition.description}.`;

if (inputsOfInterest?.length) {
workflowDescriptionString += ` The inputs to the workflow run to be named: ${inputsOfInterest
.map((input) => JSON.stringify(input))
.join("\n")}.`;
} else {
workflowDescriptionString += ` The workflow run to be named has no inputs.`;
}

return getModelSuggestedFlowRunName(
`The workflow template is named ${
flowDefinition.name
} with a description of ${flowDefinition.description}.
The inputs to the workflow run to be named: ${inputsOfInterest
?.map((input) => JSON.stringify(input))
.join("\n")}`,
workflowDescriptionString,
usageTrackingParams,
);
};
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { StatusCode } from "@local/status";
import { Context } from "@temporalio/activity";

import { logProgress } from "../shared/log-progress.js";
import { getFileEntityFromUrl } from "./shared/get-file-entity-from-url.js";
import { createFileEntityFromUrl } from "./shared/create-file-entity-from-url.js";
import type { FlowActionActivity } from "./types.js";

export const getFileFromUrlAction: FlowActionActivity = async ({ inputs }) => {
Expand All @@ -19,25 +19,25 @@ export const getFileFromUrlAction: FlowActionActivity = async ({ inputs }) => {
actionType: "getFileFromUrl",
});

const getFileEntityFromUrlStatus = await getFileEntityFromUrl({
const createFileEntityFromUrlStatus = await createFileEntityFromUrl({
entityUuid: null,
url: originalUrl,
description,
displayName,
});

if (getFileEntityFromUrlStatus.status !== "ok") {
if (createFileEntityFromUrlStatus.status !== "ok") {
return {
code: StatusCode.Internal,
message: getFileEntityFromUrlStatus.message,
message: createFileEntityFromUrlStatus.message,
contents: [],
};
}

// @todo look for an existing file with the same originalUrl in the graph, and update it if found?
const operation = "create" as const;

const fileEntity = getFileEntityFromUrlStatus.entity.toJSON();
const fileEntity = createFileEntityFromUrlStatus.entity.toJSON();

logProgress([
{
Expand Down
Loading

0 comments on commit 47c108f

Please sign in to comment.