diff --git a/deno.json b/deno.json
index 0ea11fa..8c53bae 100644
--- a/deno.json
+++ b/deno.json
@@ -17,6 +17,8 @@
     "@std/io": "jsr:@std/io@^0.224.8",
     "@std/path": "jsr:@std/path@^1.0.6",
     "@std/tar": "jsr:@std/tar@^0.1.1",
+    "@types/estree": "npm:@types/estree@^1.0.5",
+    "@types/mdast": "npm:@types/mdast@^4.0.4",
     "@types/yargs": "npm:@types/yargs@^17.0.33",
     "apache-arrow": "npm:apache-arrow@^17.0.0",
     "esbuild": "npm:esbuild@^0.24.0",
diff --git a/deno.lock b/deno.lock
index 2bc8de6..97bc188 100644
--- a/deno.lock
+++ b/deno.lock
@@ -33,6 +33,9 @@
     "npm:@lancedb/lancedb@^0.10.0": "npm:@lancedb/lancedb@0.10.0_apache-arrow@17.0.0",
     "npm:@sinclair/typebox": "npm:@sinclair/typebox@0.33.11",
     "npm:@sinclair/typebox@^0.33.11": "npm:@sinclair/typebox@0.33.11",
+    "npm:@types/estree": "npm:@types/estree@1.0.5",
+    "npm:@types/mdast": "npm:@types/mdast@4.0.4",
+    "npm:@types/mdast@^4.0.4": "npm:@types/mdast@4.0.4",
     "npm:@types/yargs": "npm:@types/yargs@17.0.33",
     "npm:@types/yargs@^17.0.33": "npm:@types/yargs@17.0.33",
     "npm:apache-arrow@^17.0.0": "npm:apache-arrow@17.0.0",
@@ -1370,6 +1373,8 @@
       "jsr:@std/tar@^0.1.1",
       "npm:@lancedb/lancedb@^0.10.0",
       "npm:@sinclair/typebox@^0.33.11",
+      "npm:@types/estree@^1.0.5",
+      "npm:@types/mdast@^4.0.4",
       "npm:@types/yargs@^17.0.33",
       "npm:apache-arrow@^17.0.0",
       "npm:esbuild@^0.24.0",
diff --git a/src/embeddings/generator/generator.ts b/src/embeddings/generator/generator.ts
index c8fb4e5..a7369bf 100644
--- a/src/embeddings/generator/generator.ts
+++ b/src/embeddings/generator/generator.ts
@@ -1,8 +1,8 @@
 import {
   BaseEmbeddingSource,
   MarkdownEmbeddingSource,
-  walk,
 } from "./mdSource.ts";
+import { walk } from '@std/fs/walk';
 import ollama from "ollama";
 
 import { LanceWriter } from "../lance/index.ts";
@@ -15,7 +15,7 @@ export async function generate(
   ignoredFiles = DEFAULT_IGNORED_FILES,
 ) {
   const embeddingSources: BaseEmbeddingSource[] = [
-    ...(await walk(path))
+    ...(await Array.fromAsync(walk(path)))
       .filter(({ path }) => /\.mdx?$/.test(path))
       .filter(({ path }) => !ignoredFiles.includes(path))
       .map((entry) => new MarkdownEmbeddingSource("guide", entry.path)),
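`walk` from `@std/fs/walk` returns an async iterable of `WalkEntry` values rather than a promise of an array, which is why the call site above wraps it in `Array.fromAsync` before chaining `.filter()` and `.map()`. A minimal sketch of the same pattern, assuming a hypothetical `docs/` directory that is not part of this repo:

```ts
import { walk } from "@std/fs/walk";

// Collect every entry under docs/ eagerly, then keep only Markdown/MDX
// files -- the same shape as the generate() call site above.
const entries = await Array.fromAsync(walk("docs"));
const mdxPaths = entries
  .filter((entry) => entry.isFile && /\.mdx?$/.test(entry.path))
  .map((entry) => entry.path);

console.log(mdxPaths);
```

`walk` also accepts filtering options directly (for example `{ exts: [".md", ".mdx"], includeDirs: false }`), which could replace the regex filter if the extra entries ever become a cost.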
diff --git a/src/embeddings/generator/mdSource.ts b/src/embeddings/generator/mdSource.ts
index a59a292..403a780 100644
--- a/src/embeddings/generator/mdSource.ts
+++ b/src/embeddings/generator/mdSource.ts
@@ -1,6 +1,8 @@
 // This file is based on https://github.com/supabase-community/nextjs-openai-doc-search/blob/main/lib/generate-embeddings.ts
 
-// import { ObjectExpression } from 'estree';
+// @ts-types="npm:@types/estree"
+import { ObjectExpression } from 'estree';
+// @ts-types="npm:@types/mdast"
 import { Content, Root } from "mdast";
 import { fromMarkdown } from "mdast-util-from-markdown";
 import { mdxFromMarkdown, MdxjsEsm } from "mdast-util-mdx";
@@ -9,19 +11,14 @@
 import { toString } from "mdast-util-to-string";
 import { mdxjs } from "micromark-extension-mdxjs";
 import { u } from "unist-builder";
 import { filter } from "unist-util-filter";
-import { basename } from "@std/path/basename";
-import { dirname } from "@std/path/dirname";
-import { join } from "@std/path/join";
 import { createHash } from "node:crypto";
 import GithubSlugger from "github-slugger";
-type OE = any;
-
 /**
  * Extracts ES literals from an `estree` `ObjectExpression`
  * into a plain JavaScript object.
  */
-function getObjectFromExpression(node: OE) {
+function getObjectFromExpression(node: ObjectExpression) {
   return node.properties.reduce<
     Record<string, string | number | bigint | true | RegExp | undefined>
   >((object, property) => {
@@ -57,7 +54,7 @@
       node.data?.estree?.body[0]?.type === "ExportNamedDeclaration" &&
       node.data.estree.body[0].declaration?.type === "VariableDeclaration" &&
       node.data.estree.body[0].declaration.declarations[0]?.id.type ===
-        "Identifier" &&
+      "Identifier" &&
       node.data.estree.body[0].declaration.declarations[0].id.name === "meta"
     );
   });
@@ -69,14 +66,14 @@
   const objectExpression =
     (metaExportNode.data?.estree?.body[0]?.type === "ExportNamedDeclaration" &&
       metaExportNode.data.estree.body[0].declaration?.type ===
-        "VariableDeclaration" &&
+      "VariableDeclaration" &&
       metaExportNode.data.estree.body[0].declaration.declarations[0]?.id
-        .type === "Identifier" &&
+      .type === "Identifier" &&
       metaExportNode.data.estree.body[0].declaration.declarations[0].id.name ===
-        "meta" &&
+      "meta" &&
       metaExportNode.data.estree.body[0].declaration.declarations[0].init
-        ?.type ===
-        "ObjectExpression" &&
+      ?.type ===
+      "ObjectExpression" &&
       metaExportNode.data.estree.body[0].declaration.declarations[0].init) ||
     undefined;
 
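For context on what `extractMetaExport` is pattern-matching, here is a hedged sketch of the `meta` round trip using the utilities this module already imports; the sample MDX string is invented for illustration:

```ts
// @ts-types="npm:@types/mdast"
import { Root } from "mdast";
import { fromMarkdown } from "mdast-util-from-markdown";
import { mdxFromMarkdown } from "mdast-util-mdx";
import { mdxjs } from "micromark-extension-mdxjs";

// Parse an MDX document that carries a `meta` export.
const mdxTree: Root = fromMarkdown(
  'export const meta = { title: "Intro" }\n\n# Hello',
  {
    extensions: [mdxjs()],
    mdastExtensions: [mdxFromMarkdown()],
  },
);

// children[0] is an mdxjsEsm node; its data.estree program holds the
// ExportNamedDeclaration chain checked above, and the declaration's init is
// the ObjectExpression that getObjectFromExpression() reduces to
// { title: "Intro" } (Identifier keys with Literal values).
console.log(mdxTree.children[0]);
```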
@@ -184,84 +181,85 @@ function processMdxForSearch(content: string): ProcessedMdx {
   };
 }
 
-type WalkEntry = {
-  path: string;
-  parentPath?: string;
-};
-
-export async function walk(
-  dir: string,
-  parentPath?: string,
-): Promise<WalkEntry[]> {
-  for await (const entry of Deno.readDir(dir)) {
-    if (entry.isDirectory) {
-      // Keep track of document hierarchy (if this dir has corresponding doc file)
-      const docPath = `${basename(path)}.mdx`;
-
-      return walk(
-        path,
-        immediateFiles.includes(docPath)
-          ? join(dirname(path), docPath)
-          : parentPath,
-      );
-    } else if (entry.isFile) {
-      return [
-        {
-          path: path,
-          parentPath,
-        },
-      ];
-    } else {
-      return [];
-    }
-
-    const path = join(dir, file);
-    const stats = await stat(path);
-    if (stats.isDirectory()) {
-    } else if (stats.isFile()) {
-      return [
-        {
-          path: path,
-          parentPath,
-        },
-      ];
-    } else {
-      return [];
-    }
-  }
-
-  // const recursiveFiles = await Promise.all(
-  //   immediateFiles.map(async (file) => {
-  //     const path = join(dir, file)
-  //     const stats = await stat(path)
-  //     if (stats.isDirectory()) {
-  //       // Keep track of document hierarchy (if this dir has corresponding doc file)
-  //       const docPath = `${basename(path)}.mdx`
-
-  //       return walk(
-  //         path,
-  //         immediateFiles.includes(docPath) ? join(dirname(path), docPath) : parentPath
-  //       )
-  //     } else if (stats.isFile()) {
-  //       return [
-  //         {
-  //           path: path,
-  //           parentPath,
-  //         },
-  //       ]
-  //     } else {
-  //       return []
-  //     }
-  //   })
-  // )
-
-  const flattenedFiles = recursiveFiles.reduce(
-    (all, folderContents) => all.concat(folderContents),
-    [],
-  );
-
-  return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path));
-}
+// type WalkEntry = {
+//   path: string;
+//   parentPath?: string;
+// };
+
+// export async function walk(
+//   dir: string,
+//   parentPath?: string,
+// ): Promise<WalkEntry[]> {
+//   for await (const entry of Deno.readDir(dir)) {
+//     const path = join(dir, entry);
+//     if (entry.isDirectory) {
+//       // Keep track of document hierarchy (if this dir has corresponding doc file)
+//       const docPath = `${basename(path)}.mdx`;
+
+//       return walk(
+//         path,
+//         immediateFiles.includes(docPath)
+//           ? join(dirname(path), docPath)
+//           : parentPath,
+//       );
+//     } else if (entry.isFile) {
+//       return [
+//         {
+//           path: path,
+//           parentPath,
+//         },
+//       ];
+//     } else {
+//       return [];
+//     }
+
+//     // const path = join(dir, file);
+//     // const stats = await stat(path);
+//     // if (stats.isDirectory()) {
+//     // } else if (stats.isFile()) {
+//     //   return [
+//     //     {
+//     //       path: path,
+//     //       parentPath,
+//     //     },
+//     //   ];
+//     // } else {
+//     //   return [];
+//     // }
+//   }
+
+//   // const recursiveFiles = await Promise.all(
+//   //   immediateFiles.map(async (file) => {
+//   //     const path = join(dir, file)
+//   //     const stats = await stat(path)
+//   //     if (stats.isDirectory()) {
+//   //       // Keep track of document hierarchy (if this dir has corresponding doc file)
+//   //       const docPath = `${basename(path)}.mdx`
+
+//   //       return walk(
+//   //         path,
+//   //         immediateFiles.includes(docPath) ? join(dirname(path), docPath) : parentPath
+//   //       )
+//   //     } else if (stats.isFile()) {
+//   //       return [
+//   //         {
+//   //           path: path,
+//   //           parentPath,
+//   //         },
+//   //       ]
+//   //     } else {
+//   //       return []
+//   //     }
+//   //   })
+//   // )
+
+//   const flattenedFiles = recursiveFiles.reduce(
+//     (all, folderContents) => all.concat(folderContents),
+//     [],
+//   );
+
+//   return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path));
+// }
 
 export abstract class BaseEmbeddingSource {
   checksum?: string;
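The retired walker above never compiled as written: it reads `path` before it is declared, and `immediateFiles`, `file`, `stat`, and `recursiveFiles` are never defined, which is presumably why the module now delegates to `@std/fs/walk`. For the record, a corrected sketch of the parent-path bookkeeping it was attempting; this is a hypothetical reconstruction, not code from this repo:

```ts
import { basename, dirname, join } from "@std/path";

type WalkEntry = {
  path: string;
  parentPath?: string;
};

// Recursively list files, tagging each one with the .mdx "doc file" of the
// nearest ancestor directory that has one (e.g. guides/ -> guides.mdx).
async function walkWithParents(
  dir: string,
  parentPath?: string,
): Promise<WalkEntry[]> {
  const names: string[] = [];
  for await (const entry of Deno.readDir(dir)) {
    names.push(entry.name);
  }

  const nested = await Promise.all(
    names.map(async (name) => {
      const path = join(dir, name);
      const stats = await Deno.stat(path);
      if (stats.isDirectory) {
        // If this directory has a corresponding doc file beside it,
        // its children inherit that file as their parentPath.
        const docPath = `${basename(path)}.mdx`;
        return walkWithParents(
          path,
          names.includes(docPath) ? join(dirname(path), docPath) : parentPath,
        );
      }
      return stats.isFile ? [{ path, parentPath }] : [];
    }),
  );

  return nested.flat().sort((a, b) => a.path.localeCompare(b.path));
}
```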