Skip to content

Commit

Permalink
Fix type issues
Browse files Browse the repository at this point in the history
  • Loading branch information
stwiname committed Oct 6, 2024
1 parent e518e9d commit ba08ed0
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 93 deletions.
2 changes: 2 additions & 0 deletions deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
"@std/io": "jsr:@std/io@^0.224.8",
"@std/path": "jsr:@std/path@^1.0.6",
"@std/tar": "jsr:@std/tar@^0.1.1",
"@types/estree": "npm:@types/estree@^1.0.5",
"@types/mdast": "npm:@types/mdast@^4.0.4",
"@types/yargs": "npm:@types/yargs@^17.0.33",
"apache-arrow": "npm:apache-arrow@^17.0.0",
"esbuild": "npm:esbuild@^0.24.0",
Expand Down
5 changes: 5 additions & 0 deletions deno.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions src/embeddings/generator/generator.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import {
BaseEmbeddingSource,
MarkdownEmbeddingSource,
walk,
} from "./mdSource.ts";
import { walk } from '@std/fs/walk';
import ollama from "ollama";
import { LanceWriter } from "../lance/index.ts";

Expand All @@ -15,7 +15,7 @@ export async function generate(
ignoredFiles = DEFAULT_IGNORED_FILES,
) {
const embeddingSources: BaseEmbeddingSource[] = [
...(await walk(path))
...(await Array.fromAsync(walk(path)))
.filter(({ path }) => /\.mdx?$/.test(path))
.filter(({ path }) => !ignoredFiles.includes(path))
.map((entry) => new MarkdownEmbeddingSource("guide", entry.path)),
Expand Down
180 changes: 89 additions & 91 deletions src/embeddings/generator/mdSource.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// This file is based on https://github.com/supabase-community/nextjs-openai-doc-search/blob/main/lib/generate-embeddings.ts

// import { ObjectExpression } from 'estree';
// @ts-types="npm:@types/estree"
import { ObjectExpression } from 'estree';
// @ts-types="npm:@types/mdast"
import { Content, Root } from "mdast";
import { fromMarkdown } from "mdast-util-from-markdown";
import { mdxFromMarkdown, MdxjsEsm } from "mdast-util-mdx";
Expand All @@ -9,19 +11,14 @@ import { toString } from "mdast-util-to-string";
import { mdxjs } from "micromark-extension-mdxjs";
import { u } from "unist-builder";
import { filter } from "unist-util-filter";
import { basename } from "@std/path/basename";
import { dirname } from "@std/path/dirname";
import { join } from "@std/path/join";
import { createHash } from "node:crypto";
import GithubSlugger from "github-slugger";

type OE = any;

/**
* Extracts ES literals from an `estree` `ObjectExpression`
* into a plain JavaScript object.
*/
function getObjectFromExpression(node: OE) {
function getObjectFromExpression(node: ObjectExpression) {
return node.properties.reduce<
Record<string, string | number | bigint | true | RegExp | undefined>
>((object, property) => {
Expand Down Expand Up @@ -57,7 +54,7 @@ function extractMetaExport(mdxTree: Root) {
node.data?.estree?.body[0]?.type === "ExportNamedDeclaration" &&
node.data.estree.body[0].declaration?.type === "VariableDeclaration" &&
node.data.estree.body[0].declaration.declarations[0]?.id.type ===
"Identifier" &&
"Identifier" &&
node.data.estree.body[0].declaration.declarations[0].id.name === "meta"
);
});
Expand All @@ -69,14 +66,14 @@ function extractMetaExport(mdxTree: Root) {
const objectExpression =
(metaExportNode.data?.estree?.body[0]?.type === "ExportNamedDeclaration" &&
metaExportNode.data.estree.body[0].declaration?.type ===
"VariableDeclaration" &&
"VariableDeclaration" &&
metaExportNode.data.estree.body[0].declaration.declarations[0]?.id
.type === "Identifier" &&
.type === "Identifier" &&
metaExportNode.data.estree.body[0].declaration.declarations[0].id.name ===
"meta" &&
"meta" &&
metaExportNode.data.estree.body[0].declaration.declarations[0].init
?.type ===
"ObjectExpression" &&
?.type ===
"ObjectExpression" &&
metaExportNode.data.estree.body[0].declaration.declarations[0].init) ||
undefined;

Expand Down Expand Up @@ -184,84 +181,85 @@ function processMdxForSearch(content: string): ProcessedMdx {
};
}

/**
 * One file reported by `walk`.
 */
interface WalkEntry {
  /** Path of the file, built by joining the walked directory with the entry name. */
  path: string;
  /**
   * Path of the document recorded as this file's parent in the document
   * hierarchy; absent when no parent document was recorded.
   */
  parentPath?: string;
}

/**
 * Recursively lists every regular file below `dir`.
 *
 * While descending, the function keeps track of the document hierarchy: when
 * a sub-directory `foo/` has a sibling file `foo.mdx` in the same directory,
 * that sibling's path is handed down as `parentPath` for everything found
 * inside `foo/`. Otherwise the `parentPath` received by the current call is
 * passed down unchanged.
 *
 * @param dir Directory to scan.
 * @param parentPath Path of the document that acts as parent for the files
 *   found directly in `dir` (undefined at the root of the walk).
 * @returns All files found, flattened into one list and sorted by `path`
 *   using `localeCompare`.
 */
export async function walk(
  dir: string,
  parentPath?: string,
): Promise<WalkEntry[]> {
  // Read the full directory listing first: the sibling-document lookup below
  // needs to know every name in `dir`, not only the entries seen so far.
  const entries: Deno.DirEntry[] = [];
  for await (const entry of Deno.readDir(dir)) {
    entries.push(entry);
  }
  const immediateNames = new Set(entries.map((entry) => entry.name));

  const nestedFiles = await Promise.all(
    entries.map((entry): Promise<WalkEntry[]> | WalkEntry[] => {
      const path = join(dir, entry.name);

      if (entry.isDirectory) {
        // Keep track of document hierarchy (if this dir has corresponding doc file)
        const docPath = `${basename(path)}.mdx`;

        return walk(
          path,
          immediateNames.has(docPath)
            ? join(dirname(path), docPath)
            : parentPath,
        );
      }

      if (entry.isFile) {
        return [{ path, parentPath }];
      }

      // Neither a regular file nor a directory (e.g. a symlink): ignore it.
      return [];
    }),
  );

  return nestedFiles
    .flat()
    .sort((a, b) => a.path.localeCompare(b.path));
}
// type WalkEntry = {
// path: string;
// parentPath?: string;
// };

// export async function walk(
// dir: string,
// parentPath?: string,
// ): Promise<WalkEntry[]> {
// for await (const entry of Deno.readDir(dir)) {
// const path = join(dir, entry);
// if (entry.isDirectory) {
// // Keep track of document hierarchy (if this dir has corresponding doc file)
// const docPath = `${basename(path)}.mdx`;

// return walk(
// path,
// immediateFiles.includes(docPath)
// ? join(dirname(path), docPath)
// : parentPath,
// );
// } else if (entry.isFile) {
// return [
// {
// path: path,
// parentPath,
// },
// ];
// } else {
// return [];
// }

// // const path = join(dir, file);
// // const stats = await stat(path);
// // if (stats.isDirectory()) {
// // } else if (stats.isFile()) {
// // return [
// // {
// // path: path,
// // parentPath,
// // },
// // ];
// // } else {
// // return [];
// // }
// }

// // const recursiveFiles = await Promise.all(
// // immediateFiles.map(async (file) => {
// // const path = join(dir, file)
// // const stats = await stat(path)
// // if (stats.isDirectory()) {
// // // Keep track of document hierarchy (if this dir has corresponding doc file)
// // const docPath = `${basename(path)}.mdx`

// // return walk(
// // path,
// // immediateFiles.includes(docPath) ? join(dirname(path), docPath) : parentPath
// // )
// // } else if (stats.isFile()) {
// // return [
// // {
// // path: path,
// // parentPath,
// // },
// // ]
// // } else {
// // return []
// // }
// // })
// // )

// const flattenedFiles = recursiveFiles.reduce(
// (all, folderContents) => all.concat(folderContents),
// [],
// );

// return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path));
// }

export abstract class BaseEmbeddingSource {
checksum?: string;
Expand Down

0 comments on commit ba08ed0

Please sign in to comment.