Add a collectGithubRepoFiles tool (#8)

* start work on codebase stripping * tools: add github repo lookup method, updates to code stripper * Add a `collectGithubRepoFiles` tool * update codebase.js * remove debug logging
extremeheat · Feb 25, 2024 · aa50927 · aa50927
1 parent 6364655
commit aa50927
Show file tree

Hide file tree

Showing 9 changed files with 598 additions and 254 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,135 +1,7 @@
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-.pnpm-debug.log*
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-*.lcov
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-build/Release
-
-# Dependency directories
-node_modules/
-jspm_packages/
-
-# Snowpack dependency directory (https://snowpack.dev/)
-web_modules/
-
-# TypeScript cache
-*.tsbuildinfo
-
-# Optional npm cache directory
-.npm
-
-# Optional eslint cache
-.eslintcache
-
-# Optional stylelint cache
-.stylelintcache
-
-# Microbundle cache
-.rpt2_cache/
-.rts2_cache_cjs/
-.rts2_cache_es/
-.rts2_cache_umd/
-
-# Optional REPL history
-.node_repl_history
-
-# Output of 'npm pack'
-*.tgz
-
-# Yarn Integrity file
-.yarn-integrity
-
-# dotenv environment variable files
-.env
-.env.development.local
-.env.test.local
-.env.production.local
-.env.local
-
-# parcel-bundler cache (https://parceljs.org/)
-.cache
-.parcel-cache
-
-# Next.js build output
-.next
-out
-
-# Nuxt.js build / generate output
-.nuxt
-dist
-
-# Gatsby files
-.cache/
-# Comment in the public line in if your project uses Gatsby and not Next.js
-# https://nextjs.org/blog/next-9-1#public-directory-support
-# public
-
-# vuepress build output
-.vuepress/dist
-
-# vuepress v2.x temp and cache directory
-.temp
-.cache
-
-# Docusaurus cache and generated files
-.docusaurus
-
-# Serverless directories
-.serverless/
-
-# FuseBox cache
-.fusebox/
-
-# DynamoDB Local files
-.dynamodb/
-
-# TernJS port file
-.tern-port
-
-# Stores VSCode versions used for testing VSCode extensions
-.vscode-test
-
-# yarn v2
-.yarn/cache
-.yarn/unplugged
-.yarn/build-state.yml
-.yarn/install-state.gz
-.pnp.*
-playground
+node_modules
 package-lock.json
-*.key
-
-test/*.html
+playground
+test/*.html
+src/tools/repos
+__*
+*.key
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/extremeheat/LXL/actions/workflows/ci.yml/badge.svg)](https://github.com/extremeheat/LXL/actions/workflows/)
 [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/extremeheat/LXL)
 
-LangXLang (LXL), a simple wrapper for Node.js to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support.
+LangXLang (LXL), a Node.js library to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support.
 
 Supported models are:
 * OpenAI: `gpt-3.5-turbo-16k`, `gpt-3.5-turbo`, `gpt-4`, `gpt-4-turbo-preview`

diff --git a/src/index.d.ts b/src/index.d.ts
@@ -15,6 +15,7 @@ declare module 'langxlang' {
   }
 
   interface Func {
+    // If default is not provided, the argument is required.
     Arg(options: { type: string[], description: string, example?: string, default?: any, required?: boolean }): string
     Arg(options: { type: object, description: string, example?: string, default?: any, required?: boolean }): object
     Arg<T>(options: { type: T, description: string, example?: string, default?: any, required?: boolean }): T
@@ -38,6 +39,16 @@ declare module 'langxlang' {
   interface Tools {
     // Generate HTML that shows side-by-side outputs for the system/user prompt across different models.
     makeVizForPrompt(systemPrompt: string, userPrompt: string, models: Model[]): Promise<string>
+    // Collects files from a GitHub repo (given as `owner/name`) and returns one
+    // [absolutePath, relativePath, contents] tuple per file that passes the filters below.
+    // NOTE(review): src/tools/codebase.js currently returns this array synchronously rather
+    // than as a Promise — confirm which of the two is intended.
+    collectGithubRepoFiles(repo: string, options: {
+      // Only include files whose path ends with this string (e.g. '.js')
+      extension?: string,
+      // The branch to use (the implementation falls back to 'master')
+      branch?: string,
+      // Either a function that receives the repo-relative path and returns true if the file
+      // should be included, or an array of regexes of which one needs to match for inclusion.
+      // The function type is parenthesized so the union is "function | RegExp[]" and not
+      // "function returning boolean | RegExp[]".
+      matching?: ((fileName: string) => boolean) | RegExp[]
+    }): Promise<[absolutePath: string, relativePath: string, contents: string][]>
   }
 
   const tools: Tools

diff --git a/src/tools.js b/src/tools.js
@@ -1,111 +1,7 @@
-const { CompletionService } = require('./CompletionService')
+const codebase = require('./tools/codebase')
+const viz = require('./tools/viz')
 
-function makeVizHtml (data) {
-  return `
-<!DOCTYPE html>
-<html lang="en">
-<head>
-  <meta charset="UTF-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-  <title>LLM Output Viz</title>
-  <style>
-    pre {
-      max-width: 25vw;
-      overflow: auto;
-    }
-    h3 {
-      background-color: lightcyan;
-      margin-top: 12px;
-    }
-    #grid {
-      /* margin: 1%; */
-      .correct h3 {
-        background-color: lightgreen;
-      }
-    }
-    #grid div {
-      margin-right: 2%;
-    }
-  </style>
-</head>
-<body>
-  <button id="wordwrap">Toggle Word Wrap</button>
-  <div id="grid" style='display: flex; flex-direction: row;'>
-    <div>
-      <h3>System Prompt</h3>
-      <pre id="psys">SYS PROMPT</pre>
-    </div>
-    <div>
-      <h3>User Prompt</h3>
-      <pre id="pusr">USR PROMPT</pre>
-    </div>
-    ${
-      data.models.map(([modelName, modelId]) =>
-        `<div><h3>${modelName}</h3><pre id="presp${modelId}">MODEL OUTPUT</pre></div>`).join('\n')
-    }
-  </div>
-
-  <script>
-    function toggleWordWrap () {
-      const $pre = document.querySelectorAll('pre');
-      for (const $p of $pre) {
-        $p.style.whiteSpace = $p.style.whiteSpace === 'pre-wrap' ? 'pre' : 'pre-wrap';
-      }
-    }
-    wordwrap.onclick = toggleWordWrap;
-    toggleWordWrap();
-
-    const $psys = document.getElementById('psys');
-    const $pusr = document.getElementById('pusr');
-
-    const data = ${JSON.stringify(data)};
-    const outputs = data.outputs;
-    if ($psys) $psys.textContent = data.system;
-    if ($pusr) $pusr.textContent = data.user;
-    for (const [modelName, modelId] of data.models) {
-      const $presp = document.getElementById('presp' + modelId);
-      if ($presp) $presp.textContent = outputs[modelId];
-    }
-</script>
-</body>
-</html>
-  `
+module.exports = { // Public tools surface; each entry is re-exported from a module under ./tools
+  makeVizForPrompt: viz.makeVizForPrompt,
+  collectGithubRepoFiles: codebase.collectGithubRepoFiles
 }
-
-async function makeVizForPrompt (system, user, models) {
-  const service = new CompletionService()
-  const data = { models: [], outputs: {} }
-  for (const model of models) {
-    const { text } = await service.requestCompletion(model, system, user)
-    switch (model) {
-      case 'gpt-3.5-turbo-16k':
-        data.models.push(['GPT-3.5 Turbo 16k', '3516turbo'])
-        data.outputs['3516turbo'] = text
-        break
-      case 'gpt-3.5-turbo':
-        data.models.push(['GPT-3.5 Turbo', '35turbo'])
-        data.outputs['35turbo'] = text
-        break
-      case 'gpt-4':
-        data.models.push(['GPT-4', 'gpt4'])
-        data.outputs.gpt4 = text
-        break
-      case 'gpt-4-turbo-preview':
-        data.models.push(['GPT-4 Turbo Preview', 'gpt4turbo'])
-        data.outputs.gpt4turbo = text
-        break
-      case 'gemini-1.0-pro':
-        data.models.push(['Gemini 1.0 Pro', 'gemini'])
-        data.outputs.gemini = text
-        break
-      default:
-        data.models.push([model, model])
-        data.outputs[model] = text
-    }
-  }
-  data.system = system
-  data.user = user
-  return makeVizHtml(data)
-}
-
-module.exports = { makeVizForPrompt }
diff --git a/src/tools/codebase.js b/src/tools/codebase.js
@@ -0,0 +1,64 @@
+const fs = require('fs')
+const cp = require('child_process')
+const { join } = require('path')
+
+/**
+ * Normalises a path so that it only uses forward slashes.
+ *
+ * @param {string} path - Path that may contain backslash separators.
+ * @returns {string} The same path with every backslash replaced by '/'.
+ */
+function fixSeparator (path) {
+  const withForwardSlashes = path.replace(/\\/g, '/')
+  return withForwardSlashes
+}
+
+/**
+ * Recursively lists every non-directory entry below `folder`.
+ *
+ * @param {string} folder - Directory to walk.
+ * @returns {string[]} Paths (joined onto `folder`) with forward-slash separators,
+ *   in the order readdir yields them, descending into directories as they are met.
+ */
+function getAllFilesIn (folder) {
+  return fs.readdirSync(folder, { withFileTypes: true }).flatMap(dirent => {
+    const location = join(folder, dirent.name)
+    // Directories contribute their (already normalised) contents; anything else is a leaf.
+    return dirent.isDirectory() ? getAllFilesIn(location) : [fixSeparator(location)]
+  })
+}
+
+/**
+ * Clones a GitHub repository into a `repos` folder next to this file (reusing the
+ * clone if it already exists), switches it to the requested branch, updates it and
+ * returns the contents of every file that passes the filters.
+ *
+ * @param {string} repo - Repository in `owner/name` form.
+ * @param {object} [options]
+ * @param {string} [options.extension] - Only keep files whose path ends with this string.
+ * @param {string} [options.branch='master'] - Branch to check out and pull.
+ * @param {((relPath: string) => boolean) | RegExp[]} [options.matching] - Either a predicate
+ *   called with the repo-relative path, or a list of regexes of which at least one must match it.
+ * @returns {Array<[string, string, string]>} One `[absolutePath, relativePath, contents]` tuple
+ *   per kept file. `relativePath` is the absolute path with the clone directory prefix removed
+ *   (so it starts with '/'); `contents` is read as UTF-8 and trimmed.
+ * @throws {Error} If `repo` or the branch name is malformed, or if a git command fails.
+ */
+function collectGithubRepoFiles (repo, options) {
+  const opts = options || {}
+  const extension = opts.extension
+  const matching = opts.matching
+  const branch = opts.branch || 'master'
+  // Both values end up as git arguments, so reject anything that is not a plain
+  // owner/name pair or that git could mistake for a command-line option.
+  if (typeof repo !== 'string' || !/^[\w.-]+\/[\w.-]+$/.test(repo)) {
+    throw new Error(`Invalid GitHub repo "${repo}", expected "owner/name"`)
+  }
+  if (typeof branch !== 'string' || branch.startsWith('-')) {
+    throw new Error(`Invalid branch name "${branch}"`)
+  }
+
+  // Clone the repo inside a "repos" folder in this directory, unless we already have it
+  const safeName = repo.replace(/\//g, ',')
+  const reposDir = join(__dirname, 'repos')
+  const repoPath = join(reposDir, safeName)
+  fs.mkdirSync(reposDir, { recursive: true })
+  // execFileSync passes arguments straight to git without a shell, so nothing in
+  // `repo` or `branch` can be interpreted as shell syntax.
+  if (!fs.existsSync(repoPath)) {
+    cp.execFileSync('git', ['clone', `https://github.com/${repo}.git`, safeName], { cwd: reposDir })
+  }
+  // Check the branch out BEFORE pulling: pulling first would merge the requested branch
+  // into whichever branch a previous call left checked out. The fetch makes sure
+  // branches created after the initial clone are known to the checkout.
+  cp.execFileSync('git', ['fetch', 'origin'], { cwd: repoPath })
+  cp.execFileSync('git', ['checkout', branch], { cwd: repoPath })
+  cp.execFileSync('git', ['pull', 'origin', branch], { cwd: repoPath })
+
+  // Collect all the files inside repoPath, like `tree`, paired with their repo-relative path
+  const prefix = fixSeparator(repoPath)
+  const allFiles = getAllFilesIn(repoPath).map(f => [f, f.slice(prefix.length)])
+
+  // Now figure out the relevant files
+  const relevantFiles = allFiles.filter(([file, relFile]) => {
+    // git's own bookkeeping lives in the clone too, but is not part of the repo's files
+    if (relFile.startsWith('/.git/')) return false
+    if (extension && !file.endsWith(extension)) return false
+    if (!matching) return true
+    if (typeof matching === 'function') return Boolean(matching(relFile))
+    return matching.some(m => relFile.match(m))
+  })
+  return relevantFiles.map(([abs, rel]) => [abs, rel, fs.readFileSync(abs, 'utf8').trim()])
+}
+
+module.exports = { collectGithubRepoFiles } // fixSeparator and getAllFilesIn are not exported