From aa509273e6e9843459f6eac93dfe90c066d3cf3e Mon Sep 17 00:00:00 2001 From: extremeheat Date: Sun, 25 Feb 2024 14:38:54 -0500 Subject: [PATCH] Add a collectGithubRepoFiles tool (#8) * start work on codebase stripping * tools: add github repo lookup method, updates to code stripper * Add a `collectGithubRepoFiles` tool * update codebase.js * remove debug logging --- .gitignore | 140 +-------------- README.md | 2 +- src/index.d.ts | 11 ++ src/tools.js | 114 +------------ src/tools/codebase.js | 64 +++++++ src/tools/stripping.js | 379 +++++++++++++++++++++++++++++++++++++++++ src/tools/viz.js | 111 ++++++++++++ test/tools.js | 21 +++ test/viz.js | 10 -- 9 files changed, 598 insertions(+), 254 deletions(-) create mode 100644 src/tools/codebase.js create mode 100644 src/tools/stripping.js create mode 100644 src/tools/viz.js create mode 100644 test/tools.js delete mode 100644 test/viz.js diff --git a/.gitignore b/.gitignore index 039847f..5e15658 100644 --- a/.gitignore +++ b/.gitignore @@ -1,135 +1,7 @@ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -lerna-debug.log* -.pnpm-debug.log* - -# Diagnostic reports (https://nodejs.org/api/report.html) -report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json - -# Runtime data -pids -*.pid -*.seed -*.pid.lock - -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov - -# Coverage directory used by tools like istanbul -coverage -*.lcov - -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components - -# node-waf configuration -.lock-wscript - -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release - -# Dependency directories -node_modules/ -jspm_packages/ - -# Snowpack dependency directory (https://snowpack.dev/) -web_modules/ - -# TypeScript cache -*.tsbuildinfo - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Optional stylelint cache -.stylelintcache - -# Microbundle cache -.rpt2_cache/ -.rts2_cache_cjs/ -.rts2_cache_es/ -.rts2_cache_umd/ - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variable files -.env -.env.development.local -.env.test.local -.env.production.local -.env.local - -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache - -# Next.js build output -.next -out - -# Nuxt.js build / generate output -.nuxt -dist - -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public - -# vuepress build output -.vuepress/dist - -# vuepress v2.x temp and cache directory -.temp -.cache - -# Docusaurus cache and generated files -.docusaurus - -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v2 -.yarn/cache -.yarn/unplugged -.yarn/build-state.yml -.yarn/install-state.gz -.pnp.* -playground +node_modules package-lock.json -*.key - -test/*.html \ No newline at end of file +playground +test/*.html +src/tools/repos +__* +*.key \ No newline at end of file diff --git a/README.md b/README.md index a8f6363..7e49d77 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build 
Status](https://github.com/extremeheat/LXL/actions/workflows/ci.yml/badge.svg)](https://github.com/extremeheat/LXL/actions/workflows/) [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/extremeheat/LXL) -LangXLang (LXL), a simple wrapper for Node.js to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support. +LangXLang (LXL), a Node.js library to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support. Supported models are: * OpenAI: `gpt-3.5-turbo-16k`, `gpt-3.5-turbo`, `gpt-4`, `gpt-4-turbo-preview` diff --git a/src/index.d.ts b/src/index.d.ts index 8df849d..adcb56b 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -15,6 +15,7 @@ declare module 'langxlang' { } interface Func { + // If default is not provided, the argument is required. Arg(options: { type: string[], description: string, example?: string, default?: any, required?: boolean }): string Arg(options: { type: object, description: string, example?: string, default?: any, required?: boolean }): object Arg(options: { type: T, description: string, example?: string, default?: any, required?: boolean }): T @@ -38,6 +39,16 @@ declare module 'langxlang' { interface Tools { // Generate HTML that shows side-by-side outputs for the system/user prompt across different models. makeVizForPrompt(systemPrompt: string, userPrompt: string, models: Model[]): Promise + // Returns a JS object with a list of files in a GitHub repo + collectGithubRepoFiles(repo: string, options: { + // What extension of files in the repo to include + extension?: string, + // The branch to use + branch?: string, + // Either a function that returns true if the file should be included + // or an array of regexes of which one needs to match for inclusion + matching?: (fileName: string) => boolean | RegExp[] + }): Promise<[absolutePath: string, relativePath: string, contents: string][]> } const tools: Tools diff --git a/src/tools.js b/src/tools.js index 81e9be5..75e0f69 100644 --- a/src/tools.js +++ b/src/tools.js @@ -1,111 +1,7 @@ -const { CompletionService } = require('./CompletionService') +const codebase = require('./tools/codebase') +const viz = require('./tools/viz') -function makeVizHtml (data) { - return ` - - - - - - LLM Output Viz - - - - -
-    [template markup elided: a side-by-side layout with a "System Prompt" panel, a "User Prompt" panel, and one panel per entry of data.models showing the model's name and output]
- - - - - ` +module.exports = { + makeVizForPrompt: viz.makeVizForPrompt, + collectGithubRepoFiles: codebase.collectGithubRepoFiles } - -async function makeVizForPrompt (system, user, models) { - const service = new CompletionService() - const data = { models: [], outputs: {} } - for (const model of models) { - const { text } = await service.requestCompletion(model, system, user) - switch (model) { - case 'gpt-3.5-turbo-16k': - data.models.push(['GPT-3.5 Turbo 16k', '3516turbo']) - data.outputs['3516turbo'] = text - break - case 'gpt-3.5-turbo': - data.models.push(['GPT-3.5 Turbo', '35turbo']) - data.outputs['35turbo'] = text - break - case 'gpt-4': - data.models.push(['GPT-4', 'gpt4']) - data.outputs.gpt4 = text - break - case 'gpt-4-turbo-preview': - data.models.push(['GPT-4 Turbo Preview', 'gpt4turbo']) - data.outputs.gpt4turbo = text - break - case 'gemini-1.0-pro': - data.models.push(['Gemini 1.0 Pro', 'gemini']) - data.outputs.gemini = text - break - default: - data.models.push([model, model]) - data.outputs[model] = text - } - } - data.system = system - data.user = user - return makeVizHtml(data) -} - -module.exports = { makeVizForPrompt } diff --git a/src/tools/codebase.js b/src/tools/codebase.js new file mode 100644 index 0000000..a678094 --- /dev/null +++ b/src/tools/codebase.js @@ -0,0 +1,64 @@ +const fs = require('fs') +const cp = require('child_process') +const { join } = require('path') + +function fixSeparator (path) { + return path.replace(/\\/g, '/') +} + +function getAllFilesIn (folder) { + const files = [] + const entries = fs.readdirSync(folder, { withFileTypes: true }) + for (const entry of entries) { + const fullPath = join(folder, entry.name) + if (entry.isDirectory()) { + files.push(...getAllFilesIn(fullPath)) + } else { + files.push(fullPath) + } + } + return files.map(fixSeparator) +} + +// This function will clone a github repo, review all the files and merge relevant files into a single file +function collectGithubRepoFiles (repo, options) { + const extension = options.extension + const branch = options.branch || 'master' + // First, try to clone the repo inside a "repos" folder in this directory + const safeName = repo.replace(/\//g, ',') + const reposDir = join(__dirname, 'repos') + const repoPath = join(reposDir, safeName) + fs.mkdirSync(reposDir, { recursive: true }) + if (!fs.existsSync(repoPath)) { + cp.execSync(`git clone https://github.com/${repo}.git ${safeName}`, { cwd: reposDir }) + } + // Git pull origin/$branch + cp.execSync(`git pull origin ${branch}`, { cwd: repoPath }) + // Check out the branch + cp.execSync(`git checkout ${branch}`, { cwd: repoPath }) + // Now collect all the files inside repoPath, like `tree` + const allFiles = getAllFilesIn(repoPath) + .map(f => [f, f.replace(fixSeparator(repoPath), '')]) + + // Now figure out the relevant files + const relevantFiles = [] + for (const [file, relFile] of allFiles) { + if (extension && !file.endsWith(extension)) { + continue + } + if (options.matching) { + if (typeof options.matching === 'function') { + if (!options.matching(relFile)) { + continue + } + } else if (!options.matching.some(m => relFile.match(m))) { + continue + } + } + relevantFiles.push([file, relFile]) + } + const fileContents = relevantFiles.map(([abs, rel]) => [abs, rel, fs.readFileSync(abs, 'utf8').trim()]) + return fileContents +} + +module.exports = { collectGithubRepoFiles } diff --git a/src/tools/stripping.js b/src/tools/stripping.js new file mode 100644 index 0000000..1c73039 --- /dev/null +++ b/src/tools/stripping.js @@ 
-0,0 +1,379 @@ +// Stripping here refers to removing unnecessary tokens, comments and white-space in a program to minimize the amount of tokens +// that are needed to represent the program. In languages like Java, there's lots of syntax tokens that are needed for the program +// to run, but not needed for the purpose of abstractly understanding program logic. Think things like public/private, final, etc. + +function stripJava (code, options) { + // First, we need to "tokenize" the code, by splitting it into 3 types of data: comments, strings, and code. + const tokens = [] + let tokenSoFar = '' + let currentTokenType = 'code' // 'code' or 'multi-line-comment', 'single-line-comment' or 'string' + for (let i = 0; i < code.length; i++) { + const lastChar = code[i - 1] + const currentChar = code[i] + const nextChar = code[i + 1] + if (currentTokenType === 'code') { + if (currentChar === '/' && nextChar === '*') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'multi-line-comment' + } else if (currentChar === '/' && nextChar === '/') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'single-line-comment' + } else if (currentChar === '"') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'string' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'multi-line-comment') { + if (currentChar === '*' && nextChar === '/') { + tokens.push([tokenSoFar + '*/', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + i++ + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'single-line-comment') { + if (currentChar === '\n') { + tokens.push([tokenSoFar + '\n', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'string') { + if (currentChar === '"' && lastChar !== '\\') { + tokens.push([tokenSoFar + '"', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } + } + // Now we have an array of tokens, where every other token is a comment or string, and the others are code. + tokens.push([tokenSoFar, currentTokenType]) + // Now we can remove the keyword tokens that we don't want to keep. The always have spaces around them, so nothing fancy is needed. + const syntaxTokensToRemove = options.tokensToRemove || + ['protected', 'private', 'public', 'final', 'abstract', 'synchronized', 'volatile', 'transient', 'native', 'strictfp'] + + for (const entry of tokens) { + if (entry[1] === 'code') { + for (const forRemoval of syntaxTokensToRemove) { + entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '') + } + } + } + // Now we can replace some user specified tokens with other tokens. 
Useful for renaming variables + if (options.replacements) { + for (const entry of tokens) { + if (entry[1] === 'code') { + for (const [old, now] of options.replacements) { + entry[0] = entry[0].replaceAll(old, now) + } + } + } + } + + // First, make a new set of tokens, removing comments if the user wants + let newTokens = [] + for (const [tokenStr, tokenType] of tokens) { + if (options.removeComments && (tokenType === 'multi-line-comment' || tokenType === 'single-line-comment')) { + continue + } + newTokens.push([tokenStr, tokenType]) + } + // update the newTokens to merge adjacent code tokens (needed for correct space handling) + for (let i = 0; i < newTokens.length - 1; i++) { + const [tokenStr, tokenType] = newTokens[i] + const [nextTokenStr, nextTokenType] = newTokens[i + 1] + if (tokenType === 'code' && nextTokenType === 'code') { + newTokens[i + 1][0] = tokenStr + nextTokenStr + newTokens[i][0] = '' + } + } + newTokens = newTokens.filter(([tokenStr, tokenType]) => tokenStr !== '') + + // Now iterate through the new tokens and remove code with empty space lines + let result = '' + for (let i = 0; i < newTokens.length; i++) { + const [tokenStr, tokenType] = newTokens[i] + if (tokenType === 'code') { + const newStrLines = [] + for (const line of tokenStr.split('\n')) { + if (line.trim() === '') continue + newStrLines.push(line) + } + const now = newStrLines.join('\n') + result += now + } else { + result += tokenStr + } + } + return result +} + +function stripPHP (code, options = {}) { + // First, we need to "tokenize" the code, by splitting it into 3 types of data: comments, strings, and code. + const tokens = [] + let tokenSoFar = '' + // 'code' or 'multi-line-comment', 'single-line-comment' or 'double-quote-string', 'single-quote-string', 'heredoc-string', 'nowdoc-string' + let currentTokenType = 'code' + let currentTokenData = '' + for (let i = 0; i < code.length; i++) { + const lastChar = code[i - 1] + const currentChar = code[i] + const nextChar = code[i + 1] + const nextNextChar = code[i + 2] + if (currentTokenType === 'code') { + if (currentChar === '/' && nextChar === '*') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'multi-line-comment' + } else if (currentChar === '/' && nextChar === '/') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'single-line-comment' + } else if (currentChar === '"') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'double-quote-string' + } else if (currentChar === "'") { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'single-quote-string' + } else if (currentChar === '<' && nextChar === '<' && nextNextChar === '<') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + nextChar + nextNextChar + i += 2 + const end = code.indexOf('\n', i) + currentTokenData = code.substring(i, end).trim() + tokenSoFar += currentTokenData + if (currentTokenData.startsWith("'")) { + currentTokenType = 'nowdoc-string' + currentTokenData = currentTokenData.slice(1, -1) + } else { + currentTokenType = 'heredoc-string' + } + i = end + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'multi-line-comment') { + if (currentChar === '*' && nextChar === '/') { + tokens.push([tokenSoFar + '*/', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + i++ + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 
'single-line-comment') { + if (currentChar === '\n') { + tokens.push([tokenSoFar + '\n', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'double-quote-string' || currentTokenType === 'single-quote-string') { + if (currentChar === (currentTokenType === 'double-quote-string' ? '"' : "'") && lastChar !== '\\') { + tokens.push([tokenSoFar + currentChar, currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'heredoc-string' || currentTokenType === 'nowdoc-string') { + if (code.startsWith(currentTokenData, i) && (code[i + currentTokenData.length] === '\n' || code[i + currentTokenData.length] === ';')) { + tokenSoFar += currentTokenData + i += currentTokenData.length - 1 + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } + } + tokens.push([tokenSoFar, currentTokenType]) + // Now we can remove the keyword tokens that are not important for abstractly understanding the program + const syntaxTokensToRemove = options.tokensToRemove || ['public', 'private', 'protected', 'final', 'readonly'] + for (const entry of tokens) { + if (entry[1] === 'code') { + for (const forRemoval of syntaxTokensToRemove) { + entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '') + } + } + } + // Now we can replace some user specified tokens with other tokens. Useful for renaming variables + if (options.replacements) { + for (const entry of tokens) { + if (entry[1] === 'code') { + for (const [old, now] of options.replacements) { + entry[0] = entry[0].replaceAll(old, now) + } + } + } + } + + // First, make a new set of tokens, removing comments if the user wants + let newTokens = [] + for (const [tokenStr, tokenType] of tokens) { + if (options.removeComments && (tokenType === 'multi-line-comment' || tokenType === 'single-line-comment')) { + continue + } + newTokens.push([tokenStr, tokenType]) + } + // update the newTokens to merge adjacent code tokens (needed for correct space handling) + for (let i = 0; i < newTokens.length - 1; i++) { + const [tokenStr, tokenType] = newTokens[i] + const [nextTokenStr, nextTokenType] = newTokens[i + 1] + if (tokenType === 'code' && nextTokenType === 'code') { + newTokens[i + 1][0] = tokenStr + nextTokenStr + newTokens[i][0] = '' + } + } + newTokens = newTokens.filter(([tokenStr, tokenType]) => tokenStr !== '') + + // Now iterate through the new tokens and remove code with empty space lines + let result = '' + for (let i = 0; i < newTokens.length; i++) { + const [tokenStr, tokenType] = newTokens[i] + if (tokenType === 'code') { + const newStrLines = [] + for (const line of tokenStr.split('\n')) { + if (line.trim() === '') continue + newStrLines.push(line) + } + const now = newStrLines.join('\n') + result += now + } else { + result += tokenStr + } + } + return result +} + +function stripGo (code, options) { + const tokens = [] + let tokenSoFar = '' + let currentTokenType = 'code' + for (let i = 0; i < code.length; i++) { + const lastChar = code[i - 1] + const currentChar = code[i] + const nextChar = code[i + 1] + if (currentTokenType === 'code') { + if (currentChar === '/' && nextChar === '*') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'multi-line-comment' + } else if (currentChar === '/' && nextChar === '/') { + tokens.push([tokenSoFar, currentTokenType]) + 
tokenSoFar = currentChar + currentTokenType = 'single-line-comment' + } else if (currentChar === '"') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'double-quote-string' + } else if (currentChar === "'") { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'single-quote-string' + } else if (currentChar === '`') { + tokens.push([tokenSoFar, currentTokenType]) + tokenSoFar = currentChar + currentTokenType = 'raw-string' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'multi-line-comment') { + if (currentChar === '*' && nextChar === '/') { + tokens.push([tokenSoFar + '*/', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + i++ + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'single-line-comment') { + if (currentChar === '\n') { + tokens.push([tokenSoFar + '\n', currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'double-quote-string' || currentTokenType === 'single-quote-string') { + if (currentChar === (currentTokenType === 'double-quote-string' ? '"' : "'") && lastChar !== '\\') { + tokens.push([tokenSoFar + currentChar, currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } else if (currentTokenType === 'raw-string') { + if (currentChar === '`') { + tokens.push([tokenSoFar + currentChar, currentTokenType]) + tokenSoFar = '' + currentTokenType = 'code' + } else { + tokenSoFar += currentChar + } + } + } + // Go doesn't have a lot of syntax tokens that can be removed, so we'll just remove comments and whitespace + tokens.push([tokenSoFar, currentTokenType]) + // Now we can replace some user specified tokens with other tokens. 
Useful for renaming variables + if (options.replacements) { + for (const entry of tokens) { + if (entry[1] === 'code') { + for (const [old, now] of options.replacements) { + entry[0] = entry[0].replaceAll(old, now) + } + } + } + } + + // First, make a new set of tokens, removing comments if the user wants + let newTokens = [] + for (const [tokenStr, tokenType] of tokens) { + if (options.removeComments && (tokenType === 'multi-line-comment' || tokenType === 'single-line-comment')) { + continue + } + newTokens.push([tokenStr, tokenType]) + } + // update the newTokens to merge adjacent code tokens (needed for correct space handling) + for (let i = 0; i < newTokens.length - 1; i++) { + const [tokenStr, tokenType] = newTokens[i] + const [nextTokenStr, nextTokenType] = newTokens[i + 1] + if (tokenType === 'code' && nextTokenType === 'code') { + newTokens[i + 1][0] = tokenStr + nextTokenStr + newTokens[i][0] = '' + } + } + newTokens = newTokens.filter(([tokenStr, tokenType]) => tokenStr !== '') + + // Now iterate through the new tokens and remove code with empty space lines + let result = '' + for (let i = 0; i < newTokens.length; i++) { + const [tokenStr, tokenType] = newTokens[i] + if (tokenType === 'code') { + const newStrLines = [] + for (const line of tokenStr.split('\n')) { + if (line.trim() === '') continue + newStrLines.push(line) + } + const now = newStrLines.join('\n') + result += now + } else { + result += tokenStr + } + } + return result +} + +module.exports = { stripJava, stripPHP, stripGo } diff --git a/src/tools/viz.js b/src/tools/viz.js new file mode 100644 index 0000000..9e184ae --- /dev/null +++ b/src/tools/viz.js @@ -0,0 +1,111 @@ +const { CompletionService } = require('../CompletionService') + +function makeVizHtml (data) { + return ` + + + + + + LLM Output Viz + + + + +
+    [template markup elided: a side-by-side layout with a "System Prompt" panel, a "User Prompt" panel, and one panel per entry of data.models showing the model's name and output]
+ + + + + ` +} + +async function makeVizForPrompt (system, user, models) { + const service = new CompletionService() + const data = { models: [], outputs: {} } + for (const model of models) { + const { text } = await service.requestCompletion(model, system, user) + switch (model) { + case 'gpt-3.5-turbo-16k': + data.models.push(['GPT-3.5 Turbo 16k', '3516turbo']) + data.outputs['3516turbo'] = text + break + case 'gpt-3.5-turbo': + data.models.push(['GPT-3.5 Turbo', '35turbo']) + data.outputs['35turbo'] = text + break + case 'gpt-4': + data.models.push(['GPT-4', 'gpt4']) + data.outputs.gpt4 = text + break + case 'gpt-4-turbo-preview': + data.models.push(['GPT-4 Turbo Preview', 'gpt4turbo']) + data.outputs.gpt4turbo = text + break + case 'gemini-1.0-pro': + data.models.push(['Gemini 1.0 Pro', 'gemini']) + data.outputs.gemini = text + break + default: + data.models.push([model, model]) + data.outputs[model] = text + } + } + data.system = system + data.user = user + return makeVizHtml(data) +} + +module.exports = { makeVizForPrompt } diff --git a/test/tools.js b/test/tools.js new file mode 100644 index 0000000..26357d7 --- /dev/null +++ b/test/tools.js @@ -0,0 +1,21 @@ +const { tools } = require('langxlang') + +async function testViz () { + const viz = await tools.makeVizForPrompt('', 'Why is the sky blue?', ['gpt-3.5-turbo']) + console.log(viz) +} + +async function testCodebase () { + const files = await tools.collectGithubRepoFiles('extremeheat/node-basic-args', { + extension: '.js', + matching: [/examples/] + }) + console.log(files) +} + +async function main () { + await testViz() + await testCodebase() +} + +main() diff --git a/test/viz.js b/test/viz.js deleted file mode 100644 index 23bb006..0000000 --- a/test/viz.js +++ /dev/null @@ -1,10 +0,0 @@ -const { tools } = require('langxlang') -const fs = require('fs') -const path = require('path') - -async function main () { - // const all = ['gpt-3.5-turbo-16k', 'gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo-preview', 'gemini-1.0-pro'] - const viz = await tools.makeVizForPrompt('', 'Why is the sky blue?', ['gpt-3.5-turbo']) - fs.writeFileSync(path.join(__dirname, 'viz.html'), viz) -} -main()
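
Usage sketch for the new `collectGithubRepoFiles` tool (illustration only, not part of the patch; the repository name and matching rule below are placeholders):

const { tools } = require('langxlang')

async function main () {
  const files = await tools.collectGithubRepoFiles('some-user/some-repo', {
    extension: '.js',
    branch: 'master', // 'master' is the default when no branch is given
    // Function form of `matching`: skip anything under a test/ folder
    matching: (relPath) => !relPath.includes('/test/')
  })
  // Each entry is [absolutePath, relativePath, contents]
  for (const [, relPath, contents] of files) {
    console.log(relPath, '-', contents.length, 'chars')
  }
}

main()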
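A quick sketch of the new `stripJava` helper on an inline snippet. The stripping functions are not re-exported from `tools` in this patch, so this assumes requiring the module by its internal path from the repository root:

// Assumes this script runs from the repo root; adjust the require path as needed.
const { stripJava } = require('./src/tools/stripping')

const javaSource = `
// A tiny example class
public final class Greeter {
  private final String name = "world";

  public String greet() {
    /* build the greeting */
    return "Hello, " + this.name;
  }
}
`

// With removeComments set, both comments are dropped and the public/final/private
// modifiers are stripped, leaving just the class structure and logic.
console.log(stripJava(javaSource, { removeComments: true }))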