Skip to content

Commit

Permalink
Add a collectGithubRepoFiles tool (#8)
Browse files Browse the repository at this point in the history
* start work on codebase stripping

* tools: add github repo lookup method, updates to code stripper

* Add a `collectGithubRepoFiles` tool

* update codebase.js

* remove debug logging
  • Loading branch information
extremeheat authored Feb 25, 2024
1 parent 6364655 commit aa50927
Show file tree
Hide file tree
Showing 9 changed files with 598 additions and 254 deletions.
140 changes: 6 additions & 134 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,135 +1,7 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Uncomment the `public` line below if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
playground
node_modules
package-lock.json
*.key

test/*.html
playground
test/*.html
src/tools/repos
__*
*.key
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Build Status](https://github.com/extremeheat/LXL/actions/workflows/ci.yml/badge.svg)](https://github.com/extremeheat/LXL/actions/workflows/)
[![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/extremeheat/LXL)

LangXLang (LXL), a simple wrapper for Node.js to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support.
LangXLang (LXL), a Node.js library to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support.

Supported models are:
* OpenAI: `gpt-3.5-turbo-16k`, `gpt-3.5-turbo`, `gpt-4`, `gpt-4-turbo-preview`
Expand Down
11 changes: 11 additions & 0 deletions src/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ declare module 'langxlang' {
}

interface Func {
// If default is not provided, the argument is required.
Arg(options: { type: string[], description: string, example?: string, default?: any, required?: boolean }): string
Arg(options: { type: object, description: string, example?: string, default?: any, required?: boolean }): object
Arg<T>(options: { type: T, description: string, example?: string, default?: any, required?: boolean }): T
Expand All @@ -38,6 +39,16 @@ declare module 'langxlang' {
interface Tools {
  // Generate HTML that shows side-by-side outputs for the system/user prompt across different models.
  makeVizForPrompt(systemPrompt: string, userPrompt: string, models: Model[]): Promise<string>
  // Returns a list of [absolutePath, relativePath, contents] entries, one per matching file in a GitHub repo.
  // `repo` is the `owner/name` part of the GitHub URL.
  collectGithubRepoFiles(repo: string, options: {
    // What extension of files in the repo to include
    extension?: string,
    // The branch to use
    branch?: string,
    // Either a function that returns true if the file should be included
    // or an array of regexes of which one needs to match for inclusion.
    // The function type is parenthesized on purpose: without the parentheses the
    // union binds to the return type, i.e. "a function returning boolean | RegExp[]".
    matching?: ((fileName: string) => boolean) | RegExp[]
  }): Promise<[absolutePath: string, relativePath: string, contents: string][]>
}

const tools: Tools
Expand Down
114 changes: 5 additions & 109 deletions src/tools.js
Original file line number Diff line number Diff line change
@@ -1,111 +1,7 @@
const { CompletionService } = require('./CompletionService')
const codebase = require('./tools/codebase')
const viz = require('./tools/viz')

function makeVizHtml (data) {
return `
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Output Viz</title>
<style>
pre {
max-width: 25vw;
overflow: auto;
}
h3 {
background-color: lightcyan;
margin-top: 12px;
}
#grid {
/* margin: 1%; */
.correct h3 {
background-color: lightgreen;
}
}
#grid div {
margin-right: 2%;
}
</style>
</head>
<body>
<button id="wordwrap">Toggle Word Wrap</button>
<div id="grid" style='display: flex; flex-direction: row;'>
<div>
<h3>System Prompt</h3>
<pre id="psys">SYS PROMPT</pre>
</div>
<div>
<h3>User Prompt</h3>
<pre id="pusr">USR PROMPT</pre>
</div>
${
data.models.map(([modelName, modelId]) =>
`<div><h3>${modelName}</h3><pre id="presp${modelId}">MODEL OUTPUT</pre></div>`).join('\n')
}
</div>
<script>
function toggleWordWrap () {
const $pre = document.querySelectorAll('pre');
for (const $p of $pre) {
$p.style.whiteSpace = $p.style.whiteSpace === 'pre-wrap' ? 'pre' : 'pre-wrap';
}
}
wordwrap.onclick = toggleWordWrap;
toggleWordWrap();
const $psys = document.getElementById('psys');
const $pusr = document.getElementById('pusr');
const data = ${JSON.stringify(data)};
const outputs = data.outputs;
if ($psys) $psys.textContent = data.system;
if ($pusr) $pusr.textContent = data.user;
for (const [modelName, modelId] of data.models) {
const $presp = document.getElementById('presp' + modelId);
if ($presp) $presp.textContent = outputs[modelId];
}
</script>
</body>
</html>
`
module.exports = {
makeVizForPrompt: viz.makeVizForPrompt,
collectGithubRepoFiles: codebase.collectGithubRepoFiles
}

/**
 * Requests a completion from every model in `models` (one after another) for the
 * given system/user prompt and renders the collected outputs through makeVizHtml.
 *
 * @param system The system prompt sent to each model.
 * @param user The user prompt sent to each model.
 * @param models Model names to query, in display order.
 * @returns The HTML string produced by makeVizHtml.
 */
async function makeVizForPrompt (system, user, models) {
  // Known model name -> [column title, id used for the output slot].
  // Models not listed here fall back to using their own name for both.
  const knownModels = new Map([
    ['gpt-3.5-turbo-16k', ['GPT-3.5 Turbo 16k', '3516turbo']],
    ['gpt-3.5-turbo', ['GPT-3.5 Turbo', '35turbo']],
    ['gpt-4', ['GPT-4', 'gpt4']],
    ['gpt-4-turbo-preview', ['GPT-4 Turbo Preview', 'gpt4turbo']],
    ['gemini-1.0-pro', ['Gemini 1.0 Pro', 'gemini']]
  ])

  const service = new CompletionService()
  const data = { models: [], outputs: {} }
  for (const model of models) {
    const completion = await service.requestCompletion(model, system, user)
    const [title, slotId] = knownModels.get(model) || [model, model]
    data.models.push([title, slotId])
    data.outputs[slotId] = completion.text
  }
  data.system = system
  data.user = user
  return makeVizHtml(data)
}

module.exports = { makeVizForPrompt }
64 changes: 64 additions & 0 deletions src/tools/codebase.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
const fs = require('fs')
const cp = require('child_process')
const { join } = require('path')

/**
 * Converts every backslash in `path` to a forward slash so that paths
 * look the same regardless of the platform's separator.
 */
function fixSeparator (path) {
  return path.split('\\').join('/')
}

/**
 * Recursively lists every non-directory entry below `folder`.
 *
 * @param folder Directory to walk.
 * @returns Paths (folder joined with each entry name) with backslashes
 *          replaced by forward slashes.
 */
function getAllFilesIn (folder) {
  const collected = []
  for (const dirent of fs.readdirSync(folder, { withFileTypes: true })) {
    const entryPath = join(folder, dirent.name)
    if (!dirent.isDirectory()) {
      collected.push(entryPath)
      continue
    }
    // Descend into the sub-directory and append everything found there
    for (const nested of getAllFilesIn(entryPath)) {
      collected.push(nested)
    }
  }
  return collected.map(p => fixSeparator(p))
}

/**
 * Clones a GitHub repo (or reuses the clone cached under `./repos`), updates it to
 * the requested branch and returns the files that pass the given filters.
 * Runs synchronously: git commands and file reads block until complete.
 *
 * @param repo The `owner/name` part of the GitHub URL.
 * @param options Optional filters:
 *   - `extension`: only keep files whose path ends with this string
 *   - `branch`: branch to check out and pull (defaults to 'master')
 *   - `matching`: either a predicate called with the repo-relative path, or an
 *     array of regexes of which at least one must match the repo-relative path
 * @returns An array of `[absolutePath, relativePath, contents]` entries, where
 *   `relativePath` starts with '/' and `contents` is the trimmed UTF-8 file text.
 * @throws If any git command fails or a file cannot be read.
 */
function collectGithubRepoFiles (repo, options = {}) {
  const extension = options.extension
  const branch = options.branch || 'master'
  // Clones live inside a "repos" folder next to this file, one folder per repo
  const safeName = repo.replace(/\//g, ',')
  const reposDir = join(__dirname, 'repos')
  const repoPath = join(reposDir, safeName)
  fs.mkdirSync(reposDir, { recursive: true })
  // execFileSync passes arguments straight to git without a shell, so shell
  // metacharacters in `repo` or `branch` cannot inject additional commands.
  if (!fs.existsSync(repoPath)) {
    cp.execFileSync('git', ['clone', `https://github.com/${repo}.git`, safeName], { cwd: reposDir })
  }
  // Fetch first so the branch exists locally as a remote-tracking ref, then switch to it
  // BEFORE pulling: pulling first would merge `branch` into whatever branch is checked out.
  cp.execFileSync('git', ['fetch', 'origin'], { cwd: repoPath })
  cp.execFileSync('git', ['checkout', branch], { cwd: repoPath })
  cp.execFileSync('git', ['pull', 'origin', branch], { cwd: repoPath })

  // Now collect all the files inside repoPath, like `tree`
  const repoPrefix = fixSeparator(repoPath)
  const allFiles = getAllFilesIn(repoPath)
    .map(f => [f, f.replace(repoPrefix, '')])

  // Now figure out the relevant files
  const relevantFiles = []
  for (const [file, relFile] of allFiles) {
    // Git's own bookkeeping is not part of the repository contents
    if (relFile.startsWith('/.git/')) {
      continue
    }
    if (extension && !file.endsWith(extension)) {
      continue
    }
    if (options.matching) {
      if (typeof options.matching === 'function') {
        if (!options.matching(relFile)) {
          continue
        }
      } else if (!options.matching.some(m => relFile.match(m))) {
        continue
      }
    }
    relevantFiles.push([file, relFile])
  }
  return relevantFiles.map(([abs, rel]) => [abs, rel, fs.readFileSync(abs, 'utf8').trim()])
}

module.exports = { collectGithubRepoFiles }
Loading

0 comments on commit aa50927

Please sign in to comment.