Skip to content

Commit

Permalink
Update Code (#9)
Browse files Browse the repository at this point in the history
* Added jupyterlab with python notebook for testing function chat models.

* Add jupyterlab to dev dependencies

* Remove extra cli.py

* Fix typo in filename

* Add inputs parameter to ParsedCode

* Fixes

* Added outputs and all tests are passing

* Made progress on generating tests

Updates to the add-test openai func -- still not working

* Add-tests functionality is now working (repo-gpt setup)

* Add test generation

* Add code gen functionality

* Add dummy OpenAI API key so imports work for tests

* Use union for typing optional

* Clean up

* Add readme docs:
  • Loading branch information
shruti222patel authored Aug 17, 2023
1 parent 3fd72cd commit 28baf8a
Show file tree
Hide file tree
Showing 25 changed files with 5,049 additions and 402 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@ jobs:

- name: Run tests
run: poetry run pytest
env:
OPENAI_API_KEY: ""
Binary file added .repo_gpt/code_embeddings.pkl
Binary file not shown.
1,088 changes: 1,088 additions & 0 deletions How_to_call_functions_with_chat_models.ipynb

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ repo-gpt setup --root_path ./my_project
repo-gpt search "extract handler"
repo-gpt query "What does the function `calculate_sum` do?"
repo-gpt analyze ./my_project/main.py
repo-gpt add-test function_name --test_save_file_path $PWD/test.py --testing_package pytest
```

## Contributing
Expand Down Expand Up @@ -116,6 +117,13 @@ Here are the steps to set up your development environment:
poetry run python cli.py search <text/question>
```

6. Generate tests for a function:
Note: this assumes the function name is unique in the codebase; otherwise, it will pick the first function it finds with that name.

```shell
poetry run python cli.py add-test <unique function name> --test_save_file_path <absolute filepath to add tests to> --testing_package <testing package to use e.g. pytest>
```

### Debugging

You can view the output of the `code_embeddings.pkl` using the following command:
Expand All @@ -128,11 +136,12 @@ pd.read_pickle('./.repo_gpt/code_embeddings.pkl', compression='infer')

Here are the improvements we are currently considering:

- [ ] Publishing to PyPi
- [ ] Test suite addition
- [X] Publishing to PyPi
- [X] Test suite addition
- [X] Add CI/CD
- [ ] Prettify output
- [X] Prettify output
- [ ] Add readme section about how folks can contribute parsers for their own languages
- [ ] Save # of tokens each code snippet has so we can ensure we don't pass too many tokens to GPT
- [ ] Add SQL file handler
- [X] Add SQL file handler
- [ ] Add DBT file handler -- this may be a break in pattern as we'd want to use the manifest.json file
- [ ] Create VSCode extension
3,082 changes: 3,082 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,17 @@ tqdm = "^4.65.0"
pathspec = "^0.11.1"
openai = "^0.27.8"
rich = "^13.4.1"
redbaron = "^0.9.2"
tenacity = "^8.2.2"
sqlglot = "^16.4.0"
tree-sitter-languages = "^1.7.0"
pygments = "^2.16.1"
tiktoken = "^0.4.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.3.2"
pre-commit = "^3.3.2"
exceptiongroup = { version="^1.1.1", markers="python_version <= '3.10'" }
jupyterlab = "^4.0.4"

[tool.poetry.scripts]
repo-gpt = "repo_gpt.cli:main"
Expand Down
1 change: 1 addition & 0 deletions src/repo_gpt/add_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from repo_gpt import search_service
123 changes: 118 additions & 5 deletions src/repo_gpt/cli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#!./venv/bin/python

import argparse
import os
from pathlib import Path

from .code_manager.code_manager import CodeManager
from .serach_service import SearchService
from .openai_service import OpenAIService
from .search_service import SearchService
from .test_generator import TestGenerator

CODE_EMBEDDING_FILE_PATH = str(Path.cwd() / ".repo_gpt" / "code_embeddings.pkl")

Expand Down Expand Up @@ -64,29 +67,139 @@ def print_help(*args):
default=CODE_EMBEDDING_FILE_PATH,
)

# Sub-command to add tests for an existing function
add_test = subparsers.add_parser("add-test", help="Add tests for existing function")
add_test.add_argument(
"function_name", type=str, help="Name of the function you'd like to test"
)
add_test.add_argument(
"--file_name",
type=str,
help="Name of the file the function is found in. This is helpful if there are many functions with the same "
"name. If this isn't specified, I assume the function name is unique and I'll create tests for the first "
"matching function I find. When a file_name is passed, I will assume the function name is unique in the "
"file, and write tests for the first function I find with the same name in the file.",
default="",
)
add_test.add_argument(
"--test_save_file_path",
type=str,
help="Filepath to save the generated tests to",
)

add_test.add_argument(
"--testing_package",
type=str,
help="Package/library GPT should use to write tests (e.g. pytest, unittest, etc.)",
)
add_test.add_argument(
"--pickle_path",
type=str,
help="Path of the pickled DataFrame to search in",
default=CODE_EMBEDDING_FILE_PATH,
)

parser_help = subparsers.add_parser("help", help="Show this help message")
parser_help.set_defaults(func=print_help)

args = parser.parse_args()

# Services
openai_service = OpenAIService()

search_service = (
SearchService(args.pickle_path, openai_service)
if args.command != "setup"
else None
)

if args.command == "setup":
root_path = Path(args.root_path)
output_path = Path(args.output_path)
manager = CodeManager(root_path, output_path)
manager = CodeManager(output_path, root_path)
manager.setup()
elif args.command == "search":
search_service = SearchService(args.pickle_path)
# search_service.simple_search(args.query) # simple search
search_service.semantic_search(args.query) # semantic search
elif args.command == "query":
search_service = SearchService(args.pickle_path)
search_service.question_answer(args.question)
elif args.command == "analyze":
search_service = SearchService(args.pickle_path)
search_service.analyze_file(args.file_path)
elif args.command == "add-test":
code_manager = CodeManager(args.pickle_path)
# Look for the function name in the embedding file
add_tests(
search_service,
code_manager,
args.function_name,
args.test_save_file_path,
args.testing_package,
)
else:
parser.print_help()


def add_tests(
    search_service,
    code_manager,
    function_name,
    test_save_file_path,
    testing_package,
):
    """Generate unit tests for ``function_name`` and append them to a file.

    The function is located through ``search_service``, the file it lives in
    is handed to ``code_manager`` to be re-parsed, the search data is
    refreshed, and the code of the first match is passed to ``TestGenerator``.

    Args:
        search_service: Object exposing ``find_function_match(name)``, which
            returns a ``(function_df, class_df)`` pair, and ``refresh_df()``.
        code_manager: Object exposing
            ``parse_code_and_save_embeddings(checksum_to_filepath)``.
        function_name: Name of the function to test. When several rows
            match, the first one is used.
        test_save_file_path: File the generated tests are appended to, or
            ``None`` to generate the tests without writing them anywhere.
        testing_package: Test library forwarded to ``TestGenerator`` as
            ``unit_test_package`` (e.g. ``pytest``).

    Returns:
        None. Problems (directory given as target, function not found) are
        reported on stdout and end the call early.
    """
    # ``os.path.isdir(None)`` raises TypeError, so only inspect the path when
    # one was actually supplied (the CLI option has no default).
    if test_save_file_path is not None and os.path.isdir(test_save_file_path):
        print(
            f"Error: {test_save_file_path} is a directory. Please specify a file path."
        )
        return

    # Find the function via the search service
    function_to_test_df, _ = search_service.find_function_match(function_name)
    if function_to_test_df.empty:
        print(f"Function {function_name} not found.")
        return

    # Get the latest version of the function: re-parse just the file that
    # holds the first match, keyed by the checksum stored alongside it.
    matched_row = function_to_test_df.iloc[0]
    checksum_filepath_dict = {matched_row["file_checksum"]: matched_row["filepath"]}
    code_manager.parse_code_and_save_embeddings(checksum_filepath_dict)

    # Find the function again after refreshing the code & embeddings; it may
    # have been renamed or removed since the stored data was built.
    search_service.refresh_df()
    function_to_test_df, _ = search_service.find_function_match(function_name)
    if function_to_test_df.empty:
        print(f"Function {function_name} not found.")
        return

    # TODO: add language & test framework from config file
    test_generator = TestGenerator(
        function_to_test_df.iloc[0]["code"],
        language="python",
        unit_test_package=testing_package,
        debug=True,
    )
    unit_tests = test_generator.unit_tests_from_function()

    if test_save_file_path is None:
        # Nothing to persist; do not claim that a file is being written.
        return

    print(f"Writing generated unit_tests to {test_save_file_path}...")
    # Append so tests already present in the target file are kept.
    with open(test_save_file_path, "a") as f:
        f.write(unit_tests)


if __name__ == "__main__":
main()
69 changes: 43 additions & 26 deletions src/repo_gpt/code_manager/code_extractor.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
import hashlib
import os
from enum import Enum
from pathlib import Path
from typing import List, Type

from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
from pygments.lexers import ClassNotFound, get_lexer_for_filename

from ..file_handler.abstract_handler import CodeBlock, FileHandler
from ..file_handler.abstract_handler import FileHandler, ParsedCode
from ..file_handler.generic_code_file_handler import PHPFileHandler, PythonFileHandler
from ..file_handler.sql_file_handler import SqlFileHandler
from ..utils import logger


class LanguageHandler(Enum):
    """Named file-handler classes, one member per language.

    Each member's ``value`` is the handler class itself (not an instance).
    """

    # Handlers are listed in the same order as the original declaration;
    # Enum iteration order follows definition order.
    PYTHON = PythonFileHandler
    SQL = SqlFileHandler
    PHP = PHPFileHandler


class CodeExtractor:
HANDLER_MAPPING = {
".py": PythonFileHandler,
Expand Down Expand Up @@ -44,6 +51,21 @@ def get_handler(self, filepath: str) -> Type[FileHandler]:
)
return handler_class

def is_file_parsable(self, filepath: str) -> bool:
    """Tell whether ``filepath`` should be parsed.

    A file qualifies when ``get_handler`` returns a handler for it and the
    patterns returned by ``get_gitignore`` do not match it.

    Args:
        filepath: Path of the candidate file, as it should be matched
            against the ignore patterns.

    Returns:
        ``True`` if the file has a handler and is not ignored, else ``False``.
    """
    # Build the ignore spec first, then resolve the handler, mirroring the
    # order in which the two lookups were originally performed.
    ignore_spec = PathSpec.from_lines(GitWildMatchPattern, self.get_gitignore())
    handler = self.get_handler(filepath)
    return handler is not None and not ignore_spec.match_file(filepath)

def is_dir_parsable(self, dirpath: str) -> bool:
    """Tell whether the directory ``dirpath`` should be descended into.

    Args:
        dirpath: Path of the candidate directory, as it should be matched
            against the patterns returned by ``get_gitignore``.

    Returns:
        ``False`` when an ignore pattern matches ``dirpath``, else ``True``.
    """
    ignore_spec = PathSpec.from_lines(GitWildMatchPattern, self.get_gitignore())
    return not ignore_spec.match_file(dirpath)

def generate_md5(self, filepath: str, chunk_size: int = 4096) -> str:
hash = hashlib.md5()
with open(filepath, "rb") as f:
Expand All @@ -55,24 +77,18 @@ def generate_md5(self, filepath: str, chunk_size: int = 4096) -> str:

def extract_code_files(self) -> List[str]:
    """Collect every parsable file underneath ``self.code_root_path``.

    Directories rejected by ``is_dir_parsable`` are pruned from the walk, and
    each remaining file is offered to ``is_file_parsable``. Both checks
    receive paths relative to ``self.code_root_path`` so that path-anchored
    ignore patterns (e.g. ``src/generated/*.py``) can match; passing only the
    bare file name would make such patterns silently ineffective.

    Returns:
        Paths of the accepted files, each formed by joining
        ``self.code_root_path`` with the file's relative path.
    """
    code_files = []

    for root, dirs, files in os.walk(self.code_root_path):
        root_path = Path(root).relative_to(self.code_root_path)

        # Skip directories listed in .gitignore. Assigning to the slice
        # mutates the list os.walk iterates, so ignored trees are not visited.
        dirs[:] = [
            d for d in dirs if self.is_dir_parsable(os.path.join(root_path, d))
        ]

        for file in files:
            # Match on the root-relative path, not just the file name.
            if self.is_file_parsable(os.path.join(root_path, file)):
                file_path = root_path / file
                code_files.append(self.code_root_path / file_path)

    return code_files

def get_gitignore(self) -> List[str]:
Expand All @@ -83,12 +99,22 @@ def get_gitignore(self) -> List[str]:
else:
return []

def extract_functions(self, embedding_code_file_checksums: dict) -> List[CodeBlock]:
code_files = self.extract_code_files()
def extract_functions(
self, embedding_code_file_checksums: dict
) -> List[ParsedCode]:
code_files = (
self.extract_code_files()
if embedding_code_file_checksums is None
else embedding_code_file_checksums.values()
)
code_blocks = []
for code_filepath in code_files:
print(f"🟢 Processing {code_filepath}")
file_checksum = self.generate_md5(code_filepath)
if file_checksum in embedding_code_file_checksums:
if (
embedding_code_file_checksums is not None
and file_checksum in embedding_code_file_checksums
):
print(f"🟡 Skipping -- file unmodified {code_filepath}")
continue
file_code_blocks = self.extract_functions_from_file(
Expand All @@ -105,20 +131,11 @@ def extract_functions(self, embedding_code_file_checksums: dict) -> List[CodeBlo

def extract_functions_from_file(
self, filepath: str, file_checksum: str
) -> List[CodeBlock]:
) -> List[ParsedCode]:
handler = self.get_handler(filepath)
code_blocks = []
if handler:
parsed_code = handler().extract_code(filepath)
if parsed_code:
code_blocks = [
CodeBlock(
code=parsed.code,
code_type=parsed.code_type,
name=parsed.name,
filepath=filepath,
file_checksum=file_checksum,
)
for parsed in parsed_code
]
return code_blocks
for code in parsed_code:
code.filepath = filepath
code.file_checksum = file_checksum
return parsed_code
Loading

0 comments on commit 28baf8a

Please sign in to comment.