Update Code (#9)

* Added jupyterlab with python notebook for testing function chat models. * Add jupyterlab to dev dependencies * Remove extra cli.py * Fix typo in filename * Add inputs parameter to ParsedCode * Fixes * Added outputs and all tests are passing * Made progress on generating tests. Updates to the add-test openai func -- still not working * Add tests functionality is now working repo-gpt setup * Add test generation * Add code gen functionality * Add dummy OpenAI API key so imports work for tests * Use union for typing optional * Clean up * Add readme docs:
shruti222patel · Aug 17, 2023 · 28baf8a · 28baf8a
1 parent 3fd72cd
commit 28baf8a
Show file tree

Hide file tree

Showing 25 changed files with 5,049 additions and 402 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -40,3 +40,5 @@ jobs:
 
       - name: Run tests
         run: poetry run pytest
+        env:
+          OPENAI_API_KEY: ""
diff --git a/.repo_gpt/code_embeddings.pkl b/.repo_gpt/code_embeddings.pkl
diff --git a/How_to_call_functions_with_chat_models.ipynb b/How_to_call_functions_with_chat_models.ipynb
diff --git a/README.md b/README.md
@@ -78,6 +78,7 @@ repo-gpt setup --root_path ./my_project
 repo-gpt search "extract handler"
 repo-gpt query "What does the function `calculate_sum` do?"
 repo-gpt analyze ./my_project/main.py
+repo-gpt add-test function_name --test_save_file_path $PWD/test.py --testing_package pytest
 ```
 
 ## Contributing
@@ -116,6 +117,13 @@ Here are the steps to set up your development environment:
    poetry run python cli.py search <text/question>
    ```
 
+6. Generate tests for a function:
+Note: this assumes the function name is unique in the codebase; otherwise, it will pick the first function it finds with that name.
+
+   ```shell
+   poetry run python cli.py add-test <unique function name> --test_save_file_path <absolute filepath to add tests to> --testing_package <testing package to use e.g. pytest>
+   ```
+
 ### Debugging
 
 You can view the output of the `code_embeddings.pkl` using the following command:
@@ -128,11 +136,12 @@ pd.read_pickle('./.repo_gpt/code_embeddings.pkl', compression='infer')
 
 Here are the improvements we are currently considering:
 
-- [ ] Publishing to PyPi
-- [ ] Test suite addition
+- [X] Publishing to PyPi
+- [X] Test suite addition
 - [X] Add CI/CD
-- [ ] Prettify output
+- [X] Prettify output
 - [ ] Add readme section about how folks can contribute parsers for their own languages
 - [ ] Save # of tokens each code snippet has so we can ensure we don't pass too many tokens to GPT
-- [ ] Add SQL file handler
+- [X] Add SQL file handler
 - [ ] Add DBT file handler -- this may be a break in pattern as we'd want to use the manifest.json file
+- [ ] Create VSCode extension
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,16 +29,17 @@ tqdm = "^4.65.0"
 pathspec = "^0.11.1"
 openai = "^0.27.8"
 rich = "^13.4.1"
-redbaron = "^0.9.2"
 tenacity = "^8.2.2"
 sqlglot = "^16.4.0"
 tree-sitter-languages = "^1.7.0"
 pygments = "^2.16.1"
+tiktoken = "^0.4.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.3.2"
 pre-commit = "^3.3.2"
 exceptiongroup = { version="^1.1.1", markers="python_version <= '3.10'" }
+jupyterlab = "^4.0.4"
 
 [tool.poetry.scripts]
 repo-gpt = "repo_gpt.cli:main"

diff --git a/src/repo_gpt/add_tests.py b/src/repo_gpt/add_tests.py
@@ -0,0 +1 @@
+from repo_gpt import search_service
diff --git a/src/repo_gpt/cli.py b/src/repo_gpt/cli.py
@@ -1,10 +1,13 @@
 #!./venv/bin/python
 
 import argparse
+import os
 from pathlib import Path
 
 from .code_manager.code_manager import CodeManager
-from .serach_service import SearchService
+from .openai_service import OpenAIService
+from .search_service import SearchService
+from .test_generator import TestGenerator
 
 CODE_EMBEDDING_FILE_PATH = str(Path.cwd() / ".repo_gpt" / "code_embeddings.pkl")
 
@@ -64,29 +67,139 @@ def print_help(*args):
         default=CODE_EMBEDDING_FILE_PATH,
     )
 
+    # Sub-command to add tests for an existing function
+    add_test = subparsers.add_parser("add-test", help="Add tests for existing function")
+    add_test.add_argument(
+        "function_name", type=str, help="Name of the function you'd like to test"
+    )
+    add_test.add_argument(
+        "--file_name",
+        type=str,
+        help="Name of the file the function is found in. This is helpful if there are many functions with the same "
+        "name. If this isn't specified, I assume the function name is unique and I'll create tests for the first "
+        "matching function I find. When a file_name is passed, I will assume the function name is unique in the "
+        "file, and write tests for the first function I find with the same name in the file.",
+        default="",
+    )
+    add_test.add_argument(
+        "--test_save_file_path",
+        type=str,
+        help="Filepath to save the generated tests to",
+    )
+
+    add_test.add_argument(
+        "--testing_package",
+        type=str,
+        help="Package/library GPT should use to write tests (e.g. pytest, unittest, etc.)",
+    )
+    add_test.add_argument(
+        "--pickle_path",
+        type=str,
+        help="Path of the pickled DataFrame to search in",
+        default=CODE_EMBEDDING_FILE_PATH,
+    )
+
     parser_help = subparsers.add_parser("help", help="Show this help message")
     parser_help.set_defaults(func=print_help)
 
     args = parser.parse_args()
 
+    # Services
+    openai_service = OpenAIService()
+
+    search_service = (
+        SearchService(args.pickle_path, openai_service)
+        if args.command != "setup"
+        else None
+    )
+
     if args.command == "setup":
         root_path = Path(args.root_path)
         output_path = Path(args.output_path)
-        manager = CodeManager(root_path, output_path)
+        manager = CodeManager(output_path, root_path)
         manager.setup()
     elif args.command == "search":
-        search_service = SearchService(args.pickle_path)
         # search_service.simple_search(args.query) # simple search
         search_service.semantic_search(args.query)  # semantic search
     elif args.command == "query":
-        search_service = SearchService(args.pickle_path)
         search_service.question_answer(args.question)
     elif args.command == "analyze":
-        search_service = SearchService(args.pickle_path)
         search_service.analyze_file(args.file_path)
+    elif args.command == "add-test":
+        code_manager = CodeManager(args.pickle_path)
+        # Look for the function name in the embedding file
+        add_tests(
+            search_service,
+            code_manager,
+            args.function_name,
+            args.test_save_file_path,
+            args.testing_package,
+        )
     else:
         parser.print_help()
 
 
+def add_tests(
+    search_service,
+    code_manager,
+    function_name,
+    test_save_file_path,
+    testing_package,
+):
+    # Check file path isn't a directory
+    if os.path.isdir(test_save_file_path):
+        print(
+            f"Error: {test_save_file_path} is a directory. Please specify a file path."
+        )
+        return
+    # Find the function via the search service
+    function_to_test_df, class_to_test_df = search_service.find_function_match(
+        function_name
+    )
+
+    if function_to_test_df.empty:
+        print(f"Function {function_name} not found.")
+        return
+
+    # Get the latest version of the function
+    checksum_filepath_dict = {
+        function_to_test_df.iloc[0]["file_checksum"]: function_to_test_df.iloc[0][
+            "filepath"
+        ]
+    }
+    code_manager.parse_code_and_save_embeddings(checksum_filepath_dict)
+
+    search_service.refresh_df()
+    # Find the function again after refreshing the code & embeddings
+    function_to_test_df, class_to_test_df = search_service.find_function_match(
+        function_name
+    )
+
+    if function_to_test_df.empty:
+        print(f"Function {function_name} not found.")
+        return
+
+    # Save gpt history
+    # Ask gpt to explain the function
+    test_generator = TestGenerator(
+        function_to_test_df.iloc[0]["code"],
+        language="python",
+        unit_test_package=testing_package,
+        debug=True,
+    )
+    unit_tests = test_generator.unit_tests_from_function()
+    # unit_tests = openai_service.unit_tests_from_function(
+    #     function_to_test_df.iloc[0]["code"],
+    #     unit_test_package=testing_package,
+    #     print_text=True,
+    # )  # TODO: add language & test framework from config file
+
+    print(f"Writing generated unit_tests to {test_save_file_path}...")
+    # Save code to file
+    if test_save_file_path is not None:
+        with open(test_save_file_path, "a") as f:
+            f.write(unit_tests)
+
+
 if __name__ == "__main__":
     main()
diff --git a/src/repo_gpt/code_manager/code_extractor.py b/src/repo_gpt/code_manager/code_extractor.py
@@ -1,18 +1,25 @@
 import hashlib
 import os
+from enum import Enum
 from pathlib import Path
 from typing import List, Type
 
 from pathspec import PathSpec
 from pathspec.patterns import GitWildMatchPattern
 from pygments.lexers import ClassNotFound, get_lexer_for_filename
 
-from ..file_handler.abstract_handler import CodeBlock, FileHandler
+from ..file_handler.abstract_handler import FileHandler, ParsedCode
 from ..file_handler.generic_code_file_handler import PHPFileHandler, PythonFileHandler
 from ..file_handler.sql_file_handler import SqlFileHandler
 from ..utils import logger
 
 
+class LanguageHandler(Enum):
+    PYTHON = PythonFileHandler
+    SQL = SqlFileHandler
+    PHP = PHPFileHandler
+
+
 class CodeExtractor:
     HANDLER_MAPPING = {
         ".py": PythonFileHandler,
@@ -44,6 +51,21 @@ def get_handler(self, filepath: str) -> Type[FileHandler]:
             )
         return handler_class
 
+    def is_file_parsable(self, filepath: str) -> bool:
+        gitignore = self.get_gitignore()
+        spec = PathSpec.from_lines(GitWildMatchPattern, gitignore)
+        handler_class = self.get_handler(filepath)
+        if handler_class is None or spec.match_file(filepath):
+            return False
+        return True
+
+    def is_dir_parsable(self, dirpath: str) -> bool:
+        gitignore = self.get_gitignore()
+        spec = PathSpec.from_lines(GitWildMatchPattern, gitignore)
+        if spec.match_file(dirpath):
+            return False
+        return True
+
     def generate_md5(self, filepath: str, chunk_size: int = 4096) -> str:
         hash = hashlib.md5()
         with open(filepath, "rb") as f:
@@ -55,24 +77,18 @@ def generate_md5(self, filepath: str, chunk_size: int = 4096) -> str:
 
     def extract_code_files(self) -> List[str]:
         code_files = []
-        gitignore = self.get_gitignore()
-        spec = PathSpec.from_lines(GitWildMatchPattern, gitignore)
-
         for root, dirs, files in os.walk(self.code_root_path):
             root_path = Path(root).relative_to(self.code_root_path)
 
             # Skip directories listed in .gitignore
             dirs[:] = [
-                d for d in dirs if not spec.match_file(os.path.join(root_path, d))
+                d for d in dirs if self.is_dir_parsable(os.path.join(root_path, d))
             ]
 
             for file in files:
-                if file.endswith(".py") and not spec.match_file(
-                    os.path.join(root_path, file)
-                ):
+                if self.is_file_parsable(file):
                     file_path = root_path / file
                     code_files.append(self.code_root_path / file_path)
-
         return code_files
 
     def get_gitignore(self) -> List[str]:
@@ -83,12 +99,22 @@ def get_gitignore(self) -> List[str]:
         else:
             return []
 
-    def extract_functions(self, embedding_code_file_checksums: dict) -> List[CodeBlock]:
-        code_files = self.extract_code_files()
+    def extract_functions(
+        self, embedding_code_file_checksums: dict
+    ) -> List[ParsedCode]:
+        code_files = (
+            self.extract_code_files()
+            if embedding_code_file_checksums is None
+            else embedding_code_file_checksums.values()
+        )
         code_blocks = []
         for code_filepath in code_files:
+            print(f"🟢 Processing {code_filepath}")
             file_checksum = self.generate_md5(code_filepath)
-            if file_checksum in embedding_code_file_checksums:
+            if (
+                embedding_code_file_checksums is not None
+                and file_checksum in embedding_code_file_checksums
+            ):
                 print(f"🟡 Skipping -- file unmodified {code_filepath}")
                 continue
             file_code_blocks = self.extract_functions_from_file(
@@ -105,20 +131,11 @@ def extract_functions(self, embedding_code_file_checksums: dict) -> List[CodeBlo
 
     def extract_functions_from_file(
         self, filepath: str, file_checksum: str
-    ) -> List[CodeBlock]:
+    ) -> List[ParsedCode]:
         handler = self.get_handler(filepath)
-        code_blocks = []
         if handler:
             parsed_code = handler().extract_code(filepath)
-            if parsed_code:
-                code_blocks = [
-                    CodeBlock(
-                        code=parsed.code,
-                        code_type=parsed.code_type,
-                        name=parsed.name,
-                        filepath=filepath,
-                        file_checksum=file_checksum,
-                    )
-                    for parsed in parsed_code
-                ]
-        return code_blocks
+            for code in parsed_code:
+                code.filepath = filepath
+                code.file_checksum = file_checksum
+        return parsed_code