diff --git a/README.md b/README.md
index e7ea484..1dc0138 100644
--- a/README.md
+++ b/README.md
@@ -42,10 +42,22 @@ Setup submission files:
3. Move a copy of the starter notebook (which contains instructions and some starter code) into the submissions directory, and note the filename (i.e. `STARTER_FILENAME`).
-### OpenAI Setup
+### LLM Setup
+
+Choose an LLM provider (OpenAI or Meta Llama). OpenAI may be easier to get started with, but costs money, whereas Meta Llama is free and is therefore the recommended provider. Follow the corresponding setup instructions below for your chosen provider.
+
+#### OpenAI Setup
Obtain an OpenAI API Key (i.e. `OPENAI_API_KEY`).
+#### Llama Setup
+
+This project uses the [`meta-llama/Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model, which requires requesting access.
+
+First, visit the [Meta Llama website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/), fill out the request form, and wait until your request is accepted.
+
+Then, create a [Hugging Face account](https://huggingface.co) (using the same email address you provided on the request form), and obtain a [user access token](https://huggingface.co/docs/hub/security-tokens) (i.e. `HUGGINGFACE_TOKEN`).
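+
+Once your access request has been approved, you can optionally verify that your token works. The script below is a minimal sketch (it assumes the `HUGGINGFACE_TOKEN` environment variable described in the next section, and downloads only the small tokenizer files):
+
+```py
+import os
+from dotenv import load_dotenv
+from transformers import AutoTokenizer
+
+load_dotenv()
+
+# raises an error if your access request has not been approved yet:
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=os.getenv("HUGGINGFACE_TOKEN"))
+print(tokenizer)
+```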
+
### Environment Variables Setup
@@ -54,8 +66,11 @@ Create ".env" file and set environment variables:
```sh
# this is the ".env" file...
+# choose one based on your preferred llm provider:
OPENAI_API_KEY="sk-..."
+HUGGINGFACE_TOKEN="hf_..."
+# for grading a particular homework:
SUBMISSIONS_DIRPATH="/Users/USERNAME/Desktop/GRADING HW 4"
STARTER_FILENAME="Homework_X_STARTER.ipynb"
FILE_ID_SPLIT_INDEX="0" # 0 for files from Canvas, 1 for files from Blackboard
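+
+# optional: set a default temperature here, instead of passing TEMP on the command line
+# (a suggestion; if not set, the app's own defaults apply):
+# TEMP="0.6"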
@@ -64,12 +79,31 @@ FILE_ID_SPLIT_INDEX="0" # 0 for files from Canvas, 1 for files from Blackboard
## Usage
+### Submission Files Manager
+
Demonstrate ability to access submission files:
```sh
python -m app.submissions_manager
```
+### LLM
+
+Demonstrate ability to query your LLM of choice (OpenAI or Meta Llama).
+
+Query the OpenAI LLM:
+
+```sh
+TEMP=0.6 python -m app.openai.llm
+```
+
+Query the Meta Llama LLM:
+
+```sh
+TEMP=0.6 python -m app.meta.llm
+```
+
+> NOTE: the first time the Llama model is run, it will take a while to download the model weights.
+
### Cell-based Document Splitting
Process the starter file:
@@ -107,14 +141,10 @@ DOCS_LIMIT=5 python -m app.submissions_retriever
### Retrieval Augmented Generation (RAG)
-Chat with the LLM:
-
-```sh
-TEMP=0.6 python -m app.openai_llm
-```
+Use an LLM for grading:
```sh
-DOCS_LIMIT=5 python -m app.submissions_grader
+DOCS_LIMIT=5 python -m app.openai.submissions_grader
# DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.openai.submissions_grader
```
diff --git a/app/document_formatting.py b/app/document_decorators.py
similarity index 100%
rename from app/document_formatting.py
rename to app/document_decorators.py
diff --git a/app/meta/chain.py b/app/meta/chain.py
new file mode 100644
index 0000000..d7c73c2
--- /dev/null
+++ b/app/meta/chain.py
@@ -0,0 +1,67 @@
+# adapted from youtube video about llama and langchain: ________________
+
+
+import os
+from dotenv import load_dotenv
+
+from langchain import HuggingFacePipeline
+from langchain import PromptTemplate, LLMChain
+
+from app.meta.prompts import get_prompt, parse_text
+from app.meta.llm import HuggingFaceService
+
+
+if __name__ == "__main__":
+
+    service = HuggingFaceService()
+    llm = service.llm  # a langchain-compatible pipeline wrapper (see app/meta/llm.py)
+    #print(llm)
+
+ # SIMPLE LLM CHAIN
+
+ #system_prompt = "You are an advanced assistant that excels at translation. "
+ #instruction = "Convert the following text from English to French:\n\n {text}"
+ #template = get_prompt(instruction, system_prompt)
+ #print(template)
+ #prompt = PromptTemplate(template=template, input_variables=["text"])
+#
+ #llm_chain = LLMChain(prompt=prompt, llm=llm)
+#
+ #query = "how are you today?"
+ #response = llm_chain.run(query)
+ #parse_text(response)
+
+
+ # CHAT CHAIN
+
+ if input("Continue to chat (Y/N): ").upper() != "Y":
+ exit()
+
+
+ from langchain.memory import ConversationBufferMemory
+ from langchain import LLMChain, PromptTemplate
+
+    # for chat, with memory
+    instruction = "Chat History:\n\n{chat_history} \n\nUser: {user_input}"
+    system_prompt = "You are a helpful assistant. You only ever answer for the assistant, then you stop. Read the chat history to get context."
+
+    template = get_prompt(instruction, system_prompt)
+    print(template)
+
+    prompt = PromptTemplate(template=template, input_variables=["chat_history", "user_input"])
+    memory = ConversationBufferMemory(memory_key="chat_history")
+
+    llm_chain = LLMChain(prompt=prompt, llm=llm,
+        verbose=True, memory=memory,
+    )
+
+    query = input("Please ask a question (or press enter to stop): ")
+    while query != "":
+        response = llm_chain.predict(user_input=query)
+        print(response)
+
+        query = input("Please ask a question (or press enter to stop): ")
diff --git a/app/meta/llm.py b/app/meta/llm.py
new file mode 100644
index 0000000..763876c
--- /dev/null
+++ b/app/meta/llm.py
@@ -0,0 +1,187 @@
+
+# adapted from youtube video about llama and langchain: ________________
+
+# this is so slow on CPU though...
+# https://stackoverflow.com/a/77022488/670433
+
+
+import os
+from dotenv import load_dotenv
+import textwrap
+from random import choice
+
+import torch
+#import transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+
+#from app.meta.prompts import get_prompt, parse_text, cut_off_text, remove_substring
+
+load_dotenv()
+
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf" # os.getenv("MODEL_NAME", default="meta-llama/Llama-2-7b-chat-hf")
+
+#MAX_NEW_TOKENS = 512
+TEMP = float(os.getenv("TEMP", default="0.0")) # @param {type:"slider", min:0, max:1, step:0.1}
+
+
+
+
+# the default system prompt and special tokens used by the Llama-2 chat models:
+INST, INST_END = "[INST]", "[/INST]"
+SYS, SYS_END = "<<SYS>>\n", "\n<</SYS>>\n\n"
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+"""
+
+def compile_prompt(prompt, system_prompt=DEFAULT_SYSTEM_PROMPT, input_variables=[]) -> PromptTemplate:
+ """Wraps your query in syntax the model understands. Uses default system instructions, or ones you provide.
+
+ Params:
+ prompt (str) : your prompt string, optionally with placeholder {} for input vars
+
+        input_variables (list[str]) : a list of input variable names used in your prompt, default is an empty list
+
+ Returns: langchain.PromptTemplate
+ """
+ formatted_prompt = f"{INST} {SYS} {system_prompt} {SYS_END} {prompt} {INST_END}"
+ return PromptTemplate(template=formatted_prompt, input_variables=input_variables)
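+
+# An illustrative usage sketch (not used elsewhere in this module); the instruction string is borrowed
+# from the translation example at the bottom of this file:
+#
+#    prompt = compile_prompt("Convert the following text from English to French:\n\n {text}", input_variables=["text"])
+#    prompt.format(text="how are you today?")
+#    #> "[INST] <<SYS>>\n ...default system instructions... \n<</SYS>>\n\n Convert the following text from English to French:\n\n how are you today? [/INST]"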
+
+
+
+
+class HuggingFaceService:
+ def __init__(self, model_name=MODEL_NAME, temp=TEMP, token=HUGGINGFACE_TOKEN):
+ self.model_name = model_name
+ self.token = token # hugging face api token
+ self.temp = temp
+
+ self.device_type = "cuda" if torch.cuda.is_available() else "cpu"
+ # https://stackoverflow.com/a/73530618/670433
+ # https://huggingface.co/openlm-research/open_llama_7b_v2/discussions/2
+ # https://pytorch.org/docs/stable/tensors.html
+ self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ @property
+ def tokenizer(self):
+ # https://huggingface.co/transformers/v2.11.0/model_doc/auto.html?highlight=autotokenizer#autotokenizer
+ return AutoTokenizer.from_pretrained(self.model_name, token=self.token) # cache_dir=CACHE_DIRPATH
+
+ @property
+ def model(self):
+ # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM
+ return AutoModelForCausalLM.from_pretrained(
+ self.model_name, token=self.token, device_map="auto", torch_dtype=self.torch_dtype
+ )
+
+ @property
+ def pipeline(self):
+ """wrapper for tokenizer and model, for performing the 'text-generation' task"""
+ # https://huggingface.co/docs/transformers/main_classes/pipelines
+ return pipeline(
+ task="text-generation", model=self.model, tokenizer=self.tokenizer,
+ device_map="auto", torch_dtype=self.torch_dtype, # torch.bfloat16
+ max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1,
+ eos_token_id=self.tokenizer.eos_token_id,
+ )
+
+ @property
+ def llm(self):
+ return HuggingFacePipeline(
+ #model_id=self.model_name, # this one is getting set to "gpt2" by default?
+ pipeline=self.pipeline, model_kwargs={"temperature":self.temp}
+ )
+
+
+ #def predict(self, query):
+
+
+ #def formatted_response(self, prompt, system_prompt=DEFAULT_SYSTEM_PROMPT, input_variables=None):
+ # prompt = self.compile_prompt(prompt)
+ #
+ # llm_chain = LLMChain(prompt=prompt, llm=llm)
+ # response = llm_chain.run(query)
+ # parse_text(response)
+
+ #def generate(self, text):
+ # prompt = get_prompt(text)
+ #
+ # with torch.autocast(self.device_type, dtype=torch.bfloat16):
+ # #inputs = self.tokenizer(prompt, return_tensors="pt").to('cuda') # on CPU as well?
+ # inputs = self.tokenizer(prompt, return_tensors="pt") #
+ # breakpoint()
+ # #if self.device_type == "cuda":
+ # # inputs = inputs.to("cuda")
+ #
+ # outputs = self.model.generate(**inputs,
+ # max_new_tokens=512,
+ # eos_token_id=self.tokenizer.eos_token_id,
+ # pad_token_id=self.tokenizer.eos_token_id,
+ # )
+ # final_outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+    #    final_outputs = cut_off_text(final_outputs, '</s>')
+ # final_outputs = remove_substring(final_outputs, prompt)
+ #
+ # return final_outputs#, outputs
+
+
+
+def parse_text(text):
+ wrapped_text = textwrap.fill(text, width=100)
+ print(wrapped_text +'\n\n')
+ # return assistant_text
+
+
+
+
+if __name__ == "__main__":
+
+ hf = HuggingFaceService()
+
+ llm = hf.llm
+ print(llm)
+
+    general_knowledge_queries = [
+ "What year was America founded?",
+ "Tell us about the first humans who landed on the moon."
+ ]
+
+ query = input("Please provide a Query (or press enter): ")
+    query = query or choice(general_knowledge_queries)
+ print(query)
+
+ # response = llm.predict(query).strip()
+ prompt = compile_prompt(prompt=query)
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
+ #response = llm_chain.run(query) # chain({'foo': 1, 'bar': 2})
+ #> ValueError: A single string input was passed in, but this chain expects multiple inputs (set()). When a chain expects multiple inputs, please call it by passing in a dictionary, eg `chain({'foo': 1, 'bar': 2})`
+ response = llm_chain({"query": query}) # ooh it's slow?
+    parse_text(response["text"]) # the chain returns a dict; the generated text is under the "text" key
+
+
+ breakpoint()
+ exit()
+
+ # PROMPT
+
+ system_prompt = "You are an advanced assistant that excels at translation. "
+ instruction = "Convert the following text from English to French:\n\n {text}"
+ prompt = compile_prompt(prompt=instruction, system_prompt=system_prompt, input_variables=["text"])
+    print(prompt)
+
+ # CHAIN
+
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
+
+ query = "how are you today?"
+ while query != "":
+ print(query)
+ response = llm_chain.run(query)
+ parse_text(response)
+ print("------")
+ query = input("Query (or press enter to stop): ")
diff --git a/app/meta/prompts.py b/app/meta/prompts.py
new file mode 100644
index 0000000..a45cf4c
--- /dev/null
+++ b/app/meta/prompts.py
@@ -0,0 +1,38 @@
+
+# adapted from youtube video about llama and langchain: ________________
+
+#import json
+import textwrap
+
+B_INST, E_INST = "[INST]", "[/INST]"
+
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+# TODO: refactor
+
+def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
+ SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
+ prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
+ return prompt_template
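+
+# for example (illustrative only), get_prompt("Summarize this: {text}") returns a template string like:
+#   "[INST]<<SYS>>\n ...default system prompt... \n<</SYS>>\n\nSummarize this: {text}[/INST]"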
+
+def cut_off_text(text, prompt):
+ cutoff_phrase = prompt
+ index = text.find(cutoff_phrase)
+ if index != -1:
+ return text[:index]
+ else:
+ return text
+
+def remove_substring(string, substring):
+ return string.replace(substring, "")
+
+
+def parse_text(text):
+ wrapped_text = textwrap.fill(text, width=100)
+ print(wrapped_text +'\n\n')
+ # return assistant_text
diff --git a/app/document_processor.py b/app/openai/document_processor.py
similarity index 100%
rename from app/document_processor.py
rename to app/openai/document_processor.py
diff --git a/app/openai_llm.py b/app/openai/llm.py
similarity index 100%
rename from app/openai_llm.py
rename to app/openai/llm.py
diff --git a/app/rows_processor.py b/app/openai/rows_processor.py
similarity index 100%
rename from app/rows_processor.py
rename to app/openai/rows_processor.py
diff --git a/app/starter_doc_processor.py b/app/openai/starter_doc_processor.py
similarity index 100%
rename from app/starter_doc_processor.py
rename to app/openai/starter_doc_processor.py
diff --git a/app/submissions_grader.py b/app/openai/submissions_grader.py
similarity index 100%
rename from app/submissions_grader.py
rename to app/openai/submissions_grader.py
diff --git a/app/submissions_processor.py b/app/openai/submissions_processor.py
similarity index 100%
rename from app/submissions_processor.py
rename to app/openai/submissions_processor.py
diff --git a/app/submissions_retriever.py b/app/openai/submissions_retriever.py
similarity index 100%
rename from app/submissions_retriever.py
rename to app/openai/submissions_retriever.py
diff --git a/app/response_formatters.py b/app/response_models.py
similarity index 100%
rename from app/response_formatters.py
rename to app/response_models.py
diff --git a/requirements.txt b/requirements.txt
index 9090990..51bb70b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,10 +10,16 @@ plotly
openai # 1.3.8
-langchain # 0.0.348
+langchain # 0.0.348 ... 0.0.353
tiktoken
faiss-cpu
+# llama:
+torch # 2.1.0+cu121 (for colab)
+transformers # 4.35.2
+accelerate # 0.25.0
+# torchtext # 0.16.0
+
pytest