Merge pull request #49 from cagostino/chris/plonk_rag
Chris/plonk rag
cagostino authored Jan 22, 2025
2 parents 4ccb492 + b8b6cf7 commit e790b3e
Showing 8 changed files with 153 additions and 334 deletions.
5 changes: 2 additions & 3 deletions npcsh/cli_helpers.py
@@ -2216,12 +2216,11 @@ def enter_spool_mode(
os.getcwd(),
)

#sometimes claude responds with unfinished markdown notation. so we need to check if there are two sets
#of markdown notation and if not, we add it. so if # markdown notations is odd we add one more
# sometimes claude responds with unfinished markdown notation. so we need to check if there are two sets
# of markdown notation and if not, we add it. so if # markdown notations is odd we add one more
if assistant_reply.count("```") % 2 != 0:
assistant_reply = assistant_reply + "```"


render_markdown(assistant_reply)

except (KeyboardInterrupt, EOFError):
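For reference, the fence-balancing check above is easy to exercise on its own. A minimal sketch (the helper name `balance_code_fences` is illustrative and not part of the commit):

```python
FENCE = "`" * 3  # the ``` markdown fence, built indirectly so it renders cleanly here


def balance_code_fences(reply: str) -> str:
    # An odd fence count means the model left a code block unclosed;
    # appending one more fence lets render_markdown display it correctly.
    if reply.count(FENCE) % 2 != 0:
        reply += FENCE
    return reply


print(balance_code_fences(FENCE + "python\nprint('hi')"))  # a closing fence is appended
```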
3 changes: 3 additions & 0 deletions npcsh/helpers.py
@@ -89,6 +89,9 @@ def ensure_npcshrc_exists() -> str:
npcshrc.write("export NPCSH_INITIALIZED=0\n")
npcshrc.write("export NPCSH_PROVIDER='ollama'\n")
npcshrc.write("export NPCSH_MODEL='llama3.2'\n")
npcshrc.write("export NPCSH_EMBEDDING_PROVIDER='ollama'\n")
npcshrc.write("export NPCSH_EMBEDDING_MODEL='nomic-embed-text'\n")

npcshrc.write("export NPCSH_API_URL=''")
npcshrc.write("export NPCSH_DB_PATH='~/npcsh_history.db'\n")
return npcshrc_path
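These two new exports are what llm_funcs.py (below) reads to choose its embedding defaults. A minimal sketch of that lookup, assuming ~/.npcshrc has been sourced into the environment before npcsh starts:

```python
import os

# Same defaults that ensure_npcshrc_exists writes into ~/.npcshrc above.
NPCSH_EMBEDDING_MODEL = os.environ.get("NPCSH_EMBEDDING_MODEL", "nomic-embed-text")
NPCSH_EMBEDDING_PROVIDER = os.environ.get("NPCSH_EMBEDDING_PROVIDER", "ollama")

print(f"embeddings via {NPCSH_EMBEDDING_PROVIDER}/{NPCSH_EMBEDDING_MODEL}")
```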
40 changes: 27 additions & 13 deletions npcsh/llm_funcs.py
@@ -35,7 +35,9 @@

from pydantic import BaseModel, Field

client = chromadb.PersistentClient(path="/home/caug/npcsh_chroma.db")
EMBEDDINGS_DB_PATH = os.path.expanduser("~/npcsh_chroma.db")

chroma_client = chromadb.PersistentClient(path=EMBEDDINGS_DB_PATH)


# Load environment variables from .env file
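Replacing the hard-coded `/home/caug/npcsh_chroma.db` path with an `expanduser` default makes the vector store user-agnostic. A minimal sketch of the same pattern; note the commit uses `get_collection` later, while `get_or_create_collection` shown here is a more defensive alternative (not what the diff does) that also works on a fresh database:

```python
import os
import chromadb

# "~" expands to the current user's home directory, so the same default
# works on any machine instead of only /home/caug.
EMBEDDINGS_DB_PATH = os.path.expanduser("~/npcsh_chroma.db")
chroma_client = chromadb.PersistentClient(path=EMBEDDINGS_DB_PATH)

# Collections are named f"{provider}_{model}_embeddings", e.g. for the defaults:
collection = chroma_client.get_or_create_collection("ollama_nomic-embed-text_embeddings")
```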
@@ -81,7 +83,8 @@ def load_env_from_execution_dir() -> None:
os.environ.get("NPCSH_VECTOR_DB_PATH", "~/npcsh_chroma.db")
)

NPCSH_EMBEDDING_MODEL = "nomic-embed-text"
NPCSH_EMBEDDING_MODEL = os.environ.get("NPCSH_EMBEDDING_MODEL","nomic-embed-text")
NPCSH_EMBEDDING_PROVIDER = os.environ.get("NPCSH_EMBEDDING_PROVIDER", "ollama")


def get_ollama_embeddings(
@@ -119,13 +122,22 @@ def get_anthropic_embeddings(
return embeddings


def store_embeddings_for_model(texts, embeddings, model, provider):
def store_embeddings_for_model(
texts,
embeddings,
metadata=None,
model: str = NPCSH_EMBEDDING_MODEL,
provider: str = NPCSH_EMBEDDING_PROVIDER,
):
collection_name = f"{provider}_{model}_embeddings"
collection = client.get_collection(collection_name)
collection = chroma_client.get_collection(collection_name)

# Create meaningful metadata for each document (adjust as necessary)
metadata = [{"text_length": len(text)} for text in texts] # Example metadata

if metadata is None:
metadata = [{"text_length": len(text)} for text in texts] # Example metadata
print(
"metadata is none, creating metadata for each document as the length of the text"
)
# Add embeddings to the collection with metadata
collection.add(
ids=[str(i) for i in range(len(texts))],
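The rest of the `collection.add` call is collapsed in the diff. A hedged sketch of the full storage path with the new optional-metadata behavior; the `documents` and `metadatas` keyword arguments follow Chroma's `add` API and are assumptions about the collapsed portion, not a verbatim copy of the commit:

```python
def store_embeddings_for_model(texts, embeddings, metadata=None,
                               model=NPCSH_EMBEDDING_MODEL,
                               provider=NPCSH_EMBEDDING_PROVIDER):
    collection = chroma_client.get_collection(f"{provider}_{model}_embeddings")
    if metadata is None:
        # Fall back to a minimal per-document record, as the diff does.
        metadata = [{"text_length": len(text)} for text in texts]
    collection.add(
        ids=[str(i) for i in range(len(texts))],  # shown in the diff
        embeddings=embeddings,
        documents=texts,       # assumed; this part of the call is collapsed
        metadatas=metadata,    # assumed keyword per Chroma's add API
    )
```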
@@ -141,16 +153,16 @@ def delete_embeddings_from_collection(collection, ids):
collection.delete(ids=ids) # Only delete if ids are provided


def search_similar_texts_for_model(
def search_similar_texts(
query_embedding: List[float],
embedding_model: str,
provider: str,
top_k: int = 5,
db_path: str = npcsh_vector_db_path,
embedding_model: str = NPCSH_EMBEDDING_MODEL,
embedding_provider: str = NPCSH_EMBEDDING_PROVIDER,
) -> List[dict]:
"""Search for similar texts in Chroma using KNN."""
collection_name = f"{provider}_{embedding_model}_embeddings"
collection = client.get_collection(collection_name)
collection_name = f"{embedding_provider}_{embedding_model}_embeddings"
collection = chroma_client.get_collection(collection_name)

search_results = collection.query(query_embedding, n_results=top_k)

@@ -168,7 +180,9 @@ def search_similar_texts_for_model(


def get_embeddings(
texts: List[str], provider: str = npcsh_provider, model: str = NPCSH_EMBEDDING_MODEL
texts: List[str],
model: str = NPCSH_EMBEDDING_MODEL,
provider: str = NPCSH_EMBEDDING_PROVIDER,
) -> List[List[float]]:
"""Generate embeddings using the specified provider and store them in Chroma."""
if provider == "ollama":
@@ -181,7 +195,7 @@ def get_embeddings(
raise ValueError(f"Unsupported provider: {provider}")

# Store the embeddings in the relevant Chroma collection
store_embeddings_for_model(texts, embeddings, model, provider)
#store_embeddings_for_model(texts, embeddings, model, provider)
return embeddings


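With the automatic `store_embeddings_for_model` call commented out inside `get_embeddings`, persistence is now the caller's responsibility. A hedged end-to-end sketch, assuming a local ollama server with `nomic-embed-text` pulled and that the target Chroma collection already exists:

```python
texts = ["npcsh keeps its command history in ~/npcsh_history.db"]

# Embed with the environment-configured defaults (ollama / nomic-embed-text).
vectors = get_embeddings(texts)

# Persist explicitly: get_embeddings no longer stores embeddings as a side effect.
store_embeddings_for_model(texts, vectors)

# KNN lookup against the provider/model-specific Chroma collection.
query_vec = get_embeddings(["where is the history database stored?"])[0]
print(search_similar_texts(query_vec, top_k=3))
```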
19 changes: 11 additions & 8 deletions npcsh/npc_compiler.py
@@ -638,13 +638,15 @@ class NPCCompiler:
def __init__(self, npc_directory, db_path):
self.npc_directory = npc_directory
self.dirs = [self.npc_directory]
if self.npc_directory == os.path.abspath("./npc_team"):
if self.npc_directory == os.path.abspath("./npc_team/"):
self.project_npc_directory = None
self.project_tools_directory = None
else:
self.project_npc_directory = os.path.abspath("./npc_team")
self.project_npc_directory = os.path.abspath("./npc_team/")
self.project_tools_directory = os.path.join(
self.project_npc_directory, "tools"
)
self.dirs.append(self.project_npc_directory)
self.project_tools_directory = os.path.join(self.project_npc_directory, "tools")

self.db_path = db_path
self.npc_cache = {}
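The constructor change means a compiler pointed at the global NPC directory skips project-level lookups, while any other directory also registers ./npc_team/ from the current working directory. A hedged illustration; the global path below is a placeholder, as the real default lives elsewhere in the package:

```python
import os

# Hypothetical global location; npcsh defines the actual default elsewhere.
global_npc_dir = os.path.expanduser("~/.npcsh/npc_team")

compiler = NPCCompiler(global_npc_dir, os.path.expanduser("~/npcsh_history.db"))
print(compiler.dirs)
# -> [global_npc_dir, os.path.abspath("./npc_team/")] whenever global_npc_dir differs
#    from the CWD's ./npc_team/; the existence check is deferred to parse_all_npcs (below).
```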
@@ -654,7 +656,6 @@ def __init__(self, npc_directory, db_path):
# Set tools directories
self.global_tools_directory = os.path.join(self.npc_directory, "tools")


# Initialize Jinja environment with multiple loaders
self.jinja_env = Environment(
loader=FileSystemLoader(self.dirs),
@@ -773,11 +774,13 @@ def load_tool_from_file(self, tool_path: str) -> Union[dict, None]:
return None

def parse_all_npcs(self) -> None:
print(self.dirs)
for directory in self.dirs:
for filename in os.listdir(directory):
if filename.endswith(".npc"):
npc_path = os.path.join(directory, filename)
self.parse_npc_file(npc_path)
if os.path.exists(directory):
for filename in os.listdir(directory):
if filename.endswith(".npc"):
npc_path = os.path.join(directory, filename)
self.parse_npc_file(npc_path)

def parse_npc_file(self, npc_file_path: str) -> dict:
npc_file = os.path.basename(npc_file_path)
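The new existence check keeps `os.listdir` from raising when the project-level ./npc_team/ directory has not been created. The same guard pattern in isolation (the helper name `find_npc_files` is hypothetical):

```python
import os


def find_npc_files(directories):
    """Collect *.npc files, skipping directories that do not exist."""
    npc_paths = []
    for directory in directories:
        # Guarding with os.path.exists mirrors the parse_all_npcs change above and
        # avoids FileNotFoundError when a project has no local npc_team folder.
        if os.path.exists(directory):
            for filename in os.listdir(directory):
                if filename.endswith(".npc"):
                    npc_paths.append(os.path.join(directory, filename))
    return npc_paths


print(find_npc_files([os.path.abspath("./npc_team/")]))  # [] when the folder is absent
```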
209 changes: 0 additions & 209 deletions npcsh/npcsh.py

This file was deleted.

