Merge pull request #196 from amosproj/develop
Develop
iremozs authored Jul 10, 2024
2 parents a62e4fb + 66f1520 commit 1a6215d
Showing 33 changed files with 2,027 additions and 633 deletions.
24 changes: 15 additions & 9 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
@@ -1,22 +1,24 @@
import networkx as nx
import os
import json


def get_top_n_central_nodes(centrality_dict, n):
"""Sort nodes based on centrality measure and return top N nodes.
Args:
centrality_dict: Dictionary of nodes with their centrality values.
n: Number of top nodes to return.
Returns:
        List of the top N node identifiers, most central first (centrality values are dropped).
"""
sorted_nodes = sorted(
centrality_dict.items(), key=lambda item: item[1], reverse=True
)
return [node for node, _ in sorted_nodes[:n]]


def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Currently, I am not sure how to use most of the information, but we may find a way to use it
@@ -32,7 +34,7 @@ def analyze_graph_structure(G):
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges

    # Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes

@@ -51,7 +53,6 @@ def analyze_graph_structure(G):
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
"""

# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
@@ -70,7 +71,7 @@ def analyze_graph_structure(G):
    - Betweenness Centrality shows the dependency of the network on a node
"""

# eigenvector centrality measures the influence of a node in a network
eigenvector_centrality = nx.eigenvector_centrality(G)

@@ -115,7 +116,12 @@ def analyze_graph_structure(G):
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)

# Find intersection of top nodes from all measures (set intersection)
all_centrality_nodes = (
set(degree_centrality)
& set(betweenness_centrality)
& set(eigenvector_centrality)
& set(closeness_centrality)
)

top_nodes = list(all_centrality_nodes)[:6]

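For reference, the routine above intersects the top-N node sets of four NetworkX centrality measures. A minimal, self-contained sketch of that idea (the karate-club graph and n = 3 are illustrative assumptions, not part of this commit):

import networkx as nx


def top_n(centrality: dict, n: int) -> set:
    # Highest-scoring n nodes for a single centrality measure
    ranked = sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)
    return {node for node, _ in ranked[:n]}


G = nx.karate_club_graph()  # toy graph, for illustration only
n = 3
consensus = (
    top_n(nx.degree_centrality(G), n)
    & top_n(nx.betweenness_centrality(G), n)
    & top_n(nx.eigenvector_centrality(G), n)
    & top_n(nx.closeness_centrality(G), n)
)
print(consensus)  # nodes that rank highly under every measure

One caveat in the committed code: top_nodes = list(all_centrality_nodes)[:6] slices a list built from a set, so when the intersection holds more than six nodes the six returned are arbitrary rather than the six most central.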
31 changes: 23 additions & 8 deletions Project/backend/codebase/graph_creator/graph_handler.py
@@ -1,7 +1,6 @@
import json
import logging
import re

# from graph_creator import llama3
# from graph_creator import embedding_handler # To be integrated
@@ -345,18 +344,34 @@ def add_relations_to_data(entity_and_relation_df, new_relations):
return entity_and_relation_df


def add_topic(data: pd.DataFrame, max_topics: int = 25) -> pd.DataFrame:
documents = list(set(data["node_1"]).union(set(data["node_2"])))

topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(documents)
topic_info = topic_model.get_topic_info()

    # Keep only the max_topics most frequent topics
top_topics = topic_model.get_topic_info().head(max_topics)["Topic"].tolist()

topic_name_info = {
row["Topic"]: row["Name"] for _, row in topic_info.iterrows()
}

# Create a mapping for "other" topics
doc_topic_map = {doc: (topic if topic in top_topics else "other") for doc, topic in zip(documents, topics)}
doc_topic_strings_map = {
doc: (topic_name_info.get(topic, "other") if topic != "other" else "other")
for doc, topic in doc_topic_map.items()
}

# Add new columns to the DataFrame and populate them
data["topic_node_1"] = [
doc_topic_strings_map[node] for i, node in data["node_1"].items()
]
data["topic_node_2"] = [
doc_topic_strings_map[node] for i, node in data["node_2"].items()
]
return data


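The reworked add_topic caps the label set at max_topics and buckets every remaining document under "other". A standalone sketch of the same BERTopic flow; the corpus is invented, and BERTopic (UMAP plus HDBSCAN under the hood) needs a reasonably varied document list to fit at all:

from bertopic import BERTopic

documents = [
    "graph databases store nodes and edges",
    "pdf parsing extracts raw text from documents",
    "topic modeling groups similar documents together",
] * 40  # toy corpus, for illustration only

topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(documents)

# get_topic_info() has one row per topic: "Topic" is an integer id
# (topics are numbered by descending size, -1 marks outliers) and
# "Name" is a readable label, so head(n) keeps the n largest topics.
topic_info = topic_model.get_topic_info()
top_topics = topic_info.head(25)["Topic"].tolist()
names = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}

labels = {
    doc: (names[t] if t in top_topics else "other")
    for doc, t in zip(documents, topics)
}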
4 changes: 3 additions & 1 deletion Project/backend/codebase/graph_creator/pdf_handler.py
@@ -31,7 +31,9 @@ def process_pdf_into_chunks(filename):
raise ValueError("Failed to load PDF documents.")

# splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=int(os.getenv("CHUNK_SIZE", 1500)), chunk_overlap=150
    )
splits = text_splitter.split_documents(docs)

return splits
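One detail worth flagging in the chunking call above: os.getenv returns a string whenever CHUNK_SIZE is set in the environment (the integer default is only returned when the variable is absent), so the value is coerced with int() before being passed as chunk_size. A small usage sketch with a hypothetical environment value:

import os

from langchain_text_splitters import RecursiveCharacterTextSplitter

os.environ["CHUNK_SIZE"] = "1000"  # hypothetical deployment setting
splitter = RecursiveCharacterTextSplitter(
    chunk_size=int(os.getenv("CHUNK_SIZE", 1500)),  # "1000" -> 1000
    chunk_overlap=150,
)
chunks = splitter.split_text("Lorem ipsum dolor sit amet. " * 200)
print(len(chunks))  # several chunks of at most ~1000 characters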
2 changes: 1 addition & 1 deletion Project/backend/codebase/graph_creator/router.py
@@ -297,4 +297,4 @@ async def query_graph(
)
graph = netx_services.load_graph(graph_job_id=graph_job_id)
graph_keywords = analyze_graph_structure(graph)
    return graph_keywords
10 changes: 7 additions & 3 deletions Project/backend/codebase/graph_creator/services/file_handler.py
@@ -7,7 +7,10 @@
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader

from langchain_text_splitters import (
RecursiveCharacterTextSplitter,
RecursiveJsonSplitter,
)


class FileHandler:
@@ -40,7 +43,9 @@ def _process_doc_to_chunks(docs):
raise ValueError("Failed to load documents.")

        # splits text into chunks including metadata for mapping from chunk to source page (splits[0].metadata['page'])
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(os.getenv("CHUNK_SIZE", 1500)), chunk_overlap=150
        )
splits = text_splitter.split_documents(docs)
return splits

@@ -56,4 +61,3 @@ def _get_json_chunks(self):
        splitter = RecursiveJsonSplitter(max_chunk_size=int(os.getenv("CHUNK_SIZE", 1500)))
json_chunks = splitter.create_documents(texts=[json_data])
return json_chunks
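For context, a sketch of the loader-to-splitter pipeline this FileHandler wraps; the file names are hypothetical:

from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader

docs = Docx2txtLoader("report.docx").load()  # list of Document objects
slides = UnstructuredPowerPointLoader("deck.pptx").load()
# Either list can be handed to _process_doc_to_chunks(), which splits
# the text while keeping the metadata used for chunk-to-page mapping.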

@@ -109,7 +109,7 @@ def extract_entities_and_relations(self, chunk):
" }, {...}\n"
"]"
)

USER_PROMPT = f"context: ```{chunk}``` \n\n output: "

chat_session = self.genai_client.start_chat(history=[])
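The method above streams the chunk to the model through a chat session and expects a JSON list of triples back. A hedged sketch of that call pattern, assuming the google-generativeai client; the model name, key handling, and sample chunk are assumptions, not taken from the commit:

import google.generativeai as genai

genai.configure(api_key="YOUR_KEY")  # hypothetical key handling
model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model name

chunk = "Alice founded Acme. Acme is headquartered in Berlin."
USER_PROMPT = f"context: ```{chunk}``` \n\n output: "

chat_session = model.start_chat(history=[])
response = chat_session.send_message(USER_PROMPT)
print(response.text)  # expected: a JSON list of node_1/node_2/edge objects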
34 changes: 22 additions & 12 deletions Project/backend/codebase/graph_creator/services/netx_graphdb.py
@@ -36,12 +36,18 @@ def create_graph_from_df(self, data: pd.DataFrame, chunks: dict) -> nx.Graph:

chunk_id = edge["chunk_id"]
page_number = chunk_to_page[int(chunk_id)]
if isinstance(page_number, int):
page_number += 1

# Add nodes with page attribute
if edge["node_1"] not in graph:
graph.add_node(edge["node_1"], pages=set([]), topic=edge["topic_node_1"])
graph.add_node(
edge["node_1"], pages=set([]), topic=edge["topic_node_1"]
)
if edge["node_2"] not in graph:
graph.add_node(edge["node_2"], pages=set([]), topic=edge["topic_node_2"])
graph.add_node(
edge["node_2"], pages=set([]), topic=edge["topic_node_2"]
)

# Add edge with attributes to the graph
graph.add_edge(edge["node_1"], edge["node_2"], relation=edge["edge"])
@@ -128,7 +134,7 @@ def _get_graph_file_path_local_storage(graph_job_id: uuid.UUID) -> str:

@staticmethod
def _graph_bfs_edges(
graph: nx.Graph, graph_job: GraphJob, node: str, adj_depth: int
) -> GraphVisData:
nodes_data = []
edges_data = []
@@ -170,14 +176,16 @@ def _graph_bfs_edges(
)
)

return GraphVisData(
document_name=graph_job.name,
graph_created_at=graph_job.updated_at,
nodes=nodes_data,
edges=edges_data,
)

@staticmethod
def _all_graph_data_for_visualization(
graph: nx.Graph, graph_job: GraphJob
) -> GraphVisData:
nodes_data = []
edges_data = []
@@ -208,7 +216,9 @@ def _all_graph_data_for_visualization(
)
)

return GraphVisData(
document_name=graph_job.name,
graph_created_at=graph_job.updated_at,
nodes=nodes_data,
edges=edges_data,
)
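A toy sketch of the node and edge schema create_graph_from_df builds, together with the BFS primitive that _graph_bfs_edges appears to rely on; all attribute values here are invented:

import networkx as nx

graph = nx.Graph()
graph.add_node("Alice", pages=set(), topic="people")  # invented attributes
graph.add_node("Acme", pages=set(), topic="companies")
graph.add_edge("Alice", "Acme", relation="founded")
graph.nodes["Alice"]["pages"].add(4)  # pages are 1-based after the +1 shift

# nx.bfs_edges walks outward from a start node up to a depth limit,
# which is the standard primitive for an adj_depth-bounded traversal.
for u, v in nx.bfs_edges(graph, "Alice", depth_limit=2):
    print(u, "->", v, graph.edges[u, v]["relation"])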
6 changes: 3 additions & 3 deletions Project/backend/codebase/tests/test_graph_handler.py
@@ -1,7 +1,6 @@
from graph_creator.services.llm.llama_gemini_combination import llama_gemini_combination
from graph_creator import graph_handler
from unittest.mock import patch

import json
import pandas as pd
@@ -74,6 +73,7 @@ def test_relation_extraction_from_llm_entity_not_in_lists():
# Assert
assert relation is None


def test_component_connection_with_llm(mocker):
"""
Tests if component combination with llm works
@@ -89,7 +89,7 @@ def test_component_connection_with_llm(mocker):
]
"""

patcher = patch("graph_creator.services.llm.llama_gemini_combination")
MockLlama3 = patcher.start()
mock_instance = MockLlama3.return_value

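The test drives the patch manually via patcher.start(). A generic sketch of that pattern; the stubbed method name is hypothetical:

from unittest.mock import patch

patcher = patch("graph_creator.services.llm.llama_gemini_combination")
MockCombination = patcher.start()
MockCombination.return_value.check_for_connecting_relation.return_value = "[]"
try:
    pass  # exercise the code under test here
finally:
    patcher.stop()  # a manually started patcher must be stopped explicitly

With pytest-mock, mocker.patch(...) achieves the same thing and undoes itself at test teardown automatically, which is why the test function accepts the mocker fixture.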
6 changes: 1 addition & 5 deletions Project/frontend/src/components/App/index.css
@@ -59,8 +59,6 @@ img {
gap: 10px;
}

.main_wrapper {
display: flex;
flex-direction: column;
@@ -71,13 +69,11 @@
min-height: 100%;
}

.Appcontainer {
display: flex;
flex-direction: column;
align-items: center;
gap: 20px;
min-width: 100%;
min-height: 100%;
}
}
8 changes: 3 additions & 5 deletions Project/frontend/src/components/App/index.tsx
@@ -17,10 +17,10 @@ import {
} from '@mui/material';

import logo from '../../assets/team-logo.png';
import Graph from '../Graph_page/GraphVisualization';
import UploadPage from '../UploadPage';
import LandingPage from '../LandingPage';

import Navbar from '../Navbar/Navbar';
import './index.css';

const theme = createTheme({
@@ -46,9 +46,7 @@ function App() {
px={2}
py={1}
>
<NavLink to="/">
<img src={logo} alt="Logo" className="logo" />
</NavLink>
<Navbar />
<Typography variant="h6">Graph Masters</Typography>
<NavLink to="/" style={{ textDecoration: 'none' }}>
<Typography variant="h6" sx={{ color: 'white' }}>