Merge pull request #196 from amosproj/develop
Develop
iremozs authored Jul 10, 2024
2 parents a62e4fb + 66f1520 commit 1a6215d
Showing 33 changed files with 2,027 additions and 633 deletions.
24 changes: 15 additions & 9 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
@@ -1,22 +1,24 @@
import networkx as nx
import os
import json


def get_top_n_central_nodes(centrality_dict, n):
"""Sort nodes based on centrality measure and return top N nodes.
Args:
centrality_dict: Dictionary of nodes with their centrality values.
n: Number of top nodes to return.
Returns:
        List of the top N node identifiers, most central first (centrality values are dropped).
"""
sorted_nodes = sorted(
centrality_dict.items(), key=lambda item: item[1], reverse=True
)
return [node for node, _ in sorted_nodes[:n]]


def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Currently, I am not sure how to use most of the information, but we may find a way to use it
@@ -32,7 +34,7 @@ def analyze_graph_structure(G):
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges

    # Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes

@@ -51,7 +53,6 @@ def analyze_graph_structure(G):
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
"""

# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
@@ -70,7 +71,7 @@ def analyze_graph_structure(G):
    - Betweenness Centrality shows the dependency of the network on a node
"""

# eigenvector centrality measures the influence of a node in a network
eigenvector_centrality = nx.eigenvector_centrality(G)

@@ -115,7 +116,12 @@ def analyze_graph_structure(G):
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)

# Find intersection of top nodes from all measures (set intersection)
all_centrality_nodes = (
set(degree_centrality)
& set(betweenness_centrality)
& set(eigenvector_centrality)
& set(closeness_centrality)
)

top_nodes = list(all_centrality_nodes)[:6]

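For reference, the routine above intersects the top-N node sets of four NetworkX centrality measures. A minimal, self-contained sketch of that idea (the karate-club graph and n = 3 are illustrative assumptions, not part of this commit):

import networkx as nx


def top_n(centrality: dict, n: int) -> set:
    # Highest-scoring n nodes for a single centrality measure
    ranked = sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)
    return {node for node, _ in ranked[:n]}


G = nx.karate_club_graph()  # toy graph, for illustration only
n = 3
consensus = (
    top_n(nx.degree_centrality(G), n)
    & top_n(nx.betweenness_centrality(G), n)
    & top_n(nx.eigenvector_centrality(G), n)
    & top_n(nx.closeness_centrality(G), n)
)
print(consensus)  # nodes that rank highly under every measure

One caveat in the committed code: top_nodes = list(all_centrality_nodes)[:6] slices a list built from a set, so when the intersection holds more than six nodes the six returned are arbitrary rather than the six most central.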
31 changes: 23 additions & 8 deletions Project/backend/codebase/graph_creator/graph_handler.py
@@ -1,7 +1,6 @@
import json
import logging
import re

# from graph_creator import llama3
# from graph_creator import embedding_handler # To be integrated
@@ -345,18 +344,34 @@ def add_relations_to_data(entity_and_relation_df, new_relations):
return entity_and_relation_df


def add_topic(data: pd.DataFrame, max_topics: int = 25) -> pd.DataFrame:
documents = list(set(data["node_1"]).union(set(data["node_2"])))

topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(documents)
topic_info = topic_model.get_topic_info()

    # Keep only the max_topics most frequent topics
top_topics = topic_model.get_topic_info().head(max_topics)["Topic"].tolist()

topic_name_info = {
row["Topic"]: row["Name"] for _, row in topic_info.iterrows()
}

# Create a mapping for "other" topics
doc_topic_map = {doc: (topic if topic in top_topics else "other") for doc, topic in zip(documents, topics)}
doc_topic_strings_map = {
doc: (topic_name_info.get(topic, "other") if topic != "other" else "other")
for doc, topic in doc_topic_map.items()
}

# Add new columns to the DataFrame and populate them
data["topic_node_1"] = [
doc_topic_strings_map[node] for i, node in data["node_1"].items()
]
data["topic_node_2"] = [
doc_topic_strings_map[node] for i, node in data["node_2"].items()
]
return data


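The reworked add_topic caps the label set at max_topics and buckets every remaining document under "other". A standalone sketch of the same BERTopic flow; the corpus is invented, and BERTopic (UMAP plus HDBSCAN under the hood) needs a reasonably varied document list to fit at all:

from bertopic import BERTopic

documents = [
    "graph databases store nodes and edges",
    "pdf parsing extracts raw text from documents",
    "topic modeling groups similar documents together",
] * 40  # toy corpus, for illustration only

topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(documents)

# get_topic_info() has one row per topic: "Topic" is an integer id
# (topics are numbered by descending size, -1 marks outliers) and
# "Name" is a readable label, so head(n) keeps the n largest topics.
topic_info = topic_model.get_topic_info()
top_topics = topic_info.head(25)["Topic"].tolist()
names = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}

labels = {
    doc: (names[t] if t in top_topics else "other")
    for doc, t in zip(documents, topics)
}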
4 changes: 3 additions & 1 deletion Project/backend/codebase/graph_creator/pdf_handler.py
@@ -31,7 +31,9 @@ def process_pdf_into_chunks(filename):
raise ValueError("Failed to load PDF documents.")

# splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=int(os.getenv("CHUNK_SIZE", 1500)), chunk_overlap=150
    )
splits = text_splitter.split_documents(docs)

return splits
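One detail worth flagging in the chunking call above: os.getenv returns a string whenever CHUNK_SIZE is set in the environment (the integer default is only returned when the variable is absent), so the value is coerced with int() before being passed as chunk_size. A small usage sketch with a hypothetical environment value:

import os

from langchain_text_splitters import RecursiveCharacterTextSplitter

os.environ["CHUNK_SIZE"] = "1000"  # hypothetical deployment setting
splitter = RecursiveCharacterTextSplitter(
    chunk_size=int(os.getenv("CHUNK_SIZE", 1500)),  # "1000" -> 1000
    chunk_overlap=150,
)
chunks = splitter.split_text("Lorem ipsum dolor sit amet. " * 200)
print(len(chunks))  # several chunks of at most ~1000 characters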
2 changes: 1 addition & 1 deletion Project/backend/codebase/graph_creator/router.py
@@ -297,4 +297,4 @@ async def query_graph(
)
graph = netx_services.load_graph(graph_job_id=graph_job_id)
graph_keywords = analyze_graph_structure(graph)
    return graph_keywords
10 changes: 7 additions & 3 deletions Project/backend/codebase/graph_creator/services/file_handler.py
@@ -7,7 +7,10 @@
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader

from langchain_text_splitters import (
RecursiveCharacterTextSplitter,
RecursiveJsonSplitter,
)


class FileHandler:
@@ -40,7 +43,9 @@ def _process_doc_to_chunks(docs):
raise ValueError("Failed to load documents.")

        # splits text into chunks including metadata for mapping from chunk to source page (splits[0].metadata['page'])
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(os.getenv("CHUNK_SIZE", 1500)), chunk_overlap=150
        )
splits = text_splitter.split_documents(docs)
return splits

@@ -56,4 +61,3 @@ def _get_json_chunks(self):
        splitter = RecursiveJsonSplitter(max_chunk_size=int(os.getenv("CHUNK_SIZE", 1500)))
json_chunks = splitter.create_documents(texts=[json_data])
return json_chunks
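For context, a sketch of the loader-to-splitter pipeline this FileHandler wraps; the file names are hypothetical:

from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader

docs = Docx2txtLoader("report.docx").load()  # list of Document objects
slides = UnstructuredPowerPointLoader("deck.pptx").load()
# Either list can be handed to _process_doc_to_chunks(), which splits
# the text while keeping the metadata used for chunk-to-page mapping.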

@@ -109,7 +109,7 @@ def extract_entities_and_relations(self, chunk):
" }, {...}\n"
"]"
)

USER_PROMPT = f"context: ```{chunk}``` \n\n output: "

chat_session = self.genai_client.start_chat(history=[])
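The method above streams the chunk to the model through a chat session and expects a JSON list of triples back. A hedged sketch of that call pattern, assuming the google-generativeai client; the model name, key handling, and sample chunk are assumptions, not taken from the commit:

import google.generativeai as genai

genai.configure(api_key="YOUR_KEY")  # hypothetical key handling
model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model name

chunk = "Alice founded Acme. Acme is headquartered in Berlin."
USER_PROMPT = f"context: ```{chunk}``` \n\n output: "

chat_session = model.start_chat(history=[])
response = chat_session.send_message(USER_PROMPT)
print(response.text)  # expected: a JSON list of node_1/node_2/edge objects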
34 changes: 22 additions & 12 deletions Project/backend/codebase/graph_creator/services/netx_graphdb.py
@@ -36,12 +36,18 @@ def create_graph_from_df(self, data: pd.DataFrame, chunks: dict) -> nx.Graph:

chunk_id = edge["chunk_id"]
page_number = chunk_to_page[int(chunk_id)]
if isinstance(page_number, int):
page_number += 1

# Add nodes with page attribute
if edge["node_1"] not in graph:
graph.add_node(edge["node_1"], pages=set([]), topic=edge["topic_node_1"])
graph.add_node(
edge["node_1"], pages=set([]), topic=edge["topic_node_1"]
)
if edge["node_2"] not in graph:
graph.add_node(edge["node_2"], pages=set([]), topic=edge["topic_node_2"])
graph.add_node(
edge["node_2"], pages=set([]), topic=edge["topic_node_2"]
)

# Add edge with attributes to the graph
graph.add_edge(edge["node_1"], edge["node_2"], relation=edge["edge"])
@@ -128,7 +134,7 @@ def _get_graph_file_path_local_storage(graph_job_id: uuid.UUID) -> str:

@staticmethod
def _graph_bfs_edges(
graph: nx.Graph, graph_job: GraphJob, node: str, adj_depth: int
) -> GraphVisData:
nodes_data = []
edges_data = []
@@ -170,14 +176,16 @@ def _graph_bfs_edges(
)
)

return GraphVisData(
document_name=graph_job.name,
graph_created_at=graph_job.updated_at,
nodes=nodes_data,
edges=edges_data,
)

@staticmethod
def _all_graph_data_for_visualization(
graph: nx.Graph, graph_job: GraphJob
) -> GraphVisData:
nodes_data = []
edges_data = []
@@ -208,7 +216,9 @@ def _all_graph_data_for_visualization(
)
)

return GraphVisData(
document_name=graph_job.name,
graph_created_at=graph_job.updated_at,
nodes=nodes_data,
edges=edges_data,
)
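A toy sketch of the node and edge schema create_graph_from_df builds, together with the BFS primitive that _graph_bfs_edges appears to rely on; all attribute values here are invented:

import networkx as nx

graph = nx.Graph()
graph.add_node("Alice", pages=set(), topic="people")  # invented attributes
graph.add_node("Acme", pages=set(), topic="companies")
graph.add_edge("Alice", "Acme", relation="founded")
graph.nodes["Alice"]["pages"].add(4)  # pages are 1-based after the +1 shift

# nx.bfs_edges walks outward from a start node up to a depth limit,
# which is the standard primitive for an adj_depth-bounded traversal.
for u, v in nx.bfs_edges(graph, "Alice", depth_limit=2):
    print(u, "->", v, graph.edges[u, v]["relation"])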
6 changes: 3 additions & 3 deletions Project/backend/codebase/tests/test_graph_handler.py
@@ -1,7 +1,6 @@
from graph_creator.services.llm.llama_gemini_combination import llama_gemini_combination
from graph_creator import graph_handler
from unittest.mock import patch

import json
import pandas as pd
@@ -74,6 +73,7 @@ def test_relation_extraction_from_llm_entity_not_in_lists():
# Assert
assert relation is None


def test_component_connection_with_llm(mocker):
"""
Tests if component combination with llm works
@@ -89,7 +89,7 @@ def test_component_connection_with_llm(mocker):
]
"""

patcher = patch("graph_creator.services.llm.llama_gemini_combination")
MockLlama3 = patcher.start()
mock_instance = MockLlama3.return_value

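The test drives the patch manually via patcher.start(). A generic sketch of that pattern; the stubbed method name is hypothetical:

from unittest.mock import patch

patcher = patch("graph_creator.services.llm.llama_gemini_combination")
MockCombination = patcher.start()
MockCombination.return_value.check_for_connecting_relation.return_value = "[]"
try:
    pass  # exercise the code under test here
finally:
    patcher.stop()  # a manually started patcher must be stopped explicitly

With pytest-mock, mocker.patch(...) achieves the same thing and undoes itself at test teardown automatically, which is why the test function accepts the mocker fixture.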
6 changes: 1 addition & 5 deletions Project/frontend/src/components/App/index.css
@@ -59,8 +59,6 @@ img {
gap: 10px;
}

.main_wrapper {
display: flex;
flex-direction: column;
@@ -71,13 +69,11 @@
min-height: 100%;
}

.Appcontainer {
display: flex;
flex-direction: column;
align-items: center;
gap: 20px;
min-width: 100%;
min-height: 100%;
}
}
8 changes: 3 additions & 5 deletions Project/frontend/src/components/App/index.tsx
@@ -17,10 +17,10 @@ import {
} from '@mui/material';

import logo from '../../assets/team-logo.png';
import Graph from '../Graph_page/GraphVisualization';
import UploadPage from '../UploadPage';
import LandingPage from '../LandingPage';

import Navbar from '../Navbar/Navbar';
import './index.css';

const theme = createTheme({
@@ -46,9 +46,7 @@ function App() {
px={2}
py={1}
>
<NavLink to="/">
<img src={logo} alt="Logo" className="logo" />
</NavLink>
<Navbar />
<Typography variant="h6">Graph Masters</Typography>
<NavLink to="/" style={{ textDecoration: 'none' }}>
<Typography variant="h6" sx={{ color: 'white' }}>