Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
Signed-off-by: Nikolas Rauscher <[email protected]>
  • Loading branch information
nikolas-rauscher committed Jul 3, 2024
2 parents 8979f38 + 75c3bdc commit a3227d4
Show file tree
Hide file tree
Showing 34 changed files with 1,489 additions and 935 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ bin/

# MacOS
.DS_Store
# embeddings
Project/backend/codebase/embeddings/*
41 changes: 41 additions & 0 deletions Documentation/llmExtractionMeasurments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Measurements of running Graph creator
# india health article

llama3, chunk_size=1500, chunk_overlap=150, 30 calls/min, 60 sec wait

gemini-pro, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait

gemini-flash, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait

# Measurements as averages over all llm calls
# two kinds of prompts used: extract_entities_and_relations and check_for_connecting_relation


# Execution speed of prompts by llm model

gemini-flash: 12.75s (10 extraction requests) per request extracting, 1.57s connecting

gemini: 23.54s (10 extraction requests) per request extracting, 2.37s connecting

groq+llama3: 0.72s (10 extraction requests) per request extracting, 0.48s connecting

---------------------------------------

# Statistics on the number of extracted entities by llm model

llama: 3078 tokens / 1770 words -> 177 / 180 entities (34 / 47 connecting requests)

gemini: 3078 tokens / 1770 words -> 303 / 316 entities (35 connecting requests)

gemini-flash: 3078 tokens / 1770 words -> 309 / 369 entities (28 connecting requests)


-----------------------------------------

# Duration of knowledge graph extraction by llm model

gemini-flash: 127.5s for entity extraction and 105s for connecting

gemini: 212s for entity extraction and 189s for connecting

groq+llama3: 7.9s for entity extraction and 136s for connecting
1 change: 1 addition & 0 deletions Project/backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ DEBUG=True
ALLOWED_HOSTS=*
CSRF_TRUSTED_ORIGINS='*'
CORS_ALLOWED_ORIGINS='*'
CHUNK_SIZE=1500

# Database
POSTGRES_USER=amos
Expand Down
57 changes: 29 additions & 28 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ def get_top_n_central_nodes(centrality_dict, n):
Returns:
Sorted list of top N nodes with their centrality values.
"""
# sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
# return sorted_nodes[:n]
sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
return sorted_nodes[:n]
return [node for node, _ in sorted_nodes[:n]]

def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Expand All @@ -30,6 +32,10 @@ def analyze_graph_structure(G):
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges

# Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes

if num_nodes == 0 or num_edges == 0:
raise ValueError("The graph is empty or not properly constructed.")

Expand All @@ -48,7 +54,7 @@ def analyze_graph_structure(G):

# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
"""
- Betweenness Centrality: Measures node's control over information flow
- Nodes with high betweenness centrality are important in the network
Expand Down Expand Up @@ -86,36 +92,31 @@ def analyze_graph_structure(G):
"""

graph_info = {
"num_nodes": num_nodes,
"num_edges": num_edges,
"top_degree_centrality": get_top_n_central_nodes(degree_centrality, top_n),
"top_betweenness_centrality": get_top_n_central_nodes(betweenness_centrality, top_n),
"top_eigenvector_centrality": get_top_n_central_nodes(eigenvector_centrality, top_n)
}

return graph_info

def print_graph_info(graph_info):
"""Prints the graph information in a formatted and readable way.
# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

Args:
graph_info: A dictionary containing information about the graph's structure.
"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
print(json.dumps(graph_info, indent=4))

Examples: 4 nodes are connected
0
/ | \
2--1--3
graph_directory = os.fsencode("../.media/graphs/")
- Here, nodes 0 and 1 (1.0) have the highest closeness centrality because they are connected to all other nodes (nodes 2 and 3 = 0.75)
- Closeness Centrality shows the average distance of a node to all other nodes in the network
"""
n = 20 # Number of top nodes to return
# Calculate centrality measures
degree_centrality = get_top_n_central_nodes(nx.degree_centrality(G), n)
betweenness_centrality = get_top_n_central_nodes(nx.betweenness_centrality(G), n)
eigenvector_centrality = get_top_n_central_nodes(nx.eigenvector_centrality(G), n)
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)

# Find intersection of top nodes from all measures (set intersection)
all_centrality_nodes = set(degree_centrality) & set(betweenness_centrality) & set(eigenvector_centrality) & set(closeness_centrality)

top_n = int(input("Enter the number of top nodes to display: "))
top_nodes = list(all_centrality_nodes)[:6]

with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
for entry in it:
if entry.name.endswith("c.gml") and entry.is_file():
print("-----------------------")
print(f"Filename: {entry.name}")
graph = nx.read_gml(entry.path)
graph_info = analyze_graph_structure(graph)
print_graph_info(graph_info)
return top_nodes
Loading

0 comments on commit a3227d4

Please sign in to comment.