-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandomGraph_RW.py
181 lines (159 loc) · 6.77 KB
/
randomGraph_RW.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 8 14:01:12 2024
generate random graphs by computing node label disributions and random walks
Ensure each sample is connected by overlapping multiple random walks
Check the resulting node degree for similarity with respect to original sample
@author: hari
"""
from torch import where, tensor, ones, zeros, cat
from torch import outer, diag, sparse_coo_tensor , all
from torch import Tensor, Size
from torch.nn.functional import normalize
from torch.distributions.multinomial import Multinomial
# dataset
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data # graph object
#from torch_geometric.transforms import LargestConnectedComponents
from scipy.sparse.csgraph import connected_components
# utility
from torch_geometric.utils import to_undirected, to_networkx, to_scipy_sparse_matrix
from utility import selectNodeFeats # fix poor one-hot encoding
# graphics
import networkx as nx
from matplotlib import pyplot as plt
#%% utility functions
# TODO move to utility module with dependencies
def visualize_graph(graph:Data, color, title=''):
fig, axs = plt.subplots(1,1)
m = graph.num_edges
if title:
fig.suptitle(title+f" ({m} edges)")
# network structure
G = to_networkx(graph, to_undirected=True)
nx.draw_networkx(G, pos=nx.spring_layout(G, seed=42),
with_labels=True, node_size=100,
node_color=color, cmap="Set2", ax=axs)
# adjacency matrix is not meaningful (permutations)
fig.show()
def runRandWalks(number:int, length:int,
Pr:Tensor, degs_ref:Tensor, degs_now:Tensor) :
# TODO check that Pr dimension is geq than number of degs
assert degs_ref.shape == degs_now.shape
# walk start from uncon. nodes with degree different from original
Pr_start = degs_ref-degs_now
Pr_start[where(Pr_start<0)]=0 # stop degree overflow
i = where(Multinomial(number, Pr_start).sample()>0)[0]
# init random walk
sample = Multinomial(1,Pr[i,:]).sample()
_, j = where(sample>0)
path = cat((i.reshape([1,-1]),j.reshape([1,-1])), dim=0)
# cannot walk backwards
batch_inv = cat((j.reshape([1,-1]),i.reshape([1,-1])), dim=0)
Pr[tuple(path)]=1e-3
Pr[tuple(batch_inv)]=1e-3
# complete the rw short of one step
for k in range(len_rw-1):
# next step in the walk
i=j.clone().detach()
# rw step
sample = Multinomial(1,Pr[i,:]).sample()
_, j = where(sample>0)
batch = cat((i.reshape([1,-1]),j.reshape([1,-1])), dim=0)
path = cat((path, batch),dim=1)
# cannot walk backwards
batch_inv = cat((j.reshape([1,-1]),i.reshape([1,-1])), dim=0)
Pr[tuple(batch)]=1e-3
Pr[tuple(batch_inv)]=1e-3
# sanity check
try:
assert path.shape == Size([2,number*length])
except:
print(path.shape)
path_rw = path.clone().detach()
# connected should have less proba to be chosen as starting point
degs_now[path_rw.unique()] += 1
degs_rw = degs_now.clone().detach()
# return values for next iterations
return path_rw, degs_rw
# %% load dataset and fix poor encoding
dataAlias = 'DHFR'
transform= selectNodeFeats(tensor([ 0, 5, 6, 7, 8, 15, 16, 34, 52]))
dataset = TUDataset(root='tmp', name=dataAlias, transform= transform)
# x shape is not updated BUT when you access the data, the transform is applied
# %% compute node labels probabilities (for a graph)
ig = 500
graph = dataset[ig]
visualize_graph(graph, '#d0d0d0', f"DHFR {ig}")
""" computation of label distribution (discrete)
use the edge index to create two stacks of one-hot encoded node labels
from the node features matrix: let A be the one generated from all edge sources
and B the stack generated form the destination of the edges.
Then the distribution is a simple matrix multiplication.
"""
E0 = graph.x[graph.edge_index[0,:]]
E1 = graph.x[graph.edge_index[1,:]]
labelDistribution = E0.t()@E1 # the result is symmetric
# create corresponding probability matrix for each node based on the label
Ldist = graph.x @ labelDistribution @ graph.x.t()
# enforce null probability for self loops AND normalize
Pr_NL = normalize(Ldist-diag(Ldist.diag()), dim=1)
assert all(Pr_NL>=0) and all(Pr_NL<=1)
#%% node degree probability (Chung-Lu)
m = graph.num_edges
# adjacency matrix (sparse)
A = sparse_coo_tensor(graph.edge_index, ones(m))
# compute degrees (convert to dense afterwards, needed for outer product)
degs= A.sum(dim=0).to_dense()
# Chung-Lu probabilities: deg_u*deg_v (con be writen as outer product)
Pdist = outer(degs,degs)
# remove self loops
Pdist = Pdist - diag(Pdist.diag())
# FIXME normalize Chung-Lu formula
Pr_CL = normalize(Pdist, dim=1)
assert all(Pr_CL>=0) and all(Pr_CL<=1)
# %% generate new random sample with n rw of specified length
num_rw = graph.num_nodes* 1//2
len_rw = 1
# initialize graph (preserve node labels)
graph_sample = Data(x=graph.x)
# rw modifies the probability of visited nodes
Pr_rw = normalize(Pr_NL, dim=1).clone().detach()
# LOOP2 Monte Carlo to ensure exactly 1 connected component
cc_sample = graph_sample.num_nodes
while cc_sample>1:
# init degs to zero
degs_rw = zeros(degs.shape)
# DEBUG fnction run rw
number = num_rw; length = len_rw; Pr = Pr_rw.clone().detach(); degs_ref = degs.clone().detach(); degs_now= degs_rw.clone().detach()
path_rw, degs_rw = runRandWalks(num_rw,len_rw,Pr_rw,degs,degs_rw)
# add RW path to sto_scipy_sparse_matrixample graph
graph_sample.update({'edge_index':path_rw})
# LOOP1 until one connected component
while graph_sample.has_isolated_nodes():
# DEBUG fnction run rw
number = num_rw; length = len_rw; Pr = Pr_rw.clone().detach(); degs_ref = degs.clone().detach(); degs_now= degs_rw.clone().detach()
path_rw, degs_rw = runRandWalks(num_rw,len_rw,Pr_rw,degs,degs_rw)
# add RW path to sample graph
new_edge_index = cat((graph_sample.edge_index, path_rw), dim=1)
graph_sample.update({'edge_index':new_edge_index})
# (LOOP2) assert connected components
adj = to_scipy_sparse_matrix(graph_sample.edge_index)
cc_sample, _ = connected_components(adj, directed=False)
# cast resulting sample to undirected graph
undir_index = to_undirected(graph_sample.edge_index)
graph_sample = Data(x=graph.x, edge_index=undir_index)
#%% sanity check
""" idea
because of the encoding, there are al lot of node labels that never appear.
these sould not be present in any random sample
"""
# self loops should not be present
assert not graph_sample.has_self_loops()
# there should be only one connected component
assert not graph_sample.has_isolated_nodes()
# TODO other ?
# check result visually
A_sample = sparse_coo_tensor(undir_index, ones(undir_index.shape[1]))
visualize_graph(graph_sample, '#a0d0a0', 'RW')