-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearning_based.py
317 lines (280 loc) · 12.1 KB
/
learning_based.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
'''Learning-based method for link prediction'''
__author__ = 'Wenjin Deng'
import pandas as pd
import os
import dgl
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
import random
''' global args '''
A_path = './A_similarity.csv'
B_path = './B_similarity.csv'
A_B_path = './A_B_adjacent.csv'
embed_dim = 256
mid_dim = 1024
out_dim = 512
hidden_edges = 4 # 4,8,16,32,64
max_epoch = 200
neg_node_num = 16 # neg node sample num
rebuild_graph = False
avg_cnt = 3
device=torch.device("cpu")
# if torch.cuda.is_available():
# device=torch.device("cuda")
def get_heterograph_items(data_path, mode, threshold=0.75):
"""
This function is used to get edge pairs.
Parameters:
data_path - Str. graph csv file path
mode - Str. Valid values are 'self' and 'correlation'.
Use 'self' for normalizing and discretizing edge score, further picking edge where score is 1
Use 'correlation' for directly picking edge where score is 1
threshold - Float. edge score = 0 if input < threshold, else 1
Returns:
src - List. Left nodes of Edges
dst - List. Right nodes of Edges
rows_name_list - List. csv rows name
cols_name_list - List. csv cols name
Note:
src contains duplicate elements, and must be subset of rows_name_list.
It's also true for pair (dst, cols_name_list).
"""
assert mode in ['self', 'correlation']
# 1. read csv
matrix = pd.read_csv(data_path)
# 2. get rows name and cols name
rows_name_list = matrix[matrix.columns.tolist()[0]].tolist()
cols_name_list = matrix.columns.tolist()[1:]
src = []
dst = []
# 3. store node pair of edges to src and dst
if mode == 'self':
min_value = min([matrix[name].min() for name in cols_name_list])
max_value = max([matrix[name].max() for name in cols_name_list])
for idx, row in matrix.iterrows():
# print(idx)
# print(row)
for name in cols_name_list:
# 1. normalization (set value to 0-1)
# 2. discretization (round)
# item_val = round(float(row[name])/(max_value-min_value))
item_val = 0 if float(row[name])/(max_value-min_value) < threshold else 1
# 3. record
if item_val == 1:
src.append(rows_name_list[idx])
dst.append(name)
else:
for idx, row in matrix.iterrows():
for name in cols_name_list:
# 3. record
if int(row[name]) == 1:
src.append(rows_name_list[idx])
dst.append(name)
return src, dst, rows_name_list, cols_name_list
# Define a Heterograph Conv model
class RGCN(nn.Module):
"""
copy from dgl offical docs
"""
def __init__(self, in_feats, hid_feats, out_feats, rel_names):
super().__init__()
# 实例化HeteroGraphConv,in_feats是输入特征的维度,out_feats是输出特征的维度,aggregate是聚合函数的类型
self.conv1 = dglnn.HeteroGraphConv({
rel: dglnn.GraphConv(in_feats, hid_feats)
for rel in rel_names}, aggregate='sum')
self.conv2 = dglnn.HeteroGraphConv({
rel: dglnn.GraphConv(hid_feats, out_feats)
for rel in rel_names}, aggregate='sum')
def forward(self, graph, inputs):
# 输入是节点的特征字典
h = self.conv1(graph, inputs)
h = {k: F.relu(v) for k, v in h.items()}
h = self.conv2(graph, h)
return h
# Define a Heterograph DotProduct Predictor
class HeteroDotProductPredictor(nn.Module):
"""
copy from dgl offical docs
"""
def forward(self, graph, h, etype):
# h是从5.1节中对异构图的每种类型的边所计算的节点表示
with graph.local_scope():
graph.ndata['h'] = h
graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
return graph.edges[etype].data['score']
# Define a method to construct negative graph
def construct_negative_graph(graph, k, etype):
"""
copy from dgl offical docs
"""
utype, _, vtype = etype
src, dst = graph.edges(etype=etype)
neg_src = src.repeat_interleave(k)
neg_dst = torch.randint(0, graph.num_nodes(vtype), (len(src) * k,))
return dgl.heterograph(
{etype: (neg_src, neg_dst)},
num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})
# Define a method to normalize node embedding vector
def norm_node_embedding(node_h):
"""
normalize node vector
"""
node_h['A'] = F.normalize(node_h['A'], p=2, dim=1)
node_h['B'] = F.normalize(node_h['B'], p=2, dim=1)
return node_h
# Define a model
class Model(nn.Module):
"""
refer from dgl offical docs
"""
def __init__(self, in_features, hidden_features, out_features, rel_names):
super().__init__()
self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
self.pred = HeteroDotProductPredictor()
def forward(self, g, neg_g, x, etype):
h = self.sage(g, x)
h = norm_node_embedding(h) # if not use this, edge score will be out of range [-1,1]
return self.pred(g, h, etype), self.pred(neg_g, h, etype)
# Define a loss
def compute_loss(pos_score, neg_score):
"""
copy from dgl offical docs
"""
# 间隔损失
n_edges = pos_score.shape[0]
return (1 - pos_score.unsqueeze(1) + neg_score.view(n_edges, -1)).clamp(min=0).mean()
# train
def train(hetero_graph, max_epoch, embed_dim, mid_dim, out_dim, hidden_edges):
# 1. prepare nodes embedding
A_embed = nn.Embedding(hetero_graph.number_of_nodes('A'), embed_dim)
B_embed = nn.Embedding(hetero_graph.number_of_nodes('B'), embed_dim)
A_edge_embed = nn.Embedding(hetero_graph.number_of_edges('A_self'), embed_dim)
B_edge_embed = nn.Embedding(hetero_graph.number_of_edges('B_self'), embed_dim)
AB_edge_embed = nn.Embedding(hetero_graph.number_of_edges('A_and_B'), embed_dim)
hetero_graph.nodes['A'].data['feat'] = A_embed.weight
hetero_graph.nodes['B'].data['feat'] = B_embed.weight
hetero_graph.edges['A_self'].data['feat'] = A_edge_embed.weight
hetero_graph.edges['B_self'].data['feat'] = B_edge_embed.weight
hetero_graph.edges['A_and_B'].data['feat'] = AB_edge_embed.weight
hetero_graph.to(device)
node_features = {'A': hetero_graph.nodes['A'].data['feat'], 'B': hetero_graph.nodes['B'].data['feat']}
# 2. instancing a model
model = Model(embed_dim, mid_dim, out_dim, hetero_graph.etypes).to(device)
model.train()
# 3. prepare an optimizer
opt = torch.optim.Adam(model.parameters())
# 4 training schedule
for epoch in range(max_epoch):
negative_graph = construct_negative_graph(hetero_graph, neg_node_num, ('A', 'A_and_B', 'B'))
pos_score, neg_score = model(hetero_graph, negative_graph, node_features, ('A', 'A_and_B', 'B'))
loss = compute_loss(pos_score, neg_score)
opt.zero_grad()
loss.backward()
opt.step()
print(f'training {epoch+1}//{max_epoch}\tloss:{loss.item()}')
# print(pos_score.shape[0]) # edges num
return model
# eval
def eval(model, hidden_graph):
with torch.no_grad():
model.eval()
# 1. prepare nodes embedding
src_embed = nn.Embedding(hidden_graph.number_of_nodes('A'), embed_dim)
dst_embed = nn.Embedding(hidden_graph.number_of_nodes('B'), embed_dim)
hidden_graph.nodes['A'].data['feat'] = src_embed.weight
hidden_graph.nodes['B'].data['feat'] = dst_embed.weight
hidden_graph.to(device)
hidden_node = {'A':hidden_graph.nodes['A'].data['feat'], 'B':hidden_graph.nodes['B'].data['feat']}
# 2. infer to obtain score list of hidden edges
node_h = model.sage(hidden_graph, hidden_node)
node_h = norm_node_embedding(node_h)
score = model.pred(hidden_graph, node_h, ('A', 'A_and_B', 'B'))
score = score.numpy()
# print(score)
print('src\tdst\tscore')
valid_cnt = 0
for idx, zip_item in enumerate(zip(AB_src_hid,AB_dst_hid)):
src,dst = zip_item
if score[idx][0]>0:
valid_cnt += 1
print(f'{src}\t{dst}\t{score[idx][0]}')
print(f'Pr@k (k={len(AB_src_hid)}): {valid_cnt/len(AB_src_hid)}')
return valid_cnt/len(AB_src_hid)
if __name__== "__main__":
avg_score = 0
for _ in range(avg_cnt):
# 1. prepare items for graph
A_src, A_dst, A_rows_name_list, A_cols_name_list = get_heterograph_items(A_path, 'self')
B_src, B_dst, B_rows_name_list, B_cols_name_list = get_heterograph_items(B_path, 'self')
AB_src, AB_dst, AB_rows_name_list, AB_cols_name_list = get_heterograph_items(A_B_path, 'correlation')
# 2. trans node name to dict for unique id
A_full_dict={}
B_full_dict={}
for id,name in enumerate(AB_rows_name_list):
A_full_dict.update({name: id})
for id,name in enumerate(AB_cols_name_list):
B_full_dict.update({name: id})
# 3. randomly hide edge
if rebuild_graph:
print('rebuild graph task')
AB_src_hid = AB_src
AB_dst_hid = AB_dst
else:
print('link prediction task')
print('random hid...')
hid_pair = random.sample(range(len(AB_src)), hidden_edges)
AB_src_hid = []
AB_dst_hid = []
AB_src_rest = []
AB_dst_rest = []
for idx,zip_item in enumerate(zip(AB_src,AB_dst)):
src, dst = zip_item
if idx in hid_pair:
AB_src_hid.append(src)
AB_dst_hid.append(dst)
else:
AB_src_rest.append(src)
AB_dst_rest.append(dst)
# 4. build rest heterograph (rest = origin - hidden)
if rebuild_graph:
hetero_graph = dgl.heterograph(
{
('A', 'A_self', 'A') : ([A_full_dict['A_'+str(name)] for name in A_src], [A_full_dict['A_'+str(name)] for name in A_dst]),
('B', 'B_self', 'B') : ([B_full_dict['B_'+str(name)] for name in B_src], [B_full_dict['B_'+str(name)] for name in B_dst]),
('A', 'A_and_B', 'B') : ([A_full_dict[name] for name in AB_src],[B_full_dict[name] for name in AB_dst])
}
)
else:
hetero_graph = dgl.heterograph(
{
('A', 'A_self', 'A') : ([A_full_dict['A_'+str(name)] for name in A_src], [A_full_dict['A_'+str(name)] for name in A_dst]),
('B', 'B_self', 'B') : ([B_full_dict['B_'+str(name)] for name in B_src], [B_full_dict['B_'+str(name)] for name in B_dst]),
('A', 'A_and_B', 'B') : ([A_full_dict[name] for name in AB_src_rest],[B_full_dict[name] for name in AB_dst_rest])
}
)
print('training grapha\n', hetero_graph)
# print(hetero_graph.number_of_nodes())
# print(hetero_graph.number_of_edges())
# 5. train
model = train(hetero_graph, max_epoch, embed_dim, mid_dim, out_dim, hidden_edges)
# 6. build hidden graph with hidden edges
if rebuild_graph:
hidden_graph = hetero_graph
else:
hidden_graph = dgl.heterograph(
{
('A', 'A_self', 'A') : ([A_full_dict['A_'+str(name)] for name in A_src], [A_full_dict['A_'+str(name)] for name in A_dst]),
('B', 'B_self', 'B') : ([B_full_dict['B_'+str(name)] for name in B_src], [B_full_dict['B_'+str(name)] for name in B_dst]),
('A', 'A_and_B', 'B') : ([A_full_dict[name] for name in AB_src_hid],[B_full_dict[name] for name in AB_dst_hid])
}
)
print('eval grapha\n', hidden_graph)
# 7. eval
final_score = eval(model, hidden_graph)
avg_score += final_score
print(f'avg_{avg_cnt}: {avg_score/avg_cnt}')