-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeaturePreparation.py
81 lines (66 loc) · 2.59 KB
/
featurePreparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 15 10:02:21 2024
Feature pre-processing before machine learning operations.
Saves data to binary files for later usage.
1. node features. TODO
2. graph features TODO
@author: hari
"""
import datetime as dt
import os.path as osp

from torch import Size, Tensor, cat, save, stack, tensor
# condensation transform
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset  # debug
from torch_geometric.datasets import TUDataset
from torch_geometric.transforms import BaseTransform
from torch_geometric.transforms import Compose

# store collection of graphs to binary file (InMemoryDataset)
from randomGraph_softSwap import storeDataset
from utility import (onehotDecode, nodeDegree, closenessCentrality,
                     betweennessCentrality, )
# %% load dataset with chosen features
# Transform pipeline built from the helpers imported from `utility`; the
# transforms are chained in the order listed.
composition = Compose(
    [
        onehotDecode(3, tensor(list(range(53)))),
        nodeDegree(),
        closenessCentrality(),
        betweennessCentrality(),
    ]
)
# DHFR benchmark from the TU collection, cached under ./tmp, loaded with its
# node attributes and with the pipeline above attached as `transform`.
dataset = TUDataset(
    root='tmp',
    name='DHFR',
    transform=composition,
    use_node_attr=True,
)
# sanity check: the first graph's node feature matrix must have 7 columns
assert dataset[0].x.size(1) == 7
# %% export graph collection as torch_geometric file (like random collections)
# Materialise the transformed graphs once. `dataset` applies its transform on
# access, so every further pass over it would recompute the node features;
# later steps should read from this list instead.
prepCollection = list(dataset)
# store result as PyG Dataset in a time-stamped folder below 'computed'
rootFolder = 'computed'
outName = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
rootPath = osp.join(rootFolder, outName)
storeDataset(rootPath, prepCollection)
# sanity check: load exported data and check features
# NOTE(review): assumes storeDataset writes <rootPath>/processed/data.pt --
# confirm against randomGraph_softSwap.
checkDataset = InMemoryDataset()
checkDataset.load(osp.join(rootPath, 'processed', 'data.pt'))
# the reloaded collection must contain exactly the graphs that were stored
assert len(checkDataset) == len(prepCollection)
# compare against the already-transformed graphs rather than `dataset[i]`,
# which would run the whole transform pipeline a second time per graph
for i, storedGraph in enumerate(prepCollection):
    assert checkDataset[i].x.shape == storedGraph.x.shape
# %% define condensation transform
def graphFeatures(nodeFeats: Tensor) -> Tensor:
    """
    Condense a node feature matrix into a single graph feature vector.

    Both statistics are reduced along dim 0 (one row per node):
    1. the per-feature sample mean
    2. the per-feature sample standard deviation (torch default, i.e. with
       Bessel's correction), appended after the means

    Parameters
    ----------
    nodeFeats : Tensor
        Floating-point node feature matrix, nodes along dim 0.

    Returns
    -------
    Tensor
        Concatenation ``[mean..., std...]`` along dim 0, i.e. twice as many
        entries as there are feature columns.

    Notes
    -----
    For a graph with a single node the corrected standard deviation is
    undefined (torch returns NaN). A spread of zero is reported instead so
    that downstream learners do not receive NaN features.
    """
    featMean = nodeFeats.mean(dim=0)
    if nodeFeats.shape[0] > 1:
        featStd = nodeFeats.std(dim=0)
    else:
        # same dtype/device as the means, avoids NaN from the n-1 divisor
        featStd = featMean.new_zeros(featMean.shape)
    return cat((featMean, featStd), dim=0)
# %% condense node information to graph-level features
# Both passes below read from `prepCollection` (the transformed graphs
# materialised during the export step) instead of `dataset`: iterating
# `dataset` re-applies the transform pipeline to every graph each time.
# compute graph features: one row of [means, stds] per graph
inputDataset = stack([graphFeatures(graph.x) for graph in prepCollection])
# sanity check: 7 node features -> 7 means + 7 standard deviations
assert inputDataset.shape == Size([len(prepCollection), 2*7])
# get graph labels
outputDataset = stack([graph.y for graph in prepCollection])
# sanity check: one label per graph
assert outputDataset.shape == Size([len(prepCollection), 1])
# %% export dataset to standard torch for traditional machine learning
# FIXME one or two files ? two for the moment
save(inputDataset, rootPath+'_input.pt')
save(outputDataset, rootPath+'_output.pt')