Skip to content

Commit

Permalink
Refactor Attribute Embedder Construction (typedb#110)
Browse files Browse the repository at this point in the history
## What is the goal of this PR?

Reduce the complexity of the code for embedding Concepts

## What are the changes implemented in this PR?

- Remove an overly general `configure_embedders` function
- Make the embedder construction for categorical vs continuous vs non-attribute more explicit in the pipeline
- Make KGCN model properly composed by passing in a ThingEmbedder and a RoleEmbedder
- Move all code related to embedding Concepts into one place
  • Loading branch information
jmsfltchr authored and Ganeshwara Herawan Hananda committed Dec 6, 2019
1 parent 86b49b0 commit 117e89a
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 136 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ __pycache__/

# Data input/output directories
dataset/
kglib/kgcn/examples/diagnosis/events/
9 changes: 6 additions & 3 deletions kglib/kgcn/learn/learn_IT.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from kglib.kgcn.learn.learn import KGCNLearner
from kglib.kgcn.models.attribute import BlankAttribute
from kglib.kgcn.models.core import KGCN
from kglib.kgcn.models.embedding import ThingEmbedder, RoleEmbedder


class ITKGCNLearner(unittest.TestCase):
Expand All @@ -47,10 +48,12 @@ def test_learner_runs(self):
target_graph.add_node(2, type='company', features=np.array([0, 1, 0], dtype=np.float32))
target_graph.graph['features'] = np.zeros(5, dtype=np.float32)

attr_embedding_dim = 6
attr_embedders = {lambda: BlankAttribute(attr_embedding_dim): [0, 1, 2]}
thing_embedder = ThingEmbedder(node_types=['person', 'employment', 'employee'], type_embedding_dim=5,
attr_embedding_dim=6, categorical_attributes={}, continuous_attributes={})

kgcn = KGCN(3, 2, 5, attr_embedding_dim, attr_embedders, edge_output_size=3, node_output_size=3)
role_embedder = RoleEmbedder(num_edge_types=2, type_embedding_dim=5)

kgcn = KGCN(thing_embedder, role_embedder, edge_output_size=3, node_output_size=3)

learner = KGCNLearner(kgcn, num_processing_steps_tr=2, num_processing_steps_ge=2)

Expand Down
10 changes: 10 additions & 0 deletions kglib/kgcn/models/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ py_test(
]
)

py_test(
name = "embedding_IT",
srcs = [
"embedding_IT.py"
],
deps = [
"models"
]
)

py_test(
name = "typewise_test",
srcs = [
Expand Down
32 changes: 7 additions & 25 deletions kglib/kgcn/models/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,12 @@
# under the License.
#

from functools import partial

import numpy as np
import sonnet as snt
from graph_nets import modules
from graph_nets import utils_tf
from graph_nets.modules import GraphIndependent

from kglib.kgcn.models.embedding import common_embedding, node_embedding


def softmax(x):
return np.exp(x) / np.sum(np.exp(x))
Expand Down Expand Up @@ -81,23 +77,18 @@ class KGCN(snt.AbstractModule):
"""

def __init__(self,
num_node_types,
num_edge_types,
type_embedding_dim,
attr_embedding_dim,
attr_embedders,
thing_embedder,
role_embedder,
edge_output_size=3,
node_output_size=3,
latent_size=16,
num_layers=2,
name="KGCN"):
super(KGCN, self).__init__(name=name)

self._num_node_types = num_node_types
self._num_edge_types = num_edge_types
self._type_embedding_dim = type_embedding_dim
self._attr_embedding_dim = attr_embedding_dim
self._attr_embedders = attr_embedders
self._thing_embedder = thing_embedder
self._role_embedder = role_embedder

self._latent_size = latent_size
self._num_layers = num_layers

Expand All @@ -117,21 +108,12 @@ def __init__(self,
self._output_transform = modules.GraphIndependent(edge_fn, node_fn, None)

def _edge_model(self):
common_embedding_module = snt.Module(
partial(common_embedding, num_types=self._num_edge_types,
type_embedding_dim=self._type_embedding_dim)
)

return snt.Sequential([common_embedding_module,
return snt.Sequential([self._role_embedder,
snt.nets.MLP([self._latent_size] * self._num_layers, activate_final=True),
snt.LayerNorm()])

def _node_model(self):
node_embedding_module = snt.Module(
partial(node_embedding, num_types=self._num_node_types, type_embedding_dim=self._type_embedding_dim,
attr_encoders=self._attr_embedders, attr_embedding_dim=self._attr_embedding_dim)
)
return snt.Sequential([node_embedding_module,
return snt.Sequential([self._thing_embedder,
snt.nets.MLP([self._latent_size] * self._num_layers, activate_final=True),
snt.LayerNorm()])

Expand Down
10 changes: 8 additions & 2 deletions kglib/kgcn/models/core_IT.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from graph_nets.graphs import GraphsTuple

from kglib.kgcn.models.core import KGCN
from kglib.kgcn.models.embedding import ThingEmbedder, RoleEmbedder


class ITKGCN(unittest.TestCase):
Expand All @@ -39,8 +40,13 @@ def test_kgcn_runs(self):
n_node=tf.convert_to_tensor(np.array([3], dtype=np.int32)),
n_edge=tf.convert_to_tensor(np.array([2], dtype=np.int32)))

attr_embedders = {lambda: lambda x: tf.constant(np.zeros((3, 6), dtype=np.float32)): [0, 1, 2]}
kgcn = KGCN(3, 2, 5, 6, attr_embedders, edge_output_size=3, node_output_size=3)
thing_embedder = ThingEmbedder(node_types=['a', 'b', 'c'], type_embedding_dim=5, attr_embedding_dim=6,
categorical_attributes={'a': ['a1', 'a2', 'a3'], 'b': ['b1', 'b2', 'b3']},
continuous_attributes={'c': (0, 1)})

role_embedder = RoleEmbedder(num_edge_types=2, type_embedding_dim=5)

kgcn = KGCN(thing_embedder, role_embedder, edge_output_size=3, node_output_size=3)

kgcn(graph, 2)

Expand Down
103 changes: 98 additions & 5 deletions kglib/kgcn/models/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,51 @@

import tensorflow as tf
import sonnet as snt

from kglib.kgcn.models.attribute import CategoricalAttribute, ContinuousAttribute, BlankAttribute
from kglib.kgcn.models.typewise import TypewiseEncoder


def common_embedding(features, num_types, type_embedding_dim):
class ThingEmbedder(snt.AbstractModule):
    """Embeds Thing (node) features: a learned type embedding concatenated with a
    per-type attribute-value embedding.

    Args:
        node_types: Ordered list of node type labels; a node's type is encoded as
            its index into this list
        type_embedding_dim: Output dimension of the learned type embedding
        attr_embedding_dim: Output dimension of the attribute-value embedding
        categorical_attributes: Dict of attribute type label -> list of categories,
            or None if there are no categorical attributes
        continuous_attributes: Dict of attribute type label -> (min, max) range,
            or None if there are no continuous attributes
    """

    def __init__(self, node_types, type_embedding_dim, attr_embedding_dim, categorical_attributes,
                 continuous_attributes, name="ThingEmbedder"):
        super(ThingEmbedder, self).__init__(name=name)

        self._node_types = node_types
        self._type_embedding_dim = type_embedding_dim
        self._attr_embedding_dim = attr_embedding_dim

        # Normalise None to an empty dict: construct_non_attribute_embedders calls
        # .keys() on both dicts, so passing None through would raise AttributeError
        categorical_attributes = categorical_attributes or {}
        continuous_attributes = continuous_attributes or {}

        # Create embedders for the different attribute types
        self._attr_embedders = dict()

        if categorical_attributes:
            self._attr_embedders.update(
                construct_categorical_embedders(node_types, attr_embedding_dim, categorical_attributes))

        if continuous_attributes:
            self._attr_embedders.update(
                construct_continuous_embedders(node_types, attr_embedding_dim, continuous_attributes))

        self._attr_embedders.update(
            construct_non_attribute_embedders(node_types, attr_embedding_dim, categorical_attributes,
                                              continuous_attributes))

    def _build(self, features):
        # Column 0 of features carries type/pre-existence information (consumed by
        # embed_type); the remaining columns carry attribute values (embed_attribute)
        return tf.concat([embed_type(features, len(self._node_types), self._type_embedding_dim),
                          embed_attribute(features, self._attr_embedders, self._attr_embedding_dim)], axis=1)


class RoleEmbedder(snt.AbstractModule):
    """Embeds role (edge) features as a learned type embedding.

    Args:
        num_edge_types: Number of distinct edge (role) types
        type_embedding_dim: Output dimension of the learned type embedding
    """

    def __init__(self, num_edge_types, type_embedding_dim, name="RoleEmbedder"):
        super(RoleEmbedder, self).__init__(name=name)
        self._type_count = num_edge_types
        self._embedding_dim = type_embedding_dim

    def _build(self, features):
        # Roles carry no attribute values, so the type embedding is the whole embedding
        return embed_type(features, self._type_count, self._embedding_dim)


def embed_type(features, num_types, type_embedding_dim):
preexistance_feat = tf.expand_dims(tf.cast(features[:, 0], dtype=tf.float32), axis=1)
type_embedder = snt.Embed(num_types, type_embedding_dim)
norm = snt.LayerNorm()
Expand All @@ -31,13 +72,65 @@ def common_embedding(features, num_types, type_embedding_dim):
return tf.concat([preexistance_feat, type_embedding], axis=1)


def attribute_embedding(features, attr_encoders, attr_embedding_dim):
def embed_attribute(features, attr_encoders, attr_embedding_dim):
    """Embed attribute values using a type-specific encoder per attribute type.

    Args:
        features: Feature tensor; column 0 holds type information and the remaining
            columns hold attribute values
        attr_encoders: Dict of embedder factory -> list of type indices it encodes
        attr_embedding_dim: Output dimension of the attribute embedding

    Returns:
        The attribute embedding tensor (also recorded as a summary histogram)
    """
    encoder = TypewiseEncoder(attr_encoders, attr_embedding_dim)
    # Skip column 0: it carries type information, not attribute values
    attribute_values = features[:, 1:]
    embedding = encoder(attribute_values)
    tf.summary.histogram('attribute_embedding_histogram', embedding)
    return embedding


def node_embedding(features, num_types, type_embedding_dim, attr_encoders, attr_embedding_dim):
return tf.concat([common_embedding(features, num_types, type_embedding_dim),
attribute_embedding(features, attr_encoders, attr_embedding_dim)], axis=1)
def construct_categorical_embedders(node_types, attr_embedding_dim, categorical_attributes):
    """Build embedder factories for categorical attribute types.

    Args:
        node_types: Ordered list of node type labels; indices into this list identify types
        attr_embedding_dim: Output dimension of each attribute embedder
        categorical_attributes: Dict of attribute type label -> list of categories

    Returns:
        Dict of zero-argument embedder factory -> list of node type indices it encodes
    """
    attr_embedders = dict()

    # Construct attribute embedders
    for attribute_type, categories in categorical_attributes.items():

        attr_typ_index = node_types.index(attribute_type)

        # Bind the loop variables as default arguments: closures defined in a loop are
        # late-binding, so without this every factory would see the values of the FINAL
        # iteration when it is eventually invoked (by TypewiseEncoder)
        def make_embedder(attribute_type=attribute_type, categories=categories):
            return CategoricalAttribute(len(categories), attr_embedding_dim,
                                        name=attribute_type + '_cat_embedder')

        # Record the embedder, and the index of the type that it should encode
        attr_embedders[make_embedder] = [attr_typ_index]

    return attr_embedders


def construct_continuous_embedders(node_types, attr_embedding_dim, continuous_attributes):
    """Build embedder factories for continuous attribute types.

    Args:
        node_types: Ordered list of node type labels; indices into this list identify types
        attr_embedding_dim: Output dimension of each attribute embedder
        continuous_attributes: Dict of attribute type label -> (min, max) range; only the
            keys are used here

    Returns:
        Dict of zero-argument embedder factory -> list of node type indices it encodes
    """
    attr_embedders = dict()

    # Construct attribute embedders
    for attribute_type in continuous_attributes.keys():

        attr_typ_index = node_types.index(attribute_type)

        # Bind the loop variable as a default argument: closures defined in a loop are
        # late-binding, so without this every factory would name its module after the
        # FINAL attribute type when it is eventually invoked (by TypewiseEncoder)
        # NOTE(review): the '_cat_embedder' suffix looks copy-pasted from the categorical
        # case — kept byte-identical here to avoid changing TF variable scope names
        def make_embedder(attribute_type=attribute_type):
            return ContinuousAttribute(attr_embedding_dim, name=attribute_type + '_cat_embedder')

        # Record the embedder, and the index of the type that it should encode
        attr_embedders[make_embedder] = [attr_typ_index]

    return attr_embedders


def construct_non_attribute_embedders(node_types, attr_embedding_dim, categorical_attributes, continuous_attributes):
    """Build the do-nothing embedder factory covering all non-attribute node types.

    All entities and relations (non-attributes) also need an embedder with matching
    output dimension, which does nothing, so that every node type index is covered.

    Args:
        node_types: Ordered list of node type labels
        attr_embedding_dim: Output dimension, matching the real attribute embedders
        categorical_attributes: Dict of categorical attribute label -> categories, or None
        continuous_attributes: Dict of continuous attribute label -> range, or None

    Returns:
        Dict with at most one entry: a BlankAttribute factory -> list of indices of all
        non-attribute node types. Empty if every node type is an attribute type.
    """
    # Treat None the same as "no attributes of this kind"; callers (e.g. ThingEmbedder)
    # may legitimately pass None for an absent attribute category
    attribute_names = list((categorical_attributes or {}).keys())
    attribute_names.extend((continuous_attributes or {}).keys())

    # Indices of node types that are not attribute types ('node_type' avoids
    # shadowing the builtin 'type')
    non_attribute_nodes = [i for i, node_type in enumerate(node_types)
                           if node_type not in attribute_names]

    def make_blank_embedder():
        return BlankAttribute(attr_embedding_dim)

    attr_embedders = dict()

    if non_attribute_nodes:
        attr_embedders[make_blank_embedder] = non_attribute_nodes
    return attr_embedders
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,37 @@
# specific language governing permissions and limitations
# under the License.
#

import unittest

from kglib.kgcn.pipeline.pipeline import configure_embedders
from kglib.kgcn.models.embedding import construct_categorical_embedders, construct_continuous_embedders, \
construct_non_attribute_embedders


def construct_embedders(node_types, attr_embedding_dim, categorical_attributes, continuous_attributes):
    """Assemble the full embedder mapping (categorical + continuous + blank),
    mirroring how ThingEmbedder builds its internal embedder dict."""
    embedders = {}

    if categorical_attributes is not None:
        categorical = construct_categorical_embedders(node_types, attr_embedding_dim, categorical_attributes)
        embedders.update(categorical)

    if continuous_attributes is not None:
        continuous = construct_continuous_embedders(node_types, attr_embedding_dim, continuous_attributes)
        embedders.update(continuous)

    non_attribute = construct_non_attribute_embedders(node_types, attr_embedding_dim, categorical_attributes,
                                                      continuous_attributes)
    embedders.update(non_attribute)
    return embedders


class TestConfigureEmbedders(unittest.TestCase):
class TestConstructingEmbedders(unittest.TestCase):

def test_all_types_encoded(self):
node_types = ['a', 'b', 'c']
attr_embedding_dim = 5
categorical_attributes = {'a': ['option1', 'option2']}
continuous_attributes = {'b': (0, 1)}
attr_embedders = configure_embedders(node_types, attr_embedding_dim, categorical_attributes, continuous_attributes)

attr_embedders = construct_embedders(node_types, attr_embedding_dim, categorical_attributes,
continuous_attributes)
all_types = [l for el in list(attr_embedders.values()) for l in el]

expected_types = [0, 1, 2]
Expand All @@ -40,7 +58,10 @@ def test_multiple_categorical_embedders(self):
attr_embedding_dim = 5
categorical_attributes = {'a': ['option1', 'option2'], 'c': ['option3', 'option4']}
continuous_attributes = {'b': (0, 1)}
attr_embedders = configure_embedders(node_types, attr_embedding_dim, categorical_attributes, continuous_attributes)

attr_embedders = construct_embedders(node_types, attr_embedding_dim, categorical_attributes,
continuous_attributes)

all_types = [l for el in list(attr_embedders.values()) for l in el]
all_types.sort()

Expand All @@ -51,3 +72,7 @@ def test_multiple_categorical_embedders(self):

for types in attr_embedders.values():
self.assertNotEqual(types, [])


if __name__ == "__main__":
unittest.main()
40 changes: 4 additions & 36 deletions kglib/kgcn/models/embedding_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@
import tensorflow as tf
from unittest.mock import Mock
from unittest.mock import patch
from kglib.kgcn.models.embedding import common_embedding, attribute_embedding, node_embedding
from kglib.kgcn.models.embedding import embed_type, embed_attribute
from kglib.utils.test.utils import get_call_args


class TestCommonEmbedding(unittest.TestCase):
class TestTypeEmbedding(unittest.TestCase):
def setUp(self):
tf.enable_eager_execution()

def test_embedding_output_shape_as_expected(self):
features = np.array([[1, 0, 0.7], [1, 2, 0.7], [0, 1, 0.5]], dtype=np.float32)
type_embedding_dim = 5
output = common_embedding(features, 3, type_embedding_dim)
output = embed_type(features, 3, type_embedding_dim)

np.testing.assert_array_equal(np.array([3, 6]), output.shape)

Expand All @@ -54,7 +54,7 @@ def test_embedding_is_typewise(self):
attr_encoders = Mock()
attr_embedding_dim = Mock()

attribute_embedding(features, attr_encoders, attr_embedding_dim) # Function under test
embed_attribute(features, attr_encoders, attr_embedding_dim) # Function under test

mock_class.assert_called_once_with(attr_encoders, attr_embedding_dim)
call_args = get_call_args(mock_instance)
Expand All @@ -64,37 +64,5 @@ def test_embedding_is_typewise(self):
patcher.stop()


class TestNodeEmbedding(unittest.TestCase):

def setUp(self):
tf.enable_eager_execution()

def test_embedding_is_typewise(self):
features = Mock()
num_types = Mock()
type_embedding_dim = Mock()
attr_encoders = Mock()
attr_embedding_dim = Mock()

mock_attribute_embedding = Mock(return_value=np.ones((3, 5)))

mock_common_embedding = Mock(return_value=np.ones((3, 4)))

patcher_attr = patch('kglib.kgcn.models.embedding.attribute_embedding', spec=True,
new=mock_attribute_embedding)
patcher_attr.start()

patcher_common = patch('kglib.kgcn.models.embedding.common_embedding', spec=True,
new=mock_common_embedding)
patcher_common.start()

embedding = node_embedding(features, num_types, type_embedding_dim, attr_encoders, attr_embedding_dim)

np.testing.assert_array_equal(np.ones((3, 9)), embedding.numpy())

patcher_attr.stop()
patcher_common.stop()


if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit 117e89a

Please sign in to comment.