commit 9aa890972b341519afb1339f636d968944f86ecf
parent 194510b0a3c0718ad8137de758e2646d5a4a93e7
Author: Étienne Simon <esimon@esimon.eu>
Date: Wed, 16 Apr 2014 13:09:36 +0200
Add Model
Diffstat:
9 files changed, 340 insertions(+), 64 deletions(-)
diff --git a/dataset.py b/dataset.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python2
+
+import scipy.sparse
+import numpy
+import sys
+import theano
+
+class Dataset(object):
+ def __init__(self, prefix):
+ print >>sys.stderr, '# Loading dataset "{0}"'.format(prefix)
+ with open(prefix+'/embeddings', 'r') as file:
+ self.embeddings = file.readlines()
+ with open(prefix+'/relations', 'r') as file:
+ self.relations = file.readlines()
+ self.number_embeddings = len(self.embeddings)
+ self.number_relations = len(self.relations)
+ self.load_file(prefix, 'train')
+ self.load_file(prefix, 'valid')
+ self.load_file(prefix, 'test')
+
+ def load_file(self, prefix, name):
+ with open('{0}/{1}'.format(prefix, name), 'r') as file:
+ content = map(lambda line: map(int, line.split('\t')), file.readlines())
+ [left, relation, right] = map(list, zip(*content))
+ N = len(relation)
+ setattr(self, name+'_size', N)
+ setattr(self, name+'_right', scipy.sparse.csr_matrix(([1]*N, right, range(N+1)), shape=(N, self.number_embeddings), dtype=theano.config.floatX))
+ setattr(self, name+'_relation', scipy.sparse.csr_matrix(([1]*N, relation, range(N+1)), shape=(N, self.number_relations), dtype=theano.config.floatX))
+ setattr(self, name+'_left', scipy.sparse.csr_matrix(([1]*N, left, range(N+1)), shape=(N, self.number_embeddings), dtype=theano.config.floatX))
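+ # Each row of the three matrices above is a one-hot vector: row i of
+ # name+'_left' has a single 1 in column left[i], so S.dot(matrix, E)
+ # amounts to an embedding lookup. A dense equivalent, for intuition only:
+ # dense = numpy.zeros((N, self.number_embeddings)); dense[numpy.arange(N), left] = 1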
+
+ def training_minibatch(self, batch_size):
+ # Sampling corrupted entities
+ def sample_matrix():
+ indptr = range(self.train_size+1)
+ col = numpy.random.randint(0, self.number_embeddings, size=self.train_size)
+ data = numpy.ones(self.train_size)
+ random_embeddings = scipy.sparse.csr_matrix((data, col, indptr), shape=(self.train_size, self.number_embeddings), dtype=theano.config.floatX)
+ return random_embeddings
+ corrupted_left = sample_matrix()
+ corrupted_right = sample_matrix()
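+ # One corrupted entity is sampled uniformly per training example; train()
+ # plugs these in as either the left or the right side of a negative
+ # triple, the usual contrastive sampling for a ranking criterion.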
+
+ # Shuffling training set
+ order = numpy.random.permutation(self.train_size)
+ train_left = self.train_left[order, :]
+ train_right = self.train_right[order, :]
+ train_relation = self.train_relation[order, :]
+
+ # Yielding batches
+ ls = numpy.linspace(0, self.train_size, 1+self.train_size/batch_size).astype(int)
+ for i in xrange(len(ls)-1):
+ left_positive = train_left[ls[i]:ls[i+1]]
+ right_positive = train_right[ls[i]:ls[i+1]]
+ left_negative = corrupted_left[ls[i]:ls[i+1]]
+ right_negative = corrupted_right[ls[i]:ls[i+1]]
+ relation = train_relation[ls[i]:ls[i+1]]
+ yield (relation, left_positive, right_positive, left_negative, right_negative)
+
+ def iterate(self, name, batch_size):
+ def repeat_csr(matrix, size):
+ data = list(matrix.data)*size
+ indices = list(matrix.indices)*size
+ indptr = range(size+1)
+ return scipy.sparse.csr_matrix((data, indices, indptr), shape=(size, matrix.shape[1]), dtype=theano.config.floatX)
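+ # repeat_csr tiles a single one-hot row (exactly one nonzero) batch_size
+ # times, so one (relation, left) pair can be scored against a whole
+ # batch of candidate right entities at once.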
+ N = getattr(self, name+'_size')
+ relation = getattr(self, name+'_relation')
+ left = getattr(self, name+'_left')
+ right = getattr(self, name+'_right')
+ for i in xrange(N):
+ yield (repeat_csr(relation[i], batch_size), repeat_csr(left[i], batch_size), right[i])
+
+ def universe_minibatch(self, batch_size):
+ N = len(self.embeddings)
+ entities = scipy.sparse.eye(N, format='csr', dtype=theano.config.floatX)
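+ # The identity matrix enumerates every entity as a one-hot row; note the
+ # loop assumes batch_size divides the number of entities evenly.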
+ for i in xrange(N/batch_size):
+ yield entities[i*batch_size:(i+1)*batch_size]
diff --git a/embeddings.py b/embeddings.py
@@ -26,10 +26,10 @@ class Embeddings(object):
E_bound = numpy.sqrt(6. / dimension)
E_values = rng.uniform(low=-E_bound, high=E_bound, size=(number, dimension))
- E_values = E_values / numpy.sqrt(numpy.sum(E_values **2, axis=1))
+ E_values = E_values / numpy.sqrt(numpy.sum(E_values **2, axis=1))[:, numpy.newaxis]
self.E = theano.shared(name=tag, value=numpy.asarray(E_values, dtype=theano.config.floatX))
- self.params = [E]
+ self.parameters = [self.E]
def embed(self, entities):
""" Embed given entities.
@@ -39,15 +39,7 @@ class Embeddings(object):
"""
return S.dot(entities, self.E)
- def L1_norm(self):
- """ Compute the L1-norm of the embeddings parameter. """
- return T.sum(T.abs(self.E))
-
- def sqrL2_norm(self):
- """ Compute the squared L2-norm of the embeddings parameter. """
- return T.sum(T.sqr(self.E))
-
- def sgd_updates(self, cost, learning_rate):
+ def updates(self, cost, learning_rate):
""" Compute the updates to perform a SGD step w.r.t. a given cost.
Keyword arguments:
diff --git a/main.py b/main.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python2
+
+from dataset import *
+from model import *
+from relations.translations import *
+
+if __name__ == '__main__':
+ hyperparameters = dict()
+ hyperparameters['similarity'] = L1_norm
+ hyperparameters['rng'] = numpy.random
+ hyperparameters['dimension'] = 20
+ hyperparameters['margin'] = 1.
+ hyperparameters['relation_learning_rate'] = 1
+ hyperparameters['embeddings_learning_rate'] = 0.1
+ hyperparameters['train_batch_size'] = 100
+ hyperparameters['test_batch_size'] = 500
+ hyperparameters['validation_frequency'] = 500
+ hyperparameters['number_epoch'] = 10000
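+ # similarity is the dissimilarity function defined in model.py (lower is
+ # better); margin is the hinge margin of the ranking criterion;
+ # validation runs every validation_frequency epochs (see Model.train).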
+
+ data = Dataset('data/dummy')
+ model = Model.initialise(Translations, data, hyperparameters, 'dummy')
+ model.train()
+ model.test()
diff --git a/model.py b/model.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python2
+
+import time
+import cPickle
+import sys
+import numpy
+import scipy
+import theano
+import theano.tensor as T
+import theano.sparse as S
+
+from embeddings import *
+
+def L1_norm(l, r):
+ return T.sum(abs(l-r), axis=1)
+
+def L2_norm(l, r):
+ return T.sqrt(T.sum(T.sqr(l-r), axis=1))
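+
+# Both return one distance per row, between the translated left-hand
+# entity and the right-hand entity: lower means a more plausible triple.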
+
+class Model(object):
+ """ Model class.
+
+ The model is trained using SGD with a contrastive criterion.
+ """
+
+ @classmethod
+ def initialise(cls, Relations, dataset, hyperparameters, tag):
+ """ Initialise a model.
+
+ Keyword arguments:
+ Relations -- relations class
+ dataset -- dataset on which the model will be trained and tested
+ hyperparameters -- hyperparameters dictionary
+ tag -- name of the embeddings for parameter declaration
+ """
+
+ print >>sys.stderr, '# Initialising model "{0}"'.format(tag)
+
+ self = cls()
+ self.embeddings = Embeddings(hyperparameters['rng'], dataset.number_embeddings, hyperparameters['dimension'], tag+'.embeddings')
+ self.relations = Relations(hyperparameters['rng'], dataset.number_relations, hyperparameters['dimension'], tag+'.relations')
+ self.dataset = dataset
+ self.hyperparameters = hyperparameters
+ self.tag = tag
+
+ self.build()
+ return self
+
+ @classmethod
+ def load(cls, filepath, dataset, hyperparameters):
+ """ Load a model from a file.
+
+ Keyword arguments:
+ filepath -- path to the Model file
+ dataset -- dataset on which the model will be trained and tested
+ hyperparameters -- hyperparameters dictionary
+ """
+
+ print >>sys.stderr, '# Loading model from "{0}"'.format(filepath)
+
+ self = cls()
+
+ with open(filepath, 'rb') as file:
+ self.embeddings = cPickle.load(file)
+ self.relations = cPickle.load(file)
+ self.dataset = dataset
+ self.hyperparameters = hyperparameters
+
+ self.build()
+ return self
+
+ def save(self, filepath):
+ """ Save the model in a file. """
+ with open(filepath, 'wb') as file:
+ cPickle.dump(self.embeddings, file, -1)
+ cPickle.dump(self.relations, file, -1)
+
+ def build(self):
+ """ Build theano functions. """
+ print >>sys.stderr, '## Compiling Theano graph for model "{0}"'.format(self.tag)
+
+ self.parameters = self.relations.parameters + self.embeddings.parameters
+ inputs = tuple(S.csr_matrix() for _ in xrange(5))
+ positive_left, positive_right = self.embeddings.embed(inputs[1]), self.embeddings.embed(inputs[2])
+ negative_left, negative_right = self.embeddings.embed(inputs[3]), self.embeddings.embed(inputs[4])
+ positive_score = self.hyperparameters['similarity'](self.relations.apply(positive_left, inputs[0]), positive_right)
+ negative_score = self.hyperparameters['similarity'](self.relations.apply(negative_left, inputs[0]), negative_right)
+ score = self.hyperparameters['margin'] + positive_score - negative_score
+ violating_margin = score > 0
+ criterion = T.mean(violating_margin*score)
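+ # Multiplying by the 0/1 mask violating_margin implements the margin
+ # ranking loss mean(max(0, margin + positive_score - negative_score))
+ # without an explicit maximum.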
+
+ self.train_function = theano.function(inputs=list(inputs), outputs=[criterion], updates=self.updates(criterion))
+ self.scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[positive_score])
+
+ def updates(self, cost):
+ """ Compute the updates to perform a SGD step w.r.t. a given cost.
+
+ Keyword arguments:
+ cost -- The cost to optimise.
+ """
+ lr_relations = self.hyperparameters['relation_learning_rate']
+ lr_embeddings = self.hyperparameters['embeddings_learning_rate']
+ return self.relations.updates(cost, lr_relations) + self.embeddings.updates(cost, lr_embeddings)
+
+ def train(self):
+ """ Train the model. """
+ print >>sys.stderr, '# Training the model "{0}"'.format(self.tag)
+
+ batch_size = self.hyperparameters['train_batch_size']
+ validation_frequency = self.hyperparameters['validation_frequency']
+ number_epoch = self.hyperparameters['number_epoch']
+
+ for epoch in xrange(number_epoch):
+ if epoch % validation_frequency == 0:
+ self.validate(epoch)
+
+ for (relation, left_positive, right_positive, left_negative, right_negative) in self.dataset.training_minibatch(batch_size):
+ c1 = self.train_function(relation, left_positive, right_positive, left_positive, right_negative) # corrupt the right entity only
+ c2 = self.train_function(relation, left_positive, right_positive, left_negative, right_positive) # corrupt the left entity only
+
+ def error(self, name):
+ """ Compute the mean rank and top 10 on a given data. """
+ batch_size = self.hyperparameters['test_batch_size']
+ count, mean, top10 = 0, 0, 0
+ for (relation, left, right) in self.dataset.iterate(name, batch_size):
+ scores = None
+ for entities in self.dataset.universe_minibatch(batch_size):
+ batch_result = self.scoring_function(relation, left, entities)
+ scores = numpy.array(batch_result, dtype=theano.config.floatX) if scores is None else numpy.concatenate((scores, batch_result), axis=1)
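+ # scores has shape (1, number_embeddings); the rank of the correct right
+ # entity is its position in the ascending sort of all candidate scores.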
+ rank = 1+numpy.where(numpy.argsort(scores)==right.indices[0])[1] # FIXME ugly
+ mean = mean + rank
+ count = count + 1
+ top10 = top10 + (rank<=10)
+ mean = float(mean) / count
+ top10 = float(top10) / count
+ return (mean, top10)
+
+ def validate(self, epoch):
+ """ Validate the model. """
+ print >>sys.stderr, 'Validation epoch {:<5}'.format(epoch),
+ (valid_mean, valid_top10) = self.error('valid')
+ (train_mean, train_top10) = self.error('train')
+ print >>sys.stderr, 'valid mean: {0:<15} valid top10: {1:<15} train mean: {2:<15} train top10: {3:<15}'.format(valid_mean, valid_top10, train_mean, train_top10)
+
+ def test(self):
+ """ Test the model. """
+ print >>sys.stderr, '# Testing the model "{0}"'.format(self.tag),
+ (mean, top10) = self.error('test')
+ print >>sys.stderr, ' mean: {0:<15} top10: {1:<15}'.format(mean, top10)
diff --git a/relations/__init__.py b/relations/__init__.py
diff --git a/relations/translation.py b/relations/translation.py
@@ -1,53 +0,0 @@
-#!/usr/bin/env python2
-
-import numpy
-import theano
-import theano.tensor as T
-import theano.sparse as S
-
-class Translations(object):
- """ Translations class.
-
- This class has one parameter:
- R -- the translations
- """
- def __init__(self, rng, number, dimension, tag):
- """ Initialise the parameter.
-
- Keyword arguments:
- rng -- module for random number generation
- number -- number of relation
- dimension -- dimension of the embeddings
- tag -- name of the relations for parameter declaration
- """
-
- self.number = number
- self.dimension = dimension
-
- R_bound = numpy.sqrt(6. / dimension)
- R_values = rng.uniform(low=-R_bound, high=R_bound, size=(number, dimension))
- R_values = R_values / numpy.sqrt(numpy.sum(R_values **2, axis=1))
- self.R = theano.shared(name=tag, value=numpy.asarray(R_values, dtype=theano.config.floatX))
-
- self.params = [R]
-
- def L1_norm(self):
- """ Compute the L1-norm of the relations parameter. """
- return T.sum(T.abs(self.R))
-
- def sqrL2_norm(self):
- """ Compute the squared L2-norm of the relations parameter. """
- return T.sum(T.sqr(self.R))
-
- def apply(self, input, relations):
- """ Apply the given relations to a given input. """
- return S.dot(relations, self.R)+inputs
-
- def sgd_updates(self, cost, learning_rate):
- """ Compute the updates to perform a SGD step w.r.t. a given cost.
-
- Keyword arguments:
- cost -- The cost to optimise.
- learning_rate -- The learning rate used for gradient descent.
- """
- return [(self.R, self.R - learning_rate * T.grad(cost=cost, wrt=self.R))]
diff --git a/relations/translations.py b/relations/translations.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python2
+
+import numpy
+import theano
+import theano.tensor as T
+import theano.sparse as S
+
+class Translations(object):
+ """ Translations class.
+
+ This class has one parameter:
+ R -- the translations
+ """
+ def __init__(self, rng, number, dimension, tag):
+ """ Initialise the parameter.
+
+ Keyword arguments:
+ rng -- module for random number generation
+ number -- number of relations
+ dimension -- dimension of the embeddings
+ tag -- name of the relations for parameter declaration
+ """
+
+ self.number = number
+ self.dimension = dimension
+
+ R_bound = numpy.sqrt(6. / dimension)
+ R_values = rng.uniform(low=-R_bound, high=R_bound, size=(number, dimension))
+ R_values = R_values / numpy.sqrt(numpy.sum(R_values **2, axis=1))[:, numpy.newaxis]
+ self.R = theano.shared(name=tag, value=numpy.asarray(R_values, dtype=theano.config.floatX))
+
+ self.parameters = [self.R]
+
+ def apply(self, inputs, relations):
+ """ Apply the given relations to a given input. """
+ return S.dot(relations, self.R)+inputs
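+ # relations is one-hot, so S.dot(relations, self.R) selects one
+ # translation vector per example; a triple is scored by how close
+ # left + translation lands to right, in the style of TransE.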
+
+ def updates(self, cost, learning_rate):
+ """ Compute the updates to perform a SGD step w.r.t. a given cost.
+
+ Keyword arguments:
+ cost -- The cost to optimise.
+ learning_rate -- The learning rate used for gradient descent.
+ """
+ return [(self.R, self.R - learning_rate * T.grad(cost=cost, wrt=self.R))]
diff --git a/utils/__init__.py b/utils/__init__.py
diff --git a/utils/construct_dummy_dataset.py b/utils/construct_dummy_dataset.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python2
+
+import sys
+import os
+import shutil
+import random
+
+def construct_dummy_dataset(kind, prefix, n_embeddings, n_relations):
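+ """ Write a toy dataset under prefix: kind 'id' relates every entity to
+ itself, while 'halfperm' maps each entity of the first half onto a
+ random permutation of the second half. """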
+ os.mkdir(prefix)
+
+ with open(prefix+'/embeddings', 'w') as file:
+ for i in xrange(n_embeddings):
+ file.write('E{0}\n'.format(i))
+
+ with open(prefix+'/relations', 'w') as file:
+ for i in xrange(n_relations):
+ file.write('R{0}\n'.format(i))
+
+ with open(prefix+'/train', 'w') as file:
+ for r in xrange(n_relations):
+ right = range(n_embeddings/2)
+ random.shuffle(right)
+ if kind=='id':
+ for e in xrange(n_embeddings):
+ file.write('{0}\t{1}\t{2}\n'.format(e, r, e))
+ elif kind=='halfperm':
+ for e in xrange(n_embeddings/2):
+ file.write('{0}\t{1}\t{2}\n'.format(e, r, right[e]+n_embeddings/2))
+ else:
+ raise ValueError('Unknown kind: {0}'.format(kind))
+
+ shutil.copyfile(prefix+'/train', prefix+'/valid')
+ shutil.copyfile(prefix+'/train', prefix+'/test')
+
+if __name__ == '__main__':
+ if len(sys.argv)<5:
+ print >>sys.stderr, 'Usage: {0} {{id, halfperm}} dataset_name n_embeddings n_relations'.format(sys.argv[0])
+ sys.exit(1)
+ kind = sys.argv[1]
+ prefix = sys.argv[2]
+
+ n_embeddings = int(sys.argv[3])
+ n_relations = int(sys.argv[4])
+
+ construct_dummy_dataset(kind, prefix, n_embeddings, n_relations)