transform

old TransE-like models
git clone https://esimon.eu/repos/transform.git

commit 9aa890972b341519afb1339f636d968944f86ecf
parent 194510b0a3c0718ad8137de758e2646d5a4a93e7
Author: Étienne Simon <esimon@esimon.eu>
Date:   Wed, 16 Apr 2014 13:09:36 +0200

Add Model

Diffstat:
A dataset.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M embeddings.py | 14 +++-----------
A main.py | 23 +++++++++++++++++++++++
A model.py | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A relations/__init__.py | 0
D relations/translation.py | 53 -----------------------------------------------------
A relations/translations.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
A utils/__init__.py | 0
A utils/construct_dummy_dataset.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
9 files changed, 340 insertions(+), 64 deletions(-)
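
For orientation: this commit implements a TransE-style translation model. A triple (left, relation, right) is scored by the dissimilarity between the translated left entity and the right entity, and training minimises a margin-based ranking criterion against corrupted triples. A minimal numpy sketch of the L1 scoring rule that main.py selects by default (illustration only, not part of the commit):

    import numpy

    def l1_score(e_left, r, e_right):
        # TransE-style dissimilarity: small when e_left + r lands near e_right.
        return numpy.sum(numpy.abs(e_left + r - e_right), axis=-1)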

diff --git a/dataset.py b/dataset.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python2
+
+import scipy
+import numpy
+import sys
+import theano
+
+class Dataset(object):
+    def __init__(self, prefix):
+        print >>sys.stderr, '# Loading dataset "{0}"'.format(prefix)
+        with open(prefix+'/embeddings', 'r') as file:
+            self.embeddings = file.readlines()
+        with open(prefix+'/relations', 'r') as file:
+            self.relations = file.readlines()
+        self.number_embeddings = len(self.embeddings)
+        self.number_relations = len(self.relations)
+        self.load_file(prefix, 'train')
+        self.load_file(prefix, 'valid')
+        self.load_file(prefix, 'test')
+
+    def load_file(self, prefix, name):
+        with open('{0}/{1}'.format(prefix, name), 'r') as file:
+            content = map(lambda line: map(int, line.split('\t')), file.readlines())
+        [left, relation, right] = map(list, zip(*content))
+        N = len(relation)
+        setattr(self, name+'_size', N)
+        setattr(self, name+'_right', scipy.sparse.csr_matrix(([1]*N, right, range(N+1)), shape=(N, self.number_embeddings), dtype=theano.config.floatX))
+        setattr(self, name+'_relation', scipy.sparse.csr_matrix(([1]*N, relation, range(N+1)), shape=(N, self.number_relations), dtype=theano.config.floatX))
+        setattr(self, name+'_left', scipy.sparse.csr_matrix(([1]*N, left, range(N+1)), shape=(N, self.number_embeddings), dtype=theano.config.floatX))
+
+    def training_minibatch(self, batch_size):
+        # Sampling corrupted entities
+        def sample_matrix():
+            row = range(self.train_size+1)
+            col = numpy.random.randint(0, self.number_embeddings, size=self.train_size)
+            data = numpy.ones(self.train_size)
+            random_embeddings = scipy.sparse.csr_matrix((data, col, row), shape=(self.train_size, self.number_embeddings), dtype=theano.config.floatX)
+            return random_embeddings
+        corrupted_left = sample_matrix()
+        corrupted_right = sample_matrix()
+
+        # Shuffling training set
+        order = numpy.random.permutation(self.train_size)
+        train_left = self.train_left[order, :]
+        train_right = self.train_right[order, :]
+        train_relation = self.train_relation[order, :]
+
+        # Yielding batches
+        ls = numpy.linspace(0, self.train_size, 1+self.train_size/batch_size)
+        for i in xrange(len(ls)-1):
+            left_positive = train_left[ls[i]:ls[i+1]]
+            right_positive = train_right[ls[i]:ls[i+1]]
+            left_negative = corrupted_left[ls[i]:ls[i+1]]
+            right_negative = corrupted_right[ls[i]:ls[i+1]]
+            relation = train_relation[ls[i]:ls[i+1]]
+            yield (relation, left_positive, right_positive, left_negative, right_negative)
+
+    def iterate(self, name, batch_size):
+        def repeat_csr(matrix, size):
+            data = list(matrix.data)*size
+            indices = list(matrix.indices)*size
+            indptr = range(size+1)
+            return scipy.sparse.csr_matrix((data, indices, indptr), shape=(size, matrix.shape[1]), dtype=theano.config.floatX)
+        N = getattr(self, name+'_size')
+        relation = getattr(self, name+'_relation')
+        left = getattr(self, name+'_left')
+        right = getattr(self, name+'_right')
+        for i in xrange(N):
+            yield (repeat_csr(relation[i], batch_size), repeat_csr(left[i], batch_size), right[i])
+
+    def universe_minibatch(self, batch_size):
+        N = len(self.embeddings)
+        entities = scipy.sparse.eye(N, format='csr', dtype=theano.config.floatX)
+        for i in xrange(N/batch_size):
+            yield entities[i*batch_size:(i+1)*batch_size]
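
dataset.py represents each side of the triples as a sparse matrix with one one-hot row per example, so embedding lookup reduces to a sparse-dense dot product. A small scipy sketch of the same CSR construction, with hypothetical indices (not part of the commit):

    import scipy.sparse

    left = [2, 0, 1]  # hypothetical entity ids for the left side of 3 triples
    N = len(left)
    # One entry of 1.0 per row: row i is the one-hot vector of entity left[i].
    onehot = scipy.sparse.csr_matrix(([1.0] * N, left, list(range(N + 1))), shape=(N, 5))
    print(onehot.toarray())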
diff --git a/embeddings.py b/embeddings.py
@@ -26,10 +26,10 @@ class Embeddings(object):
         E_bound = numpy.sqrt(6. / dimension)
         E_values = rng.uniform(low=-E_bound, high=E_bound, size=(number, dimension))
-        E_values = E_values / numpy.sqrt(numpy.sum(E_values **2, axis=1))
+        E_values = E_values / numpy.sqrt(numpy.sum(E_values **2, axis=1))[:, numpy.newaxis]
         self.E = theano.shared(name=tag, value=numpy.asarray(E_values, dtype=theano.config.floatX))
 
-        self.params = [E]
+        self.parameters = [self.E]
 
     def embed(self, entities):
         """ Embed given entities.
@@ -39,15 +39,7 @@ class Embeddings(object):
         """
         return S.dot(entities, self.E)
 
-    def L1_norm(self):
-        """ Compute the L1-norm of the embeddings parameter. """
-        return T.sum(T.abs(self.E))
-
-    def sqrL2_norm(self):
-        """ Compute the squared L2-norm of the embeddings parameter. """
-        return T.sum(T.sqr(self.E))
-
-    def sgd_updates(self, cost, learning_rate):
+    def updates(self, cost, learning_rate):
         """ Compute the updates to perform a SGD step w.r.t. a given cost.
 
         Keyword arguments:
diff --git a/main.py b/main.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python2
+
+from dataset import *
+from model import *
+from relations.translations import *
+
+if __name__ == '__main__':
+    hyperparameters = dict()
+    hyperparameters['similarity'] = L1_norm
+    hyperparameters['rng'] = numpy.random
+    hyperparameters['dimension'] = 20
+    hyperparameters['margin'] = 1.
+    hyperparameters['relation_learning_rate'] = 1
+    hyperparameters['embeddings_learning_rate'] = 0.1
+    hyperparameters['train_batch_size'] = 100
+    hyperparameters['test_batch_size'] = 500
+    hyperparameters['validation_frequency'] = 500
+    hyperparameters['number_epoch'] = 10000
+
+    data = Dataset('data/dummy')
+    model = Model.initialise(Translations, data, hyperparameters, 'dummy')
+    model.train()
+    model.test()
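
The embeddings.py fix matters because numpy.sum(E_values ** 2, axis=1) has shape (number,): dividing a (number, dimension) matrix by it either raises a broadcasting error or, when number == dimension, silently normalises the wrong axis. A quick demonstration of the corrected row normalisation (illustration only):

    import numpy

    E = numpy.random.rand(4, 20)                    # 4 embeddings of dimension 20
    norms = numpy.sqrt(numpy.sum(E ** 2, axis=1))   # shape (4,)
    E_normalised = E / norms[:, numpy.newaxis]      # (4, 1) broadcasts across rows
    print(numpy.linalg.norm(E_normalised, axis=1))  # all ones, as intended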
diff --git a/model.py b/model.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python2
+
+import time
+
+import sys
+import numpy
+import scipy
+import theano
+import theano.tensor as T
+import theano.sparse as S
+
+from embeddings import *
+
+def L1_norm(l, r):
+    return T.sum(abs(l-r), axis=1)
+
+def L2_norm(l, r):
+    return T.sqrt(T.sum(T.sqr(l-r), axis=1))
+
+class Model(object):
+    """ Model class.
+
+    Training model using SGD with a contrastive criterion.
+    """
+
+    @classmethod
+    def initialise(cls, Relations, dataset, hyperparameters, tag):
+        """ Initialise a model.
+
+        Keyword arguments:
+        Relations -- relations class
+        dataset -- dataset on which the model will be trained and tested
+        hyperparameters -- hyperparameters dictionary
+        tag -- name of the embeddings for parameter declaration
+        """
+
+        print >>sys.stderr, '# Initialising model "{0}"'.format(tag)
+
+        self = cls()
+        self.embeddings = Embeddings(hyperparameters['rng'], dataset.number_embeddings, hyperparameters['dimension'], tag+'.embeddings')
+        self.relations = Relations(hyperparameters['rng'], dataset.number_relations, hyperparameters['dimension'], tag+'.relations')
+        self.dataset = dataset
+        self.hyperparameters = hyperparameters
+        self.tag = tag
+
+        self.build()
+        return self
+
+    @classmethod
+    def load(cls, filepath, dataset, hyperparameters):
+        """ Load a model from a file.
+
+        Keyword arguments:
+        filepath -- path to the Model file
+        dataset -- dataset on which the model will be trained and tested
+        hyperparameters -- hyperparameters dictionary
+        """
+
+        print >>sys.stderr, '# Loading model from "{0}"'.format(filepath)
+
+        self = cls()
+
+        with open(filepath, 'rb') as file:
+            self.embeddings = cPickle.load(file)
+            self.relations = cPickle.load(file)
+        self.dataset = dataset;
+        self.hyperparameters = hyperparameters;
+
+        self.build()
+        return self
+
+    def save(self, filepath):
+        """ Save the model in a file. """
+        with open(filepath, 'wb') as file:
+            cPickle.dump(self.embeddings, file, -1)
+            cPickle.dump(self.relations, file, -1)
+
+    def build(self):
+        """ Build theano functions. """
+        print >>sys.stderr, '## Compiling Theano graph for model "{0}"'.format(self.tag)
+
+        self.parameters = self.relations.parameters + self.embeddings.parameters
+        inputs = tuple(S.csr_matrix() for _ in xrange(5))
+        positive_left, positive_right = self.embeddings.embed(inputs[1]), self.embeddings.embed(inputs[2])
+        negative_left, negative_right = self.embeddings.embed(inputs[3]), self.embeddings.embed(inputs[4])
+        positive_score = self.hyperparameters['similarity'](self.relations.apply(positive_left, inputs[0]), positive_right)
+        negative_score = self.hyperparameters['similarity'](self.relations.apply(negative_left, inputs[0]), negative_right)
+        score = self.hyperparameters['margin'] + positive_score - negative_score
+        violating_margin = score>0
+        criterion = T.mean(violating_margin*score)
+
+        self.train_function = theano.function(inputs=list(inputs), outputs=[criterion], updates=self.updates(criterion))
+        self.scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[positive_score])
+
+    def updates(self, cost):
+        """ Compute the updates to perform a SGD step w.r.t. a given cost.
+
+        Keyword arguments:
+        cost -- The cost to optimise.
+        """
+        lr_relations = self.hyperparameters['relation_learning_rate']
+        lr_embeddings = self.hyperparameters['embeddings_learning_rate']
+        return self.relations.updates(cost, lr_relations) + self.embeddings.updates(cost, lr_embeddings)
+
+    def train(self):
+        """ Train the model. """
+        print >>sys.stderr, '# Training the model "{0}"'.format(self.tag)
+
+        batch_size = self.hyperparameters['train_batch_size']
+        validation_frequency = self.hyperparameters['validation_frequency']
+        number_epoch = self.hyperparameters['number_epoch']
+
+        for epoch in xrange(number_epoch):
+            if epoch % validation_frequency == 0:
+                self.validate(epoch)
+
+            for (relation, left_positive, right_positive, left_negative, right_negative) in self.dataset.training_minibatch(batch_size):
+                c1=self.train_function(relation, left_positive, right_positive, left_positive, right_negative)
+                c2=self.train_function(relation, left_positive, right_positive, left_negative, right_positive)
+
+    def error(self, name):
+        """ Compute the mean rank and top 10 on a given data. """
+        batch_size = self.hyperparameters['test_batch_size']
+        count, mean, top10 = 0, 0, 0
+        for (relation, left, right) in self.dataset.iterate(name, batch_size):
+            scores = None
+            for entities in self.dataset.universe_minibatch(batch_size):
+                batch_result = self.scoring_function(relation, left, entities)
+                scores = numpy.array(batch_result, dtype=theano.config.floatX) if scores is None else numpy.concatenate((scores, batch_result), axis=1)
+            rank = 1+numpy.where(numpy.argsort(scores)==right.indices[0])[1] # FIXME ugly
+            mean = mean + rank
+            count = count + 1
+            top10 = top10 + (rank<=10)
+        mean = float(mean) / count
+        top10 = float(top10) / count
+        return (mean, top10)
+
+    def validate(self, epoch):
+        """ Validate the model. """
+        print >>sys.stderr, 'Validation epoch {:<5}'.format(epoch),
+        (valid_mean, valid_top10) = self.error('valid')
+        (train_mean, train_top10) = self.error('train')
+        print >>sys.stderr, 'valid mean: {0:<15} valid top10: {1:<15} train mean: {0:<15} train top10: {1:<15}'.format(valid_mean, valid_top10, train_mean, train_top10)
+
+    def test(self):
+        """ Test the model. """
+        print >>sys.stderr, '# Testing the model "{0}"'.format(self.tag),
+        (mean, top10) = self.error('test')
+        print >>sys.stderr, ' mean: {0:<15} top10: {1:<15}'.format(mean, top10)
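
build() wires up the contrastive criterion: a corrupted triple only contributes to the cost while it fails to score at least the margin worse than the observed one. The same computation in plain numpy (illustration only, not part of the commit):

    import numpy

    def margin_cost(positive_score, negative_score, margin=1.0):
        # Hinge on the score gap: zero once the corrupted triple is at least
        # `margin` further away than the observed one.
        gap = margin + positive_score - negative_score
        return numpy.mean((gap > 0) * gap)

Two latent issues in model.py worth noting: load() and save() call cPickle without importing it, and the validation format string reuses indices {0} and {1}, so the train metrics print as a copy of the validation ones.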
""" + batch_size = self.hyperparameters['test_batch_size'] + count, mean, top10 = 0, 0, 0 + for (relation, left, right) in self.dataset.iterate(name, batch_size): + scores = None + for entities in self.dataset.universe_minibatch(batch_size): + batch_result = self.scoring_function(relation, left, entities) + scores = numpy.array(batch_result, dtype=theano.config.floatX) if scores is None else numpy.concatenate((scores, batch_result), axis=1) + rank = 1+numpy.where(numpy.argsort(scores)==right.indices[0])[1] # FIXME ugly + mean = mean + rank + count = count + 1 + top10 = top10 + (rank<=10) + mean = float(mean) / count + top10 = float(top10) / count + return (mean, top10) + + def validate(self, epoch): + """ Validate the model. """ + print >>sys.stderr, 'Validation epoch {:<5}'.format(epoch), + (valid_mean, valid_top10) = self.error('valid') + (train_mean, train_top10) = self.error('train') + print >>sys.stderr, 'valid mean: {0:<15} valid top10: {1:<15} train mean: {0:<15} train top10: {1:<15}'.format(valid_mean, valid_top10, train_mean, train_top10) + + def test(self): + """ Test the model. """ + print >>sys.stderr, '# Testing the model "{0}"'.format(self.tag), + (mean, top10) = self.error('test') + print >>sys.stderr, ' mean: {0:<15} top10: {1:<15}'.format(mean, top10) diff --git a/relations/__init__.py b/relations/__init__.py diff --git a/relations/translation.py b/relations/translation.py @@ -1,53 +0,0 @@ -#!/usr/bin/env python2 - -import numpy -import theano -import theano.tensor as T -import theano.sparse as S - -class Translations(object): - """ Translations class. - - This class has one parameter: - R -- the translations - """ - def __init__(self, rng, number, dimension, tag): - """ Initialise the parameter. - - Keyword arguments: - rng -- module for random number generation - number -- number of relation - dimension -- dimension of the embeddings - tag -- name of the relations for parameter declaration - """ - - self.number = number - self.dimension = dimension - - R_bound = numpy.sqrt(6. / dimension) - R_values = rng.uniform(low=-R_bound, high=R_bound, size=(number, dimension)) - R_values = R_values / numpy.sqrt(numpy.sum(R_values **2, axis=1)) - self.R = theano.shared(name=tag, value=numpy.asarray(R_values, dtype=theano.config.floatX)) - - self.params = [R] - - def L1_norm(self): - """ Compute the L1-norm of the relations parameter. """ - return T.sum(T.abs(self.R)) - - def sqrL2_norm(self): - """ Compute the squared L2-norm of the relations parameter. """ - return T.sum(T.sqr(self.R)) - - def apply(self, input, relations): - """ Apply the given relations to a given input. """ - return S.dot(relations, self.R)+inputs - - def sgd_updates(self, cost, learning_rate): - """ Compute the updates to perform a SGD step w.r.t. a given cost. - - Keyword arguments: - cost -- The cost to optimise. - learning_rate -- The learning rate used for gradient descent. - """ - return [(self.R, self.R - learning_rate * T.grad(cost=cost, wrt=self.R))] diff --git a/relations/translations.py b/relations/translations.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python2 + +import numpy +import theano +import theano.tensor as T +import theano.sparse as S + +class Translations(object): + """ Translations class. + + This class has one parameter: + R -- the translations + """ + def __init__(self, rng, number, dimension, tag): + """ Initialise the parameter. 
diff --git a/utils/__init__.py b/utils/__init__.py
diff --git a/utils/construct_dummy_dataset.py b/utils/construct_dummy_dataset.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python2
+
+import sys
+import os
+import shutil
+import random
+
+def construct_dummy_dataset(kind, prefix, n_embeddings, n_relations):
+    os.mkdir(prefix)
+
+    with open(prefix+'/embeddings', 'w') as file:
+        for i in xrange(n_embeddings):
+            file.write('E{0}\n'.format(i))
+
+    with open(prefix+'/relations', 'w') as file:
+        for i in xrange(n_relations):
+            file.write('R{0}\n'.format(i))
+
+    with open(prefix+'/train', 'w') as file:
+        for r in xrange(n_relations):
+            right = range(n_embeddings/2)
+            random.shuffle(right)
+            if kind=='id':
+                for e in xrange(n_embeddings):
+                    file.write('{0}\t{1}\t{2}\n'.format(e, r, e))
+            elif kind=='halfperm':
+                for e in xrange(n_embeddings/2):
+                    file.write('{0}\t{1}\t{2}\n'.format(e, r, right[e]+n_embeddings/2))
+            else:
+                raise error('Unknown kind')
+
+    shutil.copyfile(prefix+'/train', prefix+'/valid')
+    shutil.copyfile(prefix+'/train', prefix+'/test')
+
+if __name__ == '__main__':
+    if len(sys.argv)<5:
+        print >>sys.stderr, 'Usage: {0} {{id, halfperm}} dataset_name n_embeddings n_relations'.format(sys.argv[0])
+        sys.exit(1)
+    kind = sys.argv[1]
+    prefix = sys.argv[2]
+
+    n_embeddings = int(sys.argv[3])
+    n_relations = int(sys.argv[4])
+
+    construct_dummy_dataset(kind, prefix, n_embeddings, n_relations)
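
End to end, the usage string of construct_dummy_dataset.py and the hard-coded Dataset('data/dummy') in main.py suggest an invocation along these lines (the entity and relation counts are arbitrary, and the data/ directory must already exist since os.mkdir only creates the final path component):

    python2 utils/construct_dummy_dataset.py halfperm data/dummy 50 2
    python2 main.py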