transform

old TransE-like models
git clone https://esimon.eu/repos/transform.git

commit c61b71b63396648f490d9cb10e31de2bcdba601f
parent 682d2a64915e11eeaac999306c17f9d12f9fb22a
Author: Étienne Simon <esimon@esimon.eu>
Date:   Wed, 30 Apr 2014 15:44:54 +0200

Add meta-model and train/test executables
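
The new config.py reads plain JSON configs: load_config() evaluates any string
value prefixed with 'python:' as a Python expression (so keys such as 'rng',
'relations' or 'similarity' can name Python objects), and expand_config()
splits a meta-model config into one sub-configuration per model, numbering the
'model name' and taking the index-th element of every list-valued key. A rough
sketch of the expansion, with a made-up dictionary (not a configuration shipped
with this commit):

    >>> from config import expand_config
    >>> base = {'size': 2, 'model name': 'demo', 'dimension': [20, 50]}
    >>> [(c['model name'], c['dimension']) for c in expand_config(base)]
    [('demo 0', 20), ('demo 1', 50)]

Training and testing are now separate entry points, e.g. "./train.py <data>
<config> [<model>]" and "./test.py <data> <config> [<models>...]" (see the
Usage strings below).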

Diffstat:
A config.py | 25 +++++++++++++++++++++++++
M dataset.py | 7 ++++---
D main.py | 32 --------------------------------
A meta_model.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
M model.py | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------------------
A test.py | 33 +++++++++++++++++++++++++++++++++
A train.py | 26 ++++++++++++++++++++++++++
7 files changed, 203 insertions(+), 106 deletions(-)

diff --git a/config.py b/config.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python2
+
+import numpy
+import json
+from model import *
+from relations import *
+
+def load_config(path):
+    with open(path, 'r') as config_file:
+        config = json.load(config_file)
+    for k, v in config.iteritems():
+        if isinstance(v, basestring) and v.startswith('python:'):
+            config[k] = eval(v[7:])
+    return config
+
+def expand_config(base_config):
+    size = base_config['size']
+    configs = [ base_config.copy() for _ in xrange(size) ]
+    for (config, index) in zip(configs, xrange(size)):
+        if not isinstance(config['model name'], list):
+            config['model name'] = ('{0} {1:0{width}}').format(config['model name'], index, width=len(str(size)))
+        for k, v in config.iteritems():
+            if isinstance(v, list):
+                config[k] = v[index]
+    return configs
diff --git a/dataset.py b/dataset.py
@@ -6,7 +6,8 @@ import numpy
 import theano

 class Dataset(object):
-    def __init__(self, prefix):
+    def __init__(self, prefix, rng):
+        self.rng = rng
         log('# Loading dataset "{0}"\n'.format(prefix))
         with open(prefix+'/embeddings', 'r') as file:
             self.embeddings = file.readlines()
@@ -33,7 +34,7 @@ class Dataset(object):
         # Sampling corrupted entities
         def sample_matrix():
             row = range(self.train_size+1)
-            col = numpy.random.randint(0, self.number_embeddings, size=self.train_size)
+            col = self.rng.randint(0, self.number_embeddings, size=self.train_size)
             data = numpy.ones(self.train_size)
             random_embeddings = scipy.sparse.csr_matrix((data, col, row), shape=(self.train_size, self.number_embeddings), dtype=theano.config.floatX)
             return random_embeddings
@@ -41,7 +42,7 @@ class Dataset(object):
         corrupted_right = sample_matrix()

         # Shuffling training set
-        order = numpy.random.permutation(self.train_size)
+        order = self.rng.permutation(self.train_size)
         train_left = self.train_left[order, :]
         train_right = self.train_right[order, :]
         train_relation = self.train_relation[order, :]
diff --git a/main.py b/main.py
@@ -1,32 +0,0 @@
-#!/usr/bin/env python2
-
-from __future__ import print_function
-from utils.log import *
-import sys
-import json
-
-from dataset import *
-from model import *
-from relations import *
-
-if __name__ == '__main__':
-    if len(sys.argv)<3:
-        print('Usage: {0} data config [model]'.format(sys.argv[0]), file=sys.stderr)
-        sys.exit(1)
-    data = sys.argv[1]
-    config_path = sys.argv[2]
-    model_path = None if len(sys.argv)<4 else sys.argv[3]
-
-    with open(config_path, 'r') as config_file:
-        config = json.load(config_file)
-    for k, v in config.iteritems():
-        if isinstance(v, basestring) and v.startswith('python:'):
-            config[k] = eval(v[7:])
-
-    data = Dataset(data)
-    if model_path is None:
-        model = Model.initialise(config['relations'], data, config, config['model name'])
-    else:
-        model = Model.load(model_path, data, config, config['model name'])
-    model.train()
-    model.test()
diff --git a/meta_model.py b/meta_model.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python2
+
+from utils.log import *
+from config import *
+from model import *
+import numpy
+
+class Meta_model(object):
+    """ Meta-model class. """
+
+    def __init__(self, dataset, config, pathes=None):
+        self.dataset = dataset
+        self.combine_scores = config['scores combinator']
+        configs = expand_config(config)
+        if pathes is None:
+            pathes = [ '{0}/{1}.best'.format(config['best model save path'], config['model name']) for config in configs ]
+        self.models = [ Model(dataset, config, path) for config, path in zip(configs, pathes) ]
+
+    def build_test(self):
+        for model in self.models:
+            model.build_test()
+
+    def left_scoring_function(self, relation, left, right):
+        res = [ model.left_scoring_function(relation, left, right) for model in self.models ]
+        return numpy.transpose(res).reshape(right.shape[0], len(self.models))
+
+    def right_scoring_function(self, relation, left, right):
+        res = [ model.right_scoring_function(relation, left, right) for model in self.models ]
+        return numpy.transpose(res).reshape(left.shape[0], len(self.models))
+
+    def error(self):
+        """ Compute the mean rank, standard deviation and top 10 on a given data. """
+        result = []
+        for (relation, left, right) in self.dataset.iterate('test'):
+            entities = self.dataset.universe
+            raw_left_scores = self.left_scoring_function(relation, left, entities)
+            raw_right_scores = self.right_scoring_function(relation, entities, right)
+            left_scores = self.combine_scores(raw_left_scores)
+            right_scores = self.combine_scores(raw_right_scores)
+            left_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(left_scores)==right.indices[0])[0]) # FIXME Ugly
+            right_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(right_scores)==left.indices[0])[0]) # FIXME Ugly
+            result.extend((left_rank, right_rank))
+        mean = numpy.mean(result)
+        std = numpy.std(result)
+        top10 = numpy.mean(map(lambda x: x<=10, result))
+        return (mean, std, top10)
+
+    def test(self):
+        """ Test the model. """
+        log('# Testing the model')
+        (mean, std, top10) = self.error()
+        log(' mean: {0:<15} std: {1:<15} top10: {2:<15}\n'.format(mean, std, top10))
diff --git a/model.py b/model.py
@@ -27,50 +27,29 @@ class Model(object):
     Training model using SGD with a contrastive criterion.
     """

-    @classmethod
-    def initialise(cls, Relations, dataset, config, tag):
+    def __init__(self, dataset, config, filepath=None):
         """ Initialise a model.

         Keyword arguments:
-        Relations -- relations class
         dataset -- dataset on which the model will be trained and tested
         config -- config dictionary
-        tag -- name of the embeddings for parameter declaration
+        filepath -- path to the Model file
         """
-        log('# Initialising model "{0}"\n'.format(tag))
-        self = cls()
-        self.embeddings = Embeddings(config['rng'], dataset.number_embeddings, config['dimension'], tag+'.embeddings')
-        self.relations = Relations(config['rng'], dataset.number_relations, config['dimension'], tag+'.relations')
+        log('# Initialising model "{0}"\n'.format(config['model name']))

         self.dataset = dataset
         self.config = config
-        self.tag = tag
-
-        self.build()
-        return self
-
-    @classmethod
-    def load(cls, filepath, dataset, config, tag):
-        """ Load a model from a file.
-
-        Keyword arguments:
-        filepath -- path to the Model file
-        dataset -- dataset on which the model will be trained and tested
-        config -- config dictionary
-        tag -- name of the embeddings for parameter declaration
-        """
-        log('# Loading model from "{0}"\n'.format(filepath))
-
-        self = cls()
-        with open(filepath, 'rb') as file:
-            self.embeddings = cPickle.load(file)
-            self.relations = cPickle.load(file)
-        self.dataset = dataset;
-        self.config = config;
-        self.tag = tag
-
-        self.build()
-        return self
+        self.tag = config['model name']
+
+        if filepath is None:
+            Relations = config['relations']
+            self.embeddings = Embeddings(config['rng'], dataset.number_embeddings, config['dimension'], self.tag+'.embeddings')
+            self.relations = Relations(config['rng'], dataset.number_relations, config['dimension'], self.tag+'.relations')
+        else:
+            log('## Loading model from "{0}"\n'.format(filepath))
+            with open(filepath, 'rb') as file:
+                self.embeddings = cPickle.load(file)
+                self.relations = cPickle.load(file)

     def save(self, filepath):
         """ Save the model in a file. """
@@ -78,15 +57,18 @@ class Model(object):
             cPickle.dump(self.embeddings, file, -1)
             cPickle.dump(self.relations, file, -1)

-    def build(self):
-        """ Build theano functions. """
-        log('## Compiling Theano graph for model "{0}"\n'.format(self.tag))
-
-        self.parameters = self.relations.parameters + self.embeddings.parameters
-        inputs = tuple(S.csr_matrix() for _ in xrange(5))
-        left_positive, right_positive = self.embeddings.embed(inputs[1]), self.embeddings.embed(inputs[2])
-        left_negative, right_negative = self.embeddings.embed(inputs[3]), self.embeddings.embed(inputs[4])
-        relation = self.relations.lookup(inputs[0])
+    def build_train(self):
+        """ Build theano train functions. """
+        log('## Compiling Theano graph for training model "{0}"\n'.format(self.tag))
+        input_relation = S.csr_matrix("relation")
+        input_left_positive = S.csr_matrix("left positive")
+        input_right_positive = S.csr_matrix("right positive")
+        input_left_negative = S.csr_matrix("left negative")
+        input_right_negative = S.csr_matrix("right negative")
+        inputs = [ input_relation, input_left_positive, input_right_positive, input_left_negative, input_right_negative ]
+        left_positive, right_positive = self.embeddings.embed(input_left_positive), self.embeddings.embed(input_right_positive)
+        left_negative, right_negative = self.embeddings.embed(input_left_negative), self.embeddings.embed(input_right_negative)
+        relation = self.relations.lookup(input_relation)

         score_positive = self.config['similarity'](self.relations.transform(left_positive, relation), right_positive)
         score_left_negative = self.config['similarity'](self.relations.transform(left_negative, relation), right_positive)
@@ -100,17 +82,27 @@ class Model(object):
         criterion_right = T.mean(violating_margin_right*score_right)
         criterion = criterion_left + criterion_right

-        self.train_function = theano.function(inputs=list(inputs), outputs=[criterion], updates=self.updates(criterion))
+        self.train_function = theano.function(inputs=inputs, outputs=[criterion], updates=self.updates(criterion))
         self.normalise_function = theano.function(inputs=[], outputs=[], updates=self.embeddings.normalise_updates())

+    def build_test(self):
+        """ Build theano test functions. """
+        log('## Compiling Theano graph for testing model "{0}"\n'.format(self.tag))
+        input_relation = S.csr_matrix("relation")
+        input_left = S.csr_matrix("left")
+        input_right = S.csr_matrix("right")
+        inputs = [ input_relation, input_left, input_right ]
+        left, right = self.embeddings.embed(input_left), self.embeddings.embed(input_right)
+        relation = self.relations.lookup(input_relation)
+        relation = map(lambda r: T.addbroadcast(r, 0), relation)

-        left_broadcasted = T.addbroadcast(left_positive, 0)
-        right_broadcasted = T.addbroadcast(right_positive, 0)
-        left_score = self.config['similarity'](self.relations.transform(left_broadcasted, relation), right_positive)
-        right_score = self.config['similarity'](self.relations.transform(left_positive, relation), right_broadcasted)
+        left_broadcasted = T.addbroadcast(left, 0)
+        right_broadcasted = T.addbroadcast(right, 0)
+        left_score = self.config['similarity'](self.relations.transform(left_broadcasted, relation), right)
+        right_score = self.config['similarity'](self.relations.transform(left, relation), right_broadcasted)

-        self.left_scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[left_score])
-        self.right_scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[right_score])
+        self.left_scoring_function = theano.function(inputs=inputs, outputs=[left_score])
+        self.right_scoring_function = theano.function(inputs=inputs, outputs=[right_score])

     def updates(self, cost):
         """ Compute the updates to perform a SGD step w.r.t. a given cost.
@@ -131,36 +123,36 @@ class Model(object):
         number_epoch = self.config['number of epoch']

         for epoch in xrange(number_epoch):
-            if (epoch+1) % validation_frequency == 0:
-                self.validate(epoch+1)
-
             for (relation, left_positive, right_positive, left_negative, right_negative) in self.dataset.training_minibatch(batch_size):
                 self.normalise_function()
                 self.train_function(relation, left_positive, right_positive, left_negative, right_negative)

-    def error(self, name):
-        """ Compute the mean rank and top 10 on a given data. """
-        count, mean, top10 = 0, 0, 0
+            if (epoch+1) % validation_frequency == 0:
+                self.validate(epoch+1)
+
+    def error(self, name, transform_scores=(lambda x: x)):
+        """ Compute the mean rank, standard deviation and top 10 on a given data. """
+        result = []
         for (relation, left, right) in self.dataset.iterate(name):
-            left_scores, right_scores = None, None
             entities = self.dataset.universe
             left_scores = self.left_scoring_function(relation, left, entities)
             right_scores = self.right_scoring_function(relation, entities, right)
+            left_scores = transform_scores(left_scores)
+            right_scores = transform_scores(right_scores)
             left_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(left_scores)==right.indices[0])[1]) # FIXME Ugly
             right_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(right_scores)==left.indices[0])[1]) # FIXME Ugly
-            count += 2
-            mean += left_rank + right_rank
-            top10 += (left_rank<=10) + (right_rank<=10)
-        mean = float(mean) / count
-        top10 = float(top10) / count
-        return (mean, top10)
+            result.extend((left_rank, right_rank))
+        mean = numpy.mean(result)
+        std = numpy.std(result)
+        top10 = numpy.mean(map(lambda x: x<=10, result))
+        return (mean, std, top10)

     def validate(self, epoch):
         """ Validate the model. """
         log('Validation epoch {:<5}'.format(epoch))
-        (valid_mean, valid_top10) = self.error('valid')
-        log(' valid mean: {0:<15} valid top10: {1:<15}'.format(valid_mean, valid_top10))
-        datalog(self.config['datalog path']+'/'+self.config['model name'], epoch, valid_mean, valid_top10)
+        (valid_mean, valid_std, valid_top10) = self.error('valid')
+        log(' valid mean: {0:<15} valid std: {1:<15} valid top10: {2:<15}'.format(valid_mean, valid_std, valid_top10))
+        datalog(self.config['datalog path']+'/'+self.config['model name'], epoch, valid_mean, valid_std, valid_top10)
         if not hasattr(self, 'best_mean') or valid_mean < self.best_mean:
             self.best_mean = valid_mean
             log('(best so far')
@@ -171,14 +163,14 @@ class Model(object):
             log(')')

         if self.config['validate on training data']:
-            (train_mean, train_top10) = self.error('train')
-            log(' train mean: {0:<15} train top10: {1:<15}'.format(train_mean, train_top10))
+            (train_mean, train_std, train_top10) = self.error('train')
+            log(' train mean: {0:<15} std: {1:<15} train top10: {2:<15}'.format(train_mean, train_std, train_top10))
         log('\n')

     def test(self):
         """ Test the model. """
         log('# Testing the model "{0}"'.format(self.tag))
-        (mean, top10) = self.error('test')
-        log(' mean: {0:<15} top10: {1:<15} (saving...'.format(mean, top10))
+        (mean, std, top10) = self.error('test')
+        log(' mean: {0:<15} std: {1:<15} top10: {2:<15} (saving...'.format(mean, std, top10))
         self.save('{0}/{1}.last'.format(self.config['last model save path'], self.config['model name']))
         log(' done)\n')
diff --git a/test.py b/test.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python2
+
+from __future__ import print_function
+import sys
+
+from dataset import *
+from model import *
+from meta_model import *
+from config import *
+
+if __name__ == '__main__':
+    if len(sys.argv)<3:
+        print('Usage: {0} data config [models]...'.format(sys.argv[0]), file=sys.stderr)
+        sys.exit(1)
+    data = sys.argv[1]
+    config_path = sys.argv[2]
+
+    if len(sys.argv)<4: model_pathes = None
+    elif len(sys.argv)>4: model_pathes = sys.argv[3:]
+    else: model_pathes = sys.argv[3]
+
+    config = load_config(config_path)
+    if not config.get('meta', False) and model_pathes is None:
+        model_pathes = '{0}/{1}.best'.format(config['best model save path'], config['model name'])
+    if not config.get('meta', False) and isinstance(model_pathes, list):
+        print('Error: multiple model specified while running in single mode', file=sys.stderr)
+        sys.exit(1)
+    ModelType = Meta_model if config.get('meta', False) else Model
+
+    data = Dataset(data, config['rng'])
+    model = ModelType(data, config, model_pathes)
+    model.build_test()
+    model.test()
diff --git a/train.py b/train.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python2
+
+from __future__ import print_function
+import sys
+
+from dataset import *
+from model import *
+from relations import *
+from config import *
+
+if __name__ == '__main__':
+    if len(sys.argv)<3:
+        print('Usage: {0} data config [model]'.format(sys.argv[0]), file=sys.stderr)
+        sys.exit(1)
+    data = sys.argv[1]
+    config_path = sys.argv[2]
+    model_path = None if len(sys.argv)<4 else sys.argv[3]
+
+    config = load_config(config_path)
+    data = Dataset(data, config['rng'])
+    model = Model(data, config, model_path)
+
+    model.build_train()
+    model.build_test()
+    model.train()
+    model.test()