commit c61b71b63396648f490d9cb10e31de2bcdba601f
parent 682d2a64915e11eeaac999306c17f9d12f9fb22a
Author: Étienne Simon <esimon@esimon.eu>
Date: Wed, 30 Apr 2014 15:44:54 +0200
Add meta-model and train/test executables
Diffstat:
A | config.py | | | 25 | +++++++++++++++++++++++++ |
M | dataset.py | | | 7 | ++++--- |
D | main.py | | | 32 | -------------------------------- |
A | meta_model.py | | | 52 | ++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | model.py | | | 134 | +++++++++++++++++++++++++++++++++++++------------------------------------------ |
A | test.py | | | 33 | +++++++++++++++++++++++++++++++++ |
A | train.py | | | 26 | ++++++++++++++++++++++++++ |
7 files changed, 203 insertions(+), 106 deletions(-)
diff --git a/config.py b/config.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python2
+
+import numpy
+import json
+from model import *
+from relations import *
+
def load_config(path):
    """ Load a JSON config file.

    String values starting with "python:" are evaluated after stripping
    the prefix, so a config can embed python expressions (e.g. an rng or
    a similarity function).

    Keyword arguments:
    path -- path to the JSON config file
    """
    # Portability: `basestring` only exists on python 2 (where json.load
    # returns unicode strings); fall back to `str` on python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    with open(path, 'r') as config_file:
        config = json.load(config_file)
    for k, v in config.items():
        if isinstance(v, string_types) and v.startswith('python:'):
            # NOTE(review): eval of config values executes arbitrary code --
            # only load config files from trusted sources.
            config[k] = eval(v[7:])
    return config
+
def expand_config(base_config):
    """ Expand a base config into one config per sub-model.

    Makes `base_config['size']` shallow copies of the config.  In each copy,
    a scalar 'model name' gets the (zero-padded) copy index appended, and
    every list-valued entry is replaced by its element at the copy index.

    Keyword arguments:
    base_config -- config dictionary containing at least a 'size' entry

    Returns the list of expanded config dictionaries.
    """
    size = base_config['size']
    configs = [base_config.copy() for _ in range(size)]
    # Pad indices so names sort lexicographically (e.g. "m 07" for size 10+).
    width = len(str(size))
    for index, config in enumerate(configs):
        if not isinstance(config['model name'], list):
            config['model name'] = '{0} {1:0{width}}'.format(config['model name'], index, width=width)
        for k, v in config.items():
            if isinstance(v, list):
                config[k] = v[index]
    return configs
diff --git a/dataset.py b/dataset.py
@@ -6,7 +6,8 @@ import numpy
import theano
class Dataset(object):
- def __init__(self, prefix):
+ def __init__(self, prefix, rng):
+ self.rng = rng
log('# Loading dataset "{0}"\n'.format(prefix))
with open(prefix+'/embeddings', 'r') as file:
self.embeddings = file.readlines()
@@ -33,7 +34,7 @@ class Dataset(object):
# Sampling corrupted entities
def sample_matrix():
row = range(self.train_size+1)
- col = numpy.random.randint(0, self.number_embeddings, size=self.train_size)
+ col = self.rng.randint(0, self.number_embeddings, size=self.train_size)
data = numpy.ones(self.train_size)
random_embeddings = scipy.sparse.csr_matrix((data, col, row), shape=(self.train_size, self.number_embeddings), dtype=theano.config.floatX)
return random_embeddings
@@ -41,7 +42,7 @@ class Dataset(object):
corrupted_right = sample_matrix()
# Shuffling training set
- order = numpy.random.permutation(self.train_size)
+ order = self.rng.permutation(self.train_size)
train_left = self.train_left[order, :]
train_right = self.train_right[order, :]
train_relation = self.train_relation[order, :]
diff --git a/main.py b/main.py
@@ -1,32 +0,0 @@
-#!/usr/bin/env python2
-
-from __future__ import print_function
-from utils.log import *
-import sys
-import json
-
-from dataset import *
-from model import *
-from relations import *
-
-if __name__ == '__main__':
- if len(sys.argv)<3:
- print('Usage: {0} data config [model]'.format(sys.argv[0]), file=sys.stderr)
- sys.exit(1)
- data = sys.argv[1]
- config_path = sys.argv[2]
- model_path = None if len(sys.argv)<4 else sys.argv[3]
-
- with open(config_path, 'r') as config_file:
- config = json.load(config_file)
- for k, v in config.iteritems():
- if isinstance(v, basestring) and v.startswith('python:'):
- config[k] = eval(v[7:])
-
- data = Dataset(data)
- if model_path is None:
- model = Model.initialise(config['relations'], data, config, config['model name'])
- else:
- model = Model.load(model_path, data, config, config['model name'])
- model.train()
- model.test()
diff --git a/meta_model.py b/meta_model.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python2
+
+from utils.log import *
+from config import *
+from model import *
+import numpy
+
class Meta_model(object):
    """ Meta-model class.

    Holds one Model per expanded sub-config and combines their per-entity
    scores with the configured 'scores combinator'.
    """

    def __init__(self, dataset, config, pathes=None):
        """ Initialise the meta-model.

        Keyword arguments:
        dataset -- dataset on which the models will be tested
        config -- config dictionary, expanded into one config per sub-model
        pathes -- optional list of saved sub-model file paths
        """
        self.dataset = dataset
        self.combine_scores = config['scores combinator']
        sub_configs = expand_config(config)
        if pathes is None:
            # Default to each sub-model's ".best" checkpoint.
            pathes = ['{0}/{1}.best'.format(c['best model save path'], c['model name']) for c in sub_configs]
        self.models = [Model(dataset, c, p) for (c, p) in zip(sub_configs, pathes)]

    def build_test(self):
        """ Build the theano test functions of every sub-model. """
        for model in self.models:
            model.build_test()

    def left_scoring_function(self, relation, left, right):
        """ Return the sub-model scores, one column per sub-model. """
        scores = [model.left_scoring_function(relation, left, right) for model in self.models]
        return numpy.transpose(scores).reshape(right.shape[0], len(self.models))

    def right_scoring_function(self, relation, left, right):
        """ Return the sub-model scores, one column per sub-model. """
        scores = [model.right_scoring_function(relation, left, right) for model in self.models]
        return numpy.transpose(scores).reshape(left.shape[0], len(self.models))

    def error(self):
        """ Compute the mean rank, standard deviation and top 10 on a given data. """
        ranks = []
        for (relation, left, right) in self.dataset.iterate('test'):
            entities = self.dataset.universe
            left_scores = self.combine_scores(self.left_scoring_function(relation, left, entities))
            right_scores = self.combine_scores(self.right_scoring_function(relation, entities, right))
            left_rank = 1 + numpy.asscalar(numpy.where(numpy.argsort(left_scores) == right.indices[0])[0])  # FIXME Ugly
            right_rank = 1 + numpy.asscalar(numpy.where(numpy.argsort(right_scores) == left.indices[0])[0])  # FIXME Ugly
            ranks.append(left_rank)
            ranks.append(right_rank)
        mean = numpy.mean(ranks)
        std = numpy.std(ranks)
        top10 = numpy.mean([rank <= 10 for rank in ranks])
        return (mean, std, top10)

    def test(self):
        """ Test the model. """
        log('# Testing the model')
        (mean, std, top10) = self.error()
        log(' mean: {0:<15} std: {1:<15} top10: {2:<15}\n'.format(mean, std, top10))
diff --git a/model.py b/model.py
@@ -27,50 +27,29 @@ class Model(object):
Training model using SGD with a contrastive criterion.
"""
- @classmethod
- def initialise(cls, Relations, dataset, config, tag):
+ def __init__(self, dataset, config, filepath=None):
""" Initialise a model.
Keyword arguments:
- Relations -- relations class
dataset -- dataset on which the model will be trained and tested
config -- config dictionary
- tag -- name of the embeddings for parameter declaration
+ filepath -- path to the Model file
"""
- log('# Initialising model "{0}"\n'.format(tag))
- self = cls()
- self.embeddings = Embeddings(config['rng'], dataset.number_embeddings, config['dimension'], tag+'.embeddings')
- self.relations = Relations(config['rng'], dataset.number_relations, config['dimension'], tag+'.relations')
+ log('# Initialising model "{0}"\n'.format(config['model name']))
self.dataset = dataset
self.config = config
- self.tag = tag
-
- self.build()
- return self
-
- @classmethod
- def load(cls, filepath, dataset, config, tag):
- """ Load a model from a file.
-
- Keyword arguments:
- filepath -- path to the Model file
- dataset -- dataset on which the model will be trained and tested
- config -- config dictionary
- tag -- name of the embeddings for parameter declaration
- """
- log('# Loading model from "{0}"\n'.format(filepath))
-
- self = cls()
- with open(filepath, 'rb') as file:
- self.embeddings = cPickle.load(file)
- self.relations = cPickle.load(file)
- self.dataset = dataset;
- self.config = config;
- self.tag = tag
-
- self.build()
- return self
+ self.tag = config['model name']
+
+ if filepath is None:
+ Relations = config['relations']
+ self.embeddings = Embeddings(config['rng'], dataset.number_embeddings, config['dimension'], self.tag+'.embeddings')
+ self.relations = Relations(config['rng'], dataset.number_relations, config['dimension'], self.tag+'.relations')
+ else:
+ log('## Loading model from "{0}"\n'.format(filepath))
+ with open(filepath, 'rb') as file:
+ self.embeddings = cPickle.load(file)
+ self.relations = cPickle.load(file)
def save(self, filepath):
""" Save the model in a file. """
@@ -78,15 +57,18 @@ class Model(object):
cPickle.dump(self.embeddings, file, -1)
cPickle.dump(self.relations, file, -1)
- def build(self):
- """ Build theano functions. """
- log('## Compiling Theano graph for model "{0}"\n'.format(self.tag))
-
- self.parameters = self.relations.parameters + self.embeddings.parameters
- inputs = tuple(S.csr_matrix() for _ in xrange(5))
- left_positive, right_positive = self.embeddings.embed(inputs[1]), self.embeddings.embed(inputs[2])
- left_negative, right_negative = self.embeddings.embed(inputs[3]), self.embeddings.embed(inputs[4])
- relation = self.relations.lookup(inputs[0])
+ def build_train(self):
+ """ Build theano train functions. """
+ log('## Compiling Theano graph for training model "{0}"\n'.format(self.tag))
+ input_relation = S.csr_matrix("relation")
+ input_left_positive = S.csr_matrix("left positive")
+ input_right_positive = S.csr_matrix("right positive")
+ input_left_negative = S.csr_matrix("left negative")
+ input_right_negative = S.csr_matrix("right negative")
+ inputs = [ input_relation, input_left_positive, input_right_positive, input_left_negative, input_right_negative ]
+ left_positive, right_positive = self.embeddings.embed(input_left_positive), self.embeddings.embed(input_right_positive)
+ left_negative, right_negative = self.embeddings.embed(input_left_negative), self.embeddings.embed(input_right_negative)
+ relation = self.relations.lookup(input_relation)
score_positive = self.config['similarity'](self.relations.transform(left_positive, relation), right_positive)
score_left_negative = self.config['similarity'](self.relations.transform(left_negative, relation), right_positive)
@@ -100,17 +82,27 @@ class Model(object):
criterion_right = T.mean(violating_margin_right*score_right)
criterion = criterion_left + criterion_right
- self.train_function = theano.function(inputs=list(inputs), outputs=[criterion], updates=self.updates(criterion))
+ self.train_function = theano.function(inputs=inputs, outputs=[criterion], updates=self.updates(criterion))
self.normalise_function = theano.function(inputs=[], outputs=[], updates=self.embeddings.normalise_updates())
+ def build_test(self):
+ """ Build theano test functions. """
+ log('## Compiling Theano graph for testing model "{0}"\n'.format(self.tag))
+ input_relation = S.csr_matrix("relation")
+ input_left = S.csr_matrix("left")
+ input_right = S.csr_matrix("right")
+ inputs = [ input_relation, input_left, input_right ]
+ left, right = self.embeddings.embed(input_left), self.embeddings.embed(input_right)
+ relation = self.relations.lookup(input_relation)
+
relation = map(lambda r: T.addbroadcast(r, 0), relation)
- left_broadcasted = T.addbroadcast(left_positive, 0)
- right_broadcasted = T.addbroadcast(right_positive, 0)
- left_score = self.config['similarity'](self.relations.transform(left_broadcasted, relation), right_positive)
- right_score = self.config['similarity'](self.relations.transform(left_positive, relation), right_broadcasted)
+ left_broadcasted = T.addbroadcast(left, 0)
+ right_broadcasted = T.addbroadcast(right, 0)
+ left_score = self.config['similarity'](self.relations.transform(left_broadcasted, relation), right)
+ right_score = self.config['similarity'](self.relations.transform(left, relation), right_broadcasted)
- self.left_scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[left_score])
- self.right_scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[right_score])
+ self.left_scoring_function = theano.function(inputs=inputs, outputs=[left_score])
+ self.right_scoring_function = theano.function(inputs=inputs, outputs=[right_score])
def updates(self, cost):
""" Compute the updates to perform a SGD step w.r.t. a given cost.
@@ -131,36 +123,36 @@ class Model(object):
number_epoch = self.config['number of epoch']
for epoch in xrange(number_epoch):
- if (epoch+1) % validation_frequency == 0:
- self.validate(epoch+1)
-
for (relation, left_positive, right_positive, left_negative, right_negative) in self.dataset.training_minibatch(batch_size):
self.normalise_function()
self.train_function(relation, left_positive, right_positive, left_negative, right_negative)
- def error(self, name):
- """ Compute the mean rank and top 10 on a given data. """
- count, mean, top10 = 0, 0, 0
+ if (epoch+1) % validation_frequency == 0:
+ self.validate(epoch+1)
+
+ def error(self, name, transform_scores=(lambda x: x)):
+ """ Compute the mean rank, standard deviation and top 10 on a given data. """
+ result = []
for (relation, left, right) in self.dataset.iterate(name):
- left_scores, right_scores = None, None
entities = self.dataset.universe
left_scores = self.left_scoring_function(relation, left, entities)
right_scores = self.right_scoring_function(relation, entities, right)
+ left_scores = transform_scores(left_scores)
+ right_scores = transform_scores(right_scores)
left_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(left_scores)==right.indices[0])[1]) # FIXME Ugly
right_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(right_scores)==left.indices[0])[1]) # FIXME Ugly
- count += 2
- mean += left_rank + right_rank
- top10 += (left_rank<=10) + (right_rank<=10)
- mean = float(mean) / count
- top10 = float(top10) / count
- return (mean, top10)
+ result.extend((left_rank, right_rank))
+ mean = numpy.mean(result)
+ std = numpy.std(result)
+ top10 = numpy.mean(map(lambda x: x<=10, result))
+ return (mean, std, top10)
def validate(self, epoch):
""" Validate the model. """
log('Validation epoch {:<5}'.format(epoch))
- (valid_mean, valid_top10) = self.error('valid')
- log(' valid mean: {0:<15} valid top10: {1:<15}'.format(valid_mean, valid_top10))
- datalog(self.config['datalog path']+'/'+self.config['model name'], epoch, valid_mean, valid_top10)
+ (valid_mean, valid_std, valid_top10) = self.error('valid')
+ log(' valid mean: {0:<15} valid std: {1:<15} valid top10: {2:<15}'.format(valid_mean, valid_std, valid_top10))
+ datalog(self.config['datalog path']+'/'+self.config['model name'], epoch, valid_mean, valid_std, valid_top10)
if not hasattr(self, 'best_mean') or valid_mean < self.best_mean:
self.best_mean = valid_mean
log('(best so far')
@@ -171,14 +163,14 @@ class Model(object):
log(')')
if self.config['validate on training data']:
- (train_mean, train_top10) = self.error('train')
- log(' train mean: {0:<15} train top10: {1:<15}'.format(train_mean, train_top10))
+ (train_mean, train_std, train_top10) = self.error('train')
+ log(' train mean: {0:<15} std: {1:<15} train top10: {2:<15}'.format(train_mean, train_std, train_top10))
log('\n')
def test(self):
""" Test the model. """
log('# Testing the model "{0}"'.format(self.tag))
- (mean, top10) = self.error('test')
- log(' mean: {0:<15} top10: {1:<15} (saving...'.format(mean, top10))
+ (mean, std, top10) = self.error('test')
+ log(' mean: {0:<15} std: {1:<15} top10: {2:<15} (saving...'.format(mean, std, top10))
self.save('{0}/{1}.last'.format(self.config['last model save path'], self.config['model name']))
log(' done)\n')
diff --git a/test.py b/test.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python2
+
+from __future__ import print_function
+import sys
+
+from dataset import *
+from model import *
+from meta_model import *
+from config import *
+
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: {0} data config [models]...'.format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)
    data = sys.argv[1]
    config_path = sys.argv[2]

    # No model argument -> None, exactly one -> bare path, several -> list.
    model_arguments = sys.argv[3:]
    if not model_arguments:
        model_pathes = None
    elif len(model_arguments) > 1:
        model_pathes = model_arguments
    else:
        model_pathes = model_arguments[0]

    config = load_config(config_path)
    meta = config.get('meta', False)
    if not meta and model_pathes is None:
        # Single-model mode defaults to the model's ".best" checkpoint.
        model_pathes = '{0}/{1}.best'.format(config['best model save path'], config['model name'])
    if not meta and isinstance(model_pathes, list):
        print('Error: multiple model specified while running in single mode', file=sys.stderr)
        sys.exit(1)
    ModelType = Meta_model if meta else Model

    data = Dataset(data, config['rng'])
    model = ModelType(data, config, model_pathes)
    model.build_test()
    model.test()
diff --git a/train.py b/train.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python2
+
+from __future__ import print_function
+import sys
+
+from dataset import *
+from model import *
+from relations import *
+from config import *
+
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: {0} data config [model]'.format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)
    data_path = sys.argv[1]
    config_path = sys.argv[2]
    # Optional third argument: resume from a saved model file.
    model_path = sys.argv[3] if len(sys.argv) >= 4 else None

    config = load_config(config_path)
    dataset = Dataset(data_path, config['rng'])
    model = Model(dataset, config, model_path)

    model.build_train()
    model.build_test()
    model.train()
    model.test()