transform

old TransE-like models
git clone https://esimon.eu/repos/transform.git
Log | Files | Refs | README

commit 744de49cdcd00d5bead21197cc31bf226cdb03c0
parent a6ea206decf38474e3c970077f96fabe40811829
Author: Étienne Simon <esimon@esimon.eu>
Date:   Wed, 16 Apr 2014 16:02:51 +0200

Add FB15k dataset builder

Diffstat:
Autils/build Bordes FB15k.py | 102+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 102 insertions(+), 0 deletions(-)

diff --git a/utils/build Bordes FB15k.py b/utils/build Bordes FB15k.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python2 + +import sys +import os + +urls = [ 'https://www.hds.utc.fr/everest/lib/exe/fetch.php?id=en%3Atranse&cache=cache&media=en:fb15k.tgz' ] + +def get_archive(path): + import urllib + + class URLopener(urllib.FancyURLopener): + def http_error_default(self, url, fp, errcode, errmsg, headers): + print >>sys.stderr, 'Error: {0} {1}'.format(errcode, errmsg) + raise IOError + + archive = path+'/archive.tgz' + downloaded = False + for url in urls: + print >>sys.stderr, 'Downloading dataset from "{0}"...'.format(url), + try: + URLopener().retrieve(url, archive) + downloaded = True + print >>sys.stderr, ' done' + except IOError: + pass + + if not downloaded: + print >>sys.stderr, 'Error: Unable to download dataset.' + sys.exit(1) + +def get_raw(path): + if os.path.isdir(path+'/raw'): + return + + get_archive(path) + + print >>sys.stderr, 'Raw files not found, extracting archive...', + raw = path+'/raw' + os.mkdir(raw) + + import tarfile + tar = tarfile.open(path+'/archive.tgz', 'r:gz') + tar.extractall(raw) + print >>sys.stderr, ' done' + +def compile_dataset(path): + get_raw(path) + prefix = path+'/raw/FB15k/freebase_mtr100_mte100-' + suffix = '.txt' + + print >>sys.stderr, 'Reading train file...', + with open(prefix+'train'+suffix, 'r') as file: + content = map(lambda line: line.rstrip('\n').split('\t'), file.readlines()) + [left, relations, right] = map(set, zip(*content)) + entities = left | right + print >>sys.stderr, ' done' + + print >>sys.stderr, 'Writting entities...', + e2i, i2e, r2i, i2r = {}, {}, {}, {} + with open(path+'/entities', 'w') as file: + i=0 + for entity in entities: + e2i[entity]=i + i2e[i]=entity + file.write(entity+'\n') + i+=1 + print >>sys.stderr, ' done ({0} entities written)'.format(i) + + print >>sys.stderr, 'Writting relations...', + with open(path+'/relations', 'w') as file: + i=0 + for relation in relations: + r2i[relation]=i + i2r[i]=relation + file.write(relation+'\n') + i+=1 + print >>sys.stderr, ' done ({0} relations written)'.format(i) + + for name in ['train', 'valid', 'test']: + print >>sys.stderr, 'Compiling {0}...'.format(name), + count = 0 + with open(prefix+name+suffix, 'r') as infile: + with open(path+'/'+name, 'w') as outfile: + for line in infile.readlines(): + left, relation, right = line.rstrip('\n').split('\t') + if left in e2i and right in e2i and relation in r2i: + outfile.write('{0}\t{1}\t{2}\n'.format(e2i[left], r2i[relation], e2i[right])) + else: + count+=1 + print >>sys.stderr, ' done ({0} entit{1} removed)'.format(count, 'y' if count<2 else 'ies') + +if __name__ == '__main__': + if len(sys.argv)<2: + print >>sys.stderr, 'Usage: {0} path'.format(sys.argv[0]) + sys.exit(1) + + path = sys.argv[1] + if not os.path.isdir(path): + os.mkdir(path) + + compile_dataset(path) + print 'Bordes FB15k was successfully built in {0}'.format(path)