taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 54613c1f9cf510ca7a71d6619418f2247515aec6
parent 712035b88be1816d3fbd58ce69ae6464767c780e
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Tue,  5 May 2015 14:15:21 -0400

Add models for time prediction

Diffstat:
A config/dest_simple_mlp_2_cs.py | 21 +++++++++++++++++++++
A config/dest_simple_mlp_2_cswdt.py | 25 +++++++++++++++++++++++++
A config/dest_simple_mlp_2_noembed.py | 18 ++++++++++++++++++
A config/dest_simple_mlp_tgtcls_0_cs.py | 25 +++++++++++++++++++++++++
A config/dest_simple_mlp_tgtcls_1_cs.py | 25 +++++++++++++++++++++++++
A config/dest_simple_mlp_tgtcls_1_cswdt.py | 29 +++++++++++++++++++++++++++++
A config/dest_simple_mlp_tgtcls_1_cswdtx.py | 30 ++++++++++++++++++++++++++++++
D config/simple_mlp_2_cs.py | 21 ---------------------
D config/simple_mlp_2_cswdt.py | 25 -------------------------
D config/simple_mlp_2_noembed.py | 18 ------------------
D config/simple_mlp_tgtcls_0_cs.py | 25 -------------------------
D config/simple_mlp_tgtcls_1_cs.py | 25 -------------------------
D config/simple_mlp_tgtcls_1_cswdt.py | 29 -----------------------------
D config/simple_mlp_tgtcls_1_cswdtx.py | 30 ------------------------------
A config/time_simple_mlp_1.py | 19 +++++++++++++++++++
A config/time_simple_mlp_2_cswdtx.py | 26 ++++++++++++++++++++++++++
M data.py | 6 ++----
A error.py | 40 ++++++++++++++++++++++++++++++++++++++++
D hdist.py | 37 -------------------------------------
A model/dest_simple_mlp.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A model/dest_simple_mlp_tgtcls.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D model/simple_mlp.py | 71 -----------------------------------------------------------------------
D model/simple_mlp_tgtcls.py | 73 -------------------------------------------------------------------------
A model/time_simple_mlp.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M train.py | 27 ++++++++++++++++-----------
M transformers.py | 4 ++--
26 files changed, 491 insertions(+), 371 deletions(-)

diff --git a/config/dest_simple_mlp_2_cs.py b/config/dest_simple_mlp_2_cs.py @@ -0,0 +1,21 @@ +import model.dest_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10) +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [200, 100] +dim_output = 2 + +learning_rate = 0.0001 +momentum = 0.99 +batch_size = 32 diff --git a/config/dest_simple_mlp_2_cswdt.py b/config/dest_simple_mlp_2_cswdt.py @@ -0,0 +1,25 @@ +import model.dest_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10), + ('week_of_year', 52, 10), + ('day_of_week', 7, 10), + ('qhour_of_day', 24 * 4, 10), + ('day_type', 3, 10), +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [200, 100] +dim_output = 2 + +learning_rate = 0.0001 +momentum = 0.99 +batch_size = 32 diff --git a/config/dest_simple_mlp_2_noembed.py b/config/dest_simple_mlp_2_noembed.py @@ -0,0 +1,18 @@ +import model.dest_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [] # do not use embeddings + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [200, 100] +dim_output = 2 + +learning_rate = 0.0001 +momentum = 0.99 +batch_size = 32 diff --git a/config/dest_simple_mlp_tgtcls_0_cs.py b/config/dest_simple_mlp_tgtcls_0_cs.py @@ -0,0 +1,25 @@ +import cPickle + +import data + +import model.dest_simple_mlp_tgtcls as model + +n_begin_end_pts = 5 
# how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10) +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [] +dim_output = tgtcls.shape[0] + +learning_rate = 0.0001 +momentum = 0.99 +batch_size = 32 diff --git a/config/dest_simple_mlp_tgtcls_1_cs.py b/config/dest_simple_mlp_tgtcls_1_cs.py @@ -0,0 +1,25 @@ +import cPickle + +import data + +import model.dest_simple_mlp_tgtcls as model + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10) +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [500] +dim_output = tgtcls.shape[0] + +learning_rate = 0.0001 +momentum = 0.99 +batch_size = 32 diff --git a/config/dest_simple_mlp_tgtcls_1_cswdt.py b/config/dest_simple_mlp_tgtcls_1_cswdt.py @@ -0,0 +1,29 @@ +import cPickle + +import data + +import model.dest_simple_mlp_tgtcls as model + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10), + ('week_of_year', 52, 10), + ('day_of_week', 7, 10), + ('qhour_of_day', 24 * 4, 10), + ('day_type', 3, 10), +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [500] +dim_output = tgtcls.shape[0] + +learning_rate = 0.0001 
+momentum = 0.99 +batch_size = 32 diff --git a/config/dest_simple_mlp_tgtcls_1_cswdtx.py b/config/dest_simple_mlp_tgtcls_1_cswdtx.py @@ -0,0 +1,30 @@ +import cPickle + +import data + +import model.dest_simple_mlp_tgtcls as model + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10), + ('week_of_year', 52, 10), + ('day_of_week', 7, 10), + ('qhour_of_day', 24 * 4, 10), + ('day_type', 3, 10), + ('taxi_id', 448, 10), +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [500] +dim_output = tgtcls.shape[0] + +learning_rate = 0.0001 +momentum = 0.99 +batch_size = 32 diff --git a/config/simple_mlp_2_cs.py b/config/simple_mlp_2_cs.py @@ -1,21 +0,0 @@ -import model.simple_mlp as model - -import data - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -dim_embeddings = [ - ('origin_call', data.n_train_clients+1, 10), - ('origin_stand', data.n_stands+1, 10) -] - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) -dim_hidden = [200, 100] -dim_output = 2 - -learning_rate = 0.0001 -momentum = 0.99 -batch_size = 32 diff --git a/config/simple_mlp_2_cswdt.py b/config/simple_mlp_2_cswdt.py @@ -1,25 +0,0 @@ -import model.simple_mlp as model - -import data - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -dim_embeddings = [ - ('origin_call', data.n_train_clients+1, 10), - ('origin_stand', data.n_stands+1, 10), - ('week_of_year', 52, 10), - ('day_of_week', 7, 10), - ('qhour_of_day', 24 * 4, 10), - ('day_type', 3, 10), -] - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, 
_, x) in dim_embeddings) -dim_hidden = [200, 100] -dim_output = 2 - -learning_rate = 0.0001 -momentum = 0.99 -batch_size = 32 diff --git a/config/simple_mlp_2_noembed.py b/config/simple_mlp_2_noembed.py @@ -1,18 +0,0 @@ -import model.simple_mlp as model - -import data - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -dim_embeddings = [] # do not use embeddings - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) -dim_hidden = [200, 100] -dim_output = 2 - -learning_rate = 0.0001 -momentum = 0.99 -batch_size = 32 diff --git a/config/simple_mlp_tgtcls_0_cs.py b/config/simple_mlp_tgtcls_0_cs.py @@ -1,25 +0,0 @@ -import cPickle - -import data - -import model.simple_mlp_tgtcls as model - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) - -dim_embeddings = [ - ('origin_call', data.n_train_clients+1, 10), - ('origin_stand', data.n_stands+1, 10) -] - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) -dim_hidden = [] -dim_output = tgtcls.shape[0] - -learning_rate = 0.0001 -momentum = 0.99 -batch_size = 32 diff --git a/config/simple_mlp_tgtcls_1_cs.py b/config/simple_mlp_tgtcls_1_cs.py @@ -1,25 +0,0 @@ -import cPickle - -import data - -import model.simple_mlp_tgtcls as model - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) - -dim_embeddings = [ - ('origin_call', data.n_train_clients+1, 10), - ('origin_stand', data.n_stands+1, 10) -] - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) -dim_hidden = [500] -dim_output = tgtcls.shape[0] - -learning_rate = 0.0001 -momentum = 
0.99 -batch_size = 32 diff --git a/config/simple_mlp_tgtcls_1_cswdt.py b/config/simple_mlp_tgtcls_1_cswdt.py @@ -1,29 +0,0 @@ -import cPickle - -import data - -import model.simple_mlp_tgtcls as model - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) - -dim_embeddings = [ - ('origin_call', data.n_train_clients+1, 10), - ('origin_stand', data.n_stands+1, 10), - ('week_of_year', 52, 10), - ('day_of_week', 7, 10), - ('qhour_of_day', 24 * 4, 10), - ('day_type', 3, 10), -] - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) -dim_hidden = [500] -dim_output = tgtcls.shape[0] - -learning_rate = 0.0001 -momentum = 0.99 -batch_size = 32 diff --git a/config/simple_mlp_tgtcls_1_cswdtx.py b/config/simple_mlp_tgtcls_1_cswdtx.py @@ -1,30 +0,0 @@ -import cPickle - -import data - -import model.simple_mlp_tgtcls as model - -n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory -n_end_pts = 5 - -n_valid = 1000 - -with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f) - -dim_embeddings = [ - ('origin_call', data.n_train_clients+1, 10), - ('origin_stand', data.n_stands+1, 10), - ('week_of_year', 52, 10), - ('day_of_week', 7, 10), - ('qhour_of_day', 24 * 4, 10), - ('day_type', 3, 10), - ('taxi_id', 448, 10), -] - -dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) -dim_hidden = [500] -dim_output = tgtcls.shape[0] - -learning_rate = 0.0001 -momentum = 0.99 -batch_size = 32 diff --git a/config/time_simple_mlp_1.py b/config/time_simple_mlp_1.py @@ -0,0 +1,19 @@ +import model.time_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [ +] + +dim_input = 
n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [200] +dim_output = 1 + +learning_rate = 0.00001 +momentum = 0.99 +batch_size = 32 diff --git a/config/time_simple_mlp_2_cswdtx.py b/config/time_simple_mlp_2_cswdtx.py @@ -0,0 +1,26 @@ +import model.time_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10), + ('week_of_year', 52, 10), + ('day_of_week', 7, 10), + ('qhour_of_day', 24 * 4, 10), + ('day_type', 3, 10), + ('taxi_id', 448, 10), +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [500, 100] +dim_output = 1 + +learning_rate = 0.00001 +momentum = 0.99 +batch_size = 32 diff --git a/data.py b/data.py @@ -179,15 +179,13 @@ taxi_columns_valid = taxi_columns + [ ("time", lambda l: int(l[11])), ] -train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)] -valid_files=["%s/split/valid2-cut.csv" % (DATA_PATH,)] +valid_files=["%s/valid2-cut.csv" % (DATA_PATH,)] test_file="%s/test.csv" % (DATA_PATH,) -train_data=TaxiData(train_files, taxi_columns) valid_data = TaxiData(valid_files, taxi_columns_valid) test_data = TaxiData(test_file, taxi_columns, has_header=True) -valid_trips = [l for l in open(DATA_PATH + "/split/valid2-cut-ids.txt")] +valid_trips = [l for l in open(DATA_PATH + "/valid2-cut-ids.txt")] def train_it(): return DataIterator(DataStream(train_data)) diff --git a/error.py b/error.py @@ -0,0 +1,40 @@ +from theano import tensor +import theano +import numpy + +def const(v): + if theano.config.floatX == 'float32': + return numpy.float32(v) + else: + return numpy.float64(v) + +rearth = const(6371) +deg2rad = const(3.141592653589793 / 180) + +def hdist(a, b): + lat1 = a[:, 0] * deg2rad + lon1 = a[:, 1] * deg2rad + lat2 = b[:, 0] * deg2rad + lon2 = 
b[:, 1] * deg2rad + + dlat = abs(lat1-lat2) + dlon = abs(lon1-lon2) + + al = tensor.sin(dlat/2)**2 + tensor.cos(lat1) * tensor.cos(lat2) * (tensor.sin(dlon/2)**2) + d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(const(1)-al)) + + hd = const(2) * rearth * d + + return tensor.switch(tensor.eq(hd, float('nan')), (a-b).norm(2, axis=1), hd) + +def erdist(a, b): + lat1 = a[:, 0] * deg2rad + lon1 = a[:, 1] * deg2rad + lat2 = b[:, 0] * deg2rad + lon2 = b[:, 1] * deg2rad + x = (lon2-lon1) * tensor.cos((lat1+lat2)/2) + y = (lat2-lat1) + return tensor.sqrt(tensor.sqr(x) + tensor.sqr(y)) * rearth + +def rmsle(a, b): + return tensor.sqrt( ( (tensor.log(a+1)-tensor.log(b+1)) ** 2 ).mean() ) diff --git a/hdist.py b/hdist.py @@ -1,37 +0,0 @@ -from theano import tensor -import theano -import numpy - -def const(v): - if theano.config.floatX == 'float32': - return numpy.float32(v) - else: - return numpy.float64(v) - -rearth = const(6371) -deg2rad = const(3.141592653589793 / 180) - -def hdist(a, b): - lat1 = a[:, 0] * deg2rad - lon1 = a[:, 1] * deg2rad - lat2 = b[:, 0] * deg2rad - lon2 = b[:, 1] * deg2rad - - dlat = abs(lat1-lat2) - dlon = abs(lon1-lon2) - - al = tensor.sin(dlat/2)**2 + tensor.cos(lat1) * tensor.cos(lat2) * (tensor.sin(dlon/2)**2) - d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(const(1)-al)) - - hd = const(2) * rearth * d - - return tensor.switch(tensor.eq(hd, float('nan')), (a-b).norm(2, axis=1), hd) - -def erdist(a, b): - lat1 = a[:, 0] * deg2rad - lon1 = a[:, 1] * deg2rad - lat2 = b[:, 0] * deg2rad - lon2 = b[:, 1] * deg2rad - x = (lon2-lon1) * tensor.cos((lat1+lat2)/2) - y = (lat2-lat1) - return tensor.sqrt(tensor.sqr(x) + tensor.sqr(y)) * rearth diff --git a/model/dest_simple_mlp.py b/model/dest_simple_mlp.py @@ -0,0 +1,73 @@ +from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity +from blocks.bricks.lookup import LookupTable + +from blocks.initialization import IsotropicGaussian, Constant + +from theano import tensor + +import data +import error 
+ +class Model(object): + def __init__(self, config): + # The input and the targets + x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1] + + x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1] + + input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] + embed_tables = [] + + self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] + + for (varname, num, dim) in config.dim_embeddings: + self.require_inputs.append(varname) + vardata = tensor.lvector(varname) + tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) + embed_tables.append(tbl) + input_list.append(tbl.apply(vardata)) + + y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], + tensor.vector('destination_longitude')[:, None]), axis=1) + + # Define the model + mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], + dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) + + # Create the Theano variables + inputs = tensor.concatenate(input_list, axis=1) + # inputs = theano.printing.Print("inputs")(inputs) + outputs = mlp.apply(inputs) + + # Normalize & Center + # outputs = theano.printing.Print("normal_outputs")(outputs) + outputs = data.data_std * outputs + data.porto_center + + # outputs = theano.printing.Print("outputs")(outputs) + # y = theano.printing.Print("y")(y) + + outputs.name = 'outputs' + + # Calculate the cost + cost = error.erdist(outputs, y).mean() + cost.name = 'cost' + hcost = error.hdist(outputs, y).mean() + hcost.name = 'hcost' + + # Initialization + for tbl in embed_tables: + tbl.weights_init = IsotropicGaussian(0.001) + mlp.weights_init = 
IsotropicGaussian(0.01) + mlp.biases_init = Constant(0.001) + + for tbl in embed_tables: + tbl.initialize() + mlp.initialize() + + self.cost = cost + self.monitor = [cost, hcost] + self.outputs = outputs + self.pred_vars = ['destination_latitude', 'destination_longitude'] + diff --git a/model/dest_simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py @@ -0,0 +1,75 @@ +import numpy + +import theano +from theano import tensor + +from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax +from blocks.bricks.lookup import LookupTable + +from blocks.initialization import IsotropicGaussian, Constant + +import data +import error + +class Model(object): + def __init__(self, config): + # The input and the targets + x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1] + + x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1] + + input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] + embed_tables = [] + + self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] + + for (varname, num, dim) in config.dim_embeddings: + self.require_inputs.append(varname) + vardata = tensor.lvector(varname) + tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) + embed_tables.append(tbl) + input_list.append(tbl.apply(vardata)) + + y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], + tensor.vector('destination_longitude')[:, None]), axis=1) + + # Define the model + mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Softmax()], + dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) + classes = theano.shared(numpy.array(config.tgtcls, 
dtype=theano.config.floatX), name='classes') + + # Create the Theano variables + inputs = tensor.concatenate(input_list, axis=1) + + # inputs = theano.printing.Print("inputs")(inputs) + cls_probas = mlp.apply(inputs) + outputs = tensor.dot(cls_probas, classes) + + # outputs = theano.printing.Print("outputs")(outputs) + # y = theano.printing.Print("y")(y) + + outputs.name = 'outputs' + + # Calculate the cost + cost = error.erdist(outputs, y).mean() + cost.name = 'cost' + hcost = error.hdist(outputs, y).mean() + hcost.name = 'hcost' + + # Initialization + for tbl in embed_tables: + tbl.weights_init = IsotropicGaussian(0.001) + mlp.weights_init = IsotropicGaussian(0.01) + mlp.biases_init = Constant(0.001) + + for tbl in embed_tables: + tbl.initialize() + mlp.initialize() + + self.cost = cost + self.monitor = [cost, hcost] + self.outputs = outputs + self.pred_vars = ['destination_latitude', 'destination_longitude'] + diff --git a/model/simple_mlp.py b/model/simple_mlp.py @@ -1,71 +0,0 @@ -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity -from blocks.bricks.lookup import LookupTable - -from blocks.initialization import IsotropicGaussian, Constant - -from theano import tensor - -import data -import hdist - -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0] - x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1] - - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1] - - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] - - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] - - for (varname, num, dim) in config.dim_embeddings: - 
self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) - - y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], - tensor.vector('destination_longitude')[:, None]), axis=1) - - # Define the model - mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], - dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - # inputs = theano.printing.Print("inputs")(inputs) - outputs = mlp.apply(inputs) - - # Normalize & Center - # outputs = theano.printing.Print("normal_outputs")(outputs) - outputs = data.data_std * outputs + data.porto_center - - # outputs = theano.printing.Print("outputs")(outputs) - # y = theano.printing.Print("y")(y) - - outputs.name = 'outputs' - - # Calculate the cost - cost = hdist.erdist(outputs, y).mean() - cost.name = 'cost' - hcost = hdist.hdist(outputs, y).mean() - hcost.name = 'hcost' - - # Initialization - for tbl in embed_tables: - tbl.weights_init = IsotropicGaussian(0.001) - mlp.weights_init = IsotropicGaussian(0.01) - mlp.biases_init = Constant(0.001) - - for tbl in embed_tables: - tbl.initialize() - mlp.initialize() - - self.cost = cost - self.hcost = hcost - self.outputs = outputs diff --git a/model/simple_mlp_tgtcls.py b/model/simple_mlp_tgtcls.py @@ -1,73 +0,0 @@ -import numpy - -import theano -from theano import tensor - -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax -from blocks.bricks.lookup import LookupTable - -from blocks.initialization import IsotropicGaussian, Constant - -import data -import hdist - -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0] - x_firstk_longitude = 
(tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1] - - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1] - - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] - - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] - - for (varname, num, dim) in config.dim_embeddings: - self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) - - y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], - tensor.vector('destination_longitude')[:, None]), axis=1) - - # Define the model - mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Softmax()], - dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) - classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes') - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - - # inputs = theano.printing.Print("inputs")(inputs) - cls_probas = mlp.apply(inputs) - outputs = tensor.dot(cls_probas, classes) - - # outputs = theano.printing.Print("outputs")(outputs) - # y = theano.printing.Print("y")(y) - - outputs.name = 'outputs' - - # Calculate the cost - cost = hdist.erdist(outputs, y).mean() - cost.name = 'cost' - hcost = hdist.hdist(outputs, y).mean() - hcost.name = 'hcost' - - # Initialization - for tbl in embed_tables: - tbl.weights_init = IsotropicGaussian(0.001) - mlp.weights_init = IsotropicGaussian(0.01) - mlp.biases_init = Constant(0.001) - - for tbl in embed_tables: - tbl.initialize() - mlp.initialize() - - self.cost = cost - self.hcost = hcost - self.outputs = outputs diff --git 
a/model/time_simple_mlp.py b/model/time_simple_mlp.py @@ -0,0 +1,65 @@ +from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity +from blocks.bricks.lookup import LookupTable + +from blocks.initialization import IsotropicGaussian, Constant + +from theano import tensor + +import data +import error + +class Model(object): + def __init__(self, config): + # The input and the targets + x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1] + + x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1] + + input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] + embed_tables = [] + + self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] + + for (varname, num, dim) in config.dim_embeddings: + self.require_inputs.append(varname) + vardata = tensor.lvector(varname) + tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) + embed_tables.append(tbl) + input_list.append(tbl.apply(vardata)) + + y = tensor.lvector('time') + + # Define the model + mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], + dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) + + # Create the Theano variables + inputs = tensor.concatenate(input_list, axis=1) + # inputs = theano.printing.Print("inputs")(inputs) + outputs = tensor.exp(mlp.apply(inputs) + 2) + + # outputs = theano.printing.Print("outputs")(outputs) + # y = theano.printing.Print("y")(y) + + outputs.name = 'outputs' + + # Calculate the cost + cost = error.rmsle(outputs.flatten(), y.flatten()) + cost.name = 'cost' + + # Initialization + for tbl in embed_tables: + tbl.weights_init = IsotropicGaussian(0.001) + 
mlp.weights_init = IsotropicGaussian(0.01) + mlp.biases_init = Constant(0.001) + + for tbl in embed_tables: + tbl.initialize() + mlp.initialize() + + self.cost = cost + self.monitor = [cost] + self.outputs = outputs + self.pred_vars = ['time'] diff --git a/train.py b/train.py @@ -20,7 +20,7 @@ from blocks.model import Model from fuel.datasets.hdf5 import H5PYDataset from fuel.transformers import Batch from fuel.streams import DataStream -from fuel.schemes import ConstantScheme, SequentialExampleScheme +from fuel.schemes import ConstantScheme, SequentialExampleScheme, ShuffledExampleScheme from blocks.algorithms import GradientDescent, Scale, AdaDelta, Momentum from blocks.graph import ComputationGraph @@ -31,7 +31,6 @@ from blocks.extensions.monitoring import DataStreamMonitoring import data import transformers -import hdist import apply_model if __name__ == "__main__": @@ -48,7 +47,7 @@ def setup_train_stream(req_vars): which_set='train', subset=slice(0, data.dataset_size), load_in_memory=True) - train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid)) + train = DataStream(train, iteration_scheme=ShuffledExampleScheme(data.dataset_size)) train = transformers.TaxiExcludeTrips(data.valid_trips, train) train = transformers.TaxiGenerateSplits(train, max_splits=100) @@ -91,10 +90,9 @@ def main(): model = config.model.Model(config) cost = model.cost - hcost = model.hcost outputs = model.outputs - req_vars = model.require_inputs + [ 'destination_latitude', 'destination_longitude' ] + req_vars = model.require_inputs + model.pred_vars req_vars_test = model.require_inputs + [ 'trip_id' ] train_stream = setup_train_stream(req_vars) @@ -109,7 +107,7 @@ def main(): step_rule=Momentum(learning_rate=config.learning_rate, momentum=config.momentum), params=params) - extensions=[DataStreamMonitoring([cost, hcost], valid_stream, + extensions=[DataStreamMonitoring(model.monitor, valid_stream, prefix='valid', every_n_batches=1000), 
Printing(every_n_batches=1000), @@ -132,11 +130,18 @@ def main(): outfile = open("output/test-output-%s.csv" % model_name, "w") outcsv = csv.writer(outfile) - outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) - for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): - dest = out['outputs'] - for i, trip in enumerate(out['trip_id']): - outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])]) + if model.pred_vars == ['time']: + outcsv.writerow(["TRIP_ID", "TRAVEL_TIME"]) + for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): + time = out['outputs'] + for i, trip in enumerate(out['trip_id']): + outcsv.writerow([trip, int(time[i, 0])]) + else: + outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) + for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): + dest = out['outputs'] + for i, trip in enumerate(out['trip_id']): + outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])]) outfile.close() diff --git a/transformers.py b/transformers.py @@ -32,7 +32,7 @@ class Select(Transformer): class TaxiGenerateSplits(Transformer): def __init__(self, data_stream, max_splits=-1): super(TaxiGenerateSplits, self).__init__(data_stream) - self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude') + self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude', 'time') self.max_splits = max_splits self.data = None self.splits = [] @@ -63,7 +63,7 @@ class TaxiGenerateSplits(Transformer): dlat = numpy.float32(self.data[self.id_latitude][-1]) dlon = numpy.float32(self.data[self.id_longitude][-1]) - return tuple(r + [dlat, dlon]) + return tuple(r + [dlat, dlon, 15 * (len(self.data[self.id_longitude]) - 1)]) class TaxiAddFirstK(Transformer): def __init__(self, k, stream):