taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 0527e6e696fa1832d599473099429295dea31650
parent bd2826df73554207c88c5918d86fd9707d9e3753
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Fri, 24 Apr 2015 17:32:57 -0400

It kind of works (at least it does something now)

Diffstat:
M data.py | 10 ++++++----
M model.py | 22 ++++++++++++++--------
M transformers.py | 7 +++++--
3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/data.py b/data.py @@ -7,6 +7,8 @@ from fuel.iterator import DataIterator PREFIX="/data/lisatmp3/auvolat/taxikaggle" +client_ids = {int(x): y+1 for y, x in enumerate(open(PREFIX+"/client_ids.txt"))} + class CallType(Enum): CENTRAL = 0 STAND = 1 @@ -87,7 +89,7 @@ class TaxiData(Dataset): state.index=0 state.file.close() state.file=open(self.pathes[0]) - state.reader=csv.reader(state[0]) + state.reader=csv.reader(state.file) return state def get_data(self, state, request=None): @@ -95,7 +97,7 @@ class TaxiData(Dataset): raise ValueError try: line=state.reader.next() - except StopIteration: + except ValueError: state.file.close() state.index+=1 if state.index>=len(self.pathes): @@ -104,10 +106,10 @@ class TaxiData(Dataset): state.reader=csv.reader(state.file) if self.has_header: state.reader.next() - line=state.reader.next() + return get_data(self, state) line[1]=CallType.from_data(line[1]) # call_type - line[2]=0 if line[2]=='' or line[2]=='NA' else int(line[2]) # origin_call + line[2]=0 if line[2]=='' or line[2]=='NA' else client_ids[int(line[2])] # origin_call line[3]=0 if line[3]=='' or line[3]=='NA' else int(line[3]) # origin_stand line[4]=int(line[4]) # taxi_id line[5]=int(line[5]) # timestamp diff --git a/model.py b/model.py @@ -29,7 +29,7 @@ n_dow = 7 # number of division for dayofweek/dayofmonth/hourofday n_dom = 31 n_hour = 24 -n_clients = 57106 +n_clients = 57105 n_stands = 63 n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory @@ -45,9 +45,9 @@ def main(): # The input and the targets x_firstk = tensor.matrix('first_k') x_lastk = tensor.matrix('last_k') - x_client = tensor.lmatrix('origin_call') - x_stand = tensor.lmatrix('origin_stand') - y = tensor.vector('destination') + x_client = tensor.lvector('origin_call') + x_stand = tensor.lvector('origin_stand') + y = tensor.matrix('destination') # Define the model client_embed_table = LookupTable(length=n_clients+1, dim=dim_embed, name='client_lookup') 
@@ -60,12 +60,15 @@ def main(): client_embed = client_embed_table.apply(x_client).flatten(ndim=2) stand_embed = stand_embed_table.apply(x_stand).flatten(ndim=2) - inputs = tensor.concatenate([x_firstk, x_lastk, client_embed, stand_embed], axis=1) + inputs = tensor.concatenate([x_firstk, x_lastk, + client_embed, stand_embed], + axis=1) hidden = hidden_layer.apply(inputs) outputs = output_layer.apply(hidden) # Calculate the cost cost = (outputs - y).norm(2, axis=1).mean() + cost.name = 'cost' # Initialization client_embed_table.weights_init = IsotropicGaussian(0.001) @@ -83,12 +86,14 @@ def main(): train = DataStream(train) train = transformers.add_extremities(train, n_begin_end_pts) train = transformers.add_destination(train) + train = transformers.Select(train, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination')) train_stream = Batch(train, iteration_scheme=ConstantScheme(batch_size)) valid = data.valid_data valid = DataStream(valid) valid = transformers.add_extremities(valid, n_begin_end_pts) valid = transformers.add_destination(valid) + valid = transformers.Select(valid, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination')) valid_stream = Batch(valid, iteration_scheme=ConstantScheme(batch_size)) @@ -103,9 +108,10 @@ def main(): extensions=[DataStreamMonitoring([cost], valid_stream, prefix='valid', every_n_batches=100), - Printing(every_n_batches=100), - Dump('ngram_blocks_model', every_n_batches=100), - LoadFromDump('ngram_blocks_model')] + Printing(every_n_batches=100), + # Dump('taxi_model', every_n_batches=100), + # LoadFromDump('taxi_model'), + ] main_loop = MainLoop( model=Model([cost]), diff --git a/transformers.py b/transformers.py @@ -1,4 +1,6 @@ from fuel.transformers import Transformer, Filter, Mapping +import numpy +import theano class Select(Transformer): def __init__(self, data_stream, sources): @@ -15,11 +17,12 @@ class Select(Transformer): def add_extremities(stream, k): id_polyline=stream.sources.index('polyline') 
def extremities(x): - return (x[id_polyline][:k], x[id_polyline][-k:]) + return (numpy.array(x[id_polyline][:k], dtype=theano.config.floatX).flatten(), + numpy.array(x[id_polyline][-k:], dtype=theano.config.floatX).flatten()) stream = Filter(stream, lambda x: len(x[id_polyline])>=k) stream = Mapping(stream, extremities, ('first_k', 'last_k')) return stream def add_destination(stream): id_polyline=stream.sources.index('polyline') - return Mapping(stream, lambda x: x[id_polyline][-1], ('destination',)) + return Mapping(stream, lambda x: (numpy.array(x[id_polyline][-1], dtype=theano.config.floatX),), ('destination',))