It kind of works (at least it does something now) - taxi - Winning entry to the Kaggle taxi competition

commit 0527e6e696fa1832d599473099429295dea31650
parent bd2826df73554207c88c5918d86fd9707d9e3753
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Fri, 24 Apr 2015 17:32:57 -0400

It kind of works (at least it does something now)

Diffstat:
M data.py  | 10 ++++++----
M model.py  | 22 ++++++++++++++--------
M transformers.py  | 7 +++++--

3 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/data.py b/data.py
@@ -7,6 +7,8 @@ from fuel.iterator import DataIterator
 
 PREFIX="/data/lisatmp3/auvolat/taxikaggle"
 
+client_ids = {int(x): y+1 for y, x in enumerate(open(PREFIX+"/client_ids.txt"))}
+
 class CallType(Enum):
     CENTRAL = 0
     STAND = 1
@@ -87,7 +89,7 @@ class TaxiData(Dataset):
             state.index=0
             state.file.close()
             state.file=open(self.pathes[0])
-        state.reader=csv.reader(state[0])
+        state.reader=csv.reader(state.file)
         return state
 
     def get_data(self, state, request=None):
@@ -95,7 +97,7 @@ class TaxiData(Dataset):
             raise ValueError
         try:
             line=state.reader.next()
-        except StopIteration:
+        except ValueError:
             state.file.close()
             state.index+=1
             if state.index>=len(self.pathes):
@@ -104,10 +106,10 @@ class TaxiData(Dataset):
             state.reader=csv.reader(state.file)
             if self.has_header:
                 state.reader.next()
-            line=state.reader.next()
+            return get_data(self, state)
 
         line[1]=CallType.from_data(line[1]) # call_type
-        line[2]=0 if line[2]=='' or line[2]=='NA' else int(line[2]) # origin_call
+        line[2]=0 if line[2]=='' or line[2]=='NA' else client_ids[int(line[2])] # origin_call
         line[3]=0 if line[3]=='' or line[3]=='NA' else int(line[3]) # origin_stand
         line[4]=int(line[4]) # taxi_id
         line[5]=int(line[5]) # timestamp
diff --git a/model.py b/model.py
@@ -29,7 +29,7 @@ n_dow = 7       # number of division for dayofweek/dayofmonth/hourofday
 n_dom = 31
 n_hour = 24
 
-n_clients = 57106
+n_clients = 57105
 n_stands = 63
 
 n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
@@ -45,9 +45,9 @@ def main():
     # The input and the targets
     x_firstk = tensor.matrix('first_k')
     x_lastk = tensor.matrix('last_k')
-    x_client = tensor.lmatrix('origin_call')
-    x_stand = tensor.lmatrix('origin_stand')
-    y = tensor.vector('destination')
+    x_client = tensor.lvector('origin_call')
+    x_stand = tensor.lvector('origin_stand')
+    y = tensor.matrix('destination')
 
     # Define the model
     client_embed_table = LookupTable(length=n_clients+1, dim=dim_embed, name='client_lookup')
@@ -60,12 +60,15 @@ def main():
 
     client_embed = client_embed_table.apply(x_client).flatten(ndim=2)
     stand_embed = stand_embed_table.apply(x_stand).flatten(ndim=2)
-    inputs = tensor.concatenate([x_firstk, x_lastk, client_embed, stand_embed], axis=1)
+    inputs = tensor.concatenate([x_firstk, x_lastk,
+                                 client_embed, stand_embed],
+                                axis=1)
     hidden = hidden_layer.apply(inputs)
     outputs = output_layer.apply(hidden)
 
     # Calculate the cost
     cost = (outputs - y).norm(2, axis=1).mean()
+    cost.name = 'cost'
 
     # Initialization
     client_embed_table.weights_init = IsotropicGaussian(0.001)
@@ -83,12 +86,14 @@ def main():
     train = DataStream(train)
     train = transformers.add_extremities(train, n_begin_end_pts)
     train = transformers.add_destination(train)
+    train = transformers.Select(train, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination'))
     train_stream = Batch(train, iteration_scheme=ConstantScheme(batch_size))
 
     valid = data.valid_data
     valid = DataStream(valid)
     valid = transformers.add_extremities(valid, n_begin_end_pts)
     valid = transformers.add_destination(valid)
+    valid = transformers.Select(valid, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination'))
     valid_stream = Batch(valid, iteration_scheme=ConstantScheme(batch_size))
 
 
@@ -103,9 +108,10 @@ def main():
     extensions=[DataStreamMonitoring([cost], valid_stream,
                                      prefix='valid',
                                      every_n_batches=100),
-                    Printing(every_n_batches=100),
-                    Dump('ngram_blocks_model', every_n_batches=100),
-                    LoadFromDump('ngram_blocks_model')]
+                Printing(every_n_batches=100),
+                # Dump('taxi_model', every_n_batches=100),
+                # LoadFromDump('taxi_model'),
+                ]
 
     main_loop = MainLoop(
         model=Model([cost]),
diff --git a/transformers.py b/transformers.py
@@ -1,4 +1,6 @@
 from fuel.transformers import Transformer, Filter, Mapping
+import numpy
+import theano
 
 class Select(Transformer):
     def __init__(self, data_stream, sources):
@@ -15,11 +17,12 @@ class Select(Transformer):
 def add_extremities(stream, k):
     id_polyline=stream.sources.index('polyline')
     def extremities(x):
-        return (x[id_polyline][:k], x[id_polyline][-k:])
+        return (numpy.array(x[id_polyline][:k], dtype=theano.config.floatX).flatten(),
+                numpy.array(x[id_polyline][-k:], dtype=theano.config.floatX).flatten())
     stream = Filter(stream, lambda x: len(x[id_polyline])>=k)
     stream = Mapping(stream, extremities, ('first_k', 'last_k'))
     return stream
 
 def add_destination(stream):
     id_polyline=stream.sources.index('polyline')
-    return Mapping(stream, lambda x: x[id_polyline][-1], ('destination',))
+    return Mapping(stream, lambda x: (numpy.array(x[id_polyline][-1], dtype=theano.config.floatX),), ('destination',))

	taxi Winning entry to the Kaggle taxi competition
	git clone https://esimon.eu/repos/taxi.git
	Log | Files | Refs | README

M data.py  | 10 ++++++----
M model.py  | 22 ++++++++++++++--------
M transformers.py  | 7 +++++--