taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 13fc171f60ae1981c7ad4f2a302a8a85c29addc5
parent 8d31f9240056ec110cf63bde79d7661321d8ca7a
Author: Étienne Simon <esimon@esimon.eu>
Date:   Thu, 23 Jul 2015 21:20:32 -0400

Use new tvt dataset with option --tvt

Diffstat:
M config/bidirectional_1.py | 1 -
M config/bidirectional_tgtcls_1.py | 1 -
M config/dest_mlp_2_cs.py | 1 -
M config/dest_mlp_2_cswdt.py | 1 -
M config/dest_mlp_2_noembed.py | 1 -
M config/dest_mlp_emb_only.py | 1 -
M config/dest_mlp_tgtcls_0_cs.py | 1 -
M config/dest_mlp_tgtcls_1_cs.py | 1 -
M config/dest_mlp_tgtcls_1_cswdt.py | 1 -
M config/dest_mlp_tgtcls_1_cswdtx.py | 3 ---
M config/dest_mlp_tgtcls_1_cswdtx_alexandre.py | 1 -
M config/dest_mlp_tgtcls_1_cswdtx_batchshuffle.py | 1 -
M config/dest_mlp_tgtcls_1_cswdtx_small.py | 1 -
M config/dest_mlp_tgtcls_2_cswdtx_small.py | 1 -
M config/joint_mlp_tgtcls_111_cswdtx.py | 1 -
M config/joint_mlp_tgtcls_111_cswdtx_bigger.py | 2 --
M config/joint_mlp_tgtcls_111_cswdtx_bigger_dropout.py | 2 --
M config/joint_mlp_tgtcls_111_cswdtx_noise_dout.py | 1 -
M config/joint_mlp_tgtcls_1_cswdtx.py | 1 -
M config/joint_mlp_tgtcls_1_cswdtx_bigger.py | 1 -
M config/memory_network_1.py | 1 -
M config/memory_network_2.py | 1 -
M config/memory_network_3.py | 1 -
M config/memory_network_adeb.py | 1 -
M config/memory_network_bidir.py | 1 -
M config/rnn_1.py | 1 -
M config/rnn_lag_tgtcls_1.py | 1 -
M config/rnn_tgtcls_1.py | 1 -
M config/time_mlp_1.py | 1 -
M config/time_mlp_2_cswdtx.py | 1 -
M config/time_mlp_tgtcls_2_cswdtx.py | 1 -
M data/__init__.py | 48 ++++++++++++++++++++++++++++++++++++------------
M data/transformers.py | 17 +++++++++++------
M model/bidirectional.py | 15 ++++++++-------
M model/memory_network.py | 11 ++++++-----
M model/mlp.py | 15 ++++++++-------
M model/rnn.py | 16 ++++++++--------
M train.py | 6 +++---
38 files changed, 80 insertions(+), 83 deletions(-)

diff --git a/config/bidirectional_1.py b/config/bidirectional_1.py @@ -27,5 +27,4 @@ mlp_biases_init = Constant(0.01) batch_size = 20 batch_sort_size = 20 -valid_set = 'cuts/large_valid' max_splits = 100 diff --git a/config/bidirectional_tgtcls_1.py b/config/bidirectional_tgtcls_1.py @@ -32,5 +32,4 @@ mlp_biases_init = Constant(0.01) batch_size = 20 batch_sort_size = 20 -valid_set = 'cuts/large_valid' max_splits = 100 diff --git a/config/dest_mlp_2_cs.py b/config/dest_mlp_2_cs.py @@ -23,5 +23,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_2_cswdt.py b/config/dest_mlp_2_cswdt.py @@ -27,5 +27,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_2_noembed.py b/config/dest_mlp_2_noembed.py @@ -20,5 +20,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_emb_only.py b/config/dest_mlp_emb_only.py @@ -27,5 +27,4 @@ learning_rate = 0.001 momentum = 0.9 batch_size = 100 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_0_cs.py b/config/dest_mlp_tgtcls_0_cs.py @@ -28,5 +28,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_1_cs.py b/config/dest_mlp_tgtcls_1_cs.py @@ -28,5 +28,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_1_cswdt.py b/config/dest_mlp_tgtcls_1_cswdt.py @@ -32,5 +32,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_1_cswdtx.py b/config/dest_mlp_tgtcls_1_cswdtx.py @@ -35,6 +35,3 @@ batch_size = 100 use_cuts_for_training = True max_splits = 1 - -valid_set = 'cuts/test_times_0' - 
diff --git a/config/dest_mlp_tgtcls_1_cswdtx_alexandre.py b/config/dest_mlp_tgtcls_1_cswdtx_alexandre.py @@ -34,5 +34,4 @@ step_rule = Momentum(learning_rate=0.01, momentum=0.9) batch_size = 200 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_1_cswdtx_batchshuffle.py b/config/dest_mlp_tgtcls_1_cswdtx_batchshuffle.py @@ -36,5 +36,4 @@ batch_size = 200 shuffle_batch_size = 5000 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_1_cswdtx_small.py b/config/dest_mlp_tgtcls_1_cswdtx_small.py @@ -34,5 +34,4 @@ step_rule = Momentum(learning_rate=0.01, momentum=0.9) batch_size = 200 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/dest_mlp_tgtcls_2_cswdtx_small.py b/config/dest_mlp_tgtcls_2_cswdtx_small.py @@ -34,5 +34,4 @@ step_rule = Momentum(learning_rate=0.01, momentum=0.9) batch_size = 200 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/joint_mlp_tgtcls_111_cswdtx.py b/config/joint_mlp_tgtcls_111_cswdtx.py @@ -50,5 +50,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 200 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/joint_mlp_tgtcls_111_cswdtx_bigger.py b/config/joint_mlp_tgtcls_111_cswdtx_bigger.py @@ -49,6 +49,4 @@ mlp_biases_init = Constant(0.01) # use adadelta, so no learning_rate or momentum batch_size = 200 -valid_set = 'cuts/test_times_0' - max_splits = 100 diff --git a/config/joint_mlp_tgtcls_111_cswdtx_bigger_dropout.py b/config/joint_mlp_tgtcls_111_cswdtx_bigger_dropout.py @@ -55,6 +55,4 @@ dropout_inputs = VariableFilter(bricks=[Rectifier], name='output') # use adadelta, so no learning_rate or momentum batch_size = 200 -valid_set = 'cuts/test_times_0' - max_splits = 100 diff --git a/config/joint_mlp_tgtcls_111_cswdtx_noise_dout.py b/config/joint_mlp_tgtcls_111_cswdtx_noise_dout.py @@ -57,5 +57,4 @@ dropout_inputs = VariableFilter(bricks=[Rectifier], name='output') noise = 0.01 noise_inputs = 
VariableFilter(roles=[roles.PARAMETER]) -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/joint_mlp_tgtcls_1_cswdtx.py b/config/joint_mlp_tgtcls_1_cswdtx.py @@ -50,5 +50,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 200 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/joint_mlp_tgtcls_1_cswdtx_bigger.py b/config/joint_mlp_tgtcls_1_cswdtx_bigger.py @@ -50,5 +50,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 200 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/memory_network_1.py b/config/memory_network_1.py @@ -37,7 +37,6 @@ embed_weights_init = IsotropicGaussian(0.001) batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 1 num_cuts = 1000 diff --git a/config/memory_network_2.py b/config/memory_network_2.py @@ -48,7 +48,6 @@ noise_inputs = VariableFilter(roles=[roles.PARAMETER]) batch_size = 512 -valid_set = 'cuts/test_times_0' max_splits = 1 num_cuts = 1000 diff --git a/config/memory_network_3.py b/config/memory_network_3.py @@ -48,7 +48,6 @@ noise_inputs = VariableFilter(roles=[roles.PARAMETER]) batch_size = 512 -valid_set = 'cuts/test_times_0' max_splits = 1 num_cuts = 1000 diff --git a/config/memory_network_adeb.py b/config/memory_network_adeb.py @@ -38,7 +38,6 @@ embed_weights_init = IsotropicGaussian(0.001) step_rule = Momentum(learning_rate=0.001, momentum=0.9) batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 1 num_cuts = 1000 diff --git a/config/memory_network_bidir.py b/config/memory_network_bidir.py @@ -47,7 +47,6 @@ normalize_representation = True batch_size = 32 batch_sort_size = 20 -valid_set = 'cuts/test_times_0' max_splits = 100 num_cuts = 1000 diff --git a/config/rnn_1.py b/config/rnn_1.py @@ -30,4 +30,3 @@ biases_init = Constant(0.001) batch_size = 10 batch_sort_size = 10 -valid_set = 'cuts/test_times_0' diff --git a/config/rnn_lag_tgtcls_1.py b/config/rnn_lag_tgtcls_1.py @@ -46,4 +46,3 @@ noise_inputs = VariableFilter(roles=[roles.PARAMETER]) 
batch_size = 10 batch_sort_size = 10 -valid_set = 'cuts/test_times_0' diff --git a/config/rnn_tgtcls_1.py b/config/rnn_tgtcls_1.py @@ -34,4 +34,3 @@ biases_init = Constant(0.001) batch_size = 10 batch_sort_size = 10 -valid_set = 'cuts/test_times_0' diff --git a/config/time_mlp_1.py b/config/time_mlp_1.py @@ -23,5 +23,4 @@ learning_rate = 0.00001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/time_mlp_2_cswdtx.py b/config/time_mlp_2_cswdtx.py @@ -30,5 +30,4 @@ learning_rate = 0.00001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/config/time_mlp_tgtcls_2_cswdtx.py b/config/time_mlp_tgtcls_2_cswdtx.py @@ -33,5 +33,4 @@ learning_rate = 0.0001 momentum = 0.99 batch_size = 32 -valid_set = 'cuts/test_times_0' max_splits = 100 diff --git a/data/__init__.py b/data/__init__.py @@ -1,4 +1,5 @@ import os +import sys import h5py import numpy @@ -8,23 +9,46 @@ path = os.environ.get('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle') Polyline = h5py.special_dtype(vlen=numpy.float32) -# `wc -l test.csv` - 1 # Minus 1 to ignore the header -test_size = 320 - -# `wc -l train.csv` - 1 -train_size = 1710670 - # `wc -l metaData_taxistandsID_name_GPSlocation.csv` stands_size = 64 # include 0 ("no origin_stands") # `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1 taxi_id_size = 448 -# `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2 -origin_call_size = 57125 # include 0 ("no origin_call") - -# As printed by csv_to_hdf5.py -origin_call_train_size = 57106 - train_gps_mean = numpy.array([41.1573, -8.61612], dtype=numpy.float32) train_gps_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=numpy.float32)) + +tvt = '--tvt' in sys.argv + +if tvt: + test_size = 19770 + valid_size = 19427 + train_size = 1671473 + + origin_call_size = 57106 + origin_call_train_size = 57106 + + valid_set = 'valid' + valid_ds = 'tvt.hdf5' + traintest_ds = 'tvt.hdf5' + +else: + # `wc -l 
test.csv` - 1 # Minus 1 to ignore the header + test_size = 320 + + # `wc -l train.csv` - 1 + train_size = 1710670 + + # `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2 + origin_call_size = 57125 # include 0 ("no origin_call") + + # As printed by csv_to_hdf5.py + origin_call_train_size = 57106 + + if '--largevalid' in sys.argv: + valid_set = 'cuts/large_valid' + else: + valid_set = 'cuts/test_times_0' + + valid_ds = 'valid.hdf5' + traintest_ds = 'data.hdf5' diff --git a/data/transformers.py b/data/transformers.py @@ -70,7 +70,9 @@ class TaxiGenerateSplits(Transformer): def __init__(self, data_stream, max_splits=-1): super(TaxiGenerateSplits, self).__init__(data_stream) - self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude', 'travel_time') + self.sources = data_stream.sources + if not data.tvt: + self.sources += ('destination_latitude', 'destination_longitude', 'travel_time') self.max_splits = max_splits self.data = None self.splits = [] @@ -100,12 +102,15 @@ class TaxiGenerateSplits(Transformer): r[self.id_latitude] = numpy.array(r[self.id_latitude][:n], dtype=theano.config.floatX) r[self.id_longitude] = numpy.array(r[self.id_longitude][:n], dtype=theano.config.floatX) - dlat = numpy.float32(self.data[self.id_latitude][-1]) - dlon = numpy.float32(self.data[self.id_longitude][-1]) - ttime = numpy.int32(15 * (len(self.data[self.id_longitude]) - 1)) - - return tuple(r + [dlat, dlon, ttime]) + r = tuple(r) + if data.tvt: + return r + else: + dlat = numpy.float32(self.data[self.id_latitude][-1]) + dlon = numpy.float32(self.data[self.id_longitude][-1]) + ttime = numpy.int32(15 * (len(self.data[self.id_longitude]) - 1)) + return r + (dlat, dlon, ttime) class _taxi_add_first_last_len_helper(object): def __init__(self, k, id_latitude, id_longitude): diff --git a/model/bidirectional.py b/model/bidirectional.py @@ -98,17 +98,18 @@ class Stream(object): self.config = config def train(self, req_vars): - valid = 
TaxiDataset(self.config.valid_set, 'valid.hdf5', sources=('trip_id',)) - valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] - - stream = TaxiDataset('train') + stream = TaxiDataset('train', data.traintest_ds) if hasattr(self.config, 'use_cuts_for_training') and self.config.use_cuts_for_training: stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme()) else: stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples)) - stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids) + if not data.tvt: + valid = TaxiDataset(data.valid_set, data.valid_ds, sources=('trip_id',)) + valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] + stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids) + stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits) if hasattr(self.config, 'shuffle_batch_size'): @@ -128,7 +129,7 @@ class Stream(object): return stream def valid(self, req_vars): - stream = TaxiStream(self.config.valid_set, 'valid.hdf5') + stream = TaxiStream(data.valid_set, data.valid_ds) stream = transformers.taxi_add_datetime(stream) stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask'))) @@ -139,7 +140,7 @@ class Stream(object): return stream def test(self, req_vars): - stream = TaxiStream('test') + stream = TaxiStream('test', data.traintest_ds) stream = transformers.taxi_add_datetime(stream) stream = transformers.taxi_remove_test_only_clients(stream) diff --git a/model/memory_network.py b/model/memory_network.py @@ -53,20 +53,20 @@ class StreamBase(object): @property def valid_dataset(self): - return TaxiDataset(self.config.valid_set, 'valid.hdf5') + return TaxiDataset(data.valid_set, data.valid_ds) @property def valid_trips_ids(self): - valid = TaxiDataset(self.config.valid_set, 'valid.hdf5', sources=('trip_id',)) + valid = TaxiDataset(data.valid_set, data.valid_ds, sources=('trip_id',)) return valid.get_data(None, slice(0, 
valid.num_examples))[0] @property def train_dataset(self): - return TaxiDataset('train') + return TaxiDataset('train', data.traintest_ds) @property def test_dataset(self): - return TaxiDataset('test') + return TaxiDataset('test', data.traintest_ds) class StreamSimple(StreamBase): @@ -96,7 +96,8 @@ class StreamSimple(StreamBase): prefix_stream = DataStream(self.train_dataset, iteration_scheme=ShuffledExampleScheme(self.train_dataset.num_examples)) - prefix_stream = transformers.TaxiExcludeTrips(prefix_stream, self.valid_trips_ids) + if not data.tvt: + prefix_stream = transformers.TaxiExcludeTrips(prefix_stream, self.valid_trips_ids) prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream) prefix_stream = transformers.TaxiGenerateSplits(prefix_stream, max_splits=self.config.max_splits) diff --git a/model/mlp.py b/model/mlp.py @@ -63,17 +63,18 @@ class Stream(object): self.config = config def train(self, req_vars): - valid = TaxiDataset(self.config.valid_set, 'valid.hdf5', sources=('trip_id',)) - valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] - - stream = TaxiDataset('train') + stream = TaxiDataset('train', data.traintest_ds) if hasattr(self.config, 'use_cuts_for_training') and self.config.use_cuts_for_training: stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme()) else: stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples)) - stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids) + if not data.tvt: + valid = TaxiDataset(data.valid_set, data.valid_ds, sources=('trip_id',)) + valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] + stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids) + stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits) if hasattr(self.config, 'shuffle_batch_size'): @@ -92,7 +93,7 @@ class Stream(object): return stream def valid(self, req_vars): - stream = TaxiStream(self.config.valid_set, 'valid.hdf5') + 
stream = TaxiStream(data.valid_set, data.valid_ds) stream = transformers.taxi_add_datetime(stream) stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts) @@ -100,7 +101,7 @@ class Stream(object): return Batch(stream, iteration_scheme=ConstantScheme(1000)) def test(self, req_vars): - stream = TaxiStream('test') + stream = TaxiStream('test', data.traintest_ds) stream = transformers.taxi_add_datetime(stream) stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts) diff --git a/model/rnn.py b/model/rnn.py @@ -146,15 +146,15 @@ class Stream(object): self.config = config def train(self, req_vars): - valid = TaxiDataset(self.config.valid_set, 'valid.hdf5', sources=('trip_id',)) - valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] - - stream = TaxiDataset('train') + stream = TaxiDataset('train', data.traintest_ds) stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples)) - stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids) + if not data.tvt: + valid = TaxiDataset(data.valid_set, data.valid_ds, sources=('trip_id',)) + valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] + stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids) + stream = transformers.TaxiExcludeEmptyTrips(stream) stream = transformers.taxi_add_datetime(stream) - stream = transformers.add_destination(stream) stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask'))) stream = transformers.balanced_batch(stream, key='latitude', batch_size=self.config.batch_size, batch_sort_size=self.config.batch_sort_size) @@ -163,7 +163,7 @@ class Stream(object): return stream def valid(self, req_vars): - stream = TaxiStream(self.config.valid_set, 'valid.hdf5') + stream = TaxiStream(data.valid_set, data.valid_ds) stream = transformers.taxi_add_datetime(stream) stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask'))) 
@@ -173,7 +173,7 @@ class Stream(object): return stream def test(self, req_vars): - stream = TaxiStream('test') + stream = TaxiStream('test', data.traintest_ds) stream = transformers.taxi_add_datetime(stream) stream = transformers.taxi_remove_test_only_clients(stream) stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask'))) diff --git a/train.py b/train.py @@ -37,10 +37,10 @@ from ext_test import RunOnTest logger = logging.getLogger(__name__) if __name__ == "__main__": - if len(sys.argv) != 2: - print >> sys.stderr, 'Usage: %s config' % sys.argv[0] + if len(sys.argv) < 2 or len(sys.argv) > 3: + print >> sys.stderr, 'Usage: %s [--tvt | --largevalid] config' % sys.argv[0] sys.exit(1) - model_name = sys.argv[1] + model_name = sys.argv[-1] config = importlib.import_module('.%s' % model_name, 'config') logger.info('# Configuration: %s' % config.__name__)