taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 8442ba13a442d509a407b5913b95d9f1b00989d2
parent c5187418bc93c34e3fdce4fdc1a3b5316812b69a
Author: Alex Auvolat <alex@adnab.me>
Date:   Fri, 10 Jul 2015 19:21:04 -0400

Merge branch 'master' of github.com:adbrebs/taxi

Diffstat:
A README.md | 3 +++
M config/dest_mlp_tgtcls_1_cswdtx_batchshuffle.py | 4 ++--
M data_analysis/cluster_arrival.py | 23 +++++++++++++++++------
M model/mlp.py | 18 +++++++++++-------
4 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
@@ -0,0 +1,3 @@
+Winning entry to the Kaggle ECML/PKDD destination competition.
+
+https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i
diff --git a/config/dest_mlp_tgtcls_1_cswdtx_batchshuffle.py b/config/dest_mlp_tgtcls_1_cswdtx_batchshuffle.py
@@ -23,14 +23,14 @@ dim_embeddings = [
 ]
 dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [1000]
+dim_hidden = [500]
 dim_output = tgtcls.shape[0]
 embed_weights_init = IsotropicGaussian(0.01)
 mlp_weights_init = IsotropicGaussian(0.1)
 mlp_biases_init = Constant(0.01)
-step_rule = Momentum(learning_rate=0.01, momentum=0.9)
+step_rule = Momentum(learning_rate=0.001, momentum=0.99)
 batch_size = 200
diff --git a/data_analysis/cluster_arrival.py b/data_analysis/cluster_arrival.py
@@ -1,20 +1,31 @@
-import matplotlib.pyplot as plt
+#!/usr/bin/env python
 import numpy
 import cPickle
 import scipy.misc
+import os
 from sklearn.cluster import MeanShift, estimate_bandwidth
 from sklearn.datasets.samples_generator import make_blobs
 from itertools import cycle
-print "Reading arrival point list"
-with open("arrivals.pkl") as f:
-    pts = cPickle.load(f)
+import data
+from data.hdf5 import taxi_it
+from data.transformers import add_destination
+
+print "Generating arrival point list"
+dests = []
+for v in taxi_it("train"):
+    if len(v['latitude']) == 0: continue
+    dests.append([v['latitude'][-1], v['longitude'][-1]])
+pts = numpy.array(dests)
+
+with open(os.path.join(data.path, "arrivals.pkl"), "w") as f:
+    cPickle.dump(pts, f, protocol=cPickle.HIGHEST_PROTOCOL)
 print "Doing clustering"
 bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000)
 print bw
-bw = 0.001
+bw = 0.001 # (
 ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
 ms.fit(pts)
@@ -22,6 +33,6 @@ cluster_centers = ms.cluster_centers_
 print "Clusters shape: ", cluster_centers.shape
-with open("arrival-cluters.pkl", "w") as f:
+with open(os.path.join(data.path, "arrival-clusters.pkl"), "w") as f:
     cPickle.dump(cluster_centers, f, protocol=cPickle.HIGHEST_PROTOCOL)
diff --git a/model/mlp.py b/model/mlp.py
@@ -52,6 +52,12 @@ class FFMLP(Initializable):
     def predict_inputs(self):
         return self.inputs
+class UniformGenerator(object):
+    def __init__(self):
+        self.rng = numpy.random.RandomState(123)
+    def __call__(self, *args):
+        return float(self.rng.uniform())
+
 class Stream(object):
     def __init__(self, config):
         self.config = config
@@ -69,17 +75,15 @@ class Stream(object):
         stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
         stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits)
-        stream = transformers.add_destination(stream)
-
-        stream = transformers.taxi_add_datetime(stream)
-        stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
-        stream = transformers.Select(stream, tuple(req_vars))
         if hasattr(self.config, 'shuffle_batch_size'):
             stream = transformers.Batch(stream, iteration_scheme=ConstantScheme(self.config.shuffle_batch_size))
-            rng = numpy.random.RandomState(123)
-            stream = Mapping(stream, SortMapping(lambda x: float(rng.uniform())))
+            stream = Mapping(stream, SortMapping(key=UniformGenerator()))
             stream = Unpack(stream)
+
+        stream = transformers.taxi_add_datetime(stream)
+        stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
+        stream = transformers.Select(stream, tuple(req_vars))
         stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))