taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 80d3ea67a845484d119cb88f0a0412f981ab344c
parent f9a31bd246e3c4736d3f532b566b7437eba6b4de
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Mon,  4 May 2015 16:43:48 -0400

New data analysis tool: clustering of arrival points.

Diffstat:
Mconfig/model_0.py | 4++--
Adata_analysis/cluster_arrival.py | 27+++++++++++++++++++++++++++
Mdata_analysis/destmaps.py | 20++++++++++----------
Mmodel.py | 2+-
4 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/config/model_0.py b/config/model_0.py @@ -12,6 +12,6 @@ dim_input = n_begin_end_pts * 2 * 2 + dim_embed + dim_embed dim_hidden = [200, 100] dim_output = 2 -learning_rate = 0.002 -momentum = 0.9 +learning_rate = 0.0001 +momentum = 0.99 batch_size = 32 diff --git a/data_analysis/cluster_arrival.py b/data_analysis/cluster_arrival.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import numpy +import cPickle +import scipy.misc + +from sklearn.cluster import MeanShift, estimate_bandwidth +from sklearn.datasets.samples_generator import make_blobs +from itertools import cycle + +print "Reading arrival point list" +with open("arrivals.pkl") as f: + pts = cPickle.load(f) + +print "Doing clustering" +bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000) +print bw +bw = 0.001 + +ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5) +ms.fit(pts) +cluster_centers = ms.cluster_centers_ + +print "Clusters shape: ", cluster_centers.shape + +with open("arrival-cluters.pkl", "w") as f: + cPickle.dump(cluster_centers, f, protocol=cPickle.HIGHEST_PROTOCOL) + diff --git a/data_analysis/destmaps.py b/data_analysis/destmaps.py @@ -4,7 +4,7 @@ import cPickle import scipy.misc print "Loading data..." 
-with open("train_normal.pkl") as f: normal = cPickle.load(f) +with open("train.pkl") as f: normal = cPickle.load(f) print "Extracting x and y" # xes = [c[0] for l in normal for c in l[-1]] @@ -12,21 +12,21 @@ print "Extracting x and y" xes = [l[-1][-1][0] for l in normal if len(l[-1]) > 0] yes = [l[-1][-1][1] for l in normal if len(l[-1]) > 0] -xrg = [-8.75, -8.55] -yrg = [41.05, 41.25] +xrg = [-8.80, -8.50] +yrg = [41.00, 41.30] -print "Doing 1d x histogram" -plt.clf(); plt.hist(xes, bins=1000, range=xrg); plt.savefig("xhist_dest.pdf") -print "Doing 1d y histogram" -plt.clf(); plt.hist(yes, bins=1000, range=yrg); plt.savefig("yhist_dest.pdf") +#print "Doing 1d x histogram" +#plt.clf(); plt.hist(xes, bins=2000, range=xrg); plt.savefig("xhist_dest.pdf") +#print "Doing 1d y histogram" +#plt.clf(); plt.hist(yes, bins=2000, range=yrg); plt.savefig("yhist_dest.pdf") print "Doing 2d histogram" -hist, xx, yy = numpy.histogram2d(xes, yes, bins=2000, range=[xrg, yrg]) +hist, xx, yy = numpy.histogram2d(xes, yes, bins=4000, range=[xrg, yrg]) # import ipdb; ipdb.set_trace() print "Imshow" -plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest.png", dpi=600) +plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest_x.png", dpi=600) print "Imsave" -scipy.misc.imsave("xymap_dest_2.png", numpy.log(hist + 1)) +scipy.misc.imsave("xymap_dest_2_x.png", numpy.log(hist + 1)) diff --git a/model.py b/model.py @@ -53,7 +53,7 @@ def setup_train_stream(): load_in_memory=True) train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid)) train = transformers.filter_out_trips(data.valid_trips, train) - train = transformers.TaxiGenerateSplits(train) + train = transformers.TaxiGenerateSplits(train, max_splits=100) train = transformers.add_first_k(config.n_begin_end_pts, train) train = transformers.add_last_k(config.n_begin_end_pts, train) train = transformers.Select(train, ('origin_stand', 'origin_call', 'first_k_latitude',