taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 58dcf7b17e9db6af53808994a7d39a759fcc5028
parent 3a694dde577103f269ff888c19c820712fbab96a
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Mon, 20 Jul 2015 17:40:28 -0400

Merge branch 'master' of github.com:adbrebs/taxi

Diffstat:
M README.md             |  2 +-
M data_analysis/maps.py | 27 ++++++++++++++-------------
M prepare.sh            |  2 +-
3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
@@ -51,6 +51,6 @@ Note that some script expect the repository to be in your PYTHONPATH (go to the
 6. Create a folder `model_data` and a folder `output` (next to the training script), which will receive respectively a regular save of the model parameters and many submission files generated from the model at a regular interval.
 7. Run `./train.py dest_mlp_tgtcls_1_cswdtx_alexandre` to train the model. Output solutions are generated in `output/` every 1000 iterations. Interrupt the model with three consecutive Ctrl+C at any times. The training script is set to stop training after 10 000 000 iterations, but a result file produced after less than 2 000 000 iterations is already the winning solution. We trained our model on a GeForce GTX 680 card and it took about an afternoon to generate the winning solution. When running the training script, set the following Theano flags environment variable to exploit GPU parallelism:
-`THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN`
+`THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run`
 
 *More information in this pdf: https://github.com/adbrebs/taxi/blob/master/doc/short_report.pdf*
 
diff --git a/data_analysis/maps.py b/data_analysis/maps.py
@@ -1,18 +1,17 @@
 import cPickle
-import scipy
 import numpy as np
 import matplotlib.pyplot as plt
 
 import data
+from data.hdf5 import taxi_it
 
 
 def compute_number_coordinates():
-    train_it = data.train_it()
 
     # Count the number of coordinates
     n_coordinates = 0
-    for ride in train_it:
-        n_coordinates += len(ride[-1])
+    for ride in taxi_it('train'):
+        n_coordinates += len(ride['latitude'])
     print n_coordinates
 
     return n_coordinates
@@ -25,15 +24,16 @@ def extract_coordinates(n_coordinates=None):
         n_coordinates = compute_number_coordinates()
 
     coordinates = np.zeros((n_coordinates, 2), dtype="float32")
-    train_it = data.train_it()
 
     c = 0
-    for ride in train_it:
-        for point in ride[-1]:
+    for ride in taxi_it('train'):
+        for point in zip(ride['latitude'], ride['longitude']):
             coordinates[c] = point
             c += 1
 
-    cPickle.dump(coordinates, open(data.DATA_PATH + "/coordinates_array.pkl", "wb"))
+    print c
+
+    cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb"))
 
 
 def draw_map(coordinates, xrg, yrg):
@@ -43,13 +43,14 @@ def draw_map(coordinates, xrg, yrg):
     hist, xx, yy = np.histogram2d(coordinates[:, 0], coordinates[:, 1], bins=2000, range=[xrg, yrg])
     plt.imshow(np.log(hist))
-    plt.savefig(data.DATA_PATH + "/analysis/xyhmap2.png")
+    plt.gca().invert_yaxis()
+    plt.savefig(data.path + "/analysis/xyhmap2.png")
 
 
 if __name__ == "__main__":
-    # extract_coordinates(n_coordinates=83360928)
+    extract_coordinates(n_coordinates=83409386)
 
-    coordinates = cPickle.load(open(data.DATA_PATH + "/coordinates_array.pkl", "rb"))
-    xrg = [-8.75, -8.55]
-    yrg = [41.05, 41.25]
+    coordinates = cPickle.load(open(data.path + "/coordinates_array.pkl", "rb"))
+    xrg = [41.05, 41.25]
+    yrg = [-8.75, -8.55]
     draw_map(coordinates, xrg, yrg)
 
diff --git a/prepare.sh b/prepare.sh
@@ -121,4 +121,4 @@ echo -n "${YELLOW}mkdir output... $RESET"; mkdir output; echo "${GREEN}ok"
 
 echo -e "\n$GREEN${BOLD}The data was successfully prepared"
 echo "${YELLOW}To train the winning model on gpu, you can now run the following command:"
-echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre"
+echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre"