taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit defab74395f2ddb2641bba6ab8d18bdedde7a334
parent ca40e5c81d385e1422cebe40e009d7e93b95bfbb
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Wed, 29 Jul 2015 12:06:00 -0400

p-value caluculation script

Diffstat:
Adata/make_reference_output.py | 28++++++++++++++++++++++++++++
Mdata/transformers.py | 2+-
Apvalue.py | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/data/make_reference_output.py b/data/make_reference_output.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +import csv +import os + +from fuel.iterator import DataIterator +from fuel.schemes import SequentialExampleScheme +from fuel.streams import DataStream + +from data.hdf5 import TaxiDataset +import data + +dest_outfile = open(os.path.join(data.path, 'test_answer.csv'), 'w') +dest_outcsv = csv.writer(dest_outfile) +dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) + +dataset = TaxiDataset('test', 'tvt.hdf5', + sources=('trip_id', 'longitude', 'latitude', + 'destination_longitude', 'destination_latitude')) +it = DataIterator(DataStream(dataset), iter(xrange(dataset.num_examples)), as_dict=True) + +for v in it: + # print v + dest_outcsv.writerow([v['trip_id'], v['destination_latitude'], + v['destination_longitude']]) + +dest_outfile.close() + diff --git a/data/transformers.py b/data/transformers.py @@ -187,7 +187,7 @@ class _window_helper(object): if x.shape[0] < self.window_len: x = numpy.concatenate( - [x, numpy.full((self.window_len - x.shape[0],), x[-1])]) + [numpy.full((self.window_len - x.shape[0],), x[0]), x]) y = [x[i: i+x.shape[0]-self.window_len+1][:, None] for i in range(self.window_len)] diff --git a/pvalue.py b/pvalue.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import os +import sys + +import math +import numpy + +import data + +# Haversine distance calculation +# --------- -------- ----------- + +rearth = 6371. +deg2rad = 3.141592653589793 / 180. + +def hdist(a, b): + lat1 = a[:, 0] * deg2rad + lon1 = a[:, 1] * deg2rad + lat2 = b[:, 0] * deg2rad + lon2 = b[:, 1] * deg2rad + + dlat = abs(lat1-lat2) + dlon = abs(lon1-lon2) + + al = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * (numpy.sin(dlon/2)**2) + d = numpy.arctan2(numpy.sqrt(al), numpy.sqrt(1.-al)) + + hd = 2. * rearth * d + + return hd + + +# Read the inputs +# ---- --- ------ + +def readcsv(f): + return numpy.genfromtxt(f, delimiter=',', skip_header=1)[:, 1:3] + +answer = readcsv(os.path.join(data.path, 'test_answer.csv')) + +tables = [readcsv(f) for f in sys.argv if '.csv' in f] +etables = [hdist(t, answer) for t in tables] + +# Calculate p-values +# --------- -------- + +pvalue = numpy.zeros((len(tables), len(tables))) + +for i, a in enumerate(etables): + for j, b in enumerate(etables): + if i == j: + continue + d = b - a + var = (numpy.mean((a - numpy.mean(a))**2) + + numpy.mean((b - numpy.mean(b))**2)) / 2. + pv = 1 - .5 * (1 + math.erf(numpy.mean(d) / numpy.sqrt(2 * var))) + pvalue[i, j] = pv + +print pvalue