taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

pvalue.py (1309B)


      1 #!/usr/bin/env python
      2 
      3 import os
      4 import sys
      5 
      6 import math
      7 import numpy
      8 
      9 import data
     10 
     11 # Haversine distance calculation
     12 # --------- -------- -----------
     13 
     14 rearth = 6371.
     15 deg2rad = 3.141592653589793 / 180.
     16 
     17 def hdist(a, b):
     18     lat1 = a[:, 0] * deg2rad
     19     lon1 = a[:, 1] * deg2rad
     20     lat2 = b[:, 0] * deg2rad
     21     lon2 = b[:, 1] * deg2rad
     22 
     23     dlat = abs(lat1-lat2)
     24     dlon = abs(lon1-lon2)
     25 
     26     al = numpy.sin(dlat/2)**2  + numpy.cos(lat1) * numpy.cos(lat2) * (numpy.sin(dlon/2)**2)
     27     d = numpy.arctan2(numpy.sqrt(al), numpy.sqrt(1.-al))
     28 
     29     hd = 2. * rearth * d
     30 
     31     return hd
     32 
     33 
     34 # Read the inputs
     35 # ---- --- ------
     36 
     37 def readcsv(f):
     38     return numpy.genfromtxt(f, delimiter=',', skip_header=1)[:, 1:3]
     39 
     40 answer = readcsv(os.path.join(data.path, 'test_answer.csv'))
     41 
     42 tables = [readcsv(f) for f in sys.argv if '.csv' in f]
     43 etables = [hdist(t, answer) for t in tables]
     44 
     45 # Calculate p-values
     46 # --------- --------
     47 
     48 pvalue = numpy.zeros((len(tables), len(tables)))
     49 
     50 for i, a in enumerate(etables):
     51     for j, b in enumerate(etables):
     52         if i == j:
     53             continue
     54         d = b - a
     55         var = (numpy.mean((a - numpy.mean(a))**2)
     56                 + numpy.mean((b - numpy.mean(b))**2)) / 2.
     57         pv = 1 - .5 * (1 + math.erf(numpy.mean(d) / numpy.sqrt(2 * var)))
     58         pvalue[i, j] = pv
     59 
     60 print pvalue