taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

prepare.sh (4214B)


      1 #!/usr/bin/env bash
      2 
      3 RESET=`tput sgr0`
      4 BOLD="`tput bold`"
      5 RED="$RESET`tput setaf 1`$BOLD"
      6 GREEN="$RESET`tput setaf 2`"
      7 YELLOW="$RESET`tput setaf 3`"
      8 BLUE="$RESET`tput setaf 4`$BOLD"
      9 
     10 export PYTHONPATH="$PWD:$PYTHONPATH"
     11 
     12 echo "${YELLOW}This script will prepare the data."
     13 echo "${YELLOW}You should run it from inside the repository."
     14 echo "${YELLOW}You should set the TAXI_PATH variable to where the data downloaded from kaggle is."
     15 echo "${YELLOW}Three data files are needed: ${BOLD}train.csv.zip${YELLOW}, ${BOLD}test.csv.zip${YELLOW} and ${BOLD}metaData_taxistandsID_name_GPSlocation.csv.zip${YELLOW}. They can be found at the following url: ${BOLD}https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/data"
     16 if [ ! -e train.py ]; then
     17     echo "${RED}train.py not found, you are not inside the taxi repository."
     18     exit 1
     19 fi
     20 
     21 
     22 echo -e "\n$BLUE# Checking dependencies"
     23 
     24 python_import(){
     25     echo -n "${YELLOW}$1... $RESET"
     26     if ! python2 -c "import $1; print '${GREEN}version', $1.__version__, '${YELLOW}(we used version $2)'"; then
     27         echo "${RED}failed, $1 is not installed"
     28         exit 1
     29     fi
     30 }
     31 
     32 python_import h5py 2.5.0
     33 python_import theano 0.7.0.dev
     34 python_import fuel 0.0.1-ed725a7ff9f3d080ef882d4ae7e4373c4984f35a
     35 python_import blocks 0.0.1-1e0aca9171611be4df404129d91a991354e67730
     36 python_import sklearn 0.16.1
     37 
     38 
     39 echo -e "\n$BLUE# Checking data"
     40 
     41 echo "${YELLOW}TAXI_PATH is set to $TAXI_PATH"
     42 
     43 md5_check(){
     44     echo -n "${YELLOW}md5sum $1... $RESET"
     45     if [ ! -e "$TAXI_PATH/$1" ]; then
     46         echo "${RED}file not found, are you sure you set the TAXI_PATH variable correctly?"
     47         exit 1
     48     fi
     49 	if command -v md5 >/dev/null 2>&1; then
     50 		md5=`md5 "$TAXI_PATH/$1" | sed -e 's/^.* //'`
     51 	elif command -v md5sum >/dev/null 2>&1; then
     52 		md5=`md5sum "$TAXI_PATH/$1" | sed -e 's/ .*//'`
     53 	else
     54         echo "${RED} no md5 utility"
     55 		return
     56 	fi
     57     if [ $md5 = $2 ]; then
     58         echo "$GREEN$md5 ok"
     59     else
     60         echo "$RED$md5 failed"
     61         exit 1
     62     fi
     63 }
     64 
     65 md5_check train.csv.zip 87a1b75adfde321dc163160b495964e8
     66 md5_check test.csv.zip 47133bf7349cb80cc668fa56af8ce743
     67 md5_check metaData_taxistandsID_name_GPSlocation.csv.zip fecec7286191af868ce8fb208f5c7643
     68 
     69 
     70 echo -e "\n$BLUE# Extracting data"
     71 
     72 zipextract(){
     73 	echo -n "${YELLOW}unziping $1... $RESET"
     74 	unzip -o "$TAXI_PATH/$1" -d "$TAXI_PATH"
     75 	echo "${GREEN}ok"
     76 }
     77 
     78 zipextract train.csv.zip
     79 md5_check train.csv 68cc499ac4937a3079ebf69e69e73971
     80 
     81 zipextract test.csv.zip
     82 md5_check test.csv f2ceffde9d98e3c49046c7d998308e71
     83 
     84 zipextract metaData_taxistandsID_name_GPSlocation.csv.zip
     85 
     86 echo -n "${YELLOW}patching error in metadata csv... $RESET"
     87 cat "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv" | sed -e 's/41,Nevogilde,41.163066654-8.67598304213/41,Nevogilde,41.163066654,-8.67598304213/' > "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv.tmp"
     88 mv "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv.tmp" "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv"
     89 echo "${GREEN}ok"
     90 
     91 md5_check metaData_taxistandsID_name_GPSlocation.csv 724805b0b1385eb3efc02e8bdfe9c1df
     92 
     93 
     94 echo -e "\n$BLUE# Conversion of training set to HDF5"
     95 echo "${YELLOW}This might take some time$RESET"
     96 python2 data/csv_to_hdf5.py "$TAXI_PATH" "$TAXI_PATH/data.hdf5"
     97 
     98 
     99 echo -e "\n$BLUE# Generation of validation set"
    100 echo "${YELLOW}This might take some time$RESET"
    101 
    102 echo -n "${YELLOW}initialization... $RESET"
    103 python2 data/init_valid.py
    104 echo "${GREEN}ok"
    105 
    106 echo -n "${YELLOW}cutting... $RESET"
    107 python2 data/make_valid_cut.py test_times_0
    108 echo "${GREEN}ok"
    109 
    110 
    111 echo -e "\n$BLUE# Generation of destination cluster"
    112 echo "${YELLOW}This might take some time$RESET"
    113 echo -n "${YELLOW}generating... $RESET"
    114 python2 data_analysis/cluster_arrival.py
    115 echo "${GREEN}ok"
    116 
    117 
    118 echo -e "\n$BLUE# Creating output folders"
    119 echo -n "${YELLOW}mkdir model_data... $RESET"; mkdir model_data; echo "${GREEN}ok"
    120 echo -n "${YELLOW}mkdir output... $RESET"; mkdir output; echo "${GREEN}ok"
    121 
    122 echo -e "\n$GREEN${BOLD}The data was successfully prepared"
    123 echo "${YELLOW}To train the winning model on gpu, you can now run the following command:"
    124 echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre"