import numpy as np
import sys
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import Lasso
from sklearn import cross_validation, datasets,linear_model
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import random

def load(fn, stlpce):
  """Read a whitespace-separated data file into feature rows and targets.

  Args:
    fn: path of the text file; every non-empty line is one sample whose
      fields are separated by whitespace.
    stlpce: iterable of column indexes to use as features, in output order.

  Returns:
    A tuple (X, y): X is a list of feature lists (floats) and y is the
    list of target values, always taken from column 5 of each line.

  Raises:
    IndexError: if a non-empty line has fewer columns than required.
    ValueError: if a selected field cannot be parsed as a float.
  """
  X = []
  y = []
  # The context manager closes the file even when a row fails to parse.
  with open(fn) as f:
    for l in f:
      its = l.split()
      if not its:
        # Blank lines (e.g. a trailing newline) carry no sample.
        continue
      X.append([float(its[x]) for x in stlpce])
      y.append(float(its[5]))
  return X, y

## complete the implementation of this function

def crossvalidacne_skore(X, y, funkcia):
  """Return the mean cross-validation score of an estimator.

  Args:
    X: feature rows.
    y: target values matching X.
    funkcia: the scikit-learn estimator to evaluate.

  Returns:
    The average of the per-fold scores returned by cross_val_score. The
    number of folds and the scoring metric are left at that function's
    defaults; n_jobs=1 keeps the evaluation in a single process.
  """
  # A negated mean-squared-error variant over 3 folds was tried here
  # previously and is intentionally not used.
  fold_scores = cross_validation.cross_val_score(funkcia, X, y, n_jobs=1)
  return np.mean(fold_scores)

def score(X,y, funkcia):
  """Return whatever funkcia.score(X, y) reports for the given data.

  Args:
    X: feature rows.
    y: target values matching X.
    funkcia: an already fitted model exposing a score(X, y) method.
  """
  return funkcia.score(X,y)

def print_results(X,y, funkcia,kolko):
  y_pred = funkcia.predict(X)
  for i in range(kolko):
    print y[i],"\t",y_pred[i]

def vyber_zaujimavych_param(X,y,alpha,threshold):
  f = Lasso(alpha)
  f.fit(X,y)
  #print f.score(X,y)
  print "Indexy zaujimavych stlpcov:"
  for i in range(len(funkcia.coef_)):
   if funkcia.coef_[i]>threshold:
     print i + (2 if i > 3 else 1)
  print "Koeficienty:"
  #print funkcia.coef_
  #print ("%.4f", f.coef_)
  print "[",
  for s in funkcia.coef_:
    print ("%.4f" % s), 
  print "]"

def show_predictions(X_test, Y_test, funkcia):
  """Plot the model's predictions against the true targets.

  Args:
    X_test: feature rows to predict on.
    Y_test: true target values matching X_test.
    funkcia: an already fitted model exposing predict(X).

  Two figures are shown one after the other: a scatter plot of true versus
  predicted values, then both series drawn as lines after ordering the
  (true, predicted) pairs by true value.
  """
  predicted = funkcia.predict(X_test)

  # Figure 1: true value on the x axis, prediction on the y axis.
  plt.scatter(Y_test, predicted)
  plt.show()
  plt.clf()

  # Figure 2: sort the pairs so the true values form a monotone curve and
  # the predictions can be compared against it sample by sample.
  pairs = sorted(zip(Y_test, predicted))
  actual_sorted = [pair[0] for pair in pairs]
  predicted_sorted = [pair[1] for pair in pairs]
  # Both calls pass the 'r-' format string; the explicit color keyword is
  # what distinguishes the two lines (true = "r", predicted = "b").
  plt.plot(actual_sorted, 'r-', alpha=0.7, color="r")
  plt.plot(predicted_sorted, 'r-', alpha=0.7, color="b")
  plt.show()

def show_test(X_tren, Y_tren, funkcia):
  """Plot the model's predictions against the true targets of a data split.

  Args:
    X_tren: feature rows to predict on.
    Y_tren: true target values matching X_tren.
    funkcia: an already fitted model exposing predict(X).

  The plotting steps are the same as in show_predictions (scatter plot,
  then both series as lines sorted by true value), so this delegates to it
  instead of keeping a second copy of the code.
  """
  show_predictions(X_tren, Y_tren, funkcia)

# Script body: load the data file named on the command line, split it into
# training and test parts, fit a regressor, print its scores and show plots.

# Feature columns to read; adjust as needed. range(a, b) is the half-open
# interval [a, b). Column 5 is skipped because load() uses it as the target.
stlpce = range(1, 5) + range(6, 37)
#stlpce = range(1, 5) + range(6, 24) 
#stlpce = range(1, 5) + range(6, 7) 
X, y = load(sys.argv[1], stlpce)

# Split sizes (may be changed): the last pocet_test rows form the test set
# and all rows before them form the training set.
# NOTE(review): if the file has fewer than pocet_test rows, pocet_tren goes
# negative and the slices below no longer form a clean split.
pocet_test = 10000
pocet_tren = len(X) - pocet_test


X_tren = X[0:pocet_tren]
X_test = X[pocet_tren:pocet_tren+pocet_test]
Y_tren = y[0:pocet_tren]
Y_test = y[pocet_tren:pocet_tren+pocet_test]


# Model under evaluation; the commented-out lines are alternative choices.
funkcia = linear_model.LinearRegression()
#funkcia = linear_model.Ridge(alpha=.5)
#funkcia = linear_model.RidgeCV(alphas=[0.1, 0.5, 1.0, 5., 10.0])
#funkcia = linear_model.Lasso(alpha = 0.1)
#funkcia = linear_model.ElasticNet()
#funkcia = linear_model.LassoLars(alpha=.1)
#funkcia = RandomForestRegressor(n_estimators=150)

print "Funkcia sa fittuje."
funkcia.fit(X_tren, Y_tren)
print "Funkcia sa fittla."

# Score on the data the model was fitted on.
print "trenovacie skore:"
print score(X_tren, Y_tren, funkcia)

# Mean cross-validation score computed on the training part only.
print "crossvalidacne skore"
print crossvalidacne_skore(X_tren, Y_tren, funkcia)

# Score on the held-out test rows.
print "testovacie skore:"
print score(X_test, Y_test, funkcia)

# The two headers below are printed unconditionally; the values that would
# follow them are disabled (they refer to attributes that the random-forest
# alternative above would provide).
print "features:"
#for s in funkcia.feature_importances_:
 #  print ("%.5f" % s)

print "Pocet stromov"
#print len(funkcia.estimators_)

# Plots for the test part first, then for the training part.
show_predictions(X_test, Y_test, funkcia)
show_test(X_tren, Y_tren, funkcia)

# Number of rows for the (currently disabled) result listings; may be changed.
kolko = 200
#print "vysledky pre tren. data:"
#print_results(X_tren, Y_tren, funkcia, kolko)

#print "vysledky pre test. data:"
#print_results(X_test, Y_test, funkcia, kolko)

# Settings for the (currently disabled) Lasso-based column selection; both
# may be changed.
alpha=0.1
threshold=20
# NOTE(review): the next line is a bare string expression and has no effect;
# a print statement was probably intended.
"Vyber parametrov: "
#vyber_zaujimavych_param(X_tren, Y_tren, alpha, threshold)
