import numpy as np
import pandas as pd
import os, copy
from termcolor import cprint
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

class Dataset:
    '''Class for keeping, transforming, splitting, scaling data.

    The feature table is read from a CSV file in the data directory. Its first
    column is treated as the ID column, and the following
    ``constants_mod.FEATURE_CNT`` columns as the features. Every successful
    transformation is recorded in ``self.actions`` as a tuple
    ``(action_name, list_of_features)`` so it can be reported by
    ``description()``.
    '''

    def __init__(self, constants_mod, predict='avg_log'):
        '''Load the feature table and initialise the bookkeeping attributes.

            constants_mod: module (or object) providing FEATURES_FILE,
                FEATURE_CNT, TEST_IDS_FILE, TRAIN_FEATURES_FILE,
                TEST_FEATURES_FILE and get_predict_col_name()
            predict: key passed to constants_mod.get_predict_col_name() to
                obtain the name of the predicted column
        '''
        # Module with all constants.
        self.constants_mod = constants_mod

        # Data directory: the current working directory with the last
        # occurrence of 'code' replaced by 'data'.
        current_dir = os.getcwd()
        self.data_dir = 'data'.join(current_dir.rsplit('code', 1))

        # Input matrix.
        self.X = pd.read_csv('%s/%s' % (self.data_dir, self.constants_mod.FEATURES_FILE))
        # List of features - changes with adding log features etc.
        self.features = list(self.X)[1:self.constants_mod.FEATURE_CNT+1]
        # Name of prediction column.
        self.predict_col = self.constants_mod.get_predict_col_name(predict)
        # Column with level IDs.
        self.id_col = list(self.X)[0]

        # List of actions applied on dataset.
        self.actions = []
        # List of features which had log applied on them.
        self.log_applied = []

        # Train and test versions of all matrices/vectors. Will be filled after split.
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.id_train, self.id_test = None, None
        # Scaler fitted on the training matrix. Will be filled after split.
        self.scaler_X = None

    def apply_log(self, features, remove=True):
        '''Apply log to features from the list.

            features: names of the features to transform; unknown names and
                features whose '<name>_log' version already exists are
                reported and skipped
            remove: if False, the original feature will not be removed

            The new column is named '<name>_log'. When the original column
            contains a zero, log(x + 1) is used instead of log(x).
        '''
        applied = []
        for f in features:
            if f not in self.features:
                cprint('Feature %s does not exists. Not applying log.' % f, 'red')
                continue
            new_f = '%s_log' % f
            if new_f in self.features:
                cprint('Feature %s already exists. Not applying log.' % f, 'yellow')
                continue

            column = self.X[f]
            # Shift by one when a zero is present so that log stays finite.
            if (column == 0).any():
                self.X[new_f] = np.log(column + 1)
            else:
                self.X[new_f] = np.log(column)

            self.log_applied.append(f)
            applied.append(f)
            self.features.append(new_f)
            if remove:
                self.features.remove(f)
                self.X.drop(columns=f, inplace=True)
        if applied:
            self.actions.append(('apply_log', applied))

    @staticmethod
    def feature_remove_outliers(x):
        '''Static function - return mask with outliers filtered out.

            x: vector of values of one feature

            Returns a boolean mask which is True for values lying within
            [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. For empty input an empty mask is
            returned.
        '''
        # Nothing to filter; also avoids computing percentiles of no data.
        if len(x) == 0:
            return np.zeros(0, dtype=bool)

        q75, q25 = np.percentile(x, [75, 25])
        iqr = q75 - q25

        minv = q25 - (iqr*1.5)
        maxv = q75 + (iqr*1.5)
        return (minv <= x) & (x <= maxv)

    def remove_outliers(self):
        '''Remove outliers from input matrix regarding all features.

            Rows are filtered feature by feature, so the bounds of each
            feature are computed on rows which survived the previous ones.
        '''
        if not self.log_applied:
            cprint('Removing outliers before applying any log.', 'yellow')
        removed = []
        for f in self.features:
            # If the original version of feature exists simultaneously with its log version,
            # do not remove outliers for the original feature.
            if f in self.log_applied:
                cprint('Feature %s has log version. Not removing outliers.' % f, 'yellow')
                continue
            self.X = self.X[Dataset.feature_remove_outliers(self.X[f])]
            removed.append(f)
        self.actions.append(('remove_outliers', removed))

    def remove(self, features):
        '''Remove all features from the list.

            features: names of the features to drop; unknown names are
                reported and skipped
        '''
        removed = []
        for f in features:
            if f not in self.features:
                cprint('Feature %s does not exists. Not removing.' % f, 'yellow')
                continue
            removed.append(f)
            self.features.remove(f)
            self.X.drop(columns=f, inplace=True)
        if removed:
            self.actions.append(('remove', removed))

    def split(self):
        '''Split the dataset on training and testing.

            The IDs of testing rows are read as whitespace-separated integers
            from the first line of constants_mod.TEST_IDS_FILE; all other rows
            form the training set. A StandardScaler is fitted on the training
            input matrix.
        '''
        X, y = self.X[self.features + [self.id_col]], self.X[self.predict_col]

        with open(os.path.join(self.data_dir, self.constants_mod.TEST_IDS_FILE)) as inp:
            test = [int(x) for x in inp.readline().strip().split()]

        # Look the ID column up by its stored name rather than assuming it is
        # called 'id', and compute the membership mask only once.
        in_test = X[self.id_col].isin(test)

        X_train, X_test = X[~in_test], X[in_test]
        self.y_train, self.y_test = y[~in_test], y[in_test]

        self.X_train = X_train[self.features]
        self.X_test = X_test[self.features]
        self.id_train = X_train[self.id_col]
        self.id_test = X_test[self.id_col]

        # Only fitting is needed here; scaled matrices are produced on demand
        # by get_X_train_test().
        self.scaler_X = preprocessing.StandardScaler()
        self.scaler_X.fit(self.X_train)

        self.actions.append(('split', []))

    def _save_part(self, file_name, X_part, ids, y_part):
        '''Write one part of the split (IDs, features, predicted column) to CSV.'''
        path = '%s/%s' % (self.data_dir, file_name)
        # Work on a copy so that the stored matrices do not gain extra columns.
        X = copy.deepcopy(X_part)
        X[self.id_col] = ids
        X[self.predict_col] = y_part
        X.to_csv(path, columns=[self.id_col] + self.features + [self.predict_col], index=False)

    def save_train_test(self):
        '''Save training and testing dataset.

            These files can change, when e.g. log is applied on different features etc.
        '''
        self._save_part(self.constants_mod.TRAIN_FEATURES_FILE,
                        self.X_train, self.id_train, self.y_train)
        self._save_part(self.constants_mod.TEST_FEATURES_FILE,
                        self.X_test, self.id_test, self.y_test)

    def get_X_train_test(self, scaled=True):
        '''Get training and testing input matrices.

            scaled: if True use Scaler, otherwise do not use it

            The scaler exists only after split() was called.
        '''
        if scaled:
            return self.scaler_X.transform(self.X_train), self.scaler_X.transform(self.X_test)
        return self.X_train, self.X_test

    def get_y_train_test(self):
        '''Get training and testing output vector.'''
        return self.y_train, self.y_test

    def get_id_train_test(self):
        '''Get training and testing IDs.'''
        return self.id_train, self.id_test

    def description(self):
        '''Print the dataset description, with all actions which were applied.'''
        cprint('Dataset (%s):' % self.predict_col, 'green', attrs=['bold'])
        for i, (action_name, feats) in enumerate(self.actions):
            cprint('[%2d]: %s' % (i, action_name), 'yellow', attrs=['bold'])
            if feats:
                cprint('\t[%s]' % ', '.join(feats), 'yellow')
        # Testing IDs are known only after the split.
        if self.id_test is not None:
            cprint('TestIds:', 'blue', attrs=['bold'])
            cprint('\t[%s]' % ', '.join(str(x) for x in sorted(self.id_test)), 'blue')
