The Standard Model (SM) of particle physics is the most complete model physicists have for describing the interactions of the fundamental particles in the universe. The elementary particles of the SM are shown in Fig. 1.
It comprises matter particles (fermions), force-carrier particles (bosons), and the Higgs boson, which is associated with the mechanism that gives particles their mass.
Though the SM has experimentally stood the test of time, many outstanding questions about the universe and the model itself remain, and scientists continue to probe the SM for inconsistencies in order to find new physics. More exotic models, such as Supersymmetry (SUSY), predict partner particles which may exist but have eluded detection thus far.
The Large Hadron Collider (LHC) is a particle collider capable of colliding protons at a centre-of-mass energy of 14 TeV. ATLAS is a general-purpose particle detector tasked with recording the remnants of the proton collisions at its collision point. The main purpose of the experiment is to test the SM rigorously, and ATLAS was one of the two experiments (ATLAS and CMS) responsible for the discovery of the Higgs boson in 2012.
An animation of how particles are reconstructed within a slice of the ATLAS detector can be found here: https://videos.cern.ch/record/2770812. Electrons, muons, photons, quark jets, etc. interact with the different layers of the detector in different ways, making it possible to design algorithms which distinguish reconstructed particles, measure their trajectories, charge and energy, and identify them as particular types.
Figure 2 shows an event display from a data event in ATLAS in which 2 muons (red), 2 electrons (green), and 1 quark jet (purple cone) are found. This event is a candidate for a Higgs boson decaying to four leptons with an associated jet: $$H (+j)\rightarrow 2\mu\, 2e\, (+j)$$
Particles are shown traversing the detector material.
A particle's kinematics can then be described by the four-vector $$\bar{p} = (E, p_T, \eta, \phi),$$ where $E$ is the energy, $p_T$ the transverse momentum, $\eta$ the pseudo-rapidity and $\phi$ the azimuthal angle.
An additional important quantity is the missing energy in the transverse plane (MET). It is calculated from the negative vector sum of the transverse momenta of all particles in the event, $$\mathrm{MET} = \Big|-\sum_i \vec{p}_{T,i}\Big|.$$
With perfect detector performance the MET is zero if all outgoing particles are observed by the detector. Neutrinos cannot be measured by the detector, and hence their presence produces non-zero MET.
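As a minimal illustration (a toy example, not part of the analysis code), the four-vector components and the MET definition above can be written out with NumPy:

```python
import numpy as np

# Toy event: three particles given as (E, pT, eta, phi); values are invented.
particles = np.array([[120.0,  80.0,  0.5,  1.2],
                      [ 60.0,  45.0, -1.1, -2.0],
                      [ 30.0,  25.0,  0.2,  0.4]])

E, pt, eta, phi = particles.T
px, py, pz = pt * np.cos(phi), pt * np.sin(phi), pt * np.sinh(eta)  # Cartesian momenta

# MET is the magnitude of the negative vector sum of the transverse momenta;
# METphi is the direction of that missing-momentum vector.
met_x, met_y = -np.sum(px), -np.sum(py)
met, met_phi = np.hypot(met_x, met_y), np.arctan2(met_y, met_x)
print(met, met_phi)
```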
For the anomaly detection project I will use the dataset discussed in this publication:
The dataset contains a collection of simulated proton-proton collisions in a general particle physics detector (such as ATLAS). I will use a dataset containing 340 000
SM events (referred to as channel 2b in the paper) which have at least 2 electrons/muons in the event with $p_T>15$ GeV.
The events can be found in background_chan2b_7.8.csv
You can see all the SM processes that are simulated in Table 2 of the paper,
e.g., an event with a process ID of `w_jets` is a simulated event of two protons producing a lepton and neutrino and at least two jets.
$$pp\rightarrow \ell\nu\,(+2j)$$

The datasets are collected as CSV files where each line represents a single event, with the following format:
event ID; process ID; event weight; MET; METphi; obj1, E1, pt1, eta1, phi1; obj2, E2, pt2, eta2, phi2; ...
See Section 2.2 for a description of the dataset.
Variables are split by a semicolon ";":

event ID
: an identifier for the event number in the simulation

process ID
: an identifier for the event simulation type

event weight
: the weight associated with the simulated event (how important that event is)

MET
: the missing transverse energy

METphi
: the azimuthal angle (direction) of the MET

followed by a list of objects (particles) whose variables are split by commas "," in the following order:

obj
: the object type (see Table 1 of the paper)

| Key | Particle |
|---|---|
| j | jet |
| b | b-jet |
| e- | electron |
| e+ | positron |
| m- | muon |
| m+ | antimuon |
| g | photon |

E
: the total measured particle energy in MeV, [0, inf]

pt
: the transverse momentum in MeV, [0, inf]

eta
: pseudo-rapidity, [-inf, inf]

phi
: azimuth angle, radians [-3.14, 3.14]
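As a small illustration of this format (the event line below is hand-written, not taken from the real file), a single event can be split into its metadata and per-object kinematics like this:

```python
# Toy event line following the format above (values are invented).
line = "42;w_jets;1.0;25000.0;0.35;j,210000.0,150000.0,0.7,1.1;e-,80000.0,75000.0,-0.4,2.3"

fields = line.split(';')
event_id, process_id, weight = fields[0], fields[1], float(fields[2])
met, met_phi = float(fields[3]), float(fields[4])

# Each remaining field is one object: type, E, pt, eta, phi
objects = []
for obj in fields[5:]:
    name, E, pt, eta, phi = obj.split(',')
    objects.append((name, float(E), float(pt), float(eta), float(phi)))

print(event_id, process_id, weight, met, met_phi, objects)
```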
In addition to the SM events we are also provided simulated events from Beyond Standard Model
(BSM) exotic physics models. They are summarised here:
| Model | File Name |
|---|---|
| SUSY chargino-chargino process | chacha_cha300_neut140_chan2b.csv |
| | chacha_cha400_neut60_chan2b.csv |
| | chacha_cha600_neut200_chan2b.csv |
| SUSY chargino-neutralino processes | chaneut_cha200_neut50_chan2b.csv |
| | chaneut_cha250_neut150_chan2b.csv |
| $Z'$ decay to leptons | pp23mt_50_chan2b.csv |
| | pp24mt_50_chan2b.csv |
| Gluino and RPV SUSY | gluino_1000.0_neutralino_1.0_chan2b.csv |
| | stlp_st1000_chan2b.csv |
I will design an anomaly detection algorithm which is trained on the SM dataset and which can be used to flag up interesting (exotic) events from the BSM physics models.
I will do this by designing a robust AutoEncoder which is trained on the event-level variables MET and METphi and the kinematics of the particle-level objects. The AutoEncoder needs to reproduce its input at the output effectively while passing through a latent space (bottleneck). I will then evaluate and discuss the performance of the AutoEncoder on the exotic models listed above, and come up with an appropriate metric to identify events from non-SM physics.
The data is provided in a CSV (text) format as semicolon- and comma-separated lists with one line per event. I will convert this into an appropriate format for the neural networks. Since the number of particles per event is variable, I will need to truncate and zero-pad (mask) the particle list in each event. The following steps need to be performed on the SM (background) sample:
- Take the logarithm of the energy-scale variables (`log(E)`, `log(pt)`, `log(MET)`) - this is to prioritise differences in energy scale over more minor differences.

The final set of training variables should look something like this (the exact format is flexible):

| N ele | N muon | N jets | N bjets | N photons | log(MET) | METphi | log(E1) | log(pt1) | eta1 | phi1 | ... | phi8 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|

- Use `MinMaxScaler` or similar to standardise the training variables over the SM dataset.
- After the SM dataset has been processed, apply the same processing to the BSM (signal) samples, using the same standardisation functions as fitted on the SM dataset.
- Keep the associated metadata (event ID; process ID; event weight), though this does not need processing.
- Randomise and split the SM (background) dataset into training and testing datasets (the BSM samples don't need to be split (Why?)).
- Evaluate the trained `AE` on the BSM datasets.

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import SGD, Adam
# Keras import(s)
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Reshape
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
import csv
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
The class "Processed_dataset" takes in the path of the dataset and returns the data in a form that is ready for training. Each line in the dataset that is fed into the class represents an event and, the particles produced in the event are separated by a semi-colon. The class removes the name of all the particles in an event. It creates separate column for the number of electrons, number of muons, number of jets, number of bjets and number of photons. For each event, the class gives the first eight particles with the highest energy. Also, the logaithmic is taken for the missing transverse energy, the total measured energy of a particle and the transverse momentum of each particle for each event. The class takes as an optional parameter, the "scalar" , which is used to standardise the processed dataset. If an instance of the class does not come with a scaler parameter, it is assumed that the preprocessing is being done on a SM dataset. In this case, the process_data method returns a training dataset, a testing dataset and scaler. Otherwise, the method returns only a testing dataset.
The table below illustrates the columns of the processed dataset (as a numpy array) that was used in training:

| N ele | N muon | N jets | N bjets | N photons | log(MET) | METphi | log(E1) | log(pt1) | eta1 | phi1 | ... | phi8 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
class Processed_dataset:
def __init__(self,path, scaler = None):
self.path = path
#if a scaler is passed as an argument then the preprocessing is being done on a BSM dataset
self.scaler = scaler
#This method does preprocessing on each event.
#separates all elements in a row by "," and ";"
#creates variables for the number of electrons, muons, jets and photons
#removes names of particles
#sorts particles in descending order of energy
#for events with fewer than eight particles, the remaining particle kinematics are padded with zeros
def process_row(self,row):
#split the elements in a row by ";" and "," and store them in the string_data list
string_data = []
split_data = row[0].split(';')
for element in split_data:
splits = element.split(',')
for i in splits:
string_data.append(i)
#create variables for the number of electrons, muons and jets.
#the variables store the number of occurrences of these particles
counter = Counter(string_data)
n_ele = counter['e-'] + counter['e+']
n_muon = counter['m-'] + counter['m+']
n_jets = counter['j']
n_bjets = counter['b']
n_photon =counter['g']
row.iloc[1] = string_data[0] #event ID
row.iloc[2] = string_data[1] #process ID
row.iloc[3] = string_data[2] #event weight
row.iloc[4] = n_ele # no. eletrons
row.iloc[5] = n_muon #no. muons
row.iloc[6] = n_jets #no. jets
row.iloc[7] = n_bjets #no. bjets
row.iloc[8] = n_photon #no. photons
n_total = n_ele + n_muon + n_jets + n_bjets + n_photon
particle_data = string_data[3:-1] #get all information of an event that will be used in training
particles = []
#remove all names of particles
#convert the kinematics of each particle into float
for j in range(len(particle_data)):
try:
particles.append(float(particle_data[j]) )
except ValueError:
pass
except IndexError:
break
row.iloc[9] = np.log(particles[0]) #log(MET)
row.iloc[10] = particles[1] #METphi
particles_data = particles[2:] #remove the log(MET) and METphi
#create an empty list
#each element in this list is the four kinematics of a single particle in an event
objects = []
n = 0
for i in range(n_total):
particles_data[n] = np.log(particles_data[n])
particles_data[n+1] = np.log(particles_data[n+1])
objects.append(particles_data[n : n+4])
n = n + 4
#sort the elements in the object list in descending order of energy
energy_sorted = sorted(objects, key = lambda x: x[0], reverse=True)
# for events with fewer than 8 particles, the remaining kinematics are padded with zeros.
while len(energy_sorted) < 8:
energy_sorted.append([0,0,0,0])
#append the kinematics of the particles to the remaining elements in the row
row[11:15] = energy_sorted[0]
row[15:19] = energy_sorted[1]
row[19:23] = energy_sorted[2]
row[23:27] = energy_sorted[3]
row[27:31] = energy_sorted[4]
row[31:35] = energy_sorted[5]
row[35:39] = energy_sorted[6]
row[39:43] = energy_sorted[7]
return row
def process_data(self):
#read the dataset into a pandas dataframe
#note: the '; ,' separator does not actually match anything in the file, so each full event line is read into a single column (column 0)
df = pd.read_csv(self.path,sep ='; ,', engine='python', delim_whitespace=False, header=None)
#create a new dataframe for the processed dataset
columns_names = ['event ID','process ID','event weight','N_ele', 'N_muon', 'N_jets', 'N_bjets', 'N_photon', 'log(MET)' , 'METphi']
for i in range(1,9):
columns_names.append('log(E' + str(i) + ')')
columns_names.append('log(pt' + str(i) + ')')
columns_names.append('eta' + str(i) )
columns_names.append('phi' + str(i))
new_df = pd.DataFrame(columns=columns_names)
#append the new dataframe to the original dataframe
df = pd.concat([df, new_df], axis = 1)
#this method in pandas applies the "process_row" method to each row in the dataset
df = df.apply(self.process_row, axis = 1)
#delete the first column as it is no longer needed
df = df.drop(0, axis=1)
#get the data that will be used in training
data = df.iloc[:,3:].values
self.data = data
#save processed dataset
processed_path = self.path[:-4] + '_processed1' + '.csv'
df.to_csv(processed_path, index=False)
#this if statement is run for the SM dataset
if self.scaler is None:
scaler = MinMaxScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)
self.data_scaled = data_scaled
np.savetxt(self.path[:-4] + "_training" + ".csv", data, delimiter=",")
X_train, X_test = train_test_split(data_scaled, test_size =0.2, random_state = 42)
return X_train, X_test, scaler
#BSM dataset
else:
data_scaled = self.scaler.transform(data)
self.data_scaled = data_scaled
np.savetxt(self.path[:-4] + "_testing_BSM" + ".csv", data, delimiter=",")
return data_scaled
The code below creates different instances of the Processed_dataset class for both the SM and BSM datasets. The process_data method is then run to return the processed numpy array for each dataset.
bgb = Processed_dataset('background_chan2b_7.8.csv')
X_train, X_test, scaler1 = bgb.process_data() #scaler1 is the MinMaxScaler fitted on the SM dataset
test9 = Processed_dataset('chacha_cha300_neut140_chan2b.csv',scaler = scaler1)
chacha_cha300 = test9.process_data()
test8 = Processed_dataset('chacha_cha400_neut60_chan2b.csv',scaler = scaler1)
chacha_cha400 = test8.process_data()
test1 = Processed_dataset('gluino_1000.0_neutralino_1.0_chan2b.csv',scaler = scaler1)
gluino_1000 = test1.process_data()
test2 = Processed_dataset('stlp_st1000_chan2b.csv',scaler = scaler1)
st1p_st1000 = test2.process_data()
test3 = Processed_dataset('chaneut_cha200_neut50_chan2b.csv',scaler = scaler1)
chaneut_cha200 = test3.process_data()
test4 = Processed_dataset('pp23mt_50_chan2b.csv',scaler = scaler1)
pp23mt_50 = test4.process_data()
test5 = Processed_dataset('chaneut_cha250_neut150_chan2b.csv',scaler = scaler1)
chaneut_cha250 = test5.process_data()
test6 = Processed_dataset('chacha_cha600_neut200_chan2b.csv',scaler = scaler1)
chacha_cha600 = test6.process_data()
test7 = Processed_dataset('pp24mt_50_chan2b.csv',scaler = scaler1)
pp24mt_50 = test7.process_data()
df = pd.read_csv('background_chan2b_7.8_processed1.csv')
df.head()
event ID | process ID | event weight | N_ele | N_muon | N_jets | N_bjets | N_photon | log(MET) | METphi | ... | eta6 | phi6 | log(E7) | log(pt7) | eta7 | phi7 | log(E8) | log(pt8) | eta8 | phi8 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5702564 | z_jets | 1 | 2 | 0 | 7 | 0 | 0 | 11.538096 | -2.96620 | ... | 0.840127 | -1.73805 | 11.289961 | 11.280763 | 0.135844 | 0.275231 | 10.918245 | 10.867301 | -0.183147 | 2.62501 |
1 | 13085335 | z_jets | 1 | 0 | 2 | 1 | 0 | 0 | 11.547018 | 1.96193 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 |
2 | 74025 | wtopbar | 1 | 0 | 2 | 1 | 1 | 0 | 11.770725 | -1.17889 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 |
3 | 2419445 | z_jets | 1 | 0 | 2 | 2 | 0 | 0 | 11.261565 | -1.09171 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 |
4 | 43639 | wtop | 1 | 0 | 2 | 1 | 1 | 0 | 11.581994 | -1.02642 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 |
5 rows × 42 columns
# reload the processed (scaled) datasets from disk
data = np.genfromtxt('background_chan2b_7.8_training.csv', delimiter=',')
X_train, X_test = train_test_split(data, test_size =0.2, random_state = 42)
chacha_cha300 = np.genfromtxt('chacha_cha300_neut140_chan2b_testing_BSM.csv', delimiter=',')
chacha_cha400 = np.genfromtxt('chacha_cha400_neut60_chan2b_testing_BSM.csv', delimiter=',')
gluino_1000 = np.genfromtxt('gluino_1000.0_neutralino_1.0_chan2b_testing_BSM.csv', delimiter=',')
st1p_st1000 = np.genfromtxt('stlp_st1000_chan2b_testing_BSM.csv', delimiter=',')
chaneut_cha200 = np.genfromtxt('chaneut_cha200_neut50_chan2b_testing_BSM.csv', delimiter=',')
pp23mt_50 = np.genfromtxt('pp23mt_50_chan2b_testing_BSM.csv', delimiter=',')
chaneut_cha250 = np.genfromtxt('chaneut_cha250_neut150_chan2b_testing_BSM.csv', delimiter=',')
chacha_cha600 = np.genfromtxt('chacha_cha600_neut200_chan2b_testing_BSM.csv', delimiter=',')
pp24mt_50 = np.genfromtxt('pp24mt_50_chan2b_testing_BSM.csv', delimiter=',')
The class "Training" creates and trains an autoencoder model. The following are its input:
class Training:
def __init__(self, acti_func,encode_acti,decode_acti,layers, latent_layer, kernel_init, optimiser,X_train,name, epoch):
#encoder
input_layer = Input(shape=(X_train.shape[1]))
#creating the hidden layers for the encoder
for i in range(1,len(layers)):
layer_current = 'dense_layer' + str(i + 1)
if i == 1:
layer_current = Dense(layers[i],activation=acti_func, kernel_initializer=kernel_init )(input_layer)
layer_previous =layer_current
else:
layer_current = Dense(layers[i],activation=acti_func, kernel_initializer=kernel_init )(layer_previous)
layer_previous =layer_current
#output of the encoder model
output = Dense(latent_layer, activation=encode_acti)(layer_current)
#creating the encoder model
model1 = Model(inputs = input_layer , outputs = output)
model1.summary()
#decoder
decode_input = Input(shape = (output.shape[1]))
#creating the hidden layers in the decoder model
for i in range(len(layers)-1,0,-1):
decode_current = 'decode_layer' + str(i + 1)
print(i)
if i == len(layers)-1:
print(i)
decode_current = Dense(layers[i],activation=acti_func, kernel_initializer=kernel_init )(decode_input)
decode_previous =decode_current
else:
decode_current = Dense(layers[i],activation=acti_func, kernel_initializer=kernel_init )(decode_previous)
decode_previous =decode_current
#output of the decoder model
decode_output = Dense(X_train.shape[1], activation = decode_acti)(decode_current)
#creating the decoder model
model2 = Model(inputs = decode_input , outputs = decode_output )
model2.summary()
# creating the autoencoder model
i = model1.input
cae = Model(i , model2(model1(i)), name = 'ConvAE') #note: despite the name 'ConvAE', this is a fully-connected (dense) autoencoder
#compile
cae.compile(optimizer=optimiser,
loss='mean_squared_error',
metrics=['MSE'],
)
callbacks = []
filename = name + ".csv"
history_logger=tf.keras.callbacks.CSVLogger(filename, separator=",", append=True)
callbacks.append(history_logger)
cae.fit(X_train,
X_train,
batch_size= 256,
epochs=epoch,
validation_split=0.20,
shuffle=True,
callbacks = callbacks
)
#an instance of the model
self.cae = cae
The function below prints the mean squared error of the trained model on both the training and testing datasets. It also prints a summary of the model and plots the training and validation loss against the epoch number.
def evaluate_model(model, model_path):
model.summary()
print("Mean squared error(train): " + str((model.evaluate(X_train, X_train, verbose=False)[1])))
print("Mean squared error(test): " + str (model.evaluate(X_test, X_test, verbose=False)[1]))
model_loss = pd.read_csv(model_path)
plt.plot(model_loss['epoch'], model_loss['loss'] ,label = 'Loss function', color = 'red')
plt.plot(model_loss['epoch'], model_loss['val_loss'] ,label = 'Validation loss function', color = 'yellow')
plt.xlabel("Epoch number")
plt.ylabel("Loss function")
plt.title("Plot the loss function value as a function of number of epochs")
plt.legend()
plt.show()
modeltest = Training('tanh', 'linear','linear',[100, 100,50,50], 25, 'normal', 'adam', X_train, 'best_model', 100)
Model: "model_28" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_29 (InputLayer) [(None, 39)] 0 dense_118 (Dense) (None, 100) 4000 dense_119 (Dense) (None, 50) 5050 dense_120 (Dense) (None, 50) 2550 dense_121 (Dense) (None, 25) 1275 ================================================================= Total params: 12,875 Trainable params: 12,875 Non-trainable params: 0 _________________________________________________________________ 3 3 2 1 Model: "model_29" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_30 (InputLayer) [(None, 25)] 0 dense_122 (Dense) (None, 50) 1300 dense_123 (Dense) (None, 50) 2550 dense_124 (Dense) (None, 100) 5100 dense_125 (Dense) (None, 39) 3939 ================================================================= Total params: 12,889 Trainable params: 12,889 Non-trainable params: 0 _________________________________________________________________ Train on 217771 samples, validate on 54443 samples Epoch 1/100 216832/217771 [============================>.] - ETA: 0s - loss: 0.0239 - MSE: 0.0239
C:\Users\boazm\anaconda3\envs\daml\lib\site-packages\keras\engine\training_v1.py:2045: UserWarning: `Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically. updates = self.state_updates
217771/217771 [==============================] - 10s 44us/sample - loss: 0.0238 - MSE: 0.0238 - val_loss: 0.0110 - val_MSE: 0.0110 Epoch 2/100 217771/217771 [==============================] - 7s 34us/sample - loss: 0.0075 - MSE: 0.0075 - val_loss: 0.0053 - val_MSE: 0.0053 Epoch 3/100 217771/217771 [==============================] - 8s 38us/sample - loss: 0.0047 - MSE: 0.0047 - val_loss: 0.0043 - val_MSE: 0.0043 Epoch 4/100 217771/217771 [==============================] - 6s 29us/sample - loss: 0.0030 - MSE: 0.0030 - val_loss: 0.0027 - val_MSE: 0.0027 Epoch 5/100 217771/217771 [==============================] - 6s 27us/sample - loss: 0.0024 - MSE: 0.0024 - val_loss: 0.0020 - val_MSE: 0.0020 Epoch 6/100 217771/217771 [==============================] - 7s 31us/sample - loss: 0.0017 - MSE: 0.0017 - val_loss: 0.0015 - val_MSE: 0.0015 Epoch 7/100 217771/217771 [==============================] - 6s 28us/sample - loss: 0.0013 - MSE: 0.0013 - val_loss: 0.0011 - val_MSE: 0.0011 Epoch 8/100 217771/217771 [==============================] - 7s 31us/sample - loss: 8.2877e-04 - MSE: 8.2877e-04 - val_loss: 7.4167e-04 - val_MSE: 7.4167e-04 Epoch 9/100 217771/217771 [==============================] - 6s 26us/sample - loss: 7.3312e-04 - MSE: 7.3312e-04 - val_loss: 6.6075e-04 - val_MSE: 6.6075e-04 Epoch 10/100 217771/217771 [==============================] - 6s 28us/sample - loss: 6.0962e-04 - MSE: 6.0962e-04 - val_loss: 5.9651e-04 - val_MSE: 5.9651e-04 Epoch 11/100 217771/217771 [==============================] - 8s 37us/sample - loss: 5.8062e-04 - MSE: 5.8062e-04 - val_loss: 5.2863e-04 - val_MSE: 5.2863e-04 Epoch 12/100 217771/217771 [==============================] - 7s 33us/sample - loss: 4.9892e-04 - MSE: 4.9892e-04 - val_loss: 4.9133e-04 - val_MSE: 4.9133e-04 Epoch 13/100 217771/217771 [==============================] - 8s 37us/sample - loss: 4.8855e-04 - MSE: 4.8855e-04 - val_loss: 4.8341e-04 - val_MSE: 4.8341e-04 Epoch 14/100 217771/217771 [==============================] - 6s 27us/sample - loss: 4.0923e-04 - MSE: 4.0923e-04 - val_loss: 2.9955e-04 - val_MSE: 2.9955e-04 Epoch 15/100 217771/217771 [==============================] - 7s 32us/sample - loss: 2.9160e-04 - MSE: 2.9160e-04 - val_loss: 2.8803e-04 - val_MSE: 2.8803e-04 Epoch 16/100 217771/217771 [==============================] - 7s 32us/sample - loss: 2.8958e-04 - MSE: 2.8958e-04 - val_loss: 2.8828e-04 - val_MSE: 2.8828e-04 Epoch 17/100 217771/217771 [==============================] - 9s 40us/sample - loss: 2.7348e-04 - MSE: 2.7348e-04 - val_loss: 2.1751e-04 - val_MSE: 2.1751e-04 Epoch 18/100 217771/217771 [==============================] - 6s 28us/sample - loss: 2.1271e-04 - MSE: 2.1271e-04 - val_loss: 2.1138e-04 - val_MSE: 2.1138e-04 Epoch 19/100 217771/217771 [==============================] - 7s 32us/sample - loss: 2.1106e-04 - MSE: 2.1106e-04 - val_loss: 2.0555e-04 - val_MSE: 2.0555e-04 Epoch 20/100 217771/217771 [==============================] - 7s 30us/sample - loss: 2.0877e-04 - MSE: 2.0877e-04 - val_loss: 2.1667e-04 - val_MSE: 2.1667e-04 Epoch 21/100 217771/217771 [==============================] - 6s 27us/sample - loss: 2.0584e-04 - MSE: 2.0584e-04 - val_loss: 2.3409e-04 - val_MSE: 2.3409e-04 Epoch 22/100 217771/217771 [==============================] - 7s 32us/sample - loss: 2.0308e-04 - MSE: 2.0308e-04 - val_loss: 1.9995e-04 - val_MSE: 1.9995e-04 Epoch 23/100 217771/217771 [==============================] - 6s 26us/sample - loss: 1.9717e-04 - MSE: 1.9717e-04 - val_loss: 2.0517e-04 - val_MSE: 2.0517e-04 Epoch 24/100 217771/217771 
[==============================] - 6s 26us/sample - loss: 1.5273e-04 - MSE: 1.5273e-04 - val_loss: 1.2471e-04 - val_MSE: 1.2471e-04 Epoch 25/100 217771/217771 [==============================] - 7s 32us/sample - loss: 1.2491e-04 - MSE: 1.2491e-04 - val_loss: 1.2577e-04 - val_MSE: 1.2577e-04 Epoch 26/100 217771/217771 [==============================] - 6s 26us/sample - loss: 1.2428e-04 - MSE: 1.2428e-04 - val_loss: 1.3325e-04 - val_MSE: 1.3325e-04 Epoch 27/100 217771/217771 [==============================] - 7s 31us/sample - loss: 1.2327e-04 - MSE: 1.2327e-04 - val_loss: 1.1861e-04 - val_MSE: 1.1861e-04 Epoch 28/100 217771/217771 [==============================] - 6s 28us/sample - loss: 1.2239e-04 - MSE: 1.2239e-04 - val_loss: 1.2360e-04 - val_MSE: 1.2360e-04 Epoch 29/100 217771/217771 [==============================] - 6s 26us/sample - loss: 1.2209e-04 - MSE: 1.2209e-04 - val_loss: 1.2208e-04 - val_MSE: 1.2208e-04 Epoch 30/100 217771/217771 [==============================] - 7s 30us/sample - loss: 1.2195e-04 - MSE: 1.2195e-04 - val_loss: 1.2572e-04 - val_MSE: 1.2572e-04 Epoch 31/100 217771/217771 [==============================] - 6s 26us/sample - loss: 1.2060e-04 - MSE: 1.2060e-04 - val_loss: 1.3121e-04 - val_MSE: 1.3121e-04 Epoch 32/100 217771/217771 [==============================] - 6s 27us/sample - loss: 1.2087e-04 - MSE: 1.2087e-04 - val_loss: 1.1847e-04 - val_MSE: 1.1847e-04 Epoch 33/100 217771/217771 [==============================] - 6s 29us/sample - loss: 1.1574e-04 - MSE: 1.1574e-04 - val_loss: 1.5618e-04 - val_MSE: 1.5618e-04 Epoch 34/100 217771/217771 [==============================] - 6s 25us/sample - loss: 9.4000e-05 - MSE: 9.4000e-05 - val_loss: 8.8716e-05 - val_MSE: 8.8716e-05 Epoch 35/100 217771/217771 [==============================] - 7s 31us/sample - loss: 9.2172e-05 - MSE: 9.2172e-05 - val_loss: 9.1491e-05 - val_MSE: 9.1491e-05 Epoch 36/100 217771/217771 [==============================] - 6s 27us/sample - loss: 9.2317e-05 - MSE: 9.2317e-05 - val_loss: 9.1525e-05 - val_MSE: 9.1525e-05 Epoch 37/100 217771/217771 [==============================] - 6s 28us/sample - loss: 9.1538e-05 - MSE: 9.1538e-05 - val_loss: 9.6908e-05 - val_MSE: 9.6908e-05 Epoch 38/100 217771/217771 [==============================] - 7s 32us/sample - loss: 9.1725e-05 - MSE: 9.1725e-05 - val_loss: 1.0563e-04 - val_MSE: 1.0563e-04 Epoch 39/100 217771/217771 [==============================] - 6s 26us/sample - loss: 9.1250e-05 - MSE: 9.1250e-05 - val_loss: 8.9543e-05 - val_MSE: 8.9543e-05 Epoch 40/100 217771/217771 [==============================] - 7s 30us/sample - loss: 9.0933e-05 - MSE: 9.0933e-05 - val_loss: 8.8175e-05 - val_MSE: 8.8175e-05 Epoch 41/100 217771/217771 [==============================] - 6s 29us/sample - loss: 9.0411e-05 - MSE: 9.0411e-05 - val_loss: 9.2074e-05 - val_MSE: 9.2074e-05 Epoch 42/100 217771/217771 [==============================] - 6s 26us/sample - loss: 9.0635e-05 - MSE: 9.0635e-05 - val_loss: 9.8504e-05 - val_MSE: 9.8504e-05 Epoch 43/100 217771/217771 [==============================] - 7s 30us/sample - loss: 9.0556e-05 - MSE: 9.0556e-05 - val_loss: 9.1058e-05 - val_MSE: 9.1058e-05 Epoch 44/100 217771/217771 [==============================] - 6s 26us/sample - loss: 9.0019e-05 - MSE: 9.0019e-05 - val_loss: 8.8404e-05 - val_MSE: 8.8404e-05 Epoch 45/100 217771/217771 [==============================] - 6s 29us/sample - loss: 8.9698e-05 - MSE: 8.9698e-05 - val_loss: 8.7180e-05 - val_MSE: 8.7180e-05 Epoch 46/100 217771/217771 [==============================] - 7s 32us/sample - 
loss: 8.9279e-05 - MSE: 8.9279e-05 - val_loss: 8.7713e-05 - val_MSE: 8.7713e-05 Epoch 47/100 217771/217771 [==============================] - 6s 27us/sample - loss: 8.7033e-05 - MSE: 8.7033e-05 - val_loss: 7.2547e-05 - val_MSE: 7.2547e-05 Epoch 48/100 217771/217771 [==============================] - 7s 33us/sample - loss: 6.6740e-05 - MSE: 6.6740e-05 - val_loss: 5.8168e-05 - val_MSE: 5.8167e-05 Epoch 49/100 217771/217771 [==============================] - 6s 26us/sample - loss: 6.1328e-05 - MSE: 6.1328e-05 - val_loss: 6.9193e-05 - val_MSE: 6.9193e-05 Epoch 50/100 217771/217771 [==============================] - 6s 28us/sample - loss: 5.9665e-05 - MSE: 5.9665e-05 - val_loss: 6.2349e-05 - val_MSE: 6.2349e-05 Epoch 51/100 217771/217771 [==============================] - 7s 30us/sample - loss: 5.8658e-05 - MSE: 5.8658e-05 - val_loss: 6.0662e-05 - val_MSE: 6.0662e-05 Epoch 52/100 217771/217771 [==============================] - 5s 25us/sample - loss: 5.8407e-05 - MSE: 5.8407e-05 - val_loss: 6.3840e-05 - val_MSE: 6.3840e-05 Epoch 53/100 217771/217771 [==============================] - 6s 30us/sample - loss: 5.7691e-05 - MSE: 5.7691e-05 - val_loss: 6.6764e-05 - val_MSE: 6.6764e-05 Epoch 54/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.7308e-05 - MSE: 5.7307e-05 - val_loss: 6.1328e-05 - val_MSE: 6.1328e-05 Epoch 55/100 217771/217771 [==============================] - 6s 27us/sample - loss: 5.7511e-05 - MSE: 5.7511e-05 - val_loss: 5.5308e-05 - val_MSE: 5.5308e-05 Epoch 56/100 217771/217771 [==============================] - 7s 32us/sample - loss: 5.7154e-05 - MSE: 5.7154e-05 - val_loss: 5.9154e-05 - val_MSE: 5.9154e-05 Epoch 57/100 217771/217771 [==============================] - 6s 27us/sample - loss: 5.6723e-05 - MSE: 5.6723e-05 - val_loss: 5.5449e-05 - val_MSE: 5.5449e-05 Epoch 58/100 217771/217771 [==============================] - 6s 28us/sample - loss: 5.7121e-05 - MSE: 5.7121e-05 - val_loss: 5.7061e-05 - val_MSE: 5.7061e-05 Epoch 59/100 217771/217771 [==============================] - 6s 28us/sample - loss: 5.6201e-05 - MSE: 5.6202e-05 - val_loss: 5.4222e-05 - val_MSE: 5.4222e-05 Epoch 60/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.6460e-05 - MSE: 5.6460e-05 - val_loss: 5.1419e-05 - val_MSE: 5.1419e-05 Epoch 61/100 217771/217771 [==============================] - 7s 32us/sample - loss: 5.6372e-05 - MSE: 5.6372e-05 - val_loss: 5.3791e-05 - val_MSE: 5.3791e-05 Epoch 62/100 217771/217771 [==============================] - 6s 27us/sample - loss: 5.6342e-05 - MSE: 5.6342e-05 - val_loss: 5.6252e-05 - val_MSE: 5.6252e-05 Epoch 63/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.6200e-05 - MSE: 5.6200e-05 - val_loss: 5.8775e-05 - val_MSE: 5.8775e-05 Epoch 64/100 217771/217771 [==============================] - 7s 30us/sample - loss: 5.6206e-05 - MSE: 5.6206e-05 - val_loss: 5.4704e-05 - val_MSE: 5.4704e-05 Epoch 65/100 217771/217771 [==============================] - 6s 27us/sample - loss: 5.5989e-05 - MSE: 5.5989e-05 - val_loss: 5.4425e-05 - val_MSE: 5.4425e-05 Epoch 66/100 217771/217771 [==============================] - 7s 33us/sample - loss: 5.5757e-05 - MSE: 5.5757e-05 - val_loss: 5.5151e-05 - val_MSE: 5.5151e-05 Epoch 67/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.5638e-05 - MSE: 5.5638e-05 - val_loss: 5.3476e-05 - val_MSE: 5.3476e-05 Epoch 68/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.5971e-05 - MSE: 5.5971e-05 - val_loss: 
5.1273e-05 - val_MSE: 5.1273e-05 Epoch 69/100 217771/217771 [==============================] - 7s 32us/sample - loss: 5.5579e-05 - MSE: 5.5579e-05 - val_loss: 5.6475e-05 - val_MSE: 5.6475e-05 Epoch 70/100 217771/217771 [==============================] - 6s 27us/sample - loss: 5.5698e-05 - MSE: 5.5698e-05 - val_loss: 6.1335e-05 - val_MSE: 6.1336e-05 Epoch 71/100 217771/217771 [==============================] - 6s 30us/sample - loss: 5.5275e-05 - MSE: 5.5275e-05 - val_loss: 5.6068e-05 - val_MSE: 5.6068e-05 Epoch 72/100 217771/217771 [==============================] - 6s 28us/sample - loss: 5.5616e-05 - MSE: 5.5616e-05 - val_loss: 5.4832e-05 - val_MSE: 5.4832e-05 Epoch 73/100 217771/217771 [==============================] - 6s 25us/sample - loss: 5.5695e-05 - MSE: 5.5695e-05 - val_loss: 5.0758e-05 - val_MSE: 5.0758e-05 Epoch 74/100 217771/217771 [==============================] - 7s 30us/sample - loss: 5.5104e-05 - MSE: 5.5104e-05 - val_loss: 5.0710e-05 - val_MSE: 5.0710e-05 Epoch 75/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.5147e-05 - MSE: 5.5147e-05 - val_loss: 5.1808e-05 - val_MSE: 5.1808e-05 Epoch 76/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.5411e-05 - MSE: 5.5411e-05 - val_loss: 5.8626e-05 - val_MSE: 5.8626e-05 Epoch 77/100 217771/217771 [==============================] - 6s 29us/sample - loss: 5.5089e-05 - MSE: 5.5089e-05 - val_loss: 5.4292e-05 - val_MSE: 5.4292e-05 Epoch 78/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.5077e-05 - MSE: 5.5077e-05 - val_loss: 5.5492e-05 - val_MSE: 5.5492e-05 Epoch 79/100 217771/217771 [==============================] - 7s 30us/sample - loss: 5.4716e-05 - MSE: 5.4716e-05 - val_loss: 5.2817e-05 - val_MSE: 5.2817e-05 Epoch 80/100 217771/217771 [==============================] - 6s 28us/sample - loss: 5.4857e-05 - MSE: 5.4857e-05 - val_loss: 5.4814e-05 - val_MSE: 5.4814e-05 Epoch 81/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.4722e-05 - MSE: 5.4722e-05 - val_loss: 5.0855e-05 - val_MSE: 5.0855e-05 Epoch 82/100 217771/217771 [==============================] - 7s 32us/sample - loss: 5.4940e-05 - MSE: 5.4940e-05 - val_loss: 5.6031e-05 - val_MSE: 5.6031e-05 Epoch 83/100 217771/217771 [==============================] - 6s 26us/sample - loss: 5.4364e-05 - MSE: 5.4364e-05 - val_loss: 5.3446e-05 - val_MSE: 5.3446e-05 Epoch 84/100 217771/217771 [==============================] - 6s 27us/sample - loss: 5.3978e-05 - MSE: 5.3978e-05 - val_loss: 5.9559e-05 - val_MSE: 5.9559e-05 Epoch 85/100 217771/217771 [==============================] - 6s 29us/sample - loss: 5.4295e-05 - MSE: 5.4295e-05 - val_loss: 5.1435e-05 - val_MSE: 5.1435e-05 Epoch 86/100 217771/217771 [==============================] - 5s 25us/sample - loss: 5.3016e-05 - MSE: 5.3016e-05 - val_loss: 5.0477e-05 - val_MSE: 5.0477e-05 Epoch 87/100 217771/217771 [==============================] - 7s 31us/sample - loss: 4.9178e-05 - MSE: 4.9178e-05 - val_loss: 4.6748e-05 - val_MSE: 4.6748e-05 Epoch 88/100 217771/217771 [==============================] - 7s 31us/sample - loss: 4.0491e-05 - MSE: 4.0491e-05 - val_loss: 3.9305e-05 - val_MSE: 3.9305e-05 Epoch 89/100 217771/217771 [==============================] - 6s 29us/sample - loss: 3.6674e-05 - MSE: 3.6674e-05 - val_loss: 3.2707e-05 - val_MSE: 3.2707e-05 Epoch 90/100 217771/217771 [==============================] - 7s 30us/sample - loss: 3.5385e-05 - MSE: 3.5385e-05 - val_loss: 3.0936e-05 - val_MSE: 3.0936e-05 Epoch 91/100 
217771/217771 [==============================] - 6s 27us/sample - loss: 3.4037e-05 - MSE: 3.4037e-05 - val_loss: 4.0598e-05 - val_MSE: 4.0598e-05 Epoch 92/100 217771/217771 [==============================] - 8s 36us/sample - loss: 3.3469e-05 - MSE: 3.3469e-05 - val_loss: 3.2682e-05 - val_MSE: 3.2682e-05 Epoch 93/100 217771/217771 [==============================] - 6s 27us/sample - loss: 3.3042e-05 - MSE: 3.3042e-05 - val_loss: 3.5450e-05 - val_MSE: 3.5450e-05 Epoch 94/100 217771/217771 [==============================] - 5s 25us/sample - loss: 3.2526e-05 - MSE: 3.2526e-05 - val_loss: 3.2320e-05 - val_MSE: 3.2320e-05 Epoch 95/100 217771/217771 [==============================] - 7s 32us/sample - loss: 3.1928e-05 - MSE: 3.1928e-05 - val_loss: 3.1342e-05 - val_MSE: 3.1342e-05 Epoch 96/100 217771/217771 [==============================] - 7s 34us/sample - loss: 3.1694e-05 - MSE: 3.1694e-05 - val_loss: 3.0107e-05 - val_MSE: 3.0107e-05 Epoch 97/100 217771/217771 [==============================] - 8s 36us/sample - loss: 3.1862e-05 - MSE: 3.1862e-05 - val_loss: 3.9877e-05 - val_MSE: 3.9877e-05 Epoch 98/100 217771/217771 [==============================] - 6s 29us/sample - loss: 3.1279e-05 - MSE: 3.1279e-05 - val_loss: 2.8620e-05 - val_MSE: 2.8620e-05 Epoch 99/100 217771/217771 [==============================] - 5s 23us/sample - loss: 3.1392e-05 - MSE: 3.1392e-05 - val_loss: 2.9307e-05 - val_MSE: 2.9307e-05 Epoch 100/100 217771/217771 [==============================] - 7s 32us/sample - loss: 3.1011e-05 - MSE: 3.1011e-05 - val_loss: 3.0541e-05 - val_MSE: 3.0541e-05
evaluate_model(modeltest.cae ,'best_model.csv')
Model: "ConvAE" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_29 (InputLayer) [(None, 39)] 0 model_28 (Functional) (None, 25) 12875 model_29 (Functional) (None, 39) 12889 ================================================================= Total params: 25,764 Trainable params: 25,764 Non-trainable params: 0 _________________________________________________________________ Mean squared error(train): 3.0576382e-05 Mean squared error(test): 3.0364243e-05
BSM_data = [chacha_cha300,chacha_cha400,chacha_cha600,chaneut_cha200,chaneut_cha250,pp23mt_50,pp24mt_50, gluino_1000,st1p_st1000 ]
eval_BSM = []
for i in BSM_data:
model_eval = modeltest.cae.evaluate(i ,i, verbose = False)[1]
eval_BSM.append(model_eval)
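For reference, the numbers in the table below can be printed next to their file names with a small loop; the `bsm_names` list here simply mirrors the order of `BSM_data` above:

```python
bsm_names = ['chacha_cha300_neut140_chan2b.csv', 'chacha_cha400_neut60_chan2b.csv',
             'chacha_cha600_neut200_chan2b.csv', 'chaneut_cha200_neut50_chan2b.csv',
             'chaneut_cha250_neut150_chan2b.csv', 'pp23mt_50_chan2b.csv',
             'pp24mt_50_chan2b.csv', 'gluino_1000.0_neutralino_1.0_chan2b.csv',
             'stlp_st1000_chan2b.csv']
for name, mse in zip(bsm_names, eval_BSM):
    print(f"{name:45s} MSE = {mse:.3e}")
```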
| Model | File Name | MSE of autoencoder model on BSM data |
|---|---|---|
| SUSY chargino-chargino process | chacha_cha300_neut140_chan2b.csv | 4.813e-05 |
| | chacha_cha400_neut60_chan2b.csv | 7.695e-05 |
| | chacha_cha600_neut200_chan2b.csv | 7.856e-05 |
| SUSY chargino-neutralino processes | chaneut_cha200_neut50_chan2b.csv | 0.000153 |
| | chaneut_cha250_neut150_chan2b.csv | 0.000150 |
| $Z'$ decay to leptons | pp23mt_50_chan2b.csv | 0.000351 |
| | pp24mt_50_chan2b.csv | 0.000903 |
| Gluino and RPV SUSY | gluino_1000.0_neutralino_1.0_chan2b.csv | 0.00233 |
| | stlp_st1000_chan2b.csv | 0.00121 |
As seen from the table, the SUSY chargino-chargino processes are more similar to the Standard Model than the other models are. The gluino and RPV SUSY processes are the most distinct from the Standard Model.
The MSE function below returns the mean squared error for every event in a dataset.
def MSE(original, prediction):
mse = np.square(original - prediction)
mse = np.mean(mse, axis = 1)
return mse
X_predict = modeltest.cae.predict(X_test) # prediction of model on testing dataset of SM
mse_train = MSE(X_test , X_predict) # mse of events in testing dataset of SM
C:\Users\boazm\anaconda3\envs\daml\lib\site-packages\keras\engine\training_v1.py:2067: UserWarning: `Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically. updates=self.state_updates,
The histogram_threshold function below calculates the MSE of every event in a BSM dataset and plots its distribution together with the MSE distribution of the SM testing dataset.
def histogram_threshold(test_data , mse_train_data, title, y_limit =100000 ):
test_predict = modeltest.cae.predict(test_data)
mse_test = MSE(test_data, test_predict)
fig, ax = plt.subplots()
ax.hist(mse_train_data, 1000, density=True, facecolor='g', alpha=0.75, label='MSE for SM data')
ax.hist(mse_test, 1000, density=True, facecolor='b', alpha=0.75, label = 'MSE for BSM')
plt.xlabel('Mean square error')
plt.ylabel('Normalised counts')
plt.title(title)
ax.legend()
ax.set_xlim(0, 0.0005)
ax.set_ylim(0, y_limit)
plt.show()
The threshold_plot function below defines a range of thresholds. For each threshold, it calculates the fraction of SM events that are below the threshold and the fraction of BSM events that are above the threshold. These fractions are the background and signal efficiency, respectively. The function then plots how both efficiencies change with the threshold.
def threshold_plot(mse_train, test_data):
test_predict = modeltest.cae.predict(test_data)
threshold1 = np.linspace(0, 0.00009, 1000) # defining the thresholds
mse_test = MSE(test_data, test_predict)
background_eff = []
signal_eff = []
for i in threshold1:
background_eff.append(np.sum(mse_train < i)/mse_train.shape[0]) # background efficiency: fraction of SM events below the threshold
signal_eff.append(np.sum(mse_test > i)/mse_test.shape[0]) # signal efficiency: fraction of BSM events above the threshold
diff = np.array(background_eff) - np.array(signal_eff)
x_intersect = threshold1[np.argmin(np.abs(diff))] # getting the intersect of curves
# Create a figure and subplots
fig, ax = plt.subplots()
# Plot the first line graph
ax.plot(threshold1, background_eff, color='b', label='background_efficiency')
# Plot the second line graph
ax.plot(threshold1, signal_eff, color = 'r', label='signal_efficiency')
ax.axvline(x=x_intersect, color='g', linestyle='--', label = 'Threshold: ' + str(round(x_intersect,7)))
plt.xlabel('threshold')
plt.ylabel('efficiency')
# Add a legend
ax.legend()
# Show the plot
plt.show()
The best threshold of the model on each of the BSM datasets was obtained by getting the intersection of the background and signal efficiency curves. The intersection point balances the trade-off between the identification of true BSM events and the rejection of false SM events.
Below, I have plotted the histogram of the mse of the test SM data and the histogram of the mse of the BSM data on the same axis. This gives me an idea of the threshold range to scan.
Also, I have plotted the signal and background efficiency on the same axis to show the threshold.
histogram_threshold(BSM_data[0], mse_train, title = "SUSY chargino-chargino process---chacha_cha300_neut140_chan2b.csv")
threshold_plot(mse_train, BSM_data[0])
histogram_threshold(BSM_data[1], mse_train, title = "SUSY chargino-chargino process---chacha_cha400_neut60_chan2b.csv")
threshold_plot(mse_train, BSM_data[1])
histogram_threshold(BSM_data[2], mse_train, title = "SUSY chargino-chargino process---chacha_cha600_neut200_chan2b.csv")
threshold_plot(mse_train, BSM_data[2])
histogram_threshold(BSM_data[3], mse_train, title = "SUSY chargino-neutralino processes---chaneut_cha200_neut50_chan2b.csv")
threshold_plot(mse_train, BSM_data[3])
histogram_threshold(BSM_data[4], mse_train, title = "SUSY chargino-neutralino processes---chaneut_cha250_neut150_chan2b.csv")
threshold_plot(mse_train, BSM_data[4])
histogram_threshold(BSM_data[5], mse_train, title = "Z decay to leptons---pp23mt_50_chan2b.csv")
threshold_plot(mse_train, BSM_data[5])
histogram_threshold(BSM_data[6], mse_train, title = "Z decay to leptons-pp24mt_50_chan2b.csv")
threshold_plot(mse_train, BSM_data[6])
histogram_threshold(BSM_data[7], mse_train, title = "Gluino and RPV SUSY---gluino_1000.0_neutralino_1.0_chan2b.csv", y_limit= 10000)
threshold_plot(mse_train, BSM_data[7])
histogram_threshold(BSM_data[8], mse_train, title = "Gluino and RPV SUSY---stlp_st1000_chan2b.csv", y_limit= 10000)
threshold_plot(mse_train, BSM_data[8])
| Model | File Name | Best Threshold |
|---|---|---|
| SUSY chargino-chargino process | chacha_cha300_neut140_chan2b.csv | 9.1e-06 |
| | chacha_cha400_neut60_chan2b.csv | 9.6e-06 |
| | chacha_cha600_neut200_chan2b.csv | 9.9e-06 |
| SUSY chargino-neutralino processes | chaneut_cha200_neut50_chan2b.csv | 9.6e-06 |
| | chaneut_cha250_neut150_chan2b.csv | 9.5e-06 |
| $Z'$ decay to leptons | pp23mt_50_chan2b.csv | 9.5e-06 |
| | pp24mt_50_chan2b.csv | 1.03e-05 |
| Gluino and RPV SUSY | gluino_1000.0_neutralino_1.0_chan2b.csv | 4.83e-05 |
| | stlp_st1000_chan2b.csv | 2.46e-05 |
The "Best Threshold" column shows the threshold for the mean square error of events in each BSM dataset. Above each respective threshold for the corresponding BSM dataset, the event is considered a non-SM event.
For an event whose BSM dataset is not known, the event may be considered a non-SM event if it's mse is above 4.83e-05.
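As a simple sketch of how such a cut could be applied in practice (the function below is my own illustration; the 4.83e-05 default is just the largest threshold from the table above):

```python
def flag_anomalies(data, model, threshold=4.83e-5):
    """Return a boolean mask marking events whose reconstruction MSE exceeds the threshold."""
    reconstruction = model.predict(data)
    event_mse = np.mean(np.square(data - reconstruction), axis=1)
    return event_mse > threshold

# Example: fraction of gluino events flagged as non-SM by this cut.
flags = flag_anomalies(gluino_1000, modeltest.cae)
print("Fraction flagged as anomalous:", flags.mean())
```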
In order to investigate which particular features make some of the SM events look more anomalous, I have added an extra column to the SM testing dataset containing the reconstruction error of each event. The testing dataset was then sorted in ascending order of the mean squared error. The 1000 events with the smallest MSE were then selected and their features were compared to those of the 1000 events with the largest MSE.
X_predict = modeltest.cae.predict(X_test) # prediction of the model
mse_train = MSE(X_test , X_predict) # getting the mse of the events in the testing dataset
X_reversed = X_test*(np.max(bgb.data, axis=0) - np.min(bgb.data, axis=0)) + np.min(bgb.data, axis=0) # un-scaling the testing dataset (equivalent to scaler1.inverse_transform(X_test))
X_test_new = np.column_stack((X_reversed, mse_train)) #adding the mse to each event in a column
#sorting the dataset in ascending order of mse
X_test_sorted = X_test_new[X_test_new[:,-1].argsort()]
#getting the 1000 best events in the testing dataset
X_test_best = X_test_sorted[:1000]
#getting the 1000 worst events in the testing dataset
X_test_worst = X_test_sorted[-1000:]
# quick, unlabelled look at the METphi distribution (column 6) for the best and worst events; a labelled version of this comparison appears further below
n, bins, patches = plt.hist(X_test_best[:,6], 50, density=True, facecolor='g', alpha=0.75)
n, bins, patches = plt.hist(X_test_worst[:,6], 50, density=True, facecolor='b', alpha=0.75)
The code below investigates whether there is a trend between the total number of particles in an event and how anomalous the event is.
particle_counts_best = np.sum(X_test_best[:, :5], axis=1)
particle_counts_worst = np.sum(X_test_worst[:, :5], axis=1)
fig, ax = plt.subplots()
ax.hist(particle_counts_best, 100, density=True, facecolor='g', alpha=0.75, label='Best events')
ax.hist(particle_counts_worst, 100, density=True, facecolor='b', alpha=0.75, label = 'Worst events')
plt.xlabel('Number of particles in an event')
plt.ylabel('Normalised counts')
plt.title('Comparing the effect of the number of particles in an event on the performance of the model')
ax.legend()
plt.show()
The histogram shows that events with five or fewer particles are more SM-like than events with more particles. If I had more time, I would train the model on SM data containing only events with five or fewer particles and compare the performance of the model; a sketch of that selection follows below.
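A minimal sketch of that follow-up selection (assuming, as in the processing above, that the first five columns of the unscaled SM array `bgb.data` are the particle counts):

```python
# Keep only SM events with five or fewer reconstructed particles.
n_particles = np.sum(bgb.data[:, :5], axis=1)
low_mult_data = bgb.data[n_particles <= 5]
print(low_mult_data.shape)  # this subset, rescaled with a fresh MinMaxScaler, could be used to retrain the AE
```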
The code below investigates whether there is a significant difference between the METphi of the best events and the METphi of the worst events.
fig, ax = plt.subplots()
ax.hist(X_test_best[:,6], 100, density=True, facecolor='g', alpha=0.75, label='Best events')
ax.hist(X_test_worst[:,6], 100, density=True, facecolor='b', alpha=0.75, label = 'Worst events')
plt.xlabel('METphi')
plt.ylabel('Normalised counts')
plt.title('Comparing the effect of METphi on the performance of the model')
ax.legend()
plt.show()
As shown by the histogram, the METphi distributions of the best and worst events are almost identical. This suggests that the METphi value has little effect on how anomalous an event is.
The code below investigates whether there is a significant difference between the MET of the best events and the MET of the worst events.
fig, ax = plt.subplots()
ax.hist(X_test_best[:,5], 100, density=True, facecolor='g', alpha=0.75, label='Best events')
ax.hist(X_test_worst[:,5], 100, density=True, facecolor='b', alpha=0.75, label = 'Worst events')
plt.xlabel('log(MET)')
plt.ylabel('Normalised counts')
plt.title('Comparing the effect of MET on the performance of the model')
ax.legend()
plt.show()
From the histogram, it is clear that events with a log(MET) value greater than 12.5 are almost exclusively among the events with the worst MSE. Hence, the effect of the MET value on how anomalous an event is becomes significant when the MET is very large.
The mean squared error of the model on the Standard Model dataset is 4.82e-05. The autoencoder model was able to identify the gluino and RPV SUSY processes as BSM, with reconstruction errors of 0.00233 and 0.00121 for the two channels. For the other processes, the reconstruction errors of their events were close to those of the SM events. Hence, either the predictions from these models differ only slightly from the SM predictions, or the autoencoder failed to distinguish them.
To improve on this project, I would have trained the model using only the four most energetic particles in each event. I would also have tried different models, such as variational autoencoders and convolutional variational autoencoders.