Contents¶

  • Feature selection
    • Variance threshold
    • Filter-based methods
      • Pearson correlation
      • Chi-square Test
    • Information criteria
      • AIC
      • BIC
    • Wrapper-based methods
      • Recursive feature elimination
      • Forward feature selection
      • Backward Elimination
    • Embedded methods
      • Ridge Regression
      • Tree-based
      • Lasso
    • Combined results
  • Predictive analysis

Feature Selection ¶

Different feature selection methods can be used to determine whether audio and lyrics features can predict popularity, measured by number of streams, and to identify the best features for predicting song popularity. If any of the features prove useful, a predictive model can be built from them. Performing feature selection to reduce the number of features in a model helps avoid overfitting (fitting the training data well but failing to generalize to new samples), simplifies the model, and excludes non-informative features.

In [1]:
# IMPORT DEPENDENCIES 
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import plotly.graph_objects as go
from plotly.subplots import make_subplots
In [2]:
# READ DATA 
song_data = pd.read_csv('../Data/merged_finaltop100_revised.csv') 
# REMOVE ROWS WITH NULL VALUES 
song_data = song_data.dropna()
song_data.head()
Out[2]:
Unnamed: 0 track_id artist_names track_name source rank weeks_on_chart streams country danceability ... duration_ms time_signature album_release_date lyrics lyrics_trans continent iso_alpha3 len_words_orig len_words_trans lyrics_clean
0 0 0yLdNVWF3Srea0uzk55zFn Miley Cyrus Flowers Columbia 1 5 124198 United Arab Emirates 0.707 ... 200455.0 4.0 2023-01-13 We were good, we were gold\nKinda dream that c... we were good we were gold kinda dream that can... Asia ARE 334 334 good gold dream sell right til build home watc...
1 1 1Qrg8KqiBpW07V7PNxwwwL SZA Kill Bill Top Dawg Entertainment/RCA Records 2 10 106927 United Arab Emirates 0.644 ... 153947.0 4.0 2022-12-08 I'm still a fan even though I was salty\nHate ... im still a fan even though i was salty hate to... Asia ARE 362 362 fan even though salty hate see broad know happ...
2 2 6AQbmUe0Qwf5PZnt4HmTXv PinkPantheress, Ice Spice Boy's a liar Pt. 2 Warner Records 3 2 83627 United Arab Emirates 0.696 ... 131013.0 4.0 2023-02-03 Take a look inside your heart\nIs there any ro... take a look inside your heart is there any roo... Asia ARE 372 372 take look inside heart room room would hold br...
3 3 0WtM2NBVQNNJLh6scP13H8 Rema, Selena Gomez Calm Down (with Selena Gomez) Mavin Records / Jonzing World 4 25 79714 United Arab Emirates 0.801 ... 239318.0 4.0 2022-08-25 Vibez\nOh, no\nAnother banger\nBaby, calm down... vibez oh no another banger baby calm down calm... Asia ARE 495 495 another banger baby calm calm girl body put he...
4 4 2dHHgzDwk4BJdRwy9uXhTO Metro Boomin, The Weeknd, 21 Savage Creepin' (with The Weeknd & 21 Savage) Republic Records 5 11 79488 United Arab Emirates 0.715 ... 221520.0 4.0 2022-12-02 Ooh, ooh-ooh\nOoh-ooh-ooh, ooh, ooh-ooh (Just ... ooh oohooh oohoohooh ooh oohooh just cant beli... Asia ARE 458 456 believe man want somebody say saw person kiss ...

5 rows × 30 columns

In [86]:
# CHANGE KEY TO CATEGORICAL VARIABLE 
song_data['key'] = song_data.key.astype('category')

I. Variance Threshold ¶

Variance threshold is a widely used technique for feature selection, which involves eliminating features with low variance. Such features are characterized by having nearly constant values or showing minimal variation across the data points. By utilizing the variance of the features, we can identify and exclude the features that do not meet the specified threshold. Using this method, we assume that features with higher variance provide more useful information. It is important to note that this method does not take into account the relationship between the target and features.

In [87]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import VarianceThreshold
In [88]:
# CHOOSE QUANTITATIVE VARIABLES 
X = song_data[['valence', 'mode', 'loudness', 
                'acousticness', 'tempo', 'energy', 'liveness', 
                'key', 'duration_ms', 'instrumentalness', 'danceability', 
                'speechiness', 'len_words_orig', 'len_words_trans']]
In [89]:
# SELECT FEATURES USING 0 VARIANCE THRESHOLD 
var_thresh = VarianceThreshold(threshold =0.0) #set threshold to 0.0 
var_thresh.fit(X) 
print(var_thresh.get_support())
print(var_thresh.fit_transform(X).var(axis=0)) #print variances 
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
[4.83936367e-02 2.49423408e-01 5.76396379e+00 5.73799561e-02
 8.04333374e+02 2.28960102e-02 1.38679966e-02 1.30240889e+01
 2.15771595e+09 8.49994820e-03 1.81675201e-02 9.17713769e-03
 2.88829014e+04 2.79634791e+04]

This result shows that all features have non-zero variance (True = retained), so none of them are removed at this threshold.
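As a minimal sketch (not part of the original analysis), a stricter cutoff would drop near-constant features; the 0.01 threshold below is a hypothetical value chosen purely for illustration and reuses the X defined above.

from sklearn.feature_selection import VarianceThreshold

var_thresh_01 = VarianceThreshold(threshold=0.01)       # hypothetical, stricter cutoff
X_reduced = var_thresh_01.fit_transform(X)              # X as defined above
kept = X.columns[var_thresh_01.get_support()].tolist()  # features whose variance is at least 0.01
print(f"{X_reduced.shape[1]} of {X.shape[1]} features kept: {kept}")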

II. Filter-based methods ¶

Unlike wrapper methods, filter-based methods do not use machine learning models to determine whether a feature is useful. They instead rely on simple statistical measures to rank the importance of features.

1. Pearson Correlation ¶

In feature selection, Pearson's correlation coefficient can be used to identify the features that are most strongly correlated with the target variable. The absolute values of Pearson's correlations between the target (streams) and the quantitative features (audio & lyrics features) can be used to choose the top n features. In this case, the top 6 features are chosen, with absolute correlations ranging from approximately 0.03 to 0.05.
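For reference, Pearson's correlation between a feature $x$ and the target $y$ over $n$ songs is

$r_{xy} = \dfrac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^2}}$

It ranges from -1 to 1; its absolute value is used below so that strong negative correlations rank alongside strong positive ones.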

In [90]:
corr = [] #list of correlations 
feature_names = ['valence', 'mode', 'loudness', 
                'acousticness', 'tempo', 'energy', 'liveness', 
                'key', 'duration_ms', 'instrumentalness', 'danceability', 
                'speechiness', 'len_words_orig', 'len_words_trans'] #list of features 
In [91]:
# COMPUTE CORRELATION BETW EACH FEATURE AND TARGET VARIABLE  
corr = [np.corrcoef(song_data[i], song_data['streams'])[0,1] for i in feature_names] 
corr = np.abs(corr) #absolute value of Pearson's correlation 
corr_feature = X.iloc[:,np.argsort(np.abs(corr))[-6:]].columns.tolist() #chosen top 6 features 
corr_support = [True if i in corr_feature else False for i in feature_names]
print(corr_support)
[True, False, False, True, False, False, False, False, False, False, True, True, True, True]
In [92]:
# DISPLAY RESULTS AS DATAFRAME 
corr_df = pd.DataFrame({'feature_names': feature_names, 
                        'abs_pearsoncorr': corr, 'support':corr_support })
corr_df.sort_values('abs_pearsoncorr', ascending=False)
Out[92]:
feature_names abs_pearsoncorr support
13 len_words_trans 0.049183 True
11 speechiness 0.047060 True
12 len_words_orig 0.038529 True
3 acousticness 0.036375 True
0 valence 0.029379 True
10 danceability 0.028526 True
6 liveness 0.020586 False
1 mode 0.020552 False
4 tempo 0.019645 False
2 loudness 0.019227 False
9 instrumentalness 0.018003 False
7 key 0.013108 False
8 duration_ms 0.008091 False
5 energy 0.005559 False

Generally, it appears that audio and lyrics features are weakly correlated with the number of streams.

2. Chi-square Test ¶

We can also conduct a test of significance called the Chi-Square Test to determine whether the association between the target and each of the quantitative features is statistically significant. After calculating the chi-square metric between the target (streams) and the quantitative features (audio & lyrics features), we can choose the features with the largest chi-square values. The following explains what small and large chi-square values mean.

  • Observed counts are close to expected counts when the two variables are independent $\rightarrow$ smaller chi-square value
  • Observed counts are far from expected counts when the two variables are dependent $\rightarrow$ higher chi-square value

Higher chi-square values mean a feature is more dependent on the target, making it a candidate for model training. For this test, we use the following hypotheses.

  • Null hypothesis: $H_0$: target and a feature are independent (i.e. no significant association)

  • Alternative hypothesis: $H_1$: target and a feature are not independent
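For reference, the chi-square statistic compares observed counts $O_i$ with the counts $E_i$ expected under independence:

$\chi^2 = \sum_{i} \dfrac{(O_i - E_i)^2}{E_i}$

The larger the discrepancy between observed and expected counts, the larger the statistic and the stronger the evidence against $H_0$.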

In [93]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
In [94]:
# GET VARIABLES 
X = song_data[feature_names] #independent variables 
y = song_data['streams'] #target variable
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)
In [95]:
# GET CHI-SQUARE VALUES FOR ALL FEATURES 
chi_scores = chi2(X_norm,y) #compute chi-sq values 
scores_df = pd.DataFrame({'feature_names': feature_names, 
                          'chi-square_values': chi_scores[0], 
                          'p_values': chi_scores[1]})
print(scores_df.sort_values('chi-square_values', ascending= False))
       feature_names  chi-square_values  p_values
9   instrumentalness        3368.369840       1.0
1               mode        3207.603840       1.0
3       acousticness        1540.538880       1.0
7                key        1520.387532       1.0
11       speechiness         902.343689       1.0
0            valence         717.389487       1.0
6           liveness         662.960253       1.0
4              tempo         493.924591       1.0
10      danceability         333.264595       1.0
5             energy         261.738070       1.0
8        duration_ms         191.133601       1.0
12    len_words_orig         175.036027       1.0
13   len_words_trans         157.826374       1.0
2           loudness         125.937553       1.0
In [18]:
# GET TOP 6 FEATURES 
chi_selector = SelectKBest(chi2, k=6) #get top 6 features 
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(f"{len(chi_feature)} selected features: {chi_feature}")
6 selected features: ['valence', 'mode', 'acousticness', 'key', 'instrumentalness', 'speechiness']

Based on the p-values obtained above, which equal 1.0 for all features, none of the features has a statistically significant association with the target variable (streams). This is consistent with the weak correlations obtained in the previous section.

Information criteria ¶

Information criteria are commonly used in feature selection to evaluate how much information a candidate set of features provides about the target variable. They are rooted in information theory, where entropy measures the uncertainty in a random variable. AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion) are two such measures that are often used in model selection; both reward goodness of fit while penalizing model complexity.
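Concretely, for a model with $k$ estimated parameters, $n$ observations, and maximized likelihood $\hat{L}$,

$\mathrm{AIC} = 2k - 2\ln\hat{L} \qquad \mathrm{BIC} = k\ln(n) - 2\ln\hat{L}$

Lower values are better for both; BIC penalizes additional parameters more heavily than AIC whenever $\ln(n) > 2$, so it tends to select smaller models.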

AIC ¶

In [96]:
# IMPORT DEPENDENCIES 
import pandas as pd
import statsmodels.api as sm
import itertools
In [97]:
# CONVERT INDEPENDENT FEATURES TO DATAFRAME 
xnorm_df = pd.DataFrame(X_norm, columns= X.columns)
xnorm_df
Out[97]:
valence mode loudness acousticness tempo energy liveness key duration_ms instrumentalness danceability speechiness len_words_orig len_words_trans
0 0.648231 1.0 0.754958 0.064286 0.329883 0.672251 0.015283 0.000000 0.250501 0.000005 0.644617 0.051979 0.107597 0.108410
1 0.406654 1.0 0.689064 0.052994 0.164828 0.728436 0.151985 0.727273 0.167554 0.150470 0.562905 0.018956 0.116726 0.117608
2 0.871795 1.0 0.572892 0.256352 0.414990 0.805431 0.244322 0.454545 0.126652 0.000134 0.630350 0.031950 0.119987 0.120894
3 0.813520 1.0 0.714133 0.388602 0.267317 0.802310 0.102101 1.000000 0.319813 0.000699 0.766537 0.017763 0.160091 0.161301
4 0.146006 0.0 0.677108 0.424207 0.215847 0.608782 0.068351 0.090909 0.288071 0.000000 0.654994 0.030043 0.148027 0.148489
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6804 0.762662 1.0 0.732994 0.646996 0.795130 0.587972 0.093611 0.727273 0.278226 0.000005 0.670558 0.258464 0.191392 0.192838
6805 0.776436 1.0 0.588462 0.026849 0.454947 0.458953 0.072914 0.000000 0.252187 0.000000 0.817121 0.047330 0.160743 0.161958
6806 0.422547 0.0 0.460936 0.014336 0.295818 0.436063 0.006686 0.727273 0.634991 0.000860 0.810636 0.047210 0.082165 0.134691
6807 0.296461 0.0 0.312882 0.016880 0.301495 0.451670 0.076098 0.090909 0.696106 0.142111 0.767834 0.032308 0.132377 0.151774
6808 0.595253 0.0 0.372382 0.011895 0.295824 0.698262 0.240076 0.909091 0.568898 0.001766 0.810636 0.025632 0.133681 0.169185

6809 rows × 14 columns

In [98]:
# CREATE ALL POSSIBLE SUBSETS OF PREDICTORS 
subsets = []
for k in range(1, X.shape[1] + 1):
    subsets.extend(itertools.combinations(range(X.shape[1]), k))

# INITIALIZE VARIABLES FOR STORING RESULTS 
best_model = None
best_aic = np.inf
best_bic = np.inf
best_r2 = -np.inf
best_adjr2 = -np.inf
In [99]:
# LOOP THROUGH ALL SUBSETS AND FIT LINEAR REGRESSION MODELS 
for subset in subsets:
    model = sm.OLS(y, sm.add_constant(X_norm[:, list(subset)])).fit()
    aic = model.aic
    bic = model.bic
    r2 = model.rsquared
    adjr2 = model.rsquared_adj
    #Check if current model is the best so far
    if aic < best_aic: #lower aic = best model 
        best_model = model
        best_aic = aic
        best_bic = bic
        best_r2 = r2
        best_adjr2 = adjr2
In [100]:
# BEST FEATURES USING AIC 
variables = list(best_model.params.index)
variables.remove('const')
results = pd.DataFrame({'AIC': [best_aic], 'BIC': [best_bic],  'R2': [best_r2], 'AdjR2': [best_adjr2]}, 
                       index=[str(variables)])
results
Out[100]:
AIC BIC R2 AdjR2
['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8'] 207483.680486 207545.114491 0.008191 0.007024
In [101]:
# CONVERT TO DATAFRAME 
aic_df = pd.DataFrame({'feature_names': X.columns.tolist(),  'support': True})
aic_df.loc[8:, 'support'] = False 
print(aic_df)
       feature_names  support
0            valence     True
1               mode     True
2           loudness     True
3       acousticness     True
4              tempo     True
5             energy     True
6           liveness     True
7                key     True
8        duration_ms    False
9   instrumentalness    False
10      danceability    False
11       speechiness    False
12    len_words_orig    False
13   len_words_trans    False

According to AIC, the best subset contains 8 features (x1 through x8, i.e., the first eight columns, valence through key). However, the extremely low R-squared (0.008191) and adjusted R-squared values suggest that the model explains very little of the variability in the data. In other words, only about 0.82% of the variation in the dependent variable is explained by the independent variables included in the model. This implies that the selected independent variables do not have a strong relationship with the dependent variable (streams), which is consistent with the previous results.

BIC ¶

In [102]:
# LOOP THROUGH ALL SUBSETS AND FIT LINEAR REGRESSION MODELS 
best_model, best_aic, best_bic = None, np.inf, np.inf #re-initialize so the BIC search is independent of the AIC run
for subset in subsets:
    model = sm.OLS(y, sm.add_constant(X_norm[:, list(subset)])).fit()
    aic = model.aic
    bic = model.bic
    r2 = model.rsquared
    adjr2 = model.rsquared_adj
    #check if current model is the best so far
    if bic < best_bic:  #lower bic = best model 
        best_model = model
        best_aic = aic
        best_bic = bic
        best_r2 = r2
        best_adjr2 = adjr2
In [103]:
# BEST FEATURES USING BIC 
variables = list(best_model.params.index)
variables.remove('const')
results = pd.DataFrame({'BIC': [best_bic], 'AIC': [best_aic],  'R2': [best_r2], 'AdjR2': [best_adjr2]}, 
                       index=[str(variables)])
results
Out[103]:
BIC AIC R2 AdjR2
['x1'] 207522.844669 207509.192668 0.002419 0.002272
In [104]:
# CONVERT TO DATAFRAME 
bic_df = pd.DataFrame({'feature_names': X.columns.tolist(),  'support': False})
bic_df.loc[0, 'support'] = True  
print(bic_df)
       feature_names  support
0            valence     True
1               mode    False
2           loudness    False
3       acousticness    False
4              tempo    False
5             energy    False
6           liveness    False
7                key    False
8        duration_ms    False
9   instrumentalness    False
10      danceability    False
11       speechiness    False
12    len_words_orig    False
13   len_words_trans    False

Using BIC, only 1 feature is selected. However, the R-squared and adjusted R-squared values are even lower than those of the AIC-selected model. Hence, the chosen independent variable does not have a strong relationship with the dependent variable (streams).

Wrapper-based methods ¶

Wrapper-based methods are useful in feature selection because they use a machine learning algorithm to evaluate the performance of each subset of features. They involve training and evaluating a model with different subsets of features, and the selection of features is based on the performance of the model on a validation set. Wrapper methods can provide a more accurate estimate of the performance of each subset of features compared to filter-based methods, which use simple statistical measures to rank the importance of features. By using a machine learning algorithm, wrapper methods can capture complex interactions between features and identify non-linear relationships between features and the outcome variable.

Additionally, wrapper methods allow for the selection of subsets of features that are specific to the machine learning algorithm being used, rather than relying on a single criterion, such as variance or correlation, as in filter-based methods. This can lead to more efficient and accurate models that are tailored to the specific problem being solved. However, wrapper-based methods can be computationally expensive and time-consuming, as they require training and evaluating a model for each subset of features. In addition, wrapper methods can be prone to overfitting, as they can select features that are specific to the training data and may not generalize well to new data.

1. Recursive Feature Elimination ¶

RFE (recursive feature elimination) reduces model complexity by removing features one by one until the desired number of features remains, making it a form of backward elimination. We can use RFE to select features by recursively considering smaller and smaller sets of features. RFE can be used with different estimators, such as logistic regression or linear regression. Since the target variable in this case is continuous, RFE is used with a linear regression estimator.

RFE - Linear Regression

In [105]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
In [106]:
# PERFORM RFE AND SELECT TOP 6 FEATURES 
estimator = LinearRegression() #define estimator 
rfe_lr = RFE(estimator, n_features_to_select=6, step=1)
rfe_lr.fit(X_norm, y)
rfe_lr_support = rfe_lr.get_support()
rfe_lr_feature = X.loc[:,rfe_lr_support].columns.tolist()
#print(f"Initial features: {X.columns.tolist()}")
#print(f"Feature Ranking: {rfe_lr.ranking_}")
In [107]:
# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 
                          'feature_ranking': rfe_lr.ranking_, 'support': rfe_lr_support})
print(df.sort_values('feature_ranking'))
print(f"{len(rfe_lr_feature)} selected features: {rfe_lr_feature} ")
       feature_names  feature_ranking  support
0            valence                1     True
2           loudness                1     True
3       acousticness                1     True
11       speechiness                1     True
12    len_words_orig                1     True
13   len_words_trans                1     True
10      danceability                2    False
6           liveness                3    False
5             energy                4    False
9   instrumentalness                5    False
8        duration_ms                6    False
4              tempo                7    False
7                key                8    False
1               mode                9    False
6 selected features: ['valence', 'loudness', 'acousticness', 'speechiness', 'len_words_orig', 'len_words_trans'] 
In [108]:
# EVALUATE PERFORMANCE 
X_selected = rfe_lr.transform(X_norm)
#split transformed dataset & target variable into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
estimator.fit(X_train, y_train) #fit a regression model on training set 
y_pred = estimator.predict(X_test) #predict on test set
mse = mean_squared_error(y_test, y_pred) #compute mse
print(f'MSE: {mse}')
MSE: 1046198083685.7289

RFECV - DecisionTreeRegressor

RFECV is a feature selection method that combines the recursive feature elimination (RFE) algorithm with cross-validation. It recursively eliminates less important features based on their coefficients or importance scores and then evaluates the performance of the resulting model using cross-validation. The RFE algorithm recursively eliminates features from the dataset until the desired number of features is reached, while RFECV uses cross-validation to estimate the performance of the model with different numbers of features. RFECV is advantageous for the following reasons:

  • RFECV avoids overfitting by using cross-validation to estimate the model's performance.
  • RFECV helps to identify the optimal number of features to use in the model.
  • RFECV reduces the risk of selecting a suboptimal subset of features by evaluating multiple subsets using cross-validation.
  • RFECV can handle noisy or correlated features by considering the joint effect of features in the model.
In [109]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
In [110]:
# PERFORM RFECV
estimator = DecisionTreeRegressor() #use decision tree estimator 
rfecv_dtr = RFECV(estimator=estimator,step=1, cv=5)
rfecv_dtr.fit(X_norm, y)
#print(f"Feature Ranking: {rfecv_dtr.ranking_}")
#print(f"Initial features: {X.columns.tolist()}")
#print(rfecv_dtr.support_)
Out[110]:
RFECV(cv=5, estimator=DecisionTreeRegressor())
In [111]:
# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 
                          'feature_ranking': rfecv_dtr.ranking_, 'support': rfecv_dtr.support_})
print(df.sort_values('feature_ranking'))
print(f"Number of features selected with CV: {rfecv_dtr.n_features_}")
       feature_names  feature_ranking  support
4              tempo                1     True
0            valence                2    False
8        duration_ms                3    False
2           loudness                4    False
13   len_words_trans                5    False
3       acousticness                6    False
6           liveness                7    False
12    len_words_orig                8    False
11       speechiness                9    False
10      danceability               10    False
5             energy               11    False
9   instrumentalness               12    False
7                key               13    False
1               mode               14    False
Number of features selected with CV: 1
In [112]:
# EVALUATE PERFORMANCE
X_selected = rfecv_dtr.transform(X_norm)
#split transformed dataset & target variable into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
estimator.fit(X_train, y_train) #fit a regression model on training set 
y_pred = estimator.predict(X_test) #predict on test set
mse = mean_squared_error(y_test, y_pred) #compute mse
print(f'MSE: {mse}')
MSE: 1683888684985.9526
In [113]:
# PERFORM RFECV WITH SPECIFIED SCORING AND DIFFERENT STEP AND CV
rfecv_dtr = RFECV(estimator=estimator,scoring="neg_mean_squared_error", step=2, cv=2)
rfecv_dtr.fit(X_norm, y)
df = pd.DataFrame({'feature_names': X.columns.tolist(), 
                          'feature_ranking': rfecv_dtr.ranking_, 'support': rfecv_dtr.support_})
print(df.sort_values('feature_ranking'))
print(f"Number of features selected with CV: {rfecv_dtr.n_features_}")
       feature_names  feature_ranking  support
4              tempo                1     True
3       acousticness                2    False
0            valence                3    False
6           liveness                3    False
8        duration_ms                4    False
13   len_words_trans                4    False
2           loudness                5    False
5             energy                5    False
11       speechiness                6    False
12    len_words_orig                6    False
9   instrumentalness                7    False
10      danceability                7    False
1               mode                8    False
7                key                8    False
Number of features selected with CV: 1
In [114]:
# EVALUATE PERFORMANCE
X_selected = rfecv_dtr.transform(X_norm)
#split transformed dataset & target variable into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
estimator.fit(X_train, y_train) #fit a regression model on training set 
y_pred = estimator.predict(X_test) #predict on test set
mse = mean_squared_error(y_test, y_pred) #compute mse
print(f'MSE: {mse}')
MSE: 1683888684985.9526

The MSE values computed for RFE and RFECV with different estimators are extremely high, which suggests that none of the independent variables is useful for predicting song popularity measured by number of streams. This also supports the results obtained with the previous methods.

2. Forward Feature Selection ¶

Forward selection is a feature selection method that works by iteratively adding one feature at a time to the model until a stopping criterion is met. The algorithm starts with an empty set of features and evaluates the performance of the model with each additional feature, selecting the feature that improves the performance the most. Forward selection is helpful in feature selection because it can improve the efficiency and effectiveness of the modeling process in several ways:

  • Faster convergence
  • Reduce the dimensionality of the dataset by selecting only the most relevant features
  • Improve the interpretability of the model by selecting a subset of features that are most relevant to the outcome of interest

Disadvantages of forward selection:

  • Can overfit the data if the stopping criterion is not carefully chosen
  • Ignores important interactions between independent variables
In [115]:
# IMPORT DEPENDENCIES
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
In [116]:
# PERFORM FORWARD SELECTION AND AUTO SELECT NUMBER OF FEATURES 
sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select='auto', direction='forward')
sfs.fit(X_norm, y)
sfs_support = sfs.get_support()

# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 'support': sfs_support})
print(df)
sfs_feature = X.loc[:,sfs_support].columns.tolist()
print(f"Optimal subset size using forward feature selection: {len(sfs_feature)}")
       feature_names  support
0            valence    False
1               mode     True
2           loudness    False
3       acousticness    False
4              tempo     True
5             energy    False
6           liveness    False
7                key     True
8        duration_ms    False
9   instrumentalness     True
10      danceability    False
11       speechiness     True
12    len_words_orig     True
13   len_words_trans     True
Optimal subset size using forward feature selection: 7

3. Backward Elimination ¶

Backward elimination works similarly to RFE, with some differences:

  • RFE: Fit a model with all features $\rightarrow$ Compute variable coefficients and their importance $\rightarrow$ Eliminate the lowest-ranking variable in each iteration

  • BE: Fit a model with all features $\rightarrow$ Remove the feature whose p-value exceeds the chosen significance level $\rightarrow$ Repeat until all remaining p-values are below the significance level, i.e., removing any remaining variable would hurt the model (see the sketch below)
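The sklearn SequentialFeatureSelector used below scores candidate subsets by cross-validated model performance rather than by p-values. A minimal sketch of the p-value-based variant described above, assuming the xnorm_df and y defined earlier and a hypothetical significance level of 0.05, could look like this:

import statsmodels.api as sm

y_aligned = y.reset_index(drop=True)                     # align index with xnorm_df
features = xnorm_df.columns.tolist()                     # start with all features
while features:
    model = sm.OLS(y_aligned, sm.add_constant(xnorm_df[features])).fit()
    pvals = model.pvalues.drop('const')                  # p-value of each remaining feature
    worst = pvals.idxmax()                               # least significant feature
    if pvals[worst] < 0.05:                              # all remaining features are significant
        break
    features.remove(worst)                               # otherwise drop it and refit
print(f"Features retained by p-value-based backward elimination: {features}")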

In [117]:
# PERFORM BACKWARD SELECTION AND AUTO SELECT NUMBER OF FEATURES 
sfs2 = SequentialFeatureSelector(LinearRegression(), n_features_to_select='auto', direction='backward')
sfs2.fit(X_norm, y)
sfs2_support = sfs2.get_support()

# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 'support': sfs2_support})
print(df)
sfs2_feature = X.loc[:,sfs2_support].columns.tolist()
print(f"Optimal subset size using forward feature selection: {len(sfs2_feature)}")
       feature_names  support
0            valence    False
1               mode     True
2           loudness     True
3       acousticness    False
4              tempo     True
5             energy     True
6           liveness    False
7                key     True
8        duration_ms    False
9   instrumentalness    False
10      danceability    False
11       speechiness     True
12    len_words_orig    False
13   len_words_trans     True
Optimal subset size using backward elimination: 7

Forward selection and backward elimination each selected 7 features, but the chosen sets are not identical.

Embedded methods ¶

Embedded methods are highly useful in feature selection because they incorporate feature selection directly into the model building process. In embedded methods, feature selection is performed simultaneously with model fitting, rather than as a separate step. Embedded methods work by adding a penalty term to the objective function of the model, which encourages the model to select only the most relevant features. This penalty term is usually controlled by a hyperparameter that needs to be tuned to achieve optimal performance.

In [118]:
# IMPORT DEPENDENCIES
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

1. Tree-based - SelectFromModel ¶

RandomForestRegressor is an ensemble learning method that combines multiple decision trees to make predictions for a continuous dependent variable (streams). A higher n_estimators generally improves the performance of a random forest by reducing variance and increasing prediction stability, at the cost of more computation. One of its main advantages is its ability to handle complex, high-dimensional data with many predictor variables. It can also capture both linear and nonlinear relationships between the predictors and the dependent variable.

In [119]:
#set hyperparam n_estimators=50 
embedded_rf = SelectFromModel(RandomForestRegressor(n_estimators=50), max_features=6)
embedded_rf.fit(X_norm, y)
embedded_rf_support = embedded_rf.get_support()
embedded_rf_feature = X.loc[:,embedded_rf_support].columns.tolist()

# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 'support': embedded_rf_support})
print(df)
print(f"{len(embedded_rf_feature)} selected features: {embedded_rf_feature} ")
       feature_names  support
0            valence     True
1               mode    False
2           loudness    False
3       acousticness    False
4              tempo     True
5             energy     True
6           liveness     True
7                key    False
8        duration_ms     True
9   instrumentalness    False
10      danceability    False
11       speechiness    False
12    len_words_orig    False
13   len_words_trans     True
6 selected features: ['valence', 'tempo', 'energy', 'liveness', 'duration_ms', 'len_words_trans'] 
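To see which importances drove this selection, one could inspect the fitted random forest inside SelectFromModel; a minimal sketch, assuming embedded_rf has been fitted as above:

importances = pd.Series(embedded_rf.estimator_.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))          # importance of each feature
print(f"Selection threshold (mean importance by default): {embedded_rf.threshold_}")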

2. Lasso Regularization ¶

Lasso regularization is a method used in linear regression to prevent overfitting and improve the generalization performance of the model. It is a form of regularization that adds a penalty term to the objective function of the linear regression problem. The penalty term in Lasso regression is the sum of the absolute values of the coefficients (i.e., the L1 norm of the coefficients) multiplied by a regularization parameter (lambda or alpha). The addition of this penalty term shrinks the coefficients towards zero, leading to a sparse set of non-zero coefficients that correspond to the selected features.
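In scikit-learn's parameterization, the objective minimized by Lasso is

$\min_{\beta} \; \frac{1}{2n}\lVert y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_1$

where $\alpha$ controls the strength of the penalty; larger values of $\alpha$ drive more coefficients exactly to zero. LassoCV, used below, selects $\alpha$ by cross-validation.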

In [120]:
# SELECT FEATURES USING LASSO REG 
reg = LassoCV()
reg.fit(X_norm, y)
df = pd.DataFrame({'feature_names': feature_names, 'coefficients': reg.coef_})
df
Out[120]:
feature_names coefficients
0 valence 0.000000e+00
1 mode 8.774039e-12
2 loudness 0.000000e+00
3 acousticness 0.000000e+00
4 tempo 0.000000e+00
5 energy -0.000000e+00
6 liveness 0.000000e+00
7 key 0.000000e+00
8 duration_ms -0.000000e+00
9 instrumentalness -0.000000e+00
10 danceability -0.000000e+00
11 speechiness -0.000000e+00
12 len_words_orig -0.000000e+00
13 len_words_trans -0.000000e+00

If a feature is irrelevant, lasso penalizes its coefficient and shrinks it to 0. The result above shows that essentially all coefficients are driven to zero, so none of the features is considered relevant.

Combined results ¶

To make it easy to compare the results of the different methods that take the relationship between each feature and the target into account, they are consolidated into the dataframe presented below.

In [121]:
featureselection_df = pd.DataFrame({'Feature':feature_names,
                                    'Pearson':corr_support, 
                                    'Chi-2':chi_support, 
                                    'AIC': aic_df['support'], 
                                    'BIC': bic_df['support'], 
                                    'RFE_LinearRegression':rfe_lr_support , 
                                    'RFECV_DecisionTreeRegressor': rfecv_dtr.support_ , 
                                    'FFS_LinearRegression':sfs_support, 
                                    'BE_LinearRegression':sfs2_support,
                                    'Random Forest':embedded_rf_support})
featureselection_df['Total'] = np.sum(featureselection_df.iloc[:, 1:], axis=1) #sum Trues 
featureselection_df.sort_values('Total', ascending=False)
Out[121]:
Feature Pearson Chi-2 AIC BIC RFE_LinearRegression RFECV_DecisionTreeRegressor FFS_LinearRegression BE_LinearRegression Random Forest Total
0 valence True True True True True False False False True 6
4 tempo False False True False False True True True True 5
11 speechiness True True False False True False True True False 5
13 len_words_trans True False False False True False True True True 5
1 mode False True True False False False True True False 4
3 acousticness True True True False True False False False False 4
7 key False True True False False False True True False 4
2 loudness False False True False True False False True False 3
5 energy False False True False False False False True True 3
12 len_words_orig True False False False True False True False False 3
6 liveness False False True False False False False False True 2
9 instrumentalness False True False False False False True False False 2
8 duration_ms False False False False False False False False True 1
10 danceability True False False False False False False False False 1

Total = number of methods that selected the feature (count of True values)

Predictive Analysis ¶

In [122]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
In [123]:
# SPLIT INTO TRAINING AND TEST SETS 
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.3, random_state=1)
In [124]:
def loss(yhat,y):
    """sqr error loss"""
    return (yhat - y)**2

def fit(X,Y):
    """fit the OLS from training w/ intercept"""
    lin1 = LinearRegression(fit_intercept=True) # OLS from sklearn
    lin1.fit(X,Y) # fit OLS
    return np.append(lin1.intercept_,lin1.coef_) # return betahat

def predict(x, betahat):
    """predict for point x"""
    return betahat[0] + x @ betahat[1:]
In [125]:
# COMPUTE AVERAGE TRAINING AND TEST LOSS 
n,p = X_norm.shape #get dimensions 

# COMPUTE LOSSES ON TEST SET 
betahat = fit(X_train,y_train)
Y_hat_te = [predict(x,betahat) for x in X_test]
test_losses = [loss(yhat,y) for yhat,y in zip(Y_hat_te,y_test)]

# COMPUTE LOSSES ON TRAIN SET 
Y_hat_tr = [predict(x,betahat) for x in X_train]
train_losses = [loss(yhat,y) for yhat,y in zip(Y_hat_tr,y_train)]
In [126]:
print("train avg loss: {}\ntest avg loss: {}".format(np.mean(train_losses), np.mean(test_losses)))
print("n, p :",n,p)
train avg loss: 985083046760.4008
test avg loss: 1034618292635.4753
n, p : 6809 14

Generally, we expect the training loss to be somewhat smaller than the test loss, since the model's parameters are optimized on the training data. What matters for generalization is the size of the gap: if the training loss is much smaller than the test loss, the model is overfitting the training data and will not generalize well to new samples. Ideally, the training loss is small enough that the model fits the training data well, but not so small that it overfits and the test loss increases. In this case, the training loss is only slightly smaller than the test loss, so there is no indication of overfitting and the train/test split appears reasonable.

Scores of each feature using univariate linear regression (F-statistics from f_regression)

In [127]:
# PERFORM SELECTION TO GET SCORES 
fs = SelectKBest(score_func=f_regression, k='all')
fs.fit(X_train, y_train)
# transform train input data
X_train_fs = fs.transform(X_train)
# transform test input data
X_test_fs = fs.transform(X_test)

# PRINT SCORES FOR EACH FEATURE 
for i in range(len(fs.scores_)):
    print(f"{feature_names[i]}: { fs.scores_[i]} ")
valence: 4.866863426784755 
mode: 4.781261625930871 
loudness: 1.9683197099587608 
acousticness: 4.793979095391341 
tempo: 2.821715977917632 
energy: 0.09602678653190241 
liveness: 2.333521250752356 
key: 0.5454139312819927 
duration_ms: 0.0673084007003831 
instrumentalness: 1.6787315102411215 
danceability: 2.7773774576764736 
speechiness: 9.974707855423258 
len_words_orig: 4.840817447008443 
len_words_trans: 8.656145139661833 

Evaluate a model that uses all input features

In [128]:
# FIT LINEAR REGRESSION MODEL 
model = LinearRegression()
model.fit(X_train, y_train)

# EVALUATE MODEL 
predicted_y = model.predict(X_test) #get predicted values 
mae = mean_absolute_error(y_test, predicted_y) #mean absolute error 
mse = ((predicted_y - y_test)**2).mean() #or use sklearn mean_squared_error func
print('MAE: %.3f' % mae)
print('MSE: %.3f' % mse)
MAE: 539245.869
MSE: 1034618292635.474
In [129]:
# OBTAIN SUMMARY USING OLS() 
x = sm.add_constant(X_train) #design matrix (allow computation of intercept)
model = sm.OLS(y_train, x).fit()
print(model.summary())
print('MSE:', model.mse_model) # print MSE
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                streams   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     3.033
Date:                Thu, 09 Mar 2023   Prob (F-statistic):           0.000110
Time:                        16:18:22   Log-Likelihood:                -72572.
No. Observations:                4766   AIC:                         1.452e+05
Df Residuals:                    4751   BIC:                         1.453e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.534e+05    1.4e+05      1.805      0.071   -2.18e+04    5.29e+05
x1          1.697e+05   7.32e+04      2.318      0.021    2.62e+04    3.13e+05
x2          5.004e+04   2.94e+04      1.703      0.089   -7562.742    1.08e+05
x3          2.667e+05   1.85e+05      1.441      0.150   -9.61e+04     6.3e+05
x4          1.277e+05   7.01e+04      1.822      0.068   -9677.372    2.65e+05
x5          1.112e+05   9.37e+04      1.186      0.236   -7.26e+04    2.95e+05
x6         -1.162e+05   1.43e+05     -0.815      0.415   -3.96e+05    1.63e+05
x7          1.771e+05   1.17e+05      1.511      0.131   -5.26e+04    4.07e+05
x8          3.471e+04   4.45e+04      0.780      0.436   -5.26e+04    1.22e+05
x9          1.378e+05   1.99e+05      0.692      0.489   -2.53e+05    5.28e+05
x10        -1.472e+05   1.59e+05     -0.923      0.356    -4.6e+05    1.65e+05
x11        -7.938e+04      1e+05     -0.792      0.428   -2.76e+05    1.17e+05
x12         -2.77e+05   1.41e+05     -1.965      0.049   -5.53e+05    -620.112
x13         1.538e+06   7.84e+05      1.963      0.050    2250.959    3.07e+06
x14        -2.018e+06    8.2e+05     -2.461      0.014   -3.63e+06    -4.1e+05
==============================================================================
Omnibus:                     5349.165   Durbin-Watson:                   1.983
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           553712.657
Skew:                           5.739   Prob(JB):                         0.00
Kurtosis:                      54.542   Cond. No.                         142.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
MSE: 2997150372521.643

Evaluate a model that includes the top 7 input features from the combined results table (those selected by at least 4 methods)

In [138]:
X = song_data[['valence',  'tempo', 'speechiness', 'len_words_trans', 'mode', 'acousticness', 'key']] #inputs 
y = song_data['streams']
X_norm = MinMaxScaler().fit_transform(X) #scale features (ML can be sensitive to scale of input)
In [139]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.3, random_state=1)
In [140]:
# FIT MODEL 
model = LinearRegression()
model.fit(X_train, y_train)

# EVALUATE MODEL 
predicted_y = model.predict(X_test)
mae = mean_absolute_error(y_test, predicted_y)
mse = ((predicted_y - y_test)**2).mean() #or use sklearn mean_squared_error func
print('MAE: %.3f' % mae)
print('MSE: %.3f' % mse)
MAE: 539545.956
MSE: 1037037766985.242
In [141]:
# OBTAIN SUMMARY USING OLS() 
x = sm.add_constant(X_train) #design matrix (allow computation of intercept)
model = sm.OLS(y_train, x).fit()
print(model.summary())
print('MSE:', model.mse_model) # print MSE
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                streams   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     4.412
Date:                Thu, 09 Mar 2023   Prob (F-statistic):           6.78e-05
Time:                        16:21:26   Log-Likelihood:                -72577.
No. Observations:                4766   AIC:                         1.452e+05
Df Residuals:                    4758   BIC:                         1.452e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.556e+05   6.61e+04      5.377      0.000    2.26e+05    4.85e+05
x1          1.521e+05   6.24e+04      2.437      0.015    2.98e+04    2.74e+05
x2          1.296e+05   9.09e+04      1.427      0.154   -4.85e+04    3.08e+05
x3         -3.285e+05   1.34e+05     -2.446      0.014   -5.92e+05   -6.52e+04
x4         -4.753e+05   2.76e+05     -1.721      0.085   -1.02e+06    6.61e+04
x5          5.548e+04   2.92e+04      1.900      0.058   -1779.946    1.13e+05
x6           1.22e+05   6.02e+04      2.029      0.043    4097.666     2.4e+05
x7           3.24e+04   4.44e+04      0.730      0.466   -5.47e+04    1.19e+05
==============================================================================
Omnibus:                     5352.588   Durbin-Watson:                   1.983
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           552917.268
Skew:                           5.747   Prob(JB):                         0.00
Kurtosis:                      54.500   Cond. No.                         28.1
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
MSE: 4363608708764.5713

Based on the results obtained above using different methods, it is evident that audio and lyrics features are not very useful for predicting popularity measured by number of streams. Thus, we can conclude that the chosen features are not effective in explaining stream counts on their own. Several factors likely limit the explanatory power of this model. Since all genres are included when measuring popularity, and different genres do not share the same popular attributes, there is likely considerable noise in the hit prediction model, leading to a lower R2 value and weaker correlations.

References:

https://scikit-learn.org/stable/modules/feature_selection.html