All algorithms presented in Chapters 5 to 9 belong to the larger class of supervised learning tools. Such tools seek to unveil a mapping between predictors $\textbf{X}$ and a label $\textbf{Z}$. The supervision comes from the fact that the algorithm is explicitly asked to explain this particular variable $\textbf{Z}$. Another important part of machine learning consists of unsupervised tasks, that is, when $\textbf{Z}$ is not specified and the algorithm tries to make sense of $\textbf{X}$ on its own. Often, relationships between the components of $\textbf{X}$ are identified. This field is much too vast to be summarized in one book, let alone one chapter. The purpose here is to briefly explain in what ways unsupervised learning can be used, especially in the data pre-processing phase.
Often, it is tempting to supply all predictors to an ML-fueled predictive engine. That may not be a good idea when some predictors are highly correlated. To illustrate this, the simplest example is a regression on two variables with zero mean and the following covariance and precision matrices:
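For concreteness, and assuming unit variances so that the covariance and the correlation $\rho$ coincide (as in the discussion that follows), these matrices read

$$\boldsymbol{\Sigma}=\begin{pmatrix} 1 & \rho \\ \rho & 1 \end{pmatrix}, \qquad \boldsymbol{\Sigma}^{-1}=\frac{1}{1-\rho^2}\begin{pmatrix} 1 & -\rho \\ -\rho & 1 \end{pmatrix}.$$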
When the covariance/correlation $\rho$ increases towards 1 (the two variables are co-linear), the scaling denominator in $\boldsymbol{\Sigma}^{-1}$ goes to zero and the formula $\hat{\boldsymbol{\beta}}=\boldsymbol{\Sigma}^{-1}\textbf{X}'\textbf{Z}$ implies that one coefficient will be highly positive and one highly negative. The regression creates a spurious arbitrage between the two variables. Of course, this is very inefficient and yields disastrous results out-of-sample.
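A minimal simulation makes this tangible. The snippet below uses synthetic data (not the book's sample); the variable names, the seed and the parameter values are arbitrary choices for illustration.

import numpy as np
np.random.seed(42) # For reproducibility
n, rho = 1000, 0.999 # Sample size and (very high) correlation
x1 = np.random.normal(size=n)
x2 = rho * x1 + np.sqrt(1 - rho**2) * np.random.normal(size=n) # Nearly collinear copy of x1
y = 0.5 * x1 + np.random.normal(size=n) # Only x1 truly drives y
X = np.column_stack([x1, x2])
beta = np.linalg.solve(X.T @ X, X.T @ y) # OLS coefficients
print(beta, beta.sum()) # Erratic, offsetting coefficients; their sum stays close to 0.5

Re-running the snippet with different seeds shows coefficients that swing wildly (often with opposite signs), while their sum remains close to the true total effect of 0.5.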
We illustrate what happens when many variables are used in the regression below (Table 15.1). One illustration of the aforementioned phenomenon comes from the variables Mkt_Cap_12M_Usd and Mkt_Cap_6M_Usd, which have a correlation of 99.6% in the training sample. Both are singled out as highly significant, but their signs are contradictory. Moreover, the magnitudes of their coefficients are very close (0.20 versus 0.18), so that their net effect almost cancels out. Naturally, providing the regression with only one of these two inputs would have been wiser.
import numpy as np # Needed for np.abs below
import pandas as pd # Needed to assemble the output table
import statsmodels.api as sm # Package for clean regression output
stat = sm.OLS(training_sample['R1M_Usd'], sm.add_constant(training_sample[features])).fit() # Model: predict R1M_Usd with all features
reg_thrhld = 3 # Keep significant predictors only
boo_filter = np.abs(stat.tvalues) >= reg_thrhld # Regressors above the significance threshold
estimate = stat.params[boo_filter] # estimate
std_error = stat.bse[boo_filter] # std.error
statistic = stat.tvalues[boo_filter] # statistic
p_value = stat.pvalues[boo_filter] # p.value
significant_regressors = pd.concat([estimate, std_error, statistic, p_value], axis=1) # Put output in clean format
significant_regressors.columns = ['estimate', 'std.error', 'statistic', 'p.value'] # Renaming columns
print(significant_regressors)
                   estimate  std.error  statistic       p.value
const              0.040574   0.005343   7.594323  3.107512e-14
Ebitda_Margin      0.013237   0.003493   3.789999  1.506925e-04
Ev_Ebitda          0.006814   0.002256   3.020213  2.526288e-03
Fa_Ci              0.007231   0.002347   3.081471  2.060090e-03
Fcf_Bv             0.025054   0.005131   4.882465  1.048492e-06
Fcf_Yld           -0.015893   0.003736  -4.254127  2.099628e-05
Mkt_Cap_12M_Usd    0.204738   0.027432   7.463476  8.461142e-14
Mkt_Cap_6M_Usd    -0.179780   0.045939  -3.913443  9.101987e-05
Mom_5M_Usd        -0.018669   0.004431  -4.212972  2.521442e-05
Mom_Sharp_11M_Usd  0.017817   0.004695   3.795131  1.476096e-04
Ni                 0.015461   0.004497   3.438361  5.853680e-04
Ni_Avail_Margin    0.011814   0.003861   3.059359  2.218407e-03
Ocf_Bv            -0.019811   0.005294  -3.742277  1.824119e-04
Pb                -0.017897   0.003129  -5.720637  1.062777e-08
Pe                -0.008991   0.002354  -3.819565  1.337278e-04
Sales_Ps          -0.015786   0.004628  -3.411062  6.472325e-04
Vol1Y_Usd          0.011425   0.002792   4.091628  4.285247e-05
Vol3Y_Usd          0.008459   0.002795   3.026169  2.477060e-03
TABLE 15.1: Significant predictors in the training sample.
In fact, there are several indicators for the market capitalization and maybe only one would suffice, but it is not obvious which one is the best choice.
To further depict correlation issues, we compute the correlation matrix of the predictors below (on the training sample). Because of its dimension, we show it graphically.
import seaborn as sns # Package for plots
sns.set(rc={'figure.figsize':(16,16)}) # Setting the figsize in seaborn
sns.heatmap(training_sample[features].corr()) # Correlation matrix and plot
FIGURE 15.1: Correlation matrix of predictors.
The graph of Figure 15.1 reveals several light squares around the diagonal. For instance, the biggest square around the first third of features relates to all accounting ratios based on free cash flows. Because of this common term in their calculation, the features are naturally highly correlated. These local correlation patterns occur several times in the dataset and explain why it is not a good idea to use simple regression with this set of features.
In full disclosure, multicollinearity (when predictors are correlated) can be much less of a problem for ML tools than it is for pure statistical inference. In statistics, one central goal is to study the properties of the $\beta$ coefficients, and collinearity perturbs this kind of analysis. In machine learning, the aim is to maximize out-of-sample accuracy. If having many predictors can be helpful, then so be it. One simple example can help clarify this matter. When building a regression tree, having many predictors gives more options for the splits. If the features make sense, then they can be useful. The same reasoning applies to random forests and boosted trees. What matters is that the large spectrum of features helps improve the generalization ability of the model. Their collinearity is irrelevant.
In the remainder of the chapter, we present two approaches that help reduce the number of predictors: dimension reduction techniques (principal component analysis and autoencoders) and clustering methods that group similar features into homogeneous families.
The first method is a cornerstone in dimensionality reduction. It seeks to determine a smaller number of factors ($K'<K$) that retain as much as possible of the information contained in the original $K$ predictors.
In this short subsection, we define some key concepts that are required to fully understand the derivation of principal component analysis (PCA). Henceforth, we work with matrices (in bold fonts). An $I \times K$ matrix $\textbf{X}$ is orthonormal if $I> K$ and $\textbf{X}'\textbf{X}=\textbf{I}_K$. When $I=K$, the (square) matrix is called orthogonal and $\textbf{X}'\textbf{X}=\textbf{X}\textbf{X}'=\textbf{I}_K$, i.e., $\textbf{X}^{-1}=\textbf{X}'$.
One foundational result in matrix theory is the Singular Value Decomposition (SVD, see, e.g., chapter 5 in Meyer (2000)). The SVD is formulated as follows: any $I \times K$ matrix $\textbf{X}$ can be decomposed into

$$\textbf{X}=\textbf{U}\boldsymbol{\Delta}\textbf{V}',$$
where $\textbf{U}$ ($I\times I$) and $\textbf{V}$ ($K\times K$) are orthogonal and $\boldsymbol{\Delta}$ (with dimensions $I\times K$) is diagonal, i.e., $\Delta_{i,k}=0$ whenever $i\neq k$. In addition, $\Delta_{i,i}\ge 0$: the diagonal terms of $\boldsymbol{\Delta}$ are nonnegative.
For simplicity, we assume below that $\textbf{1}_I'\textbf{X}=\textbf{0}_K'$, i.e., that all columns have zero sum (and hence zero mean). This allows us to write the (sample) covariance matrix as $\boldsymbol{\Sigma}_X= \frac{1}{I-1}\textbf{X}'\textbf{X}$.
One crucial feature of covariance matrices is their symmetry. Indeed, real-valued symmetric (square) matrices enjoy an SVD which is much more powerful: when $\textbf{X}$ is symmetric, there exists an orthogonal matrix $\textbf{Q}$ and a diagonal matrix $\textbf{D}$ such that

$$\textbf{X}=\textbf{Q}\textbf{D}\textbf{Q}'. \tag{15.2}$$
This process is called diagonalization (see chapter 7 in Meyer (2000)) and conveniently applies to covariance matrices.
The goal of PCA is to build a new dataset $\tilde{\textbf{X}}$ that has fewer columns but keeps as much information as possible when compressing the original one, $\textbf{X}$. The key notion is the change of basis, which is a linear transformation of $\textbf{X}$ into $\textbf{Z}$, a matrix with identical dimensions, via

$$\textbf{Z}=\textbf{X}\textbf{P}, \tag{15.3}$$
where $\textbf{P}$ is a $K \times K$ matrix. There are of course an infinite number of ways to transform $\textbf{X}$ into $\textbf{Z}$, but two fundamental constraints help reduce the possibilities. The first constraint is that the columns of $\textbf{Z}$ be uncorrelated. Having uncorrelated features is desirable because they then all tell different stories and have zero redundancy. The second constraint is that the variance of the columns of $\textbf{Z}$ is highly concentrated. This means that a few factors (columns) will capture most of the explanatory power (signal), while the others will consist predominantly of noise. All of this is encoded in the covariance matrix of $\textbf{Z}$.
The covariance matrix of $\textbf{Z}$ is

$$\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{Z}'\textbf{Z}=\frac{1}{I-1}\textbf{P}'\textbf{X}'\textbf{X}\textbf{P}.$$
In this expression, we plug the decomposition (15.2), applied to the symmetric matrix $\textbf{X}'\textbf{X}=\textbf{Q}\textbf{D}\textbf{Q}'$:

$$\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{P}'\textbf{Q}\textbf{D}\textbf{Q}'\textbf{P}.$$
Thus, picking $\textbf{P}=\textbf{Q}$, we get, by orthogonality, $\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{D}$, that is, a diagonal covariance matrix for $\textbf{Z}$. The columns of $\textbf{Z}$ can then be re-shuffled in decreasing order of variance so that the diagonal elements of $\boldsymbol{\Sigma}_Z$ progressively shrink. This is useful because it helps locate the factors with the most informational content (the first factors). In the limit, a constant column (with zero variance) carries no signal.
The matrix $\textbf{Z}$ is a linear transformation of $\textbf{X}$; thus, it is expected to carry the same information, even though this information is coded differently. Since the columns are ordered according to their relative importance, it is simple to omit some of them. The new set of features $\tilde{\textbf{X}}$ consists of the first $K'$ (with $K'<K$) columns of $\textbf{Z}$.
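To make the derivation concrete, here is a small numerical sketch on synthetic data (the names and dimensions are arbitrary). It verifies that the change of basis delivers uncorrelated columns, which can then be truncated.

import numpy as np
np.random.seed(0)
I, K, K_prime = 200, 5, 2 # Sample size, number of features, number of retained factors
X = np.random.normal(size=(I, K))
X = X - X.mean(axis=0) # Impose zero-mean columns, as in the text
Sigma_X = X.T @ X / (I - 1) # Sample covariance matrix
eig_val, Q = np.linalg.eigh(Sigma_X) # Diagonalization of the symmetric matrix Sigma_X
order = np.argsort(eig_val)[::-1] # Sort the factors by decreasing variance
Q = Q[:, order]
Z = X @ Q # Change of basis: Z = XP with P = Q
Sigma_Z = Z.T @ Z / (I - 1)
print(np.round(Sigma_Z, 4)) # Diagonal (up to rounding): the new columns are uncorrelated
X_tilde = Z[:, :K_prime] # Truncated dataset: keep the first K' factors only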
Below, we show how to perform PCA with scikit-learn and visualize the output with the Python pca package. To ease readability, we use the smaller sample with fewer predictors.
from sklearn import decomposition
pca = decomposition.PCA(n_components=7) # We impose the number of components
pca.fit(training_sample[features_short]) # Performs PCA on the smaller set of predictors
print(pca.explained_variance_ratio_) # Checking the share of variance explained per component
P = pd.DataFrame(pca.components_, columns=features_short).T # Rotation matrix P (7 x 7): rows = features, columns = components
P.columns = ['P' + str(col) for col in P.columns] # Tidying up column names
P
[0.35718238 0.1940806 0.15561321 0.10434453 0.09601422 0.07017118 0.02259388]
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 |
|---|---|---|---|---|---|---|---|
| Div_Yld | -0.271599 | 0.579099 | 0.045725 | -0.528956 | 0.226626 | 0.506566 | 0.032012 |
| Eps | -0.420407 | 0.150082 | -0.024767 | 0.337373 | -0.771377 | 0.301883 | 0.011965 |
| Mkt_Cap_12M_Usd | -0.523868 | -0.343239 | 0.172289 | 0.062495 | 0.252781 | 0.002987 | 0.714319 |
| Mom_11M_Usd | -0.047238 | -0.057714 | -0.897160 | 0.241015 | 0.250559 | 0.258477 | 0.043179 |
| Ocf | -0.532947 | -0.195890 | 0.185039 | 0.234371 | 0.357596 | 0.049015 | -0.676866 |
| Pb | -0.152413 | -0.580806 | -0.221048 | -0.682136 | -0.308665 | 0.038675 | -0.168799 |
| Vol1Y_Usd | 0.406890 | -0.381139 | 0.282162 | 0.155411 | 0.061575 | 0.762588 | 0.008632 |
The rotation gives the matrix $\textbf{P}$: it is the tool that changes the basis. The first line of the output reports the share of total variance explained by each new factor (column). Each factor is indexed by a P column (principal component). Often, the first PC (first column P0 in the output) loads negatively on all initial features: a weighted average of all predictors is expected to carry a lot of information. In the above example, it is almost the case, with the exception of volatility, which has a positive coefficient in the first PC. The second PC is an arbitrage between price-to-book (short) and dividend yield (long). The third PC is contrarian, as it loads heavily and negatively on momentum. Not all principal components are easy to interpret.
Sometimes, it can be useful to visualize the way the principal components are built. In Figure 15.2, we show one popular representation that is used for two factors (usually the first two).
from pca import pca
model = pca(n_components=7) # Initialize
results = model.fit_transform(training_sample[features_short], col_labels=features_short) # Fit transform and include the column labels and row labels
model.biplot(n_feat=7, PC=[0,1],cmap=None, label=None, legend=False) # Make biplot
[pca] >Processing dataframe..
[pca] >The PCA reduction is performed on the [7] columns of the input dataframe.
[pca] >Fit using PCA.
[pca] >Compute loadings and PCs.
[pca] >Compute explained variance.
[pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[7]
[pca] >Outlier detection using SPE/DmodX with n_std=[2]
[pca] >Plot PC1 vs PC2 with loadings.
FIGURE 15.2: Visual representation of PCA with two dimensions.
The numbers indicated along the axes are the proportions of explained variance of each PC. They match the explained_variance_ratio_ output shown above: 35.7% for the first component and 19.4% for the second.
Once the rotation is known, it is possible to select a subsample of the transformed data. From the original 7 features, it is easy to keep only the first 4 components.
pd.DataFrame( # Using DataFrame format
    np.matmul( # Matrix product using numpy
        training_sample[features_short].values, P.values[:, :4]), # Original data times the first 4 columns of P
    columns=['PC1', 'PC2', 'PC3', 'PC4'] # Column names
).head() # Show first 5 lines
| | PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|---|
| 0 | -0.591998 | -0.177306 | 0.058881 | -0.349897 |
| 1 | -0.043180 | -0.718323 | -0.510459 | -0.050138 |
| 2 | -1.104983 | -0.429470 | 0.023240 | -0.171445 |
| 3 | -0.376485 | -0.418983 | -0.650190 | -0.081842 |
| 4 | -0.018831 | -0.581435 | 0.242719 | -0.358501 |
These 4 factors can then be used as orthogonal features in any ML engine. The fact that the features are uncorrelated is undoubtedly an asset. But the price of this convenience is high: the features are no longer immediately interpretable. De-correlating the predictors adds yet another layer of “blackbox-ing” in the algorithm.
PCA can also be used to estimate factor models. In Equation (15.3), it suffices to replace $\textbf{Z}$ with returns, $\textbf{X}$ with factor values and $\textbf{P}$ with factor loadings (see, e.g., Connor and Korajczyk (1988) for an early reference). More recently, Lettau and Pelger (2020a) and Lettau and Pelger (2020b) propose a thorough analysis of PCA estimation techniques. They notably argue that first moments of returns are important and should be included in the objective function, alongside the optimization on the second moments.
We end this subsection with a technical note. Usually, PCA is performed on the covariance matrix of returns. Sometimes, it may be preferable to decompose the correlation matrix. The result may change substantially if the variables have very different variances (which is not really the case in the equity space). If the investment universe encompasses several asset classes, then a correlation-based PCA will reduce the importance of the most volatile class. In this case, it is as if all returns are scaled by their respective volatilities.
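For readers who want to experiment, one simple way to obtain a correlation-based PCA with scikit-learn is to standardize the columns beforehand: the covariance matrix of standardized data is the correlation matrix. The sketch below reuses the objects defined above.

from sklearn.preprocessing import StandardScaler # Column-wise standardization
from sklearn.decomposition import PCA
scaled_features = StandardScaler().fit_transform(training_sample[features_short]) # Zero mean, unit variance
pca_corr = PCA(n_components=7).fit(scaled_features) # PCA on standardized data = correlation-based PCA
print(pca_corr.explained_variance_ratio_) # Shares of variance, now based on the correlation matrix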
In a PCA, the coding from $\textbf{X}$ to $\textbf{Z}$ is straightforward, linear and works both ways: \begin{equation} \textbf{Z}=\textbf{X}\textbf{P} \quad \text{and} \quad \textbf{X}=\textbf{Z}\textbf{P}', \end{equation}
so that we recover $\textbf{X}$ from $\textbf{Z}$. This can be written differently:

$$\underset{(I\times K)}{\textbf{X}}\ \overset{\text{encode, via } \textbf{P}}{\longrightarrow} \ \underset{(I\times K)}{\textbf{Z}} \ \overset{\text{decode, via } \textbf{P}'}{\longrightarrow}\ \underset{(I\times K)}{\textbf{X}}.$$
If we take the truncated version and seek a smaller output (with only $K'$ columns), this gives:

$$\underset{(I\times K)}{\textbf{X}}\ \overset{\text{encode, via } \textbf{P}_{K'}}{\longrightarrow} \ \underset{(I\times K')}{\tilde{\textbf{X}}} \ \overset{\text{decode, via } \textbf{P}'_{K'}}{\longrightarrow}\ \underset{(I\times K)}{\breve{\textbf{X}}},$$

where $\textbf{P}_{K'}$ is the restriction of $\textbf{P}$ to the $K'$ columns that correspond to the factors with the largest variances. The dimensions of the matrices are indicated inside the brackets below them. In this case, the decoding cannot recover $\textbf{X}$ exactly but only an approximation, which we write $\breve{\textbf{X}}$. This approximation is coded with less information, hence this new data $\breve{\textbf{X}}$ is compressed and provides a parsimonious representation of the original sample $\textbf{X}$.
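To illustrate the loss incurred by the truncation, here is a short sketch that compresses and then decodes the data with the rotation $\textbf{P}$ computed earlier (we ignore the centering that scikit-learn applies internally, so the figure is only indicative).

import numpy as np
X_mat = training_sample[features_short].values # Original sample (I x 7)
P_trunc = P.values[:, :4] # Restriction of P to the first 4 factors
X_tilde = X_mat @ P_trunc # Compressed data (I x 4)
X_breve = X_tilde @ P_trunc.T # Approximate reconstruction (I x 7)
print(np.mean((X_mat - X_breve) ** 2)) # Mean squared compression error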
An autoencoder generalizes this concept to nonlinear coding functions. Simple linear autoencoders are linked to latent factor models (see Proposition 1 in for the case of single-layer autoencoders). The scheme is the following:

$$\underset{(I\times K)}{\textbf{X}}\ \overset{\text{encode, via } N}{\longrightarrow} \ \underset{(I\times K')}{\tilde{\textbf{X}}=N(\textbf{X})} \ \overset{\text{decode, via } N'}{\longrightarrow}\ \underset{(I\times K)}{\breve{\textbf{X}}=N'(\tilde{\textbf{X}})},$$

where the encoding and decoding functions $N$ and $N'$ are often taken to be neural networks. The term autoencoder comes from the fact that the target output, which we often write $\textbf{Z}$, is the original sample $\textbf{X}$. Thus, the algorithm seeks to determine the function $N$ that minimizes the distance (to be defined) between $\textbf{X}$ and the output value $\breve{\textbf{X}}$. The encoder generates an alternative representation of $\textbf{X}$, whereas the decoder tries to recode it back to its original values. Naturally, the intermediate (coded) version $\tilde{\textbf{X}}$ is targeted to have a smaller dimension than $\textbf{X}$.
Autoencoders are easy to code in Keras (see Chapter 7 for more details on Keras). To underline the power of the framework, we resort to another way of coding a NN: the so-called functional API. For simplicity, we work with the small number of predictors (7). The structure of the network consists of two symmetric networks with only one intermediate layer containing 32 units. The activation function is sigmoid; this makes sense since the input has values in the unit interval.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
from plot_keras_history import show_history, plot_history
input_layer = Input(shape=(7,)) # features_short has 7 columns
encoder = tf.keras.layers.Dense(units=32, activation="sigmoid")(input_layer) # First, encode
encoder = tf.keras.layers.Dense(units=4)(encoder) # 4 dimensions for the output layer (same as PCA example)
decoder = tf.keras.layers.Dense(units=32, activation="sigmoid")(encoder) # Then, from encoder, decode
decoder = tf.keras.layers.Dense(units=7)(decoder) # the original sample has 7 features
In the training part, we optimize the MSE and use an Adam update of the weights (see Section 7.2.3).
ae_model = keras.Model(input_layer, decoder) # Builds the model
ae_model.compile( # Learning parameters
optimizer='adam',
loss='mean_squared_error',
metrics='mean_squared_error')
Finally, we are ready to train the data onto itself! The evolution of the loss on the training and testing samples is depicted in Figure 15.3. The decreasing pattern shows the progress of the compression quality.
history=ae_model.fit(NN_train_features, # Input
NN_train_features, # Output
epochs=15,
batch_size=512,
validation_data=(NN_test_features, NN_test_features))
plot_history(history)
Epoch 1/15
387/387 [==============================] - 2s 3ms/step - loss: 0.0756 - mean_squared_error: 0.0756 - val_loss: 0.0424 - val_mean_squared_error: 0.0424
Epoch 2/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0307 - mean_squared_error: 0.0307 - val_loss: 0.0236 - val_mean_squared_error: 0.0236
Epoch 3/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0194 - mean_squared_error: 0.0194 - val_loss: 0.0167 - val_mean_squared_error: 0.0167
Epoch 4/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0163 - mean_squared_error: 0.0163 - val_loss: 0.0160 - val_mean_squared_error: 0.0160
Epoch 5/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0159 - mean_squared_error: 0.0159 - val_loss: 0.0157 - val_mean_squared_error: 0.0157
Epoch 6/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0157 - mean_squared_error: 0.0157 - val_loss: 0.0156 - val_mean_squared_error: 0.0156
Epoch 7/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0156 - mean_squared_error: 0.0156 - val_loss: 0.0156 - val_mean_squared_error: 0.0156
Epoch 8/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0156 - mean_squared_error: 0.0156 - val_loss: 0.0155 - val_mean_squared_error: 0.0155
Epoch 9/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0155 - mean_squared_error: 0.0155 - val_loss: 0.0155 - val_mean_squared_error: 0.0155
Epoch 10/15
387/387 [==============================] - 1s 3ms/step - loss: 0.0155 - mean_squared_error: 0.0155 - val_loss: 0.0155 - val_mean_squared_error: 0.0155
Epoch 11/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0155 - mean_squared_error: 0.0155 - val_loss: 0.0154 - val_mean_squared_error: 0.0154
Epoch 12/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0154 - mean_squared_error: 0.0154 - val_loss: 0.0154 - val_mean_squared_error: 0.0154
Epoch 13/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0154 - mean_squared_error: 0.0154 - val_loss: 0.0153 - val_mean_squared_error: 0.0153
Epoch 14/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0153 - mean_squared_error: 0.0153 - val_loss: 0.0152 - val_mean_squared_error: 0.0152
Epoch 15/15
387/387 [==============================] - 1s 2ms/step - loss: 0.0152 - mean_squared_error: 0.0152 - val_loss: 0.0150 - val_mean_squared_error: 0.0150
FIGURE 15.3: Output from the training of an autoencoder.
In order to get the details of all weights and biases, the syntax is the following.
ae_weights=ae_model.get_weights()
Retrieving the encoder and processing the data into the compressed format is just a matter of matrix manipulation. In practice, it is possible to build a submodel by loading the weights from the encoder (see exercise below).
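One possible way to do this with the functional API is sketched below; it assumes the layers defined above are still in memory, and the sub-model shares the trained weights of the encoding layers (this is also the point of the exercise at the end of the chapter).

from tensorflow import keras
encoder_model = keras.Model(input_layer, encoder) # Sub-model that stops at the 4-unit coding layer
compressed_train = encoder_model.predict(NN_train_features) # Compressed (4-column) version of the training features
print(compressed_train.shape)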
The second family of unsupervised tools pertains to clustering. Features are grouped into homogeneous families of predictors. It is then possible to single out one among the group (or to create a synthetic average of all of them). Mechanically, the number of predictors is reduced.
The principle is simple: among a group of variables (the reasoning would be the same for observations in the other dimension) $\textbf{x}_{\{1 \le j \le J\}}$, find the combination of $k<J$ groups that minimizes

$$\underset{\textbf{S}}{\text{argmin}} \ \sum_{i=1}^{k}\sum_{\textbf{x}\in S_i}||\textbf{x}-\textbf{m}_i ||^2,$$
where $||\cdot ||$ is some norm which is usually taken to be the Euclidean $l^2$-norm. The $S_i$ are the groups and the minimization is run on the whole set of groups $\textbf{S}$. The $\textbf{m}_i$ are the group means (also called centroids or barycenters): $\textbf{m}_i=(\text{card}(S_i))^{-1}\sum_{\textbf{x}\in S_i}\textbf{x}$.
In order to ensure optimality, all possible arrangements must be tested, which is prohibitively long when $k$ and $J$ are large. Therefore, the problem is usually solved with greedy algorithms that seek (and find) solutions that are not optimal but ‘good enough’.
One heuristic way to proceed is the following (a minimal sketch of these steps is given just after this list):
1. choose $k$ initial centroids, for instance at random among the points;
2. assign each point to the group of its closest centroid;
3. recompute each centroid as the mean of the points assigned to its group;
4. repeat steps 2 and 3 until the assignments (or the centroids) no longer change.
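The following toy numpy sketch implements these steps (hypothetical helper names; this is not the implementation used in the example below, which relies on scikit-learn).

import numpy as np
def k_means_lloyd(X, k, n_iter=20, seed=0):
    # X has one row per item to cluster (here, one row per feature)
    rng = np.random.default_rng(seed)
    centroids = X[rng.choice(len(X), size=k, replace=False)].astype(float) # Step 1: random initial centroids
    for _ in range(n_iter): # Steps 2 and 3, repeated a fixed number of times
        dist = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2) # Distance of each item to each centroid
        labels = dist.argmin(axis=1) # Step 2: assign each item to its closest centroid
        for i in range(k): # Step 3: recompute the centroids
            if np.any(labels == i):
                centroids[i] = X[labels == i].mean(axis=0)
    return labels, centroids
# Example call (clustering the columns of the training sample):
# labels, centers = k_means_lloyd(training_sample[features].values.T, k=10)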
Below, we illustrate this process with an example. From all 93 features, we build 10 clusters.
from sklearn import cluster
k_means = cluster.KMeans(n_clusters=10) # Setting the number of clusters
k_means.fit(training_sample[features].T) # Performs k-means on the transposed data: one row per feature
clusters = pd.DataFrame([features, k_means.labels_], index=["factor", "cluster"]).T # Organize the cluster output
clusters.loc[clusters['cluster']==4, :] # Shows one particular group
| | factor | cluster |
|---|---|---|
| 6 | Capex_Ps_Cf | 4 |
| 19 | Eps | 4 |
| 20 | Eps_Basic | 4 |
| 21 | Eps_Basic_Gr | 4 |
| 22 | Eps_Contin_Oper | 4 |
| 23 | Eps_Dil | 4 |
| 68 | Op_Prt_Margin | 4 |
| 69 | Oper_Ps_Net_Cf | 4 |
| 80 | Sales_Ps | 4 |
We single out the cluster labelled 4, which is composed mainly of accounting ratios related to EPS. Given these 10 clusters, we can build a much smaller group of features that can then be fed to the predictive engines described in Chapters 5 to 9. The representative of a cluster can be the member that is closest to the center, or simply the center itself. This pre-processing step can nonetheless cause problems in the forecasting phase. Typically, it requires that the training data also be clustered, and the extension to the testing data is not straightforward (the clusters may not be the same).
To the best of our knowledge, nearest neighbors are not used in large-scale portfolio choice applications. The reason is simple: computational cost. Nonetheless, the concept of neighbors is widespread in unsupervised learning and can be used locally in complement to interpretability tools. Theoretical results on k-NN relating to bounds for error rates on classification tasks can be found in section 6.2 of Ripley (2007). The rationale is the following. If:
- the training sample covers the region of the feature space in which the test instance lies, and
- the relationship between the features and the label is sufficiently stable across the two samples,
then the neighborhood of one instance $\textbf{x}_i$ from the testing features computed on the training sample will yield valuable information on $y_i$.
In what follows, we thus seek to find neighbors of one particular instance $\textbf{x}_i$ (a $K$-dimensional row vector). Note that there is a major difference with the previous section: the clustering is now performed at the observation level (rows) and not at the predictor level (columns).
Given a dataset with the same (corresponding) columns $\textbf{X}_{i,k}$, the neighbors are defined via a similarity measure (or distance)

$$D(\textbf{x}_j,\textbf{x}_i)=\sum_{k=1}^K c_k \, d_k(x_{j,k},x_{i,k}), \tag{15.9}$$
where the distance functions $d_k$ can operate on various data types (numerical, categorical, etc.). For numerical values, typical choices are $d_k(x_{j,k},x_{i,k})=(x_{j,k}-x_{i,k})^2$ or $d_k(x_{j,k},x_{i,k})=|x_{j,k}-x_{i,k}|$. For categorical values, we refer to the exhaustive survey by Boriah, Chandola, and Kumar (2008), which lists 14 possible measures. Finally, the $c_k$ in Equation (15.9) allow some flexibility by weighting features. This is useful because both raw values ($x_{i,k}$ versus $x_{i,k'}$) and measure outputs ($d_k$ versus $d_{k'}$) can have different scales.
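As an illustration, here is a tiny sketch of such a measure for numerical features (the weights are a user choice; setting them all to one recovers the squared Euclidean distance, which ranks neighbors exactly like the Euclidean metric used by scikit-learn below).

import numpy as np
def weighted_distance(x_j, x_i, c):
    # D(x_j, x_i) = sum_k c_k * d_k(x_jk, x_ik), with squared differences as d_k
    return np.sum(c * (np.asarray(x_j) - np.asarray(x_i)) ** 2)
print(weighted_distance([0.2, 0.5], [0.1, 0.9], c=np.ones(2))) # 0.01 + 0.16 = 0.17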
Once the distances are computed over the whole sample, they are ranked using indices $l_1^i, \dots, l_I^i$:

$$D\left(\textbf{x}_{l_1^i},\textbf{x}_i\right) \le D\left(\textbf{x}_{l_2^i},\textbf{x}_i\right)\le \dots \le D\left(\textbf{x}_{l_I^i},\textbf{x}_i\right).$$
The nearest neighbors are those indexed by $l_m^i$ for $m=1,\dots,k$. We leave out the case when there are problematic equalities of the type $D\left(\textbf{x}_{l_m^i},\textbf{x}_i\right)=D\left(\textbf{x}_{l_{m+1}^i},\textbf{x}_i\right)$ for the sake of simplicity and because they rarely occur in practice as long as there are sufficiently many numerical predictors.
Given these neighbors, it is now possible to build a prediction for the label side $y_i$. The rationale is straightforward: if $\textbf{x}_i$ is close to other instances $\textbf{x}_j$, then the label value $y_i$ should also be close to $y_j$ (under the assumption that the features carry some predictive information over the label $y$).
An intuitive prediction for $y_i$ is the following weighted average:

$$\hat{y}_i=\frac{\sum_{j\neq i} h\left(D(\textbf{x}_j,\textbf{x}_i)\right)y_j}{\sum_{j\neq i} h\left(D(\textbf{x}_j,\textbf{x}_i)\right)},$$
where $h$ is a decreasing function. Thus, the further $\textbf{x}_j$ is from $\textbf{x}_i$, the smaller its weight in the average. A typical choice for $h$ is $h(z)=e^{-az}$ for some parameter $a>0$ that determines how penalizing the distance $D(\textbf{x}_j,\textbf{x}_i)$ is. Of course, the average can be taken over the set of $k$ nearest neighbors only, in which case $h$ is set to zero beyond the corresponding distance threshold:

$$\hat{y}_i=\frac{\sum_{m=1}^k h\left(D(\textbf{x}_{l_m^i},\textbf{x}_i)\right)y_{l_m^i}}{\sum_{m=1}^k h\left(D(\textbf{x}_{l_m^i},\textbf{x}_i)\right)}.$$
A more agnostic rule is to take $h:=1$ over the set of neighbors, in which case all neighbors have the same weight (see the early discussion by T. Bailey and Jain (1978) in the case of classification). For classification tasks, the procedure involves a voting rule whereby the class with the most votes wins the contest, with possible tie-breaking methods, as in the toy snippet below. The interested reader can have a look at the short survey in Bhatia et al. (2010).
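For completeness, here is a toy version of such a voting rule (a hypothetical helper; ties are simply broken by the order in which classes are first encountered).

from collections import Counter
def knn_vote(neighbor_classes):
    # Majority vote among the classes of the k nearest neighbors
    return Counter(neighbor_classes).most_common(1)[0][0]
print(knn_vote(['up', 'down', 'up'])) # 'up'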
For the choice of optimal $k$, several complicated techniques and criteria exist (see, e.g., Ghosh (2006) and Hall et al. (2008)). Heuristic values often do the job pretty well. A rule of thumb is that $k=\sqrt{I}$ ($I$ being the total number of instances) is not too far from the optimal value, unless $I$ is exceedingly large.
Below, we illustrate this concept. We pick one date (December 31, 2006) and single out one asset (with stock_id equal to 13). We then seek to find the $k=30$ stocks that are the closest to this asset at this particular date.
from sklearn import neighbors as nb # Package for Nearest Neighbors detection
knn_data = data_ml.loc[data_ml['date']=='2006-12-31',:] # Dataset for k-NN exercise
knn_target = knn_data.loc[knn_data['stock_id'] == 13, features] # Target observation
knn_sample = knn_data.loc[knn_data['stock_id'] != 13, features] # All other observations
neighbors = nb.NearestNeighbors(n_neighbors=30) # Number of neighbors to use
neighbors.fit(knn_sample)
NearestNeighbors(n_neighbors=30)
neigh_dist, neigh_ind = neighbors.kneighbors(knn_target)
print(pd.DataFrame(neigh_ind)) # Indices of the k nearest neighbors
   0  1   2    3    4   5    6    7   8    9  ...   20  21   22   23   24  \
0  9  8  99  185  294  21  266  180  23  191  ...  310  95  165  215  268

    25   26  27   28   29
0  194  103  17  117  539

[1 rows x 30 columns]
Once the neighbors and distances are known, we can compute a prediction for the return of the target stock. We use the function $h(z)=e^{-z}$ for the weighting of instances (via the distances).
knn_labels = knn_data.loc[knn_data['stock_id'] != 13, 'R1M_Usd'].values[neigh_ind] # y values of the neighbors (indices refer to the rows of knn_sample)
np.sum(knn_labels * np.exp(-neigh_dist)/np.sum(np.exp(-neigh_dist))) # Prediction with h(z)=e^(-z)
0.03092438258317905
knn_data.loc[knn_data['stock_id'] == 13, 'R1M_Usd'] # True y
96734    0.089
Name: R1M_Usd, dtype: float64
The prediction is neither very good nor very bad (the sign is correct!). However, note that this example cannot be used for predictive purposes because we use data from 2006-12-31 to predict a return at the same date. In order to avoid the forward-looking bias, the knn_sample variable should be built from a prior point in time, for instance as follows.
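A possible adjustment is sketched below; the choice of the prior date is for illustration only (we simply take the most recent date strictly before the prediction date and search for the neighbors there).

prior_date = data_ml.loc[data_ml['date'] < '2006-12-31', 'date'].max() # Most recent date before the target date
knn_sample_past = data_ml.loc[(data_ml['date'] == prior_date) & (data_ml['stock_id'] != 13), features] # Candidate neighbors, past data only
neighbors_past = nb.NearestNeighbors(n_neighbors=30).fit(knn_sample_past) # Fit on past observations
neigh_dist_past, neigh_ind_past = neighbors_past.kneighbors(knn_target) # Neighbors of the 2006-12-31 target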
The above computations are fast (a handful of seconds at most), but they hold for only one asset. In a $k$-NN exercise, each stock gets a custom prediction and the set of neighbors must be re-assessed each time. For $N$ assets, $N(N-1)/2$ distances must be evaluated. This is particularly costly in a backtest, especially when several parameters can be tested (the number of neighbors $k$, or $a$ in the weighting function $h(z)=e^{-az}$). When the investment universe is small (when trading indices, for instance), k-NN methods become computationally attractive (see for instance Chen and Hao (2017)).
Code the compressed version of the data (narrow training sample) via the encoder part of the autoencoder.
Bailey, T., and A. K. Jain. 1978. “A Note on Distance-Weighted K-Nearest Neighbor Rules.” IEEE Transactions on Systems, Man, and Cybernetics 8 (4): 311–13.
Bhatia, Nitin, and others. 2010. “Survey of Nearest Neighbor Techniques.” arXiv Preprint, no. 1007.0085.
Boriah, Shyam, Varun Chandola, and Vipin Kumar. 2008. “Similarity Measures for Categorical Data: A Comparative Evaluation.” In Proceedings of the 2008 Siam International Conference on Data Mining, 243–54.
Chen, Yingjun, and Yongtao Hao. 2017. “A Feature Weighted Support Vector Machine and K-Nearest Neighbor Algorithm for Stock Market Indices Prediction.” Expert Systems with Applications 80: 340–55.
Connor, Gregory, and Robert A Korajczyk. 1988. “Risk and Return in an Equilibrium Apt: Application of a New Test Methodology.” Journal of Financial Economics 21 (2): 255–89.
Ghosh, Anil K. 2006. “On Optimum Choice of K in Nearest Neighbor Classification.” Computational Statistics & Data Analysis 50 (11): 3113–23.
Hall, Peter, Byeong U Park, Richard J Samworth, and others. 2008. “Choice of Neighbor Order in Nearest-Neighbor Classification.” Annals of Statistics 36 (5): 2135–52.
Lettau, Martin, and Markus Pelger. 2020a. “Estimating Latent Asset-Pricing Factors.” Journal of Econometrics Forthcoming.
Lettau, Martin, and Markus Pelger. 2020b. “Factors That Fit the Time Series and Cross-Section of Stock Returns.” Review of Financial Studies 33 (5): 2274–2325.
Meyer, Carl D. 2000. Matrix Analysis and Applied Linear Algebra. Vol. 71. SIAM.
Ripley, Brian D. 2007. Pattern Recognition and Neural Networks. Cambridge University Press.