# main_utility_functions.py
import pandas as pd
import numpy as np
import random
import math
import riskfolio as rp
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hr
#%%
'''
FUNCTIONS FOR CREATING FEATURES
'''
def get_sharpe(ret, freq):
    '''
    For example, if monthly is selected:
    find the first date of each month present, then for each COMPLETE month, calculate the cumulative return.
    The months containing the first and last 'first-day-of-month' dates are both ignored:
    data in the month of the first such date may not form a complete month,
    and data after the last such date may not form a complete month either.
    Parameters
    ----------
    ret : dataframe, portfolio returns, datetime index, day-by-day returns
        (assumes NO NA values, fully filled)
    freq : string, either 'daily', 'monthly' or 'yearly'
        which frequency of returns to use to estimate the Sharpe ratio.
        If daily, daily returns are used and annualised;
        if monthly, monthly returns are used and annualised;
        if yearly, yearly returns are used, already annualised.
    Returns
    -------
    float
        the Sharpe ratio based on these cumulative return values
    '''
    # notify if NaN or zero values exist
    if ret.isnull().values.any() or 0 in ret.values:
        print('There exist NaN or 0 values in get_sharpe!')
    # daily frequency selected
    if freq == 'daily':
        return (ret.mean() / ret.std()) * (252 ** 0.5)
    # monthly frequency selected
    elif freq == 'monthly':
        # datetime index
        # ignore the first date (usually not the start of a month), but keep the last first-of-month date for indexing
        index = ret.groupby([ret.index.year, ret.index.month]).head(1).index[1:]
        # convert to integer positions
        index = [ret.index.get_loc(x) for x in index]
        sharpe = pd.Series([cum_ret(ret.iloc[index[i]:index[i + 1]]) for i in range(len(index) - 1)])
        return sharpe.mean() / sharpe.std() * (12 ** 0.5)
    # yearly frequency selected
    elif freq == 'yearly':
        # datetime indexes
        # this time filter out the first day of each year
        index = ret.groupby([ret.index.year]).head(1).index[1:]
        # convert to integer positions in ret
        index = [ret.index.get_loc(x) for x in index]
        sharpe = pd.Series([cum_ret(ret.iloc[index[i]:index[i + 1]]) for i in range(len(index) - 1)])
        return sharpe.mean() / sharpe.std()
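# Usage sketch (hypothetical data, not part of the original module): an
# annualised Sharpe ratio from two years of synthetic daily returns. Only the
# 'daily' branch is exercised here, since the other branches call cum_ret,
# which is defined further down this file.
if __name__ == '__main__':
    _rng = np.random.default_rng(42)
    _daily = pd.Series(_rng.normal(0.0005, 0.01, 504),
                       index=pd.bdate_range('2020-01-01', periods=504))
    print(get_sharpe(_daily, 'daily'))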
def cagr(ret):
    '''
    Calculate the compound annual growth rate.
    Parameters
    ----------
    ret : dataframe, portfolio returns, datetime index, DAY-BY-DAY returns (assumes NO NA values, fully filled)
        can be of any length; the CAGR is found by estimating the number of years elapsed, which can be fractional
    Returns
    -------
    float
        the compound annual growth rate, as a fraction, not a percentage
    '''
    # notify if NaN or zero values exist
    if ret.isnull().values.any() or 0 in ret.values:
        print('There exist NaN or 0 values in cagr!')
    end = (1 + ret).cumprod().iloc[-1]
    num_years = len(ret.index) / 252
    return end ** (1 / num_years) - 1
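# Usage sketch (hypothetical data, not part of the original module): a constant
# 0.03% daily return over two years (504 trading days) compounds to roughly
# 7.85% per annum.
if __name__ == '__main__':
    _flat = pd.Series([0.0003] * 504)
    print(cagr(_flat))  # (1.0003 ** 504) ** (1 / 2) - 1, about 0.0785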
def rea_vol(ret):
    '''
    Find the realized volatility over the WHOLE period,
    i.e. the square root of the realized variance.
    Parameters
    ----------
    ret : dataframe, portfolio returns
    Returns
    -------
    float
    '''
    # notify if NaN or zero values exist
    if ret.isnull().values.any() or 0 in ret.values:
        print('There exist NaN or 0 values in rea_vol!')
    return np.sum(np.log(ret + 1) ** 2) ** 0.5
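# Usage sketch (hypothetical data, not part of the original module): realized
# volatility as the square root of the sum of squared log returns.
if __name__ == '__main__':
    _r = pd.Series([0.01, -0.02, 0.015, 0.005])
    print(rea_vol(_r))  # equals np.sqrt(np.sum(np.log(1 + _r) ** 2))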
def mdd(ret):
    '''
    Max drawdown is defined as the largest peak-to-trough fall divided by the peak
    (the trough must occur after the peak).
    Parameters
    ----------
    ret : series or df
        can be individual asset returns, in which case this returns a series
    Returns
    -------
    float
        max drawdown over the period
    '''
    # notify if NaN or zero values exist
    if ret.isnull().values.any() or 0 in ret.values:
        print('There exist NaN or 0 values in mdd!')
    # cum_returns is essentially the price history
    cum_returns = (1 + ret).cumprod()
    # cum_returns.cummax() gives the highest price observed so far at any point.
    # drawdown gives many 'windows' of (highest_price_so_far - current_price) / highest_price_so_far;
    # this constrains the highest price to occur before the current price,
    # so maximizing over these windows is the same as finding the lowest trough after the peak
    drawdown = 1 - cum_returns.div(cum_returns.cummax())
    return drawdown.max(axis=0)  # maximum vertically
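# Usage sketch (hypothetical data, not part of the original module): prices run
# 1.10 -> 1.155 (peak) -> 0.8316 (trough) -> 0.9563, so the max drawdown is
# (1.155 - 0.8316) / 1.155 = 0.28.
if __name__ == '__main__':
    _r = pd.Series([0.10, 0.05, -0.20, -0.10, 0.15])
    print(mdd(_r))  # about 0.28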
def semi_std(ret):
    '''
    Find the semi-standard deviation over the period, a measure of downside risk:
    the standard deviation computed only from returns that fall below the mean return.
    Parameters
    ----------
    ret : dataframe, portfolio returns
    Returns
    -------
    float
    '''
    # notify if NaN or zero values exist
    if ret.isnull().values.any() or 0 in ret.values:
        print('There exist NaN or 0 values in semi_std!')
    average = np.nanmean(ret.to_numpy())
    r_below = ret[ret < average]
    return np.sqrt(1 / len(r_below) * np.sum((average - r_below) ** 2))
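# Usage sketch (hypothetical data, not part of the original module): only the
# two returns below the mean of 0.006 enter the downside deviation.
if __name__ == '__main__':
    _r = pd.Series([0.02, -0.01, 0.03, -0.02, 0.01])
    print(semi_std(_r))  # about 0.0216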
def coph_corr(port):
    '''
    Calculates the cophenetic correlation coefficient of the clustering structure:
    the correlation coefficient between the distance matrix and the cophenetic matrix.
    Note that this assumes p_dist exists, which is not true for all cov-estimation methods.
    Parameters
    ----------
    port : portfolio object belonging to the riskfolio module
        a portfolio object that has already gone through optimization
    Returns
    -------
    float
        hr.cophenet returns a numpy scalar, which is converted to a Python float
    '''
    return hr.cophenet(port.clusters, Y=port.p_dist)[0].item()
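# Usage sketch (hypothetical data; a minimal sketch assuming riskfolio's
# HCPortfolio populates `clusters` and `p_dist` after an HRP optimization, as
# this function requires -- the data and variable names here are illustrative).
if __name__ == '__main__':
    _rng = np.random.default_rng(4)
    _r = pd.DataFrame(_rng.normal(0, 0.01, (252, 4)), columns=list('ABCD'))
    _hc = rp.HCPortfolio(returns=_r)
    _hc.optimization(model='HRP', codependence='pearson', linkage='single')
    print(coph_corr(_hc))  # closer to 1 means the tree preserves distances well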
#%%
'''
FUNCTIONS FOR MANIPULATING PORTFOLIO RETURNS AND WEIGHTS
'''
def inv_vol(ret):
    '''
    Assets are weighted according to an inverse-variance allocation
    (note: the weights are proportional to 1/variance, not 1/volatility).
    Parameters
    ----------
    ret : a dataframe
        contains individual asset returns
    Returns
    -------
    series
        the weights, indexed by asset, summing to 1
    '''
    variance = ret.var(axis=0)
    return (1 / variance) / ((1 / variance).sum())
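# Usage sketch (hypothetical data, not part of the original module): the
# lower-variance asset receives the larger weight, and the weights sum to 1.
if __name__ == '__main__':
    _rng = np.random.default_rng(1)
    _r = pd.DataFrame({'A': _rng.normal(0, 0.01, 100),
                       'B': _rng.normal(0, 0.02, 100)})
    print(inv_vol(_r))  # weights indexed by asset name; 'A' gets more weight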
def port_ret(weights, ret, index_):
    '''
    Applies sets of weights to the daily returns of an asset universe
    to create a portfolio return. It makes use of the theoretical property that
    ret(portfolio) = sum[asset_weight * ret(asset)].
    Parameters
    ----------
    weights : dictionary
        keys are integer indexes into ret, and each value is the set of weights applied from that day,
        e.g.
        1105: [w1, w2, w3]
        1206: [w1_new, w2_new, w3_new]
        means the first set of weights applies to returns from 1105 to 1205, and from 1206 a different set applies
    ret : a dataframe, DAILY RETURNS OF EACH TICKER
    index_ : list
        integer indexes of ret where REBALANCING (CHOOSING ERC/HRP) happens.
        Note that index_ may be a subset of the keys of weights,
        because XGBoost needs some months of data and so can only start later.
    Returns
    -------
    series
        daily portfolio returns, COMBINED into one value per day.
        Note that all returns before the first date in index_ are automatically set to NaN.
    '''
    ret = ret.copy()
    for i in range(len(index_)):
        if i != len(index_) - 1:  # if this is not the last index
            ret.iloc[index_[i]:index_[i + 1]] = weights[index_[i]].to_list() * ret.iloc[index_[i]:index_[i + 1]]
        else:  # if this is the last number in index_, apply the weights all the way to the last row
            ret.iloc[index_[i]:] = weights[index_[i]].to_list() * ret.iloc[index_[i]:]
    # sum all the returns horizontally
    # note that all-NaN rows sum to zero!
    ret = ret.sum(axis=1)
    # make all rows NaN up to (not including) the first rebalancing date
    ret.iloc[:index_[0]] = np.nan
    return ret  # will contain NaN, not fully filled
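# Usage sketch (hypothetical data, not part of the original module): two
# rebalances on a six-day, two-asset return frame; the first weights apply to
# rows 0-2 and the second to rows 3-5.
if __name__ == '__main__':
    _r = pd.DataFrame({'A': [0.01, -0.02, 0.005, 0.01, 0.02, 0.01],
                       'B': [0.03, 0.01, -0.01, 0.02, 0.01, -0.005]})
    _w = {0: pd.Series([0.5, 0.5]), 3: pd.Series([0.7, 0.3])}
    print(port_ret(_w, _r, [0, 3]))  # one combined daily return per row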
def cum_ret(ret):
    '''
    Takes in daily returns (EITHER individual assets OR a combined portfolio) and gives cumulative returns.
    If portfolio returns (a series) are passed in, a float (the portfolio's cumulative return) is returned.
    Parameters (assuming individual asset returns)
    ----------
    ret : dataframe
        individual asset returns; assumes there are NO NA VALUES (fully filled)
    Returns
    -------
    series
        indexed by asset name, giving the cumulative return of each asset.
        If PORTFOLIO RETURNS (a series) are entered, a float (the portfolio's cumulative return) is returned.
    '''
    return (1 + ret).cumprod().iloc[-1] - 1
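# Usage sketch (hypothetical data, not part of the original module): a Series
# of per-asset cumulative returns from a frame, a float from a single series.
if __name__ == '__main__':
    _r = pd.DataFrame({'A': [0.10, -0.05], 'B': [0.02, 0.02]})
    print(cum_ret(_r))       # A: 1.10 * 0.95 - 1 = 0.045, B: 1.02 * 1.02 - 1 = 0.0404
    print(cum_ret(_r['A']))  # 0.045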
def stats_corr(corr):
    '''
    Find some stats on the correlation matrix:
    1st value is the mean of all upper-triangular elements (equal to the lower triangle by symmetry),
    2nd is the std of those elements,
    3rd is the condition number of the corr matrix, which measures how numerically sensitive it is to invert,
    4th is the determinant of the matrix.
    Parameters
    ----------
    corr : dataframe
        the correlation matrix between all asset pairs.
    Returns
    -------
    tuple
        contains the values in the order above
    '''
    # note that here values is a numpy array
    values = corr.values[np.triu_indices_from(corr.values, 1)]
    cond = np.linalg.cond(corr)
    det = np.linalg.det(corr)
    # ddof=1 for the unbiased estimator in numpy
    return (values.mean(), values.std(ddof=1), cond, det)
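# Usage sketch (hypothetical data, not part of the original module): stats on
# a small hand-written 3x3 correlation matrix.
if __name__ == '__main__':
    _c = pd.DataFrame([[1.0, 0.5, 0.2],
                       [0.5, 1.0, 0.3],
                       [0.2, 0.3, 1.0]])
    print(stats_corr(_c))  # (mean, std) of the off-diagonals, then cond and det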
def stats_asset_univ(uni_ret):
    '''
    Calculate some stats about the asset universe.
    Parameters
    ----------
    uni_ret : dataframe
        THE UNIVERSE'S RETURNS, for each asset. Assumes no NA!
    Returns
    -------
    a tuple of 6 values: mean of means, mean of stds, std of means, std of stds,
    mean of max drawdowns, std of max drawdowns
    '''
    m_m = uni_ret.mean(axis=0).mean()
    s_m = uni_ret.std(axis=0).mean()
    m_s = uni_ret.mean(axis=0).std()
    s_s = uni_ret.std(axis=0).std()
    # this often gives a lot of NaN, as some prices in uni_ret are unchanged after a day
    mdd_m = mdd(uni_ret).mean()
    mdd_s = mdd(uni_ret).std()
    return (m_m, s_m, m_s, s_s, mdd_m, mdd_s)
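# Usage sketch (hypothetical data, not part of the original module): the six
# summary stats for a small two-asset universe.
if __name__ == '__main__':
    _rng = np.random.default_rng(2)
    _uni = pd.DataFrame({'A': _rng.normal(0.001, 0.01, 60),
                         'B': _rng.normal(0.0005, 0.02, 60)})
    print(stats_asset_univ(_uni))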
#%%
'''
MISCELLANEOUS FUNCTIONS
'''
def ticker_gen(ticker_pool, uni_len, uni_num, num_share):
    '''
    This helper function generates universes of tickers from a pool of tickers.
    Each universe may share at most num_share tickers with any other universe.
    Parameters
    ----------
    ticker_pool : list
        the list of possible tickers
    uni_len : int
        the number of tickers in each universe
    uni_num : int
        the number of universes to generate (has an upper bound!);
        generation may be impossible if this is set too high
    num_share : int
        the maximum number of tickers any two universes may share
    Returns
    -------
    lis : list of lists
        each inner list consisting of asset tickers
    '''
    # the current list of universes, a list of lists
    lis = []
    # if this counter gets too big, it is impossible to generate the list
    count = 0
    # while we haven't reached the number of universes to generate
    while len(lis) < uni_num:
        count += 1
        # create a candidate universe
        cur = random.sample(ticker_pool, uni_len)
        # check whether cur is valid
        valid = True
        for uni in lis:
            # if it already shares more tickers than allowed
            if len(set(uni).intersection(cur)) > num_share:
                # reject the candidate
                valid = False
        if valid:
            lis.append(cur)
        # not possible to generate: we have sampled far more often than there are combinations
        if count > math.comb(len(ticker_pool), uni_len) * len(ticker_pool) * uni_len:
            print('Not possible to generate such a list!')
            return
    return lis
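# Usage sketch (hypothetical tickers, not part of the original module): three
# universes of four tickers each, any pair sharing at most two tickers.
if __name__ == '__main__':
    _pool = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'META', 'NVDA', 'TSLA', 'JPM']
    print(ticker_gen(_pool, uni_len=4, uni_num=3, num_share=2))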
def stats_uni_corr(ret):
    '''
    Get some insight into this asset universe through the correlation matrix of all its returns,
    and compile some stats about that matrix.
    Note that the unit diagonal is a line of symmetry: the upper and lower triangles hold the same values.
    Parameters
    ----------
    ret : dataframe
        the returns of the individual assets for the whole universe;
        should be ALL the historical returns
    Returns
    -------
    dictionary
        contains the correlation matrix, the mean of the upper-triangle elements,
        and the std of the upper-triangle elements
    '''
    port = rp.Portfolio(returns=ret)
    port.assets_stats(method_cov='gerber1')
    corr = rp.AuxFunctions.cov2corr(port.cov)
    values = corr.values[np.triu_indices_from(corr.values, 1)]
    return {'Correlation Matrix': corr,
            'Mean Correlation': values.mean(),
            'Standard Deviation of Correlations': values.std(ddof=1)}
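# Usage sketch (hypothetical data, not part of the original module; assumes the
# installed riskfolio version supports method_cov='gerber1', as the function
# above requires).
if __name__ == '__main__':
    _rng = np.random.default_rng(3)
    _uni = pd.DataFrame(_rng.normal(0.0005, 0.01, (252, 4)),
                        columns=['A', 'B', 'C', 'D'],
                        index=pd.bdate_range('2021-01-01', periods=252))
    _stats = stats_uni_corr(_uni)
    print(_stats['Mean Correlation'], _stats['Standard Deviation of Correlations'])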