-
Notifications
You must be signed in to change notification settings - Fork 0
/
CMN Project.sas
438 lines (402 loc) · 14.5 KB
/
CMN Project.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
* import comments analyzed for sentiment;
proc import datafile = "C:\Users\mcraft\Desktop\CMN\DDBB_sent_emo (2).csv"
out = sent_emo dbms = csv replace; run;
* create a file with renamed videos for the two-part model;
data twopart;
set sent_emo;
Subject = .;
if Video = "WrestleMania" then Subject = 1;
else if Video = "Venezuela" then Subject = 2;
else if Video = "Taylor4Apple" then Subject = 3;
else if Video = "Statue" then Subject = 4;
else if Video = "Sneakers" then Subject = 5;
else if Video = "Samsung" then Subject = 6;
else if Video = "Rogan" then Subject = 7;
else if Video = "Rkelly" then Subject = 8;
else if Video = "RedTable" then Subject = 9;
else if Video = "Ramsay" then Subject = 10;
else if Video = "Obama" then Subject = 11;
else if Video = "Nike" then Subject = 12;
else if Video = "Neverland" then Subject = 13;
else if Video = "NatGeo" then Subject = 14;
else if Video = "Makeup" then Subject = 15;
else if Video = "Magnets" then Subject = 16;
else if Video = "Lebron" then Subject = 17;
else if Video = "KylieReunite" then Subject = 18;
else if Video = "Kavanaugh" then Subject = 19;
else if Video = "Jlo" then Subject = 20;
else if Video = "Jaguar" then Subject = 21;
else if Video = "Gucci" then Subject = 22;
else if Video = "Football" then Subject = 23;
else if Video = "FoodSwitch" then Subject = 24;
else if Video = "Flo" then Subject = 25;
else if Video = "DrOz" then Subject = 26;
else if Video = "Diet" then Subject = 27;
else if Video = "Debates" then Subject = 28;
else if Video = "BlaseyonFox" then Subject = 29;
else if Video = "ASMR" then Subject = 30;
run;
* likes are skewed, so take the log of the likes + 1;
data twopart;
set twopart;
loglikes = .;
loglikes = log(likes + 1);
logloglikes = log(loglikes + 1);
run;
* determine whether a multilevel model is necessary--unconditional multilevel model;
proc mixed data = new_sent_emo;
class Subject;
model likes = /s;
random int / subject = Subject type = un;
run;
** explore effects of covariates;
* simple MLM-Sentiment;
proc mixed data=sent_emo;
class Video;
model likes = sent_score / s;
random int / subject=Video type=un;
run;
* simple MLM-Sadness;
proc mixed data=sent_emo;
class Video;
model likes = sadness / s;
random int / subject=Video type=un;
run;
* simple MLM-Joy;
proc mixed data=sent_emo;
class Video;
model likes = joy / s;
random int / subject=Video type=un;
run;
* simple MLM-Fear;
proc mixed data=sent_emo;
class Video;
model likes = fear / s;
random int / subject=Video type=un;
run;
* simple MLM-Disgust;
proc mixed data=sent_emo;
class Video;
model likes = disgust / s;
random int / subject=Video type=un;
run;
* simple MLM-Anger;
proc mixed data=sent_emo;
class Video;
model likes = anger / s;
random int / subject=Video type=un;
run;
* create subset and plot zero-inflated Poisson data;
data subvideos;
set twopart;
where Video = "Nike" or Video = "Statue" or Video = "Venezuela"
or Video = "Samsung" or Video = "Gucci" or Video = "Diet";
run;
title 'Histograms of Zero-Inflated Likes';
proc sgpanel data = subvideos;
panelby Video / onepanel layout = panel;
vbar likes;
colaxis label = "YouTube Comment Like Counts" Fitpolicy = thin;
run;
title 'Histograms of Logged Zero-Inflated Likes';
proc sgpanel data = subvideos;
panelby Video / onepanel layout = panel;
vbar logloglikes;
colaxis label = "YouTube Comment Like Counts" Fitpolicy = thin;
run;
title 'Single Histogram Logged Likes for Nike';
proc sgplot data = twopart;
where Video = "Nike";
vbar loglikes / groupdisplay = cluster baselineintercept=.0000001;
yaxis type = log logbase = 10;
run;
title 'Histograms of Zero-Inflated Replies';
proc sgpanel data = subvideos;
panelby Video / onepanel layout = panel;
vbar numberOfReplies;
colaxis label = "YouTube Comment Reply Counts" Fitpolicy = thin;
run;
* create variable needed for fitting a two-part model;
data twopart;
set twopart;
P_u = 1;
if likes = 0 then P_u = 0;
else if likes = . then P_u = .;
P_m = .;
if P_u = 1 then P_m = likes;
run;
* export dataset for use in R;
proc export data = twopart dbms = csv outfile = "C:\Users\mcraft\Desktop\CMN\comments.csv" replace;
putnames = yes;
run;
*****model fitting for number of likes*****;
title 'Starting Values for Two-part Parameters';
proc genmod data = twopart;
class Subject;
model likes = sent_score / dist = zip;
zeromodel sent_score / link = logit;
run;
title 'Starting Values for Random-effects Parameters';
proc glimmix data = twopart noclprint method = laplace;
class Subject;
model likes = sent_score / s dist = poisson;
random int / subject = Subject;
run;
title "Starting Values for the Random Effects Logistic Portion Only";
proc nlmixed data = twopart method = gaus qpoints = 5 maxfunc = 3000
maxiter = 10000 noad gconv = 0 absconv = 0;
parms a0 = 1.9445 a1 = 0.6282 vara = 1.6906;
u = P_u;
m = P_m;
*Binary part;
ueta = a0 + a1*sent_score + au;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u));
model likes ~ general(LL1);
random au ~ normal (0, vara) subject = Subject;
bounds vara >= 0;
run;
title "Starting Values for the Fixed Effects Logistic Portion Only";
proc nlmixed data = twopart method = gaus qpoints = 5 maxfunc = 3000
maxiter = 10000 noad gconv = 0 absconv = 0;
parms a0 = 1.9445 a1 = 0.6282;
u = P_u;
m = P_m;
*Binary part;
ueta = a0 + a1*sent_score;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u));
model likes ~ general(LL1);
run;
title "Starting Values for the Poisson Portion Only";
proc nlmixed data = twopart method = gaus qpoints = 5 maxfunc = 3000
maxiter = 10000 noad gconv = 0 absconv = 0;
parms b10 = 1.9445 b11 = 0.6282 varb = 1.6906;
u = P_u;
m = P_m;
*Poisson part;
mu = exp(b10 + b11*sent_score + bu);
LL2 = likes*log(mu) - mu - lgamma(likes + 1);
model likes ~ general(LL2);
random bu ~ normal (0, varb) subject = Subject;
bounds varb >= 0;
run;
title "Two-part RANDOM Logistic and Poisson Model";
proc nlmixed data = twopart noad method = gaus qpoints = 50 maxfunc = 3000
maxiter = 10000 gconv = 0 absconv = 0;
parms b10 = 3.7300 b11 = 0.6063
a0 = -1.1279 a1 = -0.05894
varb = 0.6573 vara = 0.4526 covab = 0.2944;
pi = arcos(-1);
u = P_u;
m = P_m;
ueta = a0 + a1*sent_score + au; *logistic model with time varying predictor;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u)); *loglikelihood for a binary dist;
if likes > 0 then do;
mu = exp(b10 + b11*sent_score + bu); *Poisson model with time varying predictor;
LL2 = likes*log(mu) - mu - lgamma(likes + 1); *loglikelihood for a Poisson dist;
end;
if likes = 0 then Loglik = LL1;
else if likes > 0 then Loglik = LL1+LL2;
model likes ~ general(Loglik);
random au bu ~ normal ([0,0],[vara, covab, varb]) subject = Subject;
bounds vara >= 0, varb >= 0;
run;
title "Two-part FIXED Logistic and Poisson Model";
proc nlmixed data = twopart noad method = gaus qpoints = 50 maxfunc = 3000
maxiter = 10000 gconv = 0 absconv = 0;
parms b10 = 3.7300 b11 = 0.6063
a0 = -1.1279 a1 = -0.05894;
pi = arcos(-1);
u = P_u;
m = P_m;
ueta = a0 + a1*sent_score; *logistic model with time varying predictor;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u)); *loglikelihood for a binary dist;
if likes > 0 then do;
mu = exp(b10 + b11*sent_score); *Poisson model with time varying predictor;
LL2 = likes*log(mu) - mu - lgamma(likes + 1); *loglikelihood for a Poisson dist;
end;
if likes = 0 then Loglik = LL1;
else if likes > 0 then Loglik = LL1+LL2;
model likes ~ general(Loglik);
run;
title "Two-part RANDOM Logistic and Truncated Poisson Model";
proc nlmixed data = twopart noad method = gaus qpoints = 50 maxfunc = 3000
maxiter = 10000 gconv = 0 absconv = 0;
parms b10 = 21.4195 b11 = 0.6178 b12 = -0.1109
a0 = 0.5792 a1 = -0.04402 a2 = -0.01279
vara = 0.2053 covab = -0.1852 varb = 1.1959;
pi = arcos(-1);
u = P_u;
m = P_m;
ueta = a0 + a1*sent_score + a2*short_timestamp + au; *logistic model with time varying predictor;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u)); *loglikelihood for a binary dist;
if likes > 0 then do;
log_lambda = b10 + b11*sent_score + b12*short_timestamp + bu; *Poisson model with time varying predictor;
lambda = exp(log_lambda);
*LL2 = likes*log_lambda - lambda - log(1-exp(-lambda)) - lgamma(likes+1);*loglikelihood for a truncated Poisson dist;
*or;
LL2 = log(pi) - log(1-exp(-lambda)) - lambda - lgamma(likes+1) + likes*log(lambda);
end;
if likes = 0 then Loglik = LL1;
else if likes > 0 then Loglik = LL1+LL2;
model likes ~ general(Loglik);
random au bu ~ normal ([0,0],[vara, covab, varb]) subject = Subject;
bounds vara >= 0, varb >= 0;
run;
title "Two-part FIXED Logistic and Truncated Poisson Model";
proc nlmixed data = twopart noad method = gaus qpoints = 50 maxfunc = 3000
maxiter = 10000 gconv = 0 absconv = 0;
parms b10 = 3.6721 b11 = 0.6060 b12 = 0
a0 = -1.3528 a1 = -0.04973 a2 = 0;
pi = arcos(-1);
u = P_u;
m = P_m;
ueta = a0 + a1*sent_score + a2*short_timestamp; *logistic model with time varying predictor;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u)); *loglikelihood for a binary dist;
if likes > 0 then do;
log_lambda = b10 + b11*sent_score + b12*short_timestamp; *Poisson model with time varying predictor;
lambda = exp(log_lambda);
*LL2 = likes*log_lambda - lambda - log(1-exp(-lambda)) - lgamma(likes+1);*loglikelihood for a truncated Poisson dist;
*or;
LL2 = log(pi) - log(1-exp(-lambda)) - lambda - lgamma(likes+1) + likes*log(lambda);
end;
if likes = 0 then Loglik = LL1;
else if likes > 0 then Loglik = LL1+LL2;
model likes ~ general(Loglik);
run;
title1 "Two-part Random Poisson Model with LSM";
proc nlmixed data = twopart method = gaus qpoints = 5 maxfunc = 3000
maxiter = 10000 noad gconv = 0 absconv = 0;
parms b10 = 3.6455 b11 = 0.6063
a0 = -1.2902 a1 = -0.05574
lnvarb = -0.05858 /*lnvara = -1.5474*/ tau1 = 1;
pi = arcos(-1);
u = P_u;
m = P_m;
ueta = a0 + a1*sent_score + au; *logistic model with time varying predictor;
expeta = exp(ueta);
p = expeta/(1+expeta);
LL1 = log((1-p)**(1-u)) + log(p**(u)); *loglikelihood for a binary dist;
if likes > 0 then do;
log_lambda = b10 + b11*sent_score + bu; *Poisson model with time varying predictor;
lambda = exp(log_lambda);
*LL2 = likes*log_lambda - lambda - log(1-exp(-lambda)) - lgamma(likes+1);*loglikelihood for a truncated Poisson dist;
*or;
LL2 = log(pi) - log(1-exp(-lambda)) - lambda - lgamma(likes+1) + likes*log(lambda);
varb = EXP(lnvarb + tau1*avg_sent); *LSM;
end;
if likes = 0 then Loglik = LL1;
else if likes > 0 then Loglik = LL1+LL2;
model likes ~ general(Loglik);
random au bu ~ normal ([0,0],[vara, covab, varb]) subject = Subject;
*bounds vara >= 0, varb >= 0;
run;
*LSM does not converge;
*****model fitting for number of responses*****;
title 'Starting Values for Two-part Parameters';
proc genmod data = twopart;
class Subject;
model numberOfReplies = sent_score / dist = zip;
zeromodel sent_score / link = logit;
run;
title 'Starting Values for Random-effects Parameters';
proc glimmix data = twopart noclprint method = laplace;
class Subject;
model numberOfReplies = sent_score / s dist = poisson;
random int / subject = Subject;
run;
title1 "Two-part Random Poisson Model";
proc nlmixed data = twopart method = gaus qpoints = 25 maxfunc = 3000 maxiter = 10000;
parms b10 = 1.6596 b11 = 0.0689
a0 = 1.9769 a1 = 0.1912
varb = 0.8201;
logit0 = a0 + a1*sent_score; *logistic model with time varying predictor;
prob0 = 1 / (1 + exp(-logit0));
mu = exp(b10 + b11*sent_score + bu); *Poisson model with time varying predictor;
if numberOfReplies = 0 then
LL = log(prob0 + (1-prob0)*exp(-mu)); *loglikelihood for a binary dist;
else
LL = numberOfReplies*log(mu) - mu - lgamma(numberOfReplies + 1); *loglikelihood for a poisson dist;
model numberOfReplies ~ general(LL);
random bu ~ normal (0,varb) subject = Subject;
run;
*Note: I tried putting a random effect in the logistic portion but couldn't
get it to converge;
title1 "Two-part Random Poisson Model with LSM";
proc nlmixed data = twopart method = gaus qpoints = 25 maxfunc = 3000 maxiter = 10000;
parms b10 = 1.5829 b11 = 0.08175
a0 = 20.2278 a1 = 1.1711
lnvarb = -1.033387 tau1 = 0;
logit0 = a0 + a1*sent_score; *logistic model with time varying predictor;
prob0 = 1 / (1 + exp(-logit0));
mu = exp(b10 + b11*sent_score + bu); *Poisson model with time varying predictor;
varb = EXP(lnvarb + tau1*avg_replies);
if numberOfReplies = 0 then
LL = log(prob0 + (1-prob0)*exp(-mu)); *loglikelihood for a binary dist;
else
LL = numberOfReplies*log(mu) - mu - lgamma(numberOfReplies + 1); *loglikelihood for a poisson dist;
model numberOfReplies ~ general(LL);
random bu ~ normal (0,varb) subject = Subject;
run;
*plots;
proc freq data = twopart;
table likes;
run;
*histogram;
title 'Youtube Comment Likes';
proc univariate data = twopart;
where likes between 0 and 500;
var likes;
histogram / odstitle = "Distribution of YouTube Comment Likes" ;
quit;
*spaghetti plot;
*export fixed and random effects to data sets;
ods output solutionf=sf(keep=effect estimate
rename=(estimate=FE));
ods output solutionr=sr(keep=effect M2ID estimate
rename=(estimate=RE));
proc mixed data=twopart;
class Subject;
model likes = / s outp = pred;
random int / s g gcorr type=un subject=Subject;
run; quit;
ods output close;
*create random subsets of data for plotting;
data subpred;
set pred;
where Video = "Nike" or Video = "Venezuela" or Video = "Neverland" or
Video = "NatGeo" or Video = "Football" or Video = "Diet";
run;
ods graphics off;
proc reg data = twopart;
model likes = ;
ods output ParameterEstimates=PE;
run;
data _null_;
set PE;
if _n_ = 1 then call symput('Int', put(estimate, BEST6.));
else call symput('Slope', put(estimate, BEST6.));
run;
proc sgpanel data=subpred;
panelby Video / columns=3 spacing=5 /*novarname*/;
styleattrs datacolors=(GRAY4F);
series x = numid y = likes;
scatter x = numid y = likes;
rowaxis label = "# of YouTube Comment Likes";
colaxis label = " " display = all;
series x = numid y = pred / lineattrs = (color = black)
legendlabel = "Fitted Values";
*reg x=b2dday y=b2dnegav/ cli clm;
title "Fitted Intercepts for a Random Subsample of Six Individuals";
run;