Showing posts with label probabilistic linear models. Show all posts

### How to learn probabilistic linear models to predict a qualitative response

We look at using probabilistic linear models to predict a qualitative
response with the two most common methods: logistic regression and discriminant analysis.
package ‘leaps’ successfully unpacked and MD5 sums checked

> library("leaps", lib.loc="~/R/win-library/3.6")
Warning message:
package ‘leaps’ was built under R version 3.6.3
> x<-matrix(rnorm(100),ncol=4)
> y<-rnorm(25)
> leaps(x,y
+ )
\$which
1     2     3     4
1 FALSE FALSE FALSE  TRUE
1  TRUE FALSE FALSE FALSE
1 FALSE FALSE  TRUE FALSE
1 FALSE  TRUE FALSE FALSE
2  TRUE FALSE FALSE  TRUE
2 FALSE FALSE  TRUE  TRUE
2 FALSE  TRUE FALSE  TRUE
2  TRUE FALSE  TRUE FALSE
2  TRUE  TRUE FALSE FALSE
2 FALSE  TRUE  TRUE FALSE
3  TRUE FALSE  TRUE  TRUE
3  TRUE  TRUE FALSE  TRUE
3 FALSE  TRUE  TRUE  TRUE
3  TRUE  TRUE  TRUE FALSE
4  TRUE  TRUE  TRUE  TRUE

\$label
[1] "(Intercept)" "1"
[3] "2"           "3"
[5] "4"

\$size
[1] 2 2 2 2 3 3 3 3 3 3 4 4 4 4 5

\$Cp
[1] 0.291457 1.330360 2.597818 2.800123
[5] 1.479960 2.123237 2.214481 2.669777
[9] 3.315351 4.290161 3.099933 3.465247
[13] 3.937318 4.529656 5.000000

> data(swiss)
> a<-regsubsets(as.matrix(swiss[,-1]),swiss[,1])
> summary(a)
Subset selection object
5 Variables  (and intercept)
Forced in Forced out
Agriculture          FALSE      FALSE
Examination          FALSE      FALSE
Education            FALSE      FALSE
Catholic             FALSE      FALSE
Infant.Mortality     FALSE      FALSE
1 subsets of each size up to 5
Selection Algorithm: exhaustive
Agriculture Examination Education
1  ( 1 ) " "         " "         "*"
2  ( 1 ) " "         " "         "*"
3  ( 1 ) " "         " "         "*"
4  ( 1 ) "*"         " "         "*"
5  ( 1 ) "*"         "*"         "*"
Catholic Infant.Mortality
1  ( 1 ) " "      " "
2  ( 1 ) "*"      " "
3  ( 1 ) "*"      "*"
4  ( 1 ) "*"      "*"
5  ( 1 ) "*"      "*"
> coef(a, 1:3)
[[1]]
(Intercept)   Education
79.6100585  -0.8623503

[[2]]
(Intercept)   Education    Catholic
74.2336892  -0.7883293   0.1109210

[[3]]
(Intercept)        Education
48.67707330      -0.75924577
Catholic Infant.Mortality
0.09606607       1.29614813

> vcov(a, 3)
(Intercept)
(Intercept)      62.711883147
Education        -0.234998201
Catholic         -0.001112006
Infant.Mortality -2.952862263
Education
(Intercept)      -0.2349982009
Education         0.0136416868
Catholic          0.0004427309
Infant.Mortality  0.0033603646
Catholic
(Intercept)      -0.0011120059
Education         0.0004427309
Catholic          0.0007408169
Infant.Mortality -0.0017163629
Infant.Mortality
(Intercept)          -2.952862263
Education             0.003360365
Catholic             -0.001716363
Infant.Mortality      0.149759535
> > library(MASS)
Error: unexpected '>' in ">"
>  library(MASS)
> data(biopsy)
> str(biopsy)
'data.frame': 699 obs. of  11 variables:
\$ ID   : chr  "1000025" "1002945" "1015425" "1016277" ...
\$ V1   : int  5 5 3 6 4 8 1 2 2 4 ...
\$ V2   : int  1 4 1 8 1 10 1 1 1 2 ...
\$ V3   : int  1 4 1 8 1 10 1 2 1 1 ...
\$ V4   : int  1 5 1 1 3 8 1 1 1 1 ...
\$ V5   : int  2 7 2 3 2 7 2 2 2 2 ...
\$ V6   : int  1 10 2 4 1 10 10 1 1 1 ...
\$ V7   : int  3 3 3 3 3 9 3 3 1 2 ...
\$ V8   : int  1 2 1 7 1 7 1 1 1 1 ...
\$ V9   : int  1 1 1 1 1 1 1 1 5 1 ...
\$ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
> biopsy\$ID = NULL
> names(biopsy) = c("thick", "u.size", "u.shape", "adhsn", "s.size",
+                   "nucl", "chrom", "n.nuc", "mit", "class")
> names(biopsy)
[1] "thick"   "u.size"  "u.shape"
[4] "adhsn"   "s.size"  "nucl"
[7] "chrom"   "n.nuc"   "mit"
[10] "class"
> biopsy.v2 = na.omit(biopsy)
> library(reshape2)
> library(ggplot2)
> biop.m = melt(biopsy.v2, id.var="class")
> ggplot(data=biop.m, aes(x=class, y=value)) + geom_boxplot()
> +facet_wrap(~variable,ncol = 3)
Error: Cannot use `+.gg()` with a single argument. Did you accidentally put + on a new line?
> library(corrplot)
Warning message:
package ‘corrplot’ was built under R version 3.6.3
> bc = cor(biopsy.v2[ ,1:9])
> corrplot.mixed(bc)

> set.seed(123)
> ind = sample(2, nrow(biopsy.v2), replace=TRUE, prob=c(0.7, 0.3))
> train = biopsy.v2[ind==1,]
> test = biopsy.v2[ind==2,]
> str(test)
'data.frame': 209 obs. of  10 variables:
\$ thick  : int  5 6 4 2 1 7 6 7 1 3 ...
\$ u.size : int  4 8 1 1 1 4 1 3 1 2 ...
\$ u.shape: int  4 8 1 2 1 6 1 2 1 1 ...
\$ adhsn  : int  5 1 3 1 1 4 1 10 1 1 ...
\$ s.size : int  7 3 2 2 1 6 2 5 2 1 ...
\$ nucl   : int  10 4 1 1 1 1 1 10 1 1 ...
\$ chrom  : int  3 3 3 3 3 4 3 5 3 2 ...
\$ n.nuc  : int  2 7 1 1 1 3 1 4 1 1 ...
\$ mit    : int  1 1 1 1 1 1 1 4 1 1 ...
\$ class  : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
- attr(*, "na.action")= 'omit' Named int  24 41 140 146 159 165 236 250 276 293 ...
..- attr(*, "names")= chr  "24" "41" "140" "146" ...
> table(train\$class)

benign malignant
302       172
> table(test\$class)

benign malignant
142        67
> full.fit = glm(class~., family=binomial, data=train)
> summary(full.fit)

Call:
glm(formula = class ~ ., family = binomial, data = train)

Deviance Residuals:
Min       1Q   Median       3Q
-3.3397  -0.1387  -0.0716   0.0321
Max
2.3559

Coefficients:
Estimate Std. Error z value
(Intercept)  -9.4293     1.2273  -7.683
thick         0.5252     0.1601   3.280
u.size       -0.1045     0.2446  -0.427
u.shape       0.2798     0.2526   1.108
s.size        0.2866     0.2074   1.382
nucl          0.4057     0.1213   3.344
chrom         0.2737     0.2174   1.259
n.nuc         0.2244     0.1373   1.635
mit           0.4296     0.3393   1.266
Pr(>|z|)
(Intercept) 1.55e-14 ***
thick       0.001039 **
u.size      0.669165
u.shape     0.268044
s.size      0.167021
nucl        0.000826 ***
chrom       0.208006
n.nuc       0.102126
mit         0.205402
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’
0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

Null deviance: 620.989  on 473  degrees of freedom
Residual deviance:  78.373  on 464  degrees of freedom
AIC: 98.373

Number of Fisher Scoring iterations: 8

> confint(full.fit)
Waiting for profiling to be done...
2.5 %     97.5 %
(Intercept) -12.23786660 -7.3421509
thick         0.23250518  0.8712407
u.size       -0.56108960  0.4212527
u.shape      -0.24551513  0.7725505
s.size       -0.11769714  0.7024139
nucl          0.17687420  0.6582354
chrom        -0.13992177  0.7232904
n.nuc        -0.03813490  0.5110293
mit          -0.14099177  1.0142786
> exp(coef(full.fit))
(Intercept)        thick       u.size
8.033466e-05 1.690879e+00 9.007478e-01