Showing posts with label probabilistic linear models. Show all posts
Showing posts with label probabilistic linear models. Show all posts

How to learn probabilistic linear models to predict a qualitative response

# We look at using probabilistic linear models to predict a qualitative
# response with the two most common methods: logistic regression and
# discriminant analysis.
package ‘leaps’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\RtmpUDOrMk\downloaded_packages
> library("leaps", lib.loc="~/R/win-library/3.6")
Warning message:
package ‘leaps’ was built under R version 3.6.3 
> x<-matrix(rnorm(100),ncol=4)
> y<-rnorm(25)
> leaps(x,y
+ )
$which
      1     2     3     4
1 FALSE FALSE FALSE  TRUE
1  TRUE FALSE FALSE FALSE
1 FALSE FALSE  TRUE FALSE
1 FALSE  TRUE FALSE FALSE
2  TRUE FALSE FALSE  TRUE
2 FALSE FALSE  TRUE  TRUE
2 FALSE  TRUE FALSE  TRUE
2  TRUE FALSE  TRUE FALSE
2  TRUE  TRUE FALSE FALSE
2 FALSE  TRUE  TRUE FALSE
3  TRUE FALSE  TRUE  TRUE
3  TRUE  TRUE FALSE  TRUE
3 FALSE  TRUE  TRUE  TRUE
3  TRUE  TRUE  TRUE FALSE
4  TRUE  TRUE  TRUE  TRUE

$label
[1] "(Intercept)" "1"          
[3] "2"           "3"          
[5] "4"          

$size
 [1] 2 2 2 2 3 3 3 3 3 3 4 4 4 4 5

$Cp
 [1] 0.291457 1.330360 2.597818 2.800123
 [5] 1.479960 2.123237 2.214481 2.669777
 [9] 3.315351 4.290161 3.099933 3.465247
[13] 3.937318 4.529656 5.000000

> data(swiss)
> a<-regsubsets(as.matrix(swiss[,-1]),swiss[,1])
> summary(a)
Subset selection object
5 Variables  (and intercept)
                 Forced in Forced out
Agriculture          FALSE      FALSE
Examination          FALSE      FALSE
Education            FALSE      FALSE
Catholic             FALSE      FALSE
Infant.Mortality     FALSE      FALSE
1 subsets of each size up to 5
Selection Algorithm: exhaustive
         Agriculture Examination Education
1  ( 1 ) " "         " "         "*"      
2  ( 1 ) " "         " "         "*"      
3  ( 1 ) " "         " "         "*"      
4  ( 1 ) "*"         " "         "*"      
5  ( 1 ) "*"         "*"         "*"      
         Catholic Infant.Mortality
1  ( 1 ) " "      " "             
2  ( 1 ) "*"      " "             
3  ( 1 ) "*"      "*"             
4  ( 1 ) "*"      "*"             
5  ( 1 ) "*"      "*"             
> coef(a, 1:3)
[[1]]
(Intercept)   Education 
 79.6100585  -0.8623503 

[[2]]
(Intercept)   Education    Catholic 
 74.2336892  -0.7883293   0.1109210 

[[3]]
     (Intercept)        Education 
     48.67707330      -0.75924577 
        Catholic Infant.Mortality 
      0.09606607       1.29614813 

> vcov(a, 3)
                  (Intercept)
(Intercept)      62.711883147
Education        -0.234998201
Catholic         -0.001112006
Infant.Mortality -2.952862263
                     Education
(Intercept)      -0.2349982009
Education         0.0136416868
Catholic          0.0004427309
Infant.Mortality  0.0033603646
                      Catholic
(Intercept)      -0.0011120059
Education         0.0004427309
Catholic          0.0007408169
Infant.Mortality -0.0017163629
                 Infant.Mortality
(Intercept)          -2.952862263
Education             0.003360365
Catholic             -0.001716363
Infant.Mortality      0.149759535
> library(MASS)
> data(biopsy)
> str(biopsy)
'data.frame': 699 obs. of  11 variables:
 $ ID   : chr  "1000025" "1002945" "1015425" "1016277" ...
 $ V1   : int  5 5 3 6 4 8 1 2 2 4 ...
 $ V2   : int  1 4 1 8 1 10 1 1 1 2 ...
 $ V3   : int  1 4 1 8 1 10 1 2 1 1 ...
 $ V4   : int  1 5 1 1 3 8 1 1 1 1 ...
 $ V5   : int  2 7 2 3 2 7 2 2 2 2 ...
 $ V6   : int  1 10 2 4 1 10 10 1 1 1 ...
 $ V7   : int  3 3 3 3 3 9 3 3 1 2 ...
 $ V8   : int  1 2 1 7 1 7 1 1 1 1 ...
 $ V9   : int  1 1 1 1 1 1 1 1 5 1 ...
 $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
> biopsy$ID = NULL
> names(biopsy) = c("thick", "u.size", "u.shape", "adhsn", "s.size",
+                   "nucl", "chrom", "n.nuc", "mit", "class")
> names(biopsy)
 [1] "thick"   "u.size"  "u.shape"
 [4] "adhsn"   "s.size"  "nucl"   
 [7] "chrom"   "n.nuc"   "mit"    
[10] "class"  
> biopsy.v2 = na.omit(biopsy)
> library(reshape2)
> library(ggplot2)
> biop.m = melt(biopsy.v2, id.var="class")
> ggplot(data=biop.m, aes(x=class, y=value)) + geom_boxplot() +
+   facet_wrap(~variable, ncol = 3)
> library(corrplot)
corrplot 0.84 loaded
Warning message:
package ‘corrplot’ was built under R version 3.6.3 
> bc = cor(biopsy.v2[ ,1:9])
> corrplot.mixed(bc)

> set.seed(123)
> ind = sample(2, nrow(biopsy.v2), replace=TRUE, prob=c(0.7, 0.3))
> train = biopsy.v2[ind==1,] 
> test = biopsy.v2[ind==2,]
> str(test) 
'data.frame': 209 obs. of  10 variables:
 $ thick  : int  5 6 4 2 1 7 6 7 1 3 ...
 $ u.size : int  4 8 1 1 1 4 1 3 1 2 ...
 $ u.shape: int  4 8 1 2 1 6 1 2 1 1 ...
 $ adhsn  : int  5 1 3 1 1 4 1 10 1 1 ...
 $ s.size : int  7 3 2 2 1 6 2 5 2 1 ...
 $ nucl   : int  10 4 1 1 1 1 1 10 1 1 ...
 $ chrom  : int  3 3 3 3 3 4 3 5 3 2 ...
 $ n.nuc  : int  2 7 1 1 1 3 1 4 1 1 ...
 $ mit    : int  1 1 1 1 1 1 1 4 1 1 ...
 $ class  : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
 - attr(*, "na.action")= 'omit' Named int  24 41 140 146 159 165 236 250 276 293 ...
  ..- attr(*, "names")= chr  "24" "41" "140" "146" ...
> table(train$class)

   benign malignant 
      302       172 
> table(test$class)

   benign malignant 
      142        67 
> full.fit = glm(class~., family=binomial, data=train)
> summary(full.fit)

Call:
glm(formula = class ~ ., family = binomial, data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q  
-3.3397  -0.1387  -0.0716   0.0321  
    Max  
 2.3559  

Coefficients:
            Estimate Std. Error z value
(Intercept)  -9.4293     1.2273  -7.683
thick         0.5252     0.1601   3.280
u.size       -0.1045     0.2446  -0.427
u.shape       0.2798     0.2526   1.108
adhsn         0.3086     0.1738   1.776
s.size        0.2866     0.2074   1.382
nucl          0.4057     0.1213   3.344
chrom         0.2737     0.2174   1.259
n.nuc         0.2244     0.1373   1.635
mit           0.4296     0.3393   1.266
            Pr(>|z|)    
(Intercept) 1.55e-14 ***
thick       0.001039 ** 
u.size      0.669165    
u.shape     0.268044    
adhsn       0.075722 .  
s.size      0.167021    
nucl        0.000826 ***
chrom       0.208006    
n.nuc       0.102126    
mit         0.205402    
---
Signif. codes:  
  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’
  0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 620.989  on 473  degrees of freedom
Residual deviance:  78.373  on 464  degrees of freedom
AIC: 98.373

Number of Fisher Scoring iterations: 8

> confint(full.fit)
Waiting for profiling to be done...
                   2.5 %     97.5 %
(Intercept) -12.23786660 -7.3421509
thick         0.23250518  0.8712407
u.size       -0.56108960  0.4212527
u.shape      -0.24551513  0.7725505
adhsn        -0.02257952  0.6760586
s.size       -0.11769714  0.7024139
nucl          0.17687420  0.6582354
chrom        -0.13992177  0.7232904
n.nuc        -0.03813490  0.5110293
mit          -0.14099177  1.0142786
> exp(coef(full.fit))
 (Intercept)        thick       u.size 
8.033466e-05 1.690879e+00 9.007478e-01 
     u.shape        adhsn       s.size 
1.322844e+00 1.361533e+00 1.331940e+00 
        nucl        chrom        n.nuc 
1.500309e+00 1.314783e+00 1.251551e+00 
         mit 
1.536709e+00