Showing posts with label logistic regression. Show all posts
Showing posts with label logistic regression. Show all posts

Friday, March 20, 2020

Logistic regression

We will focus on the logistic function.

The logistic function used in logistic regression
 data(Carseats)
> str(Carseats)
'data.frame': 400 obs. of  11 variables:
 $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
 $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
 $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
 $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
 $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
 $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
 $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
 $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
 $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
 $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
 $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
>
> data(Smarke)
Warning message:
In data(Smarke) : data set ‘Smarke’ not found
> data(Smarket)
> str(Smarket)
'data.frame': 1250 obs. of  9 variables:
 $ Year     : num  2001 2001 2001 2001 2001 ...
 $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
 $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
 $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
 $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
 $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
 $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
 $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
 $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
> sales.fit = lm(Sales~Advertising+ShelveLoc, data=Carseats)
>
> summary(sales.fit)

Call:
lm(formula = Sales ~ Advertising + ShelveLoc, data = Carseats)

Residuals:
    Min      1Q  Median      3Q     Max
-6.6480 -1.6198 -0.0476  1.5308  6.4098

Coefficients:
                Estimate Std. Error t value Pr(>|t|)   
(Intercept)      4.89662    0.25207  19.426  < 2e-16 ***
Advertising      0.10071    0.01692   5.951 5.88e-09 ***
ShelveLocGood    4.57686    0.33479  13.671  < 2e-16 ***
ShelveLocMedium  1.75142    0.27475   6.375 5.11e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.244 on 396 degrees of freedom
Multiple R-squared:  0.3733, Adjusted R-squared:  0.3685
F-statistic: 78.62 on 3 and 396 DF,  p-value: < 2.2e-16

> contrasts(Carseats$ShelveLoc)
       Good Medium
Bad       0      0
Good      1      0
Medium    0      1
> contrasts(Carseats$Urban)
    Yes
No    0
Yes   1
> contrasts(Carseats$Us)
Error in contrasts(Carseats$Us) : contrasts apply only to factors
> contrasts(Carseats$us)
Error in contrasts(Carseats$us) : contrasts apply only to factors
> contrasts(Carseats$US)
    Yes
No    0
Yes   1
> contrasts(Carseats$Price)
Error in contrasts(Carseats$Price) : contrasts apply only to factors
> lm(Today~Lag1+Lag2,data=Smarket)

Call:
lm(formula = Today ~ Lag1 + Lag2, data = Smarket)

Coefficients:
(Intercept)         Lag1         Lag2 
   0.003283    -0.026444    -0.010946 

> library("MASS", lib.loc="C:/Program Files/R/R-3.6.1/library")
> data(biopsy)
>
> str(biopsy)
'data.frame': 699 obs. of  11 variables:
 $ ID   : chr  "1000025" "1002945" "1015425" "1016277" ...
 $ V1   : int  5 5 3 6 4 8 1 2 2 4 ...
 $ V2   : int  1 4 1 8 1 10 1 1 1 2 ...
 $ V3   : int  1 4 1 8 1 10 1 2 1 1 ...
 $ V4   : int  1 5 1 1 3 8 1 1 1 1 ...
 $ V5   : int  2 7 2 3 2 7 2 2 2 2 ...
 $ V6   : int  1 10 2 4 1 10 10 1 1 1 ...
 $ V7   : int  3 3 3 3 3 9 3 3 1 2 ...
 $ V8   : int  1 2 1 7 1 7 1 1 1 1 ...
 $ V9   : int  1 1 1 1 1 1 1 1 5 1 ...
 $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
> biopsy$ID = NULL
>
> names(biopsy) = c("thick", "u.size", "u.shape", "adhsn", "s.size",
+                   "nucl", "chrom", "n.nuc", "mit", "class")
>
> names(biopsy)
 [1] "thick"   "u.size"  "u.shape" "adhsn"   "s.size"  "nucl" 
 [7] "chrom"   "n.nuc"   "mit"     "class" 
>
> biopsy.v2 = na.omit(biopsy)
>
> library("reshape2", lib.loc="~/R/win-library/3.6")
> library("ggplot2", lib.loc="~/R/win-library/3.6")
> biop.m = melt(biopsy.v2, id.var="class")
>
> ggplot(data=biop.m, aes(x=class, y=value)) + geom_boxplot()
> +facet_wrap(~variable,ncol = 3)
Error: Cannot use `+.gg()` with a single argument. Did you accidentally put + on a new line?
>
>
> library("corrplot", lib.loc="~/R/win-library/3.6")
corrplot 0.84 loaded
Warning message:
package ‘corrplot’ was built under R version 3.6.3
> bc = cor(biopsy.v2[ ,1:9])
> corrplot.mixed(bc)
>
Correlation plot
> set.seed(123)
> ind = sample(2, nrow(biopsy.v2), replace=TRUE, prob=c(0.7, 0.3))
>
> train = biopsy.v2[ind==1,]
> test = biopsy.v2[ind==2,]
> str(test)
'data.frame': 209 obs. of  10 variables:
 $ thick  : int  5 6 4 2 1 7 6 7 1 3 ...
 $ u.size : int  4 8 1 1 1 4 1 3 1 2 ...
 $ u.shape: int  4 8 1 2 1 6 1 2 1 1 ...
 $ adhsn  : int  5 1 3 1 1 4 1 10 1 1 ...
 $ s.size : int  7 3 2 2 1 6 2 5 2 1 ...
 $ nucl   : int  10 4 1 1 1 1 1 10 1 1 ...
 $ chrom  : int  3 3 3 3 3 4 3 5 3 2 ...
 $ n.nuc  : int  2 7 1 1 1 3 1 4 1 1 ...
 $ mit    : int  1 1 1 1 1 1 1 4 1 1 ...
 $ class  : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
 - attr(*, "na.action")= 'omit' Named int  24 41 140 146 159 165 236 250 276 293 ...
  ..- attr(*, "names")= chr  "24" "41" "140" "146" ...
> table(train$class)

   benign malignant
      302       172
>
> table(test$class)

   benign malignant
      142        67
>
> full.fit = glm(class~., family=binomial, data=train)
> summary(full.fit)

Call:
glm(formula = class ~ ., family = binomial, data = train)

Deviance Residuals:
    Min       1Q   Median       3Q      Max 
-3.3397  -0.1387  -0.0716   0.0321   2.3559 

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  -9.4293     1.2273  -7.683 1.55e-14
thick         0.5252     0.1601   3.280 0.001039
u.size       -0.1045     0.2446  -0.427 0.669165
u.shape       0.2798     0.2526   1.108 0.268044
adhsn         0.3086     0.1738   1.776 0.075722
s.size        0.2866     0.2074   1.382 0.167021
nucl          0.4057     0.1213   3.344 0.000826
chrom         0.2737     0.2174   1.259 0.208006
n.nuc         0.2244     0.1373   1.635 0.102126
mit           0.4296     0.3393   1.266 0.205402
             
(Intercept) ***
thick       **
u.size       
u.shape       
adhsn       . 
s.size       
nucl        ***
chrom         
n.nuc         
mit           
---
Signif. codes: 
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 620.989  on 473  degrees of freedom
Residual deviance:  78.373  on 464  degrees of freedom
AIC: 98.373

Number of Fisher Scoring iterations: 8

>
> confint(full.fit)
Waiting for profiling to be done...
                   2.5 %     97.5 %
(Intercept) -12.23786660 -7.3421509
thick         0.23250518  0.8712407
u.size       -0.56108960  0.4212527
u.shape      -0.24551513  0.7725505
adhsn        -0.02257952  0.6760586
s.size       -0.11769714  0.7024139
nucl          0.17687420  0.6582354
chrom        -0.13992177  0.7232904
n.nuc        -0.03813490  0.5110293
mit          -0.14099177  1.0142786
> exp(coef(full.fit))
 (Intercept)        thick       u.size
8.033466e-05 1.690879e+00 9.007478e-01
     u.shape        adhsn       s.size
1.322844e+00 1.361533e+00 1.331940e+00
        nucl        chrom        n.nuc
1.500309e+00 1.314783e+00 1.251551e+00
         mit
1.536709e+00